Sample Script
SS_WebPageToCSV
Purpose
Extracts a table from a web page and converts it to CSV (Comma Separated Values) format.
Source Code
#####################################################################
# SCRIPT: SS_WebPageToCSV.txt
#
# This script extracts a table from a web page and creates corresponding CSV
# (Comma Separated Values) output. The CSV output can then be saved to a file.
# The file can then be used with a spreadsheet program.
#
# The name of the web page is passed as input argument/FVA $page to this script.
# It can be either a web page or a local file and has one of the following forms -
# "http://www.xxx.yyy/.../zzz.html", "C:/.../file.html" . The .html extension is used
# as an example only; the script will accept any extension such as .asp, .php, etc.
#
# A web page may have several tables. To extract the correct table, these tables are numbered
# starting at 1. For example, the 12th table starts at the 12th instance of <table in the web
# page. The number of the table to extract is passed to this script as input argument/FVA $number.
# It can be 1, 12, etc.
#
# Download this script into directory C:/Scripts to a file named SS_WebPageToCSV.txt.
# Then call it as below.
#
# script "C:/Scripts/SS_WebPageToCSV.txt" page("http://www.xxx.yyy/.../zzz.mmm") number(12)
#
# The above will produce text output on screen. If you want to store the output in a file,
# simply redirect the script output to a local file, as below -
#
# script "C:/Scripts/SS_WebPageToCSV.txt" page("http://www.xxx.yyy/.../zzz.mmm") number(12) > "C:/table.csv"
#
# This script makes use of other biterscripting sample scripts. To install all sample scripts
# at once, use the following command.
#
# script "http://www.biterscripting.com/Download/SS_AllSamples.txt"
#
# The above will install all sample scripts in the directory C:/Scripts.
# If you don't have biterscripting, you can download it from biterscripting.com .
#
#####################################################################
# Input arguments (FVAs). The caller supplies them as page(...) and number(...).
var str page # name of input page or file (a URL or a local path)
var int number # number of the table to convert to CSV format (tables are numbered starting at 1).
# Read the file contents into a variable $html.
var str html
cat $page > $html
# Extract the number'th table using sample script SS_ExtractTable.
# Its output overwrites $html, so $html presumably holds only that table from here on.
script "C:/Scripts/SS_ExtractTable.txt" input($html) number($number) > $html
# Extract rows one by one. The rows are between <tr...</tr> pairs.
# We will insert a marker [ROW SEPARATOR] after each row.
# We will collect output in a str variable $table.
# NOTE(review): the loop ends only when sen finds no more <tr...</tr> matches in $html,
# so it relies on stex removing each extracted row from $html - confirm in the stex help.
# NOTE(review): -c presumably makes the match case-insensitive - confirm.
var str table, row, column
while ( { sen -c -r "^<tr&</tr&\>^" $html } > 0 ) # We are backslashing > , because it has special meaning in regular expressions.
do
stex -c -r "^<tr&</tr&\>^" $html> $row
# The row is in $row.
# Extract columns one by one. The columns are between <td...</td> pairs.
# We will insert a marker [COLUMN SEPARATOR] after each column.
# Only <td cells are matched here; <th header cells are not looked for by this pattern.
while ( { sen -c -r "^<td&</td&\>^" $row } > 0 ) # We are backslashing > , because it has special meaning in regular expressions.
do
stex -c -r "^<td&</td&\>^" $row > $column
# Append the cell (still containing its html tags) followed by the column marker.
echo $column >> $table
echo "[COLUMN SEPARATOR]" >> $table
done
# Add the ROW SEPARATOR
echo "[ROW SEPARATOR]" >> $table
done
# $table now has rows separated by ROW SEPARATORs and columns separated by COLUMN SEPARATORs.
# $table still has html tags in it. These include <td we left above.
# But, it may also have other tags such as <br>, <p>, <div, {...}, &..;, etc.
# We will remove all these tags using sample script SS_RemoveTags.
# Each call below reads $table and writes the cleaned text back into $table.
# NOTE(review): SS_RemoveTags.txt is referenced by a relative name here, while
# SS_ExtractTable.txt above uses the full path C:/Scripts/... - confirm that the
# relative name resolves when the script is called from another directory.
script SS_RemoveTags.txt input($table) start_tag("<") end_tag(">") > $table
script SS_RemoveTags.txt input($table) start_tag("{") end_tag("}") > $table
script SS_RemoveTags.txt input($table) start_tag("&") end_tag(";") > $table
# Remove all extra spaces, tabs, etc. We will replace them with one space.
# We will collect the output in a str variable $csv. We will also use
# a temporary variable $temp.
# Each pass moves the text before the next unprintable character from $table to $csv,
# appends a single space, then discards a further piece of $table.
# NOTE(review): the condition and the first stex use "," but the second stex uses ";" -
# presumably ";" matches a run of unprintable characters; confirm in the regular expression help.
var str csv, temp
while ( { sen -r "^,^" $table } > 0 ) # Note that the , here means ONE unprintable character in regular expressions.
do
stex -r "]^,^" $table > $temp
set $csv = $csv + $temp + " "
stex -r "^;^]" $table > null # We will discard this output.
done
# There may be something left in $table
set $csv = $csv + $table
# Replace ROW SEPARATORs with newlines and COLUMN SEPARATORs with commas.
# Each while below has no do/done, so it repeats only the single sal command that follows it.
# The sal output is discarded (> null); $csv itself is presumably changed in place - confirm.
while ( { sen "^[ROW SEPARATOR]^" $csv } > 0 )
sal "^[ROW SEPARATOR]^" "\n" $csv > null
while ( { sen "^[COLUMN SEPARATOR]^" $csv } > 0 )
sal "^[COLUMN SEPARATOR]^" "," $csv > null # Use a \t (tab) here instead of comma (,) for tab separated columns.
# We will remove empty rows. You can take this code out if
# you wish to retain empty rows.
# The kept rows are accumulated in $temp, which is reset first.
# NOTE(review): the loop runs until $csv is empty, so it relies on lex removing
# the extracted line from $csv - confirm in the lex help.
set $temp = ""
while ($csv <> "")
do
lex -e "1" $csv > $row
# Is there anything in $row other than space or comma ?
if ( { sen -r "^(# \,)^" $row } > 0 )
do
echo $row >> $temp
echo "\n" >> $temp
done
endif
done
# Move the filtered rows back into $csv and clear the temporary variable.
set $csv = $temp ; set $temp = ""
# $csv now has rows separated by newlines, columns separated by commas, all html tags removed
# and extra spaces, newlines, etc. replaced by a single space.
# Write the result to the script output; the caller can redirect it to a file.
echo $csv
|
© 2008-2012, biterScripting.com. All rights reserved.
biterScripting, biterScript, biterBrowser, biterMobile, biterScripting.com, FVA (Forward Variable Assignment) are trademarks of biterScripting.com. Is it biterScripting-compatible ? is a service mark of biterScripting.com.
Explorer, Unix, Windows are trademarks, service marks or other forms of intellectual property of their respective owners.
|