#####################################################################
# SCRIPT: SS_WebPageToCSV.txt
#
# This script extracts a table from a web page and creates corresponding CSV
# (Comma Separated Values) output. The CSV output can then be saved to a file.
# The file can then be used with a spreadsheet program.
#
# The name of the web page is passed as input argument/FVA $page to this script.
# It can be either a web page or a local file and has one of the following forms -
# "http://www.xxx.yyy/.../zzz.html", "C:/.../file.html" . We are using the extension
# of .html as an example only; the script will accept any extension such as .asp, .php, etc.
#
# A web page may have several tables. To extract the correct table, these tables are numbered
# starting at 1. For example, the 12th table starts at the 12th instance of the opening <table tag.
| ^" $row } > 0 ) # We are backslashing > , because it has special meaning in regular expressions. do stex -c -r "^ | ^" $row > $column echo $column >> $table echo "[COLUMN SEPARATOR]" >> $table done # Add the ROW SEPARATOR echo "[ROW SEPARATOR]" >> $table done # $table now has rows separated by ROW SEPARATORs and columns separated by COLUMN SEPARATORs. # $table still has html tags in it. These include | , , ") > $table
# Two further clean-up passes over $table using the helper script SS_RemoveTags.txt.
# Each pass reads $table as input and writes the helper's output back into $table.
# NOTE(review): SS_RemoveTags.txt is not visible here; presumably it deletes every
# span that begins with start_tag and ends with end_tag - confirm against the helper.
# Pass with "{" ... "}" as the delimiters.
script SS_RemoveTags.txt input($table) start_tag("{") end_tag("}") > $table
# Pass with "&" ... ";" as the delimiters (presumably HTML character entities
# such as &nbsp; - confirm).
script SS_RemoveTags.txt input($table) start_tag("&") end_tag(";") > $table
# Remove all extra spaces, tabs, etc. Each such run is replaced with one space.
# The cleaned text is accumulated in the str variable $csv; $temp is a scratch
# variable that holds one extracted piece at a time.
var str csv, temp
# Loop for as long as $table still contains a match for "^,^".
while ( { sen -r "^,^" $table } > 0 ) # Note that the , here means ONE unprintable character in regular expressions.
do
# Take the text that precedes the next match out of $table and into $temp.
# NOTE(review): this relies on stex also removing the extracted text from
# $table; otherwise the loop would never terminate - confirm against the stex documentation.
stex -r "]^,^" $table > $temp
# Append the extracted piece followed by a single space.
set $csv = $csv + $temp + " "
stex -r "^;^]" $table > null # We will discard this output.
done
# There may be something left in $table after the last match; keep it too.
set $csv = $csv + $table
# Replace ROW SEPARATORs with newlines and COLUMN SEPARATORs with commas.
# Each while loop repeats a single sal command for as long as sen still
# reports at least one occurrence of the marker in $csv. The output of sal is
# sent to null, so the loops rely on sal changing $csv itself.
# NOTE(review): field values are not quoted or escaped here, so a cell that itself
# contains a comma would be split into extra columns - confirm whether that matters.
# NOTE(review): page text that literally contains "[ROW SEPARATOR]" or
# "[COLUMN SEPARATOR]" would be treated as a separator as well.
while ( { sen "^[ROW SEPARATOR]^" $csv } > 0 )
sal "^[ROW SEPARATOR]^" "\n" $csv > null
while ( { sen "^[COLUMN SEPARATOR]^" $csv } > 0 )
sal "^[COLUMN SEPARATOR]^" "," $csv > null # Use a \t (tab) here instead of comma (,) for tab separated columns.
# Remove empty rows. Take this section out (up to and including the
# "set $csv = $temp" line) if you wish to retain empty rows.
# $temp is reset and reused to collect the rows that are kept.
set $temp = ""
# Consume $csv one line at a time until nothing is left.
while ($csv <> "")
do
# Move the first line of $csv into $row.
# NOTE(review): the loop terminates only if lex -e also removes that line
# from $csv - confirm against the lex documentation.
lex -e "1" $csv > $row
# Is there anything in $row other than space or comma ?
if ( { sen -r "^(# \,)^" $row } > 0 )
do
# Keep the row and terminate it with a newline.
echo $row >> $temp
echo "\n" >> $temp
done
endif
done
# Put the filtered rows back into $csv and clear the scratch variable.
set $csv = $temp ; set $temp = ""
# $csv now has rows separated by newlines, columns separated by commas, all html tags removed
# and extra spaces, newlines, etc. replaced by a single space.
# Write the final CSV text to the script's output.
echo $csv
|