#####################################################################
# SCRIPT: SS_WebPageToCSV.txt
#
# This script extracts a table from a web page and creates corresponding CSV
# (Comma Separated Values) output. The CSV output can then be saved to a file.
# The file can then be used with a spreadsheet program.
#
# The name of the web page is passed as input argument/FVA $page to this script.
# It can be either a web page or a local file and has one of the following forms -
# "http://www.xxx.yyy/.../zzz.html", "C:/.../file.html" . We are using the extension
# of .html as an example only, the script will accept any extension such as .asp, .php, etc.
#
# A web page may have several tables. To extract the correct table, these tables are numbered
# starting at 1. For example, the 12th table starts at the 12th instance of "<table".
# The number of the table is passed as input argument/FVA $number to this script.
#
# The CSV output is written with echo at the end of the script. To save it to a file,
# redirect the output, for example -
#
#     script SS_WebPageToCSV.txt page("http://www.xxx.yyy/.../zzz.html") number(12) > "C:/table.csv"
#
# NOTE(review): the two paragraphs above were partly reconstructed; the available copy of this
# header had lost the "<table" token and the example command. Confirm against the published sample.
#
# This script makes use of other biterscripting sample scripts. To install all sample scripts
# all at once, use the following command.
#
#     script "http://www.biterscripting.com/Download/SS_AllSamples.txt"
#
# The above will install all sample scripts in the directory C:/Scripts.
# If you don't have biterscripting, you can download it from biterscripting.com .
#
#####################################################################

# Input arguments.
# $page   - name of input page or file
# $number - number of the table to convert to CSV format
var str page
var int number

# Read the file contents into a variable $html.
var str html
cat $page > $html

# Extract the number'th table.
script "C:/Scripts/SS_ExtractTable.txt" input($html) number($number) > $html

# Extract rows one by one. The rows are between <tr ...> </tr> pairs.
# We will insert a marker [ROW SEPARATOR] after each row.
# We will collect output in a str variable $table.
#
# NOTE(review): the row and column patterns below were reconstructed. The available copy had
# an empty pattern ("^^") because the tag text had been stripped out. Confirm the patterns
# against the published sample before relying on them.
var str table, row, column

# We are backslashing > , because it has special meaning in regular expressions.
while ( { sen -c -r "^<tr&</tr\>^" $html } > 0 )
do
    # Pull the next row out of $html. The row is in $row.
    stex -c -r "^<tr&</tr\>^" $html > $row

    # Extract columns one by one. The columns are between <td ...> </td> pairs.
    # We will insert a marker [COLUMN SEPARATOR] after each column.
    # We are backslashing > , because it has special meaning in regular expressions.
    while ( { sen -c -r "^<td&</td\>^" $row } > 0 )
    do
        stex -c -r "^<td&</td\>^" $row > $column
        echo $column >> $table
        echo "[COLUMN SEPARATOR]" >> $table
    done

    # Add the ROW SEPARATOR
    echo "[ROW SEPARATOR]" >> $table
done

# $table now has rows separated by ROW SEPARATORs and columns separated by COLUMN SEPARATORs.
# $table still has html tags in it. We will remove them with the sample script
# SS_RemoveTags.txt, one start/end delimiter pair at a time.
#
# NOTE(review): the first call below (delimiters "<" and ">") was reconstructed from a
# truncated line, following the shape of the two calls after it. Confirm against the
# published sample.
script SS_RemoveTags.txt input($table) start_tag("<") end_tag(">") > $table
script SS_RemoveTags.txt input($table) start_tag("{") end_tag("}") > $table
script SS_RemoveTags.txt input($table) start_tag("&") end_tag(";") > $table

# Remove all extra spaces, tabs, etc. We will replace them with one space.
# We will collect the output in a str variable $csv. We will also use
# a temporary variable $temp.
#
# NOTE(review): the three patterns in this loop are kept exactly as found. Verify them
# against the biterscripting regular expression help.
var str csv, temp

# Note that the , here means ONE unprintable character in regular expressions.
while ( { sen -r "^,^" $table } > 0 )
do
    # Take the leading text out of $table and append it to $csv followed by one space.
    stex -r "]^,^" $table > $temp
    set $csv = $csv + $temp + " "

    # We will discard this output.
    stex -r "^;^]" $table > null
done

# There may be something left in $table
set $csv = $csv + $table

# Replace ROW SEPARATORs with newlines and COLUMN SEPARATORs with commas.
while ( { sen "^[ROW SEPARATOR]^" $csv } > 0 )
    sal "^[ROW SEPARATOR]^" "\n" $csv > null

# Use a \t (tab) here instead of comma (,) for tab separated columns.
while ( { sen "^[COLUMN SEPARATOR]^" $csv } > 0 )
    sal "^[COLUMN SEPARATOR]^" "," $csv > null

# We will remove empty rows. You can take this code out if
# you wish to retain empty rows.
set $temp = ""
while ($csv <> "")
do
    # Take the first line out of $csv.
    lex -e "1" $csv > $row

    # Is there anything in $row other than space or comma ?
    if ( { sen -r "^(# \,)^" $row } > 0 )
    do
        echo $row >> $temp
        echo "\n" >> $temp
    done
    endif
done
set $csv = $temp
set $temp = ""

# $csv now has rows separated by newlines, columns separated by commas, all html tags removed
# and extra spaces, newlines, etc. replaced by a single space.
echo $csv