#####################################################################
# SCRIPT: SS_WebToText.txt
#
# This script reads web page files (name.html) and writes corresponding
# plain text versions into files (name.txt) in the same directory. It does
# this for all .html pages within a specified directory, including its
# subdirectories. The plain text versions of web pages, thus created, can
# then be used for spell-checking, review by the legal department, inclusion
# into legal documents, inclusion into requirements/specifications documents,
# keeping the page lengths within limits, or other purposes. The script can
# handle a large number (including thousands) of web pages.
#
# Download this script into directory C:/Scripts to a file named
# SS_WebToText.txt . This script calls the companion script
# SS_WebPageToText.txt to convert each page, so that script must also be
# available where biterScripting can find it.
#
# Let's say you have all your web pages in a directory C:/myproject.
# In that case, invoke biterScripting Interactive and type the following
# on the command line.
#
#     script "C:/Scripts/SS_WebToText.txt" dir("C:/myproject") files("*.html")
#
# The above needs to be entered exactly as shown here. The script will execute
# and create the respective .txt files. For example, if the web page is
# C:/myproject/disclaimer/newcustomers.html, the corresponding text version
# will be in file C:/myproject/disclaimer/newcustomers.txt .
#
# If the number of web pages is really large, or if the web page contents are
# huge, the script, depending on the speed of your computer, may take some time
# to complete execution. The script will execute in the background, so you can
# continue to use your computer for other purposes while the script is running.
#
# The script does not change the original (input) web pages.
#
# If you don't have biterScripting, you can download it from biterScripting.com .
#
#####################################################################

# Script arguments, supplied by the caller as files("...") and dir("...").
var str files    # pattern(s) for the names of the files to convert
var str dir      # directory that holds the entire project

# Working variables:
#   pending_pages - the files found, still waiting to be converted
#   plain_text    - the plain text version of the file being converted
var str pending_pages
var str plain_text

# Gather the files matching $files under $dir.
find -rn $files $dir > $pending_pages

# Convert the files one at a time until none are left on the list.
while ( $pending_pages <> "")
do
    # Fetch the next file from the list.
    var str page_path
    lex "1" $pending_pages > $page_path

    # Hand the file to script SS_WebPageToText, which produces its
    # contents as plain text.
    script SS_WebPageToText.txt page($page_path) > $plain_text

    # Strip the ending (.html, etc.) from the file name; the stripped
    # piece is not needed, so it is sent to null.
    stex "[^.^l" $page_path > null

    # Append the .txt extension in its place.
    set $page_path = $page_path + ".txt"

    # Store the plain text in the .txt file.
    echo $plain_text > { echo $page_path }
done
# End of the while ( $pending_pages <> "") loop.

# Every converted page now has a .txt version in its own directory.