PRODUCT |

|

|
|

|
|
|
|
|
FAQ |
|

|
|
|
|
|
|
LEARN SCRIPTING |
|

|
|
|
|
|
|
|
|
|
|
SAMPLE SCRIPTS |
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|
HELP / DOCUMENTATION |
|

|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
Help Page - SS_URLs
(Some help pages may not display correctly in HTML, because they contain sample code, parts of which may be misinterpreted as HTML tags.
All help pages, including this one, are available in biterScripting with the help command.)
|
|
Sample Script
SS_URLs
Purpose
Collects URLs referenced by a URL.
Source Code
#####################################################################
# SCRIPT: SS_URLs
#
# This script extracts a list of URL addresses from one URL.
# The URL is assigned using FVA (Forward Variable Assignment)
# for str variable $URL. The value of $URL is of form
# "http://www.xxx.yyy" or "http://xxx.yyy.zzz/www.../qqq.html"
#
# The list of domains to be ignored is passed using FVA for str variable
# $ignore_domains. The format is <domain>|<domain>|<domain> ...
# Each <domain> is in the form "http://www.abc.def" .
#
# All URLs and domains passed to this script need to be in the standard format
# http://www.abc.def/ or http://www.abc.def/...../page.html
# The http:// part IS REQUIRED. The page name extension can be anything such as
# .html, .aspx, .js, etc.
#
# The list of URL addresses found on this web page is written
# to output stream, one URL address per line.
#
# This script can be stored, and edited as needed, in a file called
# SS_URLs.txt. The script can then be called as
#
# script SS_URLs.txt URL("<URL>") ignore_domains("<domain>|<domain>|...")
#
#####################################################################
# Inputs: $URL and $ignore_domains are expected to be supplied by the caller
# through FVA, e.g. script SS_URLs.txt URL("...") ignore_domains("...|...").
var str URL # Name of the URL
var str ignore_domains # List of domains to be ignored. Domains are separated by |.
# Working variables.
var str foundURL # URLs found, one at a time
var str content # Collects the contents of the URL here.
# $content is progressively cut down as each new URL is found.
# Load the contents of $URL into $content.
repro $URL >$content
# We will change the value of $wsep to suit our purpose.
# But, we will save the original value, so we can restore it
# after we are done.
var str saved_wsep
set $saved_wsep = $wsep
# The separators set here are: < > space tab , ; ? & backslash double-quote CR LF.
# NOTE(review): $wsep is presumably the separator list the wex command below
# uses to split words, so that the first word after href=" ends at the closing
# quote, the query string or the end of the tag - confirm against the wex help page.
set $wsep="<> \t,;?&\\\"\r\n"
# Main loop: one iteration per href=" occurrence still present in $content.
# The loop ends when sen no longer reports any occurrence (result not > 0).
while ( { sen -c "^href=\"^" $content } > 0 )
do
# Strip off the portion up to and including "href=\"".
# We use the case-insensitive option (-c) so it can match any of href, HREF, etc.
# The stripped text is discarded (sent to null); $content itself is shortened,
# which is what lets the loop advance to the next link.
stex -c "^href=\"^]" $content > null
# The remaining $content now starts with a URL address followed by
# a double quote (") or a question mark (?).
wex "1" $content > $foundURL # Get the URL address
# Remove anything beginning with # (the fragment part of the link).
# The removed text is discarded; $foundURL keeps only the part before #.
stex "[^#^" $foundURL >null
# Process this URL further only if it is not empty and it is not the same URL as $URL.
# (A link consisting only of a fragment becomes empty after the step above.)
if ( ($foundURL <> "") AND ($foundURL <> $URL ) )
do
# Output the found URL. But first check if this is
# an absolute or relative URL.
var str fullURL
var str s
# $s receives the part of $foundURL starting at "http:/" (case-insensitive).
# NOTE(review): -p presumably leaves $foundURL itself unmodified, since it is
# compared against $s immediately afterwards - confirm against the stex help page.
set $s = { stex -c -p "[^http:/^" $foundURL }
# If that part is the whole of $foundURL, the link begins with "http:/".
# NOTE(review): a link beginning with "https://" does not contain "http:/",
# so it appears to be treated as relative here - confirm whether that is intended.
if ( $s == $foundURL )
do
# This is absolute URL.
set $fullURL = $foundURL
done
else
do
# Relative URL: prepend our own address, joined with "/".
# NOTE(review): $URL is used as-is, even when it names a page rather than a
# directory; presumably SS_CureURL below normalizes the result - verify.
set $fullURL = $URL+"/"+$foundURL
done
endif
# Cure the format of this URL.
# The output of SS_CureURL replaces the value of $fullURL.
script SS_CureURL.txt URL($fullURL) > $fullURL
# Check if this URL is to be ignored. The SS_IgnoreURL will assign an
# empty value to $fullURL if this URL is to be ignored.
script SS_IgnoreURL.txt URL($fullURL) ignore_domains($ignore_domains) > $fullURL
# Write the URL to the output stream only if it was not emptied above.
if ( $fullURL <> "" )
echo $fullURL
endif
done
endif
done
# Restore original value of $wsep.
set $wsep = $saved_wsep
|
|
© 2008-2013, biterScripting.com. All rights reserved.
biterScripting, biterScript, biterBrowser, biterMobile, biterScripting.com, FVA (Forward Variable Assignment) are trademarks of biterScripting.com. Is it biterScripting-compatible ? is a service mark of biterScripting.com.
Explorer, Unix, Windows are trademarks, service marks or other forms of intellectual property of their respective owners.
|
|