[open-ils-commits] r1236 - constrictor/trunk/samples (erickson)
svn at svn.open-ils.org
svn at svn.open-ils.org
Wed Feb 23 11:31:09 EST 2011
Author: erickson
Date: 2011-02-23 11:31:06 -0500 (Wed, 23 Feb 2011)
New Revision: 1236
Modified:
constrictor/trunk/samples/web_spider.py
Log:
improved url deconstruction; ignore JS and anchor links
Modified: constrictor/trunk/samples/web_spider.py
===================================================================
--- constrictor/trunk/samples/web_spider.py 2011-02-23 16:12:34 UTC (rev 1235)
+++ constrictor/trunk/samples/web_spider.py 2011-02-23 16:31:06 UTC (rev 1236)
@@ -63,19 +63,25 @@
return proto, host, path
-
def handle_starttag(self, tag, attrs):
if tag == 'a' and attrs:
- link = attrs[0][1]
+ link = [h for h in attrs if h[0] == 'href']
+ if len(link) == 0: return
+ link = link[0][1]
+
if link[:4] != "http":
proto, host, path = self.url_parts(self.url)
+ # Ignore href=javascript:foo and page anchors
+ if link[:11] == 'javascript:' or link[:1] == '#':
+ return
+
if link[:1] == '/': # full path
- link = "%s://%s%s" % (proto, host, link)
+ path = link
elif link[:1] == '?': # GET params only
res = re.match('(.*)\?.*', path)
More information about the open-ils-commits mailing list