[open-ils-commits] r1236 - constrictor/trunk/samples (erickson)

svn at svn.open-ils.org svn at svn.open-ils.org
Wed Feb 23 11:31:09 EST 2011


Author: erickson
Date: 2011-02-23 11:31:06 -0500 (Wed, 23 Feb 2011)
New Revision: 1236

Modified:
   constrictor/trunk/samples/web_spider.py
Log:
Improved URL deconstruction; ignore javascript: links and page-anchor (#) links

Modified: constrictor/trunk/samples/web_spider.py
===================================================================
--- constrictor/trunk/samples/web_spider.py	2011-02-23 16:12:34 UTC (rev 1235)
+++ constrictor/trunk/samples/web_spider.py	2011-02-23 16:31:06 UTC (rev 1236)
@@ -63,19 +63,25 @@
 
             return proto, host, path
 
-           
             
         def handle_starttag(self, tag, attrs): 
 
             if tag == 'a' and attrs: 
 
-                link = attrs[0][1] 
 
+                link = [h for h in attrs if h[0] == 'href']
+                if len(link) == 0: return
+                link = link[0][1] 
+
                 if link[:4] != "http": 
                     proto, host, path = self.url_parts(self.url)
 
+                    # Ignore href=javascript:foo and page anchors
+                    if link[:11] == 'javascript:' or link[:1] == '#':
+                        return
+
                     if link[:1] == '/': # full path
-                        link = "%s://%s%s" % (proto, host, link)
+                        path = link
 
                     elif link[:1] == '?': # GET params only
                         res = re.match('(.*)\?.*', path)



More information about the open-ils-commits mailing list