[open-ils-commits] r1411 - in constrictor/trunk: . samples (erickson)

svn at svn.open-ils.org svn at svn.open-ils.org
Thu Apr 21 10:40:49 EDT 2011


Author: erickson
Date: 2011-04-21 10:40:46 -0400 (Thu, 21 Apr 2011)
New Revision: 1411

Modified:
   constrictor/trunk/constrictor.properties
   constrictor/trunk/samples/web_spider.py
Log:
allow for a list of limit_paths options for tighter control over which resources to crawl

Modified: constrictor/trunk/constrictor.properties
===================================================================
--- constrictor/trunk/constrictor.properties	2011-04-20 19:41:07 UTC (rev 1410)
+++ constrictor/trunk/constrictor.properties	2011-04-21 14:40:46 UTC (rev 1411)
@@ -42,7 +42,7 @@
 constrictor.plugin.web_spider.max_pages=100
 
 # Only allow the spider to fetch pages with a certain base path
-constrictor.plugin.web_spider.limit_path=/somepath
+constrictor.plugin.web_spider.limit_paths=/somepath,/otherpath
 
 
 

Modified: constrictor/trunk/samples/web_spider.py
===================================================================
--- constrictor/trunk/samples/web_spider.py	2011-04-20 19:41:07 UTC (rev 1410)
+++ constrictor/trunk/samples/web_spider.py	2011-04-21 14:40:46 UTC (rev 1411)
@@ -29,7 +29,7 @@
 
     class Spider(HTMLParser): 
 
-        def __init__(self, url, max_visits, limit_path='', allowed_hosts=[]): 
+        def __init__(self, url, max_visits, limit_paths, allowed_hosts=[]): 
 
             HTMLParser.__init__(self) 
             self.url = url
@@ -38,7 +38,7 @@
             self.max_visits = max_visits
             self.allowed_hosts = allowed_hosts
             proto, self.host, path = self.url_parts(url)
-            self.limit_path = limit_path
+            self.limit_paths = limit_paths
 
             try:
                 foo = self.allowed_hosts.index(self.host)
@@ -120,11 +120,16 @@
                     log_info("Skipping remote host %s..." % host)
                     continue
 
-                if self.limit_path:
-                    if path[:len(self.limit_path)] != self.limit_path:
-                        log_info("Skipping forbidden base path %s..." % path)
-                        continue
+                valid = False;
+                for lpath in self.limit_paths:
+                    if path[:len(lpath)] == lpath:
+                        valid = True
+                        break
 
+                if not valid:
+                    log_info("Skipping forbidden base path %s..." % path)
+                    continue
+
                 try: 
                     log_info("Opening URL %s" % self.url)              
                     res = PageFetchTask(self).start()
@@ -141,14 +146,15 @@
         props = Properties.get_properties()
         start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
         max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
-        limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+        limit_paths = props.get_property('constrictor.plugin.web_spider.limit_paths')
+        limit_paths = limit_paths.split(',')
 
         if not start_url or not max_pages:
             log_error("Missing required properties: " +
                 "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
             return False
         
-        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_paths)
         result = spider.crawl() 
 
         return True



More information about the open-ils-commits mailing list