[open-ils-commits] r1411 - in constrictor/trunk: . samples (erickson)
svn at svn.open-ils.org
svn at svn.open-ils.org
Thu Apr 21 10:40:49 EDT 2011
Author: erickson
Date: 2011-04-21 10:40:46 -0400 (Thu, 21 Apr 2011)
New Revision: 1411
Modified:
constrictor/trunk/constrictor.properties
constrictor/trunk/samples/web_spider.py
Log:
Allow a comma-separated list of limit_paths values for tighter control over which resources the spider will crawl.
Modified: constrictor/trunk/constrictor.properties
===================================================================
--- constrictor/trunk/constrictor.properties 2011-04-20 19:41:07 UTC (rev 1410)
+++ constrictor/trunk/constrictor.properties 2011-04-21 14:40:46 UTC (rev 1411)
@@ -42,7 +42,7 @@
constrictor.plugin.web_spider.max_pages=100
# Only allow the spider to fetch pages with a certain base path
-constrictor.plugin.web_spider.limit_path=/somepath
+constrictor.plugin.web_spider.limit_paths=/somepath,/otherpath
Modified: constrictor/trunk/samples/web_spider.py
===================================================================
--- constrictor/trunk/samples/web_spider.py 2011-04-20 19:41:07 UTC (rev 1410)
+++ constrictor/trunk/samples/web_spider.py 2011-04-21 14:40:46 UTC (rev 1411)
@@ -29,7 +29,7 @@
class Spider(HTMLParser):
- def __init__(self, url, max_visits, limit_path='', allowed_hosts=[]):
+ def __init__(self, url, max_visits, limit_paths, allowed_hosts=[]):
HTMLParser.__init__(self)
self.url = url
@@ -38,7 +38,7 @@
self.max_visits = max_visits
self.allowed_hosts = allowed_hosts
proto, self.host, path = self.url_parts(url)
- self.limit_path = limit_path
+ self.limit_paths = limit_paths
try:
foo = self.allowed_hosts.index(self.host)
@@ -120,11 +120,16 @@
log_info("Skipping remote host %s..." % host)
continue
- if self.limit_path:
- if path[:len(self.limit_path)] != self.limit_path:
- log_info("Skipping forbidden base path %s..." % path)
- continue
+ valid = False;
+ for lpath in self.limit_paths:
+ if path[:len(lpath)] == lpath:
+ valid = True
+ break
+ if not valid:
+ log_info("Skipping forbidden base path %s..." % path)
+ continue
+
try:
log_info("Opening URL %s" % self.url)
res = PageFetchTask(self).start()
@@ -141,14 +146,15 @@
props = Properties.get_properties()
start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
- limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+ limit_paths = props.get_property('constrictor.plugin.web_spider.limit_paths')
+ limit_paths = limit_paths.split(',')
if not start_url or not max_pages:
log_error("Missing required properties: " +
"constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
return False
- spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+ spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_paths)
result = spider.crawl()
return True
More information about the open-ils-commits mailing list