[open-ils-commits] r1215 - in constrictor/trunk: . constrictor samples (erickson)

svn at svn.open-ils.org svn at svn.open-ils.org
Sat Jan 29 11:06:55 EST 2011

Author: erickson
Date: 2011-01-29 11:06:52 -0500 (Sat, 29 Jan 2011)
New Revision: 1215

Web spider sample script

Simple web spider script that visits, reports, parses, collects links,
and continues until it has fetched a configured number of pages.  Each page
load is a constrictor Task so timing data can be collected.

Modified: constrictor/trunk/constrictor/data.py
--- constrictor/trunk/constrictor/data.py	2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor/data.py	2011-01-29 16:06:52 UTC (rev 1215)
@@ -118,7 +118,6 @@
         for task in self.runtime_data:
             task_times += task['duration']
-            log.log_debug("Storing " + task['name'])
             if task['name'] not in task_counts:
                 task_counts[task['name']] = 0;

Modified: constrictor/trunk/constrictor/log.py
--- constrictor/trunk/constrictor/log.py	2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor/log.py	2011-01-29 16:06:52 UTC (rev 1215)
@@ -27,7 +27,7 @@
 def log_error(msg=''):
     if loglevel < 1: return
     from script import ScriptThread
-    sys.stderr.write('Error[%d]: %s\n' % (ScriptThread.get_thread_id(), msg))
+    sys.stderr.write('Err [%d]: %s\n' % (ScriptThread.get_thread_id(), msg))
 def log_info(msg=''):
@@ -38,4 +38,4 @@
 def log_debug(msg=''):
     if loglevel < 3: return
     from script import ScriptThread
-    print 'Debug[%d]: %s' % (ScriptThread.get_thread_id(), msg)
+    print 'Debg[%d]: %s' % (ScriptThread.get_thread_id(), msg)

Modified: constrictor/trunk/constrictor/properties.py
--- constrictor/trunk/constrictor/properties.py	2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor/properties.py	2011-01-29 16:06:52 UTC (rev 1215)
@@ -9,9 +9,7 @@
     - added property name sorting to the store() method
-import sys,os
-import re
-import time
+import sys,os, re, time
 class IllegalArgumentException(Exception):
@@ -318,6 +316,27 @@
         except KeyError:
             if hasattr(self._props,name):
                 return getattr(self._props, name)
+    def get_thread_prop(self, prop, unique=False):
+        """Return the calling thread's entry of a comma-separated property.
+
+        The value of property `prop` is split on ',' and indexed with the
+        ID of the current ScriptThread.
+
+        prop   -- name of the property holding the comma-separated list
+        unique -- when True, every thread must get its own entry; an
+                  Exception is raised if the thread ID is beyond the end
+                  of the list.  When False, entries are shared by
+                  wrapping around (thread ID modulo list length).
+
+        Returns the selected entry as a string, or None when the
+        property has no value.
+        """
+        from constrictor.script import ScriptThread
+
+        raw = self.get_property(prop)
+        if raw is None:
+            # NOTE(review): assumes get_property() yields None for an unset
+            # property -- confirm.  Returning None here lets callers report
+            # the missing property themselves instead of crashing on split().
+            return None
+
+        data = raw.split(',')
+        # NOTE(review): thread IDs are presumably zero-based ints -- confirm.
+        currentThread = ScriptThread.get_thread_id()
+
+        if currentThread < len(data):
+            return data[currentThread]
+
+        if unique:
+            raise Exception(
+                "Too many threads for unique data.  Thread index is %d, size of dataset is %d" % (
+                    currentThread, len(data)))
+
+        # data sharing is OK: wrap around and reuse an earlier entry
+        return data[currentThread % len(data)]
 if __name__=="__main__":
     p = Properties()

Modified: constrictor/trunk/constrictor/task.py
--- constrictor/trunk/constrictor/task.py	2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor/task.py	2011-01-29 16:06:52 UTC (rev 1215)
@@ -29,7 +29,9 @@
         on the actual Task object.
-    def __init__(self, name=''):
+    def __init__(self, name=None):
+        if name is None:
+            name = self.__class__.__name__
         self.name = name

Modified: constrictor/trunk/constrictor.properties
--- constrictor/trunk/constrictor.properties	2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor.properties	2011-01-29 16:06:52 UTC (rev 1215)
@@ -33,9 +33,21 @@
 #logs to stdout and stderr.  options are 0=none,1=error,2=info,3=debug
+# ---- Settings for the sample web spider plugin  --------------
+# Initial URL.  Can be different per thread w/ comma-separated list
+# Each spider thread will stop crawling after fetching this many pages
+# Only allow the spider to fetch pages with a certain base path
 # ---- Properties for the Evergreen contrib module --------------
 # Where on the server can we find the latest IDL file

Added: constrictor/trunk/samples/web_spider.py
--- constrictor/trunk/samples/web_spider.py	                        (rev 0)
+++ constrictor/trunk/samples/web_spider.py	2011-01-29 16:06:52 UTC (rev 1215)
@@ -0,0 +1,153 @@
+# --------------------------------------------------------------
+# Simple web spider sample.  Each page load is a constrictor Task
+# so timing data can be collected
+# --------------------------------------------------------------
+import random, time, sys, re
+import urllib2
+from HTMLParser import HTMLParser 
+from constrictor.task import Task
+from constrictor.script import Script, ScriptManager
+from constrictor.properties import Properties
+from constrictor.log import *
+class PageFetchTask(Task):
+    """Task that performs one page load on behalf of a spider."""
+
+    def __init__(self, spider, name=None):
+        """Create the task.
+
+        spider -- object providing fetch_url(); it is stored on the task
+        name   -- passed through unchanged to Task.__init__
+        """
+        Task.__init__(self, name)
+        self.spider = spider
+
+    def run(self):
+        """Fetch a single page and return what spider.fetch_url() returns."""
+        page = self.spider.fetch_url()
+        return page
+class WebSpiderScript(Script):
+    """Sample script that crawls a web site, one PageFetchTask per page.
+
+    Settings are read from the constrictor.plugin.web_spider.* properties:
+    start_url (looked up per thread), max_pages and limit_path.
+    """
+
+    # Heavily modified version of the script found at 
+    # http://www.halotis.com/2009/09/16/python-web-crawler-script/
+    class Spider(HTMLParser):
+        """HTML parser that collects <a href> links and follows them."""
+
+        def __init__(self, url, max_visits, limit_path='', allowed_hosts=None):
+            """Set up a crawl starting at `url`.
+
+            url           -- initial http(s) URL; its host is always allowed
+            max_visits    -- number of pages to fetch before stopping
+            limit_path    -- if non-empty, only paths starting with this
+                             prefix are fetched
+            allowed_hosts -- optional list of additional hosts to crawl
+
+            Raises ValueError if `url` is not an http(s) URL.
+            """
+            HTMLParser.__init__(self)
+            self.url = url
+            # link -> number of times the link has been seen
+            self.db = {self.url: 1}
+            # crawl queue; crawl() iterates it while handle_starttag() appends
+            self.url_list = [self.url]
+            self.max_visits = max_visits
+            self.limit_path = limit_path
+            # Copy the list: a shared mutable default (or the caller's own
+            # list) must not pick up hosts appended by this instance.
+            self.allowed_hosts = list(allowed_hosts or [])
+
+            proto, self.host, path = self.url_parts(url)
+            if self.host not in self.allowed_hosts:
+                self.allowed_hosts.append(self.host)
+
+        def url_parts(self, url):
+            """Split an http(s) URL into (proto, host, path).
+
+            `path` is everything after the host (query string included) and
+            is '' when the URL has none.  Raises ValueError for anything that
+            is not an http:// or https:// URL.
+            """
+            res = re.search(r'^(https?)://([^\/]+)(.*)', url)
+            if res is None:
+                # re.search() returns None on no match; report that as the
+                # intended error instead of failing on res.group().
+                raise ValueError("Invalid URL: %s" % url)
+            return res.group(1), res.group(2), res.group(3)
+
+        def handle_starttag(self, tag, attrs):
+            """Record the target of every <a href="..."> tag.
+
+            The href is resolved against the page currently being parsed
+            (self.url).  Links not seen before are queued on url_list, and
+            the per-link counter in self.db is incremented.
+            """
+            if tag != 'a':
+                return
+
+            # Look the href up by name; it is not always the first attribute,
+            # and anchors such as <a name="x"> have none at all.
+            href = dict(attrs).get('href')
+            if not href:
+                return
+
+            # urljoin handles absolute URLs, "/abs/path", "?query" and
+            # relative paths uniformly.
+            from urlparse import urljoin
+            link = urljoin(self.url, href.strip())
+
+            if link not in self.db:
+                self.url_list.append(link)
+            self.db[link] = self.db.get(link, 0) + 1
+
+        def fetch_url(self):
+            """Fetch self.url and return the response body."""
+            response = urllib2.urlopen(self.url)
+            try:
+                return response.read()
+            finally:
+                # always release the connection, even if read() fails
+                response.close()
+
+        def crawl(self):
+            """Fetch pages from url_list until max_visits pages were loaded.
+
+            URLs that are invalid, on a host outside allowed_hosts, or
+            outside limit_path are skipped.  A page that fails to load or
+            parse is logged and does not count as a visit.
+            """
+            visited = 0
+            # url_list grows during iteration: feeding a page to the parser
+            # calls handle_starttag(), which appends newly found links.
+            for self.url in self.url_list:
+                if visited >= self.max_visits:
+                    break
+                log_debug("Visited %d URLs" % visited)
+
+                try:
+                    proto, host, path = self.url_parts(self.url)
+                except ValueError:
+                    # e.g. mailto: or javascript: links collected from a page
+                    log_info("Skipping invalid URL %s..." % self.url)
+                    continue
+
+                if host not in self.allowed_hosts:
+                    log_info("Skipping remote host %s..." % host)
+                    continue
+
+                if self.limit_path and not path.startswith(self.limit_path):
+                    log_info("Skipping forbidden base path %s..." % path)
+                    continue
+
+                log_info("Opening URL %s" % self.url)
+                try:
+                    res = PageFetchTask(self).start()
+                    self.reset()
+                    self.feed(res)
+                    visited += 1
+                except Exception:
+                    # Best effort: one bad page must not end the crawl, but
+                    # say what went wrong.  KeyboardInterrupt and SystemExit
+                    # are deliberately not caught.
+                    log_error("Failed to load %s: %s" % (
+                        self.url, sys.exc_info()[1]))
+                    self.reset()
+
+            log_info("Found %d distinct URLs" % len(self.db))
+
+    def run(self):
+        """Read the spider settings and run one crawl.
+
+        Returns False when start_url or max_pages is not configured,
+        True once the crawl has finished.
+        """
+        props = Properties.get_properties()
+        start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
+        max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
+        limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+
+        if not start_url or not max_pages:
+            log_error("Missing required properties: " +
+                "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
+            return False
+
+        spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+        spider.crawl()
+        return True
+# Launch the script

Property changes on: constrictor/trunk/samples/web_spider.py
Name: svn:executable
   + *

More information about the open-ils-commits mailing list