[open-ils-commits] r1215 - in constrictor/trunk: . constrictor samples (erickson)
svn at svn.open-ils.org
Sat Jan 29 11:06:55 EST 2011
Author: erickson
Date: 2011-01-29 11:06:52 -0500 (Sat, 29 Jan 2011)
New Revision: 1215
Added:
constrictor/trunk/samples/web_spider.py
Modified:
constrictor/trunk/constrictor.properties
constrictor/trunk/constrictor/data.py
constrictor/trunk/constrictor/log.py
constrictor/trunk/constrictor/properties.py
constrictor/trunk/constrictor/task.py
Log:
Web spider sample script
Simple web spider script that visits pages, parses them, collects links,
and continues until it has fetched a configured number of pages. Each page
load is a constrictor Task so timing data can be collected.
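For context: a constrictor Task subclasses Task, does its work in run(),
and is invoked via start() so the framework records a duration for each
run. A minimal sketch, with a hypothetical SleepTask standing in for the
PageFetchTask defined in the new sample:

    import time
    from constrictor.task import Task

    class SleepTask(Task):
        # run() holds the work to be timed; the inherited start()
        # wraps it with timing collection.
        def run(self):
            time.sleep(0.1)
            return True

    SleepTask().start()  # reported under the class name; see the task.py change below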
Modified: constrictor/trunk/constrictor/data.py
===================================================================
--- constrictor/trunk/constrictor/data.py 2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor/data.py 2011-01-29 16:06:52 UTC (rev 1215)
@@ -118,7 +118,6 @@
for task in self.runtime_data:
task_times += task['duration']
- log.log_debug("Storing " + task['name'])
if task['name'] not in task_counts:
task_counts[task['name']] = 0;
Modified: constrictor/trunk/constrictor/log.py
===================================================================
--- constrictor/trunk/constrictor/log.py 2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor/log.py 2011-01-29 16:06:52 UTC (rev 1215)
@@ -27,7 +27,7 @@
def log_error(msg=''):
if loglevel < 1: return
from script import ScriptThread
- sys.stderr.write('Error[%d]: %s\n' % (ScriptThread.get_thread_id(), msg))
+ sys.stderr.write('Err [%d]: %s\n' % (ScriptThread.get_thread_id(), msg))
sys.stderr.flush()
def log_info(msg=''):
@@ -38,4 +38,4 @@
def log_debug(msg=''):
if loglevel < 3: return
from script import ScriptThread
- print 'Debug[%d]: %s' % (ScriptThread.get_thread_id(), msg)
+ print 'Debg[%d]: %s' % (ScriptThread.get_thread_id(), msg)
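With both prefixes trimmed to four characters, error and debug output now
line up by column, e.g. (messages illustrative):

    Err [2]: Invalid URL: foo
    Debg[2]: Visited 10 URLs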
Modified: constrictor/trunk/constrictor/properties.py
===================================================================
--- constrictor/trunk/constrictor/properties.py 2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor/properties.py 2011-01-29 16:06:52 UTC (rev 1215)
@@ -9,9 +9,7 @@
- added property name sorting to the store() method
"""
-import sys,os
-import re
-import time
+import sys, os, re, time
class IllegalArgumentException(Exception):
@@ -318,6 +316,27 @@
except KeyError:
if hasattr(self._props,name):
return getattr(self._props, name)
+
+ def get_thread_prop(self, prop, unique=False):
+ from constrictor.script import ScriptThread
+
+ data = self.get_property(prop)
+ data = data.split(',')
+
+ currentThread = ScriptThread.get_thread_id()
+ totalThreads = self.get_property('constrictor.numThreads')
+
+ if len(data) > currentThread:
+ return data[currentThread]
+
+ if unique:
+ raise Exception(
+ "Too many threads for unique data. Thread index is %d, size of dataset is %d" % (
+ currentThread, len(data)))
+
+ # data sharing is OK
+ return data[currentThread % len(data)]
+
if __name__=="__main__":
p = Properties()
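The new get_thread_prop() gives each script thread its own slice of a
comma-separated property value, wrapping around modulo the dataset size
when there are more threads than values, or raising if unique=True. A
usage sketch, assuming the web spider properties added below:

    from constrictor.properties import Properties

    props = Properties.get_properties()
    # With start_url=http://a,http://b,http://c and four threads:
    # threads 0-2 each get their own URL; thread 3 wraps to http://a.
    url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')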
Modified: constrictor/trunk/constrictor/task.py
===================================================================
--- constrictor/trunk/constrictor/task.py 2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor/task.py 2011-01-29 16:06:52 UTC (rev 1215)
@@ -29,7 +29,9 @@
on the actual Task object.
"""
- def __init__(self, name=''):
+ def __init__(self, name=None):
+ if name is None:
+ name = self.__class__.__name__
self.name = name
self.reset()
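A Task constructed without a name now reports under its class name, so
timing data is grouped per task type by default. A small illustration,
with a hypothetical CheckoutTask:

    from constrictor.task import Task

    class CheckoutTask(Task):
        def run(self):
            return True

    CheckoutTask().name    # 'CheckoutTask' (previously '')
    Task('checkout').name  # explicit names still take precedence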
Modified: constrictor/trunk/constrictor.properties
===================================================================
--- constrictor/trunk/constrictor.properties 2011-01-29 16:06:51 UTC (rev 1214)
+++ constrictor/trunk/constrictor.properties 2011-01-29 16:06:52 UTC (rev 1215)
@@ -33,9 +33,21 @@
#logs to stdout and stderr. options are 0=none,1=error,2=info,3=debug
constrictor.loglevel=2
+# ---- Settings for the sample web spider plugin --------------
+# Initial URL. May differ per thread via a comma-separated list
+constrictor.plugin.web_spider.start_url=http://example.org/somepath?foo=bar,http://example.org/somepath?foo=bar2,http://example.org/somepath?foo=bar3
+# Each spider thread will stop crawling after fetching this many pages
+constrictor.plugin.web_spider.max_pages=100
+# Only allow the spider to fetch pages with a certain base path
+constrictor.plugin.web_spider.limit_path=/somepath
+
# ---- Properties for the Evergreen contrib module --------------
# Where on the server can we find the latest IDL file
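Read together: each thread starts from its own start_url entry (extra
threads reuse entries round-robin via get_thread_prop()), follows only
links whose path begins with /somepath, and stops after 100 page fetches.
A hypothetical minimal configuration for a single local test thread:

    constrictor.plugin.web_spider.start_url=http://localhost/somepath
    constrictor.plugin.web_spider.max_pages=10
    constrictor.plugin.web_spider.limit_path=/somepath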
Added: constrictor/trunk/samples/web_spider.py
===================================================================
--- constrictor/trunk/samples/web_spider.py (rev 0)
+++ constrictor/trunk/samples/web_spider.py 2011-01-29 16:06:52 UTC (rev 1215)
@@ -0,0 +1,153 @@
+#!/usr/bin/python
+# --------------------------------------------------------------
+# Simple web spider sample. Each page fetch is wrapped in a
+# constrictor Task so per-page timing data can be collected.
+# --------------------------------------------------------------
+
+import random, time, sys, re
+import urllib2
+from HTMLParser import HTMLParser
+from constrictor.task import Task
+from constrictor.script import Script, ScriptManager
+from constrictor.properties import Properties
+from constrictor.log import *
+
+
+class PageFetchTask(Task):
+ def __init__(self, spider, name=None):
+ Task.__init__(self, name)
+ self.spider = spider
+
+ def run(self):
+ # fetch a single page
+ return self.spider.fetch_url()
+
+class WebSpiderScript(Script):
+
+ # Heavily modified version of the script found at
+ # http://www.halotis.com/2009/09/16/python-web-crawler-script/
+
+ class Spider(HTMLParser):
+
+ def __init__(self, url, max_visits, limit_path='', allowed_hosts=None):
+
+ HTMLParser.__init__(self)
+ self.url = url
+ self.db = {self.url: 1}
+ self.url_list = [self.url]
+ self.max_visits = max_visits
+ self.allowed_hosts = allowed_hosts or [] # avoid a shared mutable default
+ proto, self.host, path = self.url_parts(url)
+ self.limit_path = limit_path
+
+ if self.host not in self.allowed_hosts:
+ self.allowed_hosts.append(self.host)
+
+ def url_parts(self, url):
+ res = re.search(r'^(https?)://([^/]+)(.*)', url)
+ if res is None:
+ # re.search returns None (not IndexError) on a non-match
+ raise Exception("Invalid URL: %s" % url)
+
+ proto = res.group(1)
+ host = res.group(2)
+ path = res.group(3) # may be empty
+
+ return proto, host, path
+
+
+
+ def handle_starttag(self, tag, attrs):
+
+ if tag == 'a' and attrs:
+
+ # href is not guaranteed to be the first attribute
+ link = dict(attrs).get('href')
+ if not link:
+ return
+
+ if link[:4] != "http":
+ proto, host, path = self.url_parts(self.url)
+
+ if link[:1] == '/': # full path
+ link = "%s://%s%s" % (proto, host, link)
+
+ elif link[:1] == '?': # GET params only
+ res = re.match(r'(.*)\?', path)
+ # replace existing GET params, or append if there are none
+ path = "%s%s" % (res.group(1) if res else path, link)
+
+ else: # relative path: swap the last path segment for the link
+ path = path[:path.rfind('/') + 1] + link
+
+ link = "%s://%s%s" % (proto, host, path)
+
+ if link not in self.db:
+ self.url_list.append(link)
+
+ self.db[link] = (self.db.get(link) or 0) + 1
+
+ def fetch_url(self):
+ req = urllib2.urlopen(self.url)
+ return req.read()
+
+ def crawl(self):
+
+ visited = 0
+
+ for self.url in self.url_list:
+
+ if visited >= self.max_visits: break
+
+ log_debug("Visited %d URLs" % visited)
+
+ proto, host, path = self.url_parts(self.url)
+
+ if host not in self.allowed_hosts:
+ log_info("Skipping remote host %s..." % host)
+ continue
+
+ if self.limit_path and not path.startswith(self.limit_path):
+ log_info("Skipping forbidden base path %s..." % path)
+ continue
+
+ try:
+ log_info("Opening URL %s" % self.url)
+ res = PageFetchTask(self).start()
+ self.reset()
+ self.feed(res)
+ visited += 1
+ except Exception, e:
+ log_error("Error fetching or parsing %s: %s" % (self.url, e))
+ self.reset()
+
+ log_info("Found %d distinct URLs" % len(self.db.keys()))
+
+ def run(self):
+
+ props = Properties.get_properties()
+ start_url = props.get_thread_prop('constrictor.plugin.web_spider.start_url')
+ max_pages = props.get_property('constrictor.plugin.web_spider.max_pages')
+ limit_path = props.get_property('constrictor.plugin.web_spider.limit_path')
+
+ if not start_url or not max_pages:
+ log_error("Missing required properties: " +
+ "constrictor.plugin.web_spider.start_url, constrictor.plugin.web_spider.max_pages")
+ return False
+
+ spider = WebSpiderScript.Spider(start_url, int(max_pages), limit_path)
+ spider.crawl()
+
+ return True
+
+# Launch the script
+ScriptManager.go(WebSpiderScript())
+
+
Property changes on: constrictor/trunk/samples/web_spider.py
___________________________________________________________________
Name: svn:executable
+ *