[open-ils-commits] r1170 - in servres/trunk: . conifer conifer/uwindsor_migration eres (gfawcett)
svn at svn.open-ils.org
Sat Jan 8 19:56:08 EST 2011
Author: gfawcett
Date: 2011-01-08 19:56:07 -0500 (Sat, 08 Jan 2011)
New Revision: 1170
Added:
servres/trunk/conifer/uwindsor_migration/
servres/trunk/conifer/uwindsor_migration/eres.py
servres/trunk/conifer/uwindsor_migration/metadata.py
servres/trunk/eres/
servres/trunk/eres/.gitignore
Modified:
servres/trunk/.gitignore
Log:
uwindsor_migration/, containing scripts for migrating ERES course-reserve data: an ERES scraper (eres.py) and a metadata extractor (metadata.py). (Remove this later.)
Modified: servres/trunk/.gitignore
===================================================================
--- servres/trunk/.gitignore 2011-01-07 20:46:01 UTC (rev 1169)
+++ servres/trunk/.gitignore 2011-01-09 00:56:07 UTC (rev 1170)
@@ -14,3 +14,4 @@
*~
/conifer/test.db
/conifer/syrup/test.db
+/conifer/uwindsor_migration/data/
Added: servres/trunk/conifer/uwindsor_migration/eres.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/eres.py (rev 0)
+++ servres/trunk/conifer/uwindsor_migration/eres.py 2011-01-09 00:56:07 UTC (rev 1170)
@@ -0,0 +1,101 @@
+#!/usr/bin/env python
+
+# This script scrapes ERES and saves raw content to the 'data' directory.
+
+from subprocess import *
+import os
+import re
+import sys
+
+import warnings
+warnings.filterwarnings('ignore') # to avoid some twill import noise.
+
+from twill.commands import *
+from twill import get_browser
+
+try:
+    username = os.environ['ERESUSER']
+    password = os.environ['ERESPASS']
+except KeyError:
+    print
+    print 'Example usage:'
+    print ' ERESUSER=xxxx ERESPASS=xxx %s <coursecode>' % sys.argv[0]
+    print
+    print 'Course codes are like CRIM48-567, as they appear in the ERES interface.'
+    print
+    print 'Fancier usage: '
+    print ' export ERESUSER=xxx; export ERESPASS=xxx'
+    print ' export CODES="coursecode1 coursecode2 coursecode3 ..."'
+    print ' for code in $CODES; do %s $code; done' % sys.argv[0]
+    raise SystemExit
+
+browser = get_browser()
+
+redirect_output('/dev/null')
+go('http://ereserves.uwindsor.ca/eres/login.aspx')
+
+fv(1, 3, username)   # form 1, field 3: username
+fv(1, 4, password)   # form 1, field 4: password
+submit(5)            # field 5: the login form's submit button
+
+go('http://ereserves.uwindsor.ca/eres/courses.aspx')
+
+COURSE = sys.argv[1]
+
+follow(COURSE)
+
+PATH = 'data/%s' % COURSE
+
+try:
+    os.makedirs(PATH)
+except OSError:
+    pass
+
+submit(3) # 'accept' on the License page
+
+follow('Documents')
+BASE = url('.*').rsplit('/', 1)[0]   # current URL with the final path segment stripped
+
+filename = '%s/items.html' % PATH
+save_html(filename)
+html = open(filename).read()
+
+save_cookies('%s/c' % PATH)
+log = open('%s/log' % PATH, 'w')
+
+itemlinkpat = re.compile(r"documentview.aspx\?cid=(\d+)&associd=(\d+)")
+done = set()
+
+n = 0
+for (cid, aid) in itemlinkpat.findall(html):
+    if (cid, aid) in done:
+        continue
+
+    itemurl = "%s/documentview.aspx?cid=%s&associd=%s" % (BASE, cid, aid)
+    print n, itemurl
+    go(itemurl)
+
+    filename = '%s/item%03d.html' % (PATH, n)
+    save_html(filename)
+    html = open(filename).read()
+
+    linkpat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information</a>""")
+    m = linkpat.search(html)
+    if m:
+        print >> log, (n, 'link', m.groups())
+    else:
+        filepat = re.compile(r"""onClick="javascript:pop\('(download.aspx\?docID=(\d+)&shortname=(.*?))'""")
+        m = filepat.search(html)
+        if m:
+            print >> log, (n, 'file', m.groups())
+            urlpath, itemid, origfile = m.groups()
+            binary_url = '%s/%s' % (BASE, urlpath)
+            cookie = browser.cj[0]
+            destfile = '%s/data%03d' % (PATH, n)
+            cmd = 'curl -s -b "%s=%s" "%s" > %s' % (cookie.name, cookie.value, binary_url, destfile)
+            os.system(cmd)
+    back()
+    done.add((cid, aid))
+    n += 1
+
+log.close()
Property changes on: servres/trunk/conifer/uwindsor_migration/eres.py
___________________________________________________________________
Name: svn:executable
+ *
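
For scraping several courses in one go, the same pattern as the 'Fancier usage' loop printed by eres.py can be driven from Python instead of the shell. A minimal sketch, assuming eres.py sits in the current directory, ERESUSER/ERESPASS are available, and that the course codes below are placeholders:

#!/usr/bin/env python
# Batch driver: run eres.py once per course code (placeholder codes).
import os
import subprocess

courses = ['CRIM48-567', 'SOCL10-200']        # hypothetical course codes

env = dict(os.environ)
env.setdefault('ERESUSER', 'xxxx')            # normally exported beforehand
env.setdefault('ERESPASS', 'xxxx')

for code in courses:
    # each run saves raw HTML, a log and downloaded files under data/<code>/
    subprocess.call(['python', 'eres.py', code], env=env)

Each course ends up in its own data/<coursecode>/ directory, which is what metadata.py below expects to read.
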
Added: servres/trunk/conifer/uwindsor_migration/metadata.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/metadata.py (rev 0)
+++ servres/trunk/conifer/uwindsor_migration/metadata.py 2011-01-09 00:56:07 UTC (rev 1170)
@@ -0,0 +1,73 @@
+# Once ERES has been scraped, the Metadata class can extract items'
+# metadata from the scraped HTML.
+
+from pprint import pprint
+import re
+import os
+
+class Metadata(object):
+
+    def __init__(self, path):
+        self._path = path
+        self.html = open(path).read()
+        self.localid = re.search(r'item(\d+)', self._path).group(1)
+        self._scrape()
+        del self.html
+
+    @property
+    def data(self):
+        return self.__dict__
+
+    def __scrape(self, **kwargs):
+        for name, pat in kwargs.items():
+            try:
+                setattr(self, name, re.search(pat, self.html).group(1).strip())
+            except AttributeError:  # pattern not found on this page
+                pass
+
+    def _scrape(self):
+        self.__scrape(
+            title=r'<td align="left" nowrap="nowrap">Title:</td><td align="left" width="100%">(.*?)<',
+            source_title=r'<td align="left" nowrap="nowrap">Title Primary:</td><td align="left" width="100%">(.*?)<',
+            journal=r'<td align="left" nowrap="nowrap">Journal:</td><td align="left" width="100%">(.*?)<',
+            volume=r'<td align="left" nowrap="nowrap">Volume:</td><td align="left" width="100%">(.*?)<',
+            issue=r'<td align="left" nowrap="nowrap">Issue:</td><td align="left" width="100%">(.*?)<',
+            author=r'<td align="left" nowrap="nowrap">Author Primary:</td><td align="left" width="100%">(.*?)<',
+            author2=r'<td align="left" nowrap="nowrap">Author Secondary:</td><td align="left" width="100%">(.*?)<',
+            pages=r'<td align="left" nowrap="nowrap">Page Range / Chapter:</td><td align="left" width="100%">(.*?)<',
+            publisher=r'<td align="left" nowrap="nowrap">Publisher:</td><td align="left" width="100%">(.*?)<',
+            published=r'<td align="left" nowrap="nowrap">Date Published:</td><td align="left" width="100%">(.*?)<',
+            course=r'<td class="HEADER1" valign="middle" align="left" height="25"> (.*?) -',
+            instructor=r'<td class="HEADER1" valign="middle" align="left" height="25"> .*? - .*? - (.*?)<',
+            term=r'<td class="HEADER1" valign="middle" align="left" height="25"> .*? - .*? \((.*?)\)',
+        )
+        if hasattr(self, 'journal'):
+            self.source_title = self.journal
+            del self.journal
+
+        pat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information</a>""")
+        m = pat.search(self.html)
+        if m:
+            self.type = 'url'
+            self.url = m.group(1)
+        else:
+            pat = re.compile(r"""onClick="javascript:pop\('(download.aspx\?docID=(\d+)&shortname=(.*?))'""")
+            m = pat.search(self.html)
+            if m:
+                self.type = 'file'
+                urlpath, itemid, origfile = m.groups()
+                self.filename = origfile
+                datafile = re.sub(r'(.*)/item(\d+).html',
+                                  r'\1/data\2', self._path)
+                datafile = os.path.abspath(datafile)
+                self.datafile = datafile
+
+
+
+if __name__ == '__main__':
+    items = []
+    for name in os.popen('find data -name "item[0-9]*.html"').readlines():
+        name = name.strip()
+        m = Metadata(name)
+        items.append(m)
+        pprint(m.data)
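
Downstream of the scrape, per-course records can be gathered much as metadata.py's __main__ block does. A minimal sketch, assuming eres.py has already populated data/<coursecode>/; the items.json output name is only an example:

# Collect the metadata for one scraped course and dump it as JSON
# so a later migration step can import it.
import json
import os
import re

from metadata import Metadata

course_dir = 'data/CRIM48-567'                # populated earlier by eres.py
items = []
for fname in sorted(os.listdir(course_dir)):
    if re.match(r'item\d+\.html$', fname):    # skip items.html, log, data files
        items.append(Metadata(os.path.join(course_dir, fname)).data)

out = open(os.path.join(course_dir, 'items.json'), 'w')
json.dump(items, out, indent=2)
out.close()
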
Added: servres/trunk/eres/.gitignore
===================================================================
--- servres/trunk/eres/.gitignore (rev 0)
+++ servres/trunk/eres/.gitignore 2011-01-09 00:56:07 UTC (rev 1170)
@@ -0,0 +1,5 @@
+/data/
+*#
+*~
+*.pyc
+.#*