[open-ils-commits] r1170 - in servres/trunk: . conifer conifer/uwindsor_migration eres (gfawcett)

Sat Jan 8 19:56:08 EST 2011

Author: gfawcett
Date: 2011-01-08 19:56:07 -0500 (Sat, 08 Jan 2011)
New Revision: 1170

Added:
   servres/trunk/conifer/uwindsor_migration/
   servres/trunk/conifer/uwindsor_migration/eres.py
   servres/trunk/conifer/uwindsor_migration/metadata.py
   servres/trunk/eres/
   servres/trunk/eres/.gitignore
Modified:
   servres/trunk/.gitignore
Log:
uwindsor_migration/, containing some migrational stuff. (Remove this later.)

Modified: servres/trunk/.gitignore
===================================================================

--- servres/trunk/.gitignore	2011-01-07 20:46:01 UTC (rev 1169)
+++ servres/trunk/.gitignore	2011-01-09 00:56:07 UTC (rev 1170)
@@ -14,3 +14,4 @@
 *~
 /conifer/test.db
 /conifer/syrup/test.db
+/conifer/uwindsor_migration/data/

Added: servres/trunk/conifer/uwindsor_migration/eres.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/eres.py	                        (rev 0)
+++ servres/trunk/conifer/uwindsor_migration/eres.py	2011-01-09 00:56:07 UTC (rev 1170)
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+
+# This script scrapes ERES and saves raw content to the 'data' directory.
+#
+# For the course code given on the command line it logs in to ERES, saves
+# the course's item listing and every item page as HTML under
+# data/<coursecode>/, and downloads any attached document with curl.
+
+from subprocess import *
+import os
+import re
+import subprocess
+import sys
+
+import warnings
+warnings.filterwarnings('ignore') # to avoid some twill import noise.
+
+from twill.commands import *
+from twill import get_browser
+
+
+def read_file(path):
+    """Return the whole contents of the file at `path`, then close it."""
+    f = open(path)
+    try:
+        return f.read()
+    finally:
+        f.close()
+
+
+# Check the credentials *and* the course code before touching the network,
+# so a missing argument shows the usage text instead of raising IndexError
+# after we have already logged in.
+try:
+    username = os.environ['ERESUSER']
+    password = os.environ['ERESPASS']
+    COURSE = sys.argv[1]
+except (KeyError, IndexError):
+    print
+    print 'Example usage:'
+    print ' ERESUSER=xxxx ERESPASS=xxx %s <coursecode>' % sys.argv[0]
+    print
+    print 'Course codes are like CRIM48-567, as they appear in the ERES interface.'
+    print
+    print 'Fancier usage: '
+    print ' export ERESUSER=xxx; export ERESPASS=xxx'
+    print ' export CODES="coursecode1 coursecode2 coursecode3 ..."'
+    print ' for code in $CODES; do %s $code; done' % sys.argv[0]
+    raise SystemExit
+
+browser = get_browser()
+
+redirect_output('/dev/null')
+go('http://ereserves.uwindsor.ca/eres/login.aspx')
+
+# Fill the credentials into form 1 and submit it.
+fv(1, 3, username)
+fv(1, 4, password)
+submit(5)
+
+go('http://ereserves.uwindsor.ca/eres/courses.aspx')
+
+follow(COURSE)
+
+PATH = 'data/%s' % COURSE
+
+try:
+    os.makedirs(PATH)
+except OSError:
+    # An already-existing directory (e.g. on a re-run) is fine; any other
+    # failure, such as a permissions problem, is a real error.
+    if not os.path.isdir(PATH):
+        raise
+
+submit(3)                       # 'accept' on the License page
+
+follow('Documents')
+BASE = url('.*').rsplit('/', 1)[0]
+
+filename = '%s/items.html' % PATH
+save_html(filename)
+html = read_file(filename)
+
+save_cookies('%s/c' % PATH)
+
+# Compiled once here rather than on every pass through the loop below.
+itemlinkpat = re.compile(r"documentview.aspx\?cid=(\d+)&associd=(\d+)")
+linkpat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information</a>""")
+filepat = re.compile(r"""onClick="javascript:pop\('(download.aspx\?docID=(\d+)&shortname=(.*?))'""")
+done = set()
+
+log = open('%s/log' % PATH, 'w')
+try:
+    n = 0
+    for (cid, aid) in itemlinkpat.findall(html):
+        # Skip (cid, associd) pairs that were already processed.
+        if (cid, aid) in done:
+            continue
+
+        itemurl = "%s/documentview.aspx?cid=%s&associd=%s" % (BASE, cid, aid)
+        print n, itemurl
+        go(itemurl)
+
+        filename = '%s/item%03d.html' % (PATH, n)
+        save_html(filename)
+        item_html = read_file(filename)
+
+        m = linkpat.search(item_html)
+        if m:
+            print >> log, (n, 'link', m.groups())
+        else:
+            m = filepat.search(item_html)
+            if m:
+                print >> log, (n, 'file', m.groups())
+                urlpath, itemid, origfile = m.groups()
+                binary_url = '%s/%s' % (BASE, urlpath)
+                cookie = browser.cj[0]
+                destfile = '%s/data%03d' % (PATH, n)
+                # Run curl with an argument list (no shell), so that
+                # characters in the scraped URL or the cookie value can
+                # never be interpreted as shell syntax.
+                out = open(destfile, 'wb')
+                try:
+                    subprocess.call(['curl', '-s',
+                                     '-b', '%s=%s' % (cookie.name, cookie.value),
+                                     binary_url],
+                                    stdout=out)
+                finally:
+                    out.close()
+        back()
+        done.add((cid, aid))
+        n += 1
+finally:
+    # Close the log even if a page fetch or download fails part-way.
+    log.close()


Property changes on: servres/trunk/conifer/uwindsor_migration/eres.py
___________________________________________________________________
Name: svn:executable
   + *

Added: servres/trunk/conifer/uwindsor_migration/metadata.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/metadata.py	                        (rev 0)
+++ servres/trunk/conifer/uwindsor_migration/metadata.py	2011-01-09 00:56:07 UTC (rev 1170)
@@ -0,0 +1,106 @@
+# After having scraped ERES, the Metadata class can extract items'
+# metadata from the scraped HTML.
+
+from pprint import pprint
+import re
+import os
+
+class Metadata(object):
+    """Metadata scraped from one saved ERES item page.
+
+    Every field whose pattern is found in the page becomes an instance
+    attribute (title, author, course, ...); fields that are not found
+    are left unset.  `type` is set to 'url' or 'file' when the page
+    carries a 'more information' link or a download link.
+    """
+
+    def __init__(self, path):
+        """Read and scrape the item page stored at `path`.
+
+        The file name is expected to contain 'item<digits>'; the digits
+        become `localid`.
+        """
+        self._path = path
+        # Read from `path` itself.  This used to read the global `name`,
+        # which only exists when this module is run as a script.
+        f = open(path)
+        try:
+            self.html = f.read()
+        finally:
+            f.close()
+        # Look only at the file name, so digits in a directory name
+        # (e.g. a course code) cannot be mistaken for the item number.
+        basename = os.path.basename(self._path)
+        self.localid = re.search(r'item(\d+)', basename).group(1)
+        self._scrape()
+        del self.html
+
+    @property
+    def data(self):
+        """Return all instance attributes as a dict (`self.__dict__`)."""
+        return self.__dict__
+
+    def __scrape(self, **kwargs):
+        """Set attribute `name` for every name=pattern pair that matches.
+
+        The stripped text of group 1 is stored; patterns that do not
+        match the page are skipped.
+        """
+        for name, pat in kwargs.items():
+            m = re.search(pat, self.html)
+            if m:
+                setattr(self, name, m.group(1).strip())
+
+    def _scrape(self):
+        """Extract the item's fields and its link or file details."""
+        self.__scrape(
+            title=r'<td align="left" nowrap="nowrap">Title:</td><td align="left" width="100%">(.*?)<',
+            source_title=r'<td align="left" nowrap="nowrap">Title Primary:</td><td align="left" width="100%">(.*?)<',
+            journal=r'<td align="left" nowrap="nowrap">Journal:</td><td align="left" width="100%">(.*?)<',
+            volume=r'<td align="left" nowrap="nowrap">Volume:</td><td align="left" width="100%">(.*?)<',
+            issue=r'<td align="left" nowrap="nowrap">Issue:</td><td align="left" width="100%">(.*?)<',
+            author=r'<td align="left" nowrap="nowrap">Author Primary:</td><td align="left" width="100%">(.*?)<',
+            author2=r'<td align="left" nowrap="nowrap">Author Secondary:</td><td align="left" width="100%">(.*?)<',
+            pages=r'<td align="left" nowrap="nowrap">Page Range / Chapter:</td><td align="left" width="100%">(.*?)<',
+            publisher=r'<td align="left" nowrap="nowrap">Publisher:</td><td align="left" width="100%">(.*?)<',
+            published=r'<td align="left" nowrap="nowrap">Date Published:</td><td align="left" width="100%">(.*?)<',
+            course=r'<td class="HEADER1" valign="middle" align="left" height="25">&nbsp;&nbsp;(.*?) -',
+            instructor=r'<td class="HEADER1" valign="middle" align="left" height="25">&nbsp;&nbsp;.*? - .*? - (.*?)<',
+            term=r'<td class="HEADER1" valign="middle" align="left" height="25">&nbsp;&nbsp;.*? - .*? \((.*?)\)',
+            )
+        if hasattr(self, 'journal'):
+            # A journal name, when present, replaces the source title.
+            self.source_title = self.journal
+            del self.journal
+
+        pat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information</a>""")
+        m = pat.search(self.html)
+        if m:
+            self.type = 'url'
+            self.url = m.group(1)
+        else:
+            pat = re.compile(r"""onClick="javascript:pop\('(download.aspx\?docID=(\d+)&shortname=(.*?))'""")
+            m = pat.search(self.html)
+            if m:
+                self.type = 'file'
+                urlpath, itemid, origfile = m.groups()
+                self.filename = origfile
+                # Map .../itemNNN.html to the sibling data file path
+                # .../dataNNN.
+                datafile = re.sub(r'(.*)/item(\d+)\.html',
+                                  r'\1/data\2', self._path)
+                datafile = os.path.abspath(datafile)
+                self.datafile = datafile
+
+
+
+if __name__ == '__main__':
+    items = []
+    # NOTE(review): "item0*.html" only matches file names that start with
+    # "item0", so items numbered 100 and up are skipped -- confirm that
+    # this is intended.
+    for name in os.popen('find data -name "item0*.html"').readlines():
+        name = name.strip()
+        m = Metadata(name)
+        items.append(m)
+        pprint(m.data)

Added: servres/trunk/eres/.gitignore
===================================================================
--- servres/trunk/eres/.gitignore	                        (rev 0)
+++ servres/trunk/eres/.gitignore	2011-01-09 00:56:07 UTC (rev 1170)
@@ -0,0 +1,5 @@
+/data/
+*#
+*~
+*.pyc
+.#*