[open-ils-commits] r1172 - in servres/trunk/conifer: syrup uwindsor_migration (gfawcett)

Sat Jan 8 20:00:04 EST 2011

Author: gfawcett
Date: 2011-01-08 20:00:03 -0500 (Sat, 08 Jan 2011)
New Revision: 1172

Added:
   servres/trunk/conifer/uwindsor_migration/eres-into-syrup.py
Modified:
   servres/trunk/conifer/syrup/models.py
   servres/trunk/conifer/uwindsor_migration/eres.py
   servres/trunk/conifer/uwindsor_migration/metadata.py
Log:
uwindsor: migration of content from ERES to Syrup actually works!

Modified: servres/trunk/conifer/syrup/models.py
===================================================================

--- servres/trunk/conifer/syrup/models.py	2011-01-09 00:56:13 UTC (rev 1171)
+++ servres/trunk/conifer/syrup/models.py	2011-01-09 01:00:03 UTC (rev 1172)
@@ -602,6 +602,7 @@
         ('FD', 'fair dealing'),
         ('PG', 'permission granted'),
         ('LC', 'licensed content'),
+        ('AV', 'available to students'),
         ]
 
     copyright_status = m.CharField(max_length=2, 

Added: servres/trunk/conifer/uwindsor_migration/eres-into-syrup.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/eres-into-syrup.py	                        (rev 0)
+++ servres/trunk/conifer/uwindsor_migration/eres-into-syrup.py	2011-01-09 01:00:03 UTC (rev 1172)
@@ -0,0 +1,106 @@
+#!/usr/bin/env python-django
+
+from conifer.syrup.models import *
+
+from django.core.files import File
+import shutil
+import re
+import hashlib
+import os, sys
+from os.path import *
+from metadata import Metadata
+from pprint import pprint
+from django.conf import settings
+
+upload_dir = Item._meta.get_field('fileobj').upload_to
+
+known_profs = dict([
+        ("Burgess","aburgess"),
+        ("Fitzgerald","afitz"),
+        ("Burr","burrc"),
+        ("Jacobs","djacobs"),
+        ("Gannage","gannage"),
+        ("Huffaker","huffaker"),
+        ("Carter","icarter"),
+        ("Lewis","lewis3"),
+        ("Parr","parr1"),
+        ("McKay","pmckay"),
+        ("Phipps","pphipps"),
+        ("Samson","psamson"),
+        ("Dienesch","rdienesc"),
+        ("Orsini","sorsini"),
+        ("Yun","yshhsy"),])
+
+def ensure_user(username):
+    user, created = User.objects.get_or_create(username=username)
+    user.maybe_decorate()
+    return user
+
+def site_for_item(item):
+    termcode, prof = item.term, item.instructor
+    termcode = termcode.split(' ')[-1] + termcode[0] # Winter 2011 -> 2011W
+    coursecode = re.search('\d\d-\d\d\d', item.course).group(0)
+    profs = [ensure_user(known_profs[p.strip()])
+             for p in prof.split(',')]
+    primary = profs[0]
+    course = Course.objects.get(code__contains=coursecode)
+    term = Term.objects.get(code=termcode)
+    site, created = Site.objects.get_or_create(
+        owner = primary,
+        start_term = term,
+        course = course,
+        defaults = dict(service_desk = ServiceDesk.default(),
+                        end_term = term))
+    return site
+    
+DATA = 'data/'
+COURSES = os.listdir(DATA)
+
+
+for course in COURSES:
+    items = list(Metadata.find_all(join(DATA, course)))
+    if not items:
+        continue
+    _item = items[0]
+
+    site = site_for_item(_item)
+    print site
+
+    Item.objects.filter(site=site).delete() # fixme, just for testing.
+
+    for m in items:
+        d = m.data.copy()
+
+        if 'author2' in d:
+            d['author'] = '%s;%s' % (d['author'], d['author2'])
+
+        for key in ['_path', 'author2', 'course', 'datafile', 'filename', 'instructor', 
+                    'localid', 'term', 'type']:
+            if key in d:
+                del d[key]
+        
+        if m.type == 'url':
+            assert 'url' in d, ('No URL', m.data)
+            Item.objects.create(site=site, item_type='URL', **d)
+
+        elif m.type == 'file':
+            if m.mimetype is None:
+                pprint(m.data)
+                raise Exception('stop: a bad file?')
+
+            with open(m.datafile) as f:
+                digest = hashlib.md5(f.read()).hexdigest()
+            dest = digest
+            i = Item.objects.create(site=site, item_type='ELEC',
+                                    fileobj_mimetype = m.mimetype,
+                                    fileobj_origname = m.filename,
+                                    copyright_status='AV',
+                                    **d)
+
+            fullpath = os.path.join(settings.MEDIA_ROOT, upload_dir, dest)
+            if os.path.isfile(fullpath):
+                i.fileobj.name = os.path.join(upload_dir, dest)
+            else:
+                with open(m.datafile) as f:
+                    i.fileobj.save(dest, File(f), save=False)
+            i.save()

Modified: servres/trunk/conifer/uwindsor_migration/eres.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/eres.py	2011-01-09 00:56:13 UTC (rev 1171)
+++ servres/trunk/conifer/uwindsor_migration/eres.py	2011-01-09 01:00:03 UTC (rev 1172)
@@ -68,6 +68,8 @@
 
 n = 0
 for (cid, aid) in itemlinkpat.findall(html):
+    print (n, cid, aid)
+
     if (cid, aid) in done:
         continue
 
@@ -90,9 +92,11 @@
             print >> log, (n, 'file', m.groups())
             urlpath, itemid, origfile = m.groups()
             binary_url = '%s/%s' % (BASE, urlpath)
+            binary_url = binary_url.replace('[', r'\[').replace(']', r'\]')   
             cookie = browser.cj[0]
             destfile = '%s/data%03d' % (PATH, n)
             cmd = 'curl -s -b "%s=%s" "%s" > %s' % (cookie.name, cookie.value, binary_url, destfile)
+            #print cmd
             os.system(cmd)
     back()
     done.add((cid, aid))

Modified: servres/trunk/conifer/uwindsor_migration/metadata.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/metadata.py	2011-01-09 00:56:13 UTC (rev 1171)
+++ servres/trunk/conifer/uwindsor_migration/metadata.py	2011-01-09 01:00:03 UTC (rev 1172)
@@ -9,7 +9,7 @@
 
     def __init__(self, path):
         self._path = path
-        self.html = open(name).read()
+        self.html = open(path).read()
         self.localid = re.search(r'item(\d+)', self._path).group(1)
         self._scrape()
         del self.html
@@ -39,13 +39,13 @@
             published='<td align="left" nowrap="nowrap">Date Published:</td><td align="left" width="100%">(.*?)<',
             course='<td class="HEADER1" valign="middle" align="left" height="25">&nbsp;&nbsp;(.*?) -',
             instructor='<td class="HEADER1" valign="middle" align="left" height="25">&nbsp;&nbsp;.*? - .*? - (.*?)<',
-            term='<td class="HEADER1" valign="middle" align="left" height="25">&nbsp;&nbsp;.*? - .*? \((.*?)\)',
+            term='<td class="HEADER1" valign="middle" align="left" height="25">&nbsp;&nbsp;.* - .* \((.*?)\)',
             )
         if hasattr(self, 'journal'):
             self.source_title = self.journal
             del self.journal
 
-        pat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information</a>""")
+        pat = re.compile(r"""onClick="javascript:popall\('(.*?)'.*?">Click here for more information</a>""")
         m = pat.search(self.html)
         if m:
             self.type = 'url'
@@ -62,12 +62,24 @@
                 datafile = os.path.abspath(datafile)
                 self.datafile = datafile
 
+    @property
+    def mimetype(self):
+        assert self.datafile
+        with os.popen('file -i ' + self.datafile) as f:
+            tmp = f.readline()
+        try:
+            return re.search(r': (\w+/\w+);', tmp).group(1)
+        except:
+            return None
+        
 
+    @classmethod
+    def find_all(cls, path):
+        for name in os.popen('find "%s" -name "item0*.html"' % path).readlines():
+            yield Metadata(name.strip())
 
 if __name__ == '__main__':
-    items = []
-    for name in os.popen('find data -name "item0*.html"').readlines():
-        name = name.strip()
-        m = Metadata(name)
-        items.append(m)
-        pprint(m.data)
+    for m in Metadata.find_all('data/'):
+        #pprint(m.data)
+        if m.type == 'file':
+            pprint(m.mimetype)