[open-ils-commits] r1172 - in servres/trunk/conifer: syrup uwindsor_migration (gfawcett)
svn at svn.open-ils.org
Sat Jan 8 20:00:04 EST 2011
Author: gfawcett
Date: 2011-01-08 20:00:03 -0500 (Sat, 08 Jan 2011)
New Revision: 1172
Added:
servres/trunk/conifer/uwindsor_migration/eres-into-syrup.py
Modified:
servres/trunk/conifer/syrup/models.py
servres/trunk/conifer/uwindsor_migration/eres.py
servres/trunk/conifer/uwindsor_migration/metadata.py
Log:
uwindsor: migration of content from ERES to Syrup actually works!
Modified: servres/trunk/conifer/syrup/models.py
===================================================================
--- servres/trunk/conifer/syrup/models.py 2011-01-09 00:56:13 UTC (rev 1171)
+++ servres/trunk/conifer/syrup/models.py 2011-01-09 01:00:03 UTC (rev 1172)
@@ -602,6 +602,7 @@
('FD', 'fair dealing'),
('PG', 'permission granted'),
('LC', 'licensed content'),
+ ('AV', 'available to students'),
]
copyright_status = m.CharField(max_length=2,
Added: servres/trunk/conifer/uwindsor_migration/eres-into-syrup.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/eres-into-syrup.py (rev 0)
+++ servres/trunk/conifer/uwindsor_migration/eres-into-syrup.py 2011-01-09 01:00:03 UTC (rev 1172)
@@ -0,0 +1,106 @@
+#!/usr/bin/env python-django
+
+from conifer.syrup.models import *
+
+from django.core.files import File
+import shutil
+import re
+import hashlib
+import os, sys
+from os.path import *
+from metadata import Metadata
+from pprint import pprint
+from django.conf import settings
+
+upload_dir = Item._meta.get_field('fileobj').upload_to
+
+known_profs = dict([
+ ("Burgess","aburgess"),
+ ("Fitzgerald","afitz"),
+ ("Burr","burrc"),
+ ("Jacobs","djacobs"),
+ ("Gannage","gannage"),
+ ("Huffaker","huffaker"),
+ ("Carter","icarter"),
+ ("Lewis","lewis3"),
+ ("Parr","parr1"),
+ ("McKay","pmckay"),
+ ("Phipps","pphipps"),
+ ("Samson","psamson"),
+ ("Dienesch","rdienesc"),
+ ("Orsini","sorsini"),
+ ("Yun","yshhsy"),])
+
+def ensure_user(username):
+ user, created = User.objects.get_or_create(username=username)
+ user.maybe_decorate()
+ return user
+
+def site_for_item(item):
+ termcode, prof = item.term, item.instructor
+ termcode = termcode.split(' ')[-1] + termcode[0] # Winter 2011 -> 2011W
+ coursecode = re.search('\d\d-\d\d\d', item.course).group(0)
+ profs = [ensure_user(known_profs[p.strip()])
+ for p in prof.split(',')]
+ primary = profs[0]
+ course = Course.objects.get(code__contains=coursecode)
+ term = Term.objects.get(code=termcode)
+ site, created = Site.objects.get_or_create(
+ owner = primary,
+ start_term = term,
+ course = course,
+ defaults = dict(service_desk = ServiceDesk.default(),
+ end_term = term))
+ return site
+
+DATA = 'data/'
+COURSES = os.listdir(DATA)
+
+
+for course in COURSES:
+ items = list(Metadata.find_all(join(DATA, course)))
+ if not items:
+ continue
+ _item = items[0]
+
+ site = site_for_item(_item)
+ print site
+
+ Item.objects.filter(site=site).delete() # fixme, just for testing.
+
+ for m in items:
+ d = m.data.copy()
+
+ if 'author2' in d:
+ d['author'] = '%s;%s' % (d['author'], d['author2'])
+
+ for key in ['_path', 'author2', 'course', 'datafile', 'filename', 'instructor',
+ 'localid', 'term', 'type']:
+ if key in d:
+ del d[key]
+
+ if m.type == 'url':
+ assert 'url' in d, ('No URL', m.data)
+ Item.objects.create(site=site, item_type='URL', **d)
+
+ elif m.type == 'file':
+ if m.mimetype is None:
+ pprint(m.data)
+ raise Exception('stop: a bad file?')
+
+ with open(m.datafile) as f:
+ digest = hashlib.md5(f.read()).hexdigest()
+ dest = digest
+ i = Item.objects.create(site=site, item_type='ELEC',
+ fileobj_mimetype = m.mimetype,
+ fileobj_origname = m.filename,
+ copyright_status='AV',
+ **d)
+
+ fullpath = os.path.join(settings.MEDIA_ROOT, upload_dir, dest)
+ if os.path.isfile(fullpath):
+ i.fileobj.name = os.path.join(upload_dir, dest)
+ else:
+ with open(m.datafile) as f:
+ i.fileobj.save(dest, File(f), save=False)
+ i.save()
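
Note on the file branch above: each downloaded ERES binary is stored under its md5 digest, so a file already present under MEDIA_ROOT/<upload_dir> is linked rather than written again on a re-run. A minimal standalone sketch of that digest-as-filename idea (the helper names are illustrative, not part of the script):

    import hashlib
    import os

    def content_digest(path):
        # md5 of the file contents; identical files map to the same
        # destination name, so a re-run can reuse the stored copy
        with open(path, 'rb') as f:
            return hashlib.md5(f.read()).hexdigest()

    def already_stored(path, media_root, upload_dir):
        # returns (exists, dest) for the would-be destination file
        dest = content_digest(path)
        return os.path.isfile(os.path.join(media_root, upload_dir, dest)), dest
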
Modified: servres/trunk/conifer/uwindsor_migration/eres.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/eres.py 2011-01-09 00:56:13 UTC (rev 1171)
+++ servres/trunk/conifer/uwindsor_migration/eres.py 2011-01-09 01:00:03 UTC (rev 1172)
@@ -68,6 +68,8 @@
n = 0
for (cid, aid) in itemlinkpat.findall(html):
+ print (n, cid, aid)
+
if (cid, aid) in done:
continue
@@ -90,9 +92,11 @@
print >> log, (n, 'file', m.groups())
urlpath, itemid, origfile = m.groups()
binary_url = '%s/%s' % (BASE, urlpath)
+ binary_url = binary_url.replace('[', r'\[').replace(']', r'\]')
cookie = browser.cj[0]
destfile = '%s/data%03d' % (PATH, n)
cmd = 'curl -s -b "%s=%s" "%s" > %s' % (cookie.name, cookie.value, binary_url, destfile)
+ #print cmd
os.system(cmd)
back()
done.add((cid, aid))
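
The bracket escaping added above works around curl's URL globbing: with globbing enabled, unescaped [ or ] in an ERES attachment URL is read as range/glob syntax and the request misfires. An equivalent approach (assuming the installed curl supports --globoff, i.e. -g) would be to turn globbing off instead of escaping:

    # sketch: same download with globbing disabled rather than brackets escaped
    cmd = 'curl -sg -b "%s=%s" "%s" > %s' % (cookie.name, cookie.value,
                                             binary_url, destfile)
    os.system(cmd)
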
Modified: servres/trunk/conifer/uwindsor_migration/metadata.py
===================================================================
--- servres/trunk/conifer/uwindsor_migration/metadata.py 2011-01-09 00:56:13 UTC (rev 1171)
+++ servres/trunk/conifer/uwindsor_migration/metadata.py 2011-01-09 01:00:03 UTC (rev 1172)
@@ -9,7 +9,7 @@
def __init__(self, path):
self._path = path
- self.html = open(name).read()
+ self.html = open(path).read()
self.localid = re.search(r'item(\d+)', self._path).group(1)
self._scrape()
del self.html
@@ -39,13 +39,13 @@
published='<td align="left" nowrap="nowrap">Date Published:</td><td align="left" width="100%">(.*?)<',
course='<td class="HEADER1" valign="middle" align="left" height="25"> (.*?) -',
instructor='<td class="HEADER1" valign="middle" align="left" height="25"> .*? - .*? - (.*?)<',
- term='<td class="HEADER1" valign="middle" align="left" height="25"> .*? - .*? \((.*?)\)',
+ term='<td class="HEADER1" valign="middle" align="left" height="25"> .* - .* \((.*?)\)',
)
if hasattr(self, 'journal'):
self.source_title = self.journal
del self.journal
- pat = re.compile(r"""onClick="javascript:popall\('(.*)'.*?">Click here for more information</a>""")
+ pat = re.compile(r"""onClick="javascript:popall\('(.*?)'.*?">Click here for more information</a>""")
m = pat.search(self.html)
if m:
self.type = 'url'
@@ -62,12 +62,24 @@
datafile = os.path.abspath(datafile)
self.datafile = datafile
+ @property
+ def mimetype(self):
+ assert self.datafile
+ with os.popen('file -i ' + self.datafile) as f:
+ tmp = f.readline()
+ try:
+ return re.search(r': (\w+/\w+);', tmp).group(1)
+ except:
+ return None
+
+ @classmethod
+ def find_all(cls, path):
+ for name in os.popen('find "%s" -name "item0*.html"' % path).readlines():
+ yield Metadata(name.strip())
if __name__ == '__main__':
- items = []
- for name in os.popen('find data -name "item0*.html"').readlines():
- name = name.strip()
- m = Metadata(name)
- items.append(m)
- pprint(m.data)
+ for m in Metadata.find_all('data/'):
+ #pprint(m.data)
+ if m.type == 'file':
+ pprint(m.mimetype)
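
The new mimetype property shells out to "file -i" and pulls the type/subtype token out of its output; when the pattern does not match it returns None, and eres-into-syrup.py stops rather than guess. Note that \w+/\w+ will not match subtypes containing hyphens (e.g. application/x-empty), which therefore also come back as None. A rough standard-library alternative, guessing from the original filename rather than the content (so results can differ), would be:

    import mimetypes

    def guess_mimetype(filename):
        # guess from the filename extension; returns None when unknown,
        # mirroring how the mimetype property signals failure
        mtype, _encoding = mimetypes.guess_type(filename)
        return mtype
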