[open-ils-commits] r1101 - conifer/branches/rel_1_6_1/tools/ebooks (dbs)
svn at svn.open-ils.org
Wed Dec 8 12:06:21 EST 2010
Author: dbs
Date: 2010-12-08 12:06:18 -0500 (Wed, 08 Dec 2010)
New Revision: 1101
Modified:
conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py
Log:
Teach the ebook processing script how to turn LoC URLs into indexable content.
There is a risk that we will duplicate content already in the MARC record,
but so far that doesn't appear to be the case. Also, we skip the
OCRed, machine-generated tables of contents because their content is
entirely untrustworthy.
Modified: conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py 2010-12-07 18:36:10 UTC (rev 1100)
+++ conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py 2010-12-08 17:06:18 UTC (rev 1101)
@@ -13,7 +13,8 @@
be accommodated in batch load.
"""
-import os, os.path, sys, getopt, pymarc, pymarc.marc8, re
+import os, os.path, sys, getopt, pymarc, pymarc.marc8, re, urllib2
+from BeautifulSoup import BeautifulSoup
class Institution():
"""Defines standard settings for each Conifer institution"""
@@ -241,15 +242,13 @@
def process_fields(record, options):
"""Decide which fields to add, delete, and keep"""
- url = False
new_record = pymarc.Record()
for field in record.get_fields():
- # Only process the first 856 field, for better or worse
+ # Process all of the 856 fields
if field.tag == '856':
- if url == False:
- url = True
- new_fields = process_urls(field, options)
+ new_fields = process_urls(field, options)
+ if new_fields:
for new_856 in new_fields:
new_record.add_field(new_856)
# Strip out 9xx fields: we don't want local fields in our records
@@ -313,18 +312,79 @@
print "* No subfield 'u' found in this 856"
return None
+ # If we have a ToC or author notes or whatever, replace with content
+ if field['u'].find('.loc.gov') > -1:
+ content = substitute_content(field)
+ if (content):
+ new_fields.append(content)
+ else:
+ for lib in options['libraries']:
+ data = options['settings'].get_settings(lib)
+ subs = get_subfields(field, data)
+ eight_five_six = pymarc.Field(tag = '856',
+ indicators = ['4', '0'],
+ subfields = subs
+ )
+ new_fields.append(eight_five_six)
- for lib in options['libraries']:
- data = options['settings'].get_settings(lib)
- subs = get_subfields(field, data)
- eight_five_six = pymarc.Field(tag = '856',
- indicators = ['4', '0'],
- subfields = subs
+ return new_fields
+
+def substitute_content(field):
+ """Parses a ToC or author notes URL and generates a field"""
+
+ content_field = None
+ raw_content = ''
+
+ url = field['u']
+
+ # Skip machine-generated tables of contents
+ if url.find('/toc/') > -1:
+ return None
+
+ try:
+ req = urllib2.urlopen(url)
+ raw_content = BeautifulSoup(req.read())
+ except urllib2.HTTPError, ex:
+ print("%s for URL %s" % (ex, url))
+ return None
+
+ # Short-circuit if we have an OCRed ToC; the quality is terrible
+ if raw_content.find(text='Electronic data is machine generated'):
+ return None
+ elif raw_content.find('<pre>'):
+ return None
+
+ content = ''.join(raw_content.find('hr').findAllNext(text=True)).encode('utf8')
+ content = content.replace('\n', ' ')
+
+ if url.find('-b.html') > -1:
+ # Biographical note
+ content_field = pymarc.Field(
+ tag = '545',
+ indicators = ['1', ' '],
+ subfields = ['a', content]
)
- new_fields.append(eight_five_six)
+ elif url.find('-d.html') > -1:
+ # Summary written by publisher
+ content_field = pymarc.Field(
+ tag = '520',
+ indicators = ['3', ' '],
+ subfields = ['a', content]
+ )
- return new_fields
+ elif url.find('-t.html') > -1:
+ # Table of contents
+ content_field = pymarc.Field(
+ tag = '505',
+ indicators = [' ', ' '],
+ subfields = ['a', content]
+ )
+ else:
+ print("URL %s didn't match known LoC type" % (url))
+ return None
+ return content_field
+
def get_subfields(field, data):
"""Creates 856 subfields required by Conifer"""
More information about the open-ils-commits
mailing list