[open-ils-commits] r1102 - conifer/branches/rel_1_6_1/tools/ebooks (dbs)
svn at svn.open-ils.org
svn at svn.open-ils.org
Wed Dec 8 14:18:31 EST 2010
Author: dbs
Date: 2010-12-08 14:18:27 -0500 (Wed, 08 Dec 2010)
New Revision: 1102
Modified:
conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py
Log:
Better error handling & pull LoC data handling into its own function
Modified: conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py 2010-12-08 17:06:18 UTC (rev 1101)
+++ conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py 2010-12-08 19:18:27 UTC (rev 1102)
@@ -314,9 +314,9 @@
# If we have a ToC or author notes or whatever, replace with content
if field['u'].find('.loc.gov') > -1:
- content = substitute_content(field)
- if (content):
- new_fields.append(content)
+ enrich = substitute_content(field)
+ if enrich and isinstance(enrich, pymarc.field.Field):
+ new_fields.append(enrich)
else:
for lib in options['libraries']:
data = options['settings'].get_settings(lib)
@@ -332,39 +332,38 @@
def substitute_content(field):
"""Parses a ToC or author notes URL and generates a field"""
+ url = field['u']
+
content_field = None
raw_content = ''
- url = field['u']
-
# Skip machine-generated tables of contents
if url.find('/toc/') > -1:
return None
+ # Get the data from the supplied URL
try:
req = urllib2.urlopen(url)
raw_content = BeautifulSoup(req.read())
except urllib2.HTTPError, ex:
print("%s for URL %s" % (ex, url))
return None
+ except urllib2.URLError, ex:
+ print("%s for URL %s" % (ex, url))
+ return None
- # Short-circuit if we have an OCRed ToC; the quality is terrible
- if raw_content.find(text='Electronic data is machine generated'):
+ content = process_loc_data(raw_content)
+ if not content:
return None
- elif raw_content.find('<pre>'):
- return None
- content = ''.join(raw_content.find('hr').findAllNext(text=True)).encode('utf8')
- content = content.replace('\n', ' ')
-
- if url.find('-b.html') > -1:
+ if url.endswith('-b.html'):
# Biographical note
content_field = pymarc.Field(
tag = '545',
indicators = ['1', ' '],
subfields = ['a', content]
)
- elif url.find('-d.html') > -1:
+ elif url.endswith('-d.html'):
# Summary written by publisher
content_field = pymarc.Field(
tag = '520',
@@ -372,7 +371,7 @@
subfields = ['a', content]
)
- elif url.find('-t.html') > -1:
+ elif url.endswith('-t.html'):
# Table of contents
content_field = pymarc.Field(
tag = '505',
@@ -381,10 +380,37 @@
)
else:
print("URL %s didn't match known LoC type" % (url))
- return None
return content_field
+def process_loc_data(raw_content):
+ """Given the LoC enriched data, make it usable"""
+
+ # Short-circuit if we have an OCRed ToC; the quality is terrible
+ if raw_content.find(text='Electronic data is machine generated'):
+ return None
+ elif raw_content.find('<pre>'):
+ return None
+
+ # Get all of the text after the horizontal rule
+ content = ' '.join(
+ raw_content.find('hr').findAllNext(text=True)
+ ).encode('utf8')
+
+ # Remove linefeeds
+ content = content.replace('\n', ' ')
+ content = content.replace('\r', ' ')
+
+ # Remove inline subject headings to avoid too much indexing boost
+ lcsh = content.find('Library of Congress subject headings')
+ if lcsh > -1:
+ content = content[0:lcsh]
+
+ # Farewell, starting and ending whitespace
+ content = content.strip()
+
+ return content
+
def get_subfields(field, data):
"""Creates 856 subfields required by Conifer"""
More information about the open-ils-commits mailing list