[open-ils-commits] r1102 - conifer/branches/rel_1_6_1/tools/ebooks (dbs)

svn at svn.open-ils.org svn at svn.open-ils.org
Wed Dec 8 14:18:31 EST 2010


Author: dbs
Date: 2010-12-08 14:18:27 -0500 (Wed, 08 Dec 2010)
New Revision: 1102

Modified:
   conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py
Log:
Improve error handling and pull LoC data handling into its own function


Modified: conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py	2010-12-08 17:06:18 UTC (rev 1101)
+++ conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py	2010-12-08 19:18:27 UTC (rev 1102)
@@ -314,9 +314,9 @@
 
     # If we have a ToC or author notes or whatever, replace with content
     if field['u'].find('.loc.gov') > -1:
-        content = substitute_content(field)
-        if (content):
-            new_fields.append(content)
+        enrich = substitute_content(field)
+        if enrich and isinstance(enrich, pymarc.field.Field):
+            new_fields.append(enrich)
     else:
         for lib in options['libraries']:
             data = options['settings'].get_settings(lib)
@@ -332,39 +332,38 @@
 def substitute_content(field):
     """Parses a ToC or author notes URL and generates a field"""
 
+    url = field['u']
+
     content_field = None
     raw_content = ''
 
-    url = field['u']
-
     # Skip machine-generated tables of contents
     if url.find('/toc/') > -1:
         return None
 
+    # Get the data from the supplied URL
     try:
         req = urllib2.urlopen(url)
         raw_content = BeautifulSoup(req.read())
     except urllib2.HTTPError, ex:
         print("%s for URL %s" % (ex, url))
         return None
+    except urllib2.URLError, ex:
+        print("%s for URL %s" % (ex, url))
+        return None
 
-    # Short-circuit if we have an OCRed ToC; the quality is terrible
-    if raw_content.find(text='Electronic data is machine generated'):
+    content = process_loc_data(raw_content)
+    if not content:
         return None
-    elif raw_content.find('<pre>'):
-        return None
 
-    content = ''.join(raw_content.find('hr').findAllNext(text=True)).encode('utf8')
-    content = content.replace('\n', ' ')
-
-    if url.find('-b.html') > -1:
+    if url.endswith('-b.html'):
     # Biographical note
         content_field = pymarc.Field(
             tag = '545',
             indicators = ['1', ' '],
             subfields = ['a', content]
         )
-    elif url.find('-d.html') > -1:
+    elif url.endswith('-d.html'):
     # Summary written by publisher
         content_field = pymarc.Field(
             tag = '520',
@@ -372,7 +371,7 @@
             subfields = ['a', content]
         )
 
-    elif url.find('-t.html') > -1:
+    elif url.endswith('-t.html'):
     # Table of contents
         content_field = pymarc.Field(
             tag = '505',
@@ -381,10 +380,37 @@
         )
     else:
         print("URL %s didn't match known LoC type" % (url))
-        return None
 
     return content_field
 
+def process_loc_data(raw_content):
+    """Given the LoC enriched data, make it usable"""
+
+    # Short-circuit if we have an OCRed ToC; the quality is terrible
+    if raw_content.find(text='Electronic data is machine generated'):
+        return None
+    elif raw_content.find('<pre>'):
+        return None
+
+    # Get all of the text after the horizontal rule
+    content = ' '.join(
+        raw_content.find('hr').findAllNext(text=True)
+    ).encode('utf8')
+
+    # Remove linefeeds
+    content = content.replace('\n', ' ')
+    content = content.replace('\r', ' ')
+
+    # Remove inline subject headings to avoid too much indexing boost
+    lcsh = content.find('Library of Congress subject headings')
+    if lcsh > -1:
+        content = content[0:lcsh]
+
+    # Farewell, starting and ending whitespace
+    content = content.strip()
+
+    return content
+
 def get_subfields(field, data):
     """Creates 856 subfields required by Conifer"""
 



More information about the open-ils-commits mailing list