[open-ils-commits] r1106 - conifer/branches/rel_1_6_1/tools/ebooks (dbs)
svn at svn.open-ils.org
svn at svn.open-ils.org
Thu Dec 16 11:42:27 EST 2010
Author: dbs
Date: 2010-12-16 11:42:24 -0500 (Thu, 16 Dec 2010)
New Revision: 1106
Modified:
conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py
Log:
Convert uniformly to Unicode output
Content added from LoC appears to be encoded as ISO-8859-1, so decode it accordingly.
Then generate UTF-8-encoded Unicode MARC records on output.
Modified: conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py 2010-12-13 18:54:58 UTC (rev 1105)
+++ conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py 2010-12-16 16:42:24 UTC (rev 1106)
@@ -220,29 +220,34 @@
"""Converts raw ebook MARC records to Conifer-ready MARC records"""
sample = ''
- reader = pymarc.MARCReader(open(options['input'], 'rb'))
- writer = pymarc.MARCWriter(open(options['output'], 'wb'))
+ reader = pymarc.MARCReader(
+ open(options['input'], mode='rb'), to_unicode=True
+ )
+ writer = pymarc.MARCWriter(open(options['output'], mode='wb'))
if ('sample' in options):
- sample = pymarc.MARCWriter(open(options['sample'], 'wb'))
+ sample = pymarc.MARCWriter(open(options['sample'], mode='wb'))
cnt = 0
for record in reader:
cnt = cnt + 1
- if not (record['856'] and record['856']['u']):
- print("* No 856 for record # %s in file %s"
- % (cnt, options['input'])
- )
+ try:
+ if not (record['856'] and record['856']['u']):
+ print("* No 856 for record # %s in file %s"
+ % (cnt, options['input'])
+ )
- new_record = process_fields(record, options)
+ new_record = process_fields(record, options)
- writer.write(new_record)
- if (sample and ((cnt == 1) or (cnt % 100 == 0))):
- sample.write(new_record)
+ writer.write(new_record)
+ if (sample and ((cnt == 1) or (cnt % 100 == 0))):
+ sample.write(new_record)
+ except Exception, ex:
+ print("* Error processing record %s - %s" % (cnt, ex))
def process_fields(record, options):
"""Decide which fields to add, delete, and keep"""
- new_record = pymarc.Record()
+ new_record = pymarc.Record(to_unicode=True, force_utf8=True)
for field in record.get_fields():
# Process all of the 856 fields
@@ -435,7 +440,7 @@
content = content[0:lcsh]
# Farewell, starting and ending whitespace
- content = content.strip()
+ content = content.strip().decode('iso8859-1')
return content
More information about the open-ils-commits mailing list