[open-ils-commits] r1076 - in conifer/branches/rel_1_6_1/tools: . ebooks (dbs)

svn at svn.open-ils.org svn at svn.open-ils.org
Fri Nov 19 17:33:09 EST 2010

Author: dbs
Date: 2010-11-19 17:33:03 -0500 (Fri, 19 Nov 2010)
New Revision: 1076

Close to working ebook MARC record processing script

Needs some massive refactoring for a smarter, more object-oriented approach.
But some days you just need to bash out something that mostly works.

Also needs the ebrary URL distinction and Algoma's settings.

Added: conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py
--- conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py	                        (rev 0)
+++ conifer/branches/rel_1_6_1/tools/ebooks/prep_ebook_records.py	2010-11-19 22:33:03 UTC (rev 1076)
@@ -0,0 +1,286 @@
+#!/usr/bin/env python
+import codecs, os, os.path, sys, getopt, pymarc, pymarc.marc8
+class Institution():
+    """Per-institution link settings for the Conifer consortium.
+
+    Each institution is exposed as an attribute holding a dict with the keys
+    ``code``, ``ebrary_code``, ``proxy``, ``public_note`` and ``link_text``.
+    """
+
+    def __init__(self):
+        """Populate the settings dict of every known institution."""
+        # Algoma's values other than the code are still "XXX" placeholders.
+        self.algoma = dict(
+            code="OSTMA",
+            ebrary_code="XXX",
+            proxy="XXX",
+            public_note="XXX",
+            link_text="XXX",
+        )
+        self.laurentian = dict(
+            code="OSUL",
+            ebrary_code="jndlu",
+            proxy="https://librweb.laurentian.ca/login?url=",
+            public_note="Available online / disponible en ligne",
+            link_text="Available online / disponible en ligne",
+        )
+        self.windsor = dict(
+            code="OWA",
+            ebrary_code="oculwindsor",
+            proxy="http://ezproxy.uwindsor.ca/login?url=",
+            public_note="To view Windsor's electronic resource click here.",
+            link_text="To view Windsor's electronic resource click here.",
+        )
+
+    def get_settings(self, lib):
+        """Return the settings stored under the attribute named ``lib``.
+
+        Raises AttributeError when no attribute of that name exists.
+        """
+        return getattr(self, lib)
+def do_help():
+    """Print the usage text for this script, then exit with status 0.
+
+    The name the script was invoked under (sys.argv[0]) is substituted into
+    the example command at the end of the text.
+    """
+    usage = '''
+Conifer ebook MARCXML processor
+This script takes a set of MARCXML records and processes them to generate a set
+of MARCXML records ready for loading into the Conifer consortial library
+system. The processing consists of taking the existing 856 field and creating
+one or more new 856 fields for each Conifer institution that should have access
+to these resources.
+The script customizes the following aspects of each record:
+  * Adds one 856 per institution specified at the command line:
+      * $u (URL) - prepends the institutional proxy and, for eBrary records,
+        changes the insitutional code
+      * $y (link text) - sets preferred text of the link to the resource
+      * $z (public note) - sets public note for the resource
+  * Adds a 710 field to identify the publisher using the value specified
+    at the command line
+  * Adds a 590 internal note field using the value specified at the command
+    line.
+Required arguments:
+    -i / --input : The name of the input MARCXML file.
+    -o / --output : The name of the output MARCXML file.
+    -p / --publisher : The name of the publisher to be inserted in a 710 field.
+    -A / --algoma: Add an 856 for Algoma University
+    -L / --laurentian: Add an 856 for Laurentian University
+    -W / --windsor : Add an 856 for University of Windsor
+Optional arguments:
+    -n / --note : The text of the internal note to be inserted into a 590 field.
+    -h / --help : Prints help message
+    %s --algoma --windsor -i crkn.xml -o /tmp/crkn_out.xml -p "eBrary Inc."
+    ''' % sys.argv[0]
+    print(usage)
+    sys.exit(0)
+def consolidate_options(opts):
+    """Return the parsed options as a dict that always carries long names.
+
+    ``opts`` is a sequence of ``(flag, value)`` pairs. Every pair is kept
+    under its original flag; in addition, the value of each recognized short
+    flag is stored under its long equivalent, so later code only has to look
+    for the long form.
+    """
+    long_names = {
+        '-i': '--input',
+        '-o': '--output',
+        '-p': '--publisher',
+        '-n': '--note',
+        '-A': '--algoma',
+        '-L': '--laurentian',
+        '-W': '--windsor',
+        '-h': '--help',
+    }
+    merged = dict(opts)
+    for flag, value in opts:
+        long_flag = long_names.get(flag)
+        if long_flag is not None:
+            merged[long_flag] = value
+    return merged
+def check_options(options):
+    """Validate the consolidated command-line options.
+
+    ``options`` is a dict keyed by long flag names ('--input', '--output',
+    ...), as produced by consolidate_options().
+
+    Returns a dict with the keys 'publisher', 'libraries' (a dict whose keys
+    are the selected library names), 'input', 'output' and 'settings' (an
+    Institution instance), plus 'note' when a note was supplied.
+
+    Calls do_help() when --help was given, when a required argument is
+    missing, or when no library was selected. Exits with status 1 when the
+    input file cannot be read or the output directory is not writable.
+    """
+    if '--help' in options:
+        do_help()
+
+    problems = False
+    required = (('-i', '--input'), ('-o', '--output'), ('-p', '--publisher'))
+    for short_flag, long_flag in required:
+        if long_flag not in options:
+            print("* Missing %s / %s argument!" % (short_flag, long_flag))
+            problems = True
+
+    libraries = dict(
+        (name, True)
+        for name in ('algoma', 'laurentian', 'windsor')
+        if '--' + name in options
+    )
+    if not libraries:
+        # Say why the help text is about to appear instead of failing mutely.
+        print("* Missing library argument: give at least one of "
+              "-A / --algoma, -L / --laurentian, -W / --windsor!")
+        problems = True
+
+    if problems:
+        do_help()
+
+    input_path = options['--input']
+    output_path = options['--output']
+
+    # Existence alone is not enough: the file has to be readable, and a
+    # directory is never a usable input.
+    if os.path.isdir(input_path) or not os.access(input_path, os.R_OK):
+        print("* Cannot read input file %s" % (input_path))
+        sys.exit(1)
+
+    # A bare file name has an empty dirname, which means the current
+    # directory. os.access() signals failure by returning False rather than
+    # raising, so its result has to be tested explicitly.
+    output_dir = os.path.dirname(output_path) or '.'
+    if not os.access(output_dir, os.W_OK):
+        print("* Cannot write to output path %s" % (output_dir))
+        sys.exit(1)
+
+    clean_opts = {
+        'publisher': options['--publisher'],
+        'libraries': libraries,
+        'input': input_path,
+        'output': output_path,
+        'settings': Institution(),
+    }
+    if '--note' in options:
+        clean_opts['note'] = options['--note']
+    return clean_opts
+def parse_opts():
+    """Parse sys.argv and return the validated option dict.
+
+    The arguments are parsed with getopt, short flags are mirrored to their
+    long names by consolidate_options(), and the result is validated by
+    check_options(), whose return value is passed through. A getopt parsing
+    error is printed and followed by the help text.
+    """
+    short_flags = 'i:o:p:ALWn:h'
+    long_flags = [
+        'input=', 'output=', 'publisher=', 'algoma',
+        'laurentian', 'windsor', 'note=', 'help',
+    ]
+    try:
+        # Positional leftovers are not used by this script.
+        parsed, _leftovers = getopt.getopt(sys.argv[1:], short_flags, long_flags)
+    except getopt.GetoptError, ex:
+        print("* %s" % str(ex))
+        do_help()
+    return check_options(consolidate_options(parsed))
+def _rebuild_record(record, options):
+    """Return a new record built from ``record`` with the Conifer additions.
+
+    Every non-856 field is copied over as is. The first 856 field is replaced
+    by the fields process_urls() returns for it; any further 856 fields are
+    dropped. A 710 field carrying options['publisher'] is appended, followed
+    by a 590 field carrying options['note'] when a note was supplied.
+    """
+    # NOTE(review): the new record starts from pymarc.Record()'s default
+    # leader; the source record's leader is not carried over -- confirm that
+    # this is intended.
+    new_record = pymarc.Record()
+    seen_856 = False
+    for field in record.get_fields():
+        if field.tag != '856':
+            new_record.add_field(field)
+        elif not seen_856:
+            # Only process the first 856 field, for better or worse
+            seen_856 = True
+            # Tolerate a None result so one bad 856 cannot abort the run.
+            for new_field in process_urls(field, options) or []:
+                new_record.add_field(new_field)
+
+    new_record.add_field(pymarc.Field(
+        tag = '710',
+        indicators = ['2', ' '],
+        subfields = ['a', options['publisher']]
+    ))
+    if 'note' in options:
+        new_record.add_field(pymarc.Field(
+            tag = '590',
+            indicators = [' ', ' '],
+            subfields = ['a', options['note']]
+        ))
+    return new_record
+def process_records(options):
+    """Converts raw ebook MARC records to Conifer-ready MARC records
+
+    Reads every record from the file named by options['input'] and writes a
+    rebuilt copy (see _rebuild_record) to the file named by
+    options['output']. A warning is printed for each record whose first 856
+    field is missing or has no $u subfield; such records are still written.
+    Both files are closed when processing ends, including on error, so the
+    output is flushed to disk.
+    """
+    marc_in = open(options['input'], 'rb')
+    try:
+        marc_out = open(options['output'], 'wb')
+        try:
+            reader = pymarc.MARCReader(marc_in)
+            writer = pymarc.MARCWriter(marc_out)
+            cnt = 0
+            for record in reader:
+                cnt = cnt + 1
+                first_856 = record.get_fields('856')[:1]
+                if not (first_856 and first_856[0].get_subfields('u')):
+                    print("* No 856 for record # %s" % (cnt))
+                writer.write(_rebuild_record(record, options))
+        finally:
+            marc_out.close()
+    finally:
+        marc_in.close()
+def process_urls(field, options):
+    """Creates 856 fields required by Conifer
+
+    ``field`` is the 856 field taken from the source record. One new 856
+    field (indicators 4 and 0) is built for each library named in
+    options['libraries'], using that library's settings from
+    options['settings']:
+
+      * $u - the library's proxy prefix followed by the first $u of ``field``
+      * $y - the library's link text
+      * $z - the library's public note
+      * $9 - the library's code
+
+    Returns the list of new fields, in library-name order. The list is empty
+    (and a warning is printed) when ``field`` has no $u subfield.
+    """
+    # get_subfields() returns a list, so a missing $u shows up as an empty
+    # list instead of depending on what field['u'] does for an absent code.
+    urls = field.get_subfields('u')
+    if not urls:
+        print("* No subfield 'u' found in this 856")
+        return []
+    url = urls[0]
+
+    new_fields = []
+    # Sorted so the generated 856 fields come out in the same order each run.
+    for lib in sorted(options['libraries']):
+        data = options['settings'].get_settings(lib)
+        new_fields.append(pymarc.Field(
+            tag = '856',
+            indicators = ['4', '0'],
+            subfields = [
+                'u', data['proxy'] + url,
+                'y', data['link_text'],
+                'z', data['public_note'],
+                '9', data['code']
+            ]
+        ))
+    return new_fields
+if __name__ == '__main__':
+    # Script entry point: validate the command line, then convert the records.
+    options = parse_opts()
+    process_records(options)
+## Okay, made it through the basic invocation requirements; moving on
+#For each MARC record:
+#Find the 856 u ($url)
+#for each institution:
+#create a new 856 40
+#if $url =~ /\.ebrary\./, then:
+#$url =~ s/^.*?id=(\d+)\s*$/$1/
+#$url = http://site.ebrary.com/lib/ + institution.ebrary_code + "/Doc?id=" + $url
+#$url = institution.proxy + $url
+#$u = $url
+#$y = institution.link_text
+#$z = institution.public_note
+#$9 = institution.code

More information about the open-ils-commits mailing list