[open-ils-commits] r561 - conifer/trunk/tools/migration-scripts (dbs)
svn at svn.open-ils.org
svn at svn.open-ils.org
Thu Jul 2 23:42:40 EDT 2009
Author: dbs
Date: 2009-07-02 23:42:38 -0400 (Thu, 02 Jul 2009)
New Revision: 561
Added:
conifer/trunk/tools/migration-scripts/fix_windsors_diacritics.pl
Log:
Migration guy - clean up thine own mess
Added: conifer/trunk/tools/migration-scripts/fix_windsors_diacritics.pl
===================================================================
--- conifer/trunk/tools/migration-scripts/fix_windsors_diacritics.pl (rev 0)
+++ conifer/trunk/tools/migration-scripts/fix_windsors_diacritics.pl 2009-07-03 03:42:38 UTC (rev 561)
@@ -0,0 +1,166 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+# Let's say you accidentally stripped the diacritics from, oh, 189,000 records during a migration.
+# If you still have the original records, you might want to use a script like this to load
+# them back into the database.
+
+require 'oils_header.pl';
+use Error qw/:try/;
+use Digest::MD5 qw/md5_hex/;
+use OpenSRF::Utils::JSON;
+use OpenILS::Application::AppUtils;
+use Unicode::Normalize;
+use Encode;
+
+use FileHandle;
+use Time::HiRes qw/time/;
+use Getopt::Long;
+use MARC::Batch;
+use MARC::File::XML ( BinaryEncoding => 'utf-8' );
+use MARC::Charset;
+use DBI;
+
+my ($marcfile, $marctype, $enc, $config, $username, $password) = ('/openils/migration/windsor/bib20090430.mrc', 'USMARC', 'UTF8', '/openils/conf/opensrf_core.xml');
+
+GetOptions(
+ 'encoding=s' => \$enc, # set assumed MARC encoding for MARC::Charset
+ 'config=s' => \$config, # location of OpenSRF core config file, defaults to /openils/conf/opensrf_core.xml
+ "username=s" => \$username, # EG username
+ "password=s" => \$password, # EG password
+);
+
+if ($enc) {
+ MARC::Charset->ignore_errors(1);
+ MARC::Charset->assume_encoding($enc);
+}
+
+OpenSRF::System->bootstrap_client( config_file => $config );
+
+# Login to Evergreen and get an authentication token
+my $auth = oils_login($username, $password);
+if (!$auth) {
+ die "Could not retrieve an authentication token";
+}
+
+select STDERR; $| = 1;
+select STDOUT; $| = 1;
+binmode STDOUT, ":utf8";
+
+my $batch = new MARC::Batch ( $marctype, $marcfile );
+$batch->strict_off();
+$batch->warnings_off();
+
+my $starttime = time;
+my $rec;
+my $count = 0;
+my $recs = 0;
+PROCESS: while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) {
+ next if ($rec == -1);
+
+ $count++;
+
+ if ($rec->as_formatted =~ m/[^\x00-\x7f]/) {
+ $rec_count++;
+ print "$rec_count of $count\n";
+ update_id_field(\$rec);
+ fix_URIs(\$rec);
+ update_marc(\$rec);
+
+ # Exit nice and early so that we don't wander off and update a whole batch without testing
+ if ($rec_count > 0) {
+ exit;
+ }
+ }
+
+}
+
+# Set the 001 and 901 to our record ID in Conifer
+# Windsor records are offset by 1 million from their legacy ID
+sub update_id_field {
+ my $rec = shift;
+
+ my $tcn = $$rec->field('001');
+ my $rec_id = $tcn->data + 1000000;
+ $tcn->update($rec_id);
+ my $id_field = MARC::Field->new('901', '', '', 'a' => $rec_id, 'b' => 'Unknown', 'c' => $rec_id);
+ $$rec->append_fields($id_field);
+}
+
+sub fix_URIs {
+ my $marc = shift;
+
+ my @uri_fields = $$marc->field('856');
+ foreach my $uri (@uri_fields) {
+ my ($orgunit);
+
+ # There's no way we should have multiples, but let's iterate anyway
+ my @urls = $uri->subfield('u');
+
+ foreach my $url (@urls) {
+ # For general use we should factor these out to a hash. Oh well.
+
+ # We're filtering by proxy address, because theoretically anything
+ # that is not proxied is open to the world to access and doesn't
+ # need to be treated as a URI particular to that org_unit
+ if ($url =~ m/librweb.laurentian.ca/o) {
+ $orgunit = 'OSUL';
+ } elsif ($url =~ m/libproxy.auc.ca/o) {
+ $orgunit = 'OSTMA';
+ } elsif ($url =~ m/normedproxy.lakeheadu.ca/o) {
+ $orgunit = 'OSM';
+ } elsif ($url =~ m/ezproxy.uwindsor.ca/o or $url =~ m/webvoy.uwindsor.ca/o ) {
+ $orgunit = 'OWA';
+ }
+
+ if ($orgunit) {
+ my $clean_url = $url;
+ $clean_url =~ s/^\s*(.*?)\s*$/$1/o;
+ if ($url ne $clean_url) {
+ $uri->update(u => $clean_url);
+ }
+
+ my $ind1 = $uri->indicator(1);
+ if ($ind1 and $ind1 ne '1' and $ind1 ne '4') {
+ $uri->update(ind1 => '4');
+ }
+
+ my $ind2 = $uri->indicator(2);
+ if ($ind2 and $ind2 ne '0' and $ind2 ne '1') {
+ $uri->update(ind2 => '1');
+ }
+
+ # Risking that we only have one subfield 9 here
+ # Should be a slight risk as it's not defined in the spec
+ my $aou = $uri->subfield('9');
+ if (!$aou or $aou ne $orgunit) {
+ $uri->update(9 => $orgunit);
+ }
+ }
+ }
+ }
+}
+
+sub update_marc {
+ my $rec = shift;
+
+ # Borrowed from marc2bre.pl to get clean XML
+ (my $xml = $$rec->as_xml_record()) =~ s/\n//sog;
+ $xml =~ s/^<\?xml.+\?\s*>//go;
+ $xml =~ s/>\s+</></go;
+ $xml =~ s/\p{Cc}//go;
+ $xml = OpenILS::Application::AppUtils->entityize($xml);
+ $xml =~ s/[\x00-\x1f]//go;
+
+ # Update and ingest this puppy
+ my $update = OpenILS::Application::AppUtils->simplereq('open-ils.cat',
+ 'open-ils.cat.biblio.record.xml.update',
+ ($auth, int($$rec->field('001')->data), $xml)
+ );
+
+ # Return the cleaned-up XML in case we want to inspect it
+ return $xml;
+}
+
+
More information about the open-ils-commits
mailing list