[open-ils-commits] r561 - conifer/trunk/tools/migration-scripts (dbs)

svn at svn.open-ils.org svn at svn.open-ils.org
Thu Jul 2 23:42:40 EDT 2009


Author: dbs
Date: 2009-07-02 23:42:38 -0400 (Thu, 02 Jul 2009)
New Revision: 561

Added:
   conifer/trunk/tools/migration-scripts/fix_windsors_diacritics.pl
Log:
Migration guy - clean up thine own mess


Added: conifer/trunk/tools/migration-scripts/fix_windsors_diacritics.pl
===================================================================
--- conifer/trunk/tools/migration-scripts/fix_windsors_diacritics.pl	                        (rev 0)
+++ conifer/trunk/tools/migration-scripts/fix_windsors_diacritics.pl	2009-07-03 03:42:38 UTC (rev 561)
@@ -0,0 +1,166 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+# Let's say you accidentally stripped the diacritics from, oh, 189,000 records during a migration.
+# If you still have the original records, you might want to use a script like this to load
+# them back into the database.
+
+require 'oils_header.pl';
+use Error qw/:try/;
+use Digest::MD5 qw/md5_hex/;
+use OpenSRF::Utils::JSON;
+use OpenILS::Application::AppUtils;
+use Unicode::Normalize;
+use Encode;
+
+use FileHandle;
+use Time::HiRes qw/time/;
+use Getopt::Long;
+use MARC::Batch;
+use MARC::File::XML ( BinaryEncoding => 'utf-8' );
+use MARC::Charset;
+use DBI;
+
+my ($marcfile, $marctype, $enc, $config, $username, $password) = ('/openils/migration/windsor/bib20090430.mrc', 'USMARC', 'UTF8', '/openils/conf/opensrf_core.xml');
+
+GetOptions(
+	'encoding=s'	=> \$enc, # set assumed MARC encoding for MARC::Charset
+	'config=s'	=> \$config, # location of OpenSRF core config file, defaults to /openils/conf/opensrf_core.xml
+	"username=s"	=> \$username, # EG username
+	"password=s"	=> \$password, # EG password
+);
+
+if ($enc) {
+	MARC::Charset->ignore_errors(1);
+	MARC::Charset->assume_encoding($enc);
+}
+
+OpenSRF::System->bootstrap_client( config_file => $config );
+
+# Login to Evergreen and get an authentication token
+my $auth = oils_login($username, $password);
+if (!$auth) {
+	die "Could not retrieve an authentication token";
+}
+
+select STDERR; $| = 1;
+select STDOUT; $| = 1;
+binmode STDOUT, ":utf8";
+
+my $batch = new MARC::Batch ( $marctype, $marcfile );
+$batch->strict_off();
+$batch->warnings_off();
+
+my $starttime = time;
+my $rec;
+my $count = 0;
+my $recs = 0;
+PROCESS: while ( try { $rec = $batch->next } otherwise { $rec = -1 } ) {
+	next if ($rec == -1);
+
+	$count++;
+
+	if ($rec->as_formatted =~ m/[^\x00-\x7f]/) {
+		$rec_count++;
+		print "$rec_count of $count\n";
+		update_id_field(\$rec);
+		fix_URIs(\$rec);
+		update_marc(\$rec);
+
+		# Exit nice and early so that we don't wander off and update a whole batch without testing
+		if ($rec_count > 0) {
+			exit;
+		}
+	}
+	
+}
+
+# Set the 001 and 901 to our record ID in Conifer
+# Windsor records are offset by 1 million from their legacy ID
+sub update_id_field {
+	my $rec = shift;
+
+	my $tcn = $$rec->field('001');
+	my $rec_id = $tcn->data + 1000000;
+	$tcn->update($rec_id);
+	my $id_field = MARC::Field->new('901', '', '', 'a' => $rec_id, 'b' => 'Unknown', 'c' => $rec_id);
+	$$rec->append_fields($id_field);
+}
+
+sub fix_URIs {
+	my $marc = shift;
+
+	my @uri_fields = $$marc->field('856');
+	foreach my $uri (@uri_fields) {
+		my ($orgunit);
+
+		# There's no way we should have multiples, but let's iterate anyway
+		my @urls = $uri->subfield('u');
+
+		foreach my $url (@urls) {
+			# For general use we should factor these out to a hash. Oh well.
+
+			# We're filtering by proxy address, because theoretically anything
+			# that is not proxied is open to the world to access and doesn't
+			# need to be treated as a URI particular to that org_unit
+			if ($url =~ m/librweb.laurentian.ca/o) {
+				$orgunit = 'OSUL';
+			} elsif ($url =~ m/libproxy.auc.ca/o) {
+				$orgunit = 'OSTMA';
+			} elsif ($url =~ m/normedproxy.lakeheadu.ca/o) {
+				$orgunit = 'OSM';
+			} elsif ($url =~ m/ezproxy.uwindsor.ca/o or $url =~ m/webvoy.uwindsor.ca/o ) {
+				$orgunit = 'OWA';
+			}
+
+			if ($orgunit) {
+				my $clean_url = $url;
+				$clean_url =~ s/^\s*(.*?)\s*$/$1/o;
+				if ($url ne $clean_url) {
+					$uri->update(u => $clean_url);
+				}
+
+				my $ind1 = $uri->indicator(1);
+				if ($ind1 and $ind1 ne '1' and $ind1 ne '4') {
+					$uri->update(ind1 => '4');
+				}
+
+				my $ind2 = $uri->indicator(2);
+				if ($ind2 and $ind2 ne '0' and $ind2 ne '1') {
+					$uri->update(ind2 => '1');
+				}
+
+				# Risking that we only have one subfield 9 here
+				# Should be a slight risk as it's not defined in the spec
+				my $aou = $uri->subfield('9');
+				if (!$aou or $aou ne $orgunit) {
+					$uri->update(9 => $orgunit);
+				}
+			}
+		}
+	}
+}
+
+sub update_marc {
+	my $rec = shift;
+
+	# Borrowed from marc2bre.pl to get clean XML
+	(my $xml = $$rec->as_xml_record()) =~ s/\n//sog;
+	$xml =~ s/^<\?xml.+\?\s*>//go;
+	$xml =~ s/>\s+</></go;
+	$xml =~ s/\p{Cc}//go;
+	$xml = OpenILS::Application::AppUtils->entityize($xml);
+	$xml =~ s/[\x00-\x1f]//go;
+
+	# Update and ingest this puppy
+	my $update = OpenILS::Application::AppUtils->simplereq('open-ils.cat', 
+		'open-ils.cat.biblio.record.xml.update', 
+		($auth, int($$rec->field('001')->data), $xml)
+	);
+
+	# Return the cleaned-up XML in case we want to inspect it
+	return $xml;
+}
+
+



More information about the open-ils-commits mailing list