[open-ils-commits] r340 - conifer/trunk/tools/migration-scripts (dbs)

svn at svn.open-ils.org svn at svn.open-ils.org
Thu Apr 16 14:33:46 EDT 2009


Author: dbs
Date: 2009-04-16 14:33:42 -0400 (Thu, 16 Apr 2009)
New Revision: 340

Modified:
   conifer/trunk/tools/migration-scripts/fixURIs.pl
Log:
Clean up:
  * allow command line specification of input/output files
  * provide more granular feedback on breakdown of what got touched


Modified: conifer/trunk/tools/migration-scripts/fixURIs.pl
===================================================================
--- conifer/trunk/tools/migration-scripts/fixURIs.pl	2009-04-16 18:07:04 UTC (rev 339)
+++ conifer/trunk/tools/migration-scripts/fixURIs.pl	2009-04-16 18:33:42 UTC (rev 340)
@@ -1,68 +1,99 @@
 #!/usr/bin/perl -w
 use strict;
+use Getopt::Long;
 use MARC::File::XML( BinaryEncoding => 'utf8', RecordFormat => 'USMARC' );
 
-# Clean up URIs prior to batch ingest
+# Clean up URIs from MARCXML records prior to batch ingest
 #   * If we detect a proxy URL:
 #     * Ensure ind1 = 4
 #     * Ensure ind2 = 2
 #     * Ensure $9 = aou.shortname
 #   * Trim whitespace and other tweaks while we're at it?
 
-my $input = MARC::File::XML->in( shift );
-my $output = MARC::File::XML->out( 'bibs_edited.xml' );
+my ($input_file, $output_file);
+GetOptions(
+    'input=s' => \$input_file,
+    'output=s' => \$output_file
+);
 
-my $touched = 0;
+if (!$input_file or !$output_file) {  # both options are mandatory
+    print STDERR "Please specify the following options:\n";
+    print STDERR "\t--input : input file of MARCXML records\n";
+    print STDERR "\t--output : output file of processed MARCXML records\n";
+    exit(1);  # non-zero status so calling scripts can detect the usage error
+}
+
+my $input = MARC::File::XML->in( $input_file );
+my $output = MARC::File::XML->out( $output_file );
+
+my ($touched, $url_cnt, $ind1_cnt, $ind2_cnt, $sub9_cnt) = (0, 0, 0, 0, 0);
 while (my $marc = $input->next()) {
-	my $edited = 0;
-	my @uri_fields = $marc->field('856');
-	foreach my $uri (@uri_fields) {
-		my ($orgunit);
+    my $edited = 0;
+    my @uri_fields = $marc->field('856');
+    foreach my $uri (@uri_fields) {
+        my ($orgunit);
 
-		# There's no way we should have multiples, but let's iterate anyway
-		my @urls = $uri->subfield('u');
+        # There's no way we should have multiples, but let's iterate anyway
+        my @urls = $uri->subfield('u');
 
-		foreach my $url (@urls) {
-			if ($url =~ m/librweb.laurentian.ca/o) {
-				$orgunit = 'OSUL';
-			} elsif ($url =~ m/libproxy.auc.ca/o) {
-				$orgunit = 'OSTMA';
-			} elsif ($url =~ m/normedproxy.lakeheadu.ca/o) {
-				$orgunit = 'OSM';
-			}
+        foreach my $url (@urls) {
+            # For general use we should factor these out to a hash. Oh well.
 
-			if ($orgunit) {
-				my $clean_url = $url;
-				$clean_url =~ s/^\s*(.*?)\s*$/$1/o;
-				if ($url ne $clean_url) {
-					$uri->update(u => $clean_url);
-					$edited++;
-				}
+            # Only proxied URLs are treated as org_unit-specific; anything that
+            # is not proxied is theoretically open to the world. Dots are
+            # escaped so each pattern matches only the literal proxy hostname.
+            if ($url =~ m/librweb\.laurentian\.ca/) {
+                $orgunit = 'OSUL';
+            } elsif ($url =~ m/libproxy\.auc\.ca/) {
+                $orgunit = 'OSTMA';
+            } elsif ($url =~ m/normedproxy\.lakeheadu\.ca/) {
+                $orgunit = 'OSM';
+            }
 
-				my $ind1 = $uri->indicator(1);
-				if ($ind1 and $ind1 ne '1' and $ind1 ne '4') {
-					$uri->update(ind1 => '4');
-					$edited++;
-				}
+            if ($orgunit) {
+                my $clean_url = $url;
+                $clean_url =~ s/^\s*(.*?)\s*$/$1/o;
+                if ($url ne $clean_url) {
+                    $uri->update(u => $clean_url);
+                    $edited++;
+                    $url_cnt++;
+                }
 
-				my $ind2 = $uri->indicator(2);
-				if ($ind2 and $ind2 ne '0' and $ind2 ne '1') {
-					$uri->update(ind2 => '1');
-					$edited++;
-				}
+                my $ind1 = $uri->indicator(1);
+                if ($ind1 and $ind1 ne '1' and $ind1 ne '4') {
+                    $uri->update(ind1 => '4');
+                    $edited++;
+                    $ind1_cnt++;
+                }
 
-				# Risking that we only have one subfield 9 here
-				my $aou = $uri->subfield('9');
-				if (!$aou or $aou ne $orgunit) {
-					$uri->update(9 => $orgunit);
-					$edited++;
-				}
-			}
-		}
-	}
-	if ($edited) {
-		$touched++;
-	}
-	$output->write($marc);
+                my $ind2 = $uri->indicator(2);
+                if ($ind2 and $ind2 ne '0' and $ind2 ne '1') {
+                    $uri->update(ind2 => '1');
+                    $edited++;
+                    $ind2_cnt++;
+                }
+
+                # Risking that we only have one subfield 9 here
+                # Should be a slight risk as it's not defined in the spec
+                my $aou = $uri->subfield('9');
+                if (!$aou or $aou ne $orgunit) {
+                    $uri->update(9 => $orgunit);
+                    $edited++;
+                    $sub9_cnt++;
+                }
+            }
+        }
+    }
+    if ($edited) {
+        $touched++;
+    }
+    $output->write($marc);
 }
 $output->close();
+print "Touched $touched records to fix URIs.\n",  # one-shot counter summary
+    "\t$url_cnt URLs were touched\n",
+    "\t$ind1_cnt indicator 1 values were touched\n",
+    "\t$ind2_cnt indicator 2 values were touched\n",
+    "\t$sub9_cnt subfield '9' values were touched\n";
+
+# vim: et:ts=4:sw=4:



More information about the open-ils-commits mailing list