[open-ils-commits] r340 - conifer/trunk/tools/migration-scripts (dbs)
svn at svn.open-ils.org
svn at svn.open-ils.org
Thu Apr 16 14:33:46 EDT 2009
Author: dbs
Date: 2009-04-16 14:33:42 -0400 (Thu, 16 Apr 2009)
New Revision: 340
Modified:
conifer/trunk/tools/migration-scripts/fixURIs.pl
Log:
Clean up:
* allow command line specification of input/output files
* provide more granular feedback on breakdown of what got touched
Modified: conifer/trunk/tools/migration-scripts/fixURIs.pl
===================================================================
--- conifer/trunk/tools/migration-scripts/fixURIs.pl 2009-04-16 18:07:04 UTC (rev 339)
+++ conifer/trunk/tools/migration-scripts/fixURIs.pl 2009-04-16 18:33:42 UTC (rev 340)
@@ -1,68 +1,99 @@
#!/usr/bin/perl -w
use strict;
+use Getopt::Long;
use MARC::File::XML( BinaryEncoding => 'utf8', RecordFormat => 'USMARC' );
-# Clean up URIs prior to batch ingest
+# Clean up URIs from MARCXML records prior to batch ingest
# * If we detect a proxy URL:
# * Ensure ind1 = 4
# * Ensure ind2 = 2
# * Ensure $9 = aou.shortname
# * Trim whitespace and other tweaks while we're at it?
-my $input = MARC::File::XML->in( shift );
-my $output = MARC::File::XML->out( 'bibs_edited.xml' );
+my ($input_file, $output_file);
+GetOptions(
+ 'input=s' => \$input_file,
+ 'output=s' => \$output_file
+);
-my $touched = 0;
+if (!$input_file or !$output_file) {
+ print("Please specify the following options:\n");
+ print("\t--input : input file of MARCXML records\n");
+ print("\t--output : output file of processed MARCXML records\n");
+ exit();
+}
+
+my $input = MARC::File::XML->in( $input_file );
+my $output = MARC::File::XML->out( $output_file );
+
+my ($touched, $url_cnt, $ind1_cnt, $ind2_cnt, $sub9_cnt) = (0, 0, 0, 0, 0);
while (my $marc = $input->next()) {
- my $edited = 0;
- my @uri_fields = $marc->field('856');
- foreach my $uri (@uri_fields) {
- my ($orgunit);
+ my $edited = 0;
+ my @uri_fields = $marc->field('856');
+ foreach my $uri (@uri_fields) {
+ my ($orgunit);
- # There's no way we should have multiples, but let's iterate anyway
- my @urls = $uri->subfield('u');
+ # There's no way we should have multiples, but let's iterate anyway
+ my @urls = $uri->subfield('u');
- foreach my $url (@urls) {
- if ($url =~ m/librweb.laurentian.ca/o) {
- $orgunit = 'OSUL';
- } elsif ($url =~ m/libproxy.auc.ca/o) {
- $orgunit = 'OSTMA';
- } elsif ($url =~ m/normedproxy.lakeheadu.ca/o) {
- $orgunit = 'OSM';
- }
+ foreach my $url (@urls) {
+ # For general use we should factor these out to a hash. Oh well.
- if ($orgunit) {
- my $clean_url = $url;
- $clean_url =~ s/^\s*(.*?)\s*$/$1/o;
- if ($url ne $clean_url) {
- $uri->update(u => $clean_url);
- $edited++;
- }
+ # We're filtering by proxy address, because theoretically anything
+ # that is not proxied is open to the world to access and doesn't
+ # need to be treated as a URI particular to that org_unit
+ if ($url =~ m/librweb.laurentian.ca/o) {
+ $orgunit = 'OSUL';
+ } elsif ($url =~ m/libproxy.auc.ca/o) {
+ $orgunit = 'OSTMA';
+ } elsif ($url =~ m/normedproxy.lakeheadu.ca/o) {
+ $orgunit = 'OSM';
+ }
- my $ind1 = $uri->indicator(1);
- if ($ind1 and $ind1 ne '1' and $ind1 ne '4') {
- $uri->update(ind1 => '4');
- $edited++;
- }
+ if ($orgunit) {
+ my $clean_url = $url;
+ $clean_url =~ s/^\s*(.*?)\s*$/$1/o;
+ if ($url ne $clean_url) {
+ $uri->update(u => $clean_url);
+ $edited++;
+ $url_cnt++;
+ }
- my $ind2 = $uri->indicator(2);
- if ($ind2 and $ind2 ne '0' and $ind2 ne '1') {
- $uri->update(ind2 => '1');
- $edited++;
- }
+ my $ind1 = $uri->indicator(1);
+ if ($ind1 and $ind1 ne '1' and $ind1 ne '4') {
+ $uri->update(ind1 => '4');
+ $edited++;
+ $ind1_cnt++;
+ }
- # Risking that we only have one subfield 9 here
- my $aou = $uri->subfield('9');
- if (!$aou or $aou ne $orgunit) {
- $uri->update(9 => $orgunit);
- $edited++;
- }
- }
- }
- }
- if ($edited) {
- $touched++;
- }
- $output->write($marc);
+ my $ind2 = $uri->indicator(2);
+ if ($ind2 and $ind2 ne '0' and $ind2 ne '1') {
+ $uri->update(ind2 => '1');
+ $edited++;
+ $ind2_cnt++;
+ }
+
+ # Risking that we only have one subfield 9 here
+ # Should be a slight risk as it's not defined in the spec
+ my $aou = $uri->subfield('9');
+ if (!$aou or $aou ne $orgunit) {
+ $uri->update(9 => $orgunit);
+ $edited++;
+ $sub9_cnt++;
+ }
+ }
+ }
+ }
+ if ($edited) {
+ $touched++;
+ }
+ $output->write($marc);
}
$output->close();
+print "Touched $touched records to fix URIs.\n";
+print "\t$url_cnt URLs were touched\n";
+print "\t$ind1_cnt indicator 1 values were touched\n";
+print "\t$ind2_cnt indicator 2 values were touched\n";
+print "\t$sub9_cnt subfield '9' values were touched\n";
+
+# vim: et:ts=4:sw=4:
More information about the open-ils-commits
mailing list