[open-ils-commits] r358 - conifer/trunk/tools/migration-scripts (dbs)

svn at svn.open-ils.org svn at svn.open-ils.org
Sun Apr 19 17:21:03 EDT 2009


Author: dbs
Date: 2009-04-19 17:20:57 -0400 (Sun, 19 Apr 2009)
New Revision: 358

Modified:
   conifer/trunk/tools/migration-scripts/fix_bad_marcxml.pl
Log:
Fix a few common errors in yaz-marcdump's conversion to XML


Modified: conifer/trunk/tools/migration-scripts/fix_bad_marcxml.pl
===================================================================
--- conifer/trunk/tools/migration-scripts/fix_bad_marcxml.pl	2009-04-18 20:20:13 UTC (rev 357)
+++ conifer/trunk/tools/migration-scripts/fix_bad_marcxml.pl	2009-04-19 21:20:57 UTC (rev 358)
@@ -3,19 +3,20 @@
 use warnings;
 
 foreach my $file (@ARGV) {
-	clean_empty_datafields($file);
+	process_file($file);
 }
 
-sub clean_empty_datafields {
+sub process_file {
 	my $file = shift;
 
 	# Empty datafields anger MARC::File::XML
 	open(FH, '<', $file) or die $!;
 	open(CLEAN, '>', "$file.new");
 
-	my ($trim, $lastline) = (0, '');
+	my ($trim, $lastline, $lineno) = (0, '', 1);
 	while (<FH>) {
 		if ($_ =~ m#</datafield># and $lastline =~ m#<datafield#) {
+			print STDERR "Empty datafield at line $lineno of file $file\n";
 			$trim = 1;
 		} elsif ($trim) {
 			$trim = 0;
@@ -23,8 +24,25 @@
 			print CLEAN $lastline;
 			$trim = 0;
 		}
+
+		# Given questionable input, yaz-marcdump creates invalid XML like this:
+		#   <datafield tag="500" ind1=" " ind2=" ">
+		#    <subfield code="a">In subtitle "sports" appears as "</subfield>
+		#    <subfield code="p">ort</subfield>
+		#    <subfield code=""">.</subfield>
+		#  </datafield>
+		#
+		# This will at least enable MARC::File::XML to process it:
+		if ($_ =~ m#<subfield code=""">#o) {
+			print STDERR "Bad subfield code \" at line $lineno of file $file\n";
+			$_ =~ s{<subfield code=""">}{<subfield code="a">}o;
+		} elsif ($_ =~ m#<subfield code="<">#o) {
+			print STDERR "Bad subfield code < at line $lineno of file $file\n";
+			$_ =~ s{<subfield code="<">}{<subfield code="a">}o;
+		}
 		
 		$lastline = $_;
+		$lineno++;
 	}
 	print CLEAN $lastline;
 



More information about the open-ils-commits mailing list