[open-ils-commits] r7699 - in branches/rel_1_2/Open-ILS/src/extras: . import

svn at svn.open-ils.org svn at svn.open-ils.org
Fri Aug 17 13:43:19 EDT 2007


Author: miker
Date: 2007-08-17 13:39:06 -0400 (Fri, 17 Aug 2007)
New Revision: 7699

Modified:
   branches/rel_1_2/Open-ILS/src/extras/import/marc2are.pl
   branches/rel_1_2/Open-ILS/src/extras/import/marc2bre.pl
   branches/rel_1_2/Open-ILS/src/extras/marc2html
Log:
removing control characters from XML with reckless abandon

Modified: branches/rel_1_2/Open-ILS/src/extras/import/marc2are.pl
===================================================================
--- branches/rel_1_2/Open-ILS/src/extras/import/marc2are.pl	2007-08-17 17:37:42 UTC (rev 7698)
+++ branches/rel_1_2/Open-ILS/src/extras/import/marc2are.pl	2007-08-17 17:39:06 UTC (rev 7699)
@@ -66,6 +66,7 @@
 	$xml =~ s/>\s+</></go;
 	$xml =~ s/\p{Cc}//go;
 	$xml = entityize($xml);
+	$xml =~ s/[\x00-\x1f]//go;
 
 	my $bib = new Fieldmapper::authority::record_entry;
 	$bib->id($id);

Modified: branches/rel_1_2/Open-ILS/src/extras/import/marc2bre.pl
===================================================================
--- branches/rel_1_2/Open-ILS/src/extras/import/marc2bre.pl	2007-08-17 17:37:42 UTC (rev 7698)
+++ branches/rel_1_2/Open-ILS/src/extras/import/marc2bre.pl	2007-08-17 17:39:06 UTC (rev 7699)
@@ -158,6 +158,7 @@
 	$xml =~ s/>\s+</></go;
 	$xml =~ s/\p{Cc}//go;
 	$xml = entityize($xml);
+	$xml =~ s/[\x00-\x1f]//go;
 
 	my $bib = new Fieldmapper::biblio::record_entry;
 	$bib->id($id);

Modified: branches/rel_1_2/Open-ILS/src/extras/marc2html
===================================================================
--- branches/rel_1_2/Open-ILS/src/extras/marc2html	2007-08-17 17:37:42 UTC (rev 7698)
+++ branches/rel_1_2/Open-ILS/src/extras/marc2html	2007-08-17 17:39:06 UTC (rev 7699)
@@ -1,19 +1,21 @@
 #!/usr/bin/perl
 
-use Error;
+use Error qw/:try/;
 use MARC::Batch;
 use MARC::File::XML;
 use XML::LibXSLT;
 use XML::LibXML;
 use Unicode::Normalize;
 use Getopt::Long;
+use FileHandle;
 
-my ($split,$enc,$marc,$out) = (100);
+my ($split,$enc,$marc,$out,$bad) = (100);
 GetOptions(
 	'split=i' => \$split,
 	'marc=s'  => \$marc,
 	'encoding=s'  => \$enc,
 	'out_dir=s'  => \$out,
+	'bad=s'  => \$bad,
 );
 
 if ($enc) {
@@ -31,6 +33,7 @@
 
 $stylesheet = $xslt->parse_stylesheet( $parser->parse_string($xsl) );
 
+$bad = new FileHandle( $bad => '>:raw' ) if ($bad);
 
 my $xml = '';
 my $current = 1;
@@ -42,8 +45,21 @@
 $marc->warnings_off;
 
 while (my $r = $marc->next) {
-	$xml .= entityize(MARC::File::XML::record($r));
+	my $rxml = entityize(MARC::File::XML::record($r));
+	$rxml =~ s/[\x00-\x1f]//go;
 
+	try { $doc = $parser->parse_string($rxml); }
+	catch Error with {
+		my $e = shift;
+		warn "arg ... bad record $current, skipping: $e\n";
+		$current++;
+		print $bad $r->as_usmarc if ($bad);
+		$r = undef;
+	};
+	next unless ($r);
+
+	$xml .= $rxml;
+
 	unless ($current % $split) {
 		$xml = <<"		XML";
 			<collection xmlns="http://www.loc.gov/MARC21/slim">
@@ -51,7 +67,11 @@
 			</collection>
 		XML
 
-		my $doc = $parser->parse_string($xml);
+		my $doc;
+		try { $doc = $parser->parse_string($xml); }
+		catch Error with { my $e = shift; warn "ARG! Doc failed to parse:\n$e\n-------------------------------------------\n$xml\n"; };
+		die unless $doc;
+
 		$xml = '';
 
 		my $results = $stylesheet->transform($doc, prev => "'$prev'", next => "'$next'");



More information about the open-ils-commits mailing list