[open-ils-commits] r7698 - in trunk/Open-ILS/src/extras: . import
svn at svn.open-ils.org
svn at svn.open-ils.org
Fri Aug 17 13:42:03 EDT 2007
Author: miker
Date: 2007-08-17 13:37:42 -0400 (Fri, 17 Aug 2007)
New Revision: 7698
Modified:
trunk/Open-ILS/src/extras/import/marc2are.pl
trunk/Open-ILS/src/extras/import/marc2bre.pl
trunk/Open-ILS/src/extras/marc2html
Log:
removing control characters from XML with reckless abandon
Modified: trunk/Open-ILS/src/extras/import/marc2are.pl
===================================================================
--- trunk/Open-ILS/src/extras/import/marc2are.pl 2007-08-16 21:06:21 UTC (rev 7697)
+++ trunk/Open-ILS/src/extras/import/marc2are.pl 2007-08-17 17:37:42 UTC (rev 7698)
@@ -67,6 +67,7 @@
$xml =~ s/>\s+</></go;
$xml =~ s/\p{Cc}//go;
$xml = entityize($xml);
+ $xml =~ s/[\x00-\x1f]//go;
my $bib = new Fieldmapper::authority::record_entry;
$bib->id($id);
Modified: trunk/Open-ILS/src/extras/import/marc2bre.pl
===================================================================
--- trunk/Open-ILS/src/extras/import/marc2bre.pl 2007-08-16 21:06:21 UTC (rev 7697)
+++ trunk/Open-ILS/src/extras/import/marc2bre.pl 2007-08-17 17:37:42 UTC (rev 7698)
@@ -159,6 +159,7 @@
$xml =~ s/>\s+</></go;
$xml =~ s/\p{Cc}//go;
$xml = entityize($xml);
+ $xml =~ s/[\x00-\x1f]//go;
my $bib = new Fieldmapper::biblio::record_entry;
$bib->id($id);
Modified: trunk/Open-ILS/src/extras/marc2html
===================================================================
--- trunk/Open-ILS/src/extras/marc2html 2007-08-16 21:06:21 UTC (rev 7697)
+++ trunk/Open-ILS/src/extras/marc2html 2007-08-17 17:37:42 UTC (rev 7698)
@@ -1,19 +1,21 @@
#!/usr/bin/perl
-use Error;
+use Error qw/:try/;
use MARC::Batch;
use MARC::File::XML;
use XML::LibXSLT;
use XML::LibXML;
use Unicode::Normalize;
use Getopt::Long;
+use FileHandle;
-my ($split,$enc,$marc,$out) = (100);
+my ($split,$enc,$marc,$out,$bad) = (100);
GetOptions(
'split=i' => \$split,
'marc=s' => \$marc,
'encoding=s' => \$enc,
'out_dir=s' => \$out,
+ 'bad=s' => \$bad,
);
if ($enc) {
@@ -31,6 +33,7 @@
$stylesheet = $xslt->parse_stylesheet( $parser->parse_string($xsl) );
+$bad = new FileHandle( $bad => '>:raw' ) if ($bad);
my $xml = '';
my $current = 1;
@@ -42,8 +45,21 @@
$marc->warnings_off;
while (my $r = $marc->next) {
- $xml .= entityize(MARC::File::XML::record($r));
+ my $rxml = entityize(MARC::File::XML::record($r));
+ $rxml =~ s/[\x00-\x1f]//go;
+ try { $doc = $parser->parse_string($rxml); }
+ catch Error with {
+ my $e = shift;
+ warn "arg ... bad record $current, skipping: $e\n";
+ $current++;
+ print $bad $r->as_usmarc if ($bad);
+ $r = undef;
+ };
+ next unless ($r);
+
+ $xml .= $rxml;
+
unless ($current % $split) {
$xml = <<" XML";
<collection xmlns="http://www.loc.gov/MARC21/slim">
@@ -51,7 +67,11 @@
</collection>
XML
- my $doc = $parser->parse_string($xml);
+ my $doc;
+ try { $doc = $parser->parse_string($xml); }
+ catch Error with { my $e = shift; warn "ARG! Doc failed to parse:\n$e\n-------------------------------------------\n$xml\n"; };
+ die unless $doc;
+
$xml = '';
my $results = $stylesheet->transform($doc, prev => "'$prev'", next => "'$next'");
More information about the open-ils-commits mailing list