[open-ils-commits] [GIT] Evergreen ILS branch master updated. c694929e4ba58ecf13e5903624ec6c784c37f439
Evergreen Git
git at git.evergreen-ils.org
Wed Mar 7 16:04:23 EST 2012
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Evergreen ILS".
The branch, master has been updated
via c694929e4ba58ecf13e5903624ec6c784c37f439 (commit)
via 2788298ec23d1caff3755f9c151d03510420651d (commit)
via d939d7d09f231319a59f7bc309b7e40c451f273e (commit)
from a0fdeb77d15a979d80d3f4ea2e83c3e46cfe4157 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit c694929e4ba58ecf13e5903624ec6c784c37f439
Author: Dan Scott <dscott at laurentian.ca>
Date: Sun Mar 4 03:00:49 2012 -0500
Decode the string to UTF8, always
Even if you know that the caller is passing you a decoded UTF8 string,
you can and should decode it yourself, because some day a caller isn't
going to decode it first and you're going to wind up in misery trying to
figure out why you're broken.
In this case, it resolves the mystery of why the unit tests failed when
Vandelay seemed to be ticking along fine. As the comment in clean_marc()
mentioned, "assume input is already in UTF8" - but as soon as it isn't,
boom.
Signed-off-by: Dan Scott <dscott at laurentian.ca>
Signed-off-by: Jason Stephenson <jstephenson at mvlc.org>
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
index 9035576..0823e56 100644
--- a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
@@ -111,12 +111,12 @@ sub _normalize_codes {
# Assumes input is already in UTF-8.
sub clean_marc {
my $input = shift;
- my $xml = (isa $input, 'MARC::Record') ? $input->as_xml_record() : $input;
+ my $xml = decode_utf8((isa $input, 'MARC::Record') ? $input->as_xml_record() : $input);
$xml =~ s/\n//sog;
$xml =~ s/^<\?xml.+\?\s*>//go;
$xml =~ s/>\s+</></go;
- $xml = OpenILS::Application::AppUtils->entityize($xml);
$xml =~ s/\p{Cc}//go;
+ $xml = OpenILS::Application::AppUtils->entityize($xml);
$xml =~ s/[\x00-\x1f]//go;
return $xml;
}
commit 2788298ec23d1caff3755f9c151d03510420651d
Author: Dan Scott <dscott at laurentian.ca>
Date: Sun Mar 4 02:41:11 2012 -0500
Fix Unicode mangling in clean_marc function
Calling s/\p{Cc}//go; before entityize() was resulting in all xFFFD
entities being returned for the upper case diacritic characters, which
in turn caused the new unit test to fail (yay unit tests). I added a
corresponding unit test for entityize() to ensure that the problem
wasn't coming from that function. Switching the order of the \p{Cc}
regex and entityize() calls resolved the corruption in the unit test.
This suggests that Vandelay may be introducing significant corruption to
imported records and that backporting of this commit to the inline
Vandelay variants from previous releases may be warranted.
Signed-off-by: Dan Scott <dscott at laurentian.ca>
Signed-off-by: Jason Stephenson <jstephenson at mvlc.org>
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
index 9ddca6e..9035576 100644
--- a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
@@ -115,8 +115,8 @@ sub clean_marc {
$xml =~ s/\n//sog;
$xml =~ s/^<\?xml.+\?\s*>//go;
$xml =~ s/>\s+</></go;
- $xml =~ s/\p{Cc}//go;
$xml = OpenILS::Application::AppUtils->entityize($xml);
+ $xml =~ s/\p{Cc}//go;
$xml =~ s/[\x00-\x1f]//go;
return $xml;
}
diff --git a/Open-ILS/src/perlmods/t/01-OpenILS-Application.t b/Open-ILS/src/perlmods/t/01-OpenILS-Application.t
index 06f3ad4..a4367f6 100644
--- a/Open-ILS/src/perlmods/t/01-OpenILS-Application.t
+++ b/Open-ILS/src/perlmods/t/01-OpenILS-Application.t
@@ -1,6 +1,6 @@
#!perl -T
-use Test::More tests => 13;
+use Test::More tests => 14;
BEGIN {
use_ok( 'OpenILS::Application' );
@@ -18,3 +18,9 @@ use_ok( 'OpenILS::Application::ResolverResolver' );
use_ok( 'OpenILS::Application::Serial' );
use_ok( 'OpenILS::Application::SuperCat' );
use_ok( 'OpenILS::Application::Vandelay' );
+
+is(
+ OpenILS::Application::AppUtils::entityize(0, 'èöçÇÈÀ'),
+ 'èöçÇÈÀ',
+ 'entityize: diacritics'
+);
diff --git a/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t b/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t
index 924e2a3..9878956 100644
--- a/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t
+++ b/Open-ILS/src/perlmods/t/14-OpenILS-Utils.t
@@ -1,6 +1,6 @@
#!perl -T
-use Test::More tests => 22;
+use Test::More tests => 24;
use_ok( 'OpenILS::Utils::Configure' );
use_ok( 'OpenILS::Utils::Cronscript' );
@@ -43,3 +43,40 @@ is($apostring, "its time", "naco_normalize: strip apostrophes");
my $apos = OpenILS::Utils::Normalize::search_normalize("it's time");
is($apos, "it s time", "search_normalize: replace apostrophes with space");
+
+my $raw_marcxml = <<RAWMARC;
+<?xml version="1.0" encoding="utf-8"?>
+<record>
+ <leader>01614nmm a22003975u 4500</leader>
+ <controlfield tag="001">978-0-387-35767-6</controlfield>
+ <controlfield tag="003">Springer</controlfield>
+ <controlfield tag="005">20071022150035.8</controlfield>
+ <controlfield tag="007">cr nn 008mamaa</controlfield>
+ <controlfield tag="008">071022s2008 xx j eng d</controlfield>
+ <datafield tag="020" ind1=" " ind2=" ">
+ <subfield code="a">9780387685748</subfield>
+ </datafield>
+ <datafield tag="100" ind1="1" ind2=" ">
+ <subfield code="a">Neteler, Markus.</subfield>
+ </datafield>
+ <datafield tag="245" ind1="1" ind2="0">
+ <subfield code="a">Open Source GIS</subfield>
+ <subfield code="h">[electronic resource] :</subfield>
+ <subfield code="b">A GRASS GIS Approach /</subfield>
+ <subfield code="c">edited by Markus Neteler, Helena Mitasova.</subfield>
+ </datafield>
+ <datafield tag="250" ind1=" " ind2=" ">
+ <subfield code="a">Third Edition.</subfield>
+ </datafield>
+ <datafield tag="260" ind1=" " ind2=" ">
+ <subfield code="a">Boston, MA :</subfield>
+ <subfield code="b">Springer Science+Business Media, LLC,</subfield>
+ <subfield code="c">2008.</subfield>
+ </datafield>
+</record>
+RAWMARC
+my $exp_xml = '<record><leader>01614nmm a22003975u 4500</leader><controlfield tag="001">978-0-387-35767-6</controlfield><controlfield tag="003">Springer</controlfield><controlfield tag="005">20071022150035.8</controlfield><controlfield tag="007">cr nn 008mamaa</controlfield><controlfield tag="008">071022s2008 xx j eng d</controlfield><datafield tag="020" ind1=" " ind2=" "><subfield code="a">9780387685748</subfield></datafield><datafield tag="100" ind1="1" ind2=" "><subfield code="a">Neteler, Markus.</subfield></datafield><datafield tag="245" ind1="1" ind2="0"><subfield code="a">Open Source GIS</subfield><subfield code="h">[electronic resource] :</subfield><subfield code="b">A GRASS GIS Approach /</subfield><subfield code="c">edited by Markus Neteler, Helena Mitasova.</subfield></datafield><datafield tag="250" ind1=" " ind2=" "><subfield code="a">Third Edition.</subfield></datafield><datafield tag="260" ind1=" " ind2=" "><subfield code="a">Boston, MA :</subf
ield><subfield code="b">Springer Science+Business Media, LLC,</subfield><subfield code="c">2008.</subfield></datafield></record>';
+my $clean_xml = OpenILS::Utils::Normalize::clean_marc($raw_marcxml);
+is($clean_xml, $exp_xml, "clean_marc: header and space normalization");
+
+is(OpenILS::Utils::Normalize::clean_marc('èöçÇÈÀ'), 'èöçÇÈÀ', 'clean_marc: diacritics');
commit d939d7d09f231319a59f7bc309b7e40c451f273e
Author: Jason Stephenson <jstephenson at mvlc.org>
Date: Wed Nov 9 15:34:27 2011 -0500
Add clean_marc function to OpenILS::Utils::Normalize.
Add a library function to clean up MARC records for how we like to
store them in the biblio.record_entry table. Having this in a library
will reduce code duplication.
Also, replace nearly identical code in OpenILS::Application::Vandelay
and OpenILS::Application::Acq::Order with calls to this new function.
Signed-off-by: Jason Stephenson <jstephenson at mvlc.org>
Signed-off-by: Dan Scott <dscott at laurentian.ca>
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Acq/Order.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Acq/Order.pm
index 1b1aff3..9eef738 100644
--- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Acq/Order.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Acq/Order.pm
@@ -181,6 +181,7 @@ use OpenSRF::Utils::JSON;
use OpenSRF::AppSession;
use OpenILS::Utils::Fieldmapper;
use OpenILS::Utils::CStoreEditor q/:funcs/;
+use OpenILS::Utils::Normalize qw/clean_marc/;
use OpenILS::Const qw/:const/;
use OpenSRF::EX q/:try/;
use OpenILS::Application::AppUtils;
@@ -1258,13 +1259,7 @@ sub upload_records {
last unless $r;
try {
- ($xml = $r->as_xml_record()) =~ s/\n//sog;
- $xml =~ s/^<\?xml.+\?\s*>//go;
- $xml =~ s/>\s+</></go;
- $xml =~ s/\p{Cc}//go;
- $xml = $U->entityize($xml);
- $xml =~ s/[\x00-\x1f]//go;
-
+ $xml = clean_marc($r);
} catch Error with {
$err = shift;
$logger->warn("Proccessing XML of record $count in set $key failed with error $err. Skipping this record");
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Application/Vandelay.pm b/Open-ILS/src/perlmods/lib/OpenILS/Application/Vandelay.pm
index b30d652..c4d4332 100644
--- a/Open-ILS/src/perlmods/lib/OpenILS/Application/Vandelay.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Application/Vandelay.pm
@@ -9,6 +9,7 @@ use OpenSRF::Utils::SettingsClient;
use OpenSRF::Utils::Cache;
use OpenILS::Utils::Fieldmapper;
use OpenILS::Utils::CStoreEditor qw/:funcs/;
+use OpenILS::Utils::Normalize qw/clean_marc/;
use MARC::Batch;
use MARC::Record;
use MARC::File::XML ( BinaryEncoding => 'UTF-8' );
@@ -285,12 +286,7 @@ sub process_spool {
$logger->info("processing record $count");
try {
- (my $xml = $r->as_xml_record()) =~ s/\n//sog;
- $xml =~ s/^<\?xml.+\?\s*>//go;
- $xml =~ s/>\s+</></go;
- $xml =~ s/\p{Cc}//go;
- $xml = $U->entityize($xml);
- $xml =~ s/[\x00-\x1f]//go;
+ my $xml = clean_marc($r);
my $qrec;
# Check the leader to ensure we've got something resembling the expected
diff --git a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
index e3e699f..9ddca6e 100644
--- a/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
+++ b/Open-ILS/src/perlmods/lib/OpenILS/Utils/Normalize.pm
@@ -3,9 +3,13 @@ use strict;
use warnings;
use Unicode::Normalize;
use Encode;
+use UNIVERSAL qw/isa/;
+use MARC::Record;
+use MARC::File::XML ( BinaryEncoding => 'UTF-8' );
+use OpenILS::Application::AppUtils;
use Exporter 'import';
-our @EXPORT_OK = qw( naco_normalize search_normalize );
+our @EXPORT_OK = qw( clean_marc naco_normalize search_normalize );
sub naco_normalize {
my $str = decode_utf8(shift);
@@ -97,4 +101,24 @@ sub _normalize_codes {
return lc $str;
}
+# Cleans up a MARC::Record or MARCXML string for storage in the
+# Open-ILS database.
+#
+# Takes either a MARC::Record or a string of MARCXML.
+#
+# Returns a string of MARCXML as Open-ILS likes to store it.
+#
+# Assumes input is already in UTF-8.
+sub clean_marc {
+ my $input = shift;
+ my $xml = (isa $input, 'MARC::Record') ? $input->as_xml_record() : $input;
+ $xml =~ s/\n//sog;
+ $xml =~ s/^<\?xml.+\?\s*>//go;
+ $xml =~ s/>\s+</></go;
+ $xml =~ s/\p{Cc}//go;
+ $xml = OpenILS::Application::AppUtils->entityize($xml);
+ $xml =~ s/[\x00-\x1f]//go;
+ return $xml;
+}
+
1;
-----------------------------------------------------------------------
Summary of changes:
.../perlmods/lib/OpenILS/Application/Acq/Order.pm | 9 +----
.../perlmods/lib/OpenILS/Application/Vandelay.pm | 8 +---
.../src/perlmods/lib/OpenILS/Utils/Normalize.pm | 26 +++++++++++++-
Open-ILS/src/perlmods/t/01-OpenILS-Application.t | 8 ++++-
Open-ILS/src/perlmods/t/14-OpenILS-Utils.t | 39 +++++++++++++++++++-
5 files changed, 74 insertions(+), 16 deletions(-)
hooks/post-receive
--
Evergreen ILS
More information about the open-ils-commits
mailing list