[open-ils-commits] r15351 - trunk/Open-ILS/src/sql/Pg (dbs)
svn at svn.open-ils.org
svn at svn.open-ils.org
Wed Jan 20 18:15:17 EST 2010
Author: dbs
Date: 2010-01-20 18:15:10 -0500 (Wed, 20 Jan 2010)
New Revision: 15351
Modified:
trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql
trunk/Open-ILS/src/sql/Pg/030.schema.metabib.sql
Log:
Correct Unicode handling for in-db ingest
Ensure MARC::File::XML treats the content as UTF-8; then follow the
recommended practice of explicitly decoding the UTF-8 byte string to a
Perl character string before operating on it, then encoding it back to
UTF-8 when it's returned.
'Québec' now gets naco-normalized to 'quebec' as one would expect. YAY!
Modified: trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql 2010-01-20 21:40:05 UTC (rev 15350)
+++ trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql 2010-01-20 23:15:10 UTC (rev 15351)
@@ -34,13 +34,15 @@
$$ LANGUAGE SQL STRICT IMMUTABLE;
CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
- use Unicode::Normalize;
- use Encode;
+ use Unicode::Normalize;
+ use Encode;
- my $txt = lc(encode_utf8(shift));
+ # When working with Unicode data, the first step is to decode the UTF-8
+ # bytes to a character string; after that, lowercasing is safe
+ my $txt = lc(decode_utf8(shift));
my $sf = shift;
- $txt = NFD($txt);
+ $txt = NFD($txt);
$txt =~ s/\pM+//go; # Remove diacritics
$txt =~ s/\xE6/AE/go; # Convert ae digraph
@@ -73,7 +75,9 @@
$txt =~ s/^\s+//o; # Remove leading space
$txt =~ s/\s+$//o; # Remove trailing space
- return $txt;
+ # Encoding the outgoing string is good practice, but not strictly
+ # necessary in this case because we've stripped everything from it
+ return encode_utf8($txt);
$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$
Modified: trunk/Open-ILS/src/sql/Pg/030.schema.metabib.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/030.schema.metabib.sql 2010-01-20 21:40:05 UTC (rev 15350)
+++ trunk/Open-ILS/src/sql/Pg/030.schema.metabib.sql 2010-01-20 23:15:10 UTC (rev 15351)
@@ -375,7 +375,7 @@
CREATE OR REPLACE FUNCTION biblio.flatten_marc ( TEXT ) RETURNS SETOF metabib.full_rec AS $func$
use MARC::Record;
-use MARC::File::XML;
+use MARC::File::XML (BinaryEncoding => 'UTF-8');
my $xml = shift;
my $r = MARC::Record->new_from_xml( $xml );
More information about the open-ils-commits
mailing list