[open-ils-commits] r15351 - trunk/Open-ILS/src/sql/Pg (dbs)

svn at svn.open-ils.org svn at svn.open-ils.org
Wed Jan 20 18:15:17 EST 2010


Author: dbs
Date: 2010-01-20 18:15:10 -0500 (Wed, 20 Jan 2010)
New Revision: 15351

Modified:
   trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql
   trunk/Open-ILS/src/sql/Pg/030.schema.metabib.sql
Log:
Correct Unicode handling for in-db ingest

Ensure MARC::File::XML treats the content as UTF-8; then follow the
recommended practice of explicitly decoding the UTF-8 byte string to a
character string before operating on it, then encoding it back to UTF-8
when it's returned.

'Québec' now gets naco-normalized to 'quebec' as one would expect. YAY!


Modified: trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql	2010-01-20 21:40:05 UTC (rev 15350)
+++ trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql	2010-01-20 23:15:10 UTC (rev 15351)
@@ -34,13 +34,15 @@
 $$ LANGUAGE SQL STRICT IMMUTABLE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
-    use Unicode::Normalize;
-    use Encode;
+	use Unicode::Normalize;
+	use Encode;
 
-	my $txt = lc(encode_utf8(shift));
+	# When working with Unicode data, the first step is to decode it to
+	# a byte string; after that, lowercasing is safe
+	my $txt = lc(decode_utf8(shift));
 	my $sf = shift;
 
-    $txt = NFD($txt);
+	$txt = NFD($txt);
 	$txt =~ s/\pM+//go;	# Remove diacritics
 
 	$txt =~ s/\xE6/AE/go;	# Convert ae digraph
@@ -73,7 +75,9 @@
 	$txt =~ s/^\s+//o;	# Remove leading space
 	$txt =~ s/\s+$//o;	# Remove trailing space
 
-	return $txt;
+	# Encoding the outgoing string is good practice, but not strictly
+	# necessary in this case because we've stripped everything from it
+	return encode_utf8($txt);
 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$

Modified: trunk/Open-ILS/src/sql/Pg/030.schema.metabib.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/030.schema.metabib.sql	2010-01-20 21:40:05 UTC (rev 15350)
+++ trunk/Open-ILS/src/sql/Pg/030.schema.metabib.sql	2010-01-20 23:15:10 UTC (rev 15351)
@@ -375,7 +375,7 @@
 CREATE OR REPLACE FUNCTION biblio.flatten_marc ( TEXT ) RETURNS SETOF metabib.full_rec AS $func$
 
 use MARC::Record;
-use MARC::File::XML;
+use MARC::File::XML (BinaryEncoding => 'UTF-8');
 
 my $xml = shift;
 my $r = MARC::Record->new_from_xml( $xml );



More information about the open-ils-commits mailing list