[open-ils-commits] r19205 - in branches/rel_2_0/Open-ILS: src/perlmods/OpenILS/Application/Storage src/sql/Pg src/sql/Pg/upgrade tests (gmc)

svn at svn.open-ils.org svn at svn.open-ils.org
Wed Jan 19 11:07:17 EST 2011


Author: gmc
Date: 2011-01-19 11:07:14 -0500 (Wed, 19 Jan 2011)
New Revision: 19205

Added:
   branches/rel_2_0/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql
   branches/rel_2_0/Open-ILS/tests/naco_normalize.t
Modified:
   branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
   branches/rel_2_0/Open-ILS/src/sql/Pg/002.schema.config.sql
   branches/rel_2_0/Open-ILS/src/sql/Pg/020.schema.functions.sql
   branches/rel_2_0/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql
Log:
backport naco_normalize revisions to rel_2_0

This implements the latest version of the NACO
normalization specification found at

http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf

This version of the algorithm is more general -- for example,
all combining characters are removed -- so there should be
fewer fiddly edge cases to worry about for most European
languages.

Rebuilding the metabib.*_field_entry tables (e.g., by using
reingest-1.6-2.0.pl) is recommended if there are any bibs that contain
any non-ASCII characters.

Normalized text is now left in the NFKD form, so while this should
be transparent to the search system after reindexing, it does mean
that (for example) Korean text in metabib.*_field_entry may not
be in the same Unicode normalization form as that found in
biblio.record_entry.

Also includes fix for bug #684467: more bulletproofing of naco_normalize

Signed-off-by: Galen Charlton <gmc at esilibrary.com>

Modified: branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
===================================================================
--- branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm	2011-01-19 15:59:01 UTC (rev 19204)
+++ branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm	2011-01-19 16:07:14 UTC (rev 19205)
@@ -6,6 +6,7 @@
 use OpenSRF::Utils::Logger qw/:level/;
 use Parse::RecDescent;
 use Unicode::Normalize;
+use Encode;
 
 my $_default_grammar_parser = new Parse::RecDescent ( <<'GRAMMAR' );
 
@@ -27,45 +28,66 @@
 
 GRAMMAR
 
+# FIXME - this is a copy-and-paste of the naco_normalize
+#         stored procedure
 sub naco_normalize {
 
-    my $txt = lc(shift);
+    my $str = decode_utf8(shift);
     my $sf = shift;
 
-    $txt = NFD($txt);
-    $txt =~ s/\pM+//go; # Remove diacritics
+    # Apply NACO normalization to input string; based on
+    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    #
+    # Note that unlike a strict reading of the NACO normalization rules,
+    # output is returned as lowercase instead of uppercase for compatibility
+    # with previous versions of the Evergreen naco_normalize routine.
 
-    $txt =~ s/\xE6/AE/go;   # Convert ae digraph
-    $txt =~ s/\x{153}/OE/go;# Convert oe digraph
-    $txt =~ s/\xFE/TH/go;   # Convert Icelandic thorn
+    # Convert to upper-case first; even though final output will be lowercase, doing this will
+    # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly.
+    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
+    $str = uc $str;
 
-    $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
-    $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
+    # remove non-filing strings
+    $str =~ s/\x{0098}.*?\x{009C}//g;
 
-    $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;     # Convert Latin and Greek
-    $txt =~ tr/\x{2113}\xF0\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LD /; # Convert Misc
-    $txt =~ tr/\'\[\]\|//d;                         # Remove Misc
+    $str = NFKD($str);
 
-    if ($sf && $sf =~ /^a/o) {
-        my $commapos = index($txt,',');
-        if ($commapos > -1) {
-            if ($commapos != length($txt) - 1) {
-                my @list = split /,/, $txt;
-                my $first = shift @list;
-                $txt = $first . ',' . join(' ', @list);
-            } else {
-                $txt =~ s/,/ /go;
-            }
-        }
-    } else {
-        $txt =~ s/,/ /go;
-    }
+    # additional substitutions - 3.6.
+    $str =~ s/\x{00C6}/AE/g;
+    $str =~ s/\x{00DE}/TH/g;
+    $str =~ s/\x{0152}/OE/g;
+    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
 
-    $txt =~ s/\s+/ /go; # Compress multiple spaces
-    $txt =~ s/^\s+//o;  # Remove leading space
-    $txt =~ s/\s+$//o;  # Remove trailing space
+    # transformations based on Unicode category codes
+    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
 
-    return $txt;
+	if ($sf && $sf =~ /^a/o) {
+		my $commapos = index($str, ',');
+		if ($commapos > -1) {
+			if ($commapos != length($str) - 1) {
+                $str =~ s/,/\x07/; # preserve first comma
+			}
+		}
+	}
+
+    # since we've stripped out the control characters, we can now
+    # use a few as placeholders temporarily
+    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
+    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
+    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
+
+    # decimal digits
+    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
+
+    # intentionally skipping step 8 of the NACO algorithm; if the string
+    # gets normalized away, that's fine.
+
+    # leading and trailing spaces
+    $str =~ s/\s+/ /g;
+    $str =~ s/^\s+//;
+    $str =~ s/\s+$//g;
+
+    return lc $str;
 }
 
 #' stupid vim syntax highlighting ...

Modified: branches/rel_2_0/Open-ILS/src/sql/Pg/002.schema.config.sql
===================================================================
--- branches/rel_2_0/Open-ILS/src/sql/Pg/002.schema.config.sql	2011-01-19 15:59:01 UTC (rev 19204)
+++ branches/rel_2_0/Open-ILS/src/sql/Pg/002.schema.config.sql	2011-01-19 16:07:14 UTC (rev 19205)
@@ -70,7 +70,7 @@
     install_date    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 
-INSERT INTO config.upgrade_log (version) VALUES ('0476'); -- dbs
+INSERT INTO config.upgrade_log (version) VALUES ('0478'); -- gmcharlt
 
 CREATE TABLE config.bib_source (
 	id		SERIAL	PRIMARY KEY,

Modified: branches/rel_2_0/Open-ILS/src/sql/Pg/020.schema.functions.sql
===================================================================
--- branches/rel_2_0/Open-ILS/src/sql/Pg/020.schema.functions.sql	2011-01-19 15:59:01 UTC (rev 19204)
+++ branches/rel_2_0/Open-ILS/src/sql/Pg/020.schema.functions.sql	2011-01-19 16:07:14 UTC (rev 19205)
@@ -34,56 +34,67 @@
 $$ LANGUAGE SQL STRICT IMMUTABLE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
-	use Unicode::Normalize;
-	use Encode;
 
-	# When working with Unicode data, the first step is to decode it to
-	# a byte string; after that, lowercasing is safe
-	my $txt = lc(decode_utf8(shift));
-	my $sf = shift;
+    use strict;
+    use Unicode::Normalize;
+    use Encode;
 
-	$txt = NFD($txt);
-	$txt =~ s/\pM+//go;	# Remove diacritics
+    my $str = decode_utf8(shift);
+    my $sf = shift;
 
-	# remove non-combining diacritics
-	# this list of characters follows the NACO normalization spec,
-	# but a looser but more comprehensive version might be
-	# $txt =~ s/\pLm+//go;
-	$txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;
+    # Apply NACO normalization to input string; based on
+    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    #
+    # Note that unlike a strict reading of the NACO normalization rules,
+    # output is returned as lowercase instead of uppercase for compatibility
+    # with previous versions of the Evergreen naco_normalize routine.
 
-	$txt =~ s/\xE6/AE/go;	# Convert ae digraph
-	$txt =~ s/\x{153}/OE/go;# Convert oe digraph
-	$txt =~ s/\xFE/TH/go;	# Convert Icelandic thorn
+    # Convert to upper-case first; even though final output will be lowercase, doing this will
+    # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly.
+    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
+    $str = uc $str;
 
-	$txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
-	$txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
+    # remove non-filing strings
+    $str =~ s/\x{0098}.*?\x{009C}//g;
 
-	$txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;	 	# Convert Latin and Greek
-	$txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /;	# Convert Misc
-	$txt =~ tr/\'\[\]\|//d;							# Remove Misc
+    $str = NFKD($str);
 
+    # additional substitutions - 3.6.
+    $str =~ s/\x{00C6}/AE/g;
+    $str =~ s/\x{00DE}/TH/g;
+    $str =~ s/\x{0152}/OE/g;
+    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
+
+    # transformations based on Unicode category codes
+    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
+
 	if ($sf && $sf =~ /^a/o) {
-		my $commapos = index($txt,',');
+		my $commapos = index($str, ',');
 		if ($commapos > -1) {
-			if ($commapos != length($txt) - 1) {
-				my @list = split /,/, $txt;
-				my $first = shift @list;
-				$txt = $first . ',' . join(' ', @list);
-			} else {
-				$txt =~ s/,/ /go;
+			if ($commapos != length($str) - 1) {
+                $str =~ s/,/\x07/; # preserve first comma
 			}
 		}
-	} else {
-		$txt =~ s/,/ /go;
 	}
 
-	$txt =~ s/\s+/ /go;	# Compress multiple spaces
-	$txt =~ s/^\s+//o;	# Remove leading space
-	$txt =~ s/\s+$//o;	# Remove trailing space
+    # since we've stripped out the control characters, we can now
+    # use a few as placeholders temporarily
+    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
+    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
+    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
 
-	# Encoding the outgoing string is good practice, but not strictly
-	# necessary in this case because we've stripped everything from it
-	return encode_utf8($txt);
+    # decimal digits
+    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
+
+    # intentionally skipping step 8 of the NACO algorithm; if the string
+    # gets normalized away, that's fine.
+
+    # leading and trailing spaces
+    $str =~ s/\s+/ /g;
+    $str =~ s/^\s+//;
+    $str =~ s/\s+$//g;
+
+    return lc $str;
 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT ) RETURNS TEXT AS $func$

Modified: branches/rel_2_0/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql
===================================================================
--- branches/rel_2_0/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql	2011-01-19 15:59:01 UTC (rev 19204)
+++ branches/rel_2_0/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql	2011-01-19 16:07:14 UTC (rev 19205)
@@ -7003,56 +7003,67 @@
 DROP TABLE IF EXISTS config.index_normalizer CASCADE;
 
 CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
+
+    use strict;
     use Unicode::Normalize;
     use Encode;
 
-    # When working with Unicode data, the first step is to decode it to
-    # a byte string; after that, lowercasing is safe
-    my $txt = lc(decode_utf8(shift));
+    my $str = decode_utf8(shift);
     my $sf = shift;
 
-    $txt = NFD($txt);
-    $txt =~ s/\pM+//go; # Remove diacritics
+    # Apply NACO normalization to input string; based on
+    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    #
+    # Note that unlike a strict reading of the NACO normalization rules,
+    # output is returned as lowercase instead of uppercase for compatibility
+    # with previous versions of the Evergreen naco_normalize routine.
 
-    # remove non-combining diacritics
-    # this list of characters follows the NACO normalization spec,
-    # but a looser but more comprehensive version might be
-    # $txt =~ s/\pLm+//go;
-    $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;
+    # Convert to upper-case first; even though final output will be lowercase, doing this will
+    # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly.
+    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
+    $str = uc $str;
 
-    $txt =~ s/\xE6/AE/go;   # Convert ae digraph
-    $txt =~ s/\x{153}/OE/go;# Convert oe digraph
-    $txt =~ s/\xFE/TH/go;   # Convert Icelandic thorn
+    # remove non-filing strings
+    $str =~ s/\x{0098}.*?\x{009C}//g;
 
-    $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
-    $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
+    $str = NFKD($str);
 
-    $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;     # Convert Latin and Greek
-    $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc
-    $txt =~ tr/\'\[\]\|//d;                         # Remove Misc
+    # additional substitutions - 3.6.
+    $str =~ s/\x{00C6}/AE/g;
+    $str =~ s/\x{00DE}/TH/g;
+    $str =~ s/\x{0152}/OE/g;
+    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
 
-    if ($sf && $sf =~ /^a/o) {
-        my $commapos = index($txt,',');
-        if ($commapos > -1) {
-            if ($commapos != length($txt) - 1) {
-                my @list = split /,/, $txt;
-                my $first = shift @list;
-                $txt = $first . ',' . join(' ', @list);
-            } else {
-                $txt =~ s/,/ /go;
-            }
-        }
-    } else {
-        $txt =~ s/,/ /go;
-    }
+    # transformations based on Unicode category codes
+    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
 
-    $txt =~ s/\s+/ /go; # Compress multiple spaces
-    $txt =~ s/^\s+//o;  # Remove leading space
-    $txt =~ s/\s+$//o;  # Remove trailing space
+	if ($sf && $sf =~ /^a/o) {
+		my $commapos = index($str, ',');
+		if ($commapos > -1) {
+			if ($commapos != length($str) - 1) {
+                $str =~ s/,/\x07/; # preserve first comma
+			}
+		}
+	}
 
-    # Encoding the outgoing string is good practice, but not strictly
-    # necessary in this case because we've stripped everything from it
-    return encode_utf8($txt);
+    # since we've stripped out the control characters, we can now
+    # use a few as placeholders temporarily
+    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
+    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
+    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
+
+    # decimal digits
+    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
+
+    # intentionally skipping step 8 of the NACO algorithm; if the string
+    # gets normalized away, that's fine.
+
+    # leading and trailing spaces
+    $str =~ s/\s+/ /g;
+    $str =~ s/^\s+//;
+    $str =~ s/\s+$//g;
+
+    return lc $str;
 $func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
 
 -- Some handy functions, based on existing ones, to provide optional ingest normalization

Added: branches/rel_2_0/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql
===================================================================
--- branches/rel_2_0/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql	                        (rev 0)
+++ branches/rel_2_0/Open-ILS/src/sql/Pg/upgrade/0478.schema.naco_normalize_tweak.sql	2011-01-19 16:07:14 UTC (rev 19205)
@@ -0,0 +1,69 @@
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0478'); -- gmcharlt
+
+CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
+
+    use strict;
+    use Unicode::Normalize;
+    use Encode;
+
+    my $str = decode_utf8(shift);
+    my $sf = shift;
+
+    # Apply NACO normalization to input string; based on
+    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    #
+    # Note that unlike a strict reading of the NACO normalization rules,
+    # output is returned as lowercase instead of uppercase for compatibility
+    # with previous versions of the Evergreen naco_normalize routine.
+
+    # Convert to upper-case first; even though final output will be lowercase, doing this will
+    # ensure that the German eszett (ß) and certain ligatures (ff, fi, ffl, etc.) will be handled correctly.
+    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
+    $str = uc $str;
+
+    # remove non-filing strings
+    $str =~ s/\x{0098}.*?\x{009C}//g;
+
+    $str = NFKD($str);
+
+    # additional substitutions - 3.6.
+    $str =~ s/\x{00C6}/AE/g;
+    $str =~ s/\x{00DE}/TH/g;
+    $str =~ s/\x{0152}/OE/g;
+    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
+
+    # transformations based on Unicode category codes
+    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
+
+	if ($sf && $sf =~ /^a/o) {
+		my $commapos = index($str, ',');
+		if ($commapos > -1) {
+			if ($commapos != length($str) - 1) {
+                $str =~ s/,/\x07/; # preserve first comma
+			}
+		}
+	}
+
+    # since we've stripped out the control characters, we can now
+    # use a few as placeholders temporarily
+    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
+    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
+    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
+
+    # decimal digits
+    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
+
+    # intentionally skipping step 8 of the NACO algorithm; if the string
+    # gets normalized away, that's fine.
+
+    # leading and trailing spaces
+    $str =~ s/\s+/ /g;
+    $str =~ s/^\s+//;
+    $str =~ s/\s+$//g;
+
+    return lc $str;
+$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
+
+COMMIT;

Added: branches/rel_2_0/Open-ILS/tests/naco_normalize.t
===================================================================
--- branches/rel_2_0/Open-ILS/tests/naco_normalize.t	                        (rev 0)
+++ branches/rel_2_0/Open-ILS/tests/naco_normalize.t	2011-01-19 16:07:14 UTC (rev 19205)
@@ -0,0 +1,91 @@
+use strict;
+use warnings;
+use utf8;
+
+use Test::More tests => 50;
+use Unicode::Normalize;
+use DBI;
+
+use OpenILS::Application::Storage::FTS;
+
+# This could be made better in at least one of two ways (or both);
+# 1. put PL/Perl code that doesn't require a database into external
+#    modules so that test frameworks can get at it more easily
+# 2. Build a test harness that knows how to find an Evergreen
+#    database to use for non-destructive testing.  Of course, there
+#    can be a chicken-and-egg problem here; also, a complete test
+#    suite would need to be able to do *destructive* testing, from
+#    which we'd presumably want to protect production databases.
+
+# Database connection parameters
+my $db_driver = 'Pg';
+my $db_host   = 'evergreen';
+my $db_port   = '5432';
+my $db_name   = 'evergreen';
+my $db_user   = 'evergreen';
+my $db_pw     = 'evergreen';
+my $dsn       = "dbi:" . $db_driver . ":dbname=" . $db_name .';host=' . $db_host . ';port=' . $db_port;
+
+binmode STDOUT, ':utf8';
+binmode STDERR, ':utf8';
+
+my @test_cases = (
+    [ 'abc', 'abc', 'regular text' ],
+    [ 'ABC', 'abc', 'regular text' ],
+    [ 'åbçdéñœöîøæÇıÂÅÍÎÏÔÔÒÚÆŒè', 'abcdenoeoioaeciaaiiiooouaeoee', 'European diacritics' ],
+    [ '“‘„«quotes»’”', 'quotes', 'special quotes' ],
+    [ '˜abcœ def', 'def', 'special non-filing characters designation' ],
+    [ 'œabcdef', 'abcdef', 'unpaired start of string' ],
+    [ 'ß', 'ss', 'sharp S (eszett)' ],
+    [ 'flfiff', 'flfiff', 'ligatures' ],
+    [ 'ƠơƯư²IJij', 'oouu2ijij', 'NFKD applied correctly' ],
+    [ 'ÆØÞæðøþĐđıŁłŒœʻʼℓ', 'aeothaedothddilloeoel', 'part 3.6' ],
+    [ 'Ð', 'd', 'uppercase eth (missing from 3.6?)' ],
+    [ 'ıİ', 'ii', 'Turkish I' ],
+    [ '[book\'s cover]', 'books cover', 'square brackets and apostrophe' ],
+    [ '  grue   food ', 'grue food', 'trim spaces' ],
+    # note addition of NFKD() to transform expected output
+    [ '한국어 조선말', NFKD('한국어 조선말'), 'Korean text' ],
+    [ '普通話 / 普通话', '普通話 普通话', 'Chinese text' ],
+    [ 'العربية', 'العربية', 'Arabic text' ],
+    [ 'ქართული ენა', 'ქართული ენა', 'Georgian text' ],
+    [ 'русский язык', 'русскии язык', 'Russian text' ],
+    [ "\r\npa\tper\f", 'paper', 'other whitespace' ],
+    [ '#1: ∃ C++, @ home & abroad', '#1 c++ @ home & abroad', 'other punctuation' ],
+    [ '٠١٢٣٤٥', '012345', 'other decimal digits' ],
+    [ '²³¹', '231', 'superscript numbers' ],
+    [ '♭©®♯', '♭ ♯', 'other symbols' ],
+);
+
+# test copy of naco_normalize in OpenILS::Application::Storage::FTS
+foreach my $case (@test_cases) {
+    is(OpenILS::Application::Storage::FTS::naco_normalize($case->[0]), $case->[1], $case->[2] . ' (FTS.pm)');
+}
+is(OpenILS::Application::Storage::FTS::naco_normalize('Smith, Jane. Poet, painter, and author', 'a'),
+    'smith, jane poet painter and author',
+    'retain first comma (FTS.pm)');
+
+SKIP: {
+    my $dbh = DBI->connect($dsn, $db_user, $db_pw, {AutoCommit => 1, pg_enable_utf8 => 1, PrintError => 0});
+    skip "Failed to connect to database: $DBI::errstr", 25 if (!defined($dbh));
+
+    # test stored procedure
+    my $sth1 = $dbh->prepare_cached('SELECT public.naco_normalize(?)');
+    my $sth2 = $dbh->prepare_cached('SELECT public.naco_normalize(?, ?)');
+    sub naco_normalize_wrapper {
+        my ($str, $sf) = @_;
+        if (defined $sf) {
+            $sth2->execute($str, $sf);
+            return $sth2->fetchrow_array;
+        } else {
+            $sth1->execute($str);
+            return $sth1->fetchrow_array;
+        }
+    }
+
+    foreach my $case (@test_cases) {
+        is(naco_normalize_wrapper($case->[0]), $case->[1], $case->[2] . ' (stored procedure)');
+    }
+    is(naco_normalize_wrapper('Smith, Jane. Poet, painter, and author', 'a'), 'smith, jane poet painter and author',
+        'retain first comma (stored procedure)');
+}



More information about the open-ils-commits mailing list