[open-ils-commits] r18477 - in branches/rel_2_0/Open-ILS/src/sql/Pg: . upgrade (gmc)

svn at svn.open-ils.org svn at svn.open-ils.org
Tue Oct 26 15:36:40 EDT 2010


Author: gmc
Date: 2010-10-26 15:36:35 -0400 (Tue, 26 Oct 2010)
New Revision: 18477

Added:
   branches/rel_2_0/Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql
Modified:
   branches/rel_2_0/Open-ILS/src/sql/Pg/002.schema.config.sql
   branches/rel_2_0/Open-ILS/src/sql/Pg/020.schema.functions.sql
   branches/rel_2_0/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql
Log:
fix NACO normalization of four letter modifier characters

Signed-off-by: Galen Charlton <gmc at esilibrary.com>


Modified: branches/rel_2_0/Open-ILS/src/sql/Pg/002.schema.config.sql
===================================================================
--- branches/rel_2_0/Open-ILS/src/sql/Pg/002.schema.config.sql	2010-10-26 19:35:38 UTC (rev 18476)
+++ branches/rel_2_0/Open-ILS/src/sql/Pg/002.schema.config.sql	2010-10-26 19:36:35 UTC (rev 18477)
@@ -70,7 +70,7 @@
     install_date    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 
-INSERT INTO config.upgrade_log (version) VALUES ('0445'); -- miker
+INSERT INTO config.upgrade_log (version) VALUES ('0446'); -- gmc
 
 CREATE TABLE config.bib_source (
 	id		SERIAL	PRIMARY KEY,

Modified: branches/rel_2_0/Open-ILS/src/sql/Pg/020.schema.functions.sql
===================================================================
--- branches/rel_2_0/Open-ILS/src/sql/Pg/020.schema.functions.sql	2010-10-26 19:35:38 UTC (rev 18476)
+++ branches/rel_2_0/Open-ILS/src/sql/Pg/020.schema.functions.sql	2010-10-26 19:36:35 UTC (rev 18477)
@@ -45,6 +45,12 @@
 	$txt = NFD($txt);
 	$txt =~ s/\pM+//go;	# Remove diacritics
 
+	# remove non-combining diacritics
+	# this list of characters follows the NACO normalization spec,
+	# but a looser but more comprehensive version might be
+	# $txt =~ s/\pLm+//go;
+	$txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;
+
 	$txt =~ s/\xE6/AE/go;	# Convert ae digraph
 	$txt =~ s/\x{153}/OE/go;# Convert oe digraph
 	$txt =~ s/\xFE/TH/go;	# Convert Icelandic thorn

Modified: branches/rel_2_0/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql
===================================================================
--- branches/rel_2_0/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql	2010-10-26 19:35:38 UTC (rev 18476)
+++ branches/rel_2_0/Open-ILS/src/sql/Pg/1.6.1-2.0-upgrade-db.sql	2010-10-26 19:36:35 UTC (rev 18477)
@@ -6911,8 +6911,14 @@
     my $sf = shift;
 
     $txt = NFD($txt);
-    $txt =~ s/\pM+//go;     # Remove diacritics
+    $txt =~ s/\pM+//go; # Remove diacritics
 
+    # remove non-combining diacritics
+    # this list of characters follows the NACO normalization spec,
+    # but a looser but more comprehensive version might be
+    # $txt =~ s/\pLm+//go;
+    $txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;
+
     $txt =~ s/\xE6/AE/go;   # Convert ae digraph
     $txt =~ s/\x{153}/OE/go;# Convert oe digraph
     $txt =~ s/\xFE/TH/go;   # Convert Icelandic thorn
@@ -6920,8 +6926,8 @@
     $txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
     $txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456889+-/;# Convert subscript numbers
 
-    $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;         # Convert Latin and Greek
-    $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /;     # Convert Misc
+    $txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;     # Convert Latin and Greek
+    $txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /; # Convert Misc
     $txt =~ tr/\'\[\]\|//d;                         # Remove Misc
 
     if ($sf && $sf =~ /^a/o) {
@@ -6939,9 +6945,9 @@
         $txt =~ s/,/ /go;
     }
 
-    $txt =~ s/\s+/ /go;     # Compress multiple spaces
-    $txt =~ s/^\s+//o;      # Remove leading space
-    $txt =~ s/\s+$//o;      # Remove trailing space
+    $txt =~ s/\s+/ /go; # Compress multiple spaces
+    $txt =~ s/^\s+//o;  # Remove leading space
+    $txt =~ s/\s+$//o;  # Remove trailing space
 
     # Encoding the outgoing string is good practice, but not strictly
     # necessary in this case because we've stripped everything from it

Copied: branches/rel_2_0/Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql (from rev 18476, trunk/Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql)
===================================================================
--- branches/rel_2_0/Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql	                        (rev 0)
+++ branches/rel_2_0/Open-ILS/src/sql/Pg/upgrade/0446.schema.naco-normalize-modifiers.sql	2010-10-26 19:36:35 UTC (rev 18477)
@@ -0,0 +1,65 @@
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0446'); -- gmc
+
+CREATE OR REPLACE FUNCTION public.naco_normalize( TEXT, TEXT ) RETURNS TEXT AS $func$
+	# NACO-style normalization of a text value.
+	#   arg 1: text to normalize (decoded below as UTF-8)
+	#   arg 2: subfield code; only a leading 'a' changes the comma handling
+	# Returns the normalized text, UTF-8 encoded.
+	use Unicode::Normalize;
+	use Encode;
+
+	# When working with Unicode data, the first step is to decode it to
+	# a character string; after that, lowercasing is safe
+	my $txt = lc(decode_utf8(shift));
+	my $sf = shift;
+
+	$txt = NFD($txt);
+	$txt =~ s/\pM+//go;	# Remove diacritics
+
+	# remove non-combining diacritics
+	# this list of characters follows the NACO normalization spec,
+	# but a looser but more comprehensive version might be
+	# $txt =~ s/\p{Lm}+//go;
+	$txt =~ tr/\x{02B9}\x{02BA}\x{02BB}\x{02BC}//d;
+
+	$txt =~ s/\xE6/AE/go;	# Convert ae digraph
+	$txt =~ s/\x{153}/OE/go;# Convert oe digraph
+	$txt =~ s/\xFE/TH/go;	# Convert Icelandic thorn
+
+	$txt =~ tr/\x{2070}\x{2071}\x{2072}\x{2073}\x{2074}\x{2075}\x{2076}\x{2077}\x{2078}\x{2079}\x{207A}\x{207B}/0123456789+-/;# Convert superscript numbers
+	$txt =~ tr/\x{2080}\x{2081}\x{2082}\x{2083}\x{2084}\x{2085}\x{2086}\x{2087}\x{2088}\x{2089}\x{208A}\x{208B}/0123456789+-/;# Convert subscript numbers
+
+	$txt =~ tr/\x{0251}\x{03B1}\x{03B2}\x{0262}\x{03B3}/AABGG/;	# Convert Latin and Greek
+	$txt =~ tr/\x{2113}\xF0\x{111}\!\"\(\)\-\{\}\<\>\;\:\.\?\xA1\xBF\/\\\@\*\%\=\xB1\+\xAE\xA9\x{2117}\$\xA3\x{FFE1}\xB0\^\_\~\`/LDD /;	# Convert Misc
+	$txt =~ tr/\'\[\]\|//d;							# Remove Misc
+
+	# Subfield 'a' keeps its first comma (unless that comma is the last
+	# character) and turns later commas into spaces; all other subfields
+	# have every comma turned into a space.
+	if ($sf && $sf =~ /^a/o) {
+		my $commapos = index($txt,',');
+		if ($commapos > -1) {
+			if ($commapos != length($txt) - 1) {
+				my @list = split /,/, $txt;
+				my $first = shift @list;
+				$txt = $first . ',' . join(' ', @list);
+			} else {
+				$txt =~ s/,/ /go;
+			}
+		}
+	} else {
+		$txt =~ s/,/ /go;
+	}
+
+	$txt =~ s/\s+/ /go;	# Compress multiple spaces
+	$txt =~ s/^\s+//o;	# Remove leading space
+	$txt =~ s/\s+$//o;	# Remove trailing space
+
+	# Encoding the outgoing string is good practice, but not strictly
+	# necessary in this case because we've stripped everything from it
+	return encode_utf8($txt);
+$func$ LANGUAGE 'plperlu' STRICT IMMUTABLE;
+
+END;



More information about the open-ils-commits mailing list