[open-ils-commits] r19335 - in branches/rel_2_0/Open-ILS: src/perlmods/OpenILS/Application src/perlmods/OpenILS/Application/Storage src/perlmods/OpenILS/Application/Storage/Publisher src/perlmods/OpenILS/Utils tests (dbs)

Sun Jan 30 00:19:18 EST 2011

Author: dbs
Date: 2011-01-30 00:19:15 -0500 (Sun, 30 Jan 2011)
New Revision: 19335

Added:
   branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm
Modified:
   branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
   branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/Publisher/authority.pm
   branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm
   branches/rel_2_0/Open-ILS/tests/naco_normalize.t
Log:
Correct authority browsing for reals

First, restore the >= enable before and after ranges in 
authority_tag_sf_browse(), after I mistakenly removed it in
r19131; the second storage request for $after does not
stomp on the prior $before results, it simply gets pushed
onto the carefully constructed list of $before results,
ensuring that our target is in the middle of page 0.

Second, we're treating all of the "tag" members in the
method registration as list references now (for the purpose
of searching against 4xx/5xx in the .refs. variants), but 
that was blowing up when we registered just a single tag as 
a string and tried to treat the scalar as a list reference.
I could have checked to see if what we had incoming was a 
reference and dance accordingly, but opted to just define
all single-tag entries as single-element arrays instead. 
Applied the same to startwith.

Finally, in r19331 I had used chop() to ensure that an
exact match for startwith would be returned as element 1 on
page 0, instead of appearing as the last element of page -1.
I had said that the right way to do this would be to naco_normalize()
the value to match the normalized afr.value, and so this is what
I have done. Rather than torturously using O:A:Storage:FTS to get
at the naco_normalize() definition, I moved the function into its
own Utils package and adjusted its usage accordingly through the
affected code. One step closer to single-sourcing the function
in the database, as well?


Modified: branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm
===================================================================

--- branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm	2011-01-30 05:10:11 UTC (rev 19334)
+++ branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/FTS.pm	2011-01-30 05:19:15 UTC (rev 19335)
@@ -5,8 +5,7 @@
 package OpenILS::Application::Storage::FTS;
 use OpenSRF::Utils::Logger qw/:level/;
 use Parse::RecDescent;
-use Unicode::Normalize;
-use Encode;
+use OpenILS::Utils::Normalize qw( naco_normalize );
 
 my $_default_grammar_parser = new Parse::RecDescent ( <<'GRAMMAR' );
 
@@ -28,70 +27,6 @@
 
 GRAMMAR
 
-# FIXME - this is a copy-and-paste of the naco_normalize
-#         stored procedure
-sub naco_normalize {
-
-    my $str = decode_utf8(shift);
-    my $sf = shift;
-
-    # Apply NACO normalization to input string; based on
-    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
-    #
-    # Note that unlike a strict reading of the NACO normalization rules,
-    # output is returned as lowercase instead of uppercase for compatibility
-    # with previous versions of the Evergreen naco_normalize routine.
-
-    # Convert to upper-case first; even though final output will be lowercase, doing this will
-    # ensure that the German eszett (ß) and certain ligatures (ﬀ, ﬁ, ﬄ, etc.) will be handled correctly.
-    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
-    $str = uc $str;
-
-    # remove non-filing strings
-    $str =~ s/\x{0098}.*?\x{009C}//g;
-
-    $str = NFKD($str);
-
-    # additional substitutions - 3.6.
-    $str =~ s/\x{00C6}/AE/g;
-    $str =~ s/\x{00DE}/TH/g;
-    $str =~ s/\x{0152}/OE/g;
-    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
-
-    # transformations based on Unicode category codes
-    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
-
-	if ($sf && $sf =~ /^a/o) {
-		my $commapos = index($str, ',');
-		if ($commapos > -1) {
-			if ($commapos != length($str) - 1) {
-                $str =~ s/,/\x07/; # preserve first comma
-			}
-		}
-	}
-
-    # since we've stripped out the control characters, we can now
-    # use a few as placeholders temporarily
-    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
-    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
-    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
-
-    # decimal digits
-    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
-
-    # intentionally skipping step 8 of the NACO algorithm; if the string
-    # gets normalized away, that's fine.
-
-    # leading and trailing spaces
-    $str =~ s/\s+/ /g;
-    $str =~ s/^\s+//;
-    $str =~ s/\s+$//g;
-
-    return lc $str;
-}
-
-#' stupid vim syntax highlighting ...
-
 sub compile {
 
 	$log->debug("You must override me somewhere, or I will make searching really slow!!!!",ERROR);;

Modified: branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/Publisher/authority.pm
===================================================================
--- branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/Publisher/authority.pm	2011-01-30 05:10:11 UTC (rev 19334)
+++ branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/Storage/Publisher/authority.pm	2011-01-30 05:19:15 UTC (rev 19335)
@@ -4,6 +4,7 @@
 use OpenSRF::EX qw/:try/;
 use OpenILS::Application::Storage::FTS;
 use OpenILS::Utils::Fieldmapper;
+use OpenILS::Utils::Normalize qw( naco_normalize );
 use OpenSRF::Utils::Logger qw/:level/;
 use OpenSRF::Utils::Cache;
 use Data::Dumper;
@@ -33,7 +34,7 @@
 	for my $t ( @tags ) {
 		for my $search ( @searches ) {
 			my $sf = $$search{subfield};
-			my $term = OpenILS::Application::Storage::FTS::naco_normalize($$search{term}, $sf);
+			my $term = naco_normalize($$search{term}, $sf);
 
 			$tag = [$tag] if (!ref($tag));
 

Modified: branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm
===================================================================
--- branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm	2011-01-30 05:10:11 UTC (rev 19334)
+++ branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Application/SuperCat.pm	2011-01-30 05:19:15 UTC (rev 19335)
@@ -14,6 +14,7 @@
 
 use strict;
 use warnings;
+use OpenILS::Utils::Normalize qw( naco_normalize );
 
 # All OpenSRF applications must be based on OpenSRF::Application or
 # a subclass thereof.  Makes sense, eh?
@@ -863,7 +864,7 @@
 __PACKAGE__->register_method(
 	method    => 'general_authority_browse',
 	api_name  => 'open-ils.supercat.authority.title.browse',
-	tag       => '130', subfield => 'a',
+	tag       => ['130'], subfield => 'a',
 	api_level => 1,
 	argc      => 1,
 	signature =>
@@ -908,7 +909,7 @@
 __PACKAGE__->register_method(
 	method    => 'general_authority_browse',
 	api_name  => 'open-ils.supercat.authority.topic.browse',
-	tag       => '150', subfield => 'a',
+	tag       => ['150'], subfield => 'a',
 	api_level => 1,
 	argc      => 1,
 	signature =>
@@ -923,7 +924,7 @@
 __PACKAGE__->register_method(
 	method    => 'general_authority_browse',
 	api_name  => 'open-ils.supercat.authority.title.refs.browse',
-	tag       => '130', subfield => 'a',
+	tag       => ['130'], subfield => 'a',
 	api_level => 1,
 	argc      => 1,
 	signature =>
@@ -968,7 +969,7 @@
 __PACKAGE__->register_method(
 	method    => 'general_authority_browse',
 	api_name  => 'open-ils.supercat.authority.topic.refs.browse',
-	tag       => '150', subfield => 'a',
+	tag       => ['150'], subfield => 'a',
 	api_level => 1,
 	argc      => 1,
 	signature =>
@@ -991,6 +992,9 @@
     my $page_size = shift || 9;
     my $page = shift || 0;
 
+    # Match authority.full_rec normalization
+    $value = naco_normalize($value, $subfield);
+
     my ($before_limit,$after_limit) = (0,0);
     my ($before_offset,$after_offset) = (0,0);
 
@@ -1016,13 +1020,13 @@
     }
     my @list = ();
 
-    if ($page < 0) {
+    if ($page <= 0) {
         my $before = $_storage->request(
             "open-ils.cstore.json_query.atomic",
             { select    => { afr => [qw/record value/] },
               from      => { 'are', 'afr' },
               where     => {
-                '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '<' => lc($value) } },
+                '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '<' => $value } },
                 '+are' => { 'deleted' => 'f' }
               },
               order_by  => { afr => { value => 'desc' } },
@@ -1039,7 +1043,7 @@
             { select    => { afr => [qw/record value/] },
               from      => { 'are', 'afr' },
               where     => {
-                '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '>=' => lc($value) } },
+                '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '>=' => $value } },
                 '+are' => { 'deleted' => 'f' }
               },
               order_by  => { afr => { value => 'asc' } },
@@ -1374,7 +1378,7 @@
 __PACKAGE__->register_method(
 	method    => 'general_authority_startwith',
 	api_name  => 'open-ils.supercat.authority.title.startwith',
-	tag       => '130', subfield => 'a',
+	tag       => ['130'], subfield => 'a',
 	api_level => 1,
 	argc      => 1,
 	signature =>
@@ -1419,7 +1423,7 @@
 __PACKAGE__->register_method(
 	method    => 'general_authority_startwith',
 	api_name  => 'open-ils.supercat.authority.topic.startwith',
-	tag       => '150', subfield => 'a',
+	tag       => ['150'], subfield => 'a',
 	api_level => 1,
 	argc      => 1,
 	signature =>
@@ -1434,7 +1438,7 @@
 __PACKAGE__->register_method(
 	method    => 'general_authority_startwith',
 	api_name  => 'open-ils.supercat.authority.title.refs.startwith',
-	tag       => '130', subfield => 'a',
+	tag       => ['130'], subfield => 'a',
 	api_level => 1,
 	argc      => 1,
 	signature =>
@@ -1479,7 +1483,7 @@
 __PACKAGE__->register_method(
 	method    => 'general_authority_startwith',
 	api_name  => 'open-ils.supercat.authority.topic.refs.startwith',
-	tag       => '150', subfield => 'a',
+	tag       => ['150'], subfield => 'a',
 	api_level => 1,
 	argc      => 1,
 	signature =>
@@ -1498,10 +1502,14 @@
 
     my $tag = shift;
     my $subfield = shift;
+
     my $value = shift;
     my $limit = shift || 10;
     my $page = shift || 0;
 
+    # Match authority.full_rec normalization
+    $value = naco_normalize($value, $subfield);
+
     my $ref_limit = $limit;
     my $offset = $limit * abs($page);
     my $_storage = OpenSRF::AppSession->create( 'open-ils.cstore' );
@@ -1527,7 +1535,7 @@
             { select    => { afr => [qw/record value/] },
               from      => { 'afr', 'are' },
               where     => {
-                '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '<' => lc($value) } },
+                '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '<' => $value } },
                 '+are' => { deleted => 'f' }
               },
               order_by  => { afr => { value => 'desc' } },
@@ -1544,7 +1552,7 @@
             { select    => { afr => [qw/record value/] },
               from      => { 'afr', 'are' },
               where     => {
-                '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '>=' => lc($value) } },
+                '+afr' => { tag => \@ref_tags, subfield => $subfield, value => { '>=' => $value } },
                 '+are' => { deleted => 'f' }
               },
               order_by  => { afr => { value => 'asc' } },
@@ -3387,4 +3395,4 @@
 
 
 1;
-# vim: noet:ts=4:sw=4
+# vim: et:ts=4:sw=4

Copied: branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm (from rev 19332, trunk/Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm)
===================================================================
--- branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm	                        (rev 0)
+++ branches/rel_2_0/Open-ILS/src/perlmods/OpenILS/Utils/Normalize.pm	2011-01-30 05:19:15 UTC (rev 19335)
@@ -0,0 +1,70 @@
+package OpenILS::Utils::Normalize;
+use strict;
+use warnings;
+use Unicode::Normalize;
+use Encode;
+
+use Exporter 'import';
+our @EXPORT_OK = qw( naco_normalize );
+
+sub naco_normalize {
+
+    my $str = decode_utf8(shift);
+    my $sf = shift;
+
+    # Apply NACO normalization to input string; based on
+    # http://www.loc.gov/catdir/pcc/naco/SCA_PccNormalization_Final_revised.pdf
+    #
+    # Note that unlike a strict reading of the NACO normalization rules,
+    # output is returned as lowercase instead of uppercase for compatibility
+    # with previous versions of the Evergreen naco_normalize routine.
+
+    # Convert to upper-case first; even though final output will be lowercase, doing this will
+    # ensure that the German eszett (ß) and certain ligatures (ﬀ, ﬁ, ﬄ, etc.) will be handled correctly.
+    # If there are any bugs in Perl's implementation of upcasing, they will be passed through here.
+    $str = uc $str;
+
+    # remove non-filing strings
+    $str =~ s/\x{0098}.*?\x{009C}//g;
+
+    $str = NFKD($str);
+
+    # additional substitutions - 3.6.
+    $str =~ s/\x{00C6}/AE/g;
+    $str =~ s/\x{00DE}/TH/g;
+    $str =~ s/\x{0152}/OE/g;
+    $str =~ tr/\x{0110}\x{00D0}\x{00D8}\x{0141}\x{2113}\x{02BB}\x{02BC}]['/DDOLl/d;
+
+    # transformations based on Unicode category codes
+    $str =~ s/[\p{Cc}\p{Cf}\p{Co}\p{Cs}\p{Lm}\p{Mc}\p{Me}\p{Mn}]//g;
+
+	if ($sf && $sf =~ /^a/o) {
+		my $commapos = index($str, ',');
+		if ($commapos > -1) {
+			if ($commapos != length($str) - 1) {
+                $str =~ s/,/\x07/; # preserve first comma
+			}
+		}
+	}
+
+    # since we've stripped out the control characters, we can now
+    # use a few as placeholders temporarily
+    $str =~ tr/+&@\x{266D}\x{266F}#/\x01\x02\x03\x04\x05\x06/;
+    $str =~ s/[\p{Pc}\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Po}\p{Ps}\p{Sk}\p{Sm}\p{So}\p{Zl}\p{Zp}\p{Zs}]/ /g;
+    $str =~ tr/\x01\x02\x03\x04\x05\x06\x07/+&@\x{266D}\x{266F}#,/;
+
+    # decimal digits
+    $str =~ tr/\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{07C0}-\x{07C9}\x{0966}-\x{096F}\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}\x{0BE6}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}\x{1040}-\x{1049}\x{1090}-\x{1099}\x{17E0}-\x{17E9}\x{1810}-\x{1819}\x{1946}-\x{194F}\x{19D0}-\x{19D9}\x{1A80}-\x{1A89}\x{1A90}-\x{1A99}\x{1B50}-\x{1B59}\x{1BB0}-\x{1BB9}\x{1C40}-\x{1C49}\x{1C50}-\x{1C59}\x{A620}-\x{A629}\x{A8D0}-\x{A8D9}\x{A900}-\x{A909}\x{A9D0}-\x{A9D9}\x{AA50}-\x{AA59}\x{ABF0}-\x{ABF9}\x{FF10}-\x{FF19}/0-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-90-9/;
+
+    # intentionally skipping step 8 of the NACO algorithm; if the string
+    # gets normalized away, that's fine.
+
+    # leading and trailing spaces
+    $str =~ s/\s+/ /g;
+    $str =~ s/^\s+//;
+    $str =~ s/\s+$//g;
+
+    return lc $str;
+}
+
+1;

Modified: branches/rel_2_0/Open-ILS/tests/naco_normalize.t
===================================================================
--- branches/rel_2_0/Open-ILS/tests/naco_normalize.t	2011-01-30 05:10:11 UTC (rev 19334)
+++ branches/rel_2_0/Open-ILS/tests/naco_normalize.t	2011-01-30 05:19:15 UTC (rev 19335)
@@ -6,7 +6,7 @@
 use Unicode::Normalize;
 use DBI;
 
-use OpenILS::Application::Storage::FTS;
+use OpenILS::Utils::Normalize qw( naco_normalize );
 
 # This could be made better in at least one of two ways (or both);
 # 1. put PL/Perl code that doesn't require a database into external
@@ -57,13 +57,13 @@
     [ '♭©®♯', '♭ ♯', 'other symbols' ],
 );
 
-# test copy of naco_normalize in OpenILS::Application::Storage::FTS
+# test copy of naco_normalize in OpenILS::Utils::Normalize
 foreach my $case (@test_cases) {
-    is(OpenILS::Application::Storage::FTS::naco_normalize($case->[0]), $case->[1], $case->[2] . ' (FTS.pm)');
+    is(naco_normalize($case->[0]), $case->[1], $case->[2] . ' (Normalize.pm)');
 }
-is(OpenILS::Application::Storage::FTS::naco_normalize('Smith, Jane. Poet, painter, and author', 'a'),
+is(naco_normalize('Smith, Jane. Poet, painter, and author', 'a'),
     'smith, jane poet painter and author',
-    'retain first comma (FTS.pm)');
+    'retain first comma (Normalize.pm)');
 
 SKIP: {
     my $dbh = DBI->connect($dsn, $db_user, $db_pw, {AutoCommit => 1, pg_enable_utf8 => 1, PrintError => 0});