[open-ils-commits] r17714 - in trunk/Open-ILS/src/sql/Pg: . upgrade (dbs)

svn at svn.open-ils.org svn at svn.open-ils.org
Wed Sep 15 18:06:43 EDT 2010


Author: dbs
Date: 2010-09-15 18:06:37 -0400 (Wed, 15 Sep 2010)
New Revision: 17714

Added:
   trunk/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql
Modified:
   trunk/Open-ILS/src/sql/Pg/002.schema.config.sql
   trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql
   trunk/Open-ILS/src/sql/Pg/800.fkeys.sql
Log:
Make authority.normalize_heading() more defensive, and drop back to a plain (non-unique) index

NOTE: Database server now requires UUID::Tiny CPAN module in its Perl repertoire

When faced with terrible input, authority.normalize_heading() will generate a
heading based on the MD5 UUID of the input value, flagged with "BAD_MARCXML"
if the MARCXML could not be parsed, or "NOHEADING" if there was no 1xx field.
Previously, authority.normalize_heading() would throw raw, ugly Perl errors.

Many thanks to Mike Rylander and Galen Charlton for their suggestions on how
to break the original version of authority.normalize_heading(); this code
should be much more robust as a result.


Modified: trunk/Open-ILS/src/sql/Pg/002.schema.config.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/002.schema.config.sql	2010-09-15 20:43:54 UTC (rev 17713)
+++ trunk/Open-ILS/src/sql/Pg/002.schema.config.sql	2010-09-15 22:06:37 UTC (rev 17714)
@@ -68,7 +68,7 @@
     install_date    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 
-INSERT INTO config.upgrade_log (version) VALUES ('0401'); -- dbs
+INSERT INTO config.upgrade_log (version) VALUES ('0402'); -- dbs
 
 CREATE TABLE config.bib_source (
 	id		SERIAL	PRIMARY KEY,

Modified: trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql	2010-09-15 20:43:54 UTC (rev 17713)
+++ trunk/Open-ILS/src/sql/Pg/020.schema.functions.sql	2010-09-15 22:06:37 UTC (rev 17714)
@@ -264,13 +264,28 @@
 CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
     use strict;
     use warnings;
+
+    use utf8;
     use MARC::Record;
     use MARC::File::XML (BinaryEncoding => 'UTF8');
+    use UUID::Tiny ':std';
 
-    my $xml = shift();
-    my $r = MARC::Record->new_from_xml( $xml );
-    return undef unless ($r);
+    my $xml = shift() or return undef;
 
+    my $r;
+
+    # Prevent errors in XML parsing from blowing out ungracefully
+    eval {
+        $r = MARC::Record->new_from_xml( $xml );
+        1;
+    } or do {
+       return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
+    };
+
+    if (!$r) {
+       return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
+    }
+
     # From http://www.loc.gov/standards/sourcelist/subject.html
     my $thes_code_map = {
         a => 'lcsh',
@@ -285,7 +300,11 @@
     };
 
     # Default to "No attempt to code" if the leader is horribly broken
-    my $thes_char = substr($r->field('008')->data(), 11, 1) || '|';
+    my $fixed_field = $r->field('008');
+    my $thes_char = '|';
+    if ($fixed_field) { 
+        $thes_char = substr($fixed_field->data(), 11, 1) || '|';
+    }
 
     my $thes_code = 'UNDEFINED';
 
@@ -296,19 +315,26 @@
         $thes_code = $thes_code_map->{$thes_char};
     }
 
+    my $auth_txt = '';
     my $head = $r->field('1..');
-    my $auth_txt = '';
-    foreach my $sf ($head->subfields()) {
-        $auth_txt .= $sf->[1];
+    if ($head) {
+        # Concatenate all of these subfields together, prefixed by their code
+        # to prevent collisions along the lines of "Fiction, North Carolina"
+        foreach my $sf ($head->subfields()) {
+            $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1];
+        }
     }
-
     
     # Perhaps better to parameterize the spi and pass as a parameter
     $auth_txt =~ s/'//go;
-    my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
-    my $norm_txt = $result->{rows}[0]->{norm_text};
 
-    return $head->tag() . "_" . $thes_code . " " . $norm_txt;
+    if ($auth_txt) {
+        my $result = spi_exec_query("SELECT public.naco_normalize('$auth_txt') AS norm_text");
+        my $norm_txt = $result->{rows}[0]->{norm_text};
+        return $head->tag() . "_" . $thes_code . " " . $norm_txt;
+    }
+
+    return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml);
 $func$ LANGUAGE 'plperlu' IMMUTABLE;
 
 COMMENT ON FUNCTION authority.normalize_heading( TEXT ) IS $$

Modified: trunk/Open-ILS/src/sql/Pg/800.fkeys.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/800.fkeys.sql	2010-09-15 20:43:54 UTC (rev 17713)
+++ trunk/Open-ILS/src/sql/Pg/800.fkeys.sql	2010-09-15 22:06:37 UTC (rev 17714)
@@ -117,6 +117,6 @@
 ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT view_perm_fkey FOREIGN KEY (view_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE ON DELETE RESTRICT DEFERRABLE INITIALLY DEFERRED;
 ALTER TABLE config.org_unit_setting_type ADD CONSTRAINT update_perm_fkey FOREIGN KEY (update_perm) REFERENCES permission.perm_list (id) ON UPDATE CASCADE DEFERRABLE INITIALLY DEFERRED;
 
-CREATE UNIQUE INDEX unique_by_heading_and_thesaurus ON authority.record_entry (authority.normalize_heading(marc)) WHERE deleted IS FALSE or deleted = FALSE;
+CREATE INDEX by_heading_and_thesaurus ON authority.record_entry (authority.normalize_heading(marc)) WHERE deleted IS FALSE or deleted = FALSE;
 
 COMMIT;

Added: trunk/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql	                        (rev 0)
+++ trunk/Open-ILS/src/sql/Pg/upgrade/0402.schema.unique_authority_index_revisited.sql	2010-09-15 22:06:37 UTC (rev 17714)
@@ -0,0 +1,91 @@
+BEGIN;
+
+-- Make the authority heading normalization routine more defensive.
+-- Also drop back to a plain index for 2.0; we will get more restrictive over time.
+
+INSERT INTO config.upgrade_log (version) VALUES ('0402'); -- dbs
+
+CREATE OR REPLACE FUNCTION authority.normalize_heading( TEXT ) RETURNS TEXT AS $func$
+    # Derive a normalized "<tag>_<thesaurus> <heading>" key from authority
+    # MARCXML. Empty input yields NULL; unparseable XML yields BAD_MARCXML_
+    # and a record without 1xx heading text yields NOHEADING_, each ending
+    # in an MD5 UUID of the input so that bad records still get a value.
+    use strict;
+    use warnings;
+
+    use utf8;
+    use MARC::Record;
+    use MARC::File::XML (BinaryEncoding => 'UTF8');
+    use UUID::Tiny ':std';
+
+    my $xml = shift() or return undef;
+
+    # Trap parser exceptions; a die and an empty result are handled alike
+    my $r = eval { MARC::Record->new_from_xml( $xml ) };
+    if (!$r) {
+        return 'BAD_MARCXML_' . create_uuid_as_string(UUID_MD5, $xml);
+    }
+
+    # From http://www.loc.gov/standards/sourcelist/subject.html
+    my $thes_code_map = {
+        a => 'lcsh',
+        b => 'lcshac',
+        c => 'mesh',
+        d => 'nal',
+        k => 'cash',
+        n => 'notapplicable',
+        r => 'aat',
+        s => 'sears',
+        v => 'rvm',
+    };
+
+    # Default to "No attempt to code" if the 008 field is absent or unusable
+    my $fixed_field = $r->field('008');
+    my $thes_char = '|';
+    if ($fixed_field) {
+        $thes_char = substr($fixed_field->data(), 11, 1) || '|';
+    }
+
+    my $thes_code = 'UNDEFINED';
+
+    if ($thes_char eq 'z') {
+        # Grab the 040 $f per http://www.loc.gov/marc/authority/ad040.html
+        $thes_code = $r->subfield('040', 'f') || 'UNDEFINED';
+    } elsif ($thes_code_map->{$thes_char}) {
+        $thes_code = $thes_code_map->{$thes_char};
+    }
+
+    my $auth_txt = '';
+    my $head = $r->field('1..');
+    if ($head) {
+        # Concatenate all of these subfields together, prefixed by their code
+        # to prevent collisions along the lines of "Fiction, North Carolina"
+        foreach my $sf ($head->subfields()) {
+            $auth_txt .= '‡' . $sf->[0] . ' ' . $sf->[1];
+        }
+    }
+
+    # Apostrophes are still dropped so existing normalized values stay stable
+    $auth_txt =~ s/'//g;
+
+    if ($auth_txt) {
+        # Bind the heading as a parameter instead of interpolating it, so a
+        # backslash or other quoting character cannot corrupt the statement
+        my $plan = spi_prepare('SELECT public.naco_normalize($1) AS norm_text', 'TEXT');
+        my $result = spi_exec_prepared($plan, $auth_txt);
+        spi_freeplan($plan);
+        my $norm_txt = $result->{rows}[0]->{norm_text};
+        return $head->tag() . "_" . $thes_code . " " . $norm_txt;
+    }
+
+    return 'NOHEADING_' . $thes_code . ' ' . create_uuid_as_string(UUID_MD5, $xml);
+$func$ LANGUAGE 'plperlu' IMMUTABLE;
+
+-- Tolerate a missing unique index so its absence does not abort the upgrade
+DROP INDEX IF EXISTS authority.unique_by_heading_and_thesaurus;
+-- Plain (non-unique) replacement index over the normalized heading
+CREATE INDEX by_heading_and_thesaurus
+    ON authority.record_entry (authority.normalize_heading(marc))
+    WHERE deleted IS FALSE or deleted = FALSE;
+
+COMMIT;



More information about the open-ils-commits mailing list