[open-ils-commits] r14375 - in trunk/Open-ILS/src/sql/Pg: . upgrade (miker)

Mon Oct 12 13:06:05 EDT 2009

Author: miker
Date: 2009-10-12 13:06:01 -0400 (Mon, 12 Oct 2009)
New Revision: 14375

Added:
   trunk/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql
   trunk/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql
Modified:
   trunk/Open-ILS/src/sql/Pg/002.functions.config.sql
   trunk/Open-ILS/src/sql/Pg/002.schema.config.sql
   trunk/Open-ILS/src/sql/Pg/950.data.seed-values.sql
Log:
This patch moves towards in-database indexed value normalization.  The eventual
goal of this is to move to pure database ingest handled by triggers.  This will
free us from the grip of the Ingest server, speed up ingest altogether, and
cause ingest to occur entirely within the same database transaction as the
INSERT or UPDATE to the MARC that is the cause of the ingest.  This means
no more potential for race conditions on ingest, and simpler data import.

In this first step, we add some normalization routines for dealing with basic
string data.  NACO normalization and the like.  With these functions we can
do everything that the Ingest server can do with regard to munging indexed
strings.

You can register these normalizers with specific indexed fields, and define the
order in which they are to be applied.

Next up: work on the scaffolding to actually apply the functions, define the
IDL entries, and create MARC-handling functions to do the xpath dances.

This functionallity will require either:

 * a custom (I have a patch) pgxml contrib module or
 * Postgresql 8.3+ XML/XPath support




Modified: trunk/Open-ILS/src/sql/Pg/002.functions.config.sql
===================================================================

--- trunk/Open-ILS/src/sql/Pg/002.functions.config.sql	2009-10-12 16:54:39 UTC (rev 14374)
+++ trunk/Open-ILS/src/sql/Pg/002.functions.config.sql	2009-10-12 17:06:01 UTC (rev 14375)
@@ -120,5 +120,9 @@
         SELECT SUBSTRING( $1 FROM $_$^\S+$_$);
 $$ LANGUAGE SQL;
 
+CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
+        SELECT public.naco_normalize($1,'a');
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
 COMMIT;
 

Modified: trunk/Open-ILS/src/sql/Pg/002.schema.config.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/002.schema.config.sql	2009-10-12 16:54:39 UTC (rev 14374)
+++ trunk/Open-ILS/src/sql/Pg/002.schema.config.sql	2009-10-12 17:06:01 UTC (rev 14375)
@@ -51,7 +51,7 @@
     install_date    TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW()
 );
 
-INSERT INTO config.upgrade_log (version) VALUES ('0031'); -- miker
+INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker
 
 CREATE TABLE config.bib_source (
 	id		SERIAL	PRIMARY KEY,
@@ -562,5 +562,38 @@
       ( datatype <> 'link' AND fm_class IS NULL ) )
 );
 
+
+-- Some handy functions, based on existing ones, to provide optional ingest normalization
+
+CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+        SELECT SUBSTRING($1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+        SELECT SUBSTRING($1,1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.split_date_range( TEXT ) RETURNS TEXT AS $func$
+        SELECT REGEXP_REPLACE( $1, E'(\\d{4})-(\\d{4})', E'\\1 \\2', 'g' );
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+-- And ... a table in which to register them
+
+CREATE TABLE config.index_normalizer (
+        id              SERIAL  PRIMARY KEY,
+        name            TEXT    UNIQUE NOT NULL,
+        func            TEXT    NOT NULL,
+        param_count     INT     NOT NULL DEFAULT 0
+);
+
+CREATE TABLE config.metabib_field_index_norm_map (
+        id      SERIAL  PRIMARY KEY,
+        field   INT     NOT NULL REFERENCES config.metabib_field (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+        norm    INT     NOT NULL REFERENCES config.index_normalizer (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+        params  TEXT,
+        pos     INT     NOT NULL DEFAULT 0
+);
+
+
 COMMIT;
 

Modified: trunk/Open-ILS/src/sql/Pg/950.data.seed-values.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/950.data.seed-values.sql	2009-10-12 16:54:39 UTC (rev 14374)
+++ trunk/Open-ILS/src/sql/Pg/950.data.seed-values.sql	2009-10-12 17:06:01 UTC (rev 14375)
@@ -769,6 +769,8 @@
 INSERT INTO config.z3950_attr (id, source, name, label, code, format)
 	VALUES (9, 'loc', 'item_type', oils_i18n_gettext(9, 'Item Type', 'cza', 'label'), 1001, 1);
 
+UPDATE config.z3950_attr SET truncation = 1 WHERE source = 'loc';
+
 INSERT INTO config.z3950_attr (id, source, name, label, code, format)
 	VALUES (10, 'oclc', 'tcn', oils_i18n_gettext(10, 'Title Control Number', 'cza', 'label'), 12, 1);
 INSERT INTO config.z3950_attr (id, source, name, label, code, format)
@@ -2117,3 +2119,85 @@
     'bool'
 );
 
+
+-- in-db indexing normalizers
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'NACO Normalize',
+	'Apply NACO normalization rules to the extracted text.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+	'naco_normalize',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Normalize date range',
+	'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.',
+	'split_date_range',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'NACO Normalize -- retain first comma',
+	'Apply NACO normalization rules to the extracted text, retaining the first comma.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+	'naco_normalize_keep_comma',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Strip Diacritics',
+	'Convert text to NFD form and remove non-spacing combining marks.',
+	'remove_diacritics',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Up-case',
+	'Convert text upper case.',
+	'uppercase',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Down-case',
+	'Convert text lower case.',
+	'lowercase',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Extract Dewey-like number',
+	'Extract a string of numeric characters ther resembles a DDC number.',
+	'call_number_dewey',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Left truncation',
+	'Discard the specified number of characters from the left side of the string.',
+	'left_trunc',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Right truncation',
+	'Include only the specified number of characters from the left side of the string.',
+	'right_trunc',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'First word',
+	'Include only the first space-separated word of a string.',
+	'first_word',
+	0
+);
+
+-- make use of the index normalizers
+
+INSERT INTO config.metabib_field_index_norm_map (field,norm)
+	SELECT	m.id,
+		i.id
+	  FROM	config.metabib_field m,
+		config.index_normalizer i
+	  WHERE	i.func IN ('naco_normalize','split_date_range');
+
+

Added: trunk/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql	                        (rev 0)
+++ trunk/Open-ILS/src/sql/Pg/upgrade/0032.ingest-normalizers.sql	2009-10-12 17:06:01 UTC (rev 14375)
@@ -0,0 +1,41 @@
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0032'); -- miker
+
+-- Some handy functions, based on existing ones, to provide optional ingest normalization
+
+CREATE OR REPLACE FUNCTION public.left_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+	SELECT SUBSTRING($1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.right_trunc( TEXT, INT ) RETURNS TEXT AS $func$
+	SELECT SUBSTRING($1,1,$2);
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.naco_normalize_keep_comma( TEXT ) RETURNS TEXT AS $func$
+	SELECT public.naco_normalize($1,'a');
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION public.split_date_range( TEXT ) RETURNS TEXT AS $func$
+	SELECT REGEXP_REPLACE( $1, E'(\\d{4})-(\\d{4})', E'\\1 \\2', 'g' );
+$func$ LANGUAGE SQL STRICT IMMUTABLE;
+
+-- And ... a table in which to register them
+
+CREATE TABLE config.index_normalizer (
+	id		SERIAL	PRIMARY KEY,
+	name		TEXT	UNIQUE NOT NULL,
+	func		TEXT	NOT NULL,
+	param_count	INT	NOT NULL DEFAULT 0
+);
+
+CREATE TABLE config.metabib_field_index_norm_map (
+	id	SERIAL	PRIMARY KEY,
+	field	INT	NOT NULL REFERENCES config.metabib_field (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+	norm	INT	NOT NULL REFERENCES config.index_normalizer (id) ON DELETE CASCADE DEFERRABLE INITIALLY DEFERRED,
+	params	TEXT,
+	pos	INT	NOT NULL DEFAULT 0
+);
+
+COMMIT;
+

Added: trunk/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql
===================================================================
--- trunk/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql	                        (rev 0)
+++ trunk/Open-ILS/src/sql/Pg/upgrade/0033.data.ingest-normalizers.sql	2009-10-12 17:06:01 UTC (rev 14375)
@@ -0,0 +1,85 @@
+BEGIN;
+
+INSERT INTO config.upgrade_log (version) VALUES ('0033'); -- miker
+
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'NACO Normalize',
+	'Apply NACO normalization rules to the extracted text.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+	'naco_normalize',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Normalize date range',
+	'Split date ranges in the form of "XXXX-YYYY" into "XXXX YYYY" for proper index.',
+	'split_date_range',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'NACO Normalize -- retain first comma',
+	'Apply NACO normalization rules to the extracted text, retaining the first comma.  See http://www.loc.gov/catdir/pcc/naco/normrule-2.html for details.',
+	'naco_normalize_keep_comma',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Strip Diacritics',
+	'Convert text to NFD form and remove non-spacing combining marks.',
+	'remove_diacritics',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Up-case',
+	'Convert text upper case.',
+	'uppercase',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Down-case',
+	'Convert text lower case.',
+	'lowercase',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Extract Dewey-like number',
+	'Extract a string of numeric characters ther resembles a DDC number.',
+	'call_number_dewey',
+	0
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Left truncation',
+	'Discard the specified number of characters from the left side of the string.',
+	'left_trunc',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'Right truncation',
+	'Include only the specified number of characters from the left side of the string.',
+	'right_trunc',
+	1
+);
+
+INSERT INTO config.index_normalizer (name, description, func, param_count) VALUES (
+	'First word',
+	'Include only the first space-separated word of a string.',
+	'first_word',
+	0
+);
+
+
+INSERT INTO config.metabib_field_index_norm_map (field,norm)
+	SELECT	m.id,
+		i.id
+	  FROM	config.metabib_field m,
+		config.index_normalizer i
+	  WHERE	i.func IN ('naco_normalize','split_date_range');
+
+COMMIT;
+