[open-ils-commits] r1295 - conifer/branches/rel_1_6_1/tools/ebooks (dbs)

svn at svn.open-ils.org svn at svn.open-ils.org
Thu Mar 31 23:09:36 EDT 2011


Author: dbs
Date: 2011-03-31 23:09:34 -0400 (Thu, 31 Mar 2011)
New Revision: 1295

Added:
   conifer/branches/rel_1_6_1/tools/ebooks/ebook_reports.pl
   conifer/branches/rel_1_6_1/tools/ebooks/ebooks.sql
   conifer/branches/rel_1_6_1/tools/ebooks/map_isbns_to_urls.py
Log:
Commit current ebook processing scripts

These scripts are helping me attempt to make sense of the past
three years of our electronic book loading, which happened
consortially and non-consortially, and with and without a working
ingest for multiple located URIs per record.


Added: conifer/branches/rel_1_6_1/tools/ebooks/ebook_reports.pl
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/ebook_reports.pl	                        (rev 0)
+++ conifer/branches/rel_1_6_1/tools/ebooks/ebook_reports.pl	2011-04-01 03:09:34 UTC (rev 1295)
@@ -0,0 +1,92 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use DBI;
+use Spreadsheet::WriteExcel;
+
+my %library = (
+    id => 109,
+    name => 'Windsor'
+);
+
+my @ebook_records = qw/
+    cambridge-2009-12-01.mrc
+    cambridge-2010-04-12.mrc
+    cambridge-2010-08-18.mrc
+    cambridge-2010-09-30_137.mrc
+    duke-2010-08-24_92.mrc
+    duke-2011-02-02_10.mrc
+    duke-2011-02-14_15.mrc
+    gibson-chrc-2010-08-06.mrc
+    gibson-chrc-2010-08-20.mrc
+    gibson-chrc-2010-09-17.mrc
+    gibson-chrc-2010-10-21_66.mrc
+    gibson-chrc-2010-12-02_127.mrc
+    gibson_chrc-2011-02-23_212.mrc
+    oxford-2010-03-04.mrc
+    oxford-2010-04-27.mrc
+    oxford-2010-09-30_156.mrc
+    oxford-2010-10-28_49.mrc
+    oxford_2010-03-04.mrc
+    springer-2009-12-01.mrc
+    springer-2010-02-11.mrc
+    springer-2010-04-28_1218.mrc
+    springer-2010-06-27_165.mrc
+    springer-2011-02-17_1751.mrc
+/;
+
+# Connect to the conifer database. Report the driver's own error text so a
+# failed connection can actually be diagnosed.
+my $dbh = DBI->connect("dbi:Pg:dbname=conifer;host=polaris.cs.uoguelph.ca", "evergreen", "")
+    or die "Can't connect to database: " . $DBI::errstr . "\n";
+
+# new() returns undef when the output file cannot be created; stop here
+# rather than failing later on an undefined $workbook.
+my $workbook = Spreadsheet::WriteExcel->new("/openils/var/web/ebooks/" . $library{"name"} . ".xls");
+die "Can't create workbook for " . $library{"name"} . ": $!\n" unless defined $workbook;
+
+# The first sheet is a cover page naming the library being analysed.
+my $worksheet = $workbook->add_worksheet("Cover Sheet");
+
+# Shared bold format, also used by add_analysis() for header cells.
+my $bold = $workbook->add_format();
+$bold->set_bold();
+
+$worksheet->write(5, 0, "Ebook analysis:", $bold);
+$worksheet->write(5, 1, $library{"name"});
+
+# One worksheet per MARC batch file.
+foreach my $marc (@ebook_records) {
+    add_analysis($marc, $library{"id"});
+}
+
+$dbh->disconnect();
+$workbook->close();
+exit;
+
+# Add one worksheet to the workbook listing the rows returned by
+# scratchpad.ebook_missing_record_matches() for one collection and library.
+#
+# Arguments:
+#   $collection - collection name (an entry of @ebook_records); also used
+#                 as the worksheet name
+#   $library    - numeric library ID, passed as the function's first argument
+#
+# Relies on the file-level $workbook, $dbh and $bold.
+# Dies if the query cannot be prepared or executed.
+sub add_analysis {
+    my ($collection, $library) = @_;
+
+    # Format as a string. Doesn't change to a number when edited
+    my $format_num = $workbook->add_format(num_format => '@');
+
+    # Lexical handle, so the file-level $worksheet is left untouched
+    my $sheet = $workbook->add_worksheet("$collection");
+
+    my $col = 0;
+    foreach my $head ("Collection", "Ebook ID", "Record ID", "ISBN", "System Control", "Author", "Title") {
+        $sheet->write(0, $col, $head, $bold);
+        $col++;
+    }
+
+    my $sth = $dbh->prepare("SELECT collection, id, record, isbn, sysctl, author, title
+        FROM scratchpad.ebook_missing_record_matches(?, ?)")
+        or die "Can't prepare query: " . $dbh->errstr . "\n";
+
+    # Fail loudly: an unchecked execute() error would otherwise produce an
+    # empty sheet that looks like "no matches".
+    $sth->execute($library, $collection)
+        or die "Can't run query for $collection: " . $sth->errstr . "\n";
+
+    my $row = 1;
+    # Named $match rather than $a, which is reserved for sort blocks
+    while (my $match = $sth->fetchrow_hashref()) {
+        $col = 0;
+        $sheet->write_string($row, $col++, $match->{collection}, $bold);
+        $sheet->write($row, $col++, $match->{id});
+        $sheet->write($row, $col++, $match->{record});
+        $sheet->write_string($row, $col++, $match->{isbn}, $format_num);
+        $sheet->write_string($row, $col++, $match->{sysctl});
+        $sheet->write_string($row, $col++, $match->{author});
+        $sheet->write_string($row, $col++, $match->{title});
+
+        $row++;
+    }
+
+    return;
+}
+

Added: conifer/branches/rel_1_6_1/tools/ebooks/ebooks.sql
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/ebooks.sql	                        (rev 0)
+++ conifer/branches/rel_1_6_1/tools/ebooks/ebooks.sql	2011-04-01 03:09:34 UTC (rev 1295)
@@ -0,0 +1,156 @@
+TRUNCATE scratchpad.ebook_collections_to_records;
+
+TRUNCATE scratchpad.ebook_links_by_institution;
+
+-- Find any bib records that match on isbn
+INSERT INTO scratchpad.ebook_collections_to_records (collection, id, record) SELECT DISTINCT situ.collection, situ.id, rmsr.id AS record FROM scratchpad.ids_to_urls situ INNER JOIN reporter.materialized_simple_record rmsr ON ARRAY[situ.isbn] <@ rmsr.isbn WHERE situ.isbn IS NOT NULL AND (collection, situ.id, rmsr.id) NOT IN (SELECT collection, id, record FROM scratchpad.ebook_collections_to_records);
+
+-- Find any bib records that match on isbn, round 2, because reporter.materialized_simple_record needs updating
+INSERT INTO scratchpad.ebook_collections_to_records (collection, id, record) SELECT DISTINCT collection, situ.id, record FROM metabib.full_rec mfr INNER JOIN scratchpad.ids_to_urls situ ON situ.isbn = mfr.value AND mfr.tag = '020' AND mfr.subfield = 'a' WHERE (collection, situ.id, record) NOT IN (SELECT collection, id, record FROM scratchpad.ebook_collections_to_records) AND situ.isbn IS NOT NULL;
+
+-- Find any bib records that match on system control number
+INSERT INTO scratchpad.ebook_collections_to_records (collection, id, record) SELECT DISTINCT collection, situ.id, record FROM metabib.full_rec mfr INNER JOIN scratchpad.ids_to_urls situ ON LOWER(situ.sysctl) = mfr.value AND mfr.tag = '035' AND mfr.subfield = 'a' WHERE (collection, situ.id, record) NOT IN (SELECT collection, id, record FROM scratchpad.ebook_collections_to_records) AND situ.sysctl IS NOT NULL;
+
+-- Set the canonical number of unique records per batch
+INSERT INTO scratchpad.ebook_links_by_institution (library, cnt, collection) SELECT 'ALL', COUNT(*), x.collection FROM (SELECT DISTINCT id, collection FROM scratchpad.ids_to_urls) AS x GROUP BY x.collection ORDER BY x.collection;
+
+-- Table that maps which ebook record each library has for a given collection record
+CREATE TABLE scratchpad.ebook_record_by_library (library INTEGER, record BIGINT, id INTEGER, collection TEXT);
+
+-- Populate the table
+INSERT INTO scratchpad.ebook_record_by_library (library, record, id, collection) SELECT 109, record, id, seb.collection FROM scratchpad.ebook_collections_to_records seb WHERE seb.record IN (SELECT record FROM asset.call_number acn WHERE acn.deleted IS FALSE AND acn.owning_lib = 109) ORDER BY collection, id;
+
+INSERT INTO scratchpad.ebook_record_by_library (library, record, id, collection) SELECT 103, record, id, seb.collection FROM scratchpad.ebook_collections_to_records seb WHERE seb.record IN (SELECT record FROM asset.call_number acn WHERE acn.deleted IS FALSE AND acn.owning_lib = 103) ORDER BY collection, id;
+
+INSERT INTO scratchpad.ebook_record_by_library (library, record, id, collection) SELECT 124, record, id, seb.collection FROM scratchpad.ebook_collections_to_records seb WHERE seb.record IN (SELECT record FROM asset.call_number acn WHERE acn.deleted IS FALSE AND acn.owning_lib = 124) ORDER BY collection, id;
+
+-- Row type returned by scratchpad.ebook_missing_records()
+CREATE TYPE scratchpad.ebook AS (id INTEGER, isbn TEXT, sysctl TEXT, author TEXT, title TEXT, url TEXT);
+
+-- List the ebooks of one collection that a library does not have.
+--   $1 = library ID (compared with scratchpad.ebook_record_by_library.library)
+--   $2 = collection name (compared with scratchpad.ids_to_urls.collection)
+-- Returns the distinct ids_to_urls rows of the collection whose id has no
+-- entry in ebook_record_by_library for that library and collection,
+-- ordered by id.
+CREATE OR REPLACE FUNCTION scratchpad.ebook_missing_records (IN INTEGER, IN TEXT)
+    RETURNS SETOF scratchpad.ebook AS 
+$$
+    SELECT DISTINCT id, isbn, sysctl, author, title, url
+        FROM scratchpad.ids_to_urls
+        WHERE collection = $2
+            AND id NOT IN (
+            SELECT id
+                FROM scratchpad.ebook_record_by_library
+                WHERE collection = $2
+                AND library = $1
+            )
+        ORDER BY id;
+$$ LANGUAGE SQL;
+
+-- Create a function to tell us which records a particular library has for a particular collection
+--   $1 = library ID (compared with asset.call_number.owning_lib)
+--   $2 = collection name (compared with scratchpad.ids_to_urls.collection)
+-- Returns distinct (id, isbn, sysctl, author, title) rows for ebooks of the
+-- collection that map, through scratchpad.ebook_collections_to_records, to a
+-- bib record on which the library owns a non-deleted call number.
+CREATE OR REPLACE FUNCTION scratchpad.ebook_has_records (IN INTEGER, IN TEXT, OUT INTEGER, OUT TEXT, OUT TEXT, OUT TEXT, OUT TEXT)
+    RETURNS SETOF record AS 
+$$
+    SELECT DISTINCT situ.id, isbn, sysctl, author, title
+        FROM scratchpad.ids_to_urls situ 
+            INNER JOIN scratchpad.ebook_collections_to_records seb
+                ON situ.id = seb.id AND situ.collection = seb.collection
+            INNER JOIN asset.call_number acn
+                ON acn.record = seb.record
+--            INNER JOIN asset.uri_call_number_map auricnm
+--                ON auricnm.call_number = acn.id
+        WHERE situ.collection = $2 AND acn.deleted IS FALSE AND acn.owning_lib = $1
+        ORDER BY id;
+$$ LANGUAGE SQL;
+
+-- Should have scripted this but went with the power of Vim macros
+-- OWA missing records
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'cambridge-2009-12-01.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'cambridge-2009-12-01.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'cambridge-2010-04-12.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'cambridge-2010-04-12.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'cambridge-2010-08-18.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'cambridge-2010-08-18.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'cambridge-2010-09-30_137.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'cambridge-2010-09-30_137.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'duke-2010-08-24_92.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'duke-2010-08-24_92.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'duke-2011-02-02_10.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'duke-2011-02-02_10.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'duke-2011-02-14_15.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'duke-2011-02-14_15.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-08-06.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-08-06.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-08-20.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-08-20.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-09-17.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-09-17.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-10-21_66.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-10-21_66.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson-chrc-2010-12-02_127.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson-chrc-2010-12-02_127.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'gibson_chrc-2011-02-23_212.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'gibson_chrc-2011-02-23_212.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'oxford-2010-03-04.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford-2010-03-04.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'oxford-2010-04-27.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford-2010-04-27.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'oxford-2010-09-30_156.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford-2010-09-30_156.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'oxford-2010-10-28_49.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford-2010-10-28_49.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'oxford_2010-03-04.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'oxford_2010-03-04.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2009-12-01.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'springer-2009-12-01.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2010-02-11.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'springer-2010-02-11.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2010-04-28_1218.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'springer-2010-04-28_1218.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2010-06-27_165.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'springer-2010-06-27_165.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OWA', 'springer-2011-02-17_1751.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(109, 'springer-2011-02-17_1751.mrc')) AS foo;
+
+-- Laurentian (OSUL) missing records
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'cambridge-2009-12-01.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'cambridge-2009-12-01.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'cambridge-2010-04-12.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'cambridge-2010-04-12.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'cambridge-2010-08-18.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'cambridge-2010-08-18.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'cambridge-2010-09-30_137.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'cambridge-2010-09-30_137.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'duke-2010-08-24_92.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'duke-2010-08-24_92.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'duke-2011-02-02_10.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'duke-2011-02-02_10.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'duke-2011-02-14_15.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'duke-2011-02-14_15.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-08-06.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-08-06.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-08-20.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-08-20.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-09-17.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-09-17.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-10-21_66.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-10-21_66.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson-chrc-2010-12-02_127.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson-chrc-2010-12-02_127.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'gibson_chrc-2011-02-23_212.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'gibson_chrc-2011-02-23_212.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'oxford-2010-03-04.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'oxford-2010-03-04.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'oxford-2010-04-27.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'oxford-2010-04-27.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'oxford-2010-09-30_156.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'oxford-2010-09-30_156.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'oxford-2010-10-28_49.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'oxford-2010-10-28_49.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2009-12-01.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2009-12-01.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2010-02-11.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2010-02-11.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2010-04-28_1218.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2010-04-28_1218.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2010-06-27_165.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2010-06-27_165.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSUL', 'springer-2011-02-17_1751.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(103, 'springer-2011-02-17_1751.mrc')) AS foo;
+
+-- Algoma (OSTMA) missing records
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'cambridge-2009-12-01.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'cambridge-2009-12-01.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'cambridge-2010-04-12.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'cambridge-2010-04-12.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'cambridge-2010-08-18.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'cambridge-2010-08-18.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'cambridge-2010-09-30_137.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'cambridge-2010-09-30_137.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'duke-2010-08-24_92.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'duke-2010-08-24_92.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'duke-2011-02-02_10.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'duke-2011-02-02_10.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'duke-2011-02-14_15.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'duke-2011-02-14_15.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson-chrc-2010-08-06.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-08-06.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson-chrc-2010-08-20.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-08-20.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson-chrc-2010-09-17.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-09-17.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson-chrc-2010-10-21_66.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-10-21_66.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson-chrc-2010-12-02_127.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson-chrc-2010-12-02_127.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'gibson_chrc-2011-02-23_212.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'gibson_chrc-2011-02-23_212.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'oxford-2010-03-04.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'oxford-2010-03-04.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'oxford-2010-04-27.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'oxford-2010-04-27.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'oxford-2010-09-30_156.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'oxford-2010-09-30_156.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'oxford-2010-10-28_49.mrc',  COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'oxford-2010-10-28_49.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2009-12-01.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2009-12-01.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2010-02-11.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2010-02-11.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2010-04-28_1218.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2010-04-28_1218.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2010-06-27_165.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2010-06-27_165.mrc')) AS foo;
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt) SELECT 'OSTMA', 'springer-2011-02-17_1751.mrc', COUNT(*) FROM (SELECT DISTINCT id, isbn, sysctl, title, author FROM scratchpad.ebook_missing_records(124, 'springer-2011-02-17_1751.mrc')) AS foo;
+
+-- List record issues for springer
+SELECT * FROM scratchpad.ebook_links_by_institution WHERE collection LIKE 'sprin%' ORDER BY collection, library;
+
+-- Count how many records are missing from a given library's collection
+SELECT COUNT(*) FROM (SELECT DISTINCT collection, id FROM scratchpad.ids_to_urls WHERE collection = 'springer-2009-12-01.mrc' AND id NOT IN (SELECT id FROM scratchpad.ebook_record_by_library WHERE collection = 'springer-2009-12-01.mrc' AND library = 109)) AS foo;
+
+-- List the records missing from a given library's collection
+SELECT * FROM scratchpad.ids_to_urls WHERE collection = 'springer-2009-12-01.mrc' AND id NOT IN (SELECT id FROM scratchpad.ebook_record_by_library WHERE collection = 'springer-2009-12-01.mrc' AND library = 109);
+
+-- Now, a function to easily generate records to look up as potential matches
+-- Row type returned by scratchpad.ebook_missing_record_matches()
+CREATE TYPE scratchpad.ebook_missing_record_matches AS (collection TEXT, id INTEGER, record BIGINT, isbn TEXT, sysctl TEXT, author TEXT, title TEXT);
+
+-- For the ebooks of a collection that a library is missing, list each bib
+-- record recorded for them in scratchpad.ebook_collections_to_records,
+-- together with the ebook's identifying data from scratchpad.ids_to_urls.
+--   $1 (library)    = library ID, passed to scratchpad.ebook_missing_records
+--   $2 (collection) = collection name
+-- CREATE OR REPLACE and the ORDER BY follow the other scratchpad functions in
+-- this script: the definition can be re-applied, and rows are produced in a
+-- stable (ebook id, record id) order for the spreadsheet report.
+CREATE OR REPLACE FUNCTION scratchpad.ebook_missing_record_matches (IN library INTEGER, IN collection TEXT)
+    RETURNS SETOF scratchpad.ebook_missing_record_matches AS
+$$
+SELECT DISTINCT $2, seb.id, seb.record, situ.isbn, situ.sysctl, situ.author, situ.title
+    FROM scratchpad.ebook_collections_to_records seb
+        INNER JOIN scratchpad.ids_to_urls situ
+            ON situ.collection = $2 AND situ.id = seb.id
+    WHERE seb.id IN (
+        SELECT id FROM scratchpad.ebook_missing_records($1, $2)
+    ) AND seb.collection = $2
+    ORDER BY seb.id, seb.record;
+$$ LANGUAGE SQL;

Added: conifer/branches/rel_1_6_1/tools/ebooks/map_isbns_to_urls.py
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/map_isbns_to_urls.py	                        (rev 0)
+++ conifer/branches/rel_1_6_1/tools/ebooks/map_isbns_to_urls.py	2011-04-01 03:09:34 UTC (rev 1295)
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+"""
+Iterate through a set of records and generate a TSV file containing
+every ISBN and system control number mapped to every URL in each record.
+"""
+
+#import os, os.path, sys, pymarc, pymarc.marc8, re
+import glob, pymarc, pymarc.marc8, re
+
+def parse_file(infile, writer):
+    """
+    Parse the file of MARC records
+    """
+    reader = pymarc.MARCReader(open(infile, 'rb'))
+    cnt = 0
+
+    for record in reader:
+        cnt = cnt + 1
+
+        isbn = get_field('020', 'a', record, cnt, infile)
+        sys_ctl_num = get_field('035', 'a', record, cnt, infile)
+
+        if not (record['856'] and record['856']['u']):
+            print("* No URL for record %s in file %s" % (cnt, infile))
+            continue
+
+        for url_field in record.get_fields('856'):
+            if url_field.indicator1 != '4':
+                # print("* Record %d has an 856 with ind1 = %s" % (cnt, url_field.indicator1))
+                continue
+    #                if  not (url_field.indicator2 == '0' or url_field.indicator2 == '1'):
+    #                    print("* Record %d has an 856 with ind2 = %s" % (cnt, url_field.indicator2))
+    #                    continue
+            for url in url_field.get_subfields('u'):
+                if url.find('loc.gov') > -1:
+                    # print("* Record %d has an 856 with url containing %s" % (cnt, url))
+                    continue
+                isbn = re.sub(r'^\D*(\d+)\D.*?$', r'\1', isbn)
+                writer.write('%d\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
+                    cnt, sys_ctl_num, isbn, record.author(), record.title(), url, infile
+                ))
+
+def get_field(field, subfield, record, cnt, infile):
+    """
+    Return a field and subfield without complaining
+
+    Should just try/catch this sucker
+    """
+    if not (record[field] and record[field][subfield]):
+        print("* No [%s][%s] for record %s in file %s" % (field, subfield, cnt, infile))
+        return('None')
+    return(record[field][subfield])
+
+if __name__ == '__main__':
+
+    OUTFILE = '/home/dan/Downloads/ebooks/isbns_to_urls.tsv'
+    tsv_writer = open(OUTFILE, 'w')
+    for marc in glob.glob('*.mrc'):
+        parse_file(marc, tsv_writer)



More information about the open-ils-commits mailing list