[open-ils-commits] r1295 - conifer/branches/rel_1_6_1/tools/ebooks (dbs)
svn at svn.open-ils.org
svn at svn.open-ils.org
Thu Mar 31 23:09:36 EDT 2011
Author: dbs
Date: 2011-03-31 23:09:34 -0400 (Thu, 31 Mar 2011)
New Revision: 1295
Added:
conifer/branches/rel_1_6_1/tools/ebooks/ebook_reports.pl
conifer/branches/rel_1_6_1/tools/ebooks/ebooks.sql
conifer/branches/rel_1_6_1/tools/ebooks/map_isbns_to_urls.py
Log:
Commit current ebook processing scripts
These scripts are helping me attempt to make sense of the past
three years of our electronic book loading, which happened
consortially and non-consortially, and with and without a working
ingest for multiple located URIs per record.
Added: conifer/branches/rel_1_6_1/tools/ebooks/ebook_reports.pl
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/ebook_reports.pl (rev 0)
+++ conifer/branches/rel_1_6_1/tools/ebooks/ebook_reports.pl 2011-04-01 03:09:34 UTC (rev 1295)
@@ -0,0 +1,92 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+use DBI;
+use Spreadsheet::WriteExcel;
+
+# Library whose holdings are analysed: "id" is handed to add_analysis() as the
+# library argument of the report query; "name" labels the cover sheet and is
+# used to build the output file name.
+my %library = (
+    id   => 109,
+    name => 'Windsor',
+);
+
+# Collections to report on; add_analysis() creates one worksheet per entry,
+# named after the entry.
+my @ebook_records = qw/
+    cambridge-2009-12-01.mrc
+    cambridge-2010-04-12.mrc
+    cambridge-2010-08-18.mrc
+    cambridge-2010-09-30_137.mrc
+    duke-2010-08-24_92.mrc
+    duke-2011-02-02_10.mrc
+    duke-2011-02-14_15.mrc
+    gibson-chrc-2010-08-06.mrc
+    gibson-chrc-2010-08-20.mrc
+    gibson-chrc-2010-09-17.mrc
+    gibson-chrc-2010-10-21_66.mrc
+    gibson-chrc-2010-12-02_127.mrc
+    gibson_chrc-2011-02-23_212.mrc
+    oxford-2010-03-04.mrc
+    oxford-2010-04-27.mrc
+    oxford-2010-09-30_156.mrc
+    oxford-2010-10-28_49.mrc
+    oxford_2010-03-04.mrc
+    springer-2009-12-01.mrc
+    springer-2010-02-11.mrc
+    springer-2010-04-28_1218.mrc
+    springer-2010-06-27_165.mrc
+    springer-2011-02-17_1751.mrc
+/;
+
+# Report the driver's own error text when the connection fails.
+my $dbh = DBI->connect(
+    "dbi:Pg:dbname=conifer;host=polaris.cs.uoguelph.ca", "evergreen", "",
+    { PrintError => 0 }
+) or die "Can't connect to database: $DBI::errstr\n";
+
+# Make every later prepare/execute failure fatal; without this a failed
+# execute only warns and the affected worksheet is silently left empty.
+$dbh->{RaiseError} = 1;
+
+# new() returns undef when the file cannot be created, so check before use.
+my $workbook_path = "/openils/var/web/ebooks/" . $library{"name"} . ".xls";
+my $workbook = Spreadsheet::WriteExcel->new($workbook_path);
+die "Can't create workbook $workbook_path: $!\n" unless defined $workbook;
+
+my $worksheet = $workbook->add_worksheet("Cover Sheet");
+
+my $bold = $workbook->add_format();
+$bold->set_bold();
+
+$worksheet->write(5, 0, "Ebook analysis:", $bold);
+$worksheet->write(5, 1, $library{"name"});
+
+foreach my $marc (@ebook_records) {
+    add_analysis($marc, $library{"id"});
+}
+
+$dbh->disconnect();
+$workbook->close();
+exit;
+
+# Add one worksheet to the workbook listing the rows returned by
+# scratchpad.ebook_missing_record_matches() for a library and collection.
+#
+# Arguments:
+#   $collection - collection name; also used as the worksheet name
+#   $library    - library ID, passed as the first argument of the query
+#
+# Relies on the file-level $workbook, $dbh and $bold. Returns nothing.
+sub add_analysis {
+    my ($collection, $library) = @_;
+
+    # Format as a string. Doesn't change to a number when edited
+    my $format_num = $workbook->add_format(num_format => '@');
+
+    # Keep the sheet handle local rather than overwriting the file-level
+    # $worksheet, which holds the cover sheet.
+    my $sheet = $workbook->add_worksheet("$collection");
+
+    my @headings = ("Collection", "Ebook ID", "Record ID", "ISBN", "System Control", "Author", "Title");
+    for my $col (0 .. $#headings) {
+        $sheet->write(0, $col, $headings[$col], $bold);
+    }
+
+    my $sth = $dbh->prepare("SELECT collection, id, record, isbn, sysctl, author, title
+        FROM scratchpad.ebook_missing_record_matches(?, ?)");
+    $sth->execute($library, $collection);
+
+    my $row = 1;
+    # Named $match rather than $a: $a is the package variable sort() uses.
+    while (my $match = $sth->fetchrow_hashref()) {
+        # NULL columns come back as undef; write them as empty strings so
+        # write_string() is never handed an undefined value.
+        my %text;
+        for my $field (qw(collection isbn sysctl author title)) {
+            $text{$field} = defined $match->{$field} ? $match->{$field} : '';
+        }
+
+        $sheet->write_string($row, 0, $text{collection}, $bold);
+        $sheet->write($row, 1, $match->{id});
+        $sheet->write($row, 2, $match->{record});
+        $sheet->write_string($row, 3, $text{isbn}, $format_num);
+        $sheet->write_string($row, 4, $text{sysctl});
+        $sheet->write_string($row, 5, $text{author});
+        $sheet->write_string($row, 6, $text{title});
+
+        $row++;
+    }
+
+    return;
+}
+
Added: conifer/branches/rel_1_6_1/tools/ebooks/ebooks.sql
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/ebooks.sql (rev 0)
+++ conifer/branches/rel_1_6_1/tools/ebooks/ebooks.sql 2011-04-01 03:09:34 UTC (rev 1295)
@@ -0,0 +1,156 @@
+-- Empty the two scratch tables that the statements below repopulate.
+TRUNCATE scratchpad.ebook_collections_to_records;
+
+TRUNCATE scratchpad.ebook_links_by_institution;
+
+-- Find any bib records that match on isbn
+-- (a match means the one-element array holding situ.isbn is contained in rmsr.isbn;
+-- the NOT IN clause skips rows already present in the target table)
+INSERT INTO scratchpad.ebook_collections_to_records (collection, id, record) SELECT DISTINCT situ.collection, situ.id, rmsr.id AS record FROM scratchpad.ids_to_urls situ INNER JOIN reporter.materialized_simple_record rmsr ON ARRAY[situ.isbn] <@ rmsr.isbn WHERE situ.isbn IS NOT NULL AND (collection, situ.id, rmsr.id) NOT IN (SELECT collection, id, record FROM scratchpad.ebook_collections_to_records);
+
+-- Find any bib records that match on isbn, round 2, because reporter.materialized_simple_record needs updating
+-- (compares situ.isbn with the 020 $a values in metabib.full_rec instead)
+INSERT INTO scratchpad.ebook_collections_to_records (collection, id, record) SELECT DISTINCT collection, situ.id, record FROM metabib.full_rec mfr INNER JOIN scratchpad.ids_to_urls situ ON situ.isbn = mfr.value AND mfr.tag = '020' AND mfr.subfield = 'a' WHERE (collection, situ.id, record) NOT IN (SELECT collection, id, record FROM scratchpad.ebook_collections_to_records) AND situ.isbn IS NOT NULL;
+
+-- Find any bib records that match on system control number
+-- (situ.sysctl is lower-cased before being compared with the 035 $a values)
+INSERT INTO scratchpad.ebook_collections_to_records (collection, id, record) SELECT DISTINCT collection, situ.id, record FROM metabib.full_rec mfr INNER JOIN scratchpad.ids_to_urls situ ON LOWER(situ.sysctl) = mfr.value AND mfr.tag = '035' AND mfr.subfield = 'a' WHERE (collection, situ.id, record) NOT IN (SELECT collection, id, record FROM scratchpad.ebook_collections_to_records) AND situ.sysctl IS NOT NULL;
+
+-- Set the canonical number of unique records per batch
+-- (one row per collection, stored under the pseudo-library 'ALL')
+INSERT INTO scratchpad.ebook_links_by_institution (library, cnt, collection) SELECT 'ALL', COUNT(*), x.collection FROM (SELECT DISTINCT id, collection FROM scratchpad.ids_to_urls) AS x GROUP BY x.collection ORDER BY x.collection;
+
+-- Table that maps which ebook record each library has for a given collection record
+CREATE TABLE scratchpad.ebook_record_by_library (library INTEGER, record BIGINT, id INTEGER, collection TEXT);
+
+-- Populate the table
+-- One INSERT per library: a matched record counts as held when the library
+-- owns a non-deleted call number on it. 109, 103 and 124 are the IDs this
+-- script later reports as 'OWA', 'OSUL' and 'OSTMA' respectively.
+INSERT INTO scratchpad.ebook_record_by_library (library, record, id, collection) SELECT 109, record, id, seb.collection FROM scratchpad.ebook_collections_to_records seb WHERE seb.record IN (SELECT record FROM asset.call_number acn WHERE acn.deleted IS FALSE AND acn.owning_lib = 109) ORDER BY collection, id;
+
+INSERT INTO scratchpad.ebook_record_by_library (library, record, id, collection) SELECT 103, record, id, seb.collection FROM scratchpad.ebook_collections_to_records seb WHERE seb.record IN (SELECT record FROM asset.call_number acn WHERE acn.deleted IS FALSE AND acn.owning_lib = 103) ORDER BY collection, id;
+
+INSERT INTO scratchpad.ebook_record_by_library (library, record, id, collection) SELECT 124, record, id, seb.collection FROM scratchpad.ebook_collections_to_records seb WHERE seb.record IN (SELECT record FROM asset.call_number acn WHERE acn.deleted IS FALSE AND acn.owning_lib = 124) ORDER BY collection, id;
+
+-- Row type returned by scratchpad.ebook_missing_records()
+CREATE TYPE scratchpad.ebook AS (id INTEGER, isbn TEXT, sysctl TEXT, author TEXT, title TEXT, url TEXT);
+
+-- Returns the distinct scratchpad.ids_to_urls entries of a collection ($2)
+-- whose id is not listed for the library ($1) in
+-- scratchpad.ebook_record_by_library, ordered by id.
+CREATE OR REPLACE FUNCTION scratchpad.ebook_missing_records (IN INTEGER, IN TEXT)
+    RETURNS SETOF scratchpad.ebook AS
+$$
+    SELECT DISTINCT situ.id, situ.isbn, situ.sysctl, situ.author, situ.title, situ.url
+    FROM scratchpad.ids_to_urls situ
+    WHERE situ.collection = $2
+        AND situ.id NOT IN (
+            SELECT held.id
+            FROM scratchpad.ebook_record_by_library held
+            WHERE held.library = $1
+                AND held.collection = $2
+        )
+    ORDER BY situ.id;
+$$ LANGUAGE SQL;
+
+-- Lists the ebooks of a collection ($2) for which a library ($1) owns a
+-- non-deleted call number on a matched bib record.
+-- Output columns, in order: ebook id, isbn, sysctl, author, title.
+CREATE OR REPLACE FUNCTION scratchpad.ebook_has_records (IN INTEGER, IN TEXT, OUT INTEGER, OUT TEXT, OUT TEXT, OUT TEXT, OUT TEXT)
+    RETURNS SETOF record AS
+$$
+    SELECT DISTINCT situ.id, situ.isbn, situ.sysctl, situ.author, situ.title
+    FROM scratchpad.ebook_collections_to_records seb
+        INNER JOIN scratchpad.ids_to_urls situ
+            ON seb.id = situ.id AND seb.collection = situ.collection
+        INNER JOIN asset.call_number acn
+            ON seb.record = acn.record
+--      INNER JOIN asset.uri_call_number_map auricnm
+--          ON auricnm.call_number = acn.id
+    WHERE acn.owning_lib = $1 AND acn.deleted IS FALSE AND situ.collection = $2
+    ORDER BY situ.id;
+$$ LANGUAGE SQL;
+
+-- Should have scripted this but went with the power of Vim macros
+-- OWA missing records
+-- A single set-based INSERT, so each collection name is written exactly once
+-- and the stored label can never differ from the collection that is counted
+-- (the 'springer-2010-06-27_165.mrc' row used to count
+-- 'springer-2011-06-27_165.mrc').
+-- cnt is the number of distinct (id, isbn, sysctl, title, author) rows that
+-- scratchpad.ebook_missing_records() returns for library 109.
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt)
+    SELECT 'OWA', c.collection, (
+        SELECT COUNT(*) FROM (
+            SELECT DISTINCT id, isbn, sysctl, title, author
+            FROM scratchpad.ebook_missing_records(109, c.collection)
+        ) AS foo
+    )
+    FROM (VALUES
+        ('cambridge-2009-12-01.mrc'),
+        ('cambridge-2010-04-12.mrc'),
+        ('cambridge-2010-08-18.mrc'),
+        ('cambridge-2010-09-30_137.mrc'),
+        ('duke-2010-08-24_92.mrc'),
+        ('duke-2011-02-02_10.mrc'),
+        ('duke-2011-02-14_15.mrc'),
+        ('gibson-chrc-2010-08-06.mrc'),
+        ('gibson-chrc-2010-08-20.mrc'),
+        ('gibson-chrc-2010-09-17.mrc'),
+        ('gibson-chrc-2010-10-21_66.mrc'),
+        ('gibson-chrc-2010-12-02_127.mrc'),
+        ('gibson_chrc-2011-02-23_212.mrc'),
+        ('oxford-2010-03-04.mrc'),
+        ('oxford-2010-04-27.mrc'),
+        ('oxford-2010-09-30_156.mrc'),
+        ('oxford-2010-10-28_49.mrc'),
+        ('oxford_2010-03-04.mrc'),
+        ('springer-2009-12-01.mrc'),
+        ('springer-2010-02-11.mrc'),
+        ('springer-2010-04-28_1218.mrc'),
+        ('springer-2010-06-27_165.mrc'),
+        ('springer-2011-02-17_1751.mrc')
+    ) AS c (collection);
+
+-- Laurentian ebook links
+-- A single set-based INSERT, so each collection name is written exactly once
+-- and the stored label can never differ from the collection that is counted
+-- (the 'springer-2010-06-27_165.mrc' row used to count
+-- 'springer-2011-06-27_165.mrc').
+-- cnt is the number of distinct (id, isbn, sysctl, title, author) rows that
+-- scratchpad.ebook_missing_records() returns for library 103.
+-- NOTE(review): 'oxford_2010-03-04.mrc' is counted for OWA but was never
+-- listed for this library; confirm whether that omission is intentional.
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt)
+    SELECT 'OSUL', c.collection, (
+        SELECT COUNT(*) FROM (
+            SELECT DISTINCT id, isbn, sysctl, title, author
+            FROM scratchpad.ebook_missing_records(103, c.collection)
+        ) AS foo
+    )
+    FROM (VALUES
+        ('cambridge-2009-12-01.mrc'),
+        ('cambridge-2010-04-12.mrc'),
+        ('cambridge-2010-08-18.mrc'),
+        ('cambridge-2010-09-30_137.mrc'),
+        ('duke-2010-08-24_92.mrc'),
+        ('duke-2011-02-02_10.mrc'),
+        ('duke-2011-02-14_15.mrc'),
+        ('gibson-chrc-2010-08-06.mrc'),
+        ('gibson-chrc-2010-08-20.mrc'),
+        ('gibson-chrc-2010-09-17.mrc'),
+        ('gibson-chrc-2010-10-21_66.mrc'),
+        ('gibson-chrc-2010-12-02_127.mrc'),
+        ('gibson_chrc-2011-02-23_212.mrc'),
+        ('oxford-2010-03-04.mrc'),
+        ('oxford-2010-04-27.mrc'),
+        ('oxford-2010-09-30_156.mrc'),
+        ('oxford-2010-10-28_49.mrc'),
+        ('springer-2009-12-01.mrc'),
+        ('springer-2010-02-11.mrc'),
+        ('springer-2010-04-28_1218.mrc'),
+        ('springer-2010-06-27_165.mrc'),
+        ('springer-2011-02-17_1751.mrc')
+    ) AS c (collection);
+
+-- Algoma ebook links
+-- A single set-based INSERT, so each collection name is written exactly once
+-- and the stored label can never differ from the collection that is counted
+-- (the 'springer-2010-06-27_165.mrc' row used to count
+-- 'springer-2011-06-27_165.mrc').
+-- cnt is the number of distinct (id, isbn, sysctl, title, author) rows that
+-- scratchpad.ebook_missing_records() returns for library 124.
+-- NOTE(review): 'oxford_2010-03-04.mrc' is counted for OWA but was never
+-- listed for this library; confirm whether that omission is intentional.
+INSERT INTO scratchpad.ebook_links_by_institution (library, collection, cnt)
+    SELECT 'OSTMA', c.collection, (
+        SELECT COUNT(*) FROM (
+            SELECT DISTINCT id, isbn, sysctl, title, author
+            FROM scratchpad.ebook_missing_records(124, c.collection)
+        ) AS foo
+    )
+    FROM (VALUES
+        ('cambridge-2009-12-01.mrc'),
+        ('cambridge-2010-04-12.mrc'),
+        ('cambridge-2010-08-18.mrc'),
+        ('cambridge-2010-09-30_137.mrc'),
+        ('duke-2010-08-24_92.mrc'),
+        ('duke-2011-02-02_10.mrc'),
+        ('duke-2011-02-14_15.mrc'),
+        ('gibson-chrc-2010-08-06.mrc'),
+        ('gibson-chrc-2010-08-20.mrc'),
+        ('gibson-chrc-2010-09-17.mrc'),
+        ('gibson-chrc-2010-10-21_66.mrc'),
+        ('gibson-chrc-2010-12-02_127.mrc'),
+        ('gibson_chrc-2011-02-23_212.mrc'),
+        ('oxford-2010-03-04.mrc'),
+        ('oxford-2010-04-27.mrc'),
+        ('oxford-2010-09-30_156.mrc'),
+        ('oxford-2010-10-28_49.mrc'),
+        ('springer-2009-12-01.mrc'),
+        ('springer-2010-02-11.mrc'),
+        ('springer-2010-04-28_1218.mrc'),
+        ('springer-2010-06-27_165.mrc'),
+        ('springer-2011-02-17_1751.mrc')
+    ) AS c (collection);
+
+-- List record issues for springer
+-- (every row of scratchpad.ebook_links_by_institution whose collection starts with 'sprin')
+SELECT * FROM scratchpad.ebook_links_by_institution WHERE collection LIKE 'sprin%' ORDER BY collection, library;
+
+-- Count how many records are missing from a given library's collection
+-- (example values: collection 'springer-2009-12-01.mrc', library 109)
+SELECT COUNT(*) FROM (SELECT DISTINCT collection, id FROM scratchpad.ids_to_urls WHERE collection = 'springer-2009-12-01.mrc' AND id NOT IN (SELECT id FROM scratchpad.ebook_record_by_library WHERE collection = 'springer-2009-12-01.mrc' AND library = 109)) AS foo;
+
+-- List the records missing from a given library's collection
+-- (terminated with a semicolon so it is not parsed together with the statement that follows it)
+SELECT * FROM scratchpad.ids_to_urls WHERE collection = 'springer-2009-12-01.mrc' AND id NOT IN (SELECT id FROM scratchpad.ebook_record_by_library WHERE collection = 'springer-2009-12-01.mrc' AND library = 109);
+
+-- Now, a function to easily generate records to look up as potential matches
+-- Row type returned by the function of the same name.
+CREATE TYPE scratchpad.ebook_missing_record_matches AS (collection TEXT, id INTEGER, record BIGINT, isbn TEXT, sysctl TEXT, author TEXT, title TEXT);
+
+-- For a library ($1) and collection ($2): every ebook that
+-- scratchpad.ebook_missing_records() reports as missing and that has at least
+-- one row in scratchpad.ebook_collections_to_records, returned together with
+-- the matched record id and the ebook's isbn, sysctl, author and title.
+-- CREATE OR REPLACE matches the other functions in this script; the ORDER BY
+-- gives the generated report a stable row order.
+CREATE OR REPLACE FUNCTION scratchpad.ebook_missing_record_matches (IN library INTEGER, IN collection TEXT)
+    RETURNS SETOF scratchpad.ebook_missing_record_matches AS
+$$
+    SELECT DISTINCT $2, seb.id, seb.record, situ.isbn, situ.sysctl, situ.author, situ.title
+    FROM scratchpad.ebook_collections_to_records seb
+        INNER JOIN scratchpad.ids_to_urls situ
+            ON situ.collection = $2 AND situ.id = seb.id
+    WHERE seb.id IN (
+            SELECT id FROM scratchpad.ebook_missing_records($1, $2)
+        ) AND seb.collection = $2
+    ORDER BY seb.id, seb.record;
+$$ LANGUAGE SQL;
Added: conifer/branches/rel_1_6_1/tools/ebooks/map_isbns_to_urls.py
===================================================================
--- conifer/branches/rel_1_6_1/tools/ebooks/map_isbns_to_urls.py (rev 0)
+++ conifer/branches/rel_1_6_1/tools/ebooks/map_isbns_to_urls.py 2011-04-01 03:09:34 UTC (rev 1295)
@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+"""
+Iterate through a set of records and generate a TSV file containing
+every ISBN and system control number mapped to every URL in each record.
+"""
+
+#import os, os.path, sys, pymarc, pymarc.marc8, re
+import glob, pymarc, pymarc.marc8, re
+
+def parse_file(infile, writer):
+ """
+ Parse the file of MARC records
+ """
+ reader = pymarc.MARCReader(open(infile, 'rb'))
+ cnt = 0
+
+ for record in reader:
+ cnt = cnt + 1
+
+ isbn = get_field('020', 'a', record, cnt, infile)
+ sys_ctl_num = get_field('035', 'a', record, cnt, infile)
+
+ if not (record['856'] and record['856']['u']):
+ print("* No URL for record %s in file %s" % (cnt, infile))
+ continue
+
+ for url_field in record.get_fields('856'):
+ if url_field.indicator1 != '4':
+ # print("* Record %d has an 856 with ind1 = %s" % (cnt, url_field.indicator1))
+ continue
+ # if not (url_field.indicator2 == '0' or url_field.indicator2 == '1'):
+ # print("* Record %d has an 856 with ind2 = %s" % (cnt, url_field.indicator2))
+ # continue
+ for url in url_field.get_subfields('u'):
+ if url.find('loc.gov') > -1:
+ # print("* Record %d has an 856 with url containing %s" % (cnt, url))
+ continue
+ isbn = re.sub(r'^\D*(\d+)\D.*?$', r'\1', isbn)
+ writer.write('%d\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
+ cnt, sys_ctl_num, isbn, record.author(), record.title(), url, infile
+ ))
+
+def get_field(field, subfield, record, cnt, infile):
+ """
+ Return a field and subfield without complaining
+
+ Should just try/catch this sucker
+ """
+ if not (record[field] and record[field][subfield]):
+ print("* No [%s][%s] for record %s in file %s" % (field, subfield, cnt, infile))
+ return('None')
+ return(record[field][subfield])
+
+if __name__ == '__main__':
+
+ OUTFILE = '/home/dan/Downloads/ebooks/isbns_to_urls.tsv'
+ tsv_writer = open(OUTFILE, 'w')
+ for marc in glob.glob('*.mrc'):
+ parse_file(marc, tsv_writer)
More information about the open-ils-commits
mailing list