#!/usr/bin/perl
use strict;
+use warnings;
+
BEGIN {
+
# find Koha's Perl modules
# test carefully before changing this
use FindBin;
eval { require "$FindBin::Bin/kohalib.pl" };
}
+use Koha::Script;
use C4::Context;
use C4::Biblio;
use Getopt::Long;
+use Pod::Usage;
+use Data::Dumper;
+use Time::HiRes qw/time/;
+use POSIX qw/strftime ceil/;
+use Module::Load::Conditional qw(can_load);
+
+sub usage {
+ pod2usage( -verbose => 2 );
+ exit;
+}
$| = 1;
# command-line parameters
-my $verbose = 0;
-my $test_only = 0;
-my $want_help = 0;
+my $verbose = 0;
+my $link_report = 0;
+my $test_only = 0;
+my $want_help = 0;
+my $auth_limit;
+my $bib_limit;
+my $commit = 100;
my $result = GetOptions(
- 'verbose' => \$verbose,
- 'test' => \$test_only,
- 'h|help' => \$want_help
+ 'v|verbose' => \$verbose,
+ 't|test' => \$test_only,
+ 'l|link-report' => \$link_report,
+ 'a|auth-limit=s' => \$auth_limit,
+ 'b|bib-limit=s' => \$bib_limit,
+ 'c|commit=i' => \$commit,
+ 'h|help' => \$want_help
);
-if (not $result or $want_help) {
- print_usage();
- exit 0;
+binmode( STDOUT, ":utf8" );
+
+if ( not $result or $want_help ) {
+ usage();
+}
+
+my $linker_module =
+ "C4::Linker::" . ( C4::Context->preference("LinkerModule") || 'Default' );
+unless ( can_load( modules => { $linker_module => undef } ) ) {
+ $linker_module = 'C4::Linker::Default';
+ unless ( can_load( modules => { $linker_module => undef } ) ) {
+ die "Unable to load linker module. Aborting.";
+ }
}
+my $linker = $linker_module->new(
+ {
+ 'auth_limit' => $auth_limit,
+ 'options' => C4::Context->preference("LinkerOptions")
+ }
+);
+
my $num_bibs_processed = 0;
-my $num_bibs_modified = 0;
-my $num_bad_bibs = 0;
+my $num_bibs_modified = 0;
+my $num_bad_bibs = 0;
+my %unlinked_headings;
+my %linked_headings;
+my %fuzzy_headings;
my $dbh = C4::Context->dbh;
$dbh->{AutoCommit} = 0;
-process_bibs();
+process_bibs( $linker, $bib_limit, $auth_limit, $commit );
$dbh->commit();
exit 0;
sub process_bibs {
- my $sql = "SELECT biblionumber FROM biblio ORDER BY biblionumber ASC";
+ my ( $linker, $bib_limit, $auth_limit, $commit ) = @_;
+ my $bib_where = '';
+ my $starttime = time();
+ if ($bib_limit) {
+ $bib_where = "WHERE $bib_limit";
+ }
+ my $sql =
+ "SELECT biblionumber FROM biblio $bib_where ORDER BY biblionumber ASC";
my $sth = $dbh->prepare($sql);
$sth->execute();
- while (my ($biblionumber) = $sth->fetchrow_array()) {
+ while ( my ($biblionumber) = $sth->fetchrow_array() ) {
$num_bibs_processed++;
- process_bib($biblionumber);
+ process_bib( $linker, $biblionumber );
- if (not $test_only and ($num_bibs_processed % 100) == 0) {
+ if ( not $test_only and ( $num_bibs_processed % $commit ) == 0 ) {
print_progress_and_commit($num_bibs_processed);
}
}
- if (not $test_only) {
+ if ( not $test_only ) {
$dbh->commit;
}
- print <<_SUMMARY_;
+ my $headings_linked = 0;
+ my $headings_unlinked = 0;
+ my $headings_fuzzy = 0;
+ for ( values %linked_headings ) { $headings_linked += $_; }
+ for ( values %unlinked_headings ) { $headings_unlinked += $_; }
+ for ( values %fuzzy_headings ) { $headings_fuzzy += $_; }
+
+ my $endtime = time();
+ my $totaltime = ceil (($endtime - $starttime) * 1000);
+ $starttime = strftime('%D %T', localtime($starttime));
+ $endtime = strftime('%D %T', localtime($endtime));
+
+ my $summary = <<_SUMMARY_;
Bib authority heading linking report
-------------------------------------
-Number of bibs checked: $num_bibs_processed
-Number of bibs modified: $num_bibs_modified
-Number of bibs with errors: $num_bad_bibs
+=======================================================
+Linker module: $linker_module
+Run started at: $starttime
+Run ended at: $endtime
+Total run time: $totaltime ms
+Number of bibs checked: $num_bibs_processed
+Number of bibs modified: $num_bibs_modified
+Number of bibs with errors: $num_bad_bibs
+Number of headings linked: $headings_linked
+Number of headings unlinked: $headings_unlinked
+Number of headings fuzzily linked: $headings_fuzzy
_SUMMARY_
+ $summary .= "\n**** Ran in test mode only ****\n" if $test_only;
+ print $summary;
+
+ if ($link_report) {
+ my @keys;
+ print <<_LINKED_HEADER_;
+
+Linked headings (from most frequent to least):
+-------------------------------------------------------
+
+_LINKED_HEADER_
+
+ @keys = sort {
+ $linked_headings{$b} <=> $linked_headings{$a} or "\L$a" cmp "\L$b"
+ } keys %linked_headings;
+ foreach my $key (@keys) {
+ print "$key:\t" . $linked_headings{$key} . " occurrences\n";
+ }
+
+ print <<_UNLINKED_HEADER_;
+
+Unlinked headings (from most frequent to least):
+-------------------------------------------------------
+
+_UNLINKED_HEADER_
+
+ @keys = sort {
+ $unlinked_headings{$b} <=> $unlinked_headings{$a}
+ or "\L$a" cmp "\L$b"
+ } keys %unlinked_headings;
+ foreach my $key (@keys) {
+ print "$key:\t" . $unlinked_headings{$key} . " occurrences\n";
+ }
+
+ print <<_FUZZY_HEADER_;
+
+Fuzzily-matched headings (from most frequent to least):
+-------------------------------------------------------
+
+_FUZZY_HEADER_
+
+ @keys = sort {
+ $fuzzy_headings{$b} <=> $fuzzy_headings{$a} or "\L$a" cmp "\L$b"
+ } keys %fuzzy_headings;
+ foreach my $key (@keys) {
+ print "$key:\t" . $fuzzy_headings{$key} . " occurrences\n";
+ }
+ print $summary;
+ }
}
sub process_bib {
+ my $linker = shift;
my $biblionumber = shift;
- my $bib = GetMarcBiblio($biblionumber);
- unless (defined $bib) {
- print "\nCould not retrieve bib $biblionumber from the database - record is corrupt.\n";
+ my $bib = GetMarcBiblio({ biblionumber => $biblionumber });
+ unless ( defined $bib ) {
+ print
+"\nCould not retrieve bib $biblionumber from the database - record is corrupt.\n";
$num_bad_bibs++;
return;
}
- my $headings_changed = LinkBibHeadingsToAuthorities($bib);
+ my $frameworkcode = GetFrameworkCode($biblionumber);
- if ($headings_changed) {
+ my ( $headings_changed, $results ) =
+ LinkBibHeadingsToAuthorities( $linker, $bib, $frameworkcode );
+ foreach my $key ( keys %{ $results->{'unlinked'} } ) {
+ $unlinked_headings{$key} += $results->{'unlinked'}->{$key};
+ }
+ foreach my $key ( keys %{ $results->{'linked'} } ) {
+ $linked_headings{$key} += $results->{'linked'}->{$key};
+ }
+ foreach my $key ( keys %{ $results->{'fuzzy'} } ) {
+ $fuzzy_headings{$key} += $results->{'fuzzy'}->{$key};
+ }
+
+ if ($headings_changed) {
if ($verbose) {
- my $title = substr($bib->title, 0, 20);
- print "Bib $biblionumber ($title): $headings_changed headings changed\n";
+ my $title = substr( $bib->title, 0, 20 );
+ printf(
+ "Bib %12d (%-20s): %3d headings changed\n",
+ $biblionumber,
+ $title,
+ $headings_changed
+ );
}
- if (not $test_only) {
- ModBiblio($bib, $biblionumber, GetFrameworkCode($biblionumber));
+ if ( not $test_only ) {
+ ModBiblio( $bib, $biblionumber, $frameworkcode, 1 );
+ #Last param is to note ModBiblio was called from linking script and bib should not be linked again
$num_bibs_modified++;
}
}
print "... processed $recs records\n";
}
-sub print_usage {
- print <<_USAGE_;
-$0: link headings in bib records to authorities.
-
-This batch job checks each bib record in the Koha
-database and attempts to link each of its headings
-to the matching authority record.
-
-Parameters:
- --verbose print the number of headings changed
- for each bib
- --test only test the authority linking
- and report the results; do not
- change the bib records.
- --comment <comment> optional comment to describe
- the record batch; if the comment
- has spaces in it, surround the
- comment with quotation marks.
- --help or -h show this message.
-_USAGE_
-}
+=head1 NAME
+
+link_bibs_to_authorities.pl
+
+=head1 SYNOPSIS
+
+ link_bibs_to_authorities.pl
+ link_bibs_to_authorities.pl -v
+ link_bibs_to_authorities.pl -l
+ link_bibs_to_authorities.pl --commit=1000
+ link_bibs_to_authorities.pl --auth-limit=STRING
+ link_bibs_to_authorities.pl --bib-limit=STRING
+
+=head1 DESCRIPTION
+
+This batch job checks each bib record in the Koha database and attempts to link
+each of its headings to the matching authority record.
+
+=over 8
+
+=item B<--help>
+
+Prints this help
+
+=item B<-v|--verbose>
+
+Provide verbose log information (print the number of headings changed for each
+bib record).
+
+=item B<-l|--link-report>
+
+Provide a report of all the headings that were processed: which were matched,
+which were not, etc.
+
+=item B<--auth-limit=S>
+
+Only process those headings which match an authority record that matches the
+user-specified WHERE clause.
+
+=item B<--bib-limit=S>
+
+Only process those bib records that match the user-specified WHERE clause.
+
+=item B<--commit=N>
+
+Commit the results to the database after every N records are processed.
+
+=item B<--test>
+
+Only test the authority linking and report the results; do not change the bib
+records.
+
+=back
+
+=cut