X-Git-Url: http://koha-dev.rot13.org:8081/gitweb/?a=blobdiff_plain;f=misc%2Flink_bibs_to_authorities.pl;h=37596c5024540b7588527a62201e46bf38565379;hb=6b928438ce06f7f7d664766b8206001c57bea817;hp=e3f9bdc824152a7b7bf9b6a35d3609c3a10de77b;hpb=5e0b850d49f452ffbca41f47dc190d6d4f2323c7;p=koha_fer diff --git a/misc/link_bibs_to_authorities.pl b/misc/link_bibs_to_authorities.pl index e3f9bdc824..37596c5024 100755 --- a/misc/link_bibs_to_authorities.pl +++ b/misc/link_bibs_to_authorities.pl @@ -1,8 +1,10 @@ #!/usr/bin/perl use strict; -#use warnings; FIXME - Bug 2505 +use warnings; + BEGIN { + # find Koha's Perl modules # test carefully before changing this use FindBin; @@ -12,86 +14,208 @@ BEGIN { use C4::Context; use C4::Biblio; use Getopt::Long; +use Pod::Usage; +use Data::Dumper; +use Time::HiRes qw/time/; +use POSIX qw/strftime ceil/; +use Module::Load::Conditional qw(can_load); + +sub usage { + pod2usage( -verbose => 2 ); + exit; +} $| = 1; # command-line parameters -my $verbose = 0; -my $test_only = 0; -my $want_help = 0; +my $verbose = 0; +my $link_report = 0; +my $test_only = 0; +my $want_help = 0; +my $auth_limit; +my $bib_limit; +my $commit = 100; my $result = GetOptions( - 'verbose' => \$verbose, - 'test' => \$test_only, - 'h|help' => \$want_help + 'v|verbose' => \$verbose, + 't|test' => \$test_only, + 'l|link-report' => \$link_report, + 'a|auth-limit=s' => \$auth_limit, + 'b|bib-limit=s' => \$bib_limit, + 'c|commit=i' => \$commit, + 'h|help' => \$want_help ); -if (not $result or $want_help) { - print_usage(); - exit 0; +binmode( STDOUT, ":utf8" ); + +if ( not $result or $want_help ) { + usage(); +} + +my $linker_module = + "C4::Linker::" . ( C4::Context->preference("LinkerModule") || 'Default' ); +unless ( can_load( modules => { $linker_module => undef } ) ) { + $linker_module = 'C4::Linker::Default'; + unless ( can_load( modules => { $linker_module => undef } ) ) { + die "Unable to load linker module. Aborting."; + } } +my $linker = $linker_module->new( + { + 'auth_limit' => $auth_limit, + 'options' => C4::Context->preference("LinkerOptions") + } +); + my $num_bibs_processed = 0; -my $num_bibs_modified = 0; -my $num_bad_bibs = 0; +my $num_bibs_modified = 0; +my $num_bad_bibs = 0; +my %unlinked_headings; +my %linked_headings; +my %fuzzy_headings; my $dbh = C4::Context->dbh; $dbh->{AutoCommit} = 0; -process_bibs(); +process_bibs( $linker, $bib_limit, $auth_limit, $commit ); $dbh->commit(); exit 0; sub process_bibs { - my $sql = "SELECT biblionumber FROM biblio ORDER BY biblionumber ASC"; + my ( $linker, $bib_limit, $auth_limit, $commit ) = @_; + my $bib_where = ''; + my $starttime = time(); + if ($bib_limit) { + $bib_where = "WHERE $bib_limit"; + } + my $sql = + "SELECT biblionumber FROM biblio $bib_where ORDER BY biblionumber ASC"; my $sth = $dbh->prepare($sql); $sth->execute(); - while (my ($biblionumber) = $sth->fetchrow_array()) { + while ( my ($biblionumber) = $sth->fetchrow_array() ) { $num_bibs_processed++; - process_bib($biblionumber); + process_bib( $linker, $biblionumber ); - if (not $test_only and ($num_bibs_processed % 100) == 0) { + if ( not $test_only and ( $num_bibs_processed % $commit ) == 0 ) { print_progress_and_commit($num_bibs_processed); } } - if (not $test_only) { + if ( not $test_only ) { $dbh->commit; } - print <<_SUMMARY_; + my $headings_linked = 0; + my $headings_unlinked = 0; + my $headings_fuzzy = 0; + for ( values %linked_headings ) { $headings_linked += $_; } + for ( values %unlinked_headings ) { $headings_unlinked += $_; } + for ( values %fuzzy_headings ) { $headings_fuzzy += $_; } + + my $endtime = time(); + my $totaltime = ceil (($endtime - $starttime) * 1000); + $starttime = strftime('%D %T', localtime($starttime)); + $endtime = strftime('%D %T', localtime($endtime)); + + my $summary = <<_SUMMARY_; Bib authority heading linking report ------------------------------------- -Number of bibs checked: $num_bibs_processed -Number of bibs modified: $num_bibs_modified -Number of bibs with errors: $num_bad_bibs +======================================================= +Linker module: $linker_module +Run started at: $starttime +Run ended at: $endtime +Total run time: $totaltime ms +Number of bibs checked: $num_bibs_processed +Number of bibs modified: $num_bibs_modified +Number of bibs with errors: $num_bad_bibs +Number of headings linked: $headings_linked +Number of headings unlinked: $headings_unlinked +Number of headings fuzzily linked: $headings_fuzzy _SUMMARY_ + $summary .= "\n**** Ran in test mode only ****\n" if $test_only; + print $summary; + + if ($link_report) { + my @keys; + print <<_LINKED_HEADER_; + +Linked headings (from most frequent to least): +------------------------------------------------------- + +_LINKED_HEADER_ + + @keys = sort { + $linked_headings{$b} <=> $linked_headings{$a} or "\L$a" cmp "\L$b" + } keys %linked_headings; + foreach my $key (@keys) { + print "$key:\t" . $linked_headings{$key} . " occurrences\n"; + } + + print <<_UNLINKED_HEADER_; + +Unlinked headings (from most frequent to least): +------------------------------------------------------- + +_UNLINKED_HEADER_ + + @keys = sort { + $unlinked_headings{$b} <=> $unlinked_headings{$a} + or "\L$a" cmp "\L$b" + } keys %unlinked_headings; + foreach my $key (@keys) { + print "$key:\t" . $unlinked_headings{$key} . " occurrences\n"; + } + + print <<_FUZZY_HEADER_; + +Fuzzily-matched headings (from most frequent to least): +------------------------------------------------------- + +_FUZZY_HEADER_ + + @keys = sort { + $fuzzy_headings{$b} <=> $fuzzy_headings{$a} or "\L$a" cmp "\L$b" + } keys %fuzzy_headings; + foreach my $key (@keys) { + print "$key:\t" . $fuzzy_headings{$key} . " occurrences\n"; + } + print $summary; + } } sub process_bib { + my $linker = shift; my $biblionumber = shift; my $bib = GetMarcBiblio($biblionumber); - unless (defined $bib) { - print "\nCould not retrieve bib $biblionumber from the database - record is corrupt.\n"; + unless ( defined $bib ) { + print +"\nCould not retrieve bib $biblionumber from the database - record is corrupt.\n"; $num_bad_bibs++; return; } - my $headings_changed = LinkBibHeadingsToAuthorities($bib); + my ( $headings_changed, $results ) = + LinkBibHeadingsToAuthorities( $linker, $bib, + GetFrameworkCode($biblionumber) ); + foreach my $key ( keys %{ $results->{'unlinked'} } ) { + $unlinked_headings{$key} += $results->{'unlinked'}->{$key}; + } + foreach my $key ( keys %{ $results->{'linked'} } ) { + $linked_headings{$key} += $results->{'linked'}->{$key}; + } + foreach my $key ( keys %{ $results->{'fuzzy'} } ) { + $fuzzy_headings{$key} += $results->{'fuzzy'}->{$key}; + } - if ($headings_changed) { + if ($headings_changed) { if ($verbose) { - my $title = substr($bib->title, 0, 20); - print "Bib $biblionumber ($title): $headings_changed headings changed\n"; + my $title = substr( $bib->title, 0, 20 ); + print +"Bib $biblionumber ($title): $headings_changed headings changed\n"; } - if (not $test_only) { - # delete any item tags - my ($itemtag, $itemsubfield) = GetMarcFromKohaField("items.itemnumber", ''); - foreach my $field ($bib->field($itemtag)) { - $bib->delete_field($field); - } - ModBiblio($bib, $biblionumber, GetFrameworkCode($biblionumber)); + if ( not $test_only ) { + ModBiblio( $bib, $biblionumber, GetFrameworkCode($biblionumber) ); $num_bibs_modified++; } } @@ -103,20 +227,58 @@ sub print_progress_and_commit { print "... processed $recs records\n"; } -sub print_usage { - print <<_USAGE_; -$0: link headings in bib records to authorities. - -This batch job checks each bib record in the Koha -database and attempts to link each of its headings -to the matching authority record. - -Parameters: - --verbose print the number of headings changed - for each bib - --test only test the authority linking - and report the results; do not - change the bib records. - --help or -h show this message. -_USAGE_ -} +=head1 NAME + +link_bibs_to_authorities.pl + +=head1 SYNOPSIS + + link_bibs_to_authorities.pl + link_bibs_to_authorities.pl -v + link_bibs_to_authorities.pl -l + link_bibs_to_authorities.pl --commit=1000 + link_bibs_to_authorities.pl --auth-limit=STRING + link_bibs_to_authorities.pl --bib-limit=STRING + +=head1 DESCRIPTION + +This batch job checks each bib record in the Koha database and attempts to link +each of its headings to the matching authority record. + +=over 8 + +=item B<--help> + +Prints this help + +=item B<-v|--verbose> + +Provide verbose log information (print the number of headings changed for each +bib record). + +=item B<-l|--link-report> + +Provide a report of all the headings that were processed: which were matched, +which were not, etc. + +=item B<--auth-limit=S> + +Only process those headings which match an authority record that matches the +user-specified WHERE clause. + +=item B<--bib-limit=S> + +Only process those bib records that match the user-specified WHERE clause. + +=item B<--commit=N> + +Commit the results to the database after every N records are processed. + +=item B<--test> + +Only test the authority linking and report the results; do not change the bib +records. + +=back + +=cut