diff --git a/C4/Matcher.pm b/C4/Matcher.pm
index c0634ec3a3..7322c14417 100644
--- a/C4/Matcher.pm
+++ b/C4/Matcher.pm
@@ -4,31 +4,32 @@ package C4::Matcher;
 #
 # This file is part of Koha.
 #
-# Koha is free software; you can redistribute it and/or modify it under the
-# terms of the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any later
-# version.
+# Koha is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
 #
-# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
-# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+# Koha is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
 #
-# You should have received a copy of the GNU General Public License along
-# with Koha; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+# You should have received a copy of the GNU General Public License
+# along with Koha; if not, see <http://www.gnu.org/licenses>.
 
-use strict;
-use warnings;
+use Modern::Perl;
 
-use C4::Context;
-use MARC::Record;
-use vars qw($VERSION);
-
-BEGIN {
-    # set the version for version checking
-    $VERSION = 3.07.00.049;
-}
+use Koha::SearchEngine;
+use Koha::SearchEngine::Search;
+use Koha::SearchEngine::QueryBuilder;
+use Koha::Util::Normalize qw(
+    ISBN
+    legacy_default
+    lower_case
+    remove_spaces
+    upper_case
+);
 
 =head1 NAME
 
@@ -48,7 +49,7 @@ C4::Matcher - find MARC records matching another one
   $matcher->add_matchpoint('isbn', 1000, [ { tag => '020', subfields => 'a', norms => [] } ]);
 
   $matcher->add_simple_required_check('245', 'a', -1, 0, '', '245', 'a', -1, 0, '');
-  $matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ], 
+  $matcher->add_required_check([ { tag => '245', subfields => 'a', norms => [] } ],
                                [ { tag => '245', subfields => 'a', norms => [] } ]);
 
   my @matches = $matcher->get_matches($marc_record, $max_matches);
@@ -169,7 +170,7 @@ sub fetch {
     $sth->execute($id);
     my $row = $sth->fetchrow_hashref;
     $sth->finish();
-    return undef unless defined $row;
+    return unless defined $row;
 
     my $self = {};
     $self->{'id'} = $row->{'matcher_id'};
@@ -335,19 +336,19 @@ sub _store_matchpoint {
     my $matcher_id = $self->{'id'};
     $sth = $dbh->prepare_cached("INSERT INTO matchpoints (matcher_id, search_index, score) VALUES (?, ?, ?)");
-    $sth->execute($matcher_id, $matchpoint->{'index'}, $matchpoint->{'score'});
+    $sth->execute($matcher_id, $matchpoint->{'index'}, $matchpoint->{'score'}||0);
     my $matchpoint_id = $dbh->{'mysql_insertid'};
     my $seqnum = 0;
     foreach my $component (@{ $matchpoint->{'components'} }) {
         $seqnum++;
         $sth = $dbh->prepare_cached("INSERT INTO matchpoint_components
-                                     (matchpoint_id, sequence, tag, subfields, offset, length)
+                                     (matchpoint_id, sequence, tag, subfields, `offset`, length)
                                      VALUES (?, ?, ?, ?, ?, ?)");
         $sth->bind_param(1, $matchpoint_id);
         $sth->bind_param(2, $seqnum);
         $sth->bind_param(3, $component->{'tag'});
         $sth->bind_param(4, join "", sort keys %{ $component->{'subfields'} });
-        $sth->bind_param(5, $component->{'offset'});
+        $sth->bind_param(5, $component->{'offset'}||0);
         $sth->bind_param(6, $component->{'length'});
         $sth->execute();
         my $matchpoint_component_id = $dbh->{'mysql_insertid'};
@@ -624,23 +625,57 @@ sub get_matches {
     my $self = shift;
     my ($source_record, $max_matches) = @_;
-    my %matches = ();
+    my $matches = {};
+
+    foreach my $matchpoint ( @{ $self->{'matchpoints'} } ) {
+        my @source_keys = _get_match_keys( $source_record, $matchpoint );
 
-    foreach my $matchpoint (@{ $self->{'matchpoints'} }) {
-        my @source_keys = _get_match_keys($source_record, $matchpoint);
         next if scalar(@source_keys) == 0;
+
+        @source_keys = C4::Koha::GetVariationsOfISBNs(@source_keys)
+          if ( $matchpoint->{index} =~ /^isbn$/i
+            && C4::Context->preference('AggressiveMatchOnISBN') );
+
+        @source_keys = C4::Koha::GetVariationsOfISSNs(@source_keys)
+          if ( $matchpoint->{index} =~ /^issn$/i
+            && C4::Context->preference('AggressiveMatchOnISSN') );
+
         # build query
         my $query;
         my $error;
         my $searchresults;
         my $total_hits;
-        if ($self->{'record_type'} eq 'biblio') {
-            $query = join(" or ", map { "$matchpoint->{'index'}=$_" } @source_keys);
-# FIXME only searching biblio index at the moment
-            require C4::Search;
-            ($error, $searchresults, $total_hits) = C4::Search::SimpleSearch($query, 0, $max_matches);
-        } elsif ($self->{'record_type'} eq 'authority') {
-            my $authresults;
+        if ( $self->{'record_type'} eq 'biblio' ) {
+
+            my $phr = ( C4::Context->preference('AggressiveMatchOnISBN') || C4::Context->preference('AggressiveMatchOnISSN') ) ? ',phr' : q{};
+            $query = join( " OR ",
+                map { "$matchpoint->{'index'}$phr=\"$_\"" } @source_keys );
+            #NOTE: double-quote the values so you don't get an "Embedded truncation not supported" error when a term has a ? in it.
+
+            # Use state variables to avoid recreating the objects every time.
+            # With Elasticsearch this also avoids creating a massive amount of
+            # ES connectors that would eventually run out of file descriptors.
+            state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::BIBLIOS_INDEX});
+            ( $error, $searchresults, $total_hits ) =
+              $searcher->simple_search_compat( $query, 0, $max_matches, undef, skip_normalize => 1 );
+
+            if ( defined $error ) {
+                warn "search failed ($query) $error";
+            }
+            else {
+                foreach my $matched ( @{$searchresults} ) {
+                    my $target_record = C4::Search::new_record_from_zebra( 'biblioserver', $matched );
+                    my ( $biblionumber_tag, $biblionumber_subfield ) = C4::Biblio::GetMarcFromKohaField( "biblio.biblionumber" );
+                    my $id = ( $biblionumber_tag > 10 ) ?
+                        $target_record->field($biblionumber_tag)->subfield($biblionumber_subfield) :
+                        $target_record->field($biblionumber_tag)->data();
+                    $matches->{$id}->{score} += $matchpoint->{score};
+                    $matches->{$id}->{record} = $target_record;
+                }
+            }
+
+        }
+        elsif ( $self->{'record_type'} eq 'authority' ) {
             my @marclist;
             my @and_or;
             my @excluding = [];
@@ -648,58 +683,67 @@ sub get_matches {
             my @value;
             foreach my $key (@source_keys) {
                 push @marclist, $matchpoint->{'index'};
-                push @and_or, 'or';
+                push @and_or,    'or';
                 push @operator, 'exact';
-                push @value, $key;
+                push @value,    $key;
             }
-            require C4::AuthoritiesMarc;
-            ($authresults, $total_hits) = C4::AuthoritiesMarc::SearchAuthorities(
-                \@marclist, \@and_or, \@excluding, \@operator,
-                \@value, 0, 20, undef, 'AuthidAsc', 1
+            # Use state variables to avoid recreating the objects every time.
+            # With Elasticsearch this also avoids creating a massive amount of
+            # ES connectors that would eventually run out of file descriptors.
+            state $builder  = Koha::SearchEngine::QueryBuilder->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
+            state $searcher = Koha::SearchEngine::Search->new({index => $Koha::SearchEngine::AUTHORITIES_INDEX});
+            my $search_query = $builder->build_authorities_query_compat(
+                \@marclist, \@and_or, \@excluding, \@operator,
+                \@value, undef, 'AuthidAsc'
             );
-            foreach my $result (@$authresults) {
-                push @$searchresults, $result->{'authid'};
-            }
-        }
+            my ( $authresults, $total ) = $searcher->search_auth_compat( $search_query, 0, 20 );
 
-        if (defined $error ) {
-            warn "search failed ($query) $error";
-        } else {
-            foreach my $matched (@{$searchresults}) {
-                $matches{$matched} += $matchpoint->{'score'};
+            foreach my $result (@$authresults) {
+                my $id = $result->{authid};
+                $matches->{$id}->{score} += $matchpoint->{'score'};
+                $matches->{$id}->{record} = $id;
             }
         }
     }
 
     # get rid of any that don't meet the threshold
-    %matches = map { ($matches{$_} >= $self->{'threshold'}) ? ($_ => $matches{$_}) : () } keys %matches;
-
-    # get rid of any that don't meet the required checks
-    %matches = map { _passes_required_checks($source_record, $_, $self->{'required_checks'}) ? ($_ => $matches{$_}) : () }
-               keys %matches unless ($self->{'record_type'} eq 'auth');
+    $matches = { map { ($matches->{$_}->{score} >= $self->{'threshold'}) ? ($_ => $matches->{$_}) : () } keys %$matches };
 
     my @results = ();
     if ($self->{'record_type'} eq 'biblio') {
         require C4::Biblio;
-        foreach my $marcblob (keys %matches) {
-            my $target_record = MARC::Record->new_from_usmarc($marcblob);
-            my $record_number;
-            my $result = C4::Biblio::TransformMarcToKoha(C4::Context->dbh, $target_record, '');
-            $record_number = $result->{'biblionumber'};
-            push @results, { 'record_id' => $record_number, 'score' => $matches{$marcblob} };
+        # get rid of any that don't meet the required checks
+        $matches = {
+            map {
+                _passes_required_checks( $source_record, $matches->{$_}->{'record'}, $self->{'required_checks'} )
+                  ? ( $_ => $matches->{$_} )
+                  : ()
+            } keys %$matches
+        };
+
+        foreach my $id ( keys %$matches ) {
+            push @results, {
+                record_id => $id,
+                score     => $matches->{$id}->{score}
+            };
         }
     } elsif ($self->{'record_type'} eq 'authority') {
         require C4::AuthoritiesMarc;
-        foreach my $authid (keys %matches) {
-            push @results, { 'record_id' => $authid, 'score' => $matches{$authid} };
+        foreach my $id (keys %$matches) {
+            push @results, {
+                record_id => $id,
+                score     => $matches->{$id}->{score}
+            };
         }
     }
-    @results = sort { $b->{'score'} cmp $a->{'score'} } @results;
+    @results = sort {
+        $b->{'score'} cmp $a->{'score'} or
+        $b->{'record_id'} cmp $a->{'record_id'}
+    } @results;
     if (scalar(@results) > $max_matches) {
         @results = @results[0..$max_matches-1];
     }
     return @results;
-
 }
 
 =head2 dump
@@ -735,8 +779,7 @@ sub dump {
 }
 
 sub _passes_required_checks {
-    my ($source_record, $target_blob, $matchchecks) = @_;
-    my $target_record = MARC::Record->new_from_usmarc($target_blob); # FIXME -- need to avoid parsing record twice
+    my ($source_record, $target_record, $matchchecks) = @_;
 
     # no checks supplied == automatic pass
     return 1 if $#{ $matchchecks } == -1;
@@ -750,6 +793,7 @@ sub _passes_required_checks {
 }
 
 sub _get_match_keys {
+
     my $source_record = shift;
     my $matchpoint = shift;
     my $check_only_first_repeat = @_ ? shift : 0;
@@ -768,33 +812,72 @@ sub _get_match_keys {
     # If there are two 003s and two 001s, there will be two keys:
     #   first 003 + first 001
     #   second 003 + second 001
-    
+
     my @keys = ();
     for (my $i = 0; $i <= $#{ $matchpoint->{'components'} }; $i++) {
         my $component = $matchpoint->{'components'}->[$i];
         my $j = -1;
-        FIELD: foreach my $field ($source_record->field($component->{'tag'})) {
+
+        my @fields = ();
+        my $tag = $component->{'tag'};
+        if ($tag && $tag eq 'LDR'){
+            $fields[0] = $source_record->leader();
+        }
+        else {
+            @fields = $source_record->field($tag);
+        }
+
+        FIELD: foreach my $field (@fields) {
             $j++;
             last FIELD if $j > 0 and $check_only_first_repeat;
             last FIELD if $i > 0 and $j > $#keys;
-            my $key = "";
-            my $string;
-            if ($field->is_control_field()) {
-                $string=$field->data();
+
+            my $string;
+            if ( ! ref $field ){
+                $string = "$field";
+            }
+            elsif ( $field->is_control_field() ) {
+                $string = $field->data();
+            }
+            elsif ( defined $component->{subfields} && keys %{$component->{subfields}} ){
+                $string = $field->as_string(
+                    join('', keys %{ $component->{ subfields } }), ' ' # ' ' as separator
+                );
             } else {
-                foreach my $subfield ($field->subfields()) {
-                    if (exists $component->{'subfields'}->{$subfield->[0]}) {
-                        $string .= " " . $subfield->[1];
-                    }
-                }
-            }
+                $string = $field->as_string();
+            }
+
             if ($component->{'length'}>0) {
-                $string= substr($string, $component->{'offset'}, $component->{'length'});
-                # FIXME normalize, substr
+                $string= substr($string, $component->{'offset'}, $component->{'length'});
             } elsif ($component->{'offset'}) {
-                $string= substr($string, $component->{'offset'});
+                $string= substr($string, $component->{'offset'});
             }
-            $key = _normalize($string);
+
+            my $norms = $component->{'norms'};
+            my $key = $string;
+
+            foreach my $norm ( @{ $norms } ) {
+                if ( grep { $norm eq $_ } valid_normalization_routines() ) {
+                    if ( $norm eq 'remove_spaces' ) {
+                        $key = remove_spaces($key);
+                    }
+                    elsif ( $norm eq 'upper_case' ) {
+                        $key = upper_case($key);
+                    }
+                    elsif ( $norm eq 'lower_case' ) {
+                        $key = lower_case($key);
+                    }
+                    elsif ( $norm eq 'legacy_default' ) {
+                        $key = legacy_default($key);
                    }
+                    elsif ( $norm eq 'ISBN' ) {
+                        $key = ISBN($key);
+                    }
+                } else {
+                    warn "Invalid normalization routine required ($norm)"
+                      unless $norm eq 'none';
+                }
+            }
+
             if ($i == 0) {
                 push @keys, $key if $key;
             } else {
@@ -819,16 +902,15 @@ sub _parse_match_component {
 
     return $component;
 }
 
-# FIXME - default normalizer
-sub _normalize {
-    my $value = uc shift;
-    $value =~ s/[.;:,\]\[\)\(\/'"]//g;
-    $value =~ s/^\s+//;
-    #$value =~ s/^\s+$//;
-    $value =~ s/\s+$//;
-    $value =~ s/\s+/ /g;
-    #$value =~ s/[.;,\]\[\)\(\/"']//g;
-    return $value;
+sub valid_normalization_routines {
+
+    return (
+        'remove_spaces',
+        'upper_case',
+        'lower_case',
+        'legacy_default',
+        'ISBN'
+    );
 }
 
 1;
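
For reference, the post-patch API can be exercised as in the following minimal sketch. It is an illustration, not part of the commit: it assumes a configured Koha instance (database plus a search backend reachable through Koha::SearchEngine), and the weights, ISBN, and title are placeholders modeled on the SYNOPSIS above; the 'ISBN' norm is one of the routines returned by valid_normalization_routines().

    use Modern::Perl;
    use MARC::Record;
    use MARC::Field;
    use C4::Matcher;

    # Build a matcher in memory instead of loading a stored one with
    # C4::Matcher->fetch($id): record type 'biblio', match threshold 1000.
    my $matcher = C4::Matcher->new('biblio', 1000);

    # Contribute 1000 points when the normalized 020$a values agree.
    $matcher->add_matchpoint('isbn', 1000,
        [ { tag => '020', subfields => 'a', norms => ['ISBN'] } ]);

    # Also require that 245$a agree before a candidate may match.
    $matcher->add_simple_required_check('245', 'a', -1, 0, '',
                                        '245', 'a', -1, 0, '');

    # Illustrative source record; any MARC::Record object works here.
    my $record = MARC::Record->new();
    $record->append_fields(
        MARC::Field->new('020', ' ', ' ', a => '978-0-596-00027-1'),
        MARC::Field->new('245', '0', '0', a => 'Programming Perl'),
    );

    # Search the catalogue, keep candidates whose summed matchpoint scores
    # reach the threshold and pass the required checks, and return at most
    # ten results sorted by score and then record_id.
    my @matches = $matcher->get_matches($record, 10);
    say "$_->{record_id} scored $_->{score}" for @matches;

Scores accumulate per candidate across all matchpoints, so several lower-weight matchpoints can jointly clear the threshold; as get_matches() shows, the required checks are applied to bibliographic records only.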