diff --git a/Koha/SearchEngine/Elasticsearch/Search.pm b/Koha/SearchEngine/Elasticsearch/Search.pm
index e76ab3ea0d..a5b410e3a6 100644
--- a/Koha/SearchEngine/Elasticsearch/Search.pm
+++ b/Koha/SearchEngine/Elasticsearch/Search.pm
@@ -4,27 +4,29 @@ package Koha::SearchEngine::Elasticsearch::Search;
 #
 # This file is part of Koha.
 #
-# Koha is free software; you can redistribute it and/or modify it under the
-# terms of the GNU General Public License as published by the Free Software
-# Foundation; either version 3 of the License, or (at your option) any later
-# version.
+# Koha is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
 #
-# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
-# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+# Koha is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
 #
-# You should have received a copy of the GNU General Public License along
-# with Koha; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+# You should have received a copy of the GNU General Public License
+# along with Koha; if not, see <http://www.gnu.org/licenses>.
 
 =head1 NAME
 
-Koha::SearchEngine::ElasticSearch::Search - search functions for Elasticsearch
+Koha::SearchEngine::Elasticsearch::Search - search functions for Elasticsearch
 
 =head1 SYNOPSIS
 
-    my $searcher = Koha::SearchEngine::ElasticSearch::Search->new();
-    my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new();
+    my $searcher =
+      Koha::SearchEngine::Elasticsearch::Search->new( { index => $index } );
+    my $builder = Koha::SearchEngine::Elasticsearch::QueryBuilder->new(
+        { index => $index } );
     my $query = $builder->build_query('perl');
     my $results = $searcher->search($query);
     print "There were " . $results->total . " results.\n";
@@ -36,13 +38,22 @@ Koha::SearchEngine::ElasticSearch::Search - search functions for Elasticsearch
 
 =cut
 
-use base qw(Koha::ElasticSearch);
-use Koha::ItemTypes;
+use Modern::Perl;
+use base qw(Koha::SearchEngine::Elasticsearch);
+use C4::Context;
+use C4::AuthoritiesMarc;
+use Koha::ItemTypes;
+use Koha::AuthorisedValues;
+use Koha::SearchEngine::QueryBuilder;
+use Koha::SearchEngine::Search;
+use Koha::Exceptions::Elasticsearch;
+use MARC::Record;
 use Catmandu::Store::ElasticSearch;
-
+use MARC::File::XML;
 use Data::Dumper; #TODO remove
 use Carp qw(cluck);
+use MIME::Base64;
 
 Koha::SearchEngine::Elasticsearch::Search->mk_accessors(qw( store ));
 
@@ -73,22 +84,25 @@ sub search {
     my ($self, $query, $page, $count, %options) = @_;
 
     my $params = $self->get_elasticsearch_params();
-    my %paging;
     # 20 is the default number of results per page
-    $paging{limit} = $count || 20;
-    # ES/Catmandu doesn't want pages, it wants a record to start from.
+    $query->{size} = $count // 20;
+    # ES doesn't want pages, it wants a record to start from.
     if (exists $options{offset}) {
-        $paging{start} = $options{offset};
+        $query->{from} = $options{offset};
     }
     else {
         $page = (!defined($page) || ($page <= 0)) ? 0 : $page - 1;
-        $paging{start} = $page * $paging{limit};
+        $query->{from} = $page * $query->{size};
+    }
+    my $elasticsearch = $self->get_elasticsearch();
+    my $results = eval {
+        $elasticsearch->search(
+            index => $params->{index_name},
+            body => $query
+        );
+    };
+    if ($@) {
+        die $self->process_error($@);
     }
-    $self->store(
-        Catmandu::Store::ElasticSearch->new(
-            %$params, trace_calls => 1,
-        )
-    ) unless $self->store;
-    my $results = $self->store->bag->search( %$query, %paging );
     return $results;
 }
 
@@ -97,7 +111,7 @@ sub search {
     my $count = $searcher->count($query);
 
 This mimics a search request, but just gets the result count instead. That's
-faster than pulling all the data in, ususally.
+faster than pulling all the data in, usually.
 
 =cut
 
@@ -109,8 +123,8 @@ sub count {
         Catmandu::Store::ElasticSearch->new( %$params, trace_calls => 0, )
     ) unless $self->store;
 
-    my $searcher = $self->store->bag->searcher(query => $query);
-    my $count    = $searcher->count();
+    my $search = $self->store->bag->search( %$query );
+    my $count = $search->total() || 0;
     return $count;
 }
 
@@ -118,7 +132,7 @@ sub count {
 
     my ( $error, $results, $facets ) = $search->search_compat(
         $query,            $simple_query, \@sort_by,       \@servers,
-        $results_per_page, $offset,       $expanded_facet, $branches,
+        $results_per_page, $offset,       undef,           $item_types,
        $query_type,       $scan
      )
 
@@ -130,37 +144,44 @@ get ignored here, along with some other things (like C<@servers>.)
 
 sub search_compat {
     my (
-        $self,    $query,            $simple_query, $sort_by,
-        $servers, $results_per_page, $offset,       $expanded_facet,
-        $branches, $query_type,      $scan
+        $self,       $query,            $simple_query, $sort_by,
+        $servers,    $results_per_page, $offset,       $branches,
+        $item_types, $query_type,       $scan
     ) = @_;
 
+    if ( $scan ) {
+        return $self->_aggregation_scan( $query, $results_per_page, $offset );
+    }
+
     my %options;
+    if ( !defined $offset or $offset < 0 ) {
+        $offset = 0;
+    }
     $options{offset} = $offset;
     my $results = $self->search($query, undef, $results_per_page, %options);
 
     # Convert each result into a MARC::Record
-    my (@records, $index);
-    $index = $offset; # opac-search expects results to be put in the
-                      # right place in the array, according to $offset
-    $results->each(sub {
-        # The results come in an array for some reason
-        my $marc_json = @_[0]->{record};
-        my $marc = $self->json2marc($marc_json);
-        $records[$index++] = $marc;
-    });
+    my @records;
+    # opac-search expects results to be put in the
+    # right place in the array, according to $offset
+    my $index = $offset;
+    my $hits = $results->{'hits'};
+    foreach my $es_record (@{$hits->{'hits'}}) {
+        $records[$index++] = $self->decode_record_from_result($es_record->{'_source'});
+    }
+
     # consumers of this expect a name-spaced result, we provide the default
     # configuration.
     my %result;
-    $result{biblioserver}{hits} = $results->total;
+    $result{biblioserver}{hits} = $hits->{'total'};
     $result{biblioserver}{RECORDS} = \@records;
-    return (undef, \%result, $self->_convert_facets($results->{facets}));
+    return (undef, \%result, $self->_convert_facets($results->{aggregations}));
 }
 
 =head2 search_auth_compat
 
     my ( $results, $total ) =
-      $searcher->search_auth_compat( $query, $page, $count, %options );
+      $searcher->search_auth_compat( $query, $offset, $count, $skipmetadata, %options );
 
 This has a similar calling convention to L<search>, however it returns its
@@ -168,32 +189,39 @@ results in a form the same as L<C4::AuthoritiesMarc::SearchAuthorities>.
 
 =cut
 
 sub search_auth_compat {
-    my $self = shift;
+    my ($self, $query, $offset, $count, $skipmetadata, %options) = @_;
 
-    # TODO handle paging
+    if ( !defined $offset or $offset <= 0 ) {
+        $offset = 1;
+    }
+    # Authority search uses a 1-based offset.
+    $options{offset} = $offset - 1;
     my $database = Koha::Database->new();
     my $schema   = $database->schema();
-    my $res      = $self->search(@_);
+    my $res = $self->search($query, undef, $count, %options);
+
+    my $bib_searcher = Koha::SearchEngine::Elasticsearch::Search->new({index => 'biblios'});
     my @records;
-    $res->each(
-        sub {
-            my %result;
-            my $record    = @_[0];
-            my $marc_json = $record->{record};
-
-            # I wonder if these should be real values defined in the mapping
-            # rather than hard-coded conversions.
-            # Our results often come through as nested arrays, to fix this
-            # requires changes in catmandu.
-            my $authid = $record->{ 'local-number' }[0][0];
-            $result{authid} = $authid;
+    my $hits = $res->{'hits'};
+    foreach my $es_record (@{$hits->{'hits'}}) {
+        my $record = $es_record->{'_source'};
+        my %result;
+
+        # I wonder if these should be real values defined in the mapping
+        # rather than hard-coded conversions.
+        # Handle legacy nested arrays indexed with splitting enabled.
+        my $authid = $record->{ 'local-number' }[0];
+        $authid = @$authid[0] if (ref $authid eq 'ARRAY');
+        $result{authid} = $authid;
+
+        if (!defined $skipmetadata || !$skipmetadata) {
             # TODO put all this info into the record at index time so we
             # don't have to go and sort it all out now.
             my $authtypecode = $record->{authtype};
             my $rs           = $schema->resultset('AuthType')
-              ->search( { authtypecode => $authtypecode } );
+                ->search( { authtypecode => $authtypecode } );
 
             # FIXME there's an assumption here that we will get a result.
             # the original code also makes an assumption that some provided
@@ -201,8 +229,8 @@ sub search_auth_compat {
             # with the record. It's not documented why this is the case, so
             # it's not reproduced here yet.
             my $authtype = $rs->single;
-            my $auth_tag_to_report = $authtype->auth_tag_to_report;
-            my $marc = $self->json2marc($marc_json);
+            my $auth_tag_to_report = $authtype ? $authtype->auth_tag_to_report : "";
+            my $marc = $self->decode_record_from_result($record);
             my $mainentry = $marc->field($auth_tag_to_report);
             my $reported_tag;
             if ($mainentry) {
@@ -211,22 +239,18 @@
                 }
             }
             # Turn the resultset into a hash
-            my %authtype_cols;
-            foreach my $col ($authtype->result_source->columns) {
-                $authtype_cols{$col} = $authtype->get_column($col);
-            }
-            $result{authtype} = $authtype->authtypetext;
+            $result{authtype} = $authtype ? $authtype->authtypetext : $authtypecode;
             $result{reported_tag} = $reported_tag;
 
             # Reimplementing BuildSummary is out of scope because it'll be hard
             $result{summary} =
-              C4::AuthoritiesMarc::BuildSummary( $marc, $result{authid},
+              C4::AuthoritiesMarc::BuildSummary( $marc, $result{authid},
                 $authtypecode );
             $result{used} = $self->count_auth_use($bib_searcher, $authid);
-            push @records, \%result;
         }
-    );
-    return ( \@records, $res->total );
+        push @records, \%result;
+    }
+    return ( \@records, $hits->{'total'} );
 }
 
 =head2 count_auth_use
@@ -244,46 +268,159 @@ sub count_auth_use {
 
     my $query = {
         query => {
-            filtered => {
-                query  => { match_all => {} },
-                filter => { term      => { an => $authid } }
+            bool => {
+#                query  => { match_all => {} },
+                filter => { term => { 'koha-auth-number' => $authid } }
             }
         }
     };
     $bib_searcher->count($query);
 }
 
+=head2 simple_search_compat
+
+    my ( $error, $marcresults, $total_hits ) =
+      $searcher->simple_search_compat( $query, $offset, $max_results, %options );
+
+This is a simpler interface to the searching, intended to be similar enough to
+L<C4::Search::SimpleSearch>.
+
+Arguments:
+
+=over 4
 
+=item C<$query>
 
-=head2 json2marc
+A thing to search for. It could be a simple string, or something constructed
+with the appropriate QueryBuilder module.
 
-    my $marc = $self->json2marc($marc_json);
+=item C<$offset>
 
-Converts the form of marc (based on its JSON, but as a Perl structure) that
-Catmandu stores into a MARC::Record object.
+How many results to skip from the start of the results.
+
+=item C<$max_results>
+
+The max number of results to return. The default is 100 (because unlimited
+is a pretty terrible thing to do.)
+
+=item C<%options>
+
+These options are unused by Elasticsearch.
+
+=back
+
+Returns:
+
+=over 4
+
+=item C<$error>
+
+if something went wrong, this'll contain some kind of error
+message.
+
+=item C<$marcresults>
+
+an arrayref of MARC::Records (note that this is different from the
+L<C4::Search::SimpleSearch> version which will return plain XML, but too bad.)
+
+=item C<$total_hits>
+
+the total number of results that this search could have returned.
+
+=back
 
 =cut
 
-sub json2marc {
-    my ( $self, $marcjson ) = @_;
+sub simple_search_compat {
+    my ($self, $query, $offset, $max_results) = @_;
 
-    my $marc = MARC::Record->new();
-    $marc->encoding('UTF-8');
+    return ('No query entered', undef, undef) unless $query;
 
-    # fields are like:
-    # [ '245', '1', '2', 'a' => 'Title', 'b' => 'Subtitle' ]
-    # conveniently, this is the form that MARC::Field->new() likes
-    foreach $field (@$marcjson) {
-        next if @$field < 5;    # Shouldn't be possible, but...
-        if ( $field->[0] eq 'LDR' ) {
-            $marc->leader( $field->[4] );
-        }
-        else {
-            my $marc_field = MARC::Field->new(@$field);
-            $marc->append_fields($marc_field);
-        }
+    my %options;
+    $offset = 0 if not defined $offset or $offset < 0;
+    $options{offset} = $offset;
+    $max_results //= 100;
+
+    unless (ref $query) {
+        # We'll push it through the query builder to sanitise everything.
+        my $qb = Koha::SearchEngine::QueryBuilder->new({index => $self->index});
+        (undef, $query) = $qb->build_query_compat(undef, [$query]);
+    }
+    my $results = $self->search($query, undef, $max_results, %options);
+    my @records;
+    my $hits = $results->{'hits'};
+    foreach my $es_record (@{$hits->{'hits'}}) {
+        push @records, $self->decode_record_from_result($es_record->{'_source'});
     }
-    return $marc;
+    return (undef, \@records, $hits->{'total'});
+}
+
+=head2 extract_biblionumber
+
+    my $biblionumber = $searcher->extract_biblionumber( $searchresult );
+
+$searchresult comes from simple_search_compat.
+
+Returns the biblionumber from the search result record.
+
+=cut
+
+sub extract_biblionumber {
+    my ( $self, $searchresultrecord ) = @_;
+    return Koha::SearchEngine::Search::extract_biblionumber( $searchresultrecord );
+}
+
+=head2 decode_record_from_result
+
+    my $marc_record = $self->decode_record_from_result(@result);
+
+Extracts the MARC data from an Elasticsearch result and decodes it into a
+MARC::Record object.
+
+=cut
+
+sub decode_record_from_result {
+    # The result is passed in as a list and flattened;
+    # its first element becomes $result.
+    my ( $self, $result ) = @_;
+    if ($result->{marc_format} eq 'base64ISO2709') {
+        return MARC::Record->new_from_usmarc(decode_base64($result->{marc_data}));
+    }
+    elsif ($result->{marc_format} eq 'MARCXML') {
+        return MARC::Record->new_from_xml($result->{marc_data}, 'UTF-8', uc C4::Context->preference('marcflavour'));
+    }
+    elsif ($result->{marc_format} eq 'ARRAY') {
+        return $self->_array_to_marc($result->{marc_data_array});
+    }
    else {
+        Koha::Exceptions::Elasticsearch->throw("Missing marc_format field in Elasticsearch result");
+    }
+}
+
+=head2 max_result_window
+
+Returns the maximum number of results that can be fetched.
+
+This directly requests Elasticsearch for the setting index.max_result_window
+(or the default value for this setting if it is not set).
+
+=cut
+
+sub max_result_window {
+    my ($self) = @_;
+
+    $self->store(
+        Catmandu::Store::ElasticSearch->new(%{ $self->get_elasticsearch_params })
+    ) unless $self->store;
+
+    my $index_name = $self->store->index_name;
+    my $settings = $self->store->es->indices->get_settings(
+        index => $index_name,
+        params => { include_defaults => 'true', flat_settings => 'true' },
+    );
+
+    my $max_result_window = $settings->{$index_name}->{settings}->{'index.max_result_window'};
+    $max_result_window //= $settings->{$index_name}->{defaults}->{'index.max_result_window'};
+
+    return $max_result_window;
}
 
 =head2 _convert_facets
@@ -297,40 +434,66 @@ C, C, etc.
 
 =cut
 
 sub _convert_facets {
-    my ( $self, $es ) = @_;
+    my ( $self, $es, $exp_facet ) = @_;
 
-    return undef if !$es;
+    return if !$es;
 
     # These should correspond to the ES field names, as opposed to the CCL
     # things that zebra uses.
-    my %type_to_label = (
-        author   => 'Authors',
-        location => 'Location',
-        itype    => 'ItemTypes',
-        se       => 'Series',
-        subject  => 'Topics',
-        'su-geo' => 'Places',
+    my %type_to_label;
+    my %label = (
+        author         => 'Authors',
+        itype          => 'ItemTypes',
+        location       => 'Location',
+        'su-geo'       => 'Places',
+        'title-series' => 'Series',
+        subject        => 'Topics',
+        ccode          => 'CollectionCodes',
+        holdingbranch  => 'HoldingLibrary',
+        homebranch     => 'HomeLibrary',
+        ln             => 'Language',
     );
+    my @facetable_fields =
+      Koha::SearchEngine::Elasticsearch->get_facetable_fields;
+    for my $f (@facetable_fields) {
+        next unless defined $f->facet_order;
+        $type_to_label{ $f->name } =
+          { order => $f->facet_order, label => $label{ $f->name } };
+    }
 
     # We also have some special cases, e.g. itypes that need to show the
     # value rather than the code.
-    my $itypes = Koha::ItemTypes->new();
-    my %special = ( itype => sub { $itypes->get_description_for_code(@_) }, );
-    my @res;
-    while ( ( $type, $data ) = each %$es ) {
+    my @itypes = Koha::ItemTypes->search;
+    my @libraries = Koha::Libraries->search;
+    my $library_names = { map { $_->branchcode => $_->branchname } @libraries };
+    my @locations = Koha::AuthorisedValues->search( { category => 'LOC' } );
+    my $opac = C4::Context->interface eq 'opac';
+    my %special = (
+        itype         => { map { $_->itemtype => $_->description } @itypes },
+        location      => { map { $_->authorised_value => ( $opac ? ( $_->lib_opac || $_->lib ) : $_->lib ) } @locations },
+        holdingbranch => $library_names,
+        homebranch    => $library_names
+    );
+    my @facets;
+    $exp_facet //= '';
+    while ( my ( $type, $data ) = each %$es ) {
         next if !exists( $type_to_label{$type} );
+
+        # We restrict to the most popular $limit results
+        my $limit = C4::Context->preference('FacetMaxCount');
         my $facet = {
-            type_id    => $type . '_id',
-            expand     => $type,
-            expandable => 1,    # TODO figure how that's supposed to work
-            "type_label_$type_to_label{$type}" => 1,
+            type_id => $type . '_id',
+            "type_label_$type_to_label{$type}{label}" => 1,
             type_link_value => $type,
+            order           => $type_to_label{$type}{order},
        };
-        foreach my $term ( @{ $data->{terms} } ) {
-            my $t = $term->{term};
-            my $c = $term->{count};
+        $limit = @{ $data->{buckets} } if ( $limit > @{ $data->{buckets} } );
+        foreach my $term ( @{ $data->{buckets} }[ 0 .. $limit - 1 ] ) {
+            my $t = $term->{key};
+            my $c = $term->{doc_count};
+            my $label;
             if ( exists( $special{$type} ) ) {
-                $label = $special{$type}->($t);
+                $label = $special{$type}->{$t} // $t;
            }
             else {
                 $label = $t;
@@ -339,15 +502,79 @@ sub _convert_facets {
                 facet_count       => $c,
                 facet_link_value  => $t,
                 facet_title_value => $t . " ($c)",
-                facet_label_value => $label,    # TODO either truncate this,
+                facet_label_value => $label,    # TODO either truncate this,
                      # or make the template do it like it should anyway
                 type_link_value => $type,
             };
         }
-        push @res, $facet if exists $facet->{facets};
+        push @facets, $facet if exists $facet->{facets};
     }
-    return \@res;
+
+    @facets = sort { $a->{order} <=> $b->{order} } @facets;
+    return \@facets;
 }
 
+=head2 _aggregation_scan
+
+    my $result = $self->_aggregation_scan($query, 10, 0);
+
+Perform an aggregation request for scan purposes.
+
+=cut
+
+sub _aggregation_scan {
+    my ($self, $query, $results_per_page, $offset) = @_;
+
+    if (!scalar(keys %{$query->{aggregations}})) {
+        my %result = (
+            biblioserver => {
+                hits => 0,
+                RECORDS => undef
+            }
+        );
+        return (undef, \%result, undef);
+    }
+    my ($field) = keys %{$query->{aggregations}};
+    $query->{aggregations}{$field}{terms}{size} = 1000;
+    my $results = $self->search($query, 1, 0);
+
+    # Convert each result into a MARC::Record
+    my @records;
+    # opac-search expects results to be put in the
+    # right place in the array, according to $offset
+
+    my $count = scalar(@{$results->{aggregations}{$field}{buckets}});
+    for (my $index = $offset; $index - $offset < $results_per_page && $index < $count; $index++) {
+        my $bucket = $results->{aggregations}{$field}{buckets}->[$index];
+        # Scan values are expressed as:
+        # - MARC21: 100a (count) and 245a (term)
+        # - UNIMARC: 200f (count) and 200a (term)
+        my $marc = MARC::Record->new;
+        $marc->encoding('UTF-8');
+        if (C4::Context->preference('marcflavour') eq 'UNIMARC') {
+            $marc->append_fields(
+                MARC::Field->new(200, ' ', ' ', 'f' => $bucket->{doc_count})
+            );
+            $marc->append_fields(
+                MARC::Field->new(200, ' ', ' ', 'a' => $bucket->{key})
+            );
+        } else {
+            $marc->append_fields(
+                MARC::Field->new(100, ' ', ' ', 'a' => $bucket->{doc_count})
+            );
+            $marc->append_fields(
+                MARC::Field->new(245, ' ', ' ', 'a' => $bucket->{key})
+            );
+        }
+        $records[$index] = $marc->as_usmarc();
+    }
+    # consumers of this expect a namespaced result, we provide the default
+    # configuration.
+    my %result;
+    $result{biblioserver}{hits} = $count;
+    $result{biblioserver}{RECORDS} = \@records;
+    return (undef, \%result, undef);
+}
 
 1;
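
For orientation, a minimal usage sketch of the compat interface added above.
The index name and query string are illustrative, not taken from the commit:

    use Koha::SearchEngine::Elasticsearch::Search;

    my $searcher = Koha::SearchEngine::Elasticsearch::Search->new(
        { index => 'biblios' } );

    # Plain strings are pushed through the query builder internally.
    my ( $error, $records, $total ) =
      $searcher->simple_search_compat( 'perl', 0, 20 );
    die $error if $error;

    # Each entry is a MARC::Record decoded via decode_record_from_result().
    print $_->title, "\n" for @$records;
    print "$total hits in total\n";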
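decode_record_from_result() dispatches on the marc_format field of the indexed
document. A sketch of the base64ISO2709 round trip, reusing $searcher from the
previous sketch and a hand-built result hash (the record content is invented):

    use MARC::Record;
    use MARC::Field;
    use MIME::Base64 qw(encode_base64);

    my $marc = MARC::Record->new();
    $marc->append_fields(
        MARC::Field->new( '245', '0', '0', a => 'Example title' ) );

    # Mimic the _source document the searcher reads back from Elasticsearch.
    my $source = {
        marc_format => 'base64ISO2709',
        marc_data   => encode_base64( $marc->as_usmarc() ),
    };
    my $decoded = $searcher->decode_record_from_result($source);
    # $decoded->title now yields 'Example title'.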
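max_result_window() exists because Elasticsearch rejects requests where
from + size exceeds index.max_result_window (10000 by default). A caller could
clamp deep paging roughly like this ($page, $size and $query are illustrative):

    my $size      = 20;
    my $max       = $searcher->max_result_window;    # typically 10000
    my $last_page = int( $max / $size );
    $page = $last_page if $page > $last_page;        # keep from + size in bounds
    my $results = $searcher->search( $query, $page, $size );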
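The shape the internal helper _convert_facets() consumes and produces, sketched
with a made-up terms aggregation (the bucket data is invented; label resolution
and FacetMaxCount behave as in the code above):

    # Input: ES terms aggregations, keyed by facet name.
    my $aggregations = {
        itype => {
            buckets => [
                { key => 'BK', doc_count => 12 },
                { key => 'CD', doc_count => 3 },
            ],
        },
    };
    my $facets = $searcher->_convert_facets($aggregations);
    # Each element carries type_id, order, type_link_value and a 'facets'
    # arrayref of { facet_count, facet_link_value, ... } hashes.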