=cut
use base qw(Koha::SearchEngine::Elasticsearch);
-use Carp;
use JSON;
-use List::MoreUtils qw/ each_array /;
+use List::MoreUtils qw( each_array );
use Modern::Perl;
-use URI::Escape;
+use URI::Escape qw( uri_escape_utf8 );
use C4::Context;
use Koha::Exceptions;
+use Koha::Caches;
+
+# Alias table: maps zebra-style search index names to the elasticsearch-style
+# field names used by this query builder. Several aliases may point at the
+# same field. 'kw' (keyword) maps to the empty string, i.e. no specific field.
+our %index_field_convert = (
+ 'kw' => '',
+ 'ab' => 'abstract',
+ 'au' => 'author',
+ 'lcn' => 'local-classification',
+ 'callnum' => 'local-classification',
+ 'record-type' => 'rtype',
+ 'mc-rtype' => 'rtype',
+ 'mus' => 'rtype',
+ 'lc-card' => 'lc-card-number',
+ 'sn' => 'local-number',
+ 'biblionumber' => 'local-number',
+ 'yr' => 'date-of-publication',
+ 'pubdate' => 'date-of-publication',
+ 'acqdate' => 'date-of-acquisition',
+ 'date/time-last-modified' => 'date-time-last-modified',
+ 'dtlm' => 'date-time-last-modified',
+ 'diss' => 'dissertation-information',
+ 'nb' => 'isbn',
+ 'ns' => 'issn',
+ 'music-number' => 'identifier-publisher-for-music',
+ 'number-music-publisher' => 'identifier-publisher-for-music',
+ 'music' => 'identifier-publisher-for-music',
+ 'ident' => 'identifier-standard',
+ 'cpn' => 'corporate-name',
+ 'cfn' => 'conference-name',
+ 'pn' => 'personal-name',
+ 'pb' => 'publisher',
+ 'pv' => 'provider',
+ 'nt' => 'note',
+ 'notes' => 'note',
+ 'rcn' => 'record-control-number',
+ 'cni' => 'control-number-identifier',
+ 'su' => 'subject',
+ 'su-to' => 'subject',
+ #'su-geo' => 'subject',
+ 'su-ut' => 'subject',
+ 'ti' => 'title',
+ 'se' => 'title-series',
+ 'ut' => 'title-uniform',
+ 'an' => 'koha-auth-number',
+ 'authority-number' => 'koha-auth-number',
+ 'at' => 'authtype',
+ 'he' => 'heading',
+ 'rank' => 'relevance',
+ 'phr' => 'st-phrase',
+ 'wrdl' => 'st-word-list',
+ 'rt' => 'right-truncation',
+ 'rtrn' => 'right-truncation',
+ 'ltrn' => 'left-truncation',
+ 'rltrn' => 'left-and-right',
+ 'mc-itemtype' => 'itemtype',
+ 'mc-ccode' => 'ccode',
+ 'branch' => 'homebranch',
+ 'mc-loc' => 'location',
+ 'loc' => 'location',
+ 'stocknumber' => 'number-local-acquisition',
+ 'inv' => 'number-local-acquisition',
+ 'bc' => 'barcode',
+ 'mc-itype' => 'itype',
+ 'aub' => 'author-personal-bibliography',
+ 'auo' => 'author-in-order',
+ 'ff8-22' => 'ta',
+ 'aud' => 'ta',
+ 'audience' => 'ta',
+ 'frequency-code' => 'ff8-18',
+ 'illustration-code' => 'ff8-18-21',
+ 'regularity-code' => 'ff8-19',
+ 'type-of-serial' => 'ff8-21',
+ 'format' => 'ff8-23',
+ 'conference-code' => 'ff8-29',
+ 'festschrift-indicator' => 'ff8-30',
+ 'index-indicator' => 'ff8-31',
+ 'fiction' => 'lf',
+ 'fic' => 'lf',
+ 'literature-code' => 'lf',
+ 'biography' => 'bio',
+ 'ff8-34' => 'bio',
+ 'biography-code' => 'bio',
+ 'l-format' => 'ff7-01-02',
+ 'lex' => 'lexile-number',
+ 'hi' => 'host-item-number',
+ 'itu' => 'index-term-uncontrolled',
+ 'itg' => 'index-term-genre',
+);
+# Regex source for a single field name: word characters and hyphens.
+my $field_name_pattern = '[\w\-]+';
+# Regex source for zero or more ".name" suffixes following a field name.
+my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
+
+=head2 get_index_field_convert
+
+ my $index_field_convert = Koha::SearchEngine::Elasticsearch::QueryBuilder->get_index_field_convert();
+
+Returns a reference to the package-level C<%index_field_convert> hash, which
+maps zebra-style search index names (for example C<ti> or C<au>) to their
+elasticsearch-style field names (for example C<title> or C<author>).
+
+It takes no arguments besides the invocant. The returned reference points at
+the live table rather than a copy, so callers should treat it as read-only.
+
+B<TODO> (carried over from the index conversion code): this will pull from the
+elasticsearch mappings table to figure out types.
+
+=cut
+
+# No prototype here: this is called as a method, where prototypes are ignored.
+sub get_index_field_convert {
+    return \%index_field_convert;
+}
=head2 build_query
my $stemming = C4::Context->preference("QueryStemming") || 0;
my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
- my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
$query = '*' unless defined $query;
my $res;
+ my $fields = $self->_search_fields({
+ is_opac => $options{is_opac},
+ weighted_fields => $options{weighted_fields},
+ });
+ if ($options{whole_record}) {
+ push @$fields, 'marc_data_array.*';
+ }
$res->{query} = {
query_string => {
query => $query,
fuzziness => $fuzzy_enabled ? 'auto' : '0',
default_operator => 'AND',
- default_field => '_all',
+ fields => $fields,
lenient => JSON::true,
- fields => $options{fields} || [],
+ analyze_wildcard => JSON::true,
}
};
+ $res->{query}->{query_string}->{type} = 'cross_fields' if C4::Context->preference('ElasticsearchCrossFields');
if ( $options{sort} ) {
foreach my $sort ( @{ $options{sort} } ) {
# See _convert_facets in Search.pm for how these get turned into
# things that Koha can use.
+ my $size = C4::Context->preference('FacetMaxCount');
$res->{aggregations} = {
- author => { terms => { field => "author__facet" } },
- subject => { terms => { field => "subject__facet" } },
- itype => { terms => { field => "itype__facet" } },
- location => { terms => { field => "location__facet" } },
- 'su-geo' => { terms => { field => "su-geo__facet" } },
- 'title-series' => { terms => { field => "title-series__facet" } },
- ccode => { terms => { field => "ccode__facet" } },
+ author => { terms => { field => "author__facet" , size => $size } },
+ subject => { terms => { field => "subject__facet", size => $size } },
+ itype => { terms => { field => "itype__facet", size => $size} },
+ location => { terms => { field => "location__facet", size => $size } },
+ 'su-geo' => { terms => { field => "su-geo__facet", size => $size} },
+ 'title-series' => { terms => { field => "title-series__facet", size => $size } },
+ ccode => { terms => { field => "ccode__facet", size => $size } },
+ ln => { terms => { field => "ln__facet", size => $size } },
};
my $display_library_facets = C4::Context->preference('DisplayLibraryFacets');
if ( $display_library_facets eq 'both'
or $display_library_facets eq 'home' ) {
- $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet" } };
+ $res->{aggregations}{homebranch} = { terms => { field => "homebranch__facet", size => $size } };
}
if ( $display_library_facets eq 'both'
or $display_library_facets eq 'holding' ) {
- $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet" } };
+ $res->{aggregations}{holdingbranch} = { terms => { field => "holdingbranch__facet", size => $size } };
}
- if ( my $ef = $options{expanded_facet} ) {
- $res->{aggregations}{$ef}{terms}{size} = C4::Context->preference('FacetMaxCount');
- };
return $res;
}
-=head2 build_browse_query
-
- my $browse_query = $builder->build_browse_query($field, $query);
-
-This performs a "starts with" style query on a particular field. The field
-to be searched must have been indexed with an appropriate mapping as a
-"phrase" subfield, which pretty much everything has.
-
-=cut
-
-# XXX this isn't really a browse query like we want in the end
-sub build_browse_query {
- my ( $self, $field, $query ) = @_;
-
- my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
-
- return { query => '*' } if !defined $query;
-
- # TODO this should come from Koha::SearchEngine::Elasticsearch
- my %field_whitelist = (
- title => 1,
- author => 1,
- );
- $field = 'title' if !exists $field_whitelist{$field};
- my $sort = $self->_sort_field($field);
- my $res = {
- query => {
- match_phrase_prefix => {
- "$field.phrase" => {
- query => $query,
- operator => 'or',
- fuzziness => $fuzzy_enabled ? 'auto' : '0',
- }
- }
- },
- sort => [ { $sort => { order => "asc" } } ],
- };
-}
-
=head2 build_query_compat
my (
$stopwords_removed, $query_type
)
= $builder->build_query_compat( \@operators, \@operands, \@indexes,
- \@limits, \@sort_by, $scan, $lang );
+ \@limits, \@sort_by, $scan, $lang, $params );
This handles a search using the same api as L<C4::Search::buildQuery> does.
$lang, $params )
= @_;
-#die Dumper ( $self, $operators, $operands, $indexes, $orig_limits, $sort_by, $scan, $lang );
- my @sort_params = $self->_convert_sort_fields(@$sort_by);
- my @index_params = $self->_convert_index_fields(@$indexes);
- my $limits = $self->_fix_limit_special_cases($orig_limits);
- if ( $params->{suppress} ) { push @$limits, "suppress:0"; }
+ my $query;
+ my $query_str = '';
+ my $search_param_query_str = '';
+ my $limits = ();
+ if ( $scan ) {
+ ($query, $query_str) = $self->_build_scan_query( $operands, $indexes );
+ $search_param_query_str = $query_str;
+ } else {
+ my @sort_params = $self->_convert_sort_fields(@$sort_by);
+ my @index_params = $self->_convert_index_fields(@$indexes);
+ $limits = $self->_fix_limit_special_cases($orig_limits);
+ if ( $params->{suppress} ) { push @$limits, "suppress:false"; }
+ # Merge the indexes in with the search terms and the operands so that
+ # each search thing is a handy unit.
+ unshift @$operators, undef; # The first one can't have an op
+ my @search_params;
+ my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
+ my $ea = each_array( @$operands, @$operators, @index_params );
+ while ( my ( $oand, $otor, $index ) = $ea->() ) {
+ next if ( !defined($oand) || $oand eq '' );
+ $oand = $self->clean_search_term($oand);
+ $oand = $self->_truncate_terms($oand) if ($truncate);
+ push @search_params, {
+ operand => $oand, # the search terms
+ operator => defined($otor) ? uc $otor : undef, # AND and so on
+ $index ? %$index : (),
+ };
+ }
- # Merge the indexes in with the search terms and the operands so that
- # each search thing is a handy unit.
- unshift @$operators, undef; # The first one can't have an op
- my @search_params;
- my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
- my $ea = each_array( @$operands, @$operators, @index_params );
- while ( my ( $oand, $otor, $index ) = $ea->() ) {
- next if ( !defined($oand) || $oand eq '' );
- $oand = $self->_clean_search_term($oand);
- $oand = $self->_truncate_terms($oand) if ($truncate);
- push @search_params, {
- operand => $oand, # the search terms
- operator => defined($otor) ? uc $otor : undef, # AND and so on
- $index ? %$index : (),
- };
+ # We build a string query from limits and the queries. An alternative
+ # would be to pass them separately into build_query and let it build
+ # them into a structured ES query itself. Maybe later, though that'd be
+ # more robust.
+ my @search_param_query_array = $self->_create_query_string(@search_params);
+ $search_param_query_str = join( ' ', @search_param_query_array );
+ my $search_param_limit_str =
+ $self->_join_queries( $self->_convert_index_strings(@$limits) );
+ if ( @search_param_query_array > 1 && $search_param_limit_str ) {
+ $search_param_query_str = "($search_param_query_str)";
+ }
+ $query_str = join( ' AND ',
+ $search_param_query_str || (),
+ $search_param_limit_str || () );
+
+ # If there's no query on the left, let's remove the junk left behind
+ $query_str =~ s/^ AND //;
+ my %options;
+ $options{sort} = \@sort_params;
+ $options{is_opac} = $params->{is_opac};
+ $options{weighted_fields} = $params->{weighted_fields};
+ $options{whole_record} = $params->{whole_record};
+ $query = $self->build_query( $query_str, %options );
}
- # We build a string query from limits and the queries. An alternative
- # would be to pass them separately into build_query and let it build
- # them into a structured ES query itself. Maybe later, though that'd be
- # more robust.
- my $query_str = join( ' AND ',
- join( ' ', $self->_create_query_string(@search_params) ) || (),
- $self->_join_queries( $self->_convert_index_strings(@$limits) ) || () );
-
- my @fields = '_all';
- if ( defined($params->{weighted_fields}) && $params->{weighted_fields} ) {
- push @fields, sprintf("%s^%s", $_->name, $_->weight) for Koha::SearchFields->weighted_fields;
+ # We roughly emulate the CGI parameters of the zebra query builder
+ my $query_cgi = '';
+ shift @$operators; # Shift out the one we unshifted before
+ my $ea = each_array( @$operands, @$operators, @$indexes );
+ while ( my ( $oand, $otor, $index ) = $ea->() ) {
+ $query_cgi .= '&' if $query_cgi;
+ $query_cgi .= 'idx=' . uri_escape_utf8( $index // '') . '&q=' . uri_escape_utf8( $oand );
+ $query_cgi .= '&op=' . uri_escape_utf8( $otor ) if $otor;
}
+ $query_cgi .= '&scan=1' if ( $scan );
- # If there's no query on the left, let's remove the junk left behind
- $query_str =~ s/^ AND //;
- my %options;
- $options{fields} = \@fields;
- $options{sort} = \@sort_params;
- $options{expanded_facet} = $params->{expanded_facet};
- my $query = $self->build_query( $query_str, %options );
-
- #die Dumper($query);
- # We roughly emulate the CGI parameters of the zebra query builder
- my $query_cgi;
- $query_cgi = 'q=' . uri_escape_utf8( $operands->[0] ) if @$operands;
my $simple_query;
$simple_query = $operands->[0] if @$operands == 1;
- my $query_desc = $simple_query;
- my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
+ my $query_desc;
+ if ( $simple_query ) {
+ $query_desc = $simple_query;
+ } else {
+ $query_desc = $search_param_query_str;
+ }
+ my $limit = $self->_join_queries( $self->_convert_index_strings(@$limits));
my $limit_cgi = ( $orig_limits and @$orig_limits )
? '&limit=' . join( '&limit=', map { uri_escape_utf8($_) } @$orig_limits )
: '';
my $limit_desc;
$limit_desc = "$limit" if $limit;
+
return (
undef, $query, $simple_query, $query_cgi, $query_desc,
$limit, $limit_cgi, $limit_desc, undef, undef
foreach my $s ( @{ $search->{searches} } ) {
my ( $wh, $op, $val ) = @{$s}{qw(where operator value)};
- $wh = '_all' if $wh eq '';
- if ( $op eq 'is' || $op eq '=' || $op eq 'exact' ) {
-
- # look for something that matches a term completely
- # note, '=' is about numerical vals. May need special handling.
- # Also, we lowercase our search because the ES
- # index lowercases its values, and term searches don't get the
- # search analyzer applied to them.
- push @query_parts, { match_phrase => {"$wh.phrase" => lc $val} };
+ if ( defined $op && ($op eq 'is' || $op eq '=' || $op eq 'exact') ) {
+ if ($wh) {
+ # Match the whole field, case insensitive, UTF normalized.
+ push @query_parts, { term => { "$wh.ci_raw" => $val } };
+ }
+ else {
+ # Match the whole field for all searchable fields, case insensitive,
+ # UTF normalized.
+ # Given that field data is "The quick brown fox"
+ # "The quick brown fox" and "the quick brown fox" will match
+ # but not "quick brown fox".
+ push @query_parts, {
+ multi_match => {
+ query => $val,
+ fields => $self->_search_fields({ subfield => 'ci_raw' }),
+ }
+ };
+ }
}
- elsif ( $op eq 'start' ) {
- # startswith search, uses lowercase untokenized version of heading
- push @query_parts, { match_phrase_prefix => {"$wh.phrase" => lc $val} };
+ elsif ( defined $op && $op eq 'start') {
+ # Match the prefix within a field for all searchable fields.
+ # Given that field data is "The quick brown fox"
+ # "The quick bro" will match, but not "quick bro"
+
+ # There does not seem to be a multi-field prefix query,
+ # so we need to build one ourselves
+ if ($wh) {
+ # Match prefix of the field.
+ push @query_parts, { prefix => {"$wh.ci_raw" => $val} };
+ }
+ else {
+ my @prefix_queries;
+ foreach my $field (@{$self->_search_fields()}) {
+ push @prefix_queries, {
+ prefix => { "$field.ci_raw" => $val }
+ };
+ }
+ push @query_parts, {
+ 'bool' => {
+ 'should' => \@prefix_queries,
+ 'minimum_should_match' => 1
+ }
+ };
+ }
}
else {
- # regular wordlist stuff
+ # Query all searchable fields.
+ # Given that field data is "The quick brown fox"
+ # a search containing any of the words will match, regardless
+ # of order.
+
my @tokens = $self->_split_query( $val );
foreach my $token ( @tokens ) {
$token = $self->_truncate_terms(
- $self->_clean_search_term( $token )
+ $self->clean_search_term( $token )
);
}
my $query = $self->_join_queries( @tokens );
- push @query_parts, { query_string => { default_field => $wh, query => $query } };
+ my $query_string = {
+ query => $query,
+ lenient => JSON::true,
+ analyze_wildcard => JSON::true,
+ };
+ if ($wh) {
+ $query_string->{default_field} = $wh;
+ }
+ else {
+ $query_string->{fields} = $self->_search_fields();
+ }
+ push @query_parts, { query_string => $query_string };
}
}
# Merge the query parts appropriately
# 'should' behaves like 'or'
# 'must' behaves like 'and'
- # Zebra results seem to match must so using that here
- my $query = { query =>
- { bool =>
- { must => \@query_parts }
- }
- };
-
- my %s;
- if ( exists $search->{sort} ) {
- foreach my $k ( keys %{ $search->{sort} } ) {
- my $f = $self->_sort_field($k);
- $s{$f} = $search->{sort}{$k};
- }
- $search->{sort} = \%s;
+ # Zebra behaviour seem to match must so using that here
+ my $elastic_query = {};
+ $elastic_query->{bool}->{must} = \@query_parts;
+
+ # Filter by authtypecode if set
+ if ($search->{authtypecode}) {
+ $elastic_query->{bool}->{filter} = {
+ term => {
+ "authtype.raw" => $search->{authtypecode}
+ }
+ };
}
- # add the sort stuff
- $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
+ my $query = {
+ query => $elastic_query
+ };
+
+ # Add the sort stuff
+ $query->{sort} = [ $search->{sort} ] if exists $search->{sort};
return $query;
}
-
=head2 build_authorities_query_compat
my ($query) =
'match-heading' => 'match-heading',
'see-from' => 'match-heading-see-from',
thesaurus => 'subject-heading-thesaurus',
+ 'thesaurus-conventions' => 'subject-heading-thesaurus-conventions',
any => '',
all => ''
};
+# Maps thesaurus names accepted as search values to single-character codes.
+# build_authorities_query_compat substitutes the code for the name when the
+# search index is 'subject-heading-thesaurus'.
+# NOTE(review): these look like MARC 21 authority 008/11 codes - confirm.
+our $thesaurus_to_value = {
+ lcsh => 'a',
+ lcac => 'b',
+ mesh => 'c',
+ nal => 'd',
+ notspecified => 'n',
+ cash => 'k',
+ rvm => 'v',
+};
+
sub build_authorities_query_compat {
my ( $self, $marclist, $and_or, $excluding, $operator, $value,
$authtypecode, $orderby )
# This turns the old-style many-options argument form into a more
# extensible hash form that is understood by L<build_authorities_query>.
my @searches;
+ my $mappings = $self->get_elasticsearch_mappings();
# Convert to lower case
$marclist = [map(lc, @{$marclist})];
$orderby = lc $orderby;
+ my @indexes;
# Make sure everything exists
foreach my $m (@$marclist) {
- Koha::Exceptions::WrongParameter->throw("Invalid marclist field provided: $m")
- unless exists $koha_to_index_name->{$m};
+
+ $m = exists $koha_to_index_name->{$m} ? $koha_to_index_name->{$m} : $m;
+ push @indexes, $m;
+ warn "Unknown search field $m in marclist" unless (defined $mappings->{properties}->{$m} || $m eq '' || $m eq 'match-heading');
}
for ( my $i = 0 ; $i < @$value ; $i++ ) {
next unless $value->[$i]; #clean empty form values, ES doesn't like undefined searches
+ $value->[$i] = $thesaurus_to_value->{ $value->[$i] }
+ if( defined $thesaurus_to_value->{ $value->[$i] } && $indexes[$i] eq 'subject-heading-thesaurus' );
push @searches,
{
- where => $koha_to_index_name->{$marclist->[$i]},
+ where => $indexes[$i],
operator => $operator->[$i],
value => $value->[$i],
};
my %sort;
my $sort_field =
- ( $orderby =~ /^heading/ ) ? 'heading'
- : ( $orderby =~ /^auth/ ) ? 'local-number'
+ ( $orderby =~ /^heading/ ) ? 'heading__sort'
+ : ( $orderby =~ /^auth/ ) ? 'local-number__sort'
: undef;
if ($sort_field) {
my $sort_order = ( $orderby =~ /asc$/ ) ? 'asc' : 'desc';
return $query;
}
+=head2 _build_scan_query
+
+ my ($query, $query_str) = $builder->_build_scan_query(\@operands, \@indexes)
+
+This will build an aggregation scan query that can be issued to elasticsearch from
+the provided string input.
+
+=cut
+
+# Short index aliases accepted by scan searches, mapped to the field whose
+# "__facet" sub-field is aggregated on.
+our %scan_field_convert = (
+    'ti' => 'title',
+    'au' => 'author',
+    'su' => 'subject',
+    'se' => 'title-series',
+    'pb' => 'publisher',
+);
+
+# Builds a terms aggregation over "<field>__facet", ordered by key and limited
+# to values starting with the (case-insensitively matched) search term.
+# Only the first operand and the first index are used.
+# Returns ($query_hashref, $term), where $term is the first operand as given
+# (or '' when there are no operands).
+sub _build_scan_query {
+    my ( $self, $operands, $indexes ) = @_;
+
+    my $term  = scalar( @$operands ) == 0 ? ''        : $operands->[0];
+    my $index = scalar( @$indexes ) == 0  ? 'subject' : $indexes->[0];
+
+    # Drop any ",modifier" suffix. A missing or blank index (for example a
+    # keyword search) falls back to 'subject', the same default that is used
+    # when no index is supplied at all; otherwise we would aggregate on the
+    # non-existent field "__facet".
+    my ($f) = split( /,/, $index // '' );
+    $f = 'subject' if !defined $f || $f eq '';
+    $index = $scan_field_convert{$f} || $f;
+
+    my $res;
+    $res->{query} = {
+        query_string => {
+            query => '*'
+        }
+    };
+    $res->{aggregations} = {
+        $index => {
+            terms => {
+                field   => $index . '__facet',
+                order   => { '_key' => 'asc' },
+                # Guard against an undefined operand so cleaning it cannot warn.
+                include => $self->_create_regex_filter($self->clean_search_term($term // '')) . '.*'
+            }
+        }
+    };
+    return ($res, $term);
+}
+
+=head2 _create_regex_filter
+
+ my $filter = $builder->_create_regex_filter('term')
+
+This will create a regex filter that can be used with an aggregation query.
+
+=cut
+
+# Turns a literal term into a case-insensitive regex fragment: the term is
+# escaped with quotemeta, then every character that has distinct lower and
+# upper case forms becomes a two-member character class; all other
+# characters (including the escaping backslashes) pass through unchanged.
+sub _create_regex_filter {
+    my ($self, $term) = @_;
+
+    return join '', map {
+        my $lower = lc($_);
+        my $upper = uc($_);
+        $lower eq $upper ? $_ : '[' . $lower . $upper . ']';
+    } split( //, quotemeta($term) );
+}
+
=head2 _convert_sort_fields
my @sort_params = _convert_sort_fields(@sort_by)
my %sort_field_convert = (
acqdate => 'date-of-acquisition',
author => 'author',
- call_number => 'local-classification',
+ call_number => 'cn-sort',
popularity => 'issues',
relevance => undef, # default
title => 'title',
pubdate => 'date-of-publication',
+ biblionumber => 'local-number',
);
my %sort_order_convert =
( qw( desc desc ), qw( dsc desc ), qw( asc asc ), qw( az asc ), qw( za desc ) );
} @sort_by;
}
-=head2 _convert_index_fields
-
- my @index_params = $self->_convert_index_fields(@indexes);
-
-Converts zebra-style search index notation into elasticsearch-style.
-
-C<@indexes> is an array of index names, as presented to L<build_query_compat>,
-and it returns something that can be sent to L<build_query>.
-
-B<TODO>: this will pull from the elasticsearch mappings table to figure out
-types.
-
-=cut
-
-our %index_field_convert = (
- 'kw' => '_all',
- 'ab' => 'abstract',
- 'au' => 'author',
- 'lcn' => 'local-classification',
- 'callnum' => 'local-classification',
- 'record-type' => 'rtype',
- 'mc-rtype' => 'rtype',
- 'mus' => 'rtype',
- 'lc-card' => 'lc-card-number',
- 'sn' => 'local-number',
- 'yr' => 'date-of-publication',
- 'pubdate' => 'date-of-publication',
- 'acqdate' => 'date-of-acquisition',
- 'date/time-last-modified' => 'date-time-last-modified',
- 'dtlm' => 'date/time-last-modified',
- 'diss' => 'dissertation-information',
- 'nb' => 'isbn',
- 'ns' => 'issn',
- 'music-number' => 'identifier-publisher-for-music',
- 'number-music-publisher' => 'identifier-publisher-for-music',
- 'music' => 'identifier-publisher-for-music',
- 'ident' => 'identifier-standard',
- 'cpn' => 'corporate-name',
- 'cfn' => 'conference-name',
- 'pn' => 'personal-name',
- 'pb' => 'publisher',
- 'pv' => 'provider',
- 'nt' => 'note',
- 'notes' => 'note',
- 'rcn' => 'record-control-number',
- 'su' => 'subject',
- 'su-to' => 'subject',
- #'su-geo' => 'subject',
- 'su-ut' => 'subject',
- 'ti' => 'title',
- 'se' => 'title-series',
- 'ut' => 'title-uniform',
- 'an' => 'koha-auth-number',
- 'at' => 'authtype',
- 'he' => 'heading',
- 'rank' => 'relevance',
- 'phr' => 'st-phrase',
- 'wrdl' => 'st-word-list',
- 'rt' => 'right-truncation',
- 'rtrn' => 'right-truncation',
- 'ltrn' => 'left-truncation',
- 'rltrn' => 'left-and-right',
- 'mc-itemtype' => 'itemtype',
- 'mc-ccode' => 'ccode',
- 'branch' => 'homebranch',
- 'mc-loc' => 'location',
- 'stocknumber' => 'number-local-acquisition',
- 'inv' => 'number-local-acquisition',
- 'bc' => 'barcode',
- 'mc-itype' => 'itype',
- 'aub' => 'author-personal-bibliography',
- 'auo' => 'author-in-order',
- 'ff8-22' => 'ta',
- 'aud' => 'ta',
- 'audience' => 'ta',
- 'frequency-code' => 'ff8-18',
- 'illustration-code' => 'ff8-18-21',
- 'regularity-code' => 'ff8-19',
- 'type-of-serial' => 'ff8-21',
- 'format' => 'ff8-23',
- 'conference-code' => 'ff8-29',
- 'festschrift-indicator' => 'ff8-30',
- 'index-indicator' => 'ff8-31',
- 'fiction' => 'lf',
- 'fic' => 'lf',
- 'literature-code' => 'lf',
- 'biography' => 'bio',
- 'ff8-34' => 'bio',
- 'biography-code' => 'bio',
- 'l-format' => 'ff7-01-02',
- 'lex' => 'lexile-number',
- 'hi' => 'host-item-number',
- 'itu' => 'index-term-uncontrolled',
- 'itg' => 'index-term-genre',
-);
-my $field_name_pattern = '[\w\-]+';
-my $multi_field_pattern = "(?:\\.$field_name_pattern)*";
-
sub _convert_index_fields {
my ( $self, @indexes ) = @_;
my %index_type_convert =
- ( __default => undef, phr => 'phrase', rtrn => 'right-truncate' );
+ ( __default => undef, phr => 'phrase', rtrn => 'right-truncate', 'st-year' => 'st-year' );
+
+ @indexes = grep { $_ ne q{} } @indexes; # Remove any blank indexes, i.e. keyword
# Convert according to our table, drop anything that doesn't convert.
# If a field starts with mc- we save it as it's used (and removed) later
# when joining things, to indicate we make it an 'OR' join.
# (Sorry, this got a bit ugly after special cases were found.)
- grep { $_->{field} } map {
+ map {
# Lower case all field names
my ( $f, $t ) = map(lc, split /,/);
my $mc = '';
type => $index_type_convert{ $t // '__default' }
};
$r->{field} = ($mc . $r->{field}) if $mc && $r->{field};
- $r;
+ $r->{field} || $r->{type} ? $r : undef;
} @indexes;
}
push @res, $s;
next;
}
- push @res, $conv->{field} . ":"
- . $self->_modify_string_by_type( %$conv, operand => $term );
+ push @res, ($conv->{field} ? $conv->{field} . ':' : '')
+ . $self->_modify_string_by_type( %$conv, operand => $term );
}
return @res;
}
sub _convert_index_strings_freeform {
my ( $self, $search ) = @_;
- # @TODO: Currenty will alter also fields contained within quotes:
+ # @TODO: Currently will alter also fields contained within quotes:
# `searching for "stuff cn:123"` for example will become
# `searching for "stuff local-number:123"
#
# Lower case field names
$search =~ s/($field_name_pattern)(?:,[\w-]*)?($multi_field_pattern):/\L$1\E$2:/og;
# Resolve possible field aliases
- $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1)."$2:"/oge;
+ $search =~ s/($field_name_pattern)($multi_field_pattern):/(exists $index_field_convert{$1} ? $index_field_convert{$1} : $1).($1 eq 'kw' ? "$2" : "$2:")/oge;
return $search;
}
return $str unless $str; # Empty or undef, we can't use it.
$str .= '*' if $type eq 'right-truncate';
- $str = '"' . $str . '"' if $type eq 'phrase';
+ $str = '"' . $str . '"' if $type eq 'phrase' && $str !~ /^".*"$/;
+ if ($type eq 'st-year') {
+ if ($str =~ /^(.*)-(.*)$/) {
+ my $from = $1 || '*';
+ my $until = $2 || '*';
+ $str = "[$from TO $until]";
+ }
+ }
return $str;
}
map { s/^mc-//r } grep { defined($_) && $_ ne '' && $_ =~ /^mc-/ } @parts;
return () unless @norm_parts + @mc_parts;
return ( @norm_parts, @mc_parts )[0] if @norm_parts + @mc_parts == 1;
- my $grouped_mc =
- @mc_parts ? '(' . ( join ' OR ', map { "($_)" } @mc_parts ) . ')' : ();
-
- # Handy trick: $x || () inside a join means that if $x ends up as an
- # empty string, it gets replaced with (), which makes join ignore it.
- # (bad effect: this'll also happen to '0', this hopefully doesn't matter
- # in this case.)
- join( ' AND ',
- join( ' AND ', map { "($_)" } @norm_parts ) || (),
- $grouped_mc || () );
+
+ # Group limits by field, so they can be OR'ed together
+ my %mc_limits;
+ foreach my $mc_part (@mc_parts) {
+ my ($field, $value) = split /:/, $mc_part, 2;
+ $mc_limits{$field} //= [];
+ push @{ $mc_limits{$field} }, $value;
+ }
+
+ @mc_parts = map {
+ sprintf('%s:(%s)', $_, join (' OR ', @{ $mc_limits{$_} }));
+ } sort keys %mc_limits;
+
+ @norm_parts = map { "($_)" } @norm_parts;
+
+ return join( ' AND ', @norm_parts, @mc_parts);
}
=head2 _make_phrases
my $field = $_->{field} ? $_->{field} . ':' : '';
my $oand = $self->_modify_string_by_type(%$_);
+ $oand = "($oand)" if $field && scalar(split(/\s+/, $oand)) > 1 && (!defined $_->{type} || $_->{type} ne 'st-year');
"$otor($field$oand)";
} @queries;
}
-=head2 _clean_search_term
+=head2 clean_search_term
- my $term = $self->_clean_search_term($term);
+ my $term = $self->clean_search_term($term);
This cleans a search term by removing any funny characters that may upset
ES and give us an error. It also calls L<_convert_index_strings_freeform>
=cut
-sub _clean_search_term {
+sub clean_search_term {
my ( $self, $term ) = @_;
+ # Sanitises a user-supplied search term for use in a query_string query:
+ # resolves index aliases, drops unbalanced quotes, applies the configured
+ # slash escaping, strips stray colons and exclamation marks, and escapes
+ # brackets, while leaving range expressions and /regex/ literals intact.
+ # Returns the cleaned term.
+
+ # Lookahead that only succeeds when an even number of double quotes follows,
+ # i.e. when the current position is outside a quoted phrase
+ my $lookahead = '(?=(?:[^\"]*+\"[^\"]*+\")*+[^\"]*+$)';
+
# Some hardcoded searches (like with authorities) produce things like
# 'an=123', when it ought to be 'an:123' for our purposes.
$term =~ s/=/:/g;
+
$term = $self->_convert_index_strings_freeform($term);
- $term =~ s/[{}]/"/g;
+
+ # Remove unbalanced quotes: tr/// blanks every quote in the copy and returns
+ # how many there were; the blanked copy is only used when that count is odd
+ my $unquoted = $term;
+ my $count = ($unquoted =~ tr/"/ /);
+ if ($count % 2 == 1) {
+ $term = $unquoted;
+ }
+ $term = $self->_query_regex_escape_process($term);
+
+ # because of _truncate_terms and if QueryAutoTruncate enabled
+ # we will have any special operators ruined by _truncate_terms:
+ # for ex. search for "test [6 TO 7]" will be converted to "test* [6* TO* 7]"
+ # so no reason to keep ranges in QueryAutoTruncate==true case:
+ my $truncate = C4::Context->preference("QueryAutoTruncate") || 0;
+ unless($truncate) {
+ # replace all ranges with any square/curly brackets combinations to temporary substitutions (ex: "{a TO b]" -> "~~LC~~a TO b~~RS~~")
+ # (where L is for left and C is for Curly and so on)
+ $term =~ s/
+ (?<!\\)
+ (?<backslashes>(?:[\\]{2})*)
+ (?<leftbracket>\{|\[)
+ (?<ranges>
+ [^\s\[\]\{\}]+\ TO\ [^\s\[\]\{\}]+
+ (?<!\\)
+ (?:[\\]{2})*
+ )
+ (?<rightbracket>\}|\])
+ /$+{backslashes}.'~~L'.($+{leftbracket} eq '[' ? 'S':'C').'~~'.$+{ranges}.'~~R'.($+{rightbracket} eq ']' ? 'S':'C').'~~'/gex;
+ }
+ # save all regex contents away before escaping brackets:
+ # (same trick as with brackets above, just RE for 'RegularExpression')
+ my @saved_regexes;
+ my $rgx_i = 0;
+ while(
+ $term =~ s@(
+ (?<!\\)(?:[\\]{2})*/
+ (?:[^/]+|(?<=\\)(?:[\\]{2})*/)+
+ (?<!\\)(?:[\\]{2})*/
+ )$lookahead@~~RE$rgx_i~~@x
+ ) {
+ # Store as a scalar element (not a one-element slice) at the index that
+ # was just interpolated into the placeholder
+ $saved_regexes[$rgx_i++] = $1;
+ }
+
+ # remove leading and trailing colons mixed with optional slashes and spaces
+ $term =~ s/^([\s\\]*:\s*)+//;
+ $term =~ s/([\s\\]*:\s*)+$//;
+ # remove unquoted colons that have whitespace on either side of them
+ $term =~ s/([\s\\]*:\s*)+(\s+)$lookahead/$2/g;
+ $term =~ s/(\s+)([\s\\]*:\s*)+$lookahead/$1/g;
+ # replace with spaces all repeated colons no matter how they surrounded with spaces and slashes
+ $term =~ s/([\s\\]*:\s*){2,}$lookahead/ /g;
+ # screen all followups for colons after first colon,
+ # and correctly ignore unevenly backslashed:
+ $term =~ s/((?<!\\)(?:[\\]{2})*:[^:\s]+(?<!\\)(?:[\\]{2})*)(?=:)/$1\\/g;
+
+ # screen all exclamation signs that either are the last symbol or have white space after them
+ # or are followed by close parentheses
+ $term =~ s/(?:[\s\\]*!\s*)+(\s|$|\))/$1/g;
+
+ # screen all brackets with backslash
+ $term =~ s/(?<!\\)(?:[\\]{2})*([\{\}\[\]])$lookahead/\\$1/g;
+
+ # restore all regex contents after escaping brackets:
+ for (my $i = 0; $i < @saved_regexes; $i++) {
+ $term =~ s/~~RE$i~~/$saved_regexes[$i]/;
+ }
+ unless($truncate) {
+ # restore temporary weird substitutions back to normal brackets
+ $term =~ s/~~L(C|S)~~([^\s\[\]\{\}]+ TO [^\s\[\]\{\}]+)~~R(C|S)~~/($1 eq 'S' ? '[':'{').$2.($3 eq 'S' ? ']':'}')/ge;
+ }
return $term;
}
+=head2 _query_regex_escape_process
+
+ my $query = $self->_query_regex_escape_process($query);
+
+Processes query in accordance with current "QueryRegexEscapeOptions" system preference setting.
+
+=cut
+
+# Applies the "QueryRegexEscapeOptions" system preference to forward slashes
+# in $query and returns the resulting string. Any value other than 'escape'
+# or 'unescape_escaped' (including 'dont_escape') leaves $query untouched.
+sub _query_regex_escape_process {
+    my ($self, $query) = @_;
+
+    # An unset preference already meant "no processing"; defaulting it to ''
+    # only avoids "uninitialized value" warnings in the comparisons below.
+    my $regex_escape_options = C4::Context->preference("QueryRegexEscapeOptions") // '';
+
+    if ($regex_escape_options eq 'escape') {
+        # Will escape unescaped slashes (/) while preserving
+        # unescaped slashes within quotes
+        # @TODO: assumes quotes are always balanced and will
+        # not handle escaped quotes properly, should perhaps be
+        # replaced with a more general parser solution
+        # so that this function is ever only provided with unquoted
+        # query parts
+        $query =~ s@(?:(?<!\\)((?:[\\]{2})*)(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@\\$1@g;
+    }
+    elsif ($regex_escape_options eq 'unescape_escaped') {
+        # Will unescape escaped slashes (\/) and escape
+        # unescaped slashes (/) while preserving slashes within quotes
+        # The same limitations as above apply for handling of quotes
+        $query =~ s@(?:(?<!\\)(?:((?:[\\]{2})*[\\])|((?:[\\]{2})*))(?=/))(?![^"]*"(?:[^"]*"[^"]*")*[^"]*$)@($1 ? substr($1, 0, -1) : ($2 . "\\"))@ge;
+    }
+    return $query;
+}
+
=head2 _fix_limit_special_cases
my $limits = $self->_fix_limit_special_cases($limits);
foreach my $l (@$limits) {
# This is set up by opac-search.pl
- if ( $l =~ /^yr,st-numeric,ge=/ ) {
+ if ( $l =~ /^yr,st-numeric,ge[=:]/ ) {
my ( $start, $end ) =
- ( $l =~ /^yr,st-numeric,ge=(.*) and yr,st-numeric,le=(.*)$/ );
+ ( $l =~ /^yr,st-numeric,ge[=:](.*) and yr,st-numeric,le[=:](.*)$/ );
next unless defined($start) && defined($end);
- push @new_lim, "copydate:[$start TO $end]";
+ push @new_lim, "date-of-publication:[$start TO $end]";
}
- elsif ( $l =~ /^yr,st-numeric=/ ) {
- my ($date) = ( $l =~ /^yr,st-numeric=(.*)$/ );
+ elsif( $l =~ /^search_filter:/ ){
+ # Here we are going to get the query as a string, clean it, and take care of the part of the limit
+ # Calling build_query_compat here is avoided because we generate more complex query structures
+ my ($filter_id) = ( $l =~ /^search_filter:(.*)$/ );
+ my $search_filter = Koha::SearchFilters->find( $filter_id );
+ next unless $search_filter;
+ my ($expanded_lim,$query_lim) = $search_filter->expand_filter;
+ # In the case of nested filters we need to expand them all
+ foreach my $el ( @{$self->_fix_limit_special_cases($expanded_lim)} ){
+ push @new_lim, $el;
+ }
+ # We need to clean the query part as we have built a string from the original search
+ push @new_lim, $self->clean_search_term( $query_lim );
+ }
+ elsif ( $l =~ /^yr,st-numeric[=:]/ ) {
+ my ($date) = ( $l =~ /^yr,st-numeric[=:](.*)$/ );
next unless defined($date);
- push @new_lim, "copydate:$date";
+ $date = $self->_modify_string_by_type(type => 'st-year', operand => $date);
+ push @new_lim, "date-of-publication:$date";
+ }
+ elsif ( $l =~ 'multibranchlimit|^branch' ) {
+ my $branchfield = C4::Context->preference('SearchLimitLibrary');
+ my @branchcodes;
+ if( $l =~ 'multibranchlimit' ) {
+ my ($group_id) = ( $l =~ /^multibranchlimit:(.*)$/ );
+ my $search_group = Koha::Library::Groups->find( $group_id );
+ @branchcodes = map { $_->branchcode } $search_group->all_libraries;
+ @branchcodes = sort { $a cmp $b } @branchcodes;
+ } else {
+ @branchcodes = ( $l =~ /^branch:(.*)$/ );
+ }
+
+ if (@branchcodes) {
+ # We quote the branchcodes here to prevent issues when codes are reserved words in ES, e.g. OR, AND, NOT, etc.
+ if ( $branchfield eq "homebranch" ) {
+ push @new_lim, sprintf "(%s)", join " OR ", map { 'homebranch: "' . $_ . '"' } @branchcodes;
+ }
+ elsif ( $branchfield eq "holdingbranch" ) {
+ push @new_lim, sprintf "(%s)", join " OR ", map { 'holdingbranch: "' . $_ . '"' } @branchcodes;
+ }
+ else {
+ push @new_lim, sprintf "(%s OR %s)",
+ join( " OR ", map { 'homebranch: "' . $_ . '"' } @branchcodes ),
+ join( " OR ", map { 'holdingbranch: "' . $_ . '"' } @branchcodes );
+ }
+ }
}
elsif ( $l =~ /^available$/ ) {
- push @new_lim, 'onloan:0';
+ push @new_lim, 'available:true';
}
else {
- push @new_lim, $l;
+ my ( $field, $term ) = $l =~ /^\s*([\w,-]*?):(.*)/;
+ $field =~ s/,phr$//; #We are quoting all the limits as phrase, this prevents from quoting again later
+ if ( defined($field) && defined($term) ) {
+ push @new_lim, "$field:(\"$term\")";
+ }
+ else {
+ push @new_lim, $l;
+ }
}
}
return \@new_lim;
my ($self, $f) = @_;
my $mappings = $self->get_elasticsearch_mappings();
- my $textField = defined $mappings->{data}{properties}{$f}{type} && $mappings->{data}{properties}{$f}{type} eq 'text';
+ my $textField = defined $mappings->{properties}{$f}{type} && $mappings->{properties}{$f}{type} eq 'text';
if (!defined $self->sort_fields()->{$f} || $self->sort_fields()->{$f}) {
$f .= '__sort';
- # We need to add '.phrase' to text fields, otherwise it'll sort
- # based on the tokenised form.
- $f .= '.phrase' if $textField;
} else {
# We need to add '.raw' to text fields without a sort field,
# otherwise it'll sort based on the tokenised form.
return @tokens;
}
+=head2 _search_fields
+
+    my $weighted_fields = $self->_search_fields({
+        is_opac => 0,
+        weighted_fields => 1,
+        subfield => 'raw'
+    });
+
+Generate a list of searchable fields to be used for Elasticsearch queries
+applied to multiple fields.
+
+Returns an arrayref of field names for either OPAC or staff interface, with
+possible weights and subfield appended to each field name depending on the
+options provided. A weighted field is rendered as "C<field_name>^C<weight>";
+fields without a weight are returned as the bare field name.
+
+The underlying field list is cached per interface (OPAC / staff client) and
+per index.
+
+=over 4
+
+=item C<$params>
+
+Hashref with options. The parameter C<is_opac> indicates whether the searchable
+fields for OPAC or staff interface should be retrieved. If C<weighted_fields> is set
+fields weights will be applied on returned fields. C<subfield> can be used to
+provide a subfield that will be appended to fields as "C<field_name>.C<subfield>".
+
+=back
+
+=cut
+
+sub _search_fields {
+    my ($self, $params) = @_;
+    $params //= {
+        is_opac => 0,
+        weighted_fields => 0,
+        whole_record => 0,
+        # This is a hack for authorities build_authorities_query
+        # can hopefully be removed in the future
+        subfield => undef,
+    };
+    my $cache = Koha::Caches->get_instance();
+    my $cache_key = 'elasticsearch_search_fields' . ($params->{is_opac} ? '_opac' : '_staff_client') . "_" . $self->index;
+    my $search_fields = $cache->get_from_cache($cache_key, { unsafe => 1 });
+    if (!$search_fields) {
+        # The reason we don't use Koha::SearchFields->search here is we don't
+        # want or need resultset wrapped as Koha::SearchField object.
+        # It does not make any sense in this context and would cause
+        # unnecessary overhead since we are only querying for data
+        # Also would not work, or produce strange results, with the "columns"
+        # option.
+        my $schema = Koha::Database->schema;
+        my $result = $schema->resultset('SearchField')->search(
+            {
+                $params->{is_opac} ? (
+                    'opac' => 1,
+                ) : (
+                    'staff_client' => 1
+                ),
+                'type' => { '!=' => 'boolean' },
+                'search_marc_map.index_name' => $self->index,
+                'search_marc_map.marc_type' => C4::Context->preference('marcflavour'),
+                'search_marc_to_fields.search' => 1,
+            },
+            {
+                columns => [qw/name weight/],
+                collapse => 1,
+                join => {search_marc_to_fields => 'search_marc_map'},
+            }
+        );
+        # Each cached entry is [ name ] or [ name, weight ]; a false weight
+        # (undef or 0) is left out entirely.
+        my @search_fields;
+        while (my $search_field = $result->next) {
+            push @search_fields, [
+                lc $search_field->name,
+                $search_field->weight ? $search_field->weight : ()
+            ];
+        }
+        $search_fields = \@search_fields;
+        $cache->set_in_cache($cache_key, $search_fields);
+    }
+    if ($params->{subfield}) {
+        my $subfield = $params->{subfield};
+        $search_fields = [
+            map {
+                # Copy values to avoid mutating cached
+                # data (since unsafe is used)
+                my ($field, $weight) = @{$_};
+                # Only carry the weight over when the entry has one, so an
+                # unweighted field does not become "field.subfield^" below.
+                [ "${field}.${subfield}", defined $weight ? $weight : () ];
+            } @{$search_fields}
+        ];
+    }
+    if ($params->{weighted_fields}) {
+        # Renders "name^weight", or just "name" for single-element entries
+        return [map { join('^', @{$_}) } @{$search_fields}];
+    }
+    else {
+        # Exclude weight from field
+        return [map { $_->[0] } @{$search_fields}];
+    }
+}
+
1;