X-Git-Url: http://koha-dev.rot13.org:8081/gitweb/?a=blobdiff_plain;f=Koha%2FSearchEngine%2FElasticsearch.pm;h=579588a74fef25ad4c3d52adaf2e9bba4b676ea0;hb=7d8b96803f664d86762a6afb966051f7d565c40e;hp=d904bcf710502ce371859b9a6409b39e2d480ef6;hpb=ec0ea67a43455b94a9d8bd3cf4ba8f68d1678e4a;p=srvgit diff --git a/Koha/SearchEngine/Elasticsearch.pm b/Koha/SearchEngine/Elasticsearch.pm index d904bcf710..579588a74f 100644 --- a/Koha/SearchEngine/Elasticsearch.pm +++ b/Koha/SearchEngine/Elasticsearch.pm @@ -4,18 +4,18 @@ package Koha::SearchEngine::Elasticsearch; # # This file is part of Koha. # -# Koha is free software; you can redistribute it and/or modify it under the -# terms of the GNU General Public License as published by the Free Software -# Foundation; either version 3 of the License, or (at your option) any later -# version. +# Koha is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. # -# Koha is distributed in the hope that it will be useful, but WITHOUT ANY -# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR -# A PARTICULAR PURPOSE. See the GNU General Public License for more details. +# Koha is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. # -# You should have received a copy of the GNU General Public License along -# with Koha; if not, write to the Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# You should have received a copy of the GNU General Public License +# along with Koha; if not, see . use base qw(Class::Accessor); @@ -23,10 +23,13 @@ use C4::Context; use Koha::Database; use Koha::Exceptions::Config; +use Koha::Exceptions::Elasticsearch; use Koha::SearchFields; use Koha::SearchMarcMaps; +use C4::Heading; use Carp; +use Clone qw(clone); use JSON; use Modern::Perl; use Readonly; @@ -34,7 +37,11 @@ use Search::Elasticsearch; use Try::Tiny; use YAML::Syck; -use Data::Dumper; # TODO remove +use List::Util qw( sum0 reduce ); +use MARC::File::XML; +use MIME::Base64; +use Encode qw(encode); +use Business::ISBN; __PACKAGE__->mk_ro_accessors(qw( index )); __PACKAGE__->mk_accessors(qw( sort_fields )); @@ -65,10 +72,28 @@ sub new { my $class = shift @_; my $self = $class->SUPER::new(@_); # Check for a valid index - croak('No index name provided') unless $self->index; + Koha::Exceptions::MissingParameter->throw('No index name provided') unless $self->index; return $self; } +=head2 get_elasticsearch + + my $elasticsearch_client = $self->get_elasticsearch(); + +Returns a C client. The client is cached on a C +instance level and will be reused if method is called multiple times. + +=cut + +sub get_elasticsearch { + my $self = shift @_; + unless (defined $self->{elasticsearch}) { + my $conf = $self->get_elasticsearch_params(); + $self->{elasticsearch} = Search::Elasticsearch->new($conf); + } + return $self->{elasticsearch}; +} + =head2 get_elasticsearch_params my $params = $self->get_elasticsearch_params(); @@ -114,12 +139,16 @@ sub get_elasticsearch_params { else { die "No elasticsearch servers were specified in koha-conf.xml.\n"; } - die "No elasticserver index_name was specified in koha-conf.xml.\n" + die "No elasticsearch index_name was specified in koha-conf.xml.\n" if ( !$es->{index_name} ); # Append the name of this particular index to our namespace $es->{index_name} .= '_' . $self->index; $es->{key_prefix} = 'es_'; + $es->{client} //= '5_0::Direct'; + $es->{cxn_pool} //= 'Static'; + $es->{request_timeout} //= 60; + return $es; } @@ -127,8 +156,8 @@ sub get_elasticsearch_params { my $settings = $self->get_elasticsearch_settings(); -This provides the settings provided to elasticsearch when an index is created. -These can do things like define tokenisation methods. +This provides the settings provided to Elasticsearch when an index is created. +These can do things like define tokenization methods. A hashref containing the settings is returned. @@ -137,24 +166,14 @@ A hashref containing the settings is returned. sub get_elasticsearch_settings { my ($self) = @_; - # Ultimately this should come from a file or something, and not be - # hardcoded. - my $settings = { - index => { - analysis => { - analyzer => { - analyser_phrase => { - tokenizer => 'icu_tokenizer', - filter => ['icu_folding'], - }, - analyser_standard => { - tokenizer => 'icu_tokenizer', - filter => ['icu_folding'], - }, - }, - } - } - }; + # Use state to speed up repeated calls + state $settings = undef; + if (!defined $settings) { + my $config_file = C4::Context->config('elasticsearch_index_config'); + $config_file ||= C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/index_config.yaml'; + $settings = LoadFile( $config_file ); + } + return $settings; } @@ -162,7 +181,7 @@ sub get_elasticsearch_settings { my $mappings = $self->get_elasticsearch_mappings(); -This provides the mappings that get passed to elasticsearch when an index is +This provides the mappings that get passed to Elasticsearch when an index is created. =cut @@ -170,127 +189,143 @@ created. sub get_elasticsearch_mappings { my ($self) = @_; - # TODO cache in the object? - my $mappings = { - data => { - _all => {type => "string", analyzer => "analyser_standard"}, - properties => { - record => { - store => "true", - include_in_all => JSON::false, - type => "text", - }, - } - } - }; - my %sort_fields; - my $marcflavour = lc C4::Context->preference('marcflavour'); - $self->_foreach_mapping( - sub { - my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_; - return if $marc_type ne $marcflavour; - # TODO if this gets any sort of complexity to it, it should - # be broken out into its own function. - - # TODO be aware of date formats, but this requires pre-parsing - # as ES will simply reject anything with an invalid date. - my $es_type = - $type eq 'boolean' - ? 'boolean' - : 'text'; - - if ($es_type eq 'boolean') { - $mappings->{data}{properties}{$name} = _elasticsearch_mapping_for_boolean( $name, $es_type, $facet, $suggestible, $sort, $marc_type ); - return; #Boolean cannot have facets nor sorting nor suggestions - } else { - $mappings->{data}{properties}{$name} = _elasticsearch_mapping_for_default( $name, $es_type, $facet, $suggestible, $sort, $marc_type ); - } + # Use state to speed up repeated calls + state %all_mappings; + state %sort_fields; + + if (!defined $all_mappings{$self->index}) { + $sort_fields{$self->index} = {}; + # Clone the general mapping to break ties with the original hash + my $mappings = { + data => clone(_get_elasticsearch_field_config('general', '')) + }; + my $marcflavour = lc C4::Context->preference('marcflavour'); + $self->_foreach_mapping( + sub { + my ( $name, $type, $facet, $suggestible, $sort, $search, $marc_type ) = @_; + return if $marc_type ne $marcflavour; + # TODO if this gets any sort of complexity to it, it should + # be broken out into its own function. + + # TODO be aware of date formats, but this requires pre-parsing + # as ES will simply reject anything with an invalid date. + my $es_type = 'text'; + if ($type eq 'boolean') { + $es_type = 'boolean'; + } elsif ($type eq 'number' || $type eq 'sum') { + $es_type = 'integer'; + } elsif ($type eq 'isbn' || $type eq 'stdno') { + $es_type = 'stdno'; + } - if ($facet) { - $mappings->{data}{properties}{ $name . '__facet' } = { - type => "keyword", - }; - } - if ($suggestible) { - $mappings->{data}{properties}{ $name . '__suggestion' } = { - type => 'completion', - analyzer => 'simple', - search_analyzer => 'simple', - }; - } - # Sort is a bit special as it can be true, false, undef. - # We care about "true" or "undef", - # "undef" means to do the default thing, which is make it sortable. - if ($sort || !defined $sort) { - $mappings->{data}{properties}{ $name . '__sort' } = { - search_analyzer => "analyser_phrase", - analyzer => "analyser_phrase", - type => "text", - include_in_all => JSON::false, - fields => { - phrase => { - type => "keyword", - }, - }, - }; - $sort_fields{$name} = 1; + if ($search) { + $mappings->{data}{properties}{$name} = _get_elasticsearch_field_config('search', $es_type); + } + + if ($facet) { + $mappings->{data}{properties}{ $name . '__facet' } = _get_elasticsearch_field_config('facet', $es_type); + } + if ($suggestible) { + $mappings->{data}{properties}{ $name . '__suggestion' } = _get_elasticsearch_field_config('suggestible', $es_type); + } + # Sort is a bit special as it can be true, false, undef. + # We care about "true" or "undef", + # "undef" means to do the default thing, which is make it sortable. + if (!defined $sort || $sort) { + $mappings->{data}{properties}{ $name . '__sort' } = _get_elasticsearch_field_config('sort', $es_type); + $sort_fields{$self->index}{$name} = 1; + } } - } - ); - $self->sort_fields(\%sort_fields); - return $mappings; + ); + $all_mappings{$self->index} = $mappings; + } + $self->sort_fields(\%{$sort_fields{$self->index}}); + + return $all_mappings{$self->index}; } -=head2 _elasticsearch_mapping_for_* +=head2 _get_elasticsearch_field_config -Get the ES mappings for the given data type or a special mapping case +Get the Elasticsearch field config for the given purpose and data type. -Receives the same parameters from the $self->_foreach_mapping() dispatcher +$mapping = _get_elasticsearch_field_config('search', 'text'); =cut -sub _elasticsearch_mapping_for_boolean { - my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_; +sub _get_elasticsearch_field_config { - return { - type => $type, - null_value => 0, - }; + my ( $purpose, $type ) = @_; + + # Use state to speed up repeated calls + state $settings = undef; + if (!defined $settings) { + my $config_file = C4::Context->config('elasticsearch_field_config'); + $config_file ||= C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/field_config.yaml'; + $settings = LoadFile( $config_file ); + } + + if (!defined $settings->{$purpose}) { + die "Field purpose $purpose not defined in field config"; + } + if ($type eq '') { + return $settings->{$purpose}; + } + if (defined $settings->{$purpose}{$type}) { + return $settings->{$purpose}{$type}; + } + if (defined $settings->{$purpose}{'default'}) { + return $settings->{$purpose}{'default'}; + } + return; } -sub _elasticsearch_mapping_for_default { - my ( $name, $type, $facet, $suggestible, $sort, $marc_type ) = @_; - - return { - search_analyzer => "analyser_standard", - analyzer => "analyser_standard", - type => $type, - fields => { - phrase => { - search_analyzer => "analyser_phrase", - analyzer => "analyser_phrase", - type => "text", - }, - raw => { - type => "keyword", - } - }, - }; +=head2 _load_elasticsearch_mappings + +Load Elasticsearch mappings in the format of mappings.yaml. + +$indexes = _load_elasticsearch_mappings(); + +=cut + +sub _load_elasticsearch_mappings { + my $mappings_yaml = C4::Context->config('elasticsearch_index_mappings'); + $mappings_yaml ||= C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/mappings.yaml'; + return LoadFile( $mappings_yaml ); } sub reset_elasticsearch_mappings { - my $mappings_yaml = C4::Context->config('intranetdir') . '/admin/searchengine/elasticsearch/mappings.yaml'; - my $indexes = LoadFile( $mappings_yaml ); + my ( $self ) = @_; + my $indexes = $self->_load_elasticsearch_mappings(); + + Koha::SearchMarcMaps->delete; + Koha::SearchFields->delete; while ( my ( $index_name, $fields ) = each %$indexes ) { while ( my ( $field_name, $data ) = each %$fields ) { - my $field_type = $data->{type}; - my $field_label = $data->{label}; + + my %sf_params = map { $_ => $data->{$_} } grep { exists $data->{$_} } qw/ type label weight staff_client opac facet_order /; + + # Set default values + $sf_params{staff_client} //= 1; + $sf_params{opac} //= 1; + + $sf_params{name} = $field_name; + + my $search_field = Koha::SearchFields->find_or_create( \%sf_params, { key => 'name' } ); + my $mappings = $data->{mappings}; - my $search_field = Koha::SearchFields->find_or_create({ name => $field_name, label => $field_label, type => $field_type }, { key => 'name' }); for my $mapping ( @$mappings ) { - my $marc_field = Koha::SearchMarcMaps->find_or_create({ index_name => $index_name, marc_type => $mapping->{marc_type}, marc_field => $mapping->{marc_field} }); - $search_field->add_to_search_marc_maps($marc_field, { facet => $mapping->{facet} || 0, suggestible => $mapping->{suggestible} || 0, sort => $mapping->{sort} } ); + my $marc_field = Koha::SearchMarcMaps->find_or_create({ + index_name => $index_name, + marc_type => $mapping->{marc_type}, + marc_field => $mapping->{marc_field} + }); + $search_field->add_to_search_marc_maps($marc_field, { + facet => $mapping->{facet} || 0, + suggestible => $mapping->{suggestible} || 0, + sort => $mapping->{sort}, + search => $mapping->{search} // 1 + }); } } } @@ -312,53 +347,595 @@ sub sort_fields { return $self->_sort_fields_accessor(); } -# Provides the rules for data conversion. -sub get_fixer_rules { - my ($self) = @_; +=head2 _process_mappings($mappings, $data, $record_document, $altscript) + $self->_process_mappings($mappings, $marc_field_data, $record_document, 0) + +Process all C<$mappings> targets operating on a specific MARC field C<$data>. +Since we group all mappings by MARC field targets C<$mappings> will contain +all targets for C<$data> and thus we need to fetch the MARC field only once. +C<$mappings> will be applied to C<$record_document> and new field values added. +The method has no return value. + +=over 4 + +=item C<$mappings> + +Arrayref of mappings containing arrayrefs in the format +[C<$target>, C<$options>] where C<$target> is the name of the target field and +C<$options> is a hashref containing processing directives for this particular +mapping. + +=item C<$data> + +The source data from a MARC record field. + +=item C<$record_document> + +Hashref representing the Elasticsearch document on which mappings should be +applied. + +=item C<$altscript> + +A boolean value indicating whether an alternate script presentation is being +processed. + +=back + +=cut + +sub _process_mappings { + my ($_self, $mappings, $data, $record_document, $altscript) = @_; + foreach my $mapping (@{$mappings}) { + my ($target, $options) = @{$mapping}; + + # Don't process sort fields for alternate scripts + my $sort = $target =~ /__sort$/; + if ($sort && $altscript) { + next; + } + + # Copy (scalar) data since can have multiple targets + # with differing options for (possibly) mutating data + # so need a different copy for each + my $_data = $data; + $record_document->{$target} //= []; + if (defined $options->{substr}) { + my ($start, $length) = @{$options->{substr}}; + $_data = length($data) > $start ? substr $data, $start, $length : ''; + } + if (defined $options->{value_callbacks}) { + $_data = reduce { $b->($a) } ($_data, @{$options->{value_callbacks}}); + } + if (defined $options->{property}) { + $_data = { + $options->{property} => $_data + } + } + push @{$record_document->{$target}}, $_data; + } +} + +=head2 marc_records_to_documents($marc_records) + + my $record_documents = $self->marc_records_to_documents($marc_records); + +Using mappings stored in database convert C<$marc_records> to Elasticsearch documents. + +Returns array of hash references, representing Elasticsearch documents, +acceptable as body payload in C requests. + +=over 4 + +=item C<$marc_documents> + +Reference to array of C objects to be converted to Elasticsearch documents. + +=back + +=cut + +sub marc_records_to_documents { + my ($self, $records) = @_; + my $rules = $self->_get_marc_mapping_rules(); + my $control_fields_rules = $rules->{control_fields}; + my $data_fields_rules = $rules->{data_fields}; my $marcflavour = lc C4::Context->preference('marcflavour'); - my @rules; + my $use_array = C4::Context->preference('ElasticsearchMARCFormat') eq 'ARRAY'; - $self->_foreach_mapping( - sub { - my ( $name, $type, $facet, $suggestible, $sort, $marc_type, $marc_field ) = @_; - return if $marc_type ne $marcflavour; - my $options = ''; - - # There's a bug when using 'split' with something that - # selects a range - # The split makes everything into nested arrays, but that's not - # really a big deal, ES doesn't mind. - $options = '' unless $marc_field =~ m|_/| || $type eq 'sum'; - push @rules, "marc_map('$marc_field','${name}.\$append', $options)"; - if ($facet) { - push @rules, "marc_map('$marc_field','${name}__facet.\$append', $options)"; + my @record_documents; + + foreach my $record (@{$records}) { + my $record_document = {}; + my $mappings = $rules->{leader}; + if ($mappings) { + $self->_process_mappings($mappings, $record->leader(), $record_document, 0); + } + foreach my $field ($record->fields()) { + if ($field->is_control_field()) { + my $mappings = $control_fields_rules->{$field->tag()}; + if ($mappings) { + $self->_process_mappings($mappings, $field->data(), $record_document, 0); + } + } + else { + my $tag = $field->tag(); + # Handle alternate scripts in MARC 21 + my $altscript = 0; + if ($marcflavour eq 'marc21' && $tag eq '880') { + my $sub6 = $field->subfield('6'); + if ($sub6 =~ /^(...)-\d+/) { + $tag = $1; + $altscript = 1; + } + } + + my $data_field_rules = $data_fields_rules->{$tag}; + if ($data_field_rules) { + my $subfields_mappings = $data_field_rules->{subfields}; + my $wildcard_mappings = $subfields_mappings->{'*'}; + foreach my $subfield ($field->subfields()) { + my ($code, $data) = @{$subfield}; + my $mappings = $subfields_mappings->{$code} // []; + if ($wildcard_mappings) { + $mappings = [@{$mappings}, @{$wildcard_mappings}]; + } + if (@{$mappings}) { + $self->_process_mappings($mappings, $data, $record_document, $altscript); + } + if ( defined @{$mappings}[0] && grep /match-heading/, @{@{$mappings}[0]} ){ + # Used by the authority linker the match-heading field requires a specific syntax + # that is specified in C4/Heading + my $heading = C4::Heading->new_from_field( $field, undef, 1 ); #new auth heading + next unless $heading; + push @{$record_document->{'match-heading'}}, $heading->search_form; + } + } + + my $subfields_join_mappings = $data_field_rules->{subfields_join}; + if ($subfields_join_mappings) { + foreach my $subfields_group (keys %{$subfields_join_mappings}) { + # Map each subfield to values, remove empty values, join with space + my $data = join( + ' ', + grep( + $_, + map { join(' ', $field->subfield($_)) } split(//, $subfields_group) + ) + ); + if ($data) { + $self->_process_mappings($subfields_join_mappings->{$subfields_group}, $data, $record_document, $altscript); + } + if ( grep { $_->[0] eq 'match-heading' } @{$subfields_join_mappings->{$subfields_group}} ){ + # Used by the authority linker the match-heading field requires a specific syntax + # that is specified in C4/Heading + my $heading = C4::Heading->new_from_field( $field, undef, 1 ); #new auth heading + next unless $heading; + push @{$record_document->{'match-heading'}}, $heading->search_form; + } + } + } + } + } + } + foreach my $field (keys %{$rules->{defaults}}) { + unless (defined $record_document->{$field}) { + $record_document->{$field} = $rules->{defaults}->{$field}; } - if ($suggestible) { - push @rules, - #"marc_map('$marc_field','${name}__suggestion.input.\$append', $options)"; #must not have nested data structures in .input - "marc_map('$marc_field','${name}__suggestion.input.\$append')"; + } + foreach my $field (@{$rules->{sum}}) { + if (defined $record_document->{$field}) { + # TODO: validate numeric? filter? + # TODO: Or should only accept fields without nested values? + # TODO: Quick and dirty, improve if needed + $record_document->{$field} = sum0(grep { !ref($_) && m/\d+(\.\d+)?/} @{$record_document->{$field}}); + } + } + # Index all applicable ISBN forms (ISBN-10 and ISBN-13 with and without dashes) + foreach my $field (@{$rules->{isbn}}) { + if (defined $record_document->{$field}) { + my @isbns = (); + foreach my $input_isbn (@{$record_document->{$field}}) { + my $isbn = Business::ISBN->new($input_isbn); + if (defined $isbn && $isbn->is_valid) { + my $isbn13 = $isbn->as_isbn13->as_string; + push @isbns, $isbn13; + $isbn13 =~ s/\-//g; + push @isbns, $isbn13; + + my $isbn10 = $isbn->as_isbn10; + if ($isbn10) { + $isbn10 = $isbn10->as_string; + push @isbns, $isbn10; + $isbn10 =~ s/\-//g; + push @isbns, $isbn10; + } + } else { + push @isbns, $input_isbn; + } + } + $record_document->{$field} = \@isbns; } - if ( $type eq 'boolean' ) { + } - # boolean gets special handling, basically if it doesn't exist, - # it's added and set to false. Otherwise we can't query it. - push @rules, - "unless exists('$name') add_field('$name', 0) end"; + # Remove duplicate values and collapse sort fields + foreach my $field (keys %{$record_document}) { + if (ref($record_document->{$field}) eq 'ARRAY') { + @{$record_document->{$field}} = do { + my %seen; + grep { !$seen{ref($_) eq 'HASH' && defined $_->{input} ? $_->{input} : $_}++ } @{$record_document->{$field}}; + }; + if ($field =~ /__sort$/) { + # Make sure to keep the sort field length sensible. 255 was chosen as a nice round value. + $record_document->{$field} = [substr(join(' ', @{$record_document->{$field}}), 0, 255)]; + } } - if ($type eq 'sum' ) { - push @rules, "sum('$name')"; + } + + # TODO: Perhaps should check if $records_document non empty, but really should never be the case + $record->encoding('UTF-8'); + if ($use_array) { + $record_document->{'marc_data_array'} = $self->_marc_to_array($record); + $record_document->{'marc_format'} = 'ARRAY'; + } else { + my @warnings; + { + # Temporarily intercept all warn signals (MARC::Record carps when record length > 99999) + local $SIG{__WARN__} = sub { + push @warnings, $_[0]; + }; + $record_document->{'marc_data'} = encode_base64(encode('UTF-8', $record->as_usmarc())); } - if ($self->sort_fields()->{$name}) { - if ($sort || !defined $sort) { - push @rules, "marc_map('$marc_field','${name}__sort.\$append', $options)"; + if (@warnings) { + # Suppress warnings if record length exceeded + unless (substr($record->leader(), 0, 5) eq '99999') { + foreach my $warning (@warnings) { + carp $warning; + } } + $record_document->{'marc_data'} = $record->as_xml_record($marcflavour); + $record_document->{'marc_format'} = 'MARCXML'; + } + else { + $record_document->{'marc_format'} = 'base64ISO2709'; } } - ); + push @record_documents, $record_document; + } + return \@record_documents; +} + +=head2 _marc_to_array($record) + + my @fields = _marc_to_array($record) + +Convert a MARC::Record to an array modeled after MARC-in-JSON +(see https://github.com/marc4j/marc4j/wiki/MARC-in-JSON-Description) + +=over 4 + +=item C<$record> + +A MARC::Record object + +=back + +=cut + +sub _marc_to_array { + my ($self, $record) = @_; + + my $data = { + leader => $record->leader(), + fields => [] + }; + for my $field ($record->fields()) { + my $tag = $field->tag(); + if ($field->is_control_field()) { + push @{$data->{fields}}, {$tag => $field->data()}; + } else { + my $subfields = (); + foreach my $subfield ($field->subfields()) { + my ($code, $contents) = @{$subfield}; + push @{$subfields}, {$code => $contents}; + } + push @{$data->{fields}}, { + $tag => { + ind1 => $field->indicator(1), + ind2 => $field->indicator(2), + subfields => $subfields + } + }; + } + } + return $data; +} + +=head2 _array_to_marc($data) + + my $record = _array_to_marc($data) + +Convert an array modeled after MARC-in-JSON to a MARC::Record + +=over 4 + +=item C<$data> + +An array modeled after MARC-in-JSON +(see https://github.com/marc4j/marc4j/wiki/MARC-in-JSON-Description) + +=back + +=cut - push @rules, "move_field(_id,es_id)"; #Also you must set the Catmandu::Store::ElasticSearch->new(key_prefix: 'es_'); - return \@rules; +sub _array_to_marc { + my ($self, $data) = @_; + + my $record = MARC::Record->new(); + + $record->leader($data->{leader}); + for my $field (@{$data->{fields}}) { + my $tag = (keys %{$field})[0]; + $field = $field->{$tag}; + my $marc_field; + if (ref($field) eq 'HASH') { + my @subfields; + foreach my $subfield (@{$field->{subfields}}) { + my $code = (keys %{$subfield})[0]; + push @subfields, $code; + push @subfields, $subfield->{$code}; + } + $marc_field = MARC::Field->new($tag, $field->{ind1}, $field->{ind2}, @subfields); + } else { + $marc_field = MARC::Field->new($tag, $field) + } + $record->append_fields($marc_field); + } +; + return $record; +} + +=head2 _field_mappings($facet, $suggestible, $sort, $search, $target_name, $target_type, $range) + + my @mappings = _field_mappings($facet, $suggestible, $sort, $search, $target_name, $target_type, $range) + +Get mappings, an internal data structure later used by +L<_process_mappings($mappings, $data, $record_document, $altscript)> to process MARC target +data for a MARC mapping. + +The returned C<$mappings> is not to to be confused with mappings provided by +C<_foreach_mapping>, rather this sub accepts properties from a mapping as +provided by C<_foreach_mapping> and expands it to this internal data structure. +In the caller context (C<_get_marc_mapping_rules>) the returned C<@mappings> +is then applied to each MARC target (leader, control field data, subfield or +joined subfields) and integrated into the mapping rules data structure used in +C to transform MARC records into Elasticsearch +documents. + +=over 4 + +=item C<$facet> + +Boolean indicating whether to create a facet field for this mapping. + +=item C<$suggestible> + +Boolean indicating whether to create a suggestion field for this mapping. + +=item C<$sort> + +Boolean indicating whether to create a sort field for this mapping. + +=item C<$search> + +Boolean indicating whether to create a search field for this mapping. + +=item C<$target_name> + +Elasticsearch document target field name. + +=item C<$target_type> + +Elasticsearch document target field type. + +=item C<$range> + +An optional range as a string in the format "-" or "", +where "" and "" are integers specifying a range that will be used +for extracting a substring from MARC data as Elasticsearch field target value. + +The first character position is "0", and the range is inclusive, +so "0-2" means the first three characters of MARC data. + +If only "" is provided only one character at position "" will +be extracted. + +=back + +=cut + +sub _field_mappings { + my ($_self, $facet, $suggestible, $sort, $search, $target_name, $target_type, $range) = @_; + my %mapping_defaults = (); + my @mappings; + + my $substr_args = undef; + if (defined $range) { + # TODO: use value_callback instead? + my ($start, $end) = map(int, split /-/, $range, 2); + $substr_args = [$start]; + push @{$substr_args}, (defined $end ? $end - $start + 1 : 1); + } + my $default_options = {}; + if ($substr_args) { + $default_options->{substr} = $substr_args; + } + + # TODO: Should probably have per type value callback/hook + # but hard code for now + if ($target_type eq 'boolean') { + $default_options->{value_callbacks} //= []; + push @{$default_options->{value_callbacks}}, sub { + my ($value) = @_; + # Trim whitespace at both ends + $value =~ s/^\s+|\s+$//g; + return $value ? 'true' : 'false'; + }; + } + + if ($search) { + my $mapping = [$target_name, $default_options]; + push @mappings, $mapping; + } + + my @suffixes = (); + push @suffixes, 'facet' if $facet; + push @suffixes, 'suggestion' if $suggestible; + push @suffixes, 'sort' if !defined $sort || $sort; + + foreach my $suffix (@suffixes) { + my $mapping = ["${target_name}__$suffix"]; + # TODO: Hack, fix later in less hideous manner + if ($suffix eq 'suggestion') { + push @{$mapping}, {%{$default_options}, property => 'input'}; + } + else { + push @{$mapping}, $default_options; + } + push @mappings, $mapping; + } + return @mappings; +}; + +=head2 _get_marc_mapping_rules + + my $mapping_rules = $self->_get_marc_mapping_rules() + +Generates rules from mappings stored in database for MARC records to Elasticsearch JSON document conversion. + +Since field retrieval is slow in C (all fields are itereted through for +each call to C->field) we create an optimized structure of mapping +rules keyed by MARC field tags holding all the mapping rules for that particular tag. + +We can then iterate through all MARC fields for each record and apply all relevant +rules once per fields instead of retreiving fields multiple times for each mapping rule +which is terribly slow. + +=cut + +# TODO: This structure can be used for processing multiple MARC::Records so is currently +# rebuilt for each batch. Since it is cacheable it could also be stored in an in +# memory cache which it is currently not. The performance gain of caching +# would probably be marginal, but to do this could be a further improvement. + +sub _get_marc_mapping_rules { + my ($self) = @_; + my $marcflavour = lc C4::Context->preference('marcflavour'); + my $field_spec_regexp = qr/^([0-9]{3})([()0-9a-zA-Z]+)?(?:_\/(\d+(?:-\d+)?))?$/; + my $leader_regexp = qr/^leader(?:_\/(\d+(?:-\d+)?))?$/; + my $rules = { + 'leader' => [], + 'control_fields' => {}, + 'data_fields' => {}, + 'sum' => [], + 'isbn' => [], + 'defaults' => {} + }; + + $self->_foreach_mapping(sub { + my ($name, $type, $facet, $suggestible, $sort, $search, $marc_type, $marc_field) = @_; + return if $marc_type ne $marcflavour; + + if ($type eq 'sum') { + push @{$rules->{sum}}, $name; + push @{$rules->{sum}}, $name."__sort" if $sort; + } + elsif ($type eq 'isbn') { + push @{$rules->{isbn}}, $name; + } + elsif ($type eq 'boolean') { + # boolean gets special handling, if value doesn't exist for a field, + # it is set to false + $rules->{defaults}->{$name} = 'false'; + } + + if ($marc_field =~ $field_spec_regexp) { + my $field_tag = $1; + + my @subfields; + my @subfield_groups; + # Parse and separate subfields form subfield groups + if (defined $2) { + my $subfield_group = ''; + my $open_group = 0; + + foreach my $token (split //, $2) { + if ($token eq "(") { + if ($open_group) { + Koha::Exceptions::Elasticsearch::MARCFieldExprParseError->throw( + "Unmatched opening parenthesis for $marc_field" + ); + } + else { + $open_group = 1; + } + } + elsif ($token eq ")") { + if ($open_group) { + if ($subfield_group) { + push @subfield_groups, $subfield_group; + $subfield_group = ''; + } + $open_group = 0; + } + else { + Koha::Exceptions::Elasticsearch::MARCFieldExprParseError->throw( + "Unmatched closing parenthesis for $marc_field" + ); + } + } + elsif ($open_group) { + $subfield_group .= $token; + } + else { + push @subfields, $token; + } + } + } + else { + push @subfields, '*'; + } + + my $range = defined $3 ? $3 : undef; + my @mappings = $self->_field_mappings($facet, $suggestible, $sort, $search, $name, $type, $range); + if ($field_tag < 10) { + $rules->{control_fields}->{$field_tag} //= []; + push @{$rules->{control_fields}->{$field_tag}}, @mappings; + } + else { + $rules->{data_fields}->{$field_tag} //= {}; + foreach my $subfield (@subfields) { + $rules->{data_fields}->{$field_tag}->{subfields}->{$subfield} //= []; + push @{$rules->{data_fields}->{$field_tag}->{subfields}->{$subfield}}, @mappings; + } + foreach my $subfield_group (@subfield_groups) { + $rules->{data_fields}->{$field_tag}->{subfields_join}->{$subfield_group} //= []; + push @{$rules->{data_fields}->{$field_tag}->{subfields_join}->{$subfield_group}}, @mappings; + } + } + } + elsif ($marc_field =~ $leader_regexp) { + my $range = defined $1 ? $1 : undef; + my @mappings = $self->_field_mappings($facet, $suggestible, $sort, $search, $name, $type, $range); + push @{$rules->{leader}}, @mappings; + } + else { + Koha::Exceptions::Elasticsearch::MARCFieldExprParseError->throw( + "Invalid MARC field expression: $marc_field" + ); + } + }); + return $rules; } =head2 _foreach_mapping @@ -428,6 +1005,7 @@ sub _foreach_mapping { 'search_marc_to_fields.facet', 'search_marc_to_fields.suggestible', 'search_marc_to_fields.sort', + 'search_marc_to_fields.search', 'search_marc_map.marc_type', 'search_marc_map.marc_field', ], @@ -435,6 +1013,7 @@ sub _foreach_mapping { 'facet', 'suggestible', 'sort', + 'search', 'marc_type', 'marc_field', ], @@ -443,11 +1022,14 @@ sub _foreach_mapping { while ( my $search_field = $search_fields->next ) { $sub->( - $search_field->name, + # Force lower case on indexed field names for case insensitive + # field name searches + lc($search_field->name), $search_field->type, $search_field->get_column('facet'), $search_field->get_column('suggestible'), $search_field->get_column('sort'), + $search_field->get_column('search'), $search_field->get_column('marc_type'), $search_field->get_column('marc_field'), ); @@ -537,6 +1119,30 @@ sub _read_configuration { return $configuration; } +=head2 get_facetable_fields + +my @facetable_fields = Koha::SearchEngine::Elasticsearch->get_facetable_fields(); + +Returns the list of Koha::SearchFields marked to be faceted in the ES configuration + +=cut + +sub get_facetable_fields { + my ($self) = @_; + + # These should correspond to the ES field names, as opposed to the CCL + # things that zebra uses. + my @search_field_names = qw( author itype location su-geo title-series subject ccode holdingbranch homebranch ln ); + my @faceted_fields = Koha::SearchFields->search( + { name => { -in => \@search_field_names }, facet_order => { '!=' => undef } }, { order_by => ['facet_order'] } + ); + my @not_faceted_fields = Koha::SearchFields->search( + { name => { -in => \@search_field_names }, facet_order => undef }, { order_by => ['facet_order'] } + ); + # This could certainly be improved + return ( @faceted_fields, @not_faceted_fields ); +} + 1; __END__