Revert "Bug 9828: More specific indexing of UNIMARC 6XX fields"

[koha_ffzg] / C4 / Search.pm
diff --git a/C4/Search.pm b/C4/Search.pm

index e90ecc9..1326f69 100644 (file)
--- a/C4/Search.pm
+++ b/C4/Search.pm
@@ -343,7 +343,6 @@ sub getRecords {
      my $facets_counter = {};
      my $facets_info    = {};
      my $facets         = getFacets();
-    my $facets_maxrecs = C4::Context->preference('maxRecordsForFacets')||20;
  
      my @facets_loop;    # stores the ref to array of hashes for template facets loop
  
@@ -502,31 +501,10 @@ sub getRecords {
                  # Fill the facets while we're looping, but only for the
                  # biblioserver and not for a scan
                  if ( !$scan && $servers[ $i - 1 ] =~ /biblioserver/ ) {
-
-                    my $jmax = $size > $facets_maxrecs
-                                ? $facets_maxrecs
-                                : $size;
-
-                    for ( my $j = 0 ; $j < $jmax ; $j++ ) {
-
-                        my $marc_record = new_record_from_zebra (
-                                'biblioserver',
-                                $results[ $i - 1 ]->record($j)->raw()
-                        );
-
-                        if ( ! defined $marc_record ) {
-                            warn "ERROR DECODING RECORD - $@: " .
-                                $results[ $i - 1 ]->record($j)->raw();
-                            next;
-                        }
-
-                        _get_facets_data_from_record( $marc_record, $facets, $facets_counter, $facets_info );
-                    }
+                    $facets_counter = GetFacets( $results[ $i - 1 ] );
+                    $facets_info    = _get_facets_info( $facets );
                  }
  
-                # warn "connection ", $i-1, ": $size hits";
-                # warn $results[$i-1]->record(0)->render() if $size > 0;
-
                  # BUILD FACETS
                  if ( $servers[ $i - 1 ] =~ /biblioserver/ ) {
                      for my $link_value (
@@ -651,6 +629,55 @@ sub getRecords {
      return ( undef, $results_hashref, \@facets_loop );
  }
  
+=head2 GetFacets
+
+    my $facets = C4::Search::GetFacets( $result_set );
+
+Returns the facet data for $result_set. The data comes from
+_get_facets_from_zebra when the 'zebra_bib_index_mode' configuration value is
+'dom' (also assumed when the value is undefined) and 'use_zebra_facets' is
+true; in every other case it comes from _get_facets_from_records.
+
+=cut
+
+sub GetFacets {
+    my ($result_set) = @_;
+
+    my $bib_index_mode  = C4::Context->config('zebra_bib_index_mode') // 'dom';
+    my $zebra_facets_on = C4::Context->config('use_zebra_facets') // 0;
+
+    # Zebra-side facets need both DOM indexing and the explicit opt-in
+    my $ask_zebra = ( $bib_index_mode eq 'dom' && $zebra_facets_on );
+
+    my $facet_data = $ask_zebra
+        ? _get_facets_from_zebra($result_set)
+        : _get_facets_from_records($result_set);
+
+    return $facet_data;
+}
+
+=head2 _get_facets_from_records
+
+    my $facets = _get_facets_from_records( $result_set );
+
+Walks the first records of $result_set, at most 'maxRecordsForFacets' of them
+(20 when that preference is undefined), decodes each one with
+new_record_from_zebra and passes it to _get_facets_data_from_record together
+with the facet definitions from getFacets(). A record that cannot be decoded
+is reported with warn and left out. Returns the hashref that was handed to
+_get_facets_data_from_record as the accumulator.
+
+=cut
+
+sub _get_facets_from_records {
+    my ($result_set) = @_;
+
+    my $max_records = C4::Context->preference('maxRecordsForFacets') // 20;
+    my $facet_defs  = getFacets();
+    my $collected   = {};
+    my $hits        = $result_set->size();
+
+    # never look past the end of the result set
+    my $limit = $hits;
+    $limit = $max_records if $hits > $max_records;
+
+    my $pos = 0;
+    while ( $pos < $limit ) {
+
+        my $decoded = new_record_from_zebra(
+            'biblioserver',
+            $result_set->record($pos)->raw()
+        );
+
+        if ( defined $decoded ) {
+            _get_facets_data_from_record( $decoded, $facet_defs, $collected );
+        }
+        else {
+            # undecodable record: report it and carry on with the next one
+            warn "ERROR DECODING RECORD - $@: " .
+                $result_set->record($pos)->raw();
+        }
+
+        $pos++;
+    }
+
+    return $collected;
+}
+
  =head2 _get_facets_data_from_record
  
      C4::Search::_get_facets_data_from_record( $marc_record, $facets, $facets_counter );
@@ -665,7 +692,7 @@ facets for Zebra).
  
  sub _get_facets_data_from_record {
  
-    my ( $marc_record, $facets, $facets_counter, $facets_info ) = @_;
+    my ( $marc_record, $facets, $facets_counter ) = @_;
  
      for my $facet (@$facets) {
  
@@ -673,14 +700,17 @@ sub _get_facets_data_from_record {
  
          foreach my $tag ( @{ $facet->{ tags } } ) {
  
-            # avoid first line
+            # tag number is the first three digits
              my $tag_num          = substr( $tag, 0, 3 );
+            # subfields are the remainder
              my $subfield_letters = substr( $tag, 3 );
-            # Removed when as_string fixed
-            my @subfields = $subfield_letters =~ /./sg;
  
              my @fields = $marc_record->field( $tag_num );
              foreach my $field (@fields) {
+                # If $field->indicator(1) eq 'z', it means it is a 'see from'
+                # field introduced because of IncludeSeeFromInSearches, so skip it
+                next if $field->indicator(1) eq 'z';
+
                  my $data = $field->as_string( $subfield_letters, $facet->{ sep } );
  
                  unless ( grep { /^\Q$data\E$/ } @used_datas ) {
@@ -689,10 +719,125 @@ sub _get_facets_data_from_record {
                  }
              }
          }
-        # update $facets_info so we know what facet categories need to be rendered
+    }
+}
+
+=head2 _get_facets_from_zebra
+
+    my $facets = _get_facets_from_zebra( $result_set )
+
+Collects the facets of a result set. Every facet defined in
+C4::Koha::getFacets is looked up through _get_facet_from_result_set, and the
+ones that yield values end up in a hash with the following structure:
+
+   {  facet_idx => {
+            facet_value => count
+      },
+      ...
+   }
+
+The result set's elementSetName option is read before the lookups and written
+back once they are done.
+
+=cut
+
+sub _get_facets_from_zebra {
+    my ($result_set) = @_;
+
+    # remember the elementSetName in use; the per-facet lookups overwrite it
+    my $saved_element_set = $result_set->option('elementSetName');
+
+    my $facet_defs = getFacets();
+    my %values_for;
+
+    for my $facet_def (@$facet_defs) {
+        my $index_name = $facet_def->{idx};
+        my $separator  = $facet_def->{sep};
+
+        my $values =
+            _get_facet_from_result_set( $index_name, $result_set, $separator );
+
+        # facets without any value are left out of the answer
+        next unless $values;
+
+        $values_for{$index_name} = $values;
+    }
+
+    # put the previous elementSetName back to avoid side effects
+    $result_set->option( elementSetName => $saved_element_set );
+
+    return \%values_for;
+}
+
+=head2 _get_facet_from_result_set
+
+    my $facet_values =
+        C4::Search::_get_facet_from_result_set( $facet_idx, $result_set, $sep )
+
+Internal function that extracts facet information for a specific index ($facet_idx) and
+returns a hash containing facet values and count:
+
+    {
+        $facet_value => $count ,
+        ...
+    }
+
+Returns nothing (undef in scalar context) when $facet_idx or $result_set is
+undefined, when the result set yields no facet record or an empty one, or when
+the facet record contains no 'term' elements.
+
+When $sep is defined, the internal separator '<*>' found in a facet value is
+replaced by $sep.
+
+Warning: this function has the side effect of changing the elementSetName for the result
+set. It is a helper function for the main loop, which takes care of backing it up for
+restoring.
+
+=cut
+
+sub _get_facet_from_result_set {
+
+    my ( $facet_idx, $rs, $sep ) = @_;
+
+    # nothing can be looked up without an index name and a result set
+    return if ( ! defined $facet_idx || ! defined $rs );
+
+    my $internal_sep  = '<*>';
+    my $facetMaxCount = C4::Context->preference('FacetMaxCount') // 20;
+
+    # zebra's facet element, untokenized index
+    my $facet_element = 'zebra::facet::' . $facet_idx . ':0:' . $facetMaxCount;
+    # configure zebra results for retrieving the desired facet
+    $rs->option( elementSetName => $facet_element );
+    # get the facet record from result set; check it before calling raw on
+    # it, so that a missing record is not a fatal "method on undef" error
+    my $facet_record = $rs->record( 0 );
+    return if ! defined $facet_record;
+    my $facet = $facet_record->raw;
+    # if the facet has no results...
+    return if ! defined $facet || $facet eq '';
+    # TODO: benchmark DOM vs. SAX performance
+    my $facet_dom = XML::LibXML->load_xml(
+      string => ($facet)
+    );
+    my @terms = $facet_dom->getElementsByTagName('term');
+    return if ! @terms;
+
+    my $facets = {};
+    foreach my $term ( @terms ) {
+        my $facet_value = $term->textContent;
+        # NOTE(review): only the first occurrence of the internal separator is
+        # replaced - confirm a facet value never carries more than one
+        $facet_value =~ s/\Q$internal_sep\E/$sep/ if defined $sep;
+        $facets->{ $facet_value } = $term->getAttribute( 'occur' );
+    }
+
+    return $facets;
+}
+
+=head2 _get_facets_info
+
+    my $facets_info = C4::Search::_get_facets_info( $facets )
+
+Internal function that extracts facets information and properly builds
+the data structure needed to render facet labels.
+
+=cut
+
+sub _get_facets_info {    # maps each facet's idx to its label_value and expanded settings
+
+    my $facets = shift;    # array ref of facet definitions; the idx, label and expanded keys are read
+
+    my $facets_info = {};    # result: { idx => { label_value => ..., expanded => ... } }
+
+    for my $facet ( @$facets ) {
          $facets_info->{ $facet->{ idx } }->{ label_value } = $facet->{ label };    # the facet's label is stored under label_value
          $facets_info->{ $facet->{ idx } }->{ expanded }    = $facet->{ expanded };    # copied through unchanged
      }
+
+    return $facets_info;    # hashref; empty when $facets holds no definitions
  }
  
  sub pazGetRecords {
@@ -1393,17 +1538,28 @@ sub buildQuery {
                  my $index   = $indexes[$i];
  
                  # Add index-specific attributes
+
+                #Afaik, this 'yr' condition will only ever be met in the staff client advanced search
+                #for "Publication date", since typing 'yr:YYYY' into the search box produces a CCL query,
+                #which is processed higher up in this sub. Other than that, year searches are typically
+                #handled as limits which are not processed here either.
+
                  # Date of Publication
-                if ( $index eq 'yr' ) {
-                    $index .= ",st-numeric";
-                    $indexes_set++;
+                if ( $index =~ /yr/ ) {
+                    #weight_fields/relevance search causes errors with date ranges
+                    #In the case of YYYY-, it will only return records with a 'yr' of YYYY (not the range)
+                    #In the case of YYYY-YYYY, it will return no results
                                         $stemming = $auto_truncation = $weight_fields = $fuzzy_enabled = $remove_stopwords = 0;
                  }
  
                  # Date of Acquisition
-                elsif ( $index eq 'acqdate' ) {
-                    $index .= ",st-date-normalized";
-                    $indexes_set++;
+                elsif ( $index =~ /acqdate/ ) {
+                    #stemming and auto_truncation would have zero impact since it already is YYYY-MM-DD format
+                    #Weight_fields probably SHOULD be turned OFF, otherwise you'll get records floating to the
+                      #top of the results just because they have lots of item records matching that date.
+                    #Fuzzy actually only applies during _build_weighted_query, and is reset there anyway, so
+                      #irrelevant here
+                    #remove_stopwords doesn't function anymore so is irrelevant
                                         $stemming = $auto_truncation = $weight_fields = $fuzzy_enabled = $remove_stopwords = 0;
                  }
                  # ISBN,ISSN,Standard Number, don't need special treatment
@@ -1590,9 +1746,13 @@ sub buildQuery {
      # This is flawed , means we can't search anything with : in it
      # if user wants to do ccl or cql, start the query with that
  #    $query =~ s/:/=/g;
+    #NOTE: We use several different regexps here as you can't have variable-length lookbehind assertions
      $query =~ s/(?<=(ti|au|pb|su|an|kw|mc|nb|ns)):/=/g;
      $query =~ s/(?<=(wrdl)):/=/g;
      $query =~ s/(?<=(trn|phr)):/=/g;
+    $query =~ s/(?<=(st-numeric)):/=/g;
+    $query =~ s/(?<=(st-year)):/=/g;
+    $query =~ s/(?<=(st-date-normalized)):/=/g;
      $limit =~ s/:/=/g;
      for ( $query, $query_desc, $limit, $limit_desc ) {
          s/  +/ /g;    # remove extra spaces
@@ -2433,7 +2593,7 @@ sub new_record_from_zebra {
      my $raw_data = shift;
      # Set the default indexing modes
      my $index_mode = ( $server eq 'biblioserver' )
-                        ? C4::Context->config('zebra_bib_index_mode') // 'grs1'
+                        ? C4::Context->config('zebra_bib_index_mode') // 'dom'
                          : C4::Context->config('zebra_auth_index_mode') // 'dom';
  
      my $marc_record =  eval {