# STOPWORDS
sub _remove_stopwords {
my ($operand,$index) = @_;
- # phrase and exact-qualified indexes shoudln't have stopwords removed
+ my @stopwords_removed;
+ # phrase and exact-qualified indexes shouldn't have stopwords removed
if ($index!~m/phr|ext/){
# remove stopwords from operand : parse all stopwords & remove them (case insensitive)
# we use IsAlpha unicode definition, to deal correctly with diacritics.
- # otherwise, a french word like "leçon" woudl be split into "le" "çon", le
- # is an empty word, we get "çon" and wouldn't find anything...
+ # otherwise, a French word like "leçon" would be split into "le" "çon", le
+ # is an empty word, we'd get "çon" and wouldn't find anything...
foreach (keys %{C4::Context->stopwords}) {
- next if ($_ =~/(and|or|not)/); # don't remove operators
- $operand=~ s/\P{IsAlpha}$_\P{IsAlpha}/ /i;
- $operand=~ s/^$_\P{IsAlpha}/ /i;
- $operand=~ s/\P{IsAlpha}$_$/ /i;
+ next if ($_ =~/(and|or|not)/); # don't remove operators
+ if ($operand =~ /(\P{IsAlpha}$_\P{IsAlpha}|^$_\P{IsAlpha}|\P{IsAlpha}$_$)/) {
+ $operand=~ s/\P{IsAlpha}$_\P{IsAlpha}/ /gi;
+ $operand=~ s/^$_\P{IsAlpha}/ /gi;
+ $operand=~ s/\P{IsAlpha}$_$/ /gi;
+ push @stopwords_removed, $_;
+ }
}
}
- return $operand;
+ return ($operand, \@stopwords_removed);
}
# TRUNCATION
my @limits = @$limits if $limits;
my @sort_by = @$sort_by if $sort_by;
- my $stemming = C4::Context->preference("QueryStemming") || 0;
- my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
- my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
+ my $stemming = C4::Context->preference("QueryStemming") || 0;
+ my $auto_truncation = C4::Context->preference("QueryAutoTruncate") || 0;
+ my $weight_fields = C4::Context->preference("QueryWeightFields") || 0;
+ my $fuzzy_enabled = C4::Context->preference("QueryFuzzy") || 0;
+ my $remove_stopwords = C4::Context->preference("QueryRemoveStopwords") || 0;
my $query = $operands[0];
my $simple_query = $operands[0];
my $query_cgi;
- my $query_search_desc;
+ my $query_desc;
+ my $query_type;
my $limit;
my $limit_cgi;
my $limit_desc;
-# STEP I: determine if this is a form-based / simple query or if it's complex (if complex,
-# pass it off to zebra directly)
+ my $stopwords_removed;
-# check if this is a known query language query, if it is, return immediately,
-# the user is responsible for constructing valid syntax:
+ # for handling ccl, cql, pqf queries in diagnostic mode, skip the rest of the steps
+ # DIAGNOSTIC ONLY!!
if ( $query =~ /^ccl=/ ) {
- return ( undef, $', $', $', '', '', '', 'ccl' );
+ return ( undef, $', $', $', $', '', '', '', '', 'ccl' );
}
if ( $query =~ /^cql=/ ) {
- return ( undef, $', $', $', '', '', '', 'cql' );
+ return ( undef, $', $', $', $', '', '', '', '', 'cql' );
}
if ( $query =~ /^pqf=/ ) {
- return ( undef, $', $', $', '', '', '', 'pqf' );
+ return ( undef, $', $', $', $', '', '', '', '', 'pqf' );
}
-# FIXME: this is bound to be broken now
- if ( $query =~ /(\(|\))/ ) { # sorry, too complex, assume CCL
- return ( undef, $query, $query_cgi, $query_search_desc, $limit, $limit_cgi, $limit_desc, 'ccl' );
+ # pass nested queries directly
+ if ( $query =~ /(\(|\))/ ) {
+ return ( undef, $query, $simple_query, $query_cgi, $query, $limit, $limit_cgi, $limit_desc, $stopwords_removed, 'ccl' );
}
# form-based queries are limited to non-nested at a specific depth, so we can easily
# COMBINE OPERANDS, INDEXES AND OPERATORS
if ( $operands[$i] ) {
- $weight_fields = 0 if $operands[$i] =~ /(:|=)/;
+ # a flag to determine whether or not to add the index to the query
+ my $indexes_set;
+ # if the user is sophisticated enough to specify an index, turn off some defaults
+ if ($operands[$i] =~ /(:|=)/) {
+ $weight_fields = 0;
+ $stemming = 0;
+ $remove_stopwords = 0;
+ }
my $operand = $operands[$i];
my $index = $indexes[$i];
- # if there's no index, don't use one, it will throw a CCL error
+ # some helpful index modifs
my $index_plus = "$index:" if $index;
my $index_plus_comma="$index," if $index;
- # Remove Stopwords
- $operand = _remove_stopwords($operand,$index);
- warn "OPERAND w/out STOPWORDS: >$operand<" if $DEBUG;
-
- my $indexes_set;
+ # Remove Stopwords
+ if ($remove_stopwords) {
+ ($operand, $stopwords_removed) = _remove_stopwords($operand,$index);
+ warn "OPERAND w/out STOPWORDS: >$operand<" if $DEBUG;
+ warn "REMOVED STOPWORDS: @$stopwords_removed" if ($stopwords_removed && $DEBUG);
+ }
# Detect Truncation
my ($nontruncated,$righttruncated,$lefttruncated,$rightlefttruncated,$regexpr);
my $truncated_operand;
($nontruncated,$righttruncated,$lefttruncated,$rightlefttruncated,$regexpr) = _detect_truncation($operand,$index);
warn "TRUNCATION: NON:>@$nontruncated< RIGHT:>@$righttruncated< LEFT:>@$lefttruncated< RIGHTLEFT:>@$rightlefttruncated< REGEX:>@$regexpr<" if $DEBUG;
+
# Apply Truncation
- # Problem is when build_weights gets ahold if this is wraps in quotes which breaks the truncation :/
if (scalar(@$righttruncated)+scalar(@$lefttruncated)+scalar(@$rightlefttruncated)>0){
+ # don't field weight or add the index to the query, we do it here
$indexes_set = 1;
undef $weight_fields;
my $previous_truncation_operand;
$query_cgi .="&op=$operators[$i-1]";
$query_cgi .="&idx=$index" if $index;
$query_cgi .="&q=$operands[$i]" if $operands[$i];
- $query_search_desc .=" $operators[$i-1] $index_plus $operands[$i]";
+ $query_desc .=" $operators[$i-1] $index_plus $operands[$i]";
}
# the default operator is and
$query .= "$operand";
$query_cgi .="&op=and&idx=$index" if $index;
$query_cgi .="&q=$operands[$i]" if $operands[$i];
- $query_search_desc .= " and $index_plus $operands[$i]";
+ $query_desc .= " and $index_plus $operands[$i]";
}
}
+ # there isn't a previous operand, don't need an operator
else {
# field-weighted queries already have indexes set
$query .=" $index_plus " unless $indexes_set;
$query .= $operand;
- $query_search_desc .= " $index_plus $operands[$i]";
+ $query_desc .= " $index_plus $operands[$i]";
$query_cgi.="&idx=$index" if $index;
$query_cgi.="&q=$operands[$i]" if $operands[$i];
$limit.="($group_OR_limits)";
}
# normalize the strings
- for ($query, $query_search_desc, $limit, $limit_desc) {
+ for ($query, $query_desc, $limit, $limit_desc) {
$_ =~ s/ / /g; # remove extra spaces
$_ =~ s/^ //g; # remove any beginning spaces
- $_ =~ s/ $//g; # remove any beginning spaces
+ $_ =~ s/ $//g; # remove any ending spaces
$_ =~ s/:/=/g; # causes probs for server
$_ =~ s/==/=/g; # remove double == from query
$query_cgi =~ s/^&//;
# append the limit to the query
- $query .= $limit;
+ $query .= " ".$limit;
warn "QUERY:".$query if $DEBUG;
warn "QUERY CGI:".$query_cgi if $DEBUG;
- warn "QUERY DESC:".$query_search_desc if $DEBUG;
+ warn "QUERY DESC:".$query_desc if $DEBUG;
warn "LIMIT:".$limit if $DEBUG;
warn "LIMIT CGI:".$limit_cgi if $DEBUG;
warn "LIMIT DESC:".$limit_desc if $DEBUG;
- return ( undef, $query,$simple_query,$query_cgi,$query_search_desc,$limit,$limit_cgi,$limit_desc );
+ return ( undef, $query,$simple_query,$query_cgi,$query_desc,$limit,$limit_cgi,$limit_desc,$stopwords_removed,$query_type );
}
# IMO this subroutine is pretty messy still -- it's responsible for
=head2 NZanalyse
NZanalyse : get a CQL string as parameter, and returns a list of biblionumber;title,biblionumber;title,...
- the list is builded from inverted index in nozebra SQL table
+ the list is built from an inverted index in the nozebra SQL table
note that title is here only for convenience : the sorting will be very fast when requested on title
if the sorting is requested on something else, we will have to reread all results, and that may be longer.
my ($biblionumbers,$value);
next unless $_;
warn "EXECUTE : $server, $left, $_";
- $sth->execute($server, $left, $_);
+ $sth->execute($server, $left, $_) or warn "execute failed: $!";
while (my ($line,$value) = $sth->fetchrow) {
# if we are dealing with a numeric value, use only numeric results (in case of >=, <=, > or <)
# otherwise, fill the result