From e16a034e9cb6cd67851fa9a0b107ea78f6ba42d7 Mon Sep 17 00:00:00 2001 From: Henri-Damien LAURENT Date: Fri, 9 Oct 2009 11:48:14 +0200 Subject: [PATCH] Remove Stopwords bug fixing MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit in french, les is a stopword Modèles would match because of the combining and \P{IsAlnum} would not detect that. --- C4/Search.pm | 6 +++++- t/db_dependent/Search.t | 28 ++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 t/db_dependent/Search.t diff --git a/C4/Search.pm b/C4/Search.pm index 9d690e0f97..c47b130d1f 100644 --- a/C4/Search.pm +++ b/C4/Search.pm @@ -27,6 +27,8 @@ use XML::Simple; use C4::Dates qw(format_date); use C4::XSLT; use C4::Branch; +use C4::Debug; +use YAML; use URI::Escape; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $DEBUG); @@ -645,10 +647,12 @@ sub _remove_stopwords { # we use IsAlpha unicode definition, to deal correctly with diacritics. # otherwise, a French word like "leçon" woudl be split into "le" "çon", "le" # is a stopword, we'd get "çon" and wouldn't find anything... +# foreach ( keys %{ C4::Context->stopwords } ) { next if ( $_ =~ /(and|or|not)/ ); # don't remove operators + $debug && warn "$_ Dump($operand)"; if ( my ($matched) = ($operand =~ - /(\P{IsAlnum}\Q$_\E\P{IsAlnum}|^\Q$_\E\P{IsAlnum}|\P{IsAlnum}\Q$_\E$|^\Q$_\E$)/gi) ) + /([^\X\p{isAlnum}]\Q$_\E[^\X\p{isAlnum}]|[^\X\p{isAlnum}]\Q$_\E$|^\Q$_\E[^\X\p{isAlnum}])/gi)) { $operand =~ s/\Q$matched\E/ /gi; push @stopwords_removed, $_; diff --git a/t/db_dependent/Search.t b/t/db_dependent/Search.t new file mode 100644 index 0000000000..00d5b7c84d --- /dev/null +++ b/t/db_dependent/Search.t @@ -0,0 +1,28 @@ +#!/usr/bin/perl +# +# This Koha test module is a stub! +# Add more tests here!!! 
+
+use strict;
+use warnings;
+use YAML;
+
+use C4::Debug;    # presumably the source of $debug used below -- confirm against C4::Debug
+use C4::Context;
+use C4::Search;
+
+use Test::More tests => 4;    # 1 use_ok + 2 unchanged-word checks + 1 stopword-removal check
+BEGIN { use_ok('C4::Search'); }
+
+# A stopword glued to an accented letter (le|çon, modè|les) must not be stripped.
+foreach my $string ("Leçon","mod\xc3\xa8les"){
+    my @results=C4::Search::_remove_stopwords($string,"kw");
+    $debug && warn "$string ",Dump(@results);
+    is($results[0],$string,"$string is not modified");
+}
+# A phrase holding a standalone stopword ("Les") must come back altered.
+foreach my $string ("Les chaussettes de l'archiduchesse"){
+    my @results=C4::Search::_remove_stopwords($string,"kw");
+    $debug && warn "$string ",Dump(@results);
+    isnt($results[0],$string,"$results[0] from $string");
+}
-- 
2.11.0