Word search with multi-part facets works properly only with Zebra ICU
tokenization. This patch adds a new question to the Koha command-line
installer:
Zebra has two methods to perform records tokenization
and characters normalization: CHR and ICU. ICU is
recommended for catalogs containing non-Latin
characters. (chr, icu) [chr]
How to test:
- perl ./Makefile.PL
- Try each possible value for the new parameter
- Take a look at zebradb/etc/default.idx file.
Depending on the parameter, you get this line:
icuchain words-icu.xml
or this one:
charmap word-phrase-utf.chr
Signed-off-by: Jared Camins-Esakov <jcamins@cpbibliography.com>
(Note: This patch was previously associated with bug 3216; I moved it to a
separate bug because including ICU is a good idea independent of the fix for
the particular issue described in bug 3216)
Signed-off-by: Paul Poulain <paul.poulain@biblibre.com>
'AUTH_INDEX_MODE' => 'dom',
'ZEBRA_MARC_FORMAT' => 'marc21',
'ZEBRA_LANGUAGE' => 'en',
'AUTH_INDEX_MODE' => 'dom',
'ZEBRA_MARC_FORMAT' => 'marc21',
'ZEBRA_LANGUAGE' => 'en',
+ 'ZEBRA_TOKENIZER' => 'chr',
'ZEBRA_USER' => 'kohauser',
'ZEBRA_PASS' => 'zebrastripes',
'ZEBRA_SRU_HOST' => 'localhost',
'ZEBRA_USER' => 'kohauser',
'ZEBRA_PASS' => 'zebrastripes',
'ZEBRA_SRU_HOST' => 'localhost',
'AUTH_INDEX_MODE' => { 'grs1' => 1, 'dom' => 1 },
'ZEBRA_MARC_FORMAT' => { 'marc21' => 1, 'normarc' => 1, 'unimarc' => 1 }, # FIXME should generate from contents of distributation
'ZEBRA_LANGUAGE' => { 'en' => 1, 'fr' => 1, 'nb' => 1 }, # FIXME should generate from contents of distribution
'AUTH_INDEX_MODE' => { 'grs1' => 1, 'dom' => 1 },
'ZEBRA_MARC_FORMAT' => { 'marc21' => 1, 'normarc' => 1, 'unimarc' => 1 }, # FIXME should generate from contents of distributation
'ZEBRA_LANGUAGE' => { 'en' => 1, 'fr' => 1, 'nb' => 1 }, # FIXME should generate from contents of distribution
+ 'ZEBRA_TOKENIZER' => { chr => 1, icu => 1 },
'RUN_DATABASE_TESTS' => { 'yes' => 1, 'no' => 1 },
'USE_MEMCACHED' => { 'yes' => 1, 'no' => 1 },
);
'RUN_DATABASE_TESTS' => { 'yes' => 1, 'no' => 1 },
'USE_MEMCACHED' => { 'yes' => 1, 'no' => 1 },
);
'rewrite-config.PL' => [
'blib/KOHA_CONF_DIR/koha-conf.xml',
'blib/KOHA_CONF_DIR/koha-httpd.conf',
'rewrite-config.PL' => [
'blib/KOHA_CONF_DIR/koha-conf.xml',
'blib/KOHA_CONF_DIR/koha-httpd.conf',
+ 'blib/ZEBRA_CONF_DIR/etc/default.idx',
'blib/MISC_DIR/koha-install-log'
],
'fix-perl-path.PL' => [ # this script ensures the correct shebang line for the platform installed on...
'blib'
'blib/MISC_DIR/koha-install-log'
],
'fix-perl-path.PL' => [ # this script ensures the correct shebang line for the platform installed on...
'blib'
};
if ($config{'INSTALL_ZEBRA'} eq "yes") {
};
if ($config{'INSTALL_ZEBRA'} eq "yes") {
+$config{ZEBRA_TOKENIZER_STMT} = $config{ZEBRA_TOKENIZER} eq 'icu'
+ ? 'icuchain words-icu.xml'
+ : 'charmap word-phrase-utf.chr';
+
my %test_suite_override_dirs = (
KOHA_CONF_DIR => ['etc'],
ZEBRA_CONF_DIR => ['etc', 'zebradb'],
my %test_suite_override_dirs = (
KOHA_CONF_DIR => ['etc'],
ZEBRA_CONF_DIR => ['etc', 'zebradb'],
you must specify the primary MARC format of the
records to be indexed by Zebra.
you must specify the primary MARC format of the
records to be indexed by Zebra.
-Koha provides Zebra configuration files for MARC 21
-and UNIMARC.
+Koha provides Zebra configuration files for MARC21,
+NORMARC and UNIMARC.
MARC format for Zebra indexing);
$msg .= _add_valid_values_disp('ZEBRA_MARC_FORMAT', $valid_values);
MARC format for Zebra indexing);
$msg .= _add_valid_values_disp('ZEBRA_MARC_FORMAT', $valid_values);
$config{'AUTH_INDEX_MODE'} = _get_value('AUTH_INDEX_MODE', $msg, $defaults->{'AUTH_INDEX_MODE'}, $valid_values, $install_log_values);
$msg = q(
$config{'AUTH_INDEX_MODE'} = _get_value('AUTH_INDEX_MODE', $msg, $defaults->{'AUTH_INDEX_MODE'}, $valid_values, $install_log_values);
$msg = q(
+Zebra has two methods to perform records tokenization
+and characters normalization: CHR and ICU. ICU is
+recommended for catalogs containing non-Latin
+characters.);
+
+ $msg .= _add_valid_values_disp('ZEBRA_TOKENIZER', $valid_values);
+ $config{'ZEBRA_TOKENIZER'} = _get_value('ZEBRA_TOKENIZER', $msg, $defaults->{'ZEBRA_TOKENIZER'}, $valid_values, $install_log_values);
+
+ $msg = q(
Please specify Zebra database user);
$config{'ZEBRA_USER'} = _get_value('ZEBRA_USER', $msg, $defaults->{'ZEBRA_USER'}, $valid_values, $install_log_values);
Please specify Zebra database user);
$config{'ZEBRA_USER'} = _get_value('ZEBRA_USER', $msg, $defaults->{'ZEBRA_USER'}, $valid_values, $install_log_values);
position 1
alwaysmatches 1
firstinfield 1
position 1
alwaysmatches 1
firstinfield 1
-charmap word-phrase-utf.chr
-#firstinfield 1
+__ZEBRA_TOKENIZER_STMT__
# Phrase index
# Used if completeness is 'complete {sub}field' (@attr 6=2, @attr 6=1)
# and structure is word/phrase/word-list/free-form-text/document-text
index p
completeness 1
# Phrase index
# Used if completeness is 'complete {sub}field' (@attr 6=2, @attr 6=1)
# and structure is word/phrase/word-list/free-form-text/document-text
index p
completeness 1
-charmap word-phrase-utf.chr
+__ZEBRA_TOKENIZER_STMT__
# URX (URL) index
# Used if structure=urx (@attr 4=104)
# URX (URL) index
# Used if structure=urx (@attr 4=104)
--- /dev/null
+<icu_chain locale="">
+ <transliterate rule="\'>\ "/>
+ <transliterate rule="[:Number:] { '-' > '' "/>
+ <transform rule="[:Control:] Any-Remove"/>
+ <tokenize rule="l"/>
+ <transform rule="[[:WhiteSpace:][:Punctuation:]] Remove"/>
+ <transform rule="NFD"/>
+ <transform rule="[:Nonspacing Mark:] Remove"/>
+ <transform rule="NFC"/>
+ <display/>
+ <casemap rule="l"/>
+</icu_chain>
'__ZEBRA_RUN_DIR__' => "$prefix/var/run/zebradb",
'__ZEBRA_MARC_FORMAT__' => 'marc21',
'__ZEBRA_LANGUAGE__' => 'en',
'__ZEBRA_RUN_DIR__' => "$prefix/var/run/zebradb",
'__ZEBRA_MARC_FORMAT__' => 'marc21',
'__ZEBRA_LANGUAGE__' => 'en',
+ '__ZEBRA_TOKENIZER_STMT__' => 'charmap word-phrase-utf.chr',
'__ZEBRA_AUTH_CFG__' => 'zebra-authorities.cfg',
'__AUTH_RETRIEVAL_CFG__' => 'retrieval-info-auth-grs1.xml',
"__MERGE_SERVER_HOST__" => $myhost,
'__ZEBRA_AUTH_CFG__' => 'zebra-authorities.cfg',
'__AUTH_RETRIEVAL_CFG__' => 'retrieval-info-auth-grs1.xml',
"__MERGE_SERVER_HOST__" => $myhost,