X-Git-Url: http://koha-dev.rot13.org:8081/gitweb/?a=blobdiff_plain;f=misc%2Fmigration_tools%2Frebuild_nozebra.pl;h=b6b82c5c8cf959fd29274b87f5c77d05914203e0;hb=5829cef6d837cee4fddb0b917520206c86518d06;hp=38922b01813184b021697185bb1794c30bf0ec22;hpb=cee40a741d4af031ca63aeb37686caeb93a0a593;p=koha_fer diff --git a/misc/migration_tools/rebuild_nozebra.pl b/misc/migration_tools/rebuild_nozebra.pl index 38922b0181..b6b82c5c8c 100755 --- a/misc/migration_tools/rebuild_nozebra.pl +++ b/misc/migration_tools/rebuild_nozebra.pl @@ -6,6 +6,7 @@ use C4::Biblio; use C4::AuthoritiesMarc; use strict; +#use warnings; FIXME - Bug 2505 # # script that fills the nozebra table # @@ -16,21 +17,58 @@ $|=1; # flushes output # limit for database dumping my $limit;# = "LIMIT 100"; my $directory; -my $skip_export; -my $keep_export; -my $reset; -my $biblios; +#my $skip_export; +#my $keep_export; +#my $reset; +#my $biblios; my $authorities; my $sysprefs; -GetOptions( - 'd:s' => \$directory, - 'reset' => \$reset, - 's' => \$skip_export, - 'k' => \$keep_export, - 'b' => \$biblios, - 'a' => \$authorities, - 's' => \$sysprefs, # rebuild 'NoZebraIndexes' syspref - ); +my $commit; +my $want_help; + +my $result = GetOptions( + 'd:s' => \$directory, +# 'reset' => \$reset, +# 's' => \$skip_export, # Not used and conflicts with 's' option some lines below for sysprefs!!! +# 'k' => \$keep_export, +# 'b' => \$biblios, +# 'a' => \$authorities, + 's' => \$sysprefs, # rebuild 'NoZebraIndexes' syspref + 'h|help' => \$want_help, + 'commit:f' => \$commit, + ); + +if (not $result or $want_help) { + print_usage(); + exit 0; +} + + +sub print_usage { + print <<_USAGE_; +$0: reindex MARC bibs and authorities if NOT using Zebra ("NoZebra"). + +Use this batch job to reindex all biblio and authority +records in your Koha database. This job is useful +only if you are NOT using Zebra ('NoZebra'); if you are +using the 'Zebra'mode, this job should NOT be used. + +Parameters: + -d Temporary directory for indexing. + If not specified, one is automatically + created. The export directory + is automatically deleted unless + you supply the -k switch. + + -s Rebuild "NoZebraIndexes" System Preference + + --help or -h show this message. +_USAGE_ +} # END of print_usage sub + + +my $commitnum = 1000; +$commitnum = $commit if ($commit) ; $directory = "export" unless $directory; my $dbh=C4::Context->dbh; @@ -58,28 +96,39 @@ if (!%index || $sysprefs ) { 'host-item' => '995a,995c',\" where variable='NoZebraIndexes'"); %index = GetNoZebraIndexes(); } elsif (C4::Context->preference('marcflavour') eq 'MARC21') { - $dbh->do("UPDATE systempreferences SET value=\" - 'title' => '245a,245b', - 'author' => '100a', - 'isbn' => '020a', - 'issn' => '022a', - 'biblionumber => '999c', - 'itemtype' => '942c', - 'publisher' => '260b', - 'date' => '260c', - 'note' => '500a', - 'subject' => '600a, 650a', - 'dewey' => '082', - 'bc' => '952p', - 'host-item' => '952a,952c',\" where variable='NoZebraIndexes'"); + $dbh->do("UPDATE systempreferences SET value=\" +'title' => '130a,210a,222a,240a,243a,245a,245b,246a,246b,247a,247b,250a,250b,440a,830a', +'author' => '100a,100b,100c,100d,110a,111a,111b,111c,111d,245c,700a,710a,711a,800a,810a,811a', +'isbn' => '020a', +'issn' => '022a', +'lccn' => '010a', +'biblionumber => '999c', +'itemtype' => '942c', +'publisher' => '260b', +'date' => '260c', +'note' => '500a, 501a,504a,505a,508a,511a,518a,520a,521a,522a,524a,526a,530a,533a,538a,541a,546a,555a,556a,562a,563a,583a,585a,582a', +'subject' => '600*,610*,611*,630*,650*,651*,653*,654*,655*,662*,690*', +'dewey' => '082', +'bc' => '952p', +'callnum' => '952o', +'an' => '6009,6109,6119', +'series' => 440*,490*, +'host-item' => '9529 +'shelf' => '952c', +'collection' => '9528', +\"WHERE variable='NoZebraIndexes'"); + %index = GetNoZebraIndexes(); } } $|=1; +$dbh->{AutoCommit} = 0; + print "***********************************\n"; print "***** building BIBLIO indexes *****\n"; print "***********************************\n"; + my $sth; $sth=$dbh->prepare("select biblionumber from biblioitems order by biblionumber $limit"); $sth->execute(); @@ -98,11 +147,11 @@ while (my ($biblionumber) = $sth->fetchrow) { } next unless $record; # get title of the record (to store the 10 first letters with the index) - my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title'); + my ($titletag,$titlesubfield) = GetMarcFromKohaField('biblio.title', ''); my $title = lc($record->subfield($titletag,$titlesubfield)); # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values - $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g; + $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|=|://g; # limit to 10 char, should be enough, and limit the DB size $title = substr($title,0,10); #parse each field @@ -116,7 +165,7 @@ while (my ($biblionumber) = $sth->fetchrow) { # check each index to see if the subfield is stored somewhere # otherwise, store it in __RAW__ index foreach my $key (keys %index) { - if ($index{$key} =~ /$tag\*/ or $index{$key} =~ /$tag$subfieldcode/) { + if ($index{$key} =~ /\Q$tag\E\*/ or $index{$key} =~ /\Q$tag$subfieldcode\E/) { $indexed=1; my $line= lc $subfield->[1]; # remove meaningless value in the field... @@ -126,9 +175,9 @@ while (my ($biblionumber) = $sth->fetchrow) { next unless $_; # skip empty values (multiple spaces) # remove any accented char # if the entry is already here, improve weight - if ($result{$key}->{"$_"} =~ /$biblionumber,$title\-(\d);/) { + if ($result{$key}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) { my $weight=$1+1; - $result{$key}->{"$_"} =~ s/$biblionumber,$title\-(\d);//; + $result{$key}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//; $result{$key}->{"$_"} .= "$biblionumber,$title-$weight;"; # otherwise, create it, with weight=1 } else { @@ -144,10 +193,10 @@ while (my ($biblionumber) = $sth->fetchrow) { foreach (split / /,$line) { next unless $_; # warn $record->as_formatted."$_ =>".$title; - if ($result{__RAW__}->{"$_"} =~ /$biblionumber,$title\-(\d);/) { + if ($result{__RAW__}->{"$_"} =~ /$biblionumber,\Q$title\E\-(\d);/) { my $weight=$1+1; # $weight++; - $result{__RAW__}->{"$_"} =~ s/$biblionumber,$title\-(\d);//; + $result{__RAW__}->{"$_"} =~ s/$biblionumber,\Q$title\E\-(\d);//; $result{__RAW__}->{"$_"} .= "$biblionumber,$title-$weight;"; } else { $result{__RAW__}->{"$_"}.="$biblionumber,$title-1;"; @@ -157,9 +206,15 @@ while (my ($biblionumber) = $sth->fetchrow) { } } } + + print "\nInserting records...\n"; $i=0; -my $sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('biblioserver',?,?,?)"); + +my $commitnum = 100; +$dbh->{AutoCommit} = 0; + +$sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('biblioserver',?,?,?)"); foreach my $key (keys %result) { foreach my $index (keys %{$result{$key}}) { if (length($result{$key}->{$index}) > 1000000) { @@ -168,19 +223,23 @@ foreach my $key (keys %result) { print "\r$i"; $i++; $sth->execute($key,$index,$result{$key}->{$index}); + $dbh->commit() if (0 == $i % $commitnum); } + $dbh->commit() if (0 == $i % $commitnum); } +$dbh->commit; + + print "\nbiblios done\n"; print "\n***********************************\n"; print "***** building AUTHORITIES indexes *****\n"; print "***********************************\n"; -my $sth; $sth=$dbh->prepare("select authid from auth_header order by authid $limit"); $sth->execute(); -my $i=0; -my %result; +$i=0; +%result = (); while (my ($authid) = $sth->fetchrow) { $i++; print "\r$i"; @@ -195,7 +254,7 @@ while (my ($authid) = $sth->fetchrow) { my %index; # for authorities, the "title" is the $a mainentry - my $authref = C4::AuthoritiesMarc::GetAuthType($record->subfield(152,'b')); + my $authref = C4::AuthoritiesMarc::GetAuthType(C4::AuthoritiesMarc::GetAuthTypeCode($authid)); warn "ERROR : authtype undefined for ".$record->as_formatted unless $authref; my $title = $record->subfield($authref->{auth_tag_to_report},'a'); @@ -204,7 +263,7 @@ while (my ($authid) = $sth->fetchrow) { $index{'auth_type'} = '152b'; # remove blancks comma (that could cause problem when decoding the string for CQL retrieval) and regexp specific values - $title =~ s/ |,|;|\[|\]|\(|\)|\*|-|'|=//g; + $title =~ s/ |\.|,|;|\[|\]|\(|\)|\*|-|'|:|=//g; $title = quotemeta $title; # limit to 10 char, should be enough, and limit the DB size $title = substr($title,0,10); @@ -259,9 +318,15 @@ while (my ($authid) = $sth->fetchrow) { } } } + + + print "\nInserting...\n"; $i=0; -my $sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('authorityserver',?,?,?)"); + +my $commitnum = 100; +$dbh->{AutoCommit} = 0; +$sth = $dbh->prepare("INSERT INTO nozebra (server,indexname,value,biblionumbers) VALUES ('authorityserver',?,?,?)"); foreach my $key (keys %result) { foreach my $index (keys %{$result{$key}}) { if (length($result{$key}->{$index}) > 1000000) { @@ -270,6 +335,9 @@ foreach my $key (keys %result) { print "\r$i"; $i++; $sth->execute($key,$index,$result{$key}->{$index}); + $dbh->commit() if (0 == $i % $commitnum); } + $dbh->commit() if (0 == $i % $commitnum); } +$dbh->commit; print "\nauthorities done\n";