X-Git-Url: http://koha-dev.rot13.org:8081/gitweb/?a=blobdiff_plain;f=misc%2Fmigration_tools%2Fbulkmarcimport.pl;h=a42bd720bcd4912817bdf79e0847a7b3fff152aa;hb=f8e9fb6445dadbdef91e13a253c9c6bcb75e0eec;hp=04b30b847b4fb8705f832be15435dd5ddd4b93db;hpb=fafe0896d6e4c670b7f0173d5ce0765273d97e8a;p=srvgit diff --git a/misc/migration_tools/bulkmarcimport.pl b/misc/migration_tools/bulkmarcimport.pl index 04b30b847b..a42bd720bc 100644 --- a/misc/migration_tools/bulkmarcimport.pl +++ b/misc/migration_tools/bulkmarcimport.pl @@ -2,24 +2,42 @@ # small script that import an iso2709 file into koha 2.0 use strict; +# use warnings; # Koha modules used use MARC::File::USMARC; +# Uncomment the line below and use MARC::File::XML again when it works better. +# -- thd +# use MARC::File::XML; use MARC::Record; use MARC::Batch; +use MARC::Charset; + +# According to kados, an undocumented feature of setting MARC::Charset to +# ignore_errors(1) is that errors are not ignored.  Instead of deleting the +# whole subfield when a character does not translate properly from MARC8 into +# UTF-8, just the problem characters are deleted.  This should solve at least +# some of the fixme problems for fMARC8ToUTF8(). +# +# Problems remain if there are MARC 21 records where 000/09 is set incorrectly. +# -- thd. +# MARC::Charset->ignore_errors(1); + use C4::Context; use C4::Biblio; use Time::HiRes qw(gettimeofday); - use Getopt::Long; +binmode(STDOUT, ":utf8"); -my $Zconn = C4::Context->Zconn or die "unable to set Zconn"; +use Getopt::Long; my ( $input_marc_file, $number) = ('',0); my ($version, $delete, $test_parameter,$char_encoding, $verbose, $commit); +$|=1; + GetOptions( - 'commit:f' => \$commit, + 'commit:f' => \$commit, 'file:s' => \$input_marc_file, 'n:f' => \$number, 'h' => \$version, @@ -29,8 +47,112 @@ GetOptions( 'v:s' => \$verbose, ); +# FIXME: Management of error conditions needed for record parsing problems +# and MARC8 character sets with mappings to Unicode not yet included in +# MARC::Charset. The real world rarity of these problems is not fully tested. +# Unmapped character sets will throw a warning currently and processing will +# continue with the error condition. A fairly trivial correction should +# address some record parsing and unmapped character set problems but I need +# time to implement a test and correction for undef subfields and revert to +# MARC8 if mappings are missing. -- thd +sub fMARC8ToUTF8($$) { + my ($record) = shift; + my ($verbose) = shift; + if ($verbose) { + if ($verbose >= 2) { + my $leader = $record->leader(); + $leader =~ s/ /#/g; + print "\n000 " . $leader; + } + } + foreach my $field ($record->fields()) { + if ($field->is_control_field()) { + if ($verbose) { + if ($verbose >= 2) { + my $fieldName = $field->tag(); + my $fieldValue = $field->data(); + $fieldValue =~ s/ /#/g; + print "\n" . $fieldName; + print ' ' . $fieldValue; + } + } + } else { + my @subfieldsArray; + my $fieldName = $field->tag(); + my $indicator1Value = $field->indicator(1); + my $indicator2Value = $field->indicator(2); + if ($verbose) { + if ($verbose >= 2) { + $indicator1Value =~ s/ /#/; + $indicator2Value =~ s/ /#/; + print "\n" . $fieldName . ' ' . + $indicator1Value . + $indicator2Value; + } + } + foreach my $subfield ($field->subfields()) { + my $subfieldName = $subfield->[0]; + my $subfieldValue = $subfield->[1]; + $subfieldValue = MARC::Charset::marc8_to_utf8($subfieldValue); + + # Alas, MARC::Field::update() does not work correctly. + ## push (@subfieldsArray, $subfieldName, $subfieldValue); + + push @subfieldsArray, [$subfieldName, $subfieldValue]; + if ($verbose) { + if ($verbose >= 2) { + print " \$" . $subfieldName . ' ' . $subfieldValue; + } + } + } + + # Alas, MARC::Field::update() does not work correctly. + # + # The first instance in the field of a of a repeated subfield + # overwrites the content from later instances with the content + # from the first instance. + ## $field->update(@subfieldsArray); + + foreach my $subfieldRow(@subfieldsArray) { + my $subfieldName = $subfieldRow->[0]; + $field->delete_subfields($subfieldName); + } + foreach my $subfieldRow(@subfieldsArray) { + $field->add_subfields(@$subfieldRow); + } + + if ($verbose) { + if ($verbose >= 2) { + # Reading the indicator values again is not necessary. + # They were not converted. + # $indicator1Value = $field->indicator(1); + # $indicator2Value = $field->indicator(2); + # $indicator1Value =~ s/ /#/; + # $indicator2Value =~ s/ /#/; + print "\nCONVERTED TO UTF-8:\n" . $fieldName . ' ' . + $indicator1Value . + $indicator2Value; + foreach my $subfield ($field->subfields()) { + my $subfieldName = $subfield->[0]; + my $subfieldValue = $subfield->[1]; + print " \$" . $subfieldName . ' ' . $subfieldValue; + } + } + } + if ($verbose) { + if ($verbose >= 2) { + print "\n" if $verbose; + } + } + } + } + $record->encoding('UTF-8'); + return $record; +} + + if ($version || ($input_marc_file eq '')) { - print <dbh; if ($delete) { - print "deleting biblios\n"; - $dbh->do("delete from biblio"); - $dbh->do("delete from biblioitems"); - $dbh->do("delete from items"); - $dbh->do("delete from bibliosubject"); - $dbh->do("delete from additionalauthors"); - $dbh->do("delete from bibliosubtitle"); - $dbh->do("delete from marc_biblio"); - $dbh->do("delete from marc_subfield_table"); - $dbh->do("delete from marc_word"); - $dbh->do("delete from marc_blob_subfield"); + print "deleting biblios\n"; + $dbh->do("delete from biblio"); + $dbh->do("delete from biblioitems"); + $dbh->do("delete from items"); } if ($test_parameter) { - print "TESTING MODE ONLY\n DOING NOTHING\n===============\n"; + print "TESTING MODE ONLY\n DOING NOTHING\n===============\n"; } -$char_encoding = 'MARC21' unless ($char_encoding); -print "CHAR : $char_encoding\n" if $verbose; +my $marcFlavour = C4::Context->preference('marcflavour') || 'MARC21'; + +print "Characteristic MARC flavour: $marcFlavour\n" if $verbose; +# die; my $starttime = gettimeofday; my $batch = MARC::Batch->new( 'USMARC', $input_marc_file ); $batch->warnings_off(); @@ -93,84 +211,93 @@ $commitnum = $commit; my ($tagfield,$tagsubfield) = &MARCfind_marc_from_kohafield($dbh,"items.itemnumber",''); # $dbh->do("lock tables biblio write, biblioitems write, items write, marc_biblio write, marc_subfield_table write, marc_blob_subfield write, marc_word write, marc_subfield_structure write, stopwords write"); while ( my $record = $batch->next() ) { -warn "I:".$i; -warn "NUM:".$number; - $i++; - - if ($i==$number) { - z3950_extended_services($Zconn,'commit',set_service_options('commit')); - print "COMMIT OPERATION SUCCESSFUL\n"; - - my $timeneeded = gettimeofday - $starttime; - die "$i MARC records imported in $timeneeded seconds\n"; - } - # perform the commit operation ever so often - if ($i==$commit) { - z3950_extended_services($Zconn,'commit',set_service_options('commit')); - $commit+=$commitnum; - print "COMMIT OPERATION SUCCESSFUL\n"; - } - #now, parse the record, extract the item fields, and store them in somewhere else. +# warn "=>".$record->as_formatted; +# warn "I:".$i; +# warn "NUM:".$number; + $i++; + print "."; + print "\r$i" unless $i % 100; +# if ($i==$number) { +# z3950_extended_services('commit',set_service_options('commit')); +# print "COMMIT OPERATION SUCCESSFUL\n"; +# +# my $timeneeded = gettimeofday - $starttime; +# die "$i MARC records imported in $timeneeded seconds\n"; +# } +# # perform the commit operation ever so often +# if ($i==$commit) { +# z3950_extended_services('commit',set_service_options('commit')); +# $commit+=$commitnum; +# print "COMMIT OPERATION SUCCESSFUL\n"; +# } + #now, parse the record, extract the item fields, and store them in somewhere else. ## create an empty record object to populate my $newRecord = MARC::Record->new(); - $newRecord->leader($record->leader()); + $newRecord->leader($record->leader()); # go through each field in the existing record foreach my $oldField ( $record->fields() ) { - # just reproduce tags < 010 in our new record - if ( $oldField->tag() < 10 ) { - $newRecord->append_fields( $oldField ); - next(); - } + # just reproduce tags < 010 in our new record + # + # Fields are not necessarily only numeric in the actual world of records + # nor in what I would recommend for additonal safe non-interfering local + # use fields. The following regular expression match is much safer than + # a numeric evaluation. -- thd + if ( $oldField->tag() =~ m/^00/ ) { + $newRecord->append_fields( $oldField ); + next(); + } - # store our new subfield data in this list - my @newSubfields = (); + # store our new subfield data in this list + my @newSubfields = (); - # go through each subfield code/data pair - foreach my $pair ( $oldField->subfields() ) { - #$pair->[1] =~ s/\[1] =~ s/\>//g; - push( @newSubfields, $pair->[0], $pair->[1] ); #char_decode($pair->[1],$char_encoding) ); - } + # go through each subfield code/data pair + foreach my $pair ( $oldField->subfields() ) { + #$pair->[1] =~ s/\[1] =~ s/\>//g; + push( @newSubfields, $pair->[0], $pair->[1] ); #char_decode($pair->[1],$char_encoding) ); + } - # add the new field to our new record - my $newField = MARC::Field->new( - $oldField->tag(), - $oldField->indicator(1), - $oldField->indicator(2), - @newSubfields - ); + # add the new field to our new record + my $newField = MARC::Field->new( + $oldField->tag(), + $oldField->indicator(1), + $oldField->indicator(2), + @newSubfields + ); - $newRecord->append_fields( $newField ); + $newRecord->append_fields( $newField ); } - warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2; - my @fields = $newRecord->field($tagfield); - my @items; - my $nbitems=0; - - foreach my $field (@fields) { - my $item = MARC::Record->new(); - $item->append_fields($field); - push @items,$item; - $newRecord->delete_field($field); - $nbitems++; - } - print "$i : $nbitems items found\n" if $verbose; - # now, create biblio and items with NEWnewXX call. - unless ($test_parameter) { - my ($bibid,$oldbibnum,$oldbibitemnum) = NEWnewbiblio($dbh,$Zconn,$newRecord,''); - warn "ADDED biblio NB $bibid in DB\n" if $verbose; - for (my $i=0;$i<=$#items;$i++) { - NEWnewitem($dbh,$Zconn,$items[$i],$bibid); - } - } + warn "$i ==>".$newRecord->as_formatted() if $verbose eq 2; + my @fields = $newRecord->field($tagfield); + my @items; + my $nbitems=0; + + foreach my $field (@fields) { + my $item = MARC::Record->new(); + $item->append_fields($field); + push @items,$item; + $newRecord->delete_field($field); + $nbitems++; + } + print "$i : $nbitems items found\n" if $verbose; + # now, create biblio and items with Addbiblio call. + unless ($test_parameter) { + warn "NEWREC : ".$newRecord->as_formatted; + my ($bibid,$oldbibitemnum) = AddBiblio($newRecord,''); + warn "ADDED biblio NB $bibid in DB\n" if $verbose; + for (my $i=0;$i<=$#items;$i++) { +# warn "here is the biblioitemnumber $oldbibitemnum"; + AddItem($items[$i],$bibid,$oldbibitemnum); + } + } } # final commit of the changes -z3950_extended_services($Zconn,'commit',set_service_options('commit')); +z3950_extended_services('commit',set_service_options('commit')); print "COMMIT OPERATION SUCCESSFUL\n"; my $timeneeded = gettimeofday - $starttime;