bulkmarcimport.pl: XML input option documented

[koha_gimpoz] / misc / migration_tools / bulkmarcimport.pl
diff --git a/misc/migration_tools/bulkmarcimport.pl b/misc/migration_tools/bulkmarcimport.pl

index 7033668..d6b8d40 100755 (executable)
--- a/misc/migration_tools/bulkmarcimport.pl
+++ b/misc/migration_tools/bulkmarcimport.pl
@@ -12,25 +12,14 @@ BEGIN {
  
  # Koha modules used
  use MARC::File::USMARC;
-# Uncomment the line below and use MARC::File::XML again when it works better.
-# -- thd
-# use MARC::File::XML;
+use MARC::File::XML;
  use MARC::Record;
  use MARC::Batch;
  use MARC::Charset;
  
-# According to kados, an undocumented feature of setting MARC::Charset to 
-# ignore_errors(1) is that errors are not ignored.  Instead of deleting the 
-# whole subfield when a character does not translate properly from MARC8 into 
-# UTF-8, just the problem characters are deleted.  This should solve at least 
-# some of the fixme problems for fMARC8ToUTF8().
-# 
-# Problems remain if there are MARC 21 records where 000/09 is set incorrectly. 
-# -- thd.
-# MARC::Charset->ignore_errors(1);
-
  use C4::Context;
  use C4::Biblio;
+use C4::Charset;
  use C4::Items;
  use Unicode::Normalize;
  use Time::HiRes qw(gettimeofday);
@@ -40,7 +29,7 @@ binmode(STDOUT, ":utf8");
  use Getopt::Long;
  
  my ( $input_marc_file, $number) = ('',0);
-my ($version, $delete, $test_parameter, $skip_marc8_conversion, $char_encoding, $verbose, $commit, $fk_off);
+my ($version, $delete, $test_parameter, $skip_marc8_conversion, $char_encoding, $verbose, $commit, $fk_off,$format);
  
  $|=1;
  
@@ -55,101 +44,9 @@ GetOptions(
      'c:s' => \$char_encoding,
      'v:s' => \$verbose,
      'fk' => \$fk_off,
+    'm:s' => \$format,
  );
  
-# FIXME:  Management of error conditions needed for record parsing problems
-# and MARC8 character sets with mappings to Unicode not yet included in 
-# MARC::Charset.  The real world rarity of these problems is not fully tested.
-# Unmapped character sets will throw a warning currently and processing will 
-# continue with the error condition.  A fairly trivial correction should 
-# address some record parsing and unmapped character set problems but I need 
-# time to implement a test and correction for undef subfields and revert to 
-# MARC8 if mappings are missing. -- thd
-sub fMARC8ToUTF8($$) {
-    my ($record) = shift;
-    my ($verbose) = shift;
-    
-    foreach my $field ($record->fields()) {
-        if ($field->is_control_field()) {
-            ; # do nothing -- control fields should not contain non-ASCII characters
-        } else {
-            my @subfieldsArray;
-            my $fieldName = $field->tag();
-            my $indicator1Value = $field->indicator(1);
-            my $indicator2Value = $field->indicator(2);
-            foreach my $subfield ($field->subfields()) {
-                my $subfieldName = $subfield->[0];
-                my $subfieldValue = $subfield->[1];
-                my $utf8sf = MARC::Charset::marc8_to_utf8($subfieldValue);
-                unless (defined $utf8sf) {
-                    # For now, we're being very strict about
-                    # error during the MARC8 conversion, so return
-                    # if there's a problem.
-                    return;
-                }
-                $subfieldValue = NFC($utf8sf); # Normalization Form C to assist
-                                               # some browswers (e.g., Firefox on OS X)
-                                               # that have issues with decomposed characters
-                                               # in certain fonts.
-    
-                # Alas, MARC::Field::update() does not work correctly.
-                ## push (@subfieldsArray, $subfieldName, $subfieldValue);
-    
-                push @subfieldsArray, [$subfieldName, $subfieldValue];
-            }
-    
-            # Alas, MARC::Field::update() does not work correctly.
-            #
-            # The first instance in the field of a of a repeated subfield
-            # overwrites the content from later instances with the content
-            # from the first instance.
-            ## $field->update(@subfieldsArray);
-    
-            foreach my $subfieldRow(@subfieldsArray) {
-                my $subfieldName = $subfieldRow->[0];
-                $field->delete_subfields($subfieldName);
-            }
-            foreach my $subfieldRow(@subfieldsArray) {
-                $field->add_subfields(@$subfieldRow);
-            }
-    
-            if ($verbose) {
-                if ($verbose >= 2) {
-                    # Reading the indicator values again is not necessary.
-                    # They were not converted.
-                    # $indicator1Value = $field->indicator(1);
-                    # $indicator2Value = $field->indicator(2);
-                    # $indicator1Value =~ s/ /#/;
-                    # $indicator2Value =~ s/ /#/;
-                    print "\nCONVERTED TO UTF-8:\n" . $fieldName . ' ' .
-                            $indicator1Value .
-                    $indicator2Value;
-                    foreach my $subfield ($field->subfields()) {
-                        my $subfieldName = $subfield->[0];
-                        my $subfieldValue = $subfield->[1];
-                        print " \$" . $subfieldName . ' ' . $subfieldValue;
-                    }
-                }
-            }
-            if ($verbose) {
-                if ($verbose >= 2) {
-                    print "\n" if $verbose;
-                }
-            }
-        }
-    }
-
-    # must set Leader/09 to 'a' to indicate that
-    # record is now in UTF-8
-    my $leader = $record->leader();
-    substr($leader, 9, 1) = 'a';
-    $record->leader($leader);
-
-    $record->encoding('UTF-8');
-    return 1;
-}
-
-
  if ($version || ($input_marc_file eq '')) {
      print <<EOF
  small script to import an iso2709 file into Koha.
@@ -167,6 +64,7 @@ parameters :
  \tsupported. MARC21 by default.
  \td : delete EVERYTHING related to biblio in koha-DB before import  :tables :
  \t\tbiblio, \tbiblioitems,\titems
+\tm : format, MARCXML or ISO2709 (defaults to ISO2709)
  IMPORTANT : don't use this script before you've entered and checked your MARC parameters tables twice (or more!).
  Otherwise, the import won't work correctly and you will get invalid data.
  
@@ -185,6 +83,11 @@ my $dbh = C4::Context->dbh;
  my $CataloguingLog = C4::Context->preference('CataloguingLog');
  $dbh->do("UPDATE systempreferences SET value=0 WHERE variable='CataloguingLog'");
  
+if ($fk_off) {
+       $dbh->do("SET FOREIGN_KEY_CHECKS = 0");
+}
+
+
  if ($delete) {
      print "deleting biblios\n";
      $dbh->do("truncate biblio");
@@ -192,9 +95,9 @@ if ($delete) {
      $dbh->do("truncate items");
      $dbh->do("truncate zebraqueue");
  }
-if ($fk_off) {
-       $dbh->do("SET FOREIGN_KEY_CHECKS = 0");
-}
+
+
+
  if ($test_parameter) {
      print "TESTING MODE ONLY\n    DOING NOTHING\n===============\n";
  }
@@ -203,7 +106,22 @@ my $marcFlavour = C4::Context->preference('marcflavour') || 'MARC21';
  
  print "Characteristic MARC flavour: $marcFlavour\n" if $verbose;
  my $starttime = gettimeofday;
-my $batch = MARC::Batch->new( 'USMARC', $input_marc_file );
+my $batch;
+if ($format =~ /XML/i) {
+    # ugly hack follows -- MARC::File::XML, when used by MARC::Batch,
+    # appears to try to convert incoming XML records from MARC-8
+    # to UTF-8.  Setting the BinaryEncoding key turns that off.
+    # TODO: see what happens to ISO-8859-1 XML files.
+    # TODO: determine if MARC::Batch can be fixed to handle
+    #       XML records properly -- it probably should
+    #       be using a proper push or pull XML parser to
+    #       extract the records, not using regexes to look
+    #       for <record>.*</record>.
+    $MARC::File::XML::_load_args{BinaryEncoding} = 'utf-8';
+    $batch = MARC::Batch->new( 'XML', $input_marc_file );
+} else {
+    $batch = MARC::Batch->new( 'USMARC', $input_marc_file );
+}
  $batch->warnings_off();
  $batch->strict_off();
  my $i=0;
@@ -222,7 +140,10 @@ RECORD: while ( my $record = $batch->next() ) {
      print "\r$i" unless $i % 100;
  
      if ($record->encoding() eq 'MARC-8' and not $skip_marc8_conversion) {
-        unless (fMARC8ToUTF8($record, $verbose)) {
+        # FIXME update condition
+        my ($guessed_charset, $charset_errors);
+        ($record, $guessed_charset, $charset_errors) = MarcToUTF8Record($record, $marcFlavour);
+        if ($guessed_charset eq 'failed') {
              warn "ERROR: failed to perform character conversion for record $i\n";
              next RECORD;            
          }