Merge remote-tracking branch 'kc/new/bug_6351' into kcmaster

[koha_fer] / misc / migration_tools / rebuild_zebra.pl
diff --git a/misc/migration_tools/rebuild_zebra.pl b/misc/migration_tools/rebuild_zebra.pl

index fa61dea..dadf43f 100755 (executable)
--- a/misc/migration_tools/rebuild_zebra.pl
+++ b/misc/migration_tools/rebuild_zebra.pl
@@ -1,6 +1,7 @@
  #!/usr/bin/perl
  
  use strict;
+#use warnings; FIXME - Bug 2505
  
  use C4::Context;
  use Getopt::Long;
@@ -8,6 +9,7 @@ use File::Temp qw/ tempdir /;
  use File::Path;
  use C4::Biblio;
  use C4::AuthoritiesMarc;
+use C4::Items;
  
  # 
  # script that checks zebradir structure & create directories & mandatory files if needed
@@ -15,9 +17,11 @@ use C4::AuthoritiesMarc;
  #
  
  $|=1; # flushes output
-
-# limit for database dumping
+# If the cron job starts us in an unreadable dir, we will break without
+# this.
+chdir $ENV{HOME} if (!(-r '.'));
  my $directory;
+my $nosanitize;
  my $skip_export;
  my $keep_export;
  my $reset;
@@ -28,11 +32,16 @@ my $noshadow;
  my $do_munge;
  my $want_help;
  my $as_xml;
+my $process_zebraqueue;
+my $do_not_clear_zebraqueue;
+my $verbose_logging;
+my $zebraidx_log_opt = " -v none,fatal,warn ";
  my $result = GetOptions(
      'd:s'           => \$directory,
-    'reset'         => \$reset,
+    'r|reset'       => \$reset,
      's'             => \$skip_export,
      'k'             => \$keep_export,
+    'nosanitize'    => \$nosanitize,
      'b'             => \$biblios,
      'noxml'         => \$noxml,
      'w'             => \$noshadow,
@@ -40,6 +49,9 @@ my $result = GetOptions(
      'a'             => \$authorities,
      'h|help'        => \$want_help,
         'x'                             => \$as_xml,
+    'y'             => \$do_not_clear_zebraqueue,
+    'z'             => \$process_zebraqueue,
+    'v'             => \$verbose_logging,
  );
  
  
@@ -49,7 +61,31 @@ if (not $result or $want_help) {
  }
  
  if (not $biblios and not $authorities) {
-    my $msg = "Must specify -b or -a to reindex bibs or authorites\n";
+    my $msg = "Must specify -b or -a to reindex bibs or authorities\n";
+    $msg   .= "Please do '$0 --help' to see usage.\n";
+    die $msg;
+}
+
+if ($authorities and $as_xml) {
+    my $msg = "Cannot specify both -a and -x\n";
+    $msg   .= "Please do '$0 --help' to see usage.\n";
+    die $msg;
+}
+
+# -nosanitize writes the stored MARCXML to the export file unchanged (see
+# export_marc_records_from_sth), while the format handed to zebraidx is
+# chosen from -x alone.  The option is therefore only usable together with -x.
+if ( !$as_xml and $nosanitize ) {
+    my $msg = "Cannot specify -nosanitize without -x\n";
+    $msg   .= "Please do '$0 --help' to see usage.\n";
+    die $msg;
+}
+
+if ($process_zebraqueue and ($skip_export or $reset)) {
+    my $msg = "Cannot specify -r or -s if -z is specified\n";
+    $msg   .= "Please do '$0 --help' to see usage.\n";
+    die $msg;
+}
+
+if ($process_zebraqueue and $do_not_clear_zebraqueue) {
+    my $msg = "Cannot specify both -y and -z\n";
      $msg   .= "Please do '$0 --help' to see usage.\n";
      die $msg;
  }
@@ -57,6 +93,13 @@ if (not $biblios and not $authorities) {
  if ($noshadow) {
      $noshadow = ' -n ';
  }
+
+# $zebraidx_log_opt starts out as " -v none,fatal,warn ", i.e. zebraidx is
+# asked to log very little.  Our own -v switch works the other way round: it
+# removes that restriction by clearing the option string.
+$zebraidx_log_opt = '' if $verbose_logging;
+
  my $use_tempdir = 0;
  unless ($directory) {
      $use_tempdir = 1;
@@ -72,355 +115,474 @@ my $dbh = C4::Context->dbh;
  my ($biblionumbertagfield,$biblionumbertagsubfield) = &GetMarcFromKohaField("biblio.biblionumber","");
  my ($biblioitemnumbertagfield,$biblioitemnumbertagsubfield) = &GetMarcFromKohaField("biblioitems.biblioitemnumber","");
  
-print "Zebra configuration information\n";
-print "================================\n";
-print "Zebra biblio directory      = $biblioserverdir\n";
-print "Zebra authorities directory = $authorityserverdir\n";
-print "Koha directory              = $kohadir\n";
-print "BIBLIONUMBER in :     $biblionumbertagfield\$$biblionumbertagsubfield\n";
-print "BIBLIOITEMNUMBER in : $biblioitemnumbertagfield\$$biblioitemnumbertagsubfield\n";
-print "================================\n";
+if ( $verbose_logging ) {
+    print "Zebra configuration information\n";
+    print "================================\n";
+    print "Zebra biblio directory      = $biblioserverdir\n";
+    print "Zebra authorities directory = $authorityserverdir\n";
+    print "Koha directory              = $kohadir\n";
+    print "BIBLIONUMBER in :     $biblionumbertagfield\$$biblionumbertagsubfield\n";
+    print "BIBLIOITEMNUMBER in : $biblioitemnumbertagfield\$$biblioitemnumbertagsubfield\n";
+    print "================================\n";
+}
  
  if ($do_munge) {
      munge_config();
  }
  
  if ($authorities) {
-    #
-    # exporting authorities
-    #
-    if ($skip_export) {
-        print "====================\n";
-        print "SKIPPING authorities export\n";
-        print "====================\n";
+    index_records('authority', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $authorityserverdir);
+} else {
+    print "skipping authorities\n" if ( $verbose_logging );
+}
+
+if ($biblios) {
+    index_records('biblio', $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $biblioserverdir);
+} else {
+    print "skipping biblios\n" if ( $verbose_logging );
+}
+
+
+if ( $verbose_logging ) {
+    print "====================\n";
+    print "CLEANING\n";
+    print "====================\n";
+}
+if ($keep_export) {
+    print "NOTHING cleaned : the export $directory has been kept.\n";
+    print "You can re-run this script with the -s ";
+    if ($use_tempdir) {
+        print " and -d $directory parameters";
      } else {
+        print "parameter";
+    }
+    print "\n";
+    print "if you just want to rebuild zebra after changing the record.abs\n";
+    print "or another zebra config file\n";
+} else {
+    unless ($use_tempdir) {
+        # if we're using a temporary directory
+        # created by File::Temp, it will be removed
+        # automatically.
+        rmtree($directory, 0, 1);
+        print "directory $directory deleted\n";
+    }
+}
+
+# Make sure the directory layout expected under a zebra server directory
+# exists: the directory itself plus its 'key', 'register' and 'shadow'
+# subdirectories.  Every missing directory is created and reported on STDOUT.
+#
+# Parameters:
+#   $base - path of the zebra server directory to check
+#
+# Returns 1 if at least one directory had to be created, 0 otherwise.
+# Dies if a missing directory cannot be created.
+sub check_zebra_dirs {
+    my ($base) = shift() . '/';
+    my $needed_repairing = 0;
+    my @dirs = ( '', 'key', 'register', 'shadow' );
+    foreach my $dir (@dirs) {
+        my $bdir = $base . $dir;
+        if (! -d $bdir) {
+            $needed_repairing = 1;
+            # 'or', not '||': '||' binds more tightly than the mkdir call and
+            # would attach to $bdir, so a failed mkdir would never reach die.
+            mkdir $bdir or die "Unable to create '$bdir': $!\n";
+            print "$0: needed to create '$bdir'\n";
+        }
+    }
+    return $needed_repairing;
+}      # ----------  end of subroutine check_zebra_dirs  ----------
+
+# Export one kind of record into $directory (unless told to skip the export)
+# and hand the exported files to do_indexing().
+#
+# Parameters (positional):
+#   $record_type             - 'biblio' or 'authority'
+#   $directory               - export directory; per-type subdirectories are created in it
+#   $skip_export             - true: export nothing, only index what is already in $directory
+#   $process_zebraqueue      - true: only handle the pending zebraqueue entries
+#                              (deleted records first, then updated records)
+#   $as_xml                  - true: records are written and indexed as 'marcxml', else 'iso2709'
+#   $noxml                   - passed through to the export helpers
+#   $nosanitize              - passed through to export_marc_records_from_sth
+#   $do_not_clear_zebraqueue - true: do not mark the zebraqueue done after a full export
+#   $verbose_logging         - true: print progress banners
+#   $zebraidx_log_opt        - passed through to do_indexing
+#   $server_dir              - zebra server directory verified by check_zebra_dirs
+#
+# Also reads the file-scoped $reset and $noshadow.  $reset is forced to 1 when
+# check_zebra_dirs had to create missing server directories.
+sub index_records {
+    my ($record_type, $directory, $skip_export, $process_zebraqueue, $as_xml, $noxml, $nosanitize, $do_not_clear_zebraqueue, $verbose_logging, $zebraidx_log_opt, $server_dir) = @_;
+
+    my $num_records_exported = 0;
+    my $records_deleted;
+    my $need_reset = check_zebra_dirs($server_dir);
+    if ($need_reset) {
+       print "$0: found broken zebra server directories: forcing a rebuild\n";
+       $reset = 1;
+    }
+    # -s must suppress the export whether or not -v was given: -v only decides
+    # if the "SKIPPING" banner is printed.  The export branch below is
+    # therefore guarded by !$skip_export instead of being a plain else.
+    if ($skip_export && $verbose_logging) {
          print "====================\n";
-        print "exporting authorities\n";
+        print "SKIPPING $record_type export\n";
          print "====================\n";
+    } elsif (!$skip_export) {
+        if ( $verbose_logging ) {
+            print "====================\n";
+            print "exporting $record_type\n";
+            print "====================\n";
+        }
          mkdir "$directory" unless (-d $directory);
-        mkdir "$directory/authorities" unless (-d "$directory/authorities");
-        open(OUT,">:utf8","$directory/authorities/authorities.iso2709") or die $!;
-        my $dbh=C4::Context->dbh;
-        my $sth;
-        $sth=$dbh->prepare("select authid,marc from auth_header");
-        $sth->execute();
-        my $i=0;
-        while (my ($authid,$record) = $sth->fetchrow) {
-            # FIXME : we retrieve the iso2709 record. if the GetAuthority (that uses the XML) fails
-            # due to some MARC::File::XML failure, then try the iso2709, 
-            # (add authid & authtype if needed)
-            my $record;
-            eval {
-                $record = GetAuthority($authid);
-            };
-            next unless $record;
-            # force authid in case it's not here, otherwise, zebra will die on this authority
-            unless ($record->field('001')->data() eq $authid){
-                print "$authid don't exist for this authority :".$record->as_formatted;
-                $record->delete_field($record->field('001'));
-                $record->insert_fields_ordered(MARC::Field->new('001',$authid));
-            }
-            if($@){
-                print "  There was some pb getting authority : ".$authid."\n";
-            next;
+        mkdir "$directory/$record_type" unless (-d "$directory/$record_type");
+        if ($process_zebraqueue) {
+            my $entries = select_zebraqueue_records($record_type, 'deleted');
+            mkdir "$directory/del_$record_type" unless (-d "$directory/del_$record_type");
+            $records_deleted = generate_deleted_marc_records($record_type, $entries, "$directory/del_$record_type", $as_xml);
+            mark_zebraqueue_batch_done($entries);
+            $entries = select_zebraqueue_records($record_type, 'updated');
+            mkdir "$directory/upd_$record_type" unless (-d "$directory/upd_$record_type");
+            $num_records_exported = export_marc_records_from_list($record_type, 
+                                                                  $entries, "$directory/upd_$record_type", $as_xml, $noxml, $records_deleted);
+            mark_zebraqueue_batch_done($entries);
+        } else {
+            my $sth = select_all_records($record_type);
+            $num_records_exported = export_marc_records_from_sth($record_type, $sth, "$directory/$record_type", $as_xml, $noxml, $nosanitize);
+            unless ($do_not_clear_zebraqueue) {
+                mark_all_zebraqueue_done($record_type);
              }
-        
-            print ".";
-            print "\r$i" unless ($i++ %100);
-#            # remove leader length, that could be wrong, it will be calculated automatically by as_usmarc
-#            # otherwise, if it's wron, zebra will fail miserabily (and never index what is after the failing record)
-            my $leader=$record->leader;
-            substr($leader,0,5)='     ';
-            substr($leader,10,7)='22     ';
-            $record->leader(substr($leader,0,24));
-            print OUT $record->as_usmarc;
          }
-        close(OUT);
      }
      
      #
      # and reindexing everything
      #
-    print "====================\n";
-    print "REINDEXING zebra\n";
-    print "====================\n";
-    do_indexing('authority', 'update', "$directory/authorities", $reset, $noshadow, 'iso2709');
-} else {
-    print "skipping authorities\n";
-}
-#################################################################################################################
-#                        BIBLIOS 
-#################################################################################################################
-
-if ($biblios) {
-    # die;
-    #
-    # exporting biblios
-    #
-    if ($skip_export) {
+    if ( $verbose_logging ) {
          print "====================\n";
-        print "SKIPPING biblio export\n";
+        print "REINDEXING zebra\n";
          print "====================\n";
+    }
+       my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
+    if ($process_zebraqueue) {
+        do_indexing($record_type, 'delete', "$directory/del_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt) 
+            if %$records_deleted;
+        do_indexing($record_type, 'update', "$directory/upd_$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+            if $num_records_exported;
      } else {
-        print "====================\n";
-        print "exporting biblios\n";
-        print "====================\n";
-        mkdir "$directory" unless (-d $directory);
-        mkdir "$directory/biblios" unless (-d "$directory/biblios");
-        open(OUT,">:utf8 ","$directory/biblios/export") or die $!;
-               my $dbh=C4::Context->dbh;
-        my $sth;
-    if ($noxml){
-        $sth=$dbh->prepare("select biblionumber,marc from biblioitems order by biblionumber");
-        $sth->execute();
-        my $i=0;
-        while (my ($biblionumber,$marc) = $sth->fetchrow) {
-            my $record;
-            $record=MARC::Record->new_from_usmarc($marc);
-            my $record_correct=1;
-            # skip uncorrect records : isn't this bogus, as just after we reintroduce biblionumber if it's missing ?
-            # FIXME next unless $record->field($biblionumbertagfield);
-            # check if biblionumber is present, otherwise, add it on the fly
-            if ($biblionumbertagfield eq '001') {
-                unless ($record->field($biblionumbertagfield)->data()) {
-                    $record_correct=0;
-                    my $field;
-                    # if the field where biblionumber is already exist, just update it, otherwise create it
-                if ($record->field($biblionumbertagfield)) {
-                $field =  $record->field($biblionumbertagfield);
-                $field->update($biblionumber);
-                } else {
-                my $newfield;
-                $newfield = MARC::Field->new( $biblionumbertagfield, $biblionumber);
-                $record->append_fields($newfield);
-                }
-            }
-            } else {
-            unless ($record->subfield($biblionumbertagfield,$biblionumbertagsubfield)) {
-                $record_correct=0;
-                my $field;
-                # if the field where biblionumber is already exist, just update it, otherwise create it
-                if ($record->field($biblionumbertagfield)) {
-                $field =  $record->field($biblionumbertagfield);
-                $field->add_subfields($biblionumbertagsubfield => $biblionumber);
-                } else {
-                my $newfield;
-                $newfield = MARC::Field->new( $biblionumbertagfield,'','', $biblionumbertagsubfield => $biblionumber);
-                $record->append_fields($newfield);
-                }
-            }
-    #             warn "FIXED BIBLIONUMBER".$record->as_formatted;
-            }
-            unless ($record->subfield($biblioitemnumbertagfield,$biblioitemnumbertagsubfield)) {
-                $record_correct=0;
-            #             warn "INCORRECT BIBLIOITEMNUMBER :".$record->as_formatted;
-            my $field;
-                # if the field where biblionumber is already exist, just update it, otherwise create it
-                if ($record->field($biblioitemnumbertagfield)) {
-                    $field =  $record->field($biblioitemnumbertagfield);
-                    if ($biblioitemnumbertagfield <10) {
-                    $field->update($biblionumber);
-                    } else {
-                    $field->add_subfields($biblioitemnumbertagsubfield => $biblionumber);
-                    }
-                } else {
-                    my $newfield;
-                    if ($biblioitemnumbertagfield <10) {
-                    $newfield = MARC::Field->new( $biblioitemnumbertagfield, $biblionumber);
-                    } else {
-                    $newfield = MARC::Field->new( $biblioitemnumbertagfield,'','', $biblioitemnumbertagsubfield => $biblionumber);
+        do_indexing($record_type, 'update', "$directory/$record_type", $reset, $noshadow, $record_fmt, $zebraidx_log_opt)
+            if ($num_records_exported or $skip_export);
+    }
+}
+
+
+# Fetch the zebraqueue rows that are still pending (done = 0) for one record
+# type and one kind of operation, highest id first.
+#
+# Parameters:
+#   $record_type - 'biblio' selects the 'biblioserver' rows, any other value
+#                  the 'authorityserver' rows
+#   $update_type - 'deleted' selects 'recordDelete' operations, any other
+#                  value 'specialUpdate' operations
+#
+# Returns a reference to an array of hash references with the keys
+# 'id' and 'biblio_auth_number'.
+sub select_zebraqueue_records {
+    my ($record_type, $update_type) = @_;
+
+    my $server = 'authorityserver';
+    $server = 'biblioserver' if $record_type eq 'biblio';
+
+    my $op = 'specialUpdate';
+    $op = 'recordDelete' if $update_type eq 'deleted';
+
+    my $queue_sth = $dbh->prepare("SELECT id, biblio_auth_number 
+                             FROM zebraqueue
+                             WHERE server = ?
+                             AND   operation = ?
+                             AND   done = 0
+                             ORDER BY id DESC");
+    $queue_sth->execute($server, $op);
+    my $entries = $queue_sth->fetchall_arrayref({});
+    return $entries;
+}
+
+# Flag every pending zebraqueue row of one server as done.
+#
+# Parameters:
+#   $record_type - 'biblio' updates the 'biblioserver' rows, any other value
+#                  the 'authorityserver' rows
+sub mark_all_zebraqueue_done {
+    my ($record_type) = @_;
+
+    my $server;
+    if ($record_type eq 'biblio') {
+        $server = 'biblioserver';
+    } else {
+        $server = 'authorityserver';
+    }
+
+    my $update_sth = $dbh->prepare("UPDATE zebraqueue SET done = 1
+                             WHERE server = ?
+                             AND done = 0");
+    $update_sth->execute($server);
+}
+
+# Flag a batch of zebraqueue rows as done inside a single transaction.
+#
+# Parameters:
+#   $entries - reference to an array of hash references; the 'id' value of
+#              each one is the zebraqueue row to update
+#
+# AutoCommit is switched off for the duration of the batch and always switched
+# back on afterwards.  If an update throws, the transaction is rolled back and
+# the error is re-thrown.
+sub mark_zebraqueue_batch_done {
+    my ($entries) = @_;
+
+    $dbh->{AutoCommit} = 0;
+    my $sth = $dbh->prepare("UPDATE zebraqueue SET done = 1 WHERE id = ?");
+    eval {
+        foreach my $id (map { $_->{id} } @$entries) {
+            $sth->execute($id);
+        }
+        # commit once the whole batch has been executed, not before it
+        $dbh->commit();
+    };
+    my $error = $@;
+    if ($error) {
+        # best effort: the original error is what gets reported
+        eval { $dbh->rollback(); };
+    }
+    $dbh->{AutoCommit} = 1;
+    die $error if $error;
+    return 1;
+}
+
+# Return an executed statement handle over all records of the given type:
+# biblios for 'biblio', authorities for any other value.
+sub select_all_records {
+    my ($record_type) = @_;
+    return select_all_biblios() if $record_type eq 'biblio';
+    return select_all_authorities();
+}
+
+# Return an executed statement handle yielding the authid of every row in
+# auth_header.
+sub select_all_authorities {
+    my $query = "SELECT authid FROM auth_header";
+    my $authority_sth = $dbh->prepare($query);
+    $authority_sth->execute();
+    return $authority_sth;
+}
+
+# Return an executed statement handle yielding every biblionumber found in
+# biblioitems, in ascending biblionumber order.
+sub select_all_biblios {
+    my $query = "SELECT biblionumber FROM biblioitems ORDER BY biblionumber";
+    my $biblio_sth = $dbh->prepare($query);
+    $biblio_sth->execute();
+    return $biblio_sth;
+}
+
+# Write every record delivered by a statement handle to
+# "$directory/exported_records".
+#
+# Parameters:
+#   $record_type - 'biblio' or 'authority'
+#   $sth         - executed statement handle returning one record number per row
+#   $directory   - directory receiving the 'exported_records' file
+#   $as_xml      - true: sanitized records are written with as_xml_record,
+#                  otherwise with as_usmarc
+#   $noxml       - passed through to get_corrected_marc_record
+#   $nosanitize  - true: write the stored MARCXML as-is (for biblios, with
+#                  the item fields merged in) instead of a corrected record
+#
+# Progress output depends on the file-scoped $verbose_logging.
+# Returns the number of records written.  Dies if the export file cannot be
+# opened or closed.
+sub export_marc_records_from_sth {
+    my ($record_type, $sth, $directory, $as_xml, $noxml, $nosanitize) = @_;
+
+    my $num_exported = 0;
+    my $export_file = "$directory/exported_records";
+    open (my $out, '>:utf8', $export_file) or die "Cannot open $export_file for writing: $!";
+    my $i = 0;
+    my ( $itemtag, $itemsubfield ) = GetMarcFromKohaField("items.itemnumber",'');
+    while (my ($record_number) = $sth->fetchrow_array) {
+        print "." if ( $verbose_logging );
+        print "\r$i" unless ($i++ %100 or !$verbose_logging);
+        if ( $nosanitize ) {
+            my $marcxml = $record_type eq 'biblio'
+                          ? GetXmlBiblio( $record_number )
+                          : GetAuthorityXML( $record_number );
+            if ($record_type eq 'biblio'){
+                my @items = GetItemsInfo($record_number);
+                if (@items){
+                    my $record = MARC::Record->new;
+                    my @itemsrecord;
+                    foreach my $item (@items){
+                        my $item_record = Item2Marc($item, $record_number);
+                        push @itemsrecord, $item_record->field($itemtag);
                      }
-                    $record->insert_grouped_field($newfield);
+                    $record->insert_fields_ordered(@itemsrecord);
+                    my $itemsxml=$record->as_xml_record();
+                    # Keep only what follows the opening <record ...> tag of the
+                    # items XML.  A regex is used because the previous literal
+                    # search for '<record>\n' (single-quoted, so a real
+                    # backslash and 'n') could never match.
+                    my $stripped = ( $itemsxml =~ s/\A.*?<record\b[^>]*>\s*//s );
+                    # Cut the closing tag off the biblio XML and append the item
+                    # fields, which bring their own closing </record>.
+                    my $searchstring = '</record>';
+                    my $end_of_biblio = defined $marcxml ? rindex($marcxml, $searchstring) : -1;
+                    if ( $stripped and $end_of_biblio >= 0 ) {
+                        # NOTE(review): $itemsxml presumably still starts with the
+                        # <leader> of the empty carrier record - confirm that
+                        # zebra tolerates the second leader element.
+                        $marcxml = substr($marcxml, 0, $end_of_biblio) . $itemsxml;
+                    }
+                }
              }
-        #             warn "FIXED BIBLIOITEMNUMBER".$record->as_formatted;
+            if ( $marcxml ) {
+                print {$out} $marcxml;
+                $num_exported++;
              }
-            my $leader=$record->leader;
-            substr($leader,0,5)='     ';
-            substr($leader,10,7)='22     ';
-            $record->leader(substr($leader,0,24));
-                print OUT $record->as_usmarc();
+            next;
          }
-        close (OUT);
-    } else {
-        $sth=$dbh->prepare("SELECT biblionumber FROM biblioitems ORDER BY biblionumber");
-        $sth->execute();
-        my $i=0;
-        while (my ($biblionumber) = $sth->fetchrow) {
-            print ".";
-            print "\r$i" unless ($i++ %100);
-            my $record;
+        my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
+        if (defined $marc) {
+            # FIXME - when more than one record is exported and $as_xml is true,
+            # the output file is not valid XML - it's just multiple <record> elements
+            # strung together with no single root element.  zebraidx doesn't seem
+            # to care, though, at least if you're using the GRS-1 filter.  It does
+            # care if you're using the DOM filter, which requires valid XML file(s).
              eval {
-                $record = GetMarcBiblio($biblionumber);
+                my $output = ($as_xml) ? $marc->as_xml_record(C4::Context->preference('marcflavour')) : $marc->as_usmarc();
+                print {$out} $output;
+                $num_exported++;
              };
-            if($@){
-                print "  There was some pb getting biblio : #".$biblionumber."\n";
-                next;
-            }
-            next unless $record;
-# die if $record->subfield('090','9') eq 11;
-    #         print $record;
-            # check that biblionumber & biblioitemnumber are stored in the MARC record, otherwise, add them & update the biblioitems.marcxml data.
-            my $record_correct=1;
-            # skip uncorrect records : isn't this bogus, as just after we reintroduce biblionumber if it's missing ?
-            # FIXME next unless $record->field($biblionumbertagfield);
-            #
-            #
-            # CHECK  biblionumber
-            #
-            #
-       if ($biblionumbertagfield eq '001') {
-                unless ($record->field($biblionumbertagfield) && $record->field($biblionumbertagfield)->data()) {
-                    $record_correct=0;
-                    my $field;
-                    # if the field where biblionumber is already exist, just update it, otherwise create it
-                    if ($record->field($biblionumbertagfield)) {
-                        $field =  $record->field($biblionumbertagfield);
-                        $field->update($biblionumber);
-                    } else {
-                        my $newfield;
-                        $newfield = MARC::Field->new( $biblionumbertagfield, $biblionumber);
-                        $record->append_fields($newfield);
-                    }
-                }
-            } else {
-                unless ($record->subfield($biblionumbertagfield,$biblionumbertagsubfield)) {
-#                 warn "fixing biblionumber for $biblionumbertagfield,$biblionumbertagsubfield = $biblionumber";
-                    $record_correct=0;
-                    my $field;
-                    # if the field where biblionumber is already exist, just update it, otherwise create it
-                    if ($record->field($biblionumbertagfield)) {
-                        $field =  $record->field($biblionumbertagfield);
-                        $field->add_subfields($biblionumbertagsubfield => $biblionumber);
-                    } else {
-                        my $newfield;
-                        $newfield = MARC::Field->new( $biblionumbertagfield,'','', $biblionumbertagsubfield => $biblionumber);
-                        $record->append_fields($newfield);
-                    }
-                }
-#                 warn "FIXED BIBLIONUMBER".$record->as_formatted;
-            }
-            #
-            #
-            # CHECK BIBLIOITEMNUMBER
-            #
-            #
-            unless ($record->subfield($biblioitemnumbertagfield,$biblioitemnumbertagsubfield)) {
-#                 warn "fixing biblioitemnumber for $biblioitemnumbertagfield,$biblioitemnumbertagsubfield = $biblionumber";
-                $record_correct=0;
-                my $field;
-                # if the field where biblionumber is already exist, just update it, otherwise create it
-                if ($record->field($biblioitemnumbertagfield)) {
-                    $field =  $record->field($biblioitemnumbertagfield);
-                    if ($biblioitemnumbertagfield <10) {
-                        $field->update($biblionumber);
-                    } else {
-                        $field->add_subfields($biblioitemnumbertagsubfield => $biblionumber);
-                    }
-                } else {
-                    my $newfield;
-                    if ($biblioitemnumbertagfield <10) {
-                        $newfield = MARC::Field->new( $biblioitemnumbertagfield, $biblionumber);
-                    } else {
-                        $newfield = MARC::Field->new( $biblioitemnumbertagfield,'','', $biblioitemnumbertagsubfield => $biblionumber);
-                    }
-                    $record->insert_grouped_field($newfield);
-                }
-    #             warn "FIXED BIBLIOITEMNUMBER".$record->as_formatted;
+            if ($@) {
+              warn "Error exporting record $record_number ($record_type) ".($noxml ? "not XML" : "XML");
              }
-            #
-            #
-            # CHECK FIELD 100
-            #
-            #
-            my $encoding = C4::Context->preference("marcflavour");
-            # deal with UNIMARC field 100 (encoding) : create it if needed & set encoding to unicode
-                       if ( $encoding eq "UNIMARC" ) {
-                my $string;
-                if ( length($record->subfield( 100, "a" )) == 35 ) {
-                    $string = $record->subfield( 100, "a" );
-                    my $f100 = $record->field(100);
-                    $record->delete_field($f100);
-                }
-                else {
-                    $string = POSIX::strftime( "%Y%m%d", localtime );
-                    $string =~ s/\-//g;
-                    $string = sprintf( "%-*s", 35, $string );
-                }
-                substr( $string, 22, 6, "frey50" );
-                unless ( length($record->subfield( 100, "a" )) == 35 ) {
-                    $record->delete_field($record->field(100));
-                    $record->insert_grouped_field(
-                        MARC::Field->new( 100, "", "", "a" => $string ) );
+        }
+    }
+    print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+    # buffered write errors only surface when the handle is closed
+    close $out or die "Cannot close $export_file: $!";
+    return $num_exported;
+}
+
+# Write the records named by a list of zebraqueue entries to
+# "$directory/exported_records".
+#
+# Parameters:
+#   $record_type     - 'biblio' or 'authority'
+#   $entries         - reference to an array of hash references; the
+#                      'biblio_auth_number' value of each is a record number
+#   $directory       - directory receiving the 'exported_records' file
+#   $as_xml          - true: write with as_xml_record, otherwise as_usmarc
+#   $noxml           - passed through to get_corrected_marc_record
+#   $records_deleted - reference to a hash keyed by record numbers that must
+#                      not be exported
+#
+# Each record number is exported at most once.  Progress output depends on
+# the file-scoped $verbose_logging.
+# Returns the number of records written.  Dies if the export file cannot be
+# opened or closed.
+sub export_marc_records_from_list {
+    my ($record_type, $entries, $directory, $as_xml, $noxml, $records_deleted) = @_;
+
+    my $num_exported = 0;
+    my $export_file = "$directory/exported_records";
+    open (my $out, '>:utf8', $export_file) or die "Cannot open $export_file for writing: $!";
+    my $i = 0;
+
+    # Skip any deleted records. We check for this anyway, but this reduces error spam
+    # Seeding %found with the deleted records also makes the grep below drop
+    # repeated record numbers.
+    my %found = %$records_deleted;
+    foreach my $record_number ( map { $_->{biblio_auth_number} }
+                                grep { !$found{ $_->{biblio_auth_number} }++ }
+                                @$entries ) {
+        print "." if ( $verbose_logging );
+        print "\r$i" unless ($i++ %100 or !$verbose_logging);
+        my ($marc) = get_corrected_marc_record($record_type, $record_number, $noxml);
+        if (defined $marc) {
+            # FIXME - when more than one record is exported and $as_xml is true,
+            # the output file is not valid XML - it's just multiple <record> elements
+            # strung together with no single root element.  zebraidx doesn't seem
+            # to care, though, at least if you're using the GRS-1 filter.  It does
+            # care if you're using the DOM filter, which requires valid XML file(s).
+            my $output = ($as_xml) ? $marc->as_xml_record(C4::Context->preference('marcflavour')) : $marc->as_usmarc();
+            print {$out} $output;
+            $num_exported++;
+        }
+    }
+    print "\nRecords exported: $num_exported\n" if ( $verbose_logging );
+    # buffered write errors only surface when the handle is closed
+    close $out or die "Cannot close $export_file: $!";
+    return $num_exported;
+}
+
+# Write one stub MARC record per deleted zebraqueue entry to
+# "$directory/exported_records".  Each stub is an empty MARC::Record that only
+# receives the record's identifier fields (and the UNIMARC field 100 fix when
+# the marcflavour preference is UNIMARC).
+#
+# Parameters:
+#   $record_type - 'biblio' uses fix_biblio_ids (the record number is passed
+#                  as both biblionumber and biblioitemnumber); any other value
+#                  uses fix_authority_id
+#   $entries     - reference to an array of hash references; the
+#                  'biblio_auth_number' value of each is a record number
+#   $directory   - directory receiving the 'exported_records' file
+#   $as_xml      - true: write with as_xml_record, otherwise as_usmarc
+#
+# Progress output depends on the file-scoped $verbose_logging.
+# Returns a reference to a hash whose keys are the record numbers written.
+# Dies if the export file cannot be opened or closed.
+sub generate_deleted_marc_records {
+    my ($record_type, $entries, $directory, $as_xml) = @_;
+
+    my $records_deleted = {};
+    my $export_file = "$directory/exported_records";
+    open (my $out, '>:utf8', $export_file) or die "Cannot open $export_file for writing: $!";
+    my $i = 0;
+    foreach my $record_number (map { $_->{biblio_auth_number} } @$entries ) {
+        print "\r$i" unless ($i++ %100 or !$verbose_logging);
+        print "." if ( $verbose_logging );
+
+        my $marc = MARC::Record->new();
+        if ($record_type eq 'biblio') {
+            fix_biblio_ids($marc, $record_number, $record_number);
+        } else {
+            fix_authority_id($marc, $record_number);
+        }
+        if (C4::Context->preference("marcflavour") eq "UNIMARC") {
+            fix_unimarc_100($marc);
+        }
+
+        my $output = ($as_xml) ? $marc->as_xml_record(C4::Context->preference("marcflavour")) : $marc->as_usmarc();
+        print {$out} $output;
+
+        $records_deleted->{$record_number} = 1;
+    }
+    print "\nRecords exported: $i\n" if ( $verbose_logging );
+    # buffered write errors only surface when the handle is closed
+    close $out or die "Cannot close $export_file: $!";
+    return $records_deleted;
+}
+
+# Load a record through get_raw_marc_record and apply the fix-ups needed
+# before export: leader, identifier fields and, when the marcflavour
+# preference is UNIMARC, field 100.
+#
+# Parameters:
+#   $record_type   - 'biblio' or 'authority'
+#   $record_number - biblionumber or authid
+#   $noxml         - passed through to get_raw_marc_record
+#
+# Returns the corrected record; undef when no record could be loaded; nothing
+# (bare return) when fix_biblio_ids reports failure.
+sub get_corrected_marc_record {
+    my ($record_type, $record_number, $noxml) = @_;
+
+    my $record = get_raw_marc_record($record_type, $record_number, $noxml);
+    return $record unless defined $record;
+
+    fix_leader($record);
+
+    if ($record_type ne 'biblio') {
+        fix_authority_id($record, $record_number);
+    } else {
+        my $ids_fixed = fix_biblio_ids($record, $record_number);
+        return unless $ids_fixed;
+    }
+
+    my $flavour = C4::Context->preference("marcflavour");
+    fix_unimarc_100($record) if $flavour eq "UNIMARC";
+
+    return $record;
+}
+
+# Load the MARC record for one biblio or authority, without the fix-ups that
+# get_corrected_marc_record applies afterwards.
+#
+# Parameters:
+#   $record_type   - 'biblio' selects the biblio branch; any other value is
+#                    read through GetAuthority
+#   $record_number - biblionumber or authid
+#   $noxml         - biblios only: true parses the blob stored in
+#                    biblioitems.marc, false uses GetMarcBiblio
+#
+# Returns the record.  Returns nothing (bare return) when a biblio could not
+# be loaded or when GetAuthority threw an exception.  Biblio records get their
+# items embedded through C4::Biblio::EmbedItemsInMarcBiblio.
+sub get_raw_marc_record {
+    my ($record_type, $record_number, $noxml) = @_;
+  
+    my $marc; 
+    if ($record_type eq 'biblio') {
+        if ($noxml) {
+            my $fetch_sth = $dbh->prepare_cached("SELECT marc FROM biblioitems WHERE biblionumber = ?");
+            $fetch_sth->execute($record_number);
+            if (my ($blob) = $fetch_sth->fetchrow_array) {
+                $marc = MARC::Record->new_from_usmarc($blob);
+                unless ($marc) {
+                    warn "error creating MARC::Record from $blob";
                  }
              }
-            unless ($record_correct) {
-                my $update_xml = $dbh->prepare("update biblioitems set marcxml=? where biblionumber=?");
-                warn "UPDATING $biblionumber (missing biblionumber or biblioitemnumber in MARC record : ".$record->as_xml;
-                $update_xml->execute($record->as_xml,$biblionumber);
+            # failure to find a bib is not a problem -
+            # a delete could have been done before
+            # trying to process a record update
+
+            $fetch_sth->finish();
+            return unless $marc;
+        } else {
+            eval { $marc = GetMarcBiblio($record_number); };
+            if ($@ || !$marc) {
+                # here we do warn since catching an exception
+                # means that the bib was found but failed
+                # to be parsed
+                warn "error retrieving biblio $record_number";
+                return;
              }
-            # remove leader length, that could be wrong, it will be calculated automatically by as_usmarc
-            # otherwise, if it's wron, zebra will fail miserabily (and never index what is after the failing record)
-            my $leader=$record->leader;
-            substr($leader,0,5)='     ';
-            substr($leader,10,7)='22     ';
-            $record->leader(substr($leader,0,24));
-                       if($as_xml) {
-                               print OUT $record->as_xml_record();
-                               } else {
-                               print OUT $record->as_usmarc();
-                       }
-                       }
-               }
-        close(OUT);
+        }
+        # ITEM
+        C4::Biblio::EmbedItemsInMarcBiblio($marc, $record_number);
+    } else {
+        eval { $marc = GetAuthority($record_number); };
+        if ($@) {
+            warn "error retrieving authority $record_number";
+            return;
+        }
      }
-    
-    #
-    # and reindexing everything
+    return $marc;
+}
+
+sub fix_leader {
+    # Reset the computed parts of a record's leader, in place.
+    #
+    # Parameter: $marc - record object; only its leader() accessor is used.
+    # Returns whatever $marc->leader(...) returns when given the new value.
+    #
+    # Leader/00-04 (record length) and Leader/12-16 (base address of data)
+    # are blanked, Leader/10-11 (indicator count and subfield code count)
+    # are set to "22", and the result is cut to 24 characters.
+    #
+    # FIXME - this routine is suspect.  The intent is to force the blanked
+    # positions to be recalculated correctly when $marc->as_usmarc() or
+    # $marc->as_xml() is called.  But why is this necessary?  It would be
+    # a serious bug in MARC::Record (definitely) and MARC::File::XML
+    # (arguably) if they are emitting incorrect leader values.
+    my $marc = shift;
+
+    my $leader = $marc->leader;
+
+    # Assigning through substr() to a range that lies entirely outside the
+    # string is a fatal error, so pad a missing or truncated leader to the
+    # full 24 characters first.  A leader that is already 24 characters or
+    # longer is left unchanged by this step.
+    $leader = '' unless defined $leader;
+    $leader = sprintf('%-24s', $leader);
+
+    substr($leader,  0, 5) = '     ';
+    substr($leader, 10, 7) = '22     ';
+    $marc->leader(substr($leader, 0, 24));
+}
+
+sub fix_biblio_ids {
+    # Make sure a biblio's MARC record carries its biblionumber and
+    # biblioitemnumber.
+    #
+    # Parameters:
+    #   $marc             - record object, handed unchanged to
+    #                       C4::Biblio::_koha_marc_update_bib_ids
+    #   $biblionumber     - biblionumber of the record
+    #   $biblioitemnumber - optional; when no third argument is passed at
+    #                       all, it is looked up in the biblioitems table
+    #
+    # Returns 1 after the update call, or 0 (with a warning) when the
+    # biblioitemnumber had to be looked up and no true value was found.
+    #
+    # Note: $dbh is not declared in this sub; it has to be in scope from
+    # the enclosing file.
+    #
+    # FIXME - it is essential to ensure that the biblionumber is present,
+    #         otherwise, Zebra will choke on the record.  However, this
+    #         logic belongs in the relevant C4::Biblio APIs.
+    my $marc = shift;
+    my $biblionumber = shift;
+    my $biblioitemnumber;
+    if (@_) {
+        # the caller supplied a third argument: use it as is, even if it
+        # is undefined or false - no database lookup is done in that case
+        $biblioitemnumber = shift;
+    } else {    
+        my $sth = $dbh->prepare(
+            "SELECT biblioitemnumber FROM biblioitems WHERE biblionumber=?");
+        $sth->execute($biblionumber);
+        # only the first returned row is read
+        ($biblioitemnumber) = $sth->fetchrow_array;
+        $sth->finish;
+        unless ($biblioitemnumber) {
+            warn "failed to get biblioitemnumber for biblio $biblionumber";
+            return 0;
+        }
+    }
+
+    # FIXME - this is cheating on two levels
+    # 1. C4::Biblio::_koha_marc_update_bib_ids is meant to be an internal function
+    # 2. Making sure that the biblionumber and biblioitemnumber are correct and
+    #    present in the MARC::Record object ought to be part of GetMarcBiblio.
      #
-       print "====================\n";
-    print "REINDEXING zebra\n";
-    print "====================\n";
-       my $record_fmt = ($as_xml) ? 'marcxml' : 'iso2709' ;
-    do_indexing('biblio', 'update', "$directory/biblios", $reset, $noshadow, $record_fmt);
-} else {
-    print "skipping biblios\n";
+    # On the other hand, this is better for now than what rebuild_zebra.pl
+    # used to do, which was to duplicate the code for inserting the
+    # biblionumber and biblioitemnumber.
+    C4::Biblio::_koha_marc_update_bib_ids($marc, '', $biblionumber, $biblioitemnumber);
+
+    return 1;
  }
  
-print "====================\n";
-print "CLEANING\n";
-print "====================\n";
-if ($keep_export) {
-    print "NOTHING cleaned : the export $directory has been kept.\n";
-    print "You can re-run this script with the -s ";
-    if ($use_tempdir) {
-        print " and -d $directory parameters";
-    } else {
-        print "parameter";
+sub fix_authority_id {
+    # Make sure the 001 field of an authority record holds its authid.
+    #
+    # Parameters:
+    #   $marc   - record object; its field(), delete_field() and
+    #             insert_fields_ordered() methods are used
+    #   $authid - authority id that field 001 must contain
+    #
+    # The record is modified in place; no explicit return value is set.
+    #
+    # FIXME - as with fix_biblio_ids, the authid must be present
+    #         for Zebra's sake.  However, this really belongs
+    #         in C4::AuthoritiesMarc.
+    my ($marc, $authid) = @_;
+    # Nothing is changed when a 001 exists and its data is string-equal
+    # (eq) to $authid.  Otherwise whatever field('001') returns is passed
+    # to delete_field() and a fresh 001 holding $authid is inserted.
+    unless ($marc->field('001') and $marc->field('001')->data() eq $authid){
+        $marc->delete_field($marc->field('001'));
+        $marc->insert_fields_ordered(MARC::Field->new('001',$authid));
      }
-    print "\n";
-    print "if you just want to rebuild zebra after changing the record.abs\n";
-    print "or another zebra config file\n";
-} else {
-    unless ($use_tempdir) {
-        # if we're using a temporary directory
-        # created by File::Temp, it will be removed
-        # automatically.
-        rmtree($directory, 0, 1);
-        print "directory $directory deleted\n";
+}
+
+sub fix_unimarc_100 {
+    # Rewrite field 100 subfield a of a record so that it is a 35-character
+    # string with "frey50" at offsets 22-27.
+    #
+    # Parameter: $marc - record object, modified in place.
+    #
+    # NOTE(review): "frey50" presumably encodes the UNIMARC 100$a language
+    # of cataloguing, transliteration code and character set - confirm
+    # against the UNIMARC specification.
+    # NOTE(review): POSIX::strftime is called fully qualified; whether the
+    # POSIX module is loaded is not visible from this sub.
+    #
+    # FIXME - again, if this is necessary, it belongs in C4::AuthoritiesMarc.
+    my $marc = shift;
+
+    my $string;
+    if ( length($marc->subfield( 100, "a" )) == 35 ) {
+        # keep the existing 35-character value and drop the field that
+        # field(100) returns; a replacement is built further down
+        $string = $marc->subfield( 100, "a" );
+        my $f100 = $marc->field(100);
+        $marc->delete_field($f100);
+    }
+    else {
+        # no usable value: start from today's date (YYYYMMDD) and pad it
+        # with spaces on the right to 35 characters
+        $string = POSIX::strftime( "%Y%m%d", localtime );
+        # the "%Y%m%d" format above contains no dashes, so this
+        # substitution has nothing to remove
+        $string =~ s/\-//g;
+        $string = sprintf( "%-*s", 35, $string );
+    }
+    # overwrite the 6 characters starting at offset 22
+    substr( $string, 22, 6, "frey50" );
+    # This test runs after the field may already have been deleted in the
+    # first branch above, so it looks at whatever 100$a is left in the
+    # record at this point, not at the value tested earlier.
+    unless ( length($marc->subfield( 100, "a" )) == 35 ) {
+        $marc->delete_field($marc->field(100));
+        $marc->insert_grouped_field(MARC::Field->new( 100, "", "", "a" => $string ));
      }
  }
  
  sub do_indexing {
+    # Run the zebraidx command line tool for one record type.
+    #
+    # Parameters (positional):
+    #   $record_type      - 'biblio' selects the biblioserver / biblios
+    #                       database; any other value selects
+    #                       authorityserver / authorities
+    #   $op               - zebraidx operation placed before $record_dir
+    #   $record_dir       - directory argument given to that operation
+    #   $reset_index      - when true, "zebraidx ... init" is run first
+    #   $noshadow         - interpolated into the indexing command line as
+    #                       is, and, when true, also skips the final
+    #                       "zebraidx ... commit"
+    #   $record_format    - value given to zebraidx -g
+    #   $zebraidx_log_opt - extra text interpolated into each command line
+    #
+    # NOTE(review): every command is a single interpolated string handed
+    # to system(), so it goes through the shell; the arguments are not
+    # quoted or escaped here.
+    # NOTE(review): the return values of the system() calls are not
+    # checked, so a failing zebraidx run is not reported by this sub.
-    my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format) = @_;
+    my ($record_type, $op, $record_dir, $reset_index, $noshadow, $record_format, $zebraidx_log_opt) = @_;
  
      my $zebra_server  = ($record_type eq 'biblio') ? 'biblioserver' : 'authorityserver';
      my $zebra_db_name = ($record_type eq 'biblio') ? 'biblios' : 'authorities';
      my $zebra_config  = C4::Context->zebraconfig($zebra_server)->{'config'};
+    # $zebra_db_dir is assigned but not used anywhere else in this sub
      my $zebra_db_dir  = C4::Context->zebraconfig($zebra_server)->{'directory'};
  
-    system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name init") if $reset_index;
-    system("zebraidx -c $zebra_config $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
-    system("zebraidx -c $zebra_config -g $record_format -d $zebra_db_name commit") unless $noshadow;
+    system("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name init") if $reset_index;
+    system("zebraidx -c $zebra_config $zebraidx_log_opt $noshadow -g $record_format -d $zebra_db_name $op $record_dir");
+    system("zebraidx -c $zebra_config $zebraidx_log_opt -g $record_format -d $zebra_db_name commit") unless $noshadow;
  
  }
  
@@ -438,6 +600,11 @@ Parameters:
  
      -a                      index authority records
  
+    -z                      select only updated and deleted
+                            records marked in the zebraqueue
+                            table.  Cannot be used with -r
+                            or -s.
+
      -r                      clear Zebra index before
                              adding records to index
  
@@ -461,8 +628,23 @@ Parameters:
      -x                      export and index as xml instead of is02709 (biblios only).
                              use this if you might have records > 99,999 chars,
                                                         
+    -nosanitize             export biblio/authority records directly from DB marcxml
+                            field without sanitizing records. It speed up
+                            dump process but could fail if DB contains badly
+                            encoded records. Works only with -x,
+
      -w                      skip shadow indexing for this batch
  
+    -y                      do NOT clear zebraqueue after indexing; normally,
+                            after doing batch indexing, zebraqueue should be
+                            marked done for the affected record type(s) so that
+                            a running zebraqueue_daemon doesn't try to reindex
+                            the same records - specify -y to override this.  
+                            Cannot be used with -z.
+
+    -v                      increase the amount of logging.  Normally only 
+                            warnings and errors from the indexing are shown.
+
      -munge-config           Deprecated option to try
                              to fix Zebra config files.
      --help or -h            show this message.
@@ -554,9 +736,11 @@ print "Info: tab dir : $tabdir\n";
  #
  my $created_dir_or_file = 0;
  if ($authorities) {
-    print "====================\n";
-    print "checking directories & files for authorities\n";
-    print "====================\n";
+    if ( $verbose_logging ) {
+        print "====================\n";
+        print "checking directories & files for authorities\n";
+        print "====================\n";
+    }
      unless (-d "$authorityserverdir") {
          system("mkdir -p $authorityserverdir");
          print "Info: created $authorityserverdir\n";
@@ -695,10 +879,12 @@ rank:rank-1
      
  }
  if ($biblios) {
-    print "====================\n";
-    print "checking directories & files for biblios\n";
-    print "====================\n";
-    
+    if ( $verbose_logging ) {
+        print "====================\n";
+        print "checking directories & files for biblios\n";
+        print "====================\n";
+    }
+
      #
      # BIBLIOS : creating directory structure
      #