Merged SimpleMarc.pm over from rel-1-2

author tonnesen <tonnesen>

Tue, 2 Jul 2002 20:30:15 +0000 (20:30 +0000)

committer tonnesen <tonnesen>

Tue, 2 Jul 2002 20:30:15 +0000 (20:30 +0000)
author tonnesen <tonnesen>
Tue, 2 Jul 2002 20:30:15 +0000 (20:30 +0000)
committer tonnesen <tonnesen>
Tue, 2 Jul 2002 20:30:15 +0000 (20:30 +0000)
diff --git a/C4/SimpleMarc.pm b/C4/SimpleMarc.pm

new file mode 100755 (executable)

index 0000000..65b7798
--- /dev/null
+++ b/C4/SimpleMarc.pm
@@ -0,0 +1,464 @@
+#!/usr/bin/perl
+
+# $Id$
+
+package C4::SimpleMarc;
+
+# Routines for handling import of MARC data into Koha db
+
+# Koha library project  www.koha.org
+
+# Licensed under the GPL
+
+use strict;
+
+# standard or CPAN modules used
+use DBI;
+
+# Koha modules used
+use C4::Database;
+
+require Exporter;
+
+use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
+
+# set the version for version checking
+$VERSION = 0.01;
+
+@ISA = qw(Exporter);
+@EXPORT = qw(
+       &extractmarcfields 
+       &parsemarcfileformat 
+       &taglabel
+       %tagtext
+       %tagmap
+);
+%EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],
+
+# your exported package globals go here,
+# as well as any optionally exported functions
+
+@EXPORT_OK   = qw(
+       %tagtext
+       %tagmap
+);
+
+# non-exported package globals go here
+use vars qw(@more $stuff);
+
+# initalize package globals, first exported ones
+
+my $Var1   = '';
+my %Hashit = ();
+
+# then the others (which are still accessible as $Some::Module::stuff)
+my $stuff  = '';
+my @more   = ();
+
+# all file-scoped lexicals must be created before
+# the functions below that use them.
+
+# file-private lexicals go here
+my $priv_var    = '';
+my %secret_hash = ();
+
+# here's a file-private function as a closure,
+# callable as &$priv_func;  it cannot be prototyped.
+my $priv_func = sub {
+  # stuff goes here.
+  };
+  
+# make all your functions, whether exported or not;
+#------------------------------------------------
+
+#------------------
+# Constants
+
+my %tagtext = (
+    'LDR' => 'Leader',
+    '001' => 'Control number',
+    '003' => 'Control number identifier',
+    '005' => 'Date and time of latest transaction',
+    '006' => 'Fixed-length data elements -- additional material characteristics',
+    '007' => 'Physical description fixed field',
+    '008' => 'Fixed length data elements',
+    '010' => 'LCCN',
+    '015' => 'National library CN',
+    '020' => 'ISBN',
+    '022' => 'ISSN',
+    '024' => 'Other standard ID',
+    '035' => 'System control number',
+    '037' => 'Source of acquisition',
+    '040' => 'Cataloging source',
+    '041' => 'Language code',
+    '043' => 'Geographic area code',
+    '043' => 'Publishing country code',
+    '050' => 'Library of Congress call number',
+    '055' => 'Canadian classification number',
+    '060' => 'National Library of Medicine call number',
+    '082' => 'Dewey decimal call number',
+    '100' => 'Main entry -- Personal name',
+    '110' => 'Main entry -- Corporate name',
+    '130' => 'Main entry -- Uniform title',
+    '240' => 'Uniform title',
+    '245' => 'Title statement',
+    '246' => 'Varying form of title',
+    '250' => 'Edition statement',
+    '256' => 'Computer file characteristics',
+    '260' => 'Publication, distribution, etc.',
+    '263' => 'Projected publication date',
+    '300' => 'Physical description',
+    '306' => 'Playing time',
+    '440' => 'Series statement / Added entry -- Title',
+    '490' => 'Series statement',
+    '500' => 'General note',
+    '504' => 'Bibliography, etc. note',
+    '505' => 'Formatted contents note',
+    '508' => 'Creation/production credits note',
+    '510' => 'Citation/references note',
+    '511' => 'Participant or performer note',
+    '520' => 'Summary, etc. note',
+    '521' => 'Target audience note (ie age)',
+    '530' => 'Additional physical form available note',
+    '538' => 'System details note',
+    '586' => 'Awards note',
+    '600' => 'Subject added entry -- Personal name',
+    '610' => 'Subject added entry -- Corporate name',
+    '650' => 'Subject added entry -- Topical term',
+    '651' => 'Subject added entry -- Geographic name',
+    '656' => 'Index term -- Occupation',
+    '700' => 'Added entry -- Personal name',
+    '710' => 'Added entry -- Corporate name',
+    '730' => 'Added entry -- Uniform title',
+    '740' => 'Added entry -- Uncontrolled related/analytical title',
+    '800' => 'Series added entry -- Personal name',
+    '830' => 'Series added entry -- Uniform title',
+    '852' => 'Location',
+    '856' => 'Electronic location and access',
+);
+
+# tag, subfield, field name, repeats, striptrailingchars
+my %tagmap=(
+    '010'=>{'a'=>{name=> 'lccn',       rpt=>0, striptrail=>' '         }},
+    '015'=>{'a'=>{name=> 'lccn',       rpt=>0  }},
+    '020'=>{'a'=>{name=> 'isbn',       rpt=>0  }},
+    '022'=>{'a'=>{name=> 'issn',       rpt=>0  }},
+    '082'=>{'a'=>{name=> 'dewey',      rpt=>0  }},
+    '100'=>{'a'=>{name=> 'author',     rpt=>0, striptrail=>',:;/-'     }},
+    '245'=>{'a'=>{name=> 'title',      rpt=>0, striptrail=>',:;/'      },
+            'b'=>{name=> 'subtitle',   rpt=>0, striptrail=>',:;/'      }},
+    '260'=>{'a'=>{name=> 'place',      rpt=>0, striptrail=>',:;/-'     },
+            'b'=>{name=> 'publisher',  rpt=>0, striptrail=>',:;/-'     },
+            'c'=>{name=> 'year' ,      rpt=>0, striptrail=>'.,:;/-'    }},
+    '300'=>{'a'=>{name=> 'pages',      rpt=>0, striptrail=>',:;/-'     },
+            'c'=>{name=> 'size',       rpt=>0, striptrail=>',:;/-'     }},
+    '362'=>{'a'=>{name=> 'volume-number',      rpt=>0  }},
+    '440'=>{'a'=>{name=> 'seriestitle',        rpt=>0, striptrail=>',:;/'      },
+            'v'=>{name=> 'volume-number',rpt=>0        }},
+    '490'=>{'a'=>{name=> 'seriestitle',        rpt=>0, striptrail=>',:;/'      },
+            'v'=>{name=> 'volume-number',rpt=>0        }},
+    '700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/'   }},
+    '5xx'=>{'a'=>{name=> 'notes',      rpt=>1  }},
+    '65x'=>{'a'=>{name=> 'subject',    rpt=>1, striptrail=>'.,:;/-'    }},
+);
+
+
+#------------------
+sub extractmarcfields {
+    use strict;
+    # input
+    my (
+       $record,        # pointer to list of MARC field hashes.
+                       # Example: $record->[0]->{'tag'} = '100' # Author
+                       #       $record->[0]->{'subfields'}->{'a'} = subfieldvalue
+    )=@_;
+
+    # return 
+    my $bib;           # pointer to hash of named output fields
+                       # Example: $bib->{'author'} = "Twain, Mark";
+
+    my $debug=0;
+
+    my (
+       $field,         # hash ref
+       $value, 
+       $subfield,      # Marc subfield [a-z]
+       $fieldname,     # name of field "author", "title", etc.
+       $strip,         # chars to remove from end of field
+       $stripregex,    # reg exp pattern
+    );
+    my ($lccn, $isbn, $issn,    
+       $publicationyear, @subjects, $subject,
+       $controlnumber, 
+       $notes, $additionalauthors, $illustrator, $copyrightdate, 
+       $s, $subdivision, $subjectsubfield,
+    );
+
+    print "<PRE>\n" if $debug;
+
+    if ( ref($record) eq "ARRAY" ) {
+        foreach $field (@$record) {
+
+           # Check each subfield in field
+           foreach $subfield ( keys %{$field->{subfields}} ) {
+               # see if it is defined in our Marc to koha mapping table
+               if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
+                   # Yes, so keep the value
+                   if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
+                       # if it was an array, just keep first element.
+                       $bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
+                   } else {
+                       $bib->{$fieldname}=$field->{'subfields'}->{$subfield};
+                   } # if array
+                   print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
+                   # see if this field should have trailing chars dropped
+                   if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
+                       $strip=~s//\\/; # backquote each char
+                       $stripregex='[ ' . $strip . ']+$';  # remove trailing spaces also
+                       $bib->{$fieldname}=~s/$stripregex//;
+                       # also strip leading spaces
+                       $bib->{$fieldname}=~s/^ +//;
+                   } # if strip
+                   print "Found subfield $field->{'tag'} $subfield " .
+                       "$fieldname = $bib->{$fieldname}\n" if $debug;
+               } # if tagmap exists
+
+           } # foreach subfield
+
+
+           if ($field->{'tag'} eq '001') {
+               $bib->{controlnumber}=$field->{'indicator'};
+           }
+           if ($field->{'tag'} eq '015') {
+               $bib->{lccn}=$field->{'subfields'}->{'a'};
+               $bib->{lccn}=~s/^\s*//;
+               $bib->{lccn}=~s/^C//;
+               ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
+           }
+
+
+               if ($field->{'tag'} eq '260') {
+
+                   $publicationyear=$field->{'subfields'}->{'c'};
+                   if ($publicationyear=~/c(\d\d\d\d)/) {
+                       $copyrightdate=$1;
+                   }
+                   if ($publicationyear=~/[^c](\d\d\d\d)/) {
+                       $publicationyear=$1;
+                   } elsif ($copyrightdate) {
+                       $publicationyear=$copyrightdate;
+                   } else {
+                       $publicationyear=~/(\d\d\d\d)/;
+                       $publicationyear=$1;
+                   }
+               }
+               if ($field->{'tag'} eq '700') {
+                   my $name=$field->{'subfields'}->{'a'};
+                   if ( defined($field->{'subfields'}->{'e'}) 
+                       and  $field->{'subfields'}->{'e'}=~/ill/) {
+                       $illustrator=$name;
+                   } else {
+                       $additionalauthors.="$name\n";
+                   }
+               }
+               if ($field->{'tag'} =~/^5/) {
+                   $notes.="$field->{'subfields'}->{'a'}\n";
+               }
+               if ($field->{'tag'} =~/65\d/) {
+                   my $sub;
+                   my $subject=$field->{'subfields'}->{'a'};
+                   $subject=~s/\.$//;
+                   print "Subject=$subject\n" if $debug;
+                   foreach $subjectsubfield ( 'x','y','z' ) {
+                     if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
+                       if ( ref($subdivision) eq 'ARRAY' ) {
+                           foreach $s (@$subdivision) {
+                               $s=~s/\.$//;
+                               $subject.=" -- $s";
+                           } # foreach subdivision
+                       } else {
+                           $subdivision=~s/\.$//;
+                           $subject.=" -- $subdivision";
+                       } # if array
+                     } # if subfield exists
+                   } # foreach subfield
+                   print "Subject=$subject\n" if $debug;
+                   push @subjects, $subject;
+               } # if tag 65x
+
+
+        } # foreach field
+       ($publicationyear       ) && ($bib->{publicationyear}=$publicationyear  );
+       ($copyrightdate         ) && ($bib->{copyrightdate}=$copyrightdate  );
+       ($additionalauthors     ) && ($bib->{additionalauthors}=$additionalauthors  );
+       ($illustrator           ) && ($bib->{illustrator}=$illustrator  );
+       ($notes                 ) && ($bib->{notes}=$notes  );
+       ($#subjects             ) && ($bib->{subject}=\@subjects  );
+
+       # Misc cleanup
+       if ($bib->{dewey}) {
+           $bib->{dewey}=~s/\///g;     # drop any slashes
+       }
+
+       if ($bib->{lccn}) {
+          ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
+       }
+
+       if ( $bib->{isbn} ) {
+           $bib->{isbn}=~s/[^\d]*//g;  # drop non-digits
+       };
+
+       if ( $bib->{issn} ) {
+           $bib->{issn}=~s/^\s*//;
+           ($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
+       };
+
+       if ( $bib->{'volume-number'} ) {
+           if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
+               $bib->{'volume'}=$1;
+               $bib->{'number'}=$2;
+           } else {
+               $bib->{volume}=$bib->{'volume-number'};
+           }
+           delete $bib->{'volume-number'};
+       } # if volume-number
+
+    } else {
+       print "Error: extractmarcfields: input ref $record is " .
+               ref($record) . " not ARRAY. Contact sysadmin.\n";
+    }
+    print "</PRE>\n" if $debug;
+
+    return $bib;
+
+} # sub extractmarcfields
+#---------------------------------
+
+#--------------------------
+# Parse MARC data in file format with control-character separators
+#   May be multiple records.
+sub parsemarcfileformat {
+    use strict;
+    # Input is one big text string
+    my $data=shift;
+    # Output is list of records.  Each record is list of field hashes
+    my @records;
+
+    my $splitchar=chr(29);
+    my $splitchar2=chr(30);
+    my $splitchar3=chr(31);
+    my $debug=0;
+    my $record;
+    foreach $record (split(/$splitchar/, $data)) {
+       my @record;
+       my $directory=0;
+       my $tagcounter=0;
+       my %tag;
+       my $field;
+
+       my $leader=substr($record,0,24);
+       print "<pre>parse Leader:$leader</pre>\n" if $debug;
+       push (@record, {
+               'tag' => 'LDR',
+               'indicator' => $leader ,
+       } );
+
+       $record=substr($record,24);
+       foreach $field (split(/$splitchar2/, $record)) {
+           my %field;
+           my $tag;
+           my $indicator;
+           unless ($directory) {
+               # If we didn't already find a directory, extract one.
+               $directory=$field;
+               my $itemcounter=1;
+               my $counter2=0;
+               my $item;
+               my $length;
+               my $start;
+               while ($item=substr($directory,0,12)) {
+                   # Pull out location of first field
+                   $tag=substr($directory,0,3);
+                   $length=substr($directory,3,4);
+                   $start=substr($directory,7,6);
+
+                   # Bump to next directory entry
+                   $directory=substr($directory,12);
+                   $tag{$counter2}=$tag;
+                   $counter2++;
+               }
+               $directory=1;
+               next;
+           }
+           $tag=$tag{$tagcounter};
+           $tagcounter++;
+           $field{'tag'}=$tag;
+           my @subfields=split(/$splitchar3/, $field);
+           $indicator=$subfields[0];
+           $field{'indicator'}=$indicator;
+           print "<pre>parse indicator:$indicator</pre>\n" if $debug;
+           my $firstline=1;
+           unless ($#subfields==0) {
+               my %subfields;
+               my @subfieldlist;
+               my $i;
+               for ($i=1; $i<=$#subfields; $i++) {
+                   my $text=$subfields[$i];
+                   my $subfieldcode=substr($text,0,1);
+                   my $subfield=substr($text,1);
+                   # if this subfield already exists, do array
+                   if ($subfields{$subfieldcode}) {
+                       my $subfieldlist=$subfields{$subfieldcode};
+                       if ( ref($subfieldlist) eq 'ARRAY' ) {
+                            # Already an array, add on to it
+                           print "$tag Adding to array $subfieldcode -- $subfield<br>\n" if $debug;
+                           @subfieldlist=@$subfieldlist;
+                           push (@subfieldlist, $subfield);
+                       } else {
+                            # Change simple value to array
+                           print "$tag Arraying $subfieldcode -- $subfield<br>\n" if $debug;
+                           @subfieldlist=($subfields{$subfieldcode}, $subfield);
+                       }
+                       # keep new array
+                       $subfields{$subfieldcode}=\@subfieldlist;
+                   } else {
+                       # subfield doesn't exist yet, keep simple value
+                       $subfields{$subfieldcode}=$subfield;
+                   }
+               }
+               $field{'subfields'}=\%subfields;
+           }
+           push (@record, \%field);
+       } # foreach field in record
+       push (@records, \@record);
+       # $counter++;
+    }
+    print "</pre>" if $debug;
+    return @records;
+} # sub parsemarcfileformat
+
+#----------------------------------------------
+sub taglabel {
+    my ($tag)=@_;
+
+    return $tagtext{$tag};
+
+} # sub taglabel
+
+#---------------------------------------------
+# $Log$
+# Revision 1.2  2002/07/02 20:30:15  tonnesen
+# Merged SimpleMarc.pm over from rel-1-2
+#
+# Revision 1.1.2.4  2002/06/28 14:36:47  amillar
+# Fix broken logic on illustrator vs. add'l author
+#
+# Revision 1.1.2.3  2002/06/26 20:54:32  tonnesen
+# use warnings breaks on perl 5.005...
+#
+# Revision 1.1.2.2  2002/06/26 15:52:55  amillar
+# Fix display of marc tag labels and indicators
+#
+# Revision 1.1.2.1  2002/06/26 07:27:35  amillar
+# Moved acqui.simple MARC handling to new module SimpleMarc.pm
+#
author	tonnesen <tonnesen>
	Tue, 2 Jul 2002 20:30:15 +0000 (20:30 +0000)
committer	tonnesen <tonnesen>
	Tue, 2 Jul 2002 20:30:15 +0000 (20:30 +0000)