5 package C4::SimpleMarc;
7 # Routines for handling import of MARC data into Koha db
9 # Koha library project www.koha.org
11 # Licensed under the GPL
14 # Copyright 2000-2002 Katipo Communications
16 # This file is part of Koha.
18 # Koha is free software; you can redistribute it and/or modify it under the
19 # terms of the GNU General Public License as published by the Free Software
20 # Foundation; either version 2 of the License, or (at your option) any later
23 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
24 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
25 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
27 # You should have received a copy of the GNU General Public License along with
28 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
29 # Suite 330, Boston, MA 02111-1307 USA
33 # standard or CPAN modules used
40 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
42 # set the version for version checking
47 C4::SimpleMarc - Functions for parsing MARC records and files
55 This module provides functions for parsing MARC records and files.
71 %EXPORT_TAGS = ( ); # eg: TAG => [ qw!name1 name2! ],
73 # your exported package globals go here,
74 # as well as any optionally exported functions
76 # FIXME - %tagtext and %tagmap are in both @EXPORT and @EXPORT_OK.
77 # They should be in one or the other, but not both (though preferably,
78 # things shouldn't get exported in the first place).
84 # non-exported package globals go here
85 use vars qw(@more $stuff);
87 # initalize package globals, first exported ones
92 # then the others (which are still accessible as $Some::Module::stuff)
96 # all file-scoped lexicals must be created before
97 # the functions below that use them.
99 # file-private lexicals go here
101 my %secret_hash = ();
103 # here's a file-private function as a closure,
104 # callable as &$priv_func; it cannot be prototyped.
105 my $priv_func = sub {
109 # make all your functions, whether exported or not;
110 #------------------------------------------------
115 # %tagtext maps MARC tags to descriptive names.
118 '001' => 'Control number',
119 '003' => 'Control number identifier',
120 '005' => 'Date and time of latest transaction',
121 '006' => 'Fixed-length data elements -- additional material characteristics',
122 '007' => 'Physical description fixed field',
123 '008' => 'Fixed length data elements',
125 '015' => 'National library CN',
128 '024' => 'Other standard ID',
129 '035' => 'System control number',
130 '037' => 'Source of acquisition',
131 '040' => 'Cataloging source',
132 '041' => 'Language code',
133 '043' => 'Geographic area code',
134 '043' => 'Publishing country code',
135 '050' => 'Library of Congress call number',
136 '055' => 'Canadian classification number',
137 '060' => 'National Library of Medicine call number',
138 '082' => 'Dewey decimal call number',
139 '100' => 'Main entry -- Personal name',
140 '110' => 'Main entry -- Corporate name',
141 '130' => 'Main entry -- Uniform title',
142 '240' => 'Uniform title',
143 '245' => 'Title statement',
144 '246' => 'Varying form of title',
145 '250' => 'Edition statement',
146 '256' => 'Computer file characteristics',
147 '260' => 'Publication, distribution, etc.',
148 '263' => 'Projected publication date',
149 '300' => 'Physical description',
150 '306' => 'Playing time',
151 '440' => 'Series statement / Added entry -- Title',
152 '490' => 'Series statement',
153 '500' => 'General note',
154 '504' => 'Bibliography, etc. note',
155 '505' => 'Formatted contents note',
156 '508' => 'Creation/production credits note',
157 '510' => 'Citation/references note',
158 '511' => 'Participant or performer note',
159 '520' => 'Summary, etc. note',
160 '521' => 'Target audience note (ie age)',
161 '530' => 'Additional physical form available note',
162 '538' => 'System details note',
163 '586' => 'Awards note',
164 '600' => 'Subject added entry -- Personal name',
165 '610' => 'Subject added entry -- Corporate name',
166 '650' => 'Subject added entry -- Topical term',
167 '651' => 'Subject added entry -- Geographic name',
168 '656' => 'Index term -- Occupation',
169 '700' => 'Added entry -- Personal name',
170 '710' => 'Added entry -- Corporate name',
171 '730' => 'Added entry -- Uniform title',
172 '740' => 'Added entry -- Uncontrolled related/analytical title',
173 '800' => 'Series added entry -- Personal name',
174 '830' => 'Series added entry -- Uniform title',
176 '856' => 'Electronic location and access',
179 # tag, subfield, field name, repeats, striptrailingchars
180 # FIXME - What is this? Can it be explained without a semester-long
183 # XXX - Maps MARC (field, subfield) tuples to Koha database field
184 # names (presumably in 'biblioitems'). $tagmap{$field}->{$subfield} is
185 # an anonymous hash of the form
187 # name => "title", # Name of Koha field
188 # rpt => 0, # I don't know what this is, but
190 # striptrail => ',:;/-', # Lists the set of characters that
191 # # should be stripped from the end
192 # # of the MARC field.
196 '010'=>{'a'=>{name=> 'lccn', rpt=>0, striptrail=>' ' }},
197 '015'=>{'a'=>{name=> 'lccn', rpt=>0 }},
198 '020'=>{'a'=>{name=> 'isbn', rpt=>0 }},
199 '022'=>{'a'=>{name=> 'issn', rpt=>0 }},
200 '082'=>{'a'=>{name=> 'dewey', rpt=>0 }},
201 '100'=>{'a'=>{name=> 'author', rpt=>0, striptrail=>',:;/-' }},
202 '245'=>{'a'=>{name=> 'title', rpt=>0, striptrail=>',:;/' },
203 'b'=>{name=> 'subtitle', rpt=>0, striptrail=>',:;/' }},
204 '260'=>{'a'=>{name=> 'place', rpt=>0, striptrail=>',:;/-' },
205 'b'=>{name=> 'publisher', rpt=>0, striptrail=>',:;/-' },
206 'c'=>{name=> 'year' , rpt=>0, striptrail=>'.,:;/-' }},
207 '300'=>{'a'=>{name=> 'pages', rpt=>0, striptrail=>',:;/-' },
208 'c'=>{name=> 'size', rpt=>0, striptrail=>',:;/-' }},
209 '362'=>{'a'=>{name=> 'volume-number', rpt=>0 }},
210 '440'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' },
211 'v'=>{name=> 'volume-number',rpt=>0 }},
212 '490'=>{'a'=>{name=> 'seriestitle', rpt=>0, striptrail=>',:;/' },
213 'v'=>{name=> 'volume-number',rpt=>0 }},
214 '700'=>{'a'=>{name=> 'addtional-author-illus',rpt=>1, striptrail=>',:;/' }},
215 '5xx'=>{'a'=>{name=> 'notes', rpt=>1 }},
216 '65x'=>{'a'=>{name=> 'subject', rpt=>1, striptrail=>'.,:;/-' }},
222 =item extractmarcfields
224 $biblioitem = &extractmarcfields($marc_record);
226 C<$marc_record> is a reference-to-array representing a MARC record;
227 each element is a reference-to-hash specifying a MARC field (possibly
230 C<&extractmarcfields> translates C<$marc_record> into a Koha
231 biblioitem. C<$biblioitem> is a reference-to-hash whose keys are named
232 after fields in the biblioitems table of the Koha database.
236 # FIXME - Throughout:
237 # $foo->{bar}->[baz]->{quux}
238 # can be rewritten as
239 # $foo->{bar}[baz]{quux}
240 sub extractmarcfields {
244 $record, # pointer to list of MARC field hashes.
245 # Example: $record->[0]->{'tag'} = '100' # Author
246 # $record->[0]->{'subfields'}->{'a'} = subfieldvalue
250 my $bib; # pointer to hash of named output fields
251 # Example: $bib->{'author'} = "Twain, Mark";
258 $subfield, # Marc subfield [a-z]
259 $fieldname, # name of field "author", "title", etc.
260 $strip, # chars to remove from end of field
261 $stripregex, # reg exp pattern
263 my ($lccn, $isbn, $issn,
264 $publicationyear, @subjects, $subject,
266 $notes, $additionalauthors, $illustrator, $copyrightdate,
267 $s, $subdivision, $subjectsubfield,
270 print "<PRE>\n" if $debug;
272 if ( ref($record) eq "ARRAY" ) {
273 foreach $field (@$record) {
275 # Check each subfield in field
276 # FIXME - Would this code be more readable with
277 # while (($subfieldname, $subfield) = each %{$field->{subfields}})
279 foreach $subfield ( keys %{$field->{subfields}} ) {
280 # see if it is defined in our Marc to koha mapping table
281 # FIXME - This if-clause takes up the entire loop.
282 # This would be better rewritten as
283 # next unless defined($tagmap{...});
284 # Then the body of the loop doesn't have to be
286 if ( $fieldname=$tagmap{ $field->{'tag'} }->{$subfield}->{name} ) {
287 # Yes, so keep the value
288 if ( ref($field->{'subfields'}->{$subfield} ) eq 'ARRAY' ) {
289 # if it was an array, just keep first element.
290 $bib->{$fieldname}=$field->{'subfields'}->{$subfield}[0];
292 $bib->{$fieldname}=$field->{'subfields'}->{$subfield};
294 print "$field->{'tag'} $subfield $fieldname=$bib->{$fieldname}\n" if $debug;
295 # see if this field should have trailing chars dropped
296 if ($strip=$tagmap{ $field->{'tag'} }->{$subfield}->{striptrail} ) {
297 # FIXME - The next three lines can be rewritten as:
298 # $bib =~ s/[\Q$strip\E]+$//;
299 $strip=~s//\\/; # backquote each char
300 $stripregex='[ ' . $strip . ']+$'; # remove trailing spaces also
301 $bib->{$fieldname}=~s/$stripregex//;
302 # also strip leading spaces
303 $bib->{$fieldname}=~s/^ +//;
305 print "Found subfield $field->{'tag'} $subfield " .
306 "$fieldname = $bib->{$fieldname}\n" if $debug;
311 # Handle special fields and tags
312 if ($field->{'tag'} eq '001') {
313 $bib->{controlnumber}=$field->{'indicator'};
315 if ($field->{'tag'} eq '015') {
316 # FIXME - I think this can be rewritten as
317 # $field->{"subfields"}{"a"} =~ /^\s*C?(\S+)/ and
318 # $bib->{"lccn"} = $1;
319 # This might break with invalid input, though.
320 $bib->{lccn}=$field->{'subfields'}->{'a'};
321 $bib->{lccn}=~s/^\s*//;
322 $bib->{lccn}=~s/^C//;
323 ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0];
327 # FIXME - Fix indentation
328 if ($field->{'tag'} eq '260') {
330 $publicationyear=$field->{'subfields'}->{'c'};
331 # FIXME - "\d\d\d\d" can be rewritten as "\d{4}"
332 if ($publicationyear=~/c(\d\d\d\d)/) {
335 if ($publicationyear=~/[^c](\d\d\d\d)/) {
337 } elsif ($copyrightdate) {
338 $publicationyear=$copyrightdate;
340 $publicationyear=~/(\d\d\d\d)/;
344 if ($field->{'tag'} eq '700') {
345 my $name=$field->{'subfields'}->{'a'};
346 if ( defined($field->{'subfields'}->{'e'})
347 and $field->{'subfields'}->{'e'}=~/ill/) {
350 $additionalauthors.="$name\n";
353 if ($field->{'tag'} =~/^5/) {
354 $notes.="$field->{'subfields'}->{'a'}\n";
356 if ($field->{'tag'} =~/65\d/) {
357 my $sub; # FIXME - Never used
358 my $subject=$field->{'subfields'}->{'a'};
360 print "Subject=$subject\n" if $debug;
361 foreach $subjectsubfield ( 'x','y','z' ) {
362 # FIXME - $subdivision is only used in this
363 # loop. Make it 'my' here, rather than in the
365 # Ditto $subjectsubfield. Make it 'my' in the
366 # 'foreach' statement.
367 if ($subdivision=$field->{'subfields'}->{$subjectsubfield}) {
368 if ( ref($subdivision) eq 'ARRAY' ) {
369 foreach $s (@$subdivision) {
372 } # foreach subdivision
374 $subdivision=~s/\.$//;
375 $subject.=" -- $subdivision";
377 } # if subfield exists
379 print "Subject=$subject\n" if $debug;
380 push @subjects, $subject;
385 # FIXME - Why not do this up in the "Handle special fields and
387 ($publicationyear ) && ($bib->{publicationyear}=$publicationyear );
388 ($copyrightdate ) && ($bib->{copyrightdate}=$copyrightdate );
389 ($additionalauthors ) && ($bib->{additionalauthors}=$additionalauthors );
390 ($illustrator ) && ($bib->{illustrator}=$illustrator );
391 ($notes ) && ($bib->{notes}=$notes );
392 ($#subjects ) && ($bib->{subject}=\@subjects );
393 # FIXME - This doesn't look right: for an array with
394 # one element, $#subjects == 0, which is false. For an
395 # array with 0 elements, $#subjects == -1, which is
400 $bib->{dewey}=~s/\///g; # drop any slashes
401 # FIXME - Why? Don't the
402 # slashes mean something?
403 # The Dewey code is NOT a number,
408 ($bib->{lccn}) = (split(/\s+/, $bib->{lccn}))[0]; # only keep first word
411 if ( $bib->{isbn} ) {
412 $bib->{isbn}=~s/[^\d]*//g; # drop non-digits
413 # FIXME - "[^\d]" can be rewritten as "\D"
414 # FIXME - Does this include the check digit? If so,
418 if ( $bib->{issn} ) {
419 $bib->{issn}=~s/^\s*//;
420 ($bib->{issn}) = (split(/\s+/, $bib->{issn}))[0];
423 if ( $bib->{'volume-number'} ) {
424 if ($bib->{'volume-number'}=~/(\d+).*(\d+)/ ) {
428 $bib->{volume}=$bib->{'volume-number'};
430 delete $bib->{'volume-number'};
434 # FIXME - Style: this sort of error-checking should really go
435 # closer to the actual test, e.g.:
436 # if (ref($record) ne "ARRAY")
438 # die "Not an array!"
440 # then the rest of the code which follows can assume that the
441 # input is good, and you don't have to indent as much.
442 print "Error: extractmarcfields: input ref $record is " .
443 ref($record) . " not ARRAY. Contact sysadmin.\n";
445 print "</PRE>\n" if $debug;
449 } # sub extractmarcfields
450 #---------------------------------
452 #--------------------------
454 =item parsemarcfileformat
456 @records = &parsemarcfileformat($marc_data);
458 Parses the contents of a MARC file.
460 C<$marc_data> is a string, the contents of a MARC file.
461 C<&parsemarcfileformat> parses this string into individual MARC
462 records and returns them.
464 C<@records> is an array of references-to-hash. Each element is a MARC
465 record; its keys are the MARC tags.
469 # Parse MARC data in file format with control-character separators
470 # May be multiple records.
471 # FIXME - Is the input ever likely to be more than a few Kb? If so, it
472 # might be worth changing this function to take a (read-only)
473 # reference-to-string, to avoid unnecessary copying.
474 sub parsemarcfileformat {
476 # Input is one big text string
478 # Output is list of records. Each record is list of field hashes
481 my $splitchar=chr(29); # \c]
482 my $splitchar2=chr(30); # \c^
483 my $splitchar3=chr(31); # \c_
486 foreach $record (split(/$splitchar/, $data)) {
493 my $leader=substr($record,0,24);
494 print "<pre>parse Leader:$leader</pre>\n" if $debug;
497 'indicator' => $leader ,
500 $record=substr($record,24);
501 foreach $field (split(/$splitchar2/, $record)) {
505 unless ($directory) {
506 # If we didn't already find a directory, extract one.
513 while ($item=substr($directory,0,12)) {
514 # Pull out location of first field
515 $tag=substr($directory,0,3);
516 $length=substr($directory,3,4);
517 $start=substr($directory,7,6);
519 # Bump to next directory entry
520 $directory=substr($directory,12);
521 $tag{$counter2}=$tag;
527 $tag=$tag{$tagcounter};
530 my @subfields=split(/$splitchar3/, $field);
531 $indicator=$subfields[0];
532 $field{'indicator'}=$indicator;
533 print "<pre>parse indicator:$indicator</pre>\n" if $debug;
535 unless ($#subfields==0) {
539 for ($i=1; $i<=$#subfields; $i++) {
540 my $text=$subfields[$i];
541 my $subfieldcode=substr($text,0,1);
542 my $subfield=substr($text,1);
543 # if this subfield already exists, do array
544 if ($subfields{$subfieldcode}) {
545 my $subfieldlist=$subfields{$subfieldcode};
546 if ( ref($subfieldlist) eq 'ARRAY' ) {
547 # Already an array, add on to it
548 print "$tag Adding to array $subfieldcode -- $subfield<br>\n" if $debug;
549 @subfieldlist=@$subfieldlist;
550 push (@subfieldlist, $subfield);
552 # Change simple value to array
553 print "$tag Arraying $subfieldcode -- $subfield<br>\n" if $debug;
554 @subfieldlist=($subfields{$subfieldcode}, $subfield);
557 $subfields{$subfieldcode}=\@subfieldlist;
559 # subfield doesn't exist yet, keep simple value
560 $subfields{$subfieldcode}=$subfield;
563 $field{'subfields'}=\%subfields;
565 push (@record, \%field);
566 } # foreach field in record
567 push (@records, \@record);
570 print "</pre>" if $debug;
572 } # sub parsemarcfileformat
574 #----------------------------------------------
578 $label = &taglabel($tag);
580 Converts a MARC tag (a three-digit number, or "LDR") and returns a
583 Note that although the tag looks like a number, it is treated here as
584 a string. Be sure to use
586 $label = &taglabel("082");
590 $label = &taglabel(082); # <-- Invalid octal number!
594 # FIXME - Does this function mean that %tagtext doesn't need to be
599 return $tagtext{$tag};
605 #---------------------------------------------
607 # Revision 1.6 2002/10/10 04:44:28 arensb
608 # Added whitespace to make the POD work.
610 # Revision 1.5 2002/10/07 00:51:22 arensb
611 # Added POD and some comments.
613 # Revision 1.4 2002/10/05 09:53:11 arensb
614 # Merged with arensb-context branch: use C4::Context->dbh instead of
615 # &C4Connect, and generally prefer C4::Context over C4::Database.
617 # Revision 1.3.2.1 2002/10/04 02:57:38 arensb
618 # Removed useless "use C4::Database;" line.
620 # Revision 1.3 2002/08/14 18:12:52 tonnesen
621 # Added copyright statement to all .pl and .pm files
623 # Revision 1.2 2002/07/02 20:30:15 tonnesen
624 # Merged SimpleMarc.pm over from rel-1-2
626 # Revision 1.1.2.4 2002/06/28 14:36:47 amillar
627 # Fix broken logic on illustrator vs. add'l author
629 # Revision 1.1.2.3 2002/06/26 20:54:32 tonnesen
630 # use warnings breaks on perl 5.005...
632 # Revision 1.1.2.2 2002/06/26 15:52:55 amillar
633 # Fix display of marc tag labels and indicators
635 # Revision 1.1.2.1 2002/06/26 07:27:35 amillar
636 # Moved acqui.simple MARC handling to new module SimpleMarc.pm
644 Koha Developement team <info@koha.org>