#
# This file is part of Koha.
#
-# Koha is free software; you can redistribute it and/or modify it under the
-# terms of the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any later
-# version.
+# Koha is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
#
-# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
-# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+# Koha is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
#
-# You should have received a copy of the GNU General Public License along with
-# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
-# Suite 330, Boston, MA 02111-1307 USA
+# You should have received a copy of the GNU General Public License
+# along with Koha; if not, see <http://www.gnu.org/licenses>.
use strict;
+use warnings;
+
use MARC::Charset qw/marc8_to_utf8/;
use Text::Iconv;
+use C4::Debug;
+use Unicode::Normalize;
+use Encode qw( decode encode is_utf8 );
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
BEGIN {
# set the version for version checking
- $VERSION = 3.01;
+ $VERSION = 3.07.00.049; # NOTE(review): a bare literal with two or more dots is parsed by Perl as a v-string, not a number -- confirm this is intended
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(
+ NormalizeString
IsStringUTF8ish
MarcToUTF8Record
+ SetUTF8Flag
SetMarcUnicodeFlag
StripNonXmlChars
+ nsb_clean
+ SanitizeRecord
);
}
+=encoding UTF-8
+
=head1 NAME
C4::Charset - utilities for handling character set conversions.
=head1 SYNOPSIS
-use C4::Charset;
+ use C4::Charset;
=head1 DESCRIPTION
=head2 IsStringUTF8ish
-=over 4
-
-my $is_utf8 = IsStringUTF8ish($str);
-
-=back
+ my $is_utf8 = IsStringUTF8ish($str);
Determines if C<$str> is valid UTF-8. This can mean
one of two things:
-=over 2
+=over
=item *
sub IsStringUTF8ish {
my $str = shift;
- return 1 if utf8::is_utf8($str);
- return utf8::decode($str);
+ return 1 if Encode::is_utf8($str); # string already carries Perl's UTF8 flag: report it as UTF-8
+ return utf8::decode( $str ); # decodes the local copy in place; the result of that call is what the caller gets (presumably true only for well-formed UTF-8 -- confirm against the utf8 docs)
}
-=head2 MarcToUTF8Record
+=head2 SetUTF8Flag
-=over 4
+ my $marc_record = SetUTF8Flag($marc_record, $nfd);
-($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
+This function sets the PERL UTF8 flag for data.
+It is required when using new_from_usmarc
+since MARC::File::USMARC does not handle PERL UTF8 setting.
+When editing unicode marc records fields and subfields, you
+would end up in double encoding without using this function.
-=back
+If $nfd is set, string normalization will use NFD instead of NFC
+
+FIXME
+In my opinion, this function belongs to MARC::Record and not
+to this package.
+But since it handles charset, and MARC::Record, it finds its way in that package
+
+=cut
+
+sub SetUTF8Flag{
+    # Rebuilds every data field of $record with subfield values passed
+    # through NormalizeString, so that they are flagged Perl character
+    # strings in NFC (or NFD when $nfd is true).
+    #
+    # Parameters:
+    #   $record - record object providing fields(); modified in place
+    #   $nfd    - true to normalize to NFD instead of NFC
+    #
+    # Returns the (modified) record, or nothing when $record is missing
+    # or has no fields.
+    my ($record, $nfd) = @_;
+    return unless ($record && $record->fields());
+
+    foreach my $field ($record->fields()) {
+        # Control fields have no subfields to normalize.  Asking the field
+        # itself avoids a numeric comparison on alphanumeric tags, which
+        # warns under "use warnings" and misclassifies tags such as "9AA".
+        next if $field->is_control_field();
+
+        my @subfields;
+        foreach my $subfield ($field->subfields()) {
+            # Each subfield is a [ code, value ] pair.
+            push @subfields, ( $subfield->[0], NormalizeString( $subfield->[1], $nfd ) );
+        }
+
+        # Building or swapping the field may die (e.g. no subfields at
+        # all); report the problem and carry on with the next field.
+        eval {
+            my $newfield = MARC::Field->new(
+                $field->tag(),
+                $field->indicator(1),
+                $field->indicator(2),
+                @subfields
+            );
+            $field->replace_with($newfield);
+            1;
+        } or warn "ERROR occurred in SetUTF8Flag $@";
+    }
+
+    # The documented calling convention assigns the result, so hand the
+    # record back explicitly.
+    return $record;
+}
+
+=head2 NormalizeString
+
+ my $normalized_string=NormalizeString($string,$nfd,$transform);
+
+Given a string and two optional flags:
+nfd : set it if you want NFD normalization instead of NFC
+transform : set it if you expect all the signs (punctuation) to be replaced
+by spaces and one-letter words to be removed
+
+Sets the Perl UTF8 flag on your initial data if need be
+and applies cleaning if required
+
+Returns a UTF-8 string normalized to NFC (or to NFD when nfd is set)
+
+Sample code :
+ my $string=NormalizeString ("l'ornithoptère", 0, 1);
+ #results into " ornithoptère" (note the leading space) in NFC form and sets UTF8 Flag
+
+=cut
+
+
+sub NormalizeString{
+    # Decode (if needed), Unicode-normalize and optionally clean a string.
+    #   1st argument - the string; undef is handed back untouched
+    #   2nd argument - true selects NFD, false selects NFC
+    #   3rd argument - true additionally blanks punctuation signs, drops
+    #                  one-character words and trims trailing whitespace
+    my ( $text, $use_nfd, $strip_signs ) = @_;
+
+    return $text unless defined($text);    # force scalar context return.
+
+    # Strings without the UTF8 flag are taken to be UTF-8 octets.
+    unless ( Encode::is_utf8($text) ) {
+        $text = Encode::decode( 'UTF-8', $text );
+    }
+
+    $text = $use_nfd ? NFD($text) : NFC($text);
+
+    # No cleaning requested: the normalized string is the result.
+    return $text unless $strip_signs;
+
+    $text =~ s/\<|\>|\^|\;|\.|\?|,|\-|\(|\)|\[|\]|\{|\}|\$|\%|\!|\*|\:|\\|\/|\&|\"|\'/ /g;
+
+    # removing one letter words: "d'" "l'" were changed into "d " "l " above
+    $text =~ s/\b\S\b//g;
+    $text =~ s/\s+$//g;
+
+    return $text;
+}
+
+=head2 MarcToUTF8Record
+
+ ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob,
+ $marc_flavour, [, $source_encoding]);
Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
optional source encoding, return a C<MARC::Record> that is
my $marc = shift;
my $marc_flavour = shift;
my $source_encoding = shift;
-
my $marc_record;
my $marc_blob_is_utf8 = 0;
if (ref($marc) eq 'MARC::Record') {
# If we do not know the source encoding, try some guesses
# as follows:
# 1. Record is UTF-8 already.
- # 2. If MARC flavor is MARC21, then
+ # 2. If MARC flavor is MARC21 or NORMARC, then
# a. record is MARC-8
# b. record is ISO-8859-1
# 3. If MARC flavor is UNIMARC, then
if (not defined $source_encoding) {
if ($marc_blob_is_utf8) {
- # note that for MARC21 we are not bothering to check
+ # note that for MARC21/NORMARC we are not bothering to check
# if the Leader/09 is set to 'a' or not -- because
# of problems with various ILSs (including Koha in the
# past, alas), this just is not trustworthy.
SetMarcUnicodeFlag($marc_record, $marc_flavour);
return $marc_record, 'UTF-8', [];
} else {
- if ($marc_flavour eq 'MARC21') {
+ if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
- } elsif ($marc_flavour eq 'UNIMARC') {
+ } elsif ($marc_flavour =~/UNIMARC/) {
return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
} else {
return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
@errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
} else {
# assume any other character encoding is for Text::Iconv
- @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
+ @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
}
if (@errors) {
=head2 SetMarcUnicodeFlag
-=over 4
-
-SetMarcUnicodeFlag($marc_record, $marc_flavour);
-
-=back
+ SetMarcUnicodeFlag($marc_record, $marc_flavour);
Set both the internal MARC::Record encoding flag
and the appropriate Leader/09 (MARC21) or
my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
$marc_record->encoding('UTF-8');
- if ($marc_flavour eq 'MARC21') {
+ if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') {
my $leader = $marc_record->leader();
substr($leader, 9, 1) = 'a';
$marc_record->leader($leader);
- } elsif ($marc_flavour eq "UNIMARC") {
- if (my $field = $marc_record->field('100')) {
- my $sfa = $field->subfield('a');
-
- my $subflength = 36;
- # fix the length of the field
- $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
- $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
-
- substr($sfa, 26, 4) = '50 ';
- $field->update('a' => $sfa);
+ } elsif ($marc_flavour =~/UNIMARC/) {
+ require C4::Context;
+ my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
+ $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
+ my $string;
+ my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
+ $string=$marc_record->subfield( 100, "a" );
+ if (defined $string && length($string)==$subflength) {
+ $string = substr $string, 0,$subflength if (length($string)>$subflength);
+ }
+ else {
+ $string = POSIX::strftime( "%Y%m%d", localtime );
+ $string =~ s/\-//g;
+ $string = sprintf( "%-*s", $subflength, $string );
+ substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
+ }
+ substr( $string, $encodingposition, 3, "y50" );
+ if ( $marc_record->subfield( 100, "a" ) ) {
+ $marc_record->field('100')->update(a=>$string);
+ }
+ else {
+ $marc_record->insert_grouped_field(
+ MARC::Field->new( 100, '', '', "a" => $string ) );
}
+ $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
} else {
warn "Unrecognized marcflavour: $marc_flavour";
}
=head2 StripNonXmlChars
-=over 4
-
-my $new_str = StripNonXmlChars($old_str);
-
-=back
+ my $new_str = StripNonXmlChars($old_str);
Given a string, return a copy with the
characters that are illegal in XML
sub StripNonXmlChars {
my $str = shift;
+ if (!defined($str) || $str eq ""){ # undefined or empty input yields "" instead of reaching the substitution below
+ return "";
+ }
$str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
return $str;
}
-=head1 INTERNAL FUNCTIONS
-=head2 _default_marc21_charconv_to_utf8
+
+=head2 nsb_clean
=over 4
-my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
+nsb_clean($string);
=back
+Removes Non Sorting Block characters
+
+=cut
+sub nsb_clean {
+    # Removes the Non Sorting Block delimiters from a string and returns
+    # the cleaned copy.  Undefined input is returned unchanged.
+    #
+    # Works on a lexical copy only: the global $_ is left alone, so callers
+    # iterating with an implicit $_ are not clobbered and a read-only $_
+    # alias cannot make this die.
+    my ($string) = @_;
+    return $string unless defined $string;
+
+    my $nsb = qr/[\x88\x98]/;    # NSB : begin Non Sorting Block (either code)
+    my $nse = qr/[\x89\x9C]/;    # NSE : Non Sorting Block end (either code)
+
+    # An optional leading \xC2 is removed together with the delimiter.
+    # 0xC2 is the lead octet UTF-8 uses for U+0080..U+00BF, so presumably
+    # it shows up when the data is still undecoded UTF-8 octets.
+    $string =~ s/\xC2?$nsb//g;
+    $string =~ s/\xC2?$nse//g;
+
+    return $string;
+}
+
+
+=head2 SanitizeRecord
+
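+ my ($record, $record_modified) = SanitizeRecord($marcrecord, $biblionumber);
+
+Sanitize a record.
+This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
+It cleans any string containing '&amp;', '&amp;amp;', ..., replacing it by '&'.
+C<$biblionumber> is used to look up the framework code of the record.
+Returns the record and a flag set to 1 if any value was changed, 0 otherwise.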
+
+=cut
+
+sub SanitizeRecord {
+    # Collapses '&amp;', '&amp;amp;', ... into '&' in every control field
+    # and subfield of $record, except the subfield mapped to
+    # biblioitems.url, which is copied over untouched.
+    #
+    # Parameters:
+    #   $record       - record object providing fields(); modified in place
+    #   $biblionumber - passed to C4::Biblio::GetFrameworkCode to find the
+    #                   framework used for the URL mapping lookup
+    #
+    # Returns ( $record, $record_modified ) where $record_modified is 1 if
+    # at least one value was changed and 0 otherwise.
+    my ( $record, $biblionumber ) = @_;
+    my $record_modified = 0;
+    my $frameworkcode   = C4::Biblio::GetFrameworkCode($biblionumber);
+    my ( $url_field, $url_subfield ) =
+      C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );
+
+    # The mapping may be absent; guard so the comparisons below never run
+    # against undef.
+    my $has_url_mapping = defined $url_field && defined $url_subfield;
+
+    foreach my $field ( $record->fields() ) {
+        if ( $field->is_control_field() ) {
+            my $value           = $field->data();
+            my $sanitized_value = _clean_ampersand($value);
+            $record_modified = 1 if $sanitized_value ne $value;
+            $field->update($sanitized_value);
+            next;
+        }
+
+        my @new_subfields;
+        foreach my $subfield ( $field->subfields() ) {
+            my ( $code, $value ) = @{$subfield};
+
+            # URLs must not be sanitized, but they still have to be kept:
+            # the field is rebuilt from @new_subfields, so a subfield left
+            # out here would silently disappear from the record.
+            my $is_url =
+                 $has_url_mapping
+              && $url_field eq $field->tag()
+              && $url_subfield eq $code;
+
+            my $sanitized_value = $is_url ? $value : _clean_ampersand($value);
+            push @new_subfields, $code => $sanitized_value;
+            $record_modified = 1 if $sanitized_value ne $value;
+        }
+        next unless @new_subfields;
+
+        my $new_field = eval {
+            MARC::Field->new(
+                $field->tag(), $field->indicator(1),
+                $field->indicator(2), @new_subfields
+            );
+        };
+        if ($new_field) {
+            $field->replace_with($new_field);
+        }
+        else {
+            # Construction failed: keep the original field and report why.
+            warn "error : $@";
+        }
+    }
+
+    return $record, $record_modified;
+}
+
+sub _clean_ampersand {
+    # Returns a copy of the argument in which every '&' followed by one or
+    # more 'amp;' is reduced to a single '&'.
+    my $text = shift;
+    $text =~ s/(&)(amp;)+/$1/g;
+    return $text;
+}
+
+=head1 INTERNAL FUNCTIONS
+
+=head2 _default_marc21_charconv_to_utf8
+
+ my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
+
Converts a C<MARC::Record> of unknown character set to UTF-8,
first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
to UTF-8, then a default conversion that replaces each non-ASCII
=head2 _default_unimarc_charconv_to_utf8
-=over 4
-
-my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
-
-=back
+ my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
Converts a C<MARC::Record> of unknown character set to UTF-8,
first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
=head2 _marc_marc8_to_utf8
-=over 4
-
-my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
-
-=back
+ my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
If the conversion fails for some reason, an
# occurs, upgrade the string in place. Moral of the story seems to be
# that pack("U", ...) is better than chr(...) if you need to guarantee
# that the resulting string is UTF-8.
- utf8::upgrade($utf8sf);
+ $utf8sf = Encode::encode('UTF-8', $utf8sf);
}
push @converted_subfields, $subfield->[0], $utf8sf;
}
=head2 _marc_iso5426_to_utf8
-=over 4
-
-my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
-
-=back
+ my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
If the conversion fails for some reason, an
=head2 _marc_to_utf8_via_text_iconv
-=over 4
-
-my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
-
-=back
+ my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
Convert a C<MARC::Record> to UTF-8 in-place using the
C<Text::Iconv> CPAN module. Any source encoding accepted
=head2 _marc_to_utf8_replacement_char
-=over 4
-
-_marc_to_utf8_replacement_char($marc_record, $marc_flavour);
-
-=back
+ _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
Convert a C<MARC::Record> to UTF-8 in-place, adopting the
unsatisfactory method of replacing all non-ASCII (e.g.,
=head2 char_decode5426
-=over 4
-
-my $utf8string = char_decode5426($iso_5426_string);
-
-=back
+ my $utf8string = char_decode5426($iso_5426_string);
Converts a string from ISO-5426 to UTF-8.
$chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
# $chars{0xb4}='è';
$chars{0xb4}=0x00e8;
+$chars{0xbd}=0x02b9;
+$chars{0xbe}=0x02ba;
# $chars{0xb5}='é';
$chars{0xb5}=0x00e9;
$chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
$chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
-$chars{0xfa}=0x0153;#oe
+$chars{0xfa}=0x0153; #oe
+$chars{0xea}=0x0152; #oe
$chars{0x81d1}=0x00b0;
####
=head1 AUTHOR
-Koha Development Team <info@koha.org>
+Koha Development Team <http://koha-community.org/>
Galen Charlton <galen.charlton@liblime.com>