X-Git-Url: http://koha-dev.rot13.org:8081/gitweb/?a=blobdiff_plain;f=C4%2FCharset.pm;h=a4e6b716f8e5b44d251c0cebb1475e827ec9ac88;hb=c548761bc190d9b6a5f2989c0ab01b54f4d61879;hp=e39637acf3d94cf2329aec4142ecfd8e266e296f;hpb=9d1e7f43e15b869afc3fccd80c1545170cc84ea0;p=koha_gimpoz diff --git a/C4/Charset.pm b/C4/Charset.pm index e39637acf3..a4e6b716f8 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -13,9 +13,9 @@ package C4::Charset; # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR # A PARTICULAR PURPOSE. See the GNU General Public License for more details. # -# You should have received a copy of the GNU General Public License along with -# Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place, -# Suite 330, Boston, MA 02111-1307 USA +# You should have received a copy of the GNU General Public License along +# with Koha; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. use strict; use warnings; @@ -33,6 +33,7 @@ BEGIN { require Exporter; @ISA = qw(Exporter); @EXPORT = qw( + NormalizeString IsStringUTF8ish MarcToUTF8Record SetUTF8Flag @@ -47,7 +48,7 @@ C4::Charset - utilities for handling character set conversions. =head1 SYNOPSIS -use C4::Charset; + use C4::Charset; =head1 DESCRIPTION @@ -76,16 +77,12 @@ on how to deal with the situation. =head2 IsStringUTF8ish -=over 4 - -my $is_utf8 = IsStringUTF8ish($str); - -=back + my $is_utf8 = IsStringUTF8ish($str); Determines if C<$str> is valid UTF-8. This can mean one of two things: -=over 2 +=over =item * @@ -115,11 +112,7 @@ sub IsStringUTF8ish { =head2 SetUTF8Flag -=over 4 - -my $marc_record = SetUTF8Flag($marc_record); - -=back + my $marc_record = SetUTF8Flag($marc_record, $nfd); This function sets the PERL UTF8 flag for data. It is required when using new_from_usmarc @@ -127,6 +120,8 @@ since MARC::File::USMARC does not handle PERL UTF8 setting. When editing unicode marc records fields and subfields, you would end up in double encoding without using this function. +If $nfd is set, string normalization will use NFD instead of NFC + FIXME In my opinion, this function belongs to MARC::Record and not to this package. @@ -135,13 +130,13 @@ But since it handles charset, and MARC::Record, it finds its way in that package =cut sub SetUTF8Flag{ - my ($record)=@_; + my ($record, $nfd)=@_; return unless ($record && $record->fields()); foreach my $field ($record->fields()){ if ($field->tag()>=10){ my @subfields; foreach my $subfield ($field->subfields()){ - push @subfields,($$subfield[0],NormalizeString($$subfield[1])); + push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd)); } my $newfield=MARC::Field->new( $field->tag(), @@ -156,29 +151,28 @@ sub SetUTF8Flag{ =head2 NormalizeString -=over 4 + my $normalized_string=NormalizeString($string,$nfd,$transform); - my $normalized_string=NormalizeString($string); +Given a string +nfd : If you want to set NFD and not NFC +transform : If you expect all the signs to be removed + +Sets the PERL UTF8 Flag on your initial data if need be +and applies cleaning if required + +Returns a utf8 NFC normalized string + +Sample code : + my $string=NormalizeString ("l'ornithoptère"); + #results into ornithoptère in NFC form and sets UTF8 Flag -=back - Given - a string - nfc : If you want to set NFC and not NFD - transform : If you expect all the signs to be removed - Sets the PERL UTF8 Flag on your initial data if need be - and applies cleaning if required - - Returns a utf8 NFD normalized string - - Sample code : - my $string=NormalizeString ("l'ornithoptère"); - #results into ornithoptère in NFD form and sets UTF8 Flag =cut + sub NormalizeString{ - my ($string,$nfc,$transform)=@_; + my ($string,$nfd,$transform)=@_; utf8::decode($string) unless (utf8::is_utf8($string)); - if ($nfc){ + if ($nfd){ $string= NFD($string); } else { @@ -195,11 +189,8 @@ sub NormalizeString{ =head2 MarcToUTF8Record -=over 4 - -($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]); - -=back + ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, + $marc_flavour, [, $source_encoding]); Given a MARC blob or a C, the MARC flavour, and an optional source encoding, return a C that is @@ -259,20 +250,20 @@ sub MarcToUTF8Record { # If we do not know the source encoding, try some guesses # as follows: # 1. Record is UTF-8 already. - # 2. If MARC flavor is MARC21, then + # 2. If MARC flavor is MARC21 or NORMARC, then # a. record is MARC-8 # b. record is ISO-8859-1 # 3. If MARC flavor is UNIMARC, then if (not defined $source_encoding) { if ($marc_blob_is_utf8) { - # note that for MARC21 we are not bothering to check + # note that for MARC21/NORMARC we are not bothering to check # if the Leader/09 is set to 'a' or not -- because # of problems with various ILSs (including Koha in the # past, alas), this just is not trustworthy. SetMarcUnicodeFlag($marc_record, $marc_flavour); return $marc_record, 'UTF-8', []; } else { - if ($marc_flavour eq 'MARC21') { + if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour); } elsif ($marc_flavour =~/UNIMARC/) { return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour); @@ -312,11 +303,7 @@ sub MarcToUTF8Record { =head2 SetMarcUnicodeFlag -=over 4 - -SetMarcUnicodeFlag($marc_record, $marc_flavour); - -=back + SetMarcUnicodeFlag($marc_record, $marc_flavour); Set both the internal MARC::Record encoding flag and the appropriate Leader/09 (MARC21) or @@ -331,7 +318,7 @@ sub SetMarcUnicodeFlag { my $marc_flavour = shift; # || C4::Context->preference("marcflavour"); $marc_record->encoding('UTF-8'); - if ($marc_flavour eq 'MARC21') { + if ($marc_flavour eq 'MARC21' || $marc_flavour eq 'NORMARC') { my $leader = $marc_record->leader(); substr($leader, 9, 1) = 'a'; $marc_record->leader($leader); @@ -363,11 +350,7 @@ sub SetMarcUnicodeFlag { =head2 StripNonXmlChars -=over 4 - -my $new_str = StripNonXmlChars($old_str); - -=back + my $new_str = StripNonXmlChars($old_str); Given a string, return a copy with the characters that are illegal in XML @@ -403,11 +386,7 @@ sub StripNonXmlChars { =head2 _default_marc21_charconv_to_utf8 -=over 4 - -my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record); - -=back + my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record); Converts a C of unknown character set to UTF-8, first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1 @@ -449,11 +428,7 @@ sub _default_marc21_charconv_to_utf8 { =head2 _default_unimarc_charconv_to_utf8 -=over 4 - -my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record); - -=back + my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record); Converts a C of unknown character set to UTF-8, first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1 @@ -493,11 +468,7 @@ sub _default_unimarc_charconv_to_utf8 { =head2 _marc_marc8_to_utf8 -=over 4 - -my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding); - -=back + my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding); Convert a C to UTF-8 in-place from MARC-8. If the conversion fails for some reason, an @@ -568,11 +539,7 @@ sub _marc_marc8_to_utf8 { =head2 _marc_iso5426_to_utf8 -=over 4 - -my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding); - -=back + my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding); Convert a C to UTF-8 in-place from ISO-5426. If the conversion fails for some reason, an @@ -614,11 +581,7 @@ sub _marc_iso5426_to_utf8 { =head2 _marc_to_utf8_via_text_iconv -=over 4 - -my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding); - -=back + my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding); Convert a C to UTF-8 in-place using the C CPAN module. Any source encoding accepted @@ -687,11 +650,7 @@ sub _marc_to_utf8_via_text_iconv { =head2 _marc_to_utf8_replacement_char -=over 4 - -_marc_to_utf8_replacement_char($marc_record, $marc_flavour); - -=back + _marc_to_utf8_replacement_char($marc_record, $marc_flavour); Convert a C to UTF-8 in-place, adopting the unsatisfactory method of replacing all non-ASCII (e.g., @@ -730,11 +689,7 @@ sub _marc_to_utf8_replacement_char { =head2 char_decode5426 -=over 4 - -my $utf8string = char_decode5426($iso_5426_string); - -=back + my $utf8string = char_decode5426($iso_5426_string); Converts a string from ISO-5426 to UTF-8. @@ -750,11 +705,14 @@ $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark # $chars{0xb4}='è'; $chars{0xb4}=0x00e8; +$chars{0xbd}=0x02b9; +$chars{0xbe}=0x02ba; # $chars{0xb5}='é'; $chars{0xb5}=0x00e9; $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark -$chars{0xfa}=0x0153;#oe +$chars{0xfa}=0x0153; #oe +$chars{0xea}=0x0152; #oe $chars{0x81d1}=0x00b0; #### @@ -1170,7 +1128,7 @@ sub char_decode5426 { =head1 AUTHOR -Koha Development Team +Koha Development Team Galen Charlton