X-Git-Url: http://koha-dev.rot13.org:8081/gitweb/?a=blobdiff_plain;f=C4%2FCharset.pm;h=8b69848d5dd5f8bdd6f25aad5c7b969821ac9ad0;hb=624edf3dba8fda35d0a071bb9dc8755ddbf00d44;hp=a4a06e7bb458368988597c69d174a29d64cc092a;hpb=d1cea14fae6525b7478548aa19108871395e198d;p=koha_gimpoz diff --git a/C4/Charset.pm b/C4/Charset.pm index a4a06e7bb4..8b69848d5d 100644 --- a/C4/Charset.pm +++ b/C4/Charset.pm @@ -39,6 +39,7 @@ BEGIN { SetUTF8Flag SetMarcUnicodeFlag StripNonXmlChars + nsb_clean ); } @@ -112,7 +113,7 @@ sub IsStringUTF8ish { =head2 SetUTF8Flag - my $marc_record = SetUTF8Flag($marc_record); + my $marc_record = SetUTF8Flag($marc_record, $nfd); This function sets the PERL UTF8 flag for data. It is required when using new_from_usmarc @@ -120,6 +121,8 @@ since MARC::File::USMARC does not handle PERL UTF8 setting. When editing unicode marc records fields and subfields, you would end up in double encoding without using this function. +If $nfd is set, string normalization will use NFD instead of NFC + FIXME In my opinion, this function belongs to MARC::Record and not to this package. @@ -128,13 +131,13 @@ But since it handles charset, and MARC::Record, it finds its way in that package =cut sub SetUTF8Flag{ - my ($record)=@_; + my ($record, $nfd)=@_; return unless ($record && $record->fields()); foreach my $field ($record->fields()){ if ($field->tag()>=10){ my @subfields; foreach my $subfield ($field->subfields()){ - push @subfields,($$subfield[0],NormalizeString($$subfield[1])); + push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd)); } my $newfield=MARC::Field->new( $field->tag(), @@ -380,6 +383,40 @@ sub StripNonXmlChars { return $str; } + + +=head2 nsb_clean + +=over 4 + +nsb_clean($string); + +=back + +Removes Non Sorting Block characters + +=cut +sub nsb_clean { + my $NSB = '\x88' ; # NSB : begin Non Sorting Block + my $NSE = '\x89' ; # NSE : Non Sorting Block end + my $NSB2 = '\x98' ; # NSB : begin Non Sorting Block + my $NSE2 = '\x9C' ; # NSE : Non Sorting Block end + my $C2 = '\xC2' ; # What is this char ? It is sometimes left by the regexp after removing NSB / NSE + + # handles non sorting blocks + my ($string) = @_ ; + $_ = $string ; + s/$NSB//g ; + s/$NSE//g ; + s/$NSB2//g ; + s/$NSE2//g ; + s/$C2//g ; + $string = $_ ; + + return($string) ; +} + + =head1 INTERNAL FUNCTIONS =head2 _default_marc21_charconv_to_utf8