X-Git-Url: http://koha-dev.rot13.org:8081/gitweb/?a=blobdiff_plain;f=C4%2FCharset.pm;h=8b69848d5dd5f8bdd6f25aad5c7b969821ac9ad0;hb=624edf3dba8fda35d0a071bb9dc8755ddbf00d44;hp=a4a06e7bb458368988597c69d174a29d64cc092a;hpb=d1cea14fae6525b7478548aa19108871395e198d;p=koha_gimpoz

diff --git a/C4/Charset.pm b/C4/Charset.pm
index a4a06e7bb4..8b69848d5d 100644
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -39,6 +39,7 @@ BEGIN {
         SetUTF8Flag
         SetMarcUnicodeFlag
         StripNonXmlChars
+        nsb_clean
     );
 }
 
@@ -112,7 +113,7 @@ sub IsStringUTF8ish {
 
 =head2 SetUTF8Flag
 
-  my $marc_record = SetUTF8Flag($marc_record);
+  my $marc_record = SetUTF8Flag($marc_record, $nfd);
 
 This function sets the PERL UTF8 flag for data.
 It is required when using new_from_usmarc 
@@ -120,6 +121,8 @@ since MARC::File::USMARC does not handle PERL UTF8 setting.
 When editing unicode marc records fields and subfields, you
 would end up in double encoding without using this function. 
 
+If $nfd is set, string normalization will use NFD instead of NFC.
+
 FIXME
 In my opinion, this function belongs to MARC::Record and not
 to this package.
@@ -128,13 +131,13 @@ But since it handles charset, and MARC::Record, it finds its way in that package
 =cut
 
 sub SetUTF8Flag{
-	my ($record)=@_;
+	my ($record, $nfd)=@_;
 	return unless ($record && $record->fields());
 	foreach my $field ($record->fields()){
 		if ($field->tag()>=10){
 			my @subfields;
 			foreach my $subfield ($field->subfields()){
-				push @subfields,($$subfield[0],NormalizeString($$subfield[1]));
+				push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
 			}
 			my $newfield=MARC::Field->new(
 							$field->tag(),
@@ -380,6 +383,40 @@ sub StripNonXmlChars {
     return $str;
 }
 
+
+
+=head2 nsb_clean
+
+=over 4
+
+nsb_clean($string);
+
+=back
+
+Removes the Non Sorting Block (NSB/NSE) marker characters from $string and returns the cleaned string.
+
+=cut
+sub nsb_clean {
+    # Strip the MARC non-sorting-block markers from a string.
+    #   $string - text to clean; undef is passed through unchanged.
+    # Returns the cleaned copy; the caller's value is never modified.
+    my ($string) = @_;
+    return $string unless defined $string;
+
+    # NSB/NSE (begin/end of non-sorting block) come in two flavours:
+    # \x88/\x89 and \x98/\x9C.  \xC2 is stripped as well, as the original
+    # code did: it is presumably the UTF-8 lead byte left behind once
+    # the marker byte is gone -- NOTE(review): in a decoded string this
+    # also removes U+00C2; confirm what callers pass in.
+    # Work on the lexical copy instead of assigning to the global $_:
+    # inside a caller's for/map loop $_ aliases the caller's own element,
+    # so writing to it silently modified the caller's data.
+    $string =~ s/[\x88\x89\x98\x9C\xC2]//g;
+
+    return $string;
+}
+
+
 =head1 INTERNAL FUNCTIONS
 
 =head2 _default_marc21_charconv_to_utf8