Bug 2400 [11/18]: fixing pod syntax in C4/Labels.pm

[koha_fer] / C4 / Charset.pm
diff --git a/C4/Charset.pm b/C4/Charset.pm

index 329c416..a676b7c 100644 (file)
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -32,6 +32,7 @@ BEGIN {
          IsStringUTF8ish
          MarcToUTF8Record
          SetMarcUnicodeFlag
+        StripNonXmlChars
      );
  }
  
@@ -193,6 +194,7 @@ sub MarcToUTF8Record {
          } elsif ($source_encoding =~ /marc-?8/i) {
              @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
          } elsif ($source_encoding =~ /5426/) {
+            @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
          } else {
              # assume any other character encoding is for Text::Iconv
              @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
@@ -234,7 +236,7 @@ sub SetMarcUnicodeFlag {
          substr($leader, 9, 1) = 'a';
          $marc_record->leader($leader); 
      } elsif ($marc_flavour eq "UNIMARC") {
-        if (my $field = $marc_record->fields('100')) {
+        if (my $field = $marc_record->field('100')) {
              my $sfa = $field->subfield('a');
              substr($sfa, 26, 4) = '5050';
              $field->update('a' => $sfa);
@@ -242,7 +244,40 @@ sub SetMarcUnicodeFlag {
      }
  }
  
+=head2 StripNonXmlChars
  
+=over 4
+
+my $new_str = StripNonXmlChars($old_str);
+
+=back
+
+Given a string, return a copy with the
+characters that are illegal in XML 
+removed.
+
+This function exists to work around a problem
+that can occur with badly-encoded MARC records.
+Specifically, if a UTF-8 MARC record also
+has excape (\x1b) characters, MARC::File::XML
+will let the escape characters pass through
+when as_xml() or as_xml_record() is called.  The
+problem is that the escape character is not
+legal in well-formed XML documents, so when
+MARC::File::XML attempts to parse such a record,
+the XML parser will fail.
+
+Stripping such characters will allow a 
+MARC::Record->new_from_xml()
+to work, at the possible risk of some data loss.
+
+=cut
+
+sub StripNonXmlChars {
+    my $str = shift;
+    $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
+    return $str;
+}
  
  =head1 INTERNAL FUNCTIONS
  
@@ -384,6 +419,16 @@ sub _marc_marc8_to_utf8 {
              my @converted_subfields;
              foreach my $subfield ($field->subfields()) {
                  my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
+                unless (IsStringUTF8ish($utf8sf)) {
+                    # Because of a bug in MARC::Charset 0.98, if the string
+                    # has (a) one or more diacritics that (b) are only in character positions
+                    # 128 to 255 inclusive, the resulting converted string is not in
+                    # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1).  If that
+                    # occurs, upgrade the string in place.  Moral of the story seems to be
+                    # that pack("U", ...) is better than chr(...) if you need to guarantee
+                    # that the resulting string is UTF-8.
+                    utf8::upgrade($utf8sf);
+                }
                  push @converted_subfields, $subfield->[0], $utf8sf;
              }