Bug 14544: Get rid of GetSomeShelfNames

[koha-ffzg.git] / C4 / Charset.pm
diff --git a/C4/Charset.pm b/C4/Charset.pm

index 8b69848..667c2d2 100644 (file)
--- a/C4/Charset.pm
+++ b/C4/Charset.pm
@@ -4,18 +4,18 @@ package C4::Charset;
  #
  # This file is part of Koha.
  #
-# Koha is free software; you can redistribute it and/or modify it under the
-# terms of the GNU General Public License as published by the Free Software
-# Foundation; either version 2 of the License, or (at your option) any later
-# version.
+# Koha is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
  #
-# Koha is distributed in the hope that it will be useful, but WITHOUT ANY
-# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
-# A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+# Koha is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
  #
-# You should have received a copy of the GNU General Public License along
-# with Koha; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+# You should have received a copy of the GNU General Public License
+# along with Koha; if not, see <http://www.gnu.org/licenses>.
  
  use strict;
  use warnings;
@@ -24,12 +24,13 @@ use MARC::Charset qw/marc8_to_utf8/;
  use Text::Iconv;
  use C4::Debug;
  use Unicode::Normalize;
+use Encode qw( decode encode is_utf8 );
  
  use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  
  BEGIN {
      # set the version for version checking
-    $VERSION = 3.01;
+    $VERSION = 3.07.00.049;
      require Exporter;
      @ISA    = qw(Exporter);
      @EXPORT = qw(
@@ -40,9 +41,12 @@ BEGIN {
          SetMarcUnicodeFlag
          StripNonXmlChars
          nsb_clean
+        SanitizeRecord
      );
  }
  
+=encoding UTF-8
+
  =head1 NAME
  
  C4::Charset - utilities for handling character set conversions.
@@ -107,8 +111,8 @@ will assume that this situation occur does not very often.
  sub IsStringUTF8ish {
      my $str = shift;
  
-    return 1 if utf8::is_utf8($str);
-    return utf8::decode($str);
+    return 1 if Encode::is_utf8($str);
+    return utf8::decode( $str );
  }
  
  =head2 SetUTF8Flag
@@ -131,23 +135,26 @@ But since it handles charset, and MARC::Record, it finds its way in that package
  =cut
  
  sub SetUTF8Flag{
-       my ($record, $nfd)=@_;
-       return unless ($record && $record->fields());
-       foreach my $field ($record->fields()){
-               if ($field->tag()>=10){
-                       my @subfields;
-                       foreach my $subfield ($field->subfields()){
-                               push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
-                       }
-                       my $newfield=MARC::Field->new(
-                                                       $field->tag(),
-                                                       $field->indicator(1),
-                                                       $field->indicator(2),
-                                                       @subfields
-                                               );
-                       $field->replace_with($newfield);
-               }
-       }
+    my ($record, $nfd)=@_;
+    return unless ($record && $record->fields());
+    foreach my $field ($record->fields()){
+        if ($field->tag()>=10){
+            my @subfields;
+            foreach my $subfield ($field->subfields()){
+                push @subfields,($$subfield[0],NormalizeString($$subfield[1],$nfd));
+            }
+            eval {
+                my $newfield=MARC::Field->new(
+                            $field->tag(),
+                            $field->indicator(1),
+                            $field->indicator(2),
+                            @subfields
+                        );
+                $field->replace_with($newfield);
+            };
+            warn "ERROR occurred in SetUTF8Flag $@" if $@;
+        }
+    }
  }
  
  =head2 NormalizeString
@@ -172,7 +179,8 @@ Sample code :
  
  sub NormalizeString{
         my ($string,$nfd,$transform)=@_;
-       utf8::decode($string) unless (utf8::is_utf8($string));
+    return $string unless defined($string); # force scalar context return.
+    $string = Encode::decode('UTF-8', $string) unless (Encode::is_utf8($string));
         if ($nfd){
                 $string= NFD($string);
         }
@@ -324,8 +332,11 @@ sub SetMarcUnicodeFlag {
          substr($leader, 9, 1) = 'a';
          $marc_record->leader($leader); 
      } elsif ($marc_flavour =~/UNIMARC/) {
+        require C4::Context;
+       my $defaultlanguage = C4::Context->preference("UNIMARCField100Language");
+        $defaultlanguage = "fre" if (!$defaultlanguage || length($defaultlanguage) != 3);
          my $string; 
-               my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,9):(36,22));
+               my ($subflength,$encodingposition)=($marc_flavour=~/AUTH/?(21,12):(36,25));
                 $string=$marc_record->subfield( 100, "a" );
          if (defined $string && length($string)==$subflength) { 
                         $string = substr $string, 0,$subflength if (length($string)>$subflength);
@@ -333,9 +344,10 @@ sub SetMarcUnicodeFlag {
          else { 
              $string = POSIX::strftime( "%Y%m%d", localtime ); 
              $string =~ s/\-//g; 
-            $string = sprintf( "%-*s", $subflength, $string ); 
+            $string = sprintf( "%-*s", $subflength, $string );
+           substr ( $string, ($encodingposition - 3), 3, $defaultlanguage);
          } 
-        substr( $string, $encodingposition, 8, "frey50  " ); 
+        substr( $string, $encodingposition, 3, "y50" );
          if ( $marc_record->subfield( 100, "a" ) ) { 
                         $marc_record->field('100')->update(a=>$string);
                 }
@@ -343,7 +355,7 @@ sub SetMarcUnicodeFlag {
              $marc_record->insert_grouped_field( 
                  MARC::Field->new( 100, '', '', "a" => $string ) ); 
          }
-               $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 8 );
+               $debug && warn "encodage: ", substr( $marc_record->subfield(100, 'a'), $encodingposition, 3 );
      } else {
          warn "Unrecognized marcflavour: $marc_flavour";
      }
@@ -406,17 +418,77 @@ sub nsb_clean {
      # handles non sorting blocks
      my ($string) = @_ ;
      $_ = $string ;
-    s/$NSB//g ;
-    s/$NSE//g ;
-    s/$NSB2//g ;
-    s/$NSE2//g ;
-    s/$C2//g ;
+    s/($C2){0,1}($NSB|$NSB2)//g ;
+    s/($C2){0,1}($NSE|$NSE2)//g ;
      $string = $_ ;
  
      return($string) ;
  }
  
  
+=head2 SanitizeRecord
+
+SanitizeRecord($marcrecord);
+
+Sanitize a record
+This routine is called in the maintenance script misc/maintenance/sanitize_records.pl.
+It replaces every '&amp;' sequence (including repeated forms such as '&amp;amp;') with a single '&'.
+
+=cut
+
+sub SanitizeRecord {
+    my ( $record, $biblionumber ) = @_;
+    my $string;
+    my $record_modified = 0;
+    my $frameworkcode   = C4::Biblio::GetFrameworkCode($biblionumber);
+    my ( $url_field, $url_subfield ) =
+      C4::Biblio::GetMarcFromKohaField( 'biblioitems.url', $frameworkcode );
+    foreach my $field ( $record->fields() ) {
+        if ( $field->is_control_field() ) {
+            my $value           = $field->data();
+            my $sanitized_value = _clean_ampersand($value);
+            $record_modified = 1 if $sanitized_value ne $value;
+            $field->update($sanitized_value);
+        }
+        else {
+            my @subfields = $field->subfields();
+            my @new_subfields;
+            foreach my $subfield (@subfields) {
+                next
+                  if $url_field eq $field->tag()
+                      and $url_subfield eq $subfield->[0];
+                my $value           = $subfield->[1];
+                my $sanitized_value = _clean_ampersand($value);
+                push @new_subfields, $subfield->[0] => $sanitized_value;
+                $record_modified = 1 if $sanitized_value ne $value;
+            }
+            if ( scalar(@new_subfields) > 0 ) {
+                my $new_field = eval {
+                    MARC::Field->new(
+                        $field->tag(),        $field->indicator(1),
+                        $field->indicator(2), @new_subfields
+                    );
+                };
+                if ($@) {
+                    warn "error : $@";
+                }
+                else {
+                    $field->replace_with($new_field);
+                }
+
+            }
+        }
+    }
+
+    return $record, $record_modified;
+}
+
+sub _clean_ampersand {
+    my ($string) = @_;
+    $string =~ s/(&)(amp;)+/$1/g;
+    return $string;
+}
+
  =head1 INTERNAL FUNCTIONS
  
  =head2 _default_marc21_charconv_to_utf8
@@ -553,7 +625,7 @@ sub _marc_marc8_to_utf8 {
                      # occurs, upgrade the string in place.  Moral of the story seems to be
                      # that pack("U", ...) is better than chr(...) if you need to guarantee
                      # that the resulting string is UTF-8.
-                    utf8::upgrade($utf8sf);
+                    $utf8sf = Encode::encode('UTF-8', $utf8sf);
                  }
                  push @converted_subfields, $subfield->[0], $utf8sf;
              }