3 # Copyright (C) 2008 LibLime
5 # This file is part of Koha.
7 # Koha is free software; you can redistribute it and/or modify it under the
8 # terms of the GNU General Public License as published by the Free Software
9 # Foundation; either version 2 of the License, or (at your option) any later
12 # Koha is distributed in the hope that it will be useful, but WITHOUT ANY
13 # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
14 # A PARTICULAR PURPOSE. See the GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along with
17 # Koha; if not, write to the Free Software Foundation, Inc., 59 Temple Place,
18 # Suite 330, Boston, MA 02111-1307 USA
21 use MARC::Charset qw/marc8_to_utf8/;
24 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
27 # set the version for version checking
41 C4::Charset - utilities for handling character set conversions.
49 This module contains routines for dealing with character set
50 conversions, particularly for MARC records.
52 A variety of character encodings are in use by various MARC
53 standards, and even more character encodings are used by
54 non-standard MARC records. The various MARC formats generally
55 do not do a good job of advertising a given record's character
56 encoding, and even when a record does advertise its encoding,
57 e.g., via the Leader/09, experience has shown that one cannot
60 Ultimately, all MARC records are stored in Koha in UTF-8 and
61 must be converted from whatever the source character encoding is.
62 The goal of this module is to ensure that these conversions
63 take place accurately. When a character conversion cannot take
64 place, or at least not accurately, the module was provide
65 enough information to allow user-facing code to inform the user
66 on how to deal with the situation.
72 =head2 IsStringUTF8ish
76 my $is_utf8 = IsStringUTF8ish($str);
80 Determines if C<$str> is valid UTF-8. This can mean
87 The Perl UTF-8 flag is set and the string contains valid UTF-8.
91 The Perl UTF-8 flag is B<not> set, but the octets contain
96 The function is named C<IsStringUTF8ish> instead of C<IsStringUTF8>
97 because in one could be presented with a MARC blob that is
98 not actually in UTF-8 but whose sequence of octets appears to be
99 valid UTF-8. The rest of the MARC character conversion functions
100 will assume that this situation occur does not very often.
104 sub IsStringUTF8ish {
107 return 1 if utf8::is_utf8($str);
108 return utf8::decode($str);
111 =head2 MarcToUTF8Record
115 ($marc_record, $converted_from, $errors_arrayref) = MarcToUTF8Record($marc_blob, $marc_flavour, [, $source_encoding]);
119 Given a MARC blob or a C<MARC::Record>, the MARC flavour, and an
120 optional source encoding, return a C<MARC::Record> that is
123 The returned C<$marc_record> is guaranteed to be in valid UTF-8, but
124 is not guaranteed to have been converted correctly. Specifically,
125 if C<$converted_from> is 'failed', the MARC record returned failed
126 character conversion and had each of its non-ASCII octets changed
127 to the Unicode replacement character.
129 If the source encoding was not specified, this routine will
130 try to guess it; the character encoding used for a successful
131 conversion is returned in C<$converted_from>.
135 sub MarcToUTF8Record {
137 my $marc_flavour = shift;
138 my $source_encoding = shift;
141 my $marc_blob_is_utf8 = 0;
142 if (ref($marc) eq 'MARC::Record') {
143 my $marc_blob = $marc->as_usmarc();
144 $marc_blob_is_utf8 = IsStringUTF8ish($marc_blob);
145 $marc_record = $marc;
147 # dealing with a MARC blob
149 # remove any ersatz whitespace from the beginning and
150 # end of the MARC blob -- these can creep into MARC
151 # files produced by several sources -- caller really
152 # should be doing this, however
155 $marc_blob_is_utf8 = IsStringUTF8ish($marc);
156 $marc_record = MARC::Record->new_from_usmarc($marc);
159 # If we do not know the source encoding, try some guesses
161 # 1. Record is UTF-8 already.
162 # 2. If MARC flavor is MARC21, then
163 # a. record is MARC-8
164 # b. record is ISO-8859-1
165 # 3. If MARC flavor is UNIMARC, then
166 if (not defined $source_encoding) {
167 if ($marc_blob_is_utf8) {
168 # note that for MARC21 we are not bothering to check
169 # if the Leader/09 is set to 'a' or not -- because
170 # of problems with various ILSs (including Koha in the
171 # past, alas), this just is not trustworthy.
172 SetMarcUnicodeFlag($marc_record, $marc_flavour);
173 return $marc_record, 'UTF-8', [];
175 if ($marc_flavour eq 'MARC21') {
176 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
177 } elsif ($marc_flavour eq 'UNIMARC') {
178 return _default_unimarc_charconv_to_utf8($marc_record, $marc_flavour);
180 return _default_marc21_charconv_to_utf8($marc_record, $marc_flavour);
184 # caller knows the character encoding
185 my $original_marc_record = $marc_record->clone();
187 if ($source_encoding =~ /utf-?8/i) {
188 if ($marc_blob_is_utf8) {
189 SetMarcUnicodeFlag($marc_record, $marc_flavour);
190 return $marc_record, 'UTF-8', [];
192 push @errors, 'specified UTF-8 => UTF-8 conversion, but record is not in UTF-8';
194 } elsif ($source_encoding =~ /marc-?8/i) {
195 @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour);
196 } elsif ($source_encoding =~ /5426/) {
197 @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour);
199 # assume any other character encoding is for Text::Iconv
200 @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, 'iso-8859-1');
204 _marc_to_utf8_replacement_char($original_marc_record, $marc_flavour);
205 return $original_marc_record, 'failed', \@errors;
207 return $marc_record, $source_encoding, [];
213 =head2 SetMarcUnicodeFlag
217 SetMarcUnicodeFlag($marc_record, $marc_flavour);
221 Set both the internal MARC::Record encoding flag
222 and the appropriate Leader/09 (MARC21) or
223 100/26-29 (UNIMARC) to indicate that the record
224 is in UTF-8. Note that this does B<not> do
225 any actual character conversion.
229 sub SetMarcUnicodeFlag {
230 my $marc_record = shift;
231 my $marc_flavour = shift; # || C4::Context->preference("marcflavour");
233 $marc_record->encoding('UTF-8');
234 if ($marc_flavour eq 'MARC21') {
235 my $leader = $marc_record->leader();
236 substr($leader, 9, 1) = 'a';
237 $marc_record->leader($leader);
238 } elsif ($marc_flavour eq "UNIMARC") {
239 if (my $field = $marc_record->field('100')) {
240 my $sfa = $field->subfield('a');
243 # fix the length of the field
244 $sfa = substr $sfa, 0, $subflength if (length($sfa) > $subflength);
245 $sfa = sprintf( "%-*s", 35, $sfa ) if (length($sfa) < $subflength);
247 substr($sfa, 26, 4) = '50 ';
248 $field->update('a' => $sfa);
251 warn "Unrecognized marcflavour: $marc_flavour";
255 =head2 StripNonXmlChars
259 my $new_str = StripNonXmlChars($old_str);
263 Given a string, return a copy with the
264 characters that are illegal in XML
267 This function exists to work around a problem
268 that can occur with badly-encoded MARC records.
269 Specifically, if a UTF-8 MARC record also
270 has excape (\x1b) characters, MARC::File::XML
271 will let the escape characters pass through
272 when as_xml() or as_xml_record() is called. The
273 problem is that the escape character is not
274 legal in well-formed XML documents, so when
275 MARC::File::XML attempts to parse such a record,
276 the XML parser will fail.
278 Stripping such characters will allow a
279 MARC::Record->new_from_xml()
280 to work, at the possible risk of some data loss.
284 sub StripNonXmlChars {
286 $str =~ s/[^\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]//g;
290 =head1 INTERNAL FUNCTIONS
292 =head2 _default_marc21_charconv_to_utf8
296 my ($new_marc_record, $guessed_charset) = _default_marc21_charconv_to_utf8($marc_record);
300 Converts a C<MARC::Record> of unknown character set to UTF-8,
301 first by trying a MARC-8 to UTF-8 conversion, then ISO-8859-1
302 to UTF-8, then a default conversion that replaces each non-ASCII
303 character with the replacement character.
305 The C<$guessed_charset> return value contains the character set
306 that resulted in a conversion to valid UTF-8; note that
307 if the MARC-8 and ISO-8859-1 conversions failed, the value of
312 sub _default_marc21_charconv_to_utf8 {
313 my $marc_record = shift;
314 my $marc_flavour = shift;
316 my $trial_marc8 = $marc_record->clone();
318 my @errors = _marc_marc8_to_utf8($trial_marc8, $marc_flavour);
320 return $trial_marc8, 'MARC-8', [];
322 push @all_errors, @errors;
324 my $trial_8859_1 = $marc_record->clone();
325 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
327 return $trial_8859_1, 'iso-8859-1', []; # note -- we could return \@all_errors
328 # instead if we wanted to report details
329 # of the failed attempt at MARC-8 => UTF-8
331 push @all_errors, @errors;
333 my $default_converted = $marc_record->clone();
334 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
335 return $default_converted, 'failed', \@all_errors;
338 =head2 _default_unimarc_charconv_to_utf8
342 my ($new_marc_record, $guessed_charset) = _default_unimarc_charconv_to_utf8($marc_record);
346 Converts a C<MARC::Record> of unknown character set to UTF-8,
347 first by trying a ISO-5426 to UTF-8 conversion, then ISO-8859-1
348 to UTF-8, then a default conversion that replaces each non-ASCII
349 character with the replacement character.
351 The C<$guessed_charset> return value contains the character set
352 that resulted in a conversion to valid UTF-8; note that
353 if the MARC-8 and ISO-8859-1 conversions failed, the value of
358 sub _default_unimarc_charconv_to_utf8 {
359 my $marc_record = shift;
360 my $marc_flavour = shift;
362 my $trial_marc8 = $marc_record->clone();
364 my @errors = _marc_iso5426_to_utf8($trial_marc8, $marc_flavour);
366 return $trial_marc8, 'iso-5426';
368 push @all_errors, @errors;
370 my $trial_8859_1 = $marc_record->clone();
371 @errors = _marc_to_utf8_via_text_iconv($trial_8859_1, $marc_flavour, 'iso-8859-1');
373 return $trial_8859_1, 'iso-8859-1';
375 push @all_errors, @errors;
377 my $default_converted = $marc_record->clone();
378 _marc_to_utf8_replacement_char($default_converted, $marc_flavour);
379 return $default_converted, 'failed', \@all_errors;
382 =head2 _marc_marc8_to_utf8
386 my @errors = _marc_marc8_to_utf8($marc_record, $marc_flavour, $source_encoding);
390 Convert a C<MARC::Record> to UTF-8 in-place from MARC-8.
391 If the conversion fails for some reason, an
392 appropriate messages will be placed in the returned
397 sub _marc_marc8_to_utf8 {
398 my $marc_record = shift;
399 my $marc_flavour = shift;
401 my $prev_ignore = MARC::Charset->ignore_errors();
402 MARC::Charset->ignore_errors(1);
404 # trap warnings raised by MARC::Charset
406 local $SIG{__WARN__} = sub {
408 if ($msg =~ /MARC.Charset/) {
409 # FIXME - purpose of this regexp is to strip out the
410 # line reference to MARC/Charset.pm, but as it
411 # exists probably won't work quite on Windows --
412 # some sort of minimal-bunch back-tracking RE
413 # would be helpful here
414 $msg =~ s/at [\/].*?.MARC.Charset\.pm line \d+\.\n$//;
417 # if warning doesn't come from MARC::Charset, just
423 foreach my $field ($marc_record->fields()) {
424 if ($field->is_control_field()) {
425 ; # do nothing -- control fields should not contain non-ASCII characters
427 my @converted_subfields;
428 foreach my $subfield ($field->subfields()) {
429 my $utf8sf = MARC::Charset::marc8_to_utf8($subfield->[1]);
430 unless (IsStringUTF8ish($utf8sf)) {
431 # Because of a bug in MARC::Charset 0.98, if the string
432 # has (a) one or more diacritics that (b) are only in character positions
433 # 128 to 255 inclusive, the resulting converted string is not in
434 # UTF-8, but the legacy 8-bit encoding (e.g., ISO-8859-1). If that
435 # occurs, upgrade the string in place. Moral of the story seems to be
436 # that pack("U", ...) is better than chr(...) if you need to guarantee
437 # that the resulting string is UTF-8.
438 utf8::upgrade($utf8sf);
440 push @converted_subfields, $subfield->[0], $utf8sf;
443 $field->replace_with(MARC::Field->new(
444 $field->tag(), $field->indicator(1), $field->indicator(2),
445 @converted_subfields)
450 MARC::Charset->ignore_errors($prev_ignore);
452 SetMarcUnicodeFlag($marc_record, $marc_flavour);
457 =head2 _marc_iso5426_to_utf8
461 my @errors = _marc_iso5426_to_utf8($marc_record, $marc_flavour, $source_encoding);
465 Convert a C<MARC::Record> to UTF-8 in-place from ISO-5426.
466 If the conversion fails for some reason, an
467 appropriate messages will be placed in the returned
470 FIXME - is ISO-5426 equivalent enough to MARC-8
471 that C<MARC::Charset> can be used instead?
475 sub _marc_iso5426_to_utf8 {
476 my $marc_record = shift;
477 my $marc_flavour = shift;
481 foreach my $field ($marc_record->fields()) {
482 if ($field->is_control_field()) {
483 ; # do nothing -- control fields should not contain non-ASCII characters
485 my @converted_subfields;
486 foreach my $subfield ($field->subfields()) {
487 my $utf8sf = char_decode5426($subfield->[1]);
488 push @converted_subfields, $subfield->[0], $utf8sf;
491 $field->replace_with(MARC::Field->new(
492 $field->tag(), $field->indicator(1), $field->indicator(2),
493 @converted_subfields)
498 SetMarcUnicodeFlag($marc_record, $marc_flavour);
503 =head2 _marc_to_utf8_via_text_iconv
507 my @errors = _marc_to_utf8_via_text_iconv($marc_record, $marc_flavour, $source_encoding);
511 Convert a C<MARC::Record> to UTF-8 in-place using the
512 C<Text::Iconv> CPAN module. Any source encoding accepted
513 by the user's iconv installation should work. If
514 the source encoding is not recognized on the user's
515 server or the conversion fails for some reason,
516 appropriate messages will be placed in the returned
521 sub _marc_to_utf8_via_text_iconv {
522 my $marc_record = shift;
523 my $marc_flavour = shift;
524 my $source_encoding = shift;
528 eval { $decoder = Text::Iconv->new($source_encoding, 'utf8'); };
530 push @errors, "Could not initialze $source_encoding => utf8 converter: $@";
534 my $prev_raise_error = Text::Iconv->raise_error();
535 Text::Iconv->raise_error(1);
537 foreach my $field ($marc_record->fields()) {
538 if ($field->is_control_field()) {
539 ; # do nothing -- control fields should not contain non-ASCII characters
541 my @converted_subfields;
542 foreach my $subfield ($field->subfields()) {
544 my $conversion_ok = 1;
545 eval { $converted_value = $decoder->convert($subfield->[1]); };
549 } elsif (not defined $converted_value) {
551 push @errors, "Text::Iconv conversion failed - retval is " . $decoder->retval();
554 if ($conversion_ok) {
555 push @converted_subfields, $subfield->[0], $converted_value;
557 $converted_value = $subfield->[1];
558 $converted_value =~ s/[\200-\377]/\xef\xbf\xbd/g;
559 push @converted_subfields, $subfield->[0], $converted_value;
563 $field->replace_with(MARC::Field->new(
564 $field->tag(), $field->indicator(1), $field->indicator(2),
565 @converted_subfields)
570 SetMarcUnicodeFlag($marc_record, $marc_flavour);
571 Text::Iconv->raise_error($prev_raise_error);
576 =head2 _marc_to_utf8_replacement_char
580 _marc_to_utf8_replacement_char($marc_record, $marc_flavour);
584 Convert a C<MARC::Record> to UTF-8 in-place, adopting the
585 unsatisfactory method of replacing all non-ASCII (e.g.,
586 where the eight bit is set) octet with the Unicode
587 replacement character. This is meant as a last-ditch
588 method, and would be best used as part of a UI that
589 lets a cataloguer pick various character conversions
590 until he or she finds the right one.
594 sub _marc_to_utf8_replacement_char {
595 my $marc_record = shift;
596 my $marc_flavour = shift;
598 foreach my $field ($marc_record->fields()) {
599 if ($field->is_control_field()) {
600 ; # do nothing -- control fields should not contain non-ASCII characters
602 my @converted_subfields;
603 foreach my $subfield ($field->subfields()) {
604 my $value = $subfield->[1];
605 $value =~ s/[\200-\377]/\xef\xbf\xbd/g;
606 push @converted_subfields, $subfield->[0], $value;
609 $field->replace_with(MARC::Field->new(
610 $field->tag(), $field->indicator(1), $field->indicator(2),
611 @converted_subfields)
616 SetMarcUnicodeFlag($marc_record, $marc_flavour);
619 =head2 char_decode5426
623 my $utf8string = char_decode5426($iso_5426_string);
627 Converts a string from ISO-5426 to UTF-8.
633 $chars{0xb0}=0x0101;#3/0ayn[ain]
634 $chars{0xb1}=0x0623;#3/1alif/hamzah[alefwithhamzaabove]
635 #$chars{0xb2}=0x00e0;#'Ã ';
636 $chars{0xb2}=0x00e0;#3/2leftlowsinglequotationmark
637 #$chars{0xb3}=0x00e7;#'ç';
638 $chars{0xb3}=0x00e7;#3/2leftlowsinglequotationmark
643 $chars{0x97}=0x003c;#3/2leftlowsinglequotationmark
644 $chars{0x98}=0x003e;#3/2leftlowsinglequotationmark
645 $chars{0xfa}=0x0153;#oe
646 $chars{0x81d1}=0x00b0;
649 ## combined characters iso5426
651 $chars{0xc041}=0x1ea2; # capital a with hook above
652 $chars{0xc045}=0x1eba; # capital e with hook above
653 $chars{0xc049}=0x1ec8; # capital i with hook above
654 $chars{0xc04f}=0x1ece; # capital o with hook above
655 $chars{0xc055}=0x1ee6; # capital u with hook above
656 $chars{0xc059}=0x1ef6; # capital y with hook above
657 $chars{0xc061}=0x1ea3; # small a with hook above
658 $chars{0xc065}=0x1ebb; # small e with hook above
659 $chars{0xc069}=0x1ec9; # small i with hook above
660 $chars{0xc06f}=0x1ecf; # small o with hook above
661 $chars{0xc075}=0x1ee7; # small u with hook above
662 $chars{0xc079}=0x1ef7; # small y with hook above
665 $chars{0xc141}=0x00c0; # capital a with grave accent
666 $chars{0xc145}=0x00c8; # capital e with grave accent
667 $chars{0xc149}=0x00cc; # capital i with grave accent
668 $chars{0xc14f}=0x00d2; # capital o with grave accent
669 $chars{0xc155}=0x00d9; # capital u with grave accent
670 $chars{0xc157}=0x1e80; # capital w with grave
671 $chars{0xc159}=0x1ef2; # capital y with grave
672 $chars{0xc161}=0x00e0; # small a with grave accent
673 $chars{0xc165}=0x00e8; # small e with grave accent
674 $chars{0xc169}=0x00ec; # small i with grave accent
675 $chars{0xc16f}=0x00f2; # small o with grave accent
676 $chars{0xc175}=0x00f9; # small u with grave accent
677 $chars{0xc177}=0x1e81; # small w with grave
678 $chars{0xc179}=0x1ef3; # small y with grave
680 $chars{0xc241}=0x00c1; # capital a with acute accent
681 $chars{0xc243}=0x0106; # capital c with acute accent
682 $chars{0xc245}=0x00c9; # capital e with acute accent
683 $chars{0xc247}=0x01f4; # capital g with acute
684 $chars{0xc249}=0x00cd; # capital i with acute accent
685 $chars{0xc24b}=0x1e30; # capital k with acute
686 $chars{0xc24c}=0x0139; # capital l with acute accent
687 $chars{0xc24d}=0x1e3e; # capital m with acute
688 $chars{0xc24e}=0x0143; # capital n with acute accent
689 $chars{0xc24f}=0x00d3; # capital o with acute accent
690 $chars{0xc250}=0x1e54; # capital p with acute
691 $chars{0xc252}=0x0154; # capital r with acute accent
692 $chars{0xc253}=0x015a; # capital s with acute accent
693 $chars{0xc255}=0x00da; # capital u with acute accent
694 $chars{0xc257}=0x1e82; # capital w with acute
695 $chars{0xc259}=0x00dd; # capital y with acute accent
696 $chars{0xc25a}=0x0179; # capital z with acute accent
697 $chars{0xc261}=0x00e1; # small a with acute accent
698 $chars{0xc263}=0x0107; # small c with acute accent
699 $chars{0xc265}=0x00e9; # small e with acute accent
700 $chars{0xc267}=0x01f5; # small g with acute
701 $chars{0xc269}=0x00ed; # small i with acute accent
702 $chars{0xc26b}=0x1e31; # small k with acute
703 $chars{0xc26c}=0x013a; # small l with acute accent
704 $chars{0xc26d}=0x1e3f; # small m with acute
705 $chars{0xc26e}=0x0144; # small n with acute accent
706 $chars{0xc26f}=0x00f3; # small o with acute accent
707 $chars{0xc270}=0x1e55; # small p with acute
708 $chars{0xc272}=0x0155; # small r with acute accent
709 $chars{0xc273}=0x015b; # small s with acute accent
710 $chars{0xc275}=0x00fa; # small u with acute accent
711 $chars{0xc277}=0x1e83; # small w with acute
712 $chars{0xc279}=0x00fd; # small y with acute accent
713 $chars{0xc27a}=0x017a; # small z with acute accent
714 $chars{0xc2e1}=0x01fc; # capital ae with acute
715 $chars{0xc2f1}=0x01fd; # small ae with acute
716 # 4/3 circumflex accent
717 $chars{0xc341}=0x00c2; # capital a with circumflex accent
718 $chars{0xc343}=0x0108; # capital c with circumflex
719 $chars{0xc345}=0x00ca; # capital e with circumflex accent
720 $chars{0xc347}=0x011c; # capital g with circumflex
721 $chars{0xc348}=0x0124; # capital h with circumflex
722 $chars{0xc349}=0x00ce; # capital i with circumflex accent
723 $chars{0xc34a}=0x0134; # capital j with circumflex
724 $chars{0xc34f}=0x00d4; # capital o with circumflex accent
725 $chars{0xc353}=0x015c; # capital s with circumflex
726 $chars{0xc355}=0x00db; # capital u with circumflex
727 $chars{0xc357}=0x0174; # capital w with circumflex
728 $chars{0xc359}=0x0176; # capital y with circumflex
729 $chars{0xc35a}=0x1e90; # capital z with circumflex
730 $chars{0xc361}=0x00e2; # small a with circumflex accent
731 $chars{0xc363}=0x0109; # small c with circumflex
732 $chars{0xc365}=0x00ea; # small e with circumflex accent
733 $chars{0xc367}=0x011d; # small g with circumflex
734 $chars{0xc368}=0x0125; # small h with circumflex
735 $chars{0xc369}=0x00ee; # small i with circumflex accent
736 $chars{0xc36a}=0x0135; # small j with circumflex
737 $chars{0xc36e}=0x00f1; # small n with tilde
738 $chars{0xc36f}=0x00f4; # small o with circumflex accent
739 $chars{0xc373}=0x015d; # small s with circumflex
740 $chars{0xc375}=0x00fb; # small u with circumflex
741 $chars{0xc377}=0x0175; # small w with circumflex
742 $chars{0xc379}=0x0177; # small y with circumflex
743 $chars{0xc37a}=0x1e91; # small z with circumflex
745 $chars{0xc441}=0x00c3; # capital a with tilde
746 $chars{0xc445}=0x1ebc; # capital e with tilde
747 $chars{0xc449}=0x0128; # capital i with tilde
748 $chars{0xc44e}=0x00d1; # capital n with tilde
749 $chars{0xc44f}=0x00d5; # capital o with tilde
750 $chars{0xc455}=0x0168; # capital u with tilde
751 $chars{0xc456}=0x1e7c; # capital v with tilde
752 $chars{0xc459}=0x1ef8; # capital y with tilde
753 $chars{0xc461}=0x00e3; # small a with tilde
754 $chars{0xc465}=0x1ebd; # small e with tilde
755 $chars{0xc469}=0x0129; # small i with tilde
756 $chars{0xc46e}=0x00f1; # small n with tilde
757 $chars{0xc46f}=0x00f5; # small o with tilde
758 $chars{0xc475}=0x0169; # small u with tilde
759 $chars{0xc476}=0x1e7d; # small v with tilde
760 $chars{0xc479}=0x1ef9; # small y with tilde
762 $chars{0xc541}=0x0100; # capital a with macron
763 $chars{0xc545}=0x0112; # capital e with macron
764 $chars{0xc547}=0x1e20; # capital g with macron
765 $chars{0xc549}=0x012a; # capital i with macron
766 $chars{0xc54f}=0x014c; # capital o with macron
767 $chars{0xc555}=0x016a; # capital u with macron
768 $chars{0xc561}=0x0101; # small a with macron
769 $chars{0xc565}=0x0113; # small e with macron
770 $chars{0xc567}=0x1e21; # small g with macron
771 $chars{0xc569}=0x012b; # small i with macron
772 $chars{0xc56f}=0x014d; # small o with macron
773 $chars{0xc575}=0x016b; # small u with macron
774 $chars{0xc572}=0x0159; # small r with macron
775 $chars{0xc5e1}=0x01e2; # capital ae with macron
776 $chars{0xc5f1}=0x01e3; # small ae with macron
778 $chars{0xc641}=0x0102; # capital a with breve
779 $chars{0xc645}=0x0114; # capital e with breve
780 $chars{0xc647}=0x011e; # capital g with breve
781 $chars{0xc649}=0x012c; # capital i with breve
782 $chars{0xc64f}=0x014e; # capital o with breve
783 $chars{0xc655}=0x016c; # capital u with breve
784 $chars{0xc661}=0x0103; # small a with breve
785 $chars{0xc665}=0x0115; # small e with breve
786 $chars{0xc667}=0x011f; # small g with breve
787 $chars{0xc669}=0x012d; # small i with breve
788 $chars{0xc66f}=0x014f; # small o with breve
789 $chars{0xc675}=0x016d; # small u with breve
791 $chars{0xc7b0}=0x01e1; # Ain with dot above
792 $chars{0xc742}=0x1e02; # capital b with dot above
793 $chars{0xc743}=0x010a; # capital c with dot above
794 $chars{0xc744}=0x1e0a; # capital d with dot above
795 $chars{0xc745}=0x0116; # capital e with dot above
796 $chars{0xc746}=0x1e1e; # capital f with dot above
797 $chars{0xc747}=0x0120; # capital g with dot above
798 $chars{0xc748}=0x1e22; # capital h with dot above
799 $chars{0xc749}=0x0130; # capital i with dot above
800 $chars{0xc74d}=0x1e40; # capital m with dot above
801 $chars{0xc74e}=0x1e44; # capital n with dot above
802 $chars{0xc750}=0x1e56; # capital p with dot above
803 $chars{0xc752}=0x1e58; # capital r with dot above
804 $chars{0xc753}=0x1e60; # capital s with dot above
805 $chars{0xc754}=0x1e6a; # capital t with dot above
806 $chars{0xc757}=0x1e86; # capital w with dot above
807 $chars{0xc758}=0x1e8a; # capital x with dot above
808 $chars{0xc759}=0x1e8e; # capital y with dot above
809 $chars{0xc75a}=0x017b; # capital z with dot above
810 $chars{0xc761}=0x0227; # small b with dot above
811 $chars{0xc762}=0x1e03; # small b with dot above
812 $chars{0xc763}=0x010b; # small c with dot above
813 $chars{0xc764}=0x1e0b; # small d with dot above
814 $chars{0xc765}=0x0117; # small e with dot above
815 $chars{0xc766}=0x1e1f; # small f with dot above
816 $chars{0xc767}=0x0121; # small g with dot above
817 $chars{0xc768}=0x1e23; # small h with dot above
818 $chars{0xc76d}=0x1e41; # small m with dot above
819 $chars{0xc76e}=0x1e45; # small n with dot above
820 $chars{0xc770}=0x1e57; # small p with dot above
821 $chars{0xc772}=0x1e59; # small r with dot above
822 $chars{0xc773}=0x1e61; # small s with dot above
823 $chars{0xc774}=0x1e6b; # small t with dot above
824 $chars{0xc777}=0x1e87; # small w with dot above
825 $chars{0xc778}=0x1e8b; # small x with dot above
826 $chars{0xc779}=0x1e8f; # small y with dot above
827 $chars{0xc77a}=0x017c; # small z with dot above
828 # 4/8 trema, diaresis
829 $chars{0xc820}=0x00a8; # diaeresis
830 $chars{0xc841}=0x00c4; # capital a with diaeresis
831 $chars{0xc845}=0x00cb; # capital e with diaeresis
832 $chars{0xc848}=0x1e26; # capital h with diaeresis
833 $chars{0xc849}=0x00cf; # capital i with diaeresis
834 $chars{0xc84f}=0x00d6; # capital o with diaeresis
835 $chars{0xc855}=0x00dc; # capital u with diaeresis
836 $chars{0xc857}=0x1e84; # capital w with diaeresis
837 $chars{0xc858}=0x1e8c; # capital x with diaeresis
838 $chars{0xc859}=0x0178; # capital y with diaeresis
839 $chars{0xc861}=0x00e4; # small a with diaeresis
840 $chars{0xc865}=0x00eb; # small e with diaeresis
841 $chars{0xc868}=0x1e27; # small h with diaeresis
842 $chars{0xc869}=0x00ef; # small i with diaeresis
843 $chars{0xc86f}=0x00f6; # small o with diaeresis
844 $chars{0xc874}=0x1e97; # small t with diaeresis
845 $chars{0xc875}=0x00fc; # small u with diaeresis
846 $chars{0xc877}=0x1e85; # small w with diaeresis
847 $chars{0xc878}=0x1e8d; # small x with diaeresis
848 $chars{0xc879}=0x00ff; # small y with diaeresis
850 $chars{0xc920}=0x00a8; # [diaeresis]
851 $chars{0xc961}=0x00e4; # a with umlaut
852 $chars{0xc965}=0x00eb; # e with umlaut
853 $chars{0xc969}=0x00ef; # i with umlaut
854 $chars{0xc96f}=0x00f6; # o with umlaut
855 $chars{0xc975}=0x00fc; # u with umlaut
857 $chars{0xca41}=0x00c5; # capital a with ring above
858 $chars{0xcaad}=0x016e; # capital u with ring above
859 $chars{0xca61}=0x00e5; # small a with ring above
860 $chars{0xca75}=0x016f; # small u with ring above
861 $chars{0xca77}=0x1e98; # small w with ring above
862 $chars{0xca79}=0x1e99; # small y with ring above
863 # 4/11 high comma off centre
864 # 4/12 inverted high comma centred
865 # 4/13 double acute accent
866 $chars{0xcd4f}=0x0150; # capital o with double acute
867 $chars{0xcd55}=0x0170; # capital u with double acute
868 $chars{0xcd6f}=0x0151; # small o with double acute
869 $chars{0xcd75}=0x0171; # small u with double acute
871 $chars{0xce54}=0x01a0; # latin capital letter o with horn
872 $chars{0xce55}=0x01af; # latin capital letter u with horn
873 $chars{0xce74}=0x01a1; # latin small letter o with horn
874 $chars{0xce75}=0x01b0; # latin small letter u with horn
876 $chars{0xcf41}=0x01cd; # capital a with caron
877 $chars{0xcf43}=0x010c; # capital c with caron
878 $chars{0xcf44}=0x010e; # capital d with caron
879 $chars{0xcf45}=0x011a; # capital e with caron
880 $chars{0xcf47}=0x01e6; # capital g with caron
881 $chars{0xcf49}=0x01cf; # capital i with caron
882 $chars{0xcf4b}=0x01e8; # capital k with caron
883 $chars{0xcf4c}=0x013d; # capital l with caron
884 $chars{0xcf4e}=0x0147; # capital n with caron
885 $chars{0xcf4f}=0x01d1; # capital o with caron
886 $chars{0xcf52}=0x0158; # capital r with caron
887 $chars{0xcf53}=0x0160; # capital s with caron
888 $chars{0xcf54}=0x0164; # capital t with caron
889 $chars{0xcf55}=0x01d3; # capital u with caron
890 $chars{0xcf5a}=0x017d; # capital z with caron
891 $chars{0xcf61}=0x01ce; # small a with caron
892 $chars{0xcf63}=0x010d; # small c with caron
893 $chars{0xcf64}=0x010f; # small d with caron
894 $chars{0xcf65}=0x011b; # small e with caron
895 $chars{0xcf67}=0x01e7; # small g with caron
896 $chars{0xcf69}=0x01d0; # small i with caron
897 $chars{0xcf6a}=0x01f0; # small j with caron
898 $chars{0xcf6b}=0x01e9; # small k with caron
899 $chars{0xcf6c}=0x013e; # small l with caron
900 $chars{0xcf6e}=0x0148; # small n with caron
901 $chars{0xcf6f}=0x01d2; # small o with caron
902 $chars{0xcf72}=0x0159; # small r with caron
903 $chars{0xcf73}=0x0161; # small s with caron
904 $chars{0xcf74}=0x0165; # small t with caron
905 $chars{0xcf75}=0x01d4; # small u with caron
906 $chars{0xcf7a}=0x017e; # small z with caron
908 $chars{0xd020}=0x00b8; # cedilla
909 $chars{0xd043}=0x00c7; # capital c with cedilla
910 $chars{0xd044}=0x1e10; # capital d with cedilla
911 $chars{0xd047}=0x0122; # capital g with cedilla
912 $chars{0xd048}=0x1e28; # capital h with cedilla
913 $chars{0xd04b}=0x0136; # capital k with cedilla
914 $chars{0xd04c}=0x013b; # capital l with cedilla
915 $chars{0xd04e}=0x0145; # capital n with cedilla
916 $chars{0xd052}=0x0156; # capital r with cedilla
917 $chars{0xd053}=0x015e; # capital s with cedilla
918 $chars{0xd054}=0x0162; # capital t with cedilla
919 $chars{0xd063}=0x00e7; # small c with cedilla
920 $chars{0xd064}=0x1e11; # small d with cedilla
921 $chars{0xd065}=0x0119; # small e with cedilla
922 $chars{0xd067}=0x0123; # small g with cedilla
923 $chars{0xd068}=0x1e29; # small h with cedilla
924 $chars{0xd06b}=0x0137; # small k with cedilla
925 $chars{0xd06c}=0x013c; # small l with cedilla
926 $chars{0xd06e}=0x0146; # small n with cedilla
927 $chars{0xd072}=0x0157; # small r with cedilla
928 $chars{0xd073}=0x015f; # small s with cedilla
929 $chars{0xd074}=0x0163; # small t with cedilla
932 # 5/3 ogonek (hook to right
933 $chars{0xd320}=0x02db; # ogonek
934 $chars{0xd341}=0x0104; # capital a with ogonek
935 $chars{0xd345}=0x0118; # capital e with ogonek
936 $chars{0xd349}=0x012e; # capital i with ogonek
937 $chars{0xd34f}=0x01ea; # capital o with ogonek
938 $chars{0xd355}=0x0172; # capital u with ogonek
939 $chars{0xd361}=0x0105; # small a with ogonek
940 $chars{0xd365}=0x0119; # small e with ogonek
941 $chars{0xd369}=0x012f; # small i with ogonek
942 $chars{0xd36f}=0x01eb; # small o with ogonek
943 $chars{0xd375}=0x0173; # small u with ogonek
945 $chars{0xd441}=0x1e00; # capital a with ring below
946 $chars{0xd461}=0x1e01; # small a with ring below
947 # 5/5 half circle below
948 $chars{0xf948}=0x1e2a; # capital h with breve below
949 $chars{0xf968}=0x1e2b; # small h with breve below
951 $chars{0xd641}=0x1ea0; # capital a with dot below
952 $chars{0xd642}=0x1e04; # capital b with dot below
953 $chars{0xd644}=0x1e0c; # capital d with dot below
954 $chars{0xd645}=0x1eb8; # capital e with dot below
955 $chars{0xd648}=0x1e24; # capital h with dot below
956 $chars{0xd649}=0x1eca; # capital i with dot below
957 $chars{0xd64b}=0x1e32; # capital k with dot below
958 $chars{0xd64c}=0x1e36; # capital l with dot below
959 $chars{0xd64d}=0x1e42; # capital m with dot below
960 $chars{0xd64e}=0x1e46; # capital n with dot below
961 $chars{0xd64f}=0x1ecc; # capital o with dot below
962 $chars{0xd652}=0x1e5a; # capital r with dot below
963 $chars{0xd653}=0x1e62; # capital s with dot below
964 $chars{0xd654}=0x1e6c; # capital t with dot below
965 $chars{0xd655}=0x1ee4; # capital u with dot below
966 $chars{0xd656}=0x1e7e; # capital v with dot below
967 $chars{0xd657}=0x1e88; # capital w with dot below
968 $chars{0xd659}=0x1ef4; # capital y with dot below
969 $chars{0xd65a}=0x1e92; # capital z with dot below
970 $chars{0xd661}=0x1ea1; # small a with dot below
971 $chars{0xd662}=0x1e05; # small b with dot below
972 $chars{0xd664}=0x1e0d; # small d with dot below
973 $chars{0xd665}=0x1eb9; # small e with dot below
974 $chars{0xd668}=0x1e25; # small h with dot below
975 $chars{0xd669}=0x1ecb; # small i with dot below
976 $chars{0xd66b}=0x1e33; # small k with dot below
977 $chars{0xd66c}=0x1e37; # small l with dot below
978 $chars{0xd66d}=0x1e43; # small m with dot below
979 $chars{0xd66e}=0x1e47; # small n with dot below
980 $chars{0xd66f}=0x1ecd; # small o with dot below
981 $chars{0xd672}=0x1e5b; # small r with dot below
982 $chars{0xd673}=0x1e63; # small s with dot below
983 $chars{0xd674}=0x1e6d; # small t with dot below
984 $chars{0xd675}=0x1ee5; # small u with dot below
985 $chars{0xd676}=0x1e7f; # small v with dot below
986 $chars{0xd677}=0x1e89; # small w with dot below
987 $chars{0xd679}=0x1ef5; # small y with dot below
988 $chars{0xd67a}=0x1e93; # small z with dot below
989 # 5/7 double dot below
990 $chars{0xd755}=0x1e72; # capital u with diaeresis below
991 $chars{0xd775}=0x1e73; # small u with diaeresis below
993 $chars{0xd820}=0x005f; # underline
994 # 5/9 double underline
995 $chars{0xd920}=0x2017; # double underline
996 # 5/10 small low vertical bar
997 $chars{0xda20}=0x02cc; #
998 # 5/11 circumflex below
999 # 5/12 (this position shall not be used)
1000 # 5/13 left half of ligature sign and of double tilde
1001 # 5/14 right half of ligature sign
1002 # 5/15 right half of double tilde
1003 # map {printf "%x :%x\n",$_,$chars{$_};}keys %chars;
1005 sub char_decode5426 {
1009 my @data = unpack("C*", $string);
1011 my $length=scalar(@data);
1012 for (my $i = 0; $i < scalar(@data); $i++) {
1013 my $char= $data[$i];
1014 if ($char >= 0x00 && $char <= 0x7F){
1017 push @characters,$char unless ($char<0x02 ||$char== 0x0F);
1018 }elsif (($char >= 0xC0 && $char <= 0xDF)) {
1021 if ($chars{$char*256+$data[$i+1]}) {
1022 $convchar= $chars{$char * 256 + $data[$i+1]};
1024 # printf "char %x $char, char to convert %x , converted %x\n",$char,$char * 256 + $data[$i - 1],$convchar;
1025 } elsif ($chars{$char}) {
1026 $convchar= $chars{$char};
1027 # printf "0xC char %x, converted %x\n",$char,$chars{$char};
1031 push @characters,$convchar;
1034 if ($chars{$char}) {
1035 $convchar= $chars{$char};
1036 # printf "char %x, converted %x\n",$char,$chars{$char};
1038 # printf "char %x $char\n",$char;
1041 push @characters,$convchar;
1044 $result=pack "U*",@characters;
1045 # $result=~s/\x01//;
1046 # $result=~s/\x00//;
1050 $result=~s/\x1b\x5b//;
1051 # map{printf "%x",$_} @characters;
1061 Koha Development Team <info@koha.org>
1063 Galen Charlton <galen.charlton@liblime.com>