X-Git-Url: http://koha-dev.rot13.org:8081/gitweb/?a=blobdiff_plain;f=misc%2Ftranslator%2Fxgettext.pl;h=e04c5298634791f818c5f42539e30a2d4e4786d4;hb=3cfc2ec7bd1becef4386404fe7fc9f12740e8c97;hp=f7a940b8cd1ba387c1fd6383f49f6b86fde67884;hpb=6f6bfb8aff59ae10b4bf7e8d4bdcb31c5b269458;p=koha-ffzg.git diff --git a/misc/translator/xgettext.pl b/misc/translator/xgettext.pl index f7a940b8cd..e04c529863 100755 --- a/misc/translator/xgettext.pl +++ b/misc/translator/xgettext.pl @@ -1,11 +1,29 @@ #!/usr/bin/perl +# This file is part of Koha. +# +# Koha is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# Koha is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Koha; if not, see . + =head1 NAME -xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction +xgettext.pl - xgettext(1)-like interface for .tt strings extraction =cut +use FindBin; +use lib $FindBin::Bin; + use strict; use warnings; use Getopt::Long; @@ -24,9 +42,11 @@ use vars qw( $disable_fuzzy_p ); use vars qw( $verbose_p ); use vars qw( $po_mode_p ); +our $OUTPUT; + ############################################################################### -sub string_negligible_p ($) { +sub string_negligible_p { my($t) = @_; # a string # Don't emit pure whitespace, pure numbers, pure punctuation, # single letters, or TMPL_VAR's. @@ -37,26 +57,35 @@ sub string_negligible_p ($) { || $t =~ /^\d+$/ # purely digits || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context || $t =~ /^[A-Za-z]$/ # single letters + || $t =~ /^(&[a-z]+;|&#\d+;|&#x[0-9a-fA-F]+;|%%|%s|\s|[[:punct:]])*$/ # html entities,placeholder,punct, ... + || ( $t =~ /^\[\%.*\%\]$/ and $t !~ /\%\].*\[\%/ ) # pure TT entities + || $t =~ /^\s*<\?.*\?>/ # ignore xml prolog ) } -sub token_negligible_p( $ ) { - my($x) = @_; +sub token_negligible_p { + my ($x) = @_; my $t = $x->type; return !$extract_all_p && ( - $t == C4::TmplTokenType::TEXT? string_negligible_p( $x->string ): - $t == C4::TmplTokenType::DIRECTIVE? 1: - $t == C4::TmplTokenType::TEXT_PARAMETRIZED - && join( '', map { my $t = $_->type; - $t == C4::TmplTokenType::DIRECTIVE? - '1': $t == C4::TmplTokenType::TAG? - '': token_negligible_p( $_ )? - '': '1' } @{$x->children} ) eq '' ); + $t == C4::TmplTokenType::TEXT() ? string_negligible_p( $x->string ) + : $t == C4::TmplTokenType::DIRECTIVE() ? 1 + : $t == C4::TmplTokenType::TEXT_PARAMETRIZED() + && join( + '', + map { + my $t = $_->type; + $t == C4::TmplTokenType::DIRECTIVE() ? '1' + : $t == C4::TmplTokenType::TAG() ? '' + : token_negligible_p($_) ? '' + : '1' + } @{ $x->children } + ) eq '' + ); } ############################################################################### -sub remember ($$) { +sub remember { my($token, $string) = @_; # If we determine that the string is negligible, don't bother to remember unless (string_negligible_p( $string ) || token_negligible_p( $token )) { @@ -68,7 +97,7 @@ sub remember ($$) { ############################################################################### -sub string_list () { +sub string_list { my @t = keys %text; # The real gettext tools seems to sort case sensitively; I don't know why @t = sort { $a cmp $b } @t if $sort eq 's'; @@ -85,10 +114,10 @@ sub string_list () { ############################################################################### -sub text_extract (*) { +sub text_extract { my($h) = @_; for (;;) { - my $s = TmplTokenizer::next_token $h; + my $s = TmplTokenizer::next_token($h); last unless defined $s; my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes); if ($kind eq C4::TmplTokenType::TEXT) { @@ -101,15 +130,21 @@ sub text_extract (*) { } } elsif ($kind eq C4::TmplTokenType::TAG && %$attr) { # value [tag=input], meta - my $tag = lc($1) if $t =~ /^<(\S+)/s; - for my $a ('alt', 'content', 'title', 'value','label') { + my $tag; + $tag = lc($1) if $t =~ /^<(\S+)/s; + for my $a ('alt', 'content', 'title', 'value', 'label', 'placeholder', 'aria-label') { if ($attr->{$a}) { next if $a eq 'label' && $tag ne 'optgroup'; next if $a eq 'content' && $tag ne 'meta'; next if $a eq 'value' && ($tag ne 'input' || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME - $val = TmplTokenizer::trim $val; + $val = TmplTokenizer::trim($val); + # for selected attributes replace '[%..%]' with '%s' globally + if ( $a =~ /title|value|alt|content|placeholder|aria-label/ ) { + $val =~ s/\[\%.*?\%\]/\%s/g; + } + # save attribute text for translation remember( $s, $val ) if $val =~ /\S/s; } } @@ -123,37 +158,37 @@ sub text_extract (*) { ############################################################################### -sub generate_strings_list () { +sub generate_strings_list { # Emit all extracted strings. for my $t (string_list) { - printf OUTPUT "%s\n", $t; + printf $OUTPUT "%s\n", $t; } } ############################################################################### -sub generate_po_file () { +sub generate_po_file { # We don't emit the Plural-Forms header; it's meaningless for us my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET'); - $pot_charset = TmplTokenizer::charset_canon $pot_charset; + $pot_charset = TmplTokenizer::charset_canon($pot_charset); # Time stamps aren't exactly right semantically. I don't know how to fix it. my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time)); my $time_pot = $time; my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE'; - print OUTPUT <, YEAR. # EOF - print OUTPUT <\\n" @@ -167,7 +202,7 @@ EOF for my $t (string_list) { if ($text{$t}->[0]->type == C4::TmplTokenType::TEXT_PARAMETRIZED) { my($token, $n) = ($text{$t}->[0], 0); - printf OUTPUT "#. For the first occurrence,\n" + printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1 && $token->parameters_and_fields > 0; for my $param ($token->parameters_and_fields) { $n += 1; @@ -182,61 +217,64 @@ EOF $type = $param->string =~ /\[%(.*?)%\]/is? $1: 'ERROR'; my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is? $2: undef; - printf OUTPUT "#. %s: %s\n", $fmt, + printf $OUTPUT "#. %s: %s\n", $fmt, "$type" . (defined $name? " name=$name": ''); } else { my $name = $param->attributes->{'name'}; - my $value = $param->attributes->{'value'} + my $value; + $value = $param->attributes->{'value'} unless $subtype =~ /^(?:text)$/; - printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype" + printf $OUTPUT "#. %s: %s\n", $fmt, "type=$subtype" . (defined $name? " name=$name->[1]": '') . (defined $value? " value=$value->[1]": ''); } } } elsif ($text{$t}->[0]->type == C4::TmplTokenType::TAG) { my($token) = ($text{$t}->[0]); - printf OUTPUT "#. For the first occurrence,\n" + printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1 && $token->parameters_and_fields > 0; if ($token->string =~ /^attributes->{'http-equiv'}->[1]; - print OUTPUT "#. META http-equiv=$type\n" if defined $type; + print $OUTPUT "#. META http-equiv=$type\n" if defined $type; } elsif ($token->string =~ /^<([a-z0-9]+)/is) { my $tag = uc($1); my $type = (lc($tag) eq 'input'? $token->attributes->{'type'}: undef); my $name = $token->attributes->{'name'}; - printf OUTPUT "#. %s\n", $tag + printf $OUTPUT "#. %s\n", $tag . (defined $type? " type=$type->[1]": '') . (defined $name? " name=$name->[1]": ''); } } elsif ($text{$t}->[0]->has_js_data) { - printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1; - printf OUTPUT "#. SCRIPT\n"; + printf $OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1; + printf $OUTPUT "#. SCRIPT\n"; } my $cformat_p; for my $token (@{$text{$t}}) { my $pathname = $token->pathname; $pathname =~ s/^$directory_re//os; $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/; - printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number + printf $OUTPUT "#: %s:%d\n", $pathname, $token->line_number if defined $pathname && defined $token->line_number; $cformat_p = 1 if $token->type == C4::TmplTokenType::TEXT_PARAMETRIZED; } - printf OUTPUT "#, c-format\n" if $cformat_p; - printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po - TmplTokenizer::string_canon - TmplTokenizer::charset_convert $t, $charset_in, $charset_out; - printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}? + printf $OUTPUT "#, c-format\n" if $cformat_p; + printf $OUTPUT "msgid %s\n", TmplTokenizer::quote_po( + TmplTokenizer::string_canon( + TmplTokenizer::charset_convert($t, $charset_in, $charset_out) + ) + ); + printf $OUTPUT "msgstr %s\n\n", (defined $translation{$t}? TmplTokenizer::quote_po( $translation{$t} ): "\"\""); } } ############################################################################### -sub convert_translation_file () { - open(INPUT, "<$convert_from") || die "$convert_from: $!\n"; - VerboseWarnings::set_input_file_name $convert_from; - while () { +sub convert_translation_file { + open(my $INPUT, '<:encoding(utf-8)', $convert_from) || die "$convert_from: $!\n"; + VerboseWarnings::set_input_file_name($convert_from); + while (<$INPUT>) { chomp; my($msgid, $msgstr) = split(/\t/); die "$convert_from: $.: Malformed tmpl_process input (no tab)\n" @@ -252,13 +290,13 @@ sub convert_translation_file () { $translation{$msgid} = $msgstr unless $msgstr eq '*****'; if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) { - my $candidate = TmplTokenizer::charset_canon $2; + my $candidate = TmplTokenizer::charset_canon($2); die "Conflicting charsets in msgid: $candidate vs $charset_in\n" if defined $charset_in && $charset_in ne $candidate; $charset_in = $candidate; } if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) { - my $candidate = TmplTokenizer::charset_canon $2; + my $candidate = TmplTokenizer::charset_canon($2); die "Conflicting charsets in msgid: $candidate vs $charset_out\n" if defined $charset_out && $charset_out ne $candidate; $charset_out = $candidate; @@ -266,14 +304,14 @@ sub convert_translation_file () { } # The following assumption is correct; that's what HTML::Template assumes if (!defined $charset_in) { - $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8'; - warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n"; + $charset_in = $charset_out = TmplTokenizer::charset_canon('utf-8'); + warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n"; } } ############################################################################### -sub usage ($) { +sub usage { my($exitcode) = @_; my $h = $exitcode? *STDERR: *STDOUT; print $h < sub { usage(0) }, ) || usage_error; -VerboseWarnings::set_application_name $0; -VerboseWarnings::set_pedantic_mode $pedantic_p; +VerboseWarnings::set_application_name($0); +VerboseWarnings::set_pedantic_mode($pedantic_p); usage_error('Missing mandatory option -f') unless defined $files_from || defined $convert_from; @@ -346,26 +384,25 @@ usage_error('You cannot specify both --convert-from and --files-from') if (defined $output && $output ne '-') { print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p; - open(OUTPUT, ">$output") || die "$output: $!\n"; + open($OUTPUT, '>:encoding(utf-8)', $output) || die "$output: $!\n"; } else { print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p; - open(OUTPUT, ">&STDOUT"); + open($OUTPUT, ">&STDOUT"); } -binmode OUTPUT, ':encoding(UTF-8)'; if (defined $files_from) { print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p; - open(INPUT, "<$files_from") || die "$files_from: $!\n"; - while () { + open(my $INPUT, '<:encoding(utf-8)', $files_from) || die "$files_from: $!\n"; + while (<$INPUT>) { chomp; my $input = /^\//? $_: "$directory/$_"; my $h = TmplTokenizer->new( $input ); $h->set_allow_cformat( 1 ); - VerboseWarnings::set_input_file_name $input; + VerboseWarnings::set_input_file_name($input); print STDERR "$0: Processing file \"$input\"\n" if $verbose_p; text_extract( $h ); } - close INPUT; + close $INPUT; } else { print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p; convert_translation_file; @@ -382,8 +419,7 @@ exit(-1) if TmplTokenizer::fatal_p; =head1 DESCRIPTION -This is an experimental script based on the modularized -text-extract2.pl script. It has behaviour similar to +This script has behaviour similar to xgettext(1), and generates gettext-compatible output files. A gettext-like format provides the following advantages: @@ -394,7 +430,7 @@ A gettext-like format provides the following advantages: Translation to non-English-like languages with different word order: gettext's c-format strings can theoretically be -emulated if we are able to do some analysis on the .tmpl input +emulated if we are able to do some analysis on the .tt input and treat in a way similar to %s. =item - @@ -425,10 +461,10 @@ details. If you want to generate GNOME-style POTFILES.in files, such files (passed to -f) can be generated thus: - (cd ../.. && find koha-tmpl/opac-tmpl/default/en \ - -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in - (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \ - -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in + (cd ../.. && find koha-tmpl/opac-tmpl/default/en \ + -name \*.inc -o -name \*.tt) > opac/POTFILES.in + (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \ + -name \*.inc -o -name \*.tt) > intranet/POTFILES.in This is, however, quite pointless, because the "create" and "update" actions have already been implemented in tmpl_process3.pl.