5 xgettext.pl - xgettext(1)-like interface for .tmpl strings extraction
17 use vars qw( $convert_from );
18 use vars qw( $files_from $directory $output $sort );
19 use vars qw( $extract_all_p );
20 use vars qw( $pedantic_p );
21 use vars qw( %text %translation );
22 use vars qw( $charset_in $charset_out );
23 use vars qw( $disable_fuzzy_p );
24 use vars qw( $verbose_p );
25 use vars qw( $po_mode_p );
27 ###############################################################################
29 sub string_negligible_p ($) {
30 my($t) = @_; # a string
31 # Don't emit pure whitespace, pure numbers, pure punctuation,
32 # single letters, or TMPL_VAR's.
33 # Punctuation should arguably be translated. But without context
34 # they are untranslatable. Note that $t is a string, not a token object.
35 return !$extract_all_p && (
36 TmplTokenizer::blank_p($t) # blank or TMPL_VAR
37 || $t =~ /^\d+$/ # purely digits
38 || $t =~ /^[-\+\.,:;!\?'"%\(\)\[\]\|]+$/ # punctuation w/o context
39 || $t =~ /^[A-Za-z]$/ # single letters
43 sub token_negligible_p( $ ) {
46 return !$extract_all_p && (
47 $t == TmplTokenType::TEXT? string_negligible_p( $x->string ):
48 $t == TmplTokenType::DIRECTIVE? 1:
49 $t == TmplTokenType::TEXT_PARAMETRIZED
50 && join( '', map { my $t = $_->type;
51 $t == TmplTokenType::DIRECTIVE?
52 '1': $t == TmplTokenType::TAG?
53 '': token_negligible_p( $_ )?
54 '': '1' } @{$x->children} ) eq '' );
57 ###############################################################################
60 my($token, $string) = @_;
61 # If we determine that the string is negligible, don't bother to remember
62 unless (string_negligible_p( $string ) || token_negligible_p( $token )) {
63 my $key = TmplTokenizer::string_canon( $string );
64 $text{$key} = [] unless defined $text{$key};
65 push @{$text{$key}}, $token;
69 ###############################################################################
73 # The real gettext tools seems to sort case sensitively; I don't know why
74 @t = sort { $a cmp $b } @t if $sort eq 's';
76 my @aa = sort { $a->pathname cmp $b->pathname
77 || $a->line_number <=> $b->line_number } @{$text{$a}};
78 my @bb = sort { $a->pathname cmp $b->pathname
79 || $a->line_number <=> $b->line_number } @{$text{$b}};
80 $aa[0]->pathname cmp $bb[0]->pathname
81 || $aa[0]->line_number <=> $bb[0]->line_number;
86 ###############################################################################
88 sub text_extract (*) {
91 my $s = TmplTokenizer::next_token $h;
92 last unless defined $s;
93 my($kind, $t, $attr) = ($s->type, $s->string, $s->attributes);
94 if ($kind eq TmplTokenType::TEXT) {
95 remember( $s, $t ) if $t =~ /\S/s;
96 } elsif ($kind eq TmplTokenType::TEXT_PARAMETRIZED) {
97 remember( $s, $s->form ) if $s->form =~ /\S/s;
98 } elsif ($kind eq TmplTokenType::TAG && %$attr) {
99 # value [tag=input], meta
100 my $tag = lc($1) if $t =~ /^<(\S+)/s;
101 for my $a ('alt', 'content', 'title', 'value','label') {
103 next if $a eq 'label' && $tag ne 'optgroup';
104 next if $a eq 'content' && $tag ne 'meta';
105 next if $a eq 'value' && ($tag ne 'input'
106 || (ref $attr->{'type'} && $attr->{'type'}->[1] =~ /^(?:hidden|radio|checkbox)$/)); # FIXME
107 my($key, $val, $val_orig, $order) = @{$attr->{$a}}; #FIXME
108 $val = TmplTokenizer::trim $val;
109 remember( $s, $val ) if $val =~ /\S/s;
112 } elsif ($s->has_js_data) {
113 for my $t (@{$s->js_data}) {
114 remember( $s, $t->[3] ) if $t->[0]; # FIXME
120 ###############################################################################
122 sub generate_strings_list () {
123 # Emit all extracted strings.
124 for my $t (string_list) {
125 printf OUTPUT "%s\n", $t;
129 ###############################################################################
131 sub generate_po_file () {
132 # We don't emit the Plural-Forms header; it's meaningless for us
133 my $pot_charset = (defined $charset_out? $charset_out: 'CHARSET');
134 $pot_charset = TmplTokenizer::charset_canon $pot_charset;
135 # Time stamps aren't exactly right semantically. I don't know how to fix it.
136 my $time = POSIX::strftime('%Y-%m-%d %H:%M%z', localtime(time));
137 my $time_pot = $time;
138 my $time_po = $po_mode_p? $time: 'YEAR-MO-DA HO:MI+ZONE';
140 # SOME DESCRIPTIVE TITLE.
141 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
142 # This file is distributed under the same license as the PACKAGE package.
143 # FIRST AUTHOR <EMAIL\@ADDRESS>, YEAR.
146 print OUTPUT <<EOF unless $disable_fuzzy_p;
152 "Project-Id-Version: PACKAGE VERSION\\n"
153 "POT-Creation-Date: $time_pot\\n"
154 "PO-Revision-Date: $time_po\\n"
155 "Last-Translator: FULL NAME <EMAIL\@ADDRESS>\\n"
156 "Language-Team: LANGUAGE <LL\@li.org>\\n"
157 "MIME-Version: 1.0\\n"
158 "Content-Type: text/plain; charset=$pot_charset\\n"
159 "Content-Transfer-Encoding: 8bit\\n"
162 my $directory_re = quotemeta("$directory/");
163 for my $t (string_list) {
164 if ($text{$t}->[0]->type == TmplTokenType::TEXT_PARAMETRIZED) {
165 my($token, $n) = ($text{$t}->[0], 0);
166 printf OUTPUT "#. For the first occurrence,\n"
167 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
168 for my $param ($token->parameters_and_fields) {
170 my $type = $param->type;
171 my $subtype = ($type == TmplTokenType::TAG
172 && $param->string =~ /^<input\b/is?
173 $param->attributes->{'type'}->[1]: undef);
174 my $fmt = TmplTokenizer::_formalize( $param );
176 if ($type == TmplTokenType::DIRECTIVE) {
177 $type = $param->string =~ /(TMPL_[A-Z]+)+/is? $1: 'ERROR';
178 my $name = $param->string =~ /\bname=(["']?)([^\s"']+)\1/is?
180 printf OUTPUT "#. %s: %s\n", $fmt,
181 "$type" . (defined $name? " name=$name": '');
183 my $name = $param->attributes->{'name'};
184 my $value = $param->attributes->{'value'}
185 unless $subtype =~ /^(?:text)$/;
186 printf OUTPUT "#. %s: %s\n", $fmt, "type=$subtype"
187 . (defined $name? " name=$name->[1]": '')
188 . (defined $value? " value=$value->[1]": '');
191 } elsif ($text{$t}->[0]->type == TmplTokenType::TAG) {
192 my($token) = ($text{$t}->[0]);
193 printf OUTPUT "#. For the first occurrence,\n"
194 if @{$text{$t}} > 1 && $token->parameters_and_fields > 0;
195 if ($token->string =~ /^<meta\b/is) {
196 my $type = $token->attributes->{'http-equiv'}->[1];
197 print OUTPUT "#. META http-equiv=$type\n" if defined $type;
198 } elsif ($token->string =~ /^<([a-z0-9]+)/is) {
200 my $type = (lc($tag) eq 'input'?
201 $token->attributes->{'type'}: undef);
202 my $name = $token->attributes->{'name'};
203 printf OUTPUT "#. %s\n", $tag
204 . (defined $type? " type=$type->[1]": '')
205 . (defined $name? " name=$name->[1]": '');
207 } elsif ($text{$t}->[0]->has_js_data) {
208 printf OUTPUT "#. For the first occurrence,\n" if @{$text{$t}} > 1;
209 printf OUTPUT "#. SCRIPT\n";
212 for my $token (@{$text{$t}}) {
213 my $pathname = $token->pathname;
214 $pathname =~ s/^$directory_re//os;
215 $pathname =~ s/^.*\/koha-tmpl\/(.*)$/$1/;
216 printf OUTPUT "#: %s:%d\n", $pathname, $token->line_number
217 if defined $pathname && defined $token->line_number;
218 $cformat_p = 1 if $token->type == TmplTokenType::TEXT_PARAMETRIZED;
220 printf OUTPUT "#, c-format\n" if $cformat_p;
221 printf OUTPUT "msgid %s\n", TmplTokenizer::quote_po
222 TmplTokenizer::string_canon
223 TmplTokenizer::charset_convert $t, $charset_in, $charset_out;
224 printf OUTPUT "msgstr %s\n\n", (defined $translation{$t}?
225 TmplTokenizer::quote_po( $translation{$t} ): "\"\"");
229 ###############################################################################
231 sub convert_translation_file () {
232 open(INPUT, "<$convert_from") || die "$convert_from: $!\n";
233 VerboseWarnings::set_input_file_name $convert_from;
236 my($msgid, $msgstr) = split(/\t/);
237 die "$convert_from: $.: Malformed tmpl_process input (no tab)\n"
238 unless defined $msgstr;
240 # Fixup some of the bad strings
241 $msgid =~ s/^SELECTED>//;
244 my $token = TmplToken->new( $msgid, TmplTokenType::UNKNOWN, undef, undef );
245 remember( $token, $msgid );
246 $msgstr =~ s/^(?:LIMIT;|LIMITED;)//g; # unneeded for tmpl_process3
247 $translation{$msgid} = $msgstr unless $msgstr eq '*****';
249 if ($msgid =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
250 my $candidate = TmplTokenizer::charset_canon $2;
251 die "Conflicting charsets in msgid: $candidate vs $charset_in\n"
252 if defined $charset_in && $charset_in ne $candidate;
253 $charset_in = $candidate;
255 if ($msgstr =~ /\bcharset=(["']?)([^;\s"']+)\1/s) {
256 my $candidate = TmplTokenizer::charset_canon $2;
257 die "Conflicting charsets in msgid: $candidate vs $charset_out\n"
258 if defined $charset_out && $charset_out ne $candidate;
259 $charset_out = $candidate;
262 # The following assumption is correct; that's what HTML::Template assumes
263 if (!defined $charset_in) {
264 $charset_in = $charset_out = TmplTokenizer::charset_canon 'utf-8';
265 warn "Warning: Can't determine original templates' charset, defaulting to $charset_in\n";
269 ###############################################################################
273 my $h = $exitcode? *STDERR: *STDOUT;
276 Extract translatable strings from given HTML::Template input files.
279 -f, --files-from=FILE Get list of input files from FILE
280 -D, --directory=DIRECTORY Add DIRECTORY to list for input files search
282 Output file location:
283 -o, --output=FILE Write output to specified file
285 HTML::Template options:
286 -a, --extract-all Extract all strings
287 --pedantic-warnings Issue warnings even for detected problems
288 which are likely to be harmless
291 -s, --sort-output generate sorted output
292 -F, --sort-by-file sort output by file location
293 -v, --verbose explain what is being done
296 --help Display this help and exit
298 Try `perldoc $0' for perhaps more information.
303 ###############################################################################
305 sub usage_error (;$) {
306 print STDERR "$_[0]\n" if @_;
307 print STDERR "Try `$0 --help' for more information.\n";
311 ###############################################################################
313 Getopt::Long::config qw( bundling no_auto_abbrev );
315 'a|extract-all' => \$extract_all_p,
316 'charset=s' => sub { $charset_in = $charset_out = $_[1] }, # INTERNAL
317 'convert-from=s' => \$convert_from,
318 'D|directory=s' => \$directory,
319 'disable-fuzzy' => \$disable_fuzzy_p, # INTERNAL
320 'f|files-from=s' => \$files_from,
321 'I|input-charset=s' => \$charset_in, # INTERNAL
322 'pedantic-warnings|pedantic' => sub { $pedantic_p = 1 },
323 'O|output-charset=s' => \$charset_out, # INTERNAL
324 'output|o=s' => \$output,
325 'po-mode' => \$po_mode_p, # INTERNAL
326 's|sort-output' => sub { $sort = 's' },
327 'F|sort-by-file' => sub { $sort = 'F' },
328 'v|verbose' => \$verbose_p,
329 'help' => sub { usage(0) },
332 VerboseWarnings::set_application_name $0;
333 VerboseWarnings::set_pedantic_mode $pedantic_p;
335 usage_error('Missing mandatory option -f')
336 unless defined $files_from || defined $convert_from;
337 $directory = '.' unless defined $directory;
339 usage_error('You cannot specify both --convert-from and --files-from')
340 if defined $convert_from && defined $files_from;
342 if (defined $output && $output ne '-') {
343 print STDERR "$0: Opening output file \"$output\"\n" if $verbose_p;
344 open(OUTPUT, ">$output") || die "$output: $!\n";
346 print STDERR "$0: Outputting to STDOUT...\n" if $verbose_p;
347 open(OUTPUT, ">&STDOUT");
350 if (defined $files_from) {
351 print STDERR "$0: Opening input file list \"$files_from\"\n" if $verbose_p;
352 open(INPUT, "<$files_from") || die "$files_from: $!\n";
355 my $input = /^\//? $_: "$directory/$_";
356 my $h = TmplTokenizer->new( $input );
357 $h->set_allow_cformat( 1 );
358 VerboseWarnings::set_input_file_name $input;
359 print STDERR "$0: Processing file \"$input\"\n" if $verbose_p;
364 print STDERR "$0: Converting \"$convert_from\"\n" if $verbose_p;
365 convert_translation_file;
369 warn "This input will not work with Mozilla standards-compliant mode\n", undef
370 if TmplTokenizer::syntaxerror_p;
373 exit(-1) if TmplTokenizer::fatal_p;
375 ###############################################################################
379 This is an experimental script based on the modularized
380 text-extract2.pl script. It has behaviour similar to
381 xgettext(1), and generates gettext-compatible output files.
383 A gettext-like format provides the following advantages:
389 Translation to non-English-like languages with different word
390 order: gettext's c-format strings can theoretically be
391 emulated if we are able to do some analysis on the .tmpl input
392 and treat <TMPL_VAR> in a way similar to %s.
396 Context for the extracted strings: the gettext format provides
397 the filenames and line numbers where each string can be found.
398 The translator can read the source file and see the context,
399 in case the string by itself can mean several different things.
403 Place for the translator to add comments about the translations.
407 Gettext-compatible tools, if any, might be usable if we adopt
412 This script has already been in use for over a year and should
413 be reasonable stable. Nevertheless, it is still somewhat
414 experimental and there are still some issues.
416 Please refer to the explanation in tmpl_process3 for further
419 If you want to generate GNOME-style POTFILES.in files, such
420 files (passed to -f) can be generated thus:
422 (cd ../.. && find koha-tmpl/opac-tmpl/default/en \
423 -name \*.inc -o -name \*.tmpl) > opac/POTFILES.in
424 (cd ../.. && find koha-tmpl/intranet-tmpl/default/en \
425 -name \*.inc -o -name \*.tmpl) > intranet/POTFILES.in
427 This is, however, quite pointless, because the "create" and
428 "update" actions have already been implemented in tmpl_process3.pl.
430 =head2 Strings inside JavaScript
432 In the SCRIPT elements, the script will attempt to scan for
433 _("I<string literal>") patterns, and extract the I<string literal>
434 as a translatable string.
436 Note that the C-like _(...) notation is required.
438 The JavaScript must actually define a _ function
439 so that the code remains correct JavaScript.
440 A suitable definition of such a function can be
442 function _(s) { return s } // dummy function for gettext
453 There probably are some. Bugs related to scanning of <INPUT>
454 tags seem to be especially likely to be present.
456 Its diagnostics are probably too verbose.
458 When a <TMPL_VAR> within a JavaScript-related attribute is
459 detected, the script currently displays no warnings at all.
460 It might be good to display some kind of warning.
462 Its sort order (-s option) seems to be different than the real
463 xgettext(1)'s sort option. This will result in translation
464 strings inside the generated PO file spuriously moving about
465 when tmpl_process3.pl calls msgmerge(1) to update the PO file.
467 If a Javascript string has leading spaces, it will
468 generate strings with spurious leading spaces,
469 leading to failure to match the strings when actually generating