package TmplTokenizer;
use strict;
+#use warnings; FIXME - Bug 2505
use TmplTokenType;
use TmplToken;
use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
# Hideous stuff
use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
-use vars qw( $re_directive_control $re_tmpl_endif_endloop );
+use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl);
BEGIN {
# $re_directive must not do any backreferences
$re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
$re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
$re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
# TMPL_VAR ESCAPE=1/HTML/URL
+ $re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>};
$re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
# Any control flow directive
$re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
sub CDATA_MODE_P () {'cdata-mode-p'}
sub CDATA_CLOSE () {'cdata-close'}
sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
+sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true
sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
sub new {
- my $this = shift;
- my($input) = @_;
- my $class = ref($this) || $this;
- my $self = {};
- bless $self, $class;
-
- my $handle = sprintf('TMPLTOKENIZER%d', $serial);
- $serial += 1;
-
- no strict;
- open($handle, "<$input") || die "$input: $!\n";
- use strict;
- $self->{+FILENAME} = $input;
- $self->{+HANDLE} = $handle;
- $self->{+READAHEAD} = [];
- return $self;
+ shift;
+ my ($filename) = @_;
+ open my $handle,$filename or die "can't open $filename";
+ bless {
+ filename => $filename
+ , handle => $handle
+ , readahead => []
+ } , __PACKAGE__;
}
###############################################################################
sub filename {
my $this = shift;
- return $this->{+FILENAME};
+ return $this->{filename};
}
sub _handle {
my $this = shift;
- return $this->{+HANDLE};
+ return $this->{handle};
}
sub fatal_p {
sub has_readahead_p {
my $this = shift;
- return @{$this->{+READAHEAD}};
+ return @{$this->{readahead}};
}
sub _peek_readahead {
my $this = shift;
- return $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}];
+ return $this->{readahead}->[$#{$this->{readahead}}];
}
sub line_number_start {
return $this->{+PCDATA_MODE_P};
}
+sub js_mode_p {
+ my $this = shift;
+ return $this->{+JS_MODE_P};
+}
+
sub cdata_close {
my $this = shift;
return $this->{+CDATA_CLOSE};
sub _push_readahead {
my $this = shift;
- push @{$this->{+READAHEAD}}, $_[0];
+ push @{$this->{readahead}}, $_[0];
return $this;
}
sub _pop_readahead {
my $this = shift;
- return pop @{$this->{+READAHEAD}};
+ return pop @{$this->{readahead}};
}
sub _append_readahead {
my $this = shift;
- $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] .= $_[0];
+ $this->{readahead}->[$#{$this->{readahead}}] .= $_[0];
return $this;
}
sub _set_readahead {
my $this = shift;
- $this->{+READAHEAD}->[$#{$this->{+READAHEAD}}] = $_[0];
+ $this->{readahead}->[$#{$this->{readahead}}] = $_[0];
return $this;
}
return $this;
}
+sub _set_js_mode {
+ my $this = shift;
+ $this->{+JS_MODE_P} = $_[0];
+ return $this;
+}
+
sub _set_cdata_close {
my $this = shift;
$this->{+CDATA_CLOSE} = $_[0];
###############################################################################
+use vars qw( $js_EscapeSequence );
+BEGIN {
+ # Perl quoting is really screwed up, but this common subexp is way too long
+ $js_EscapeSequence = q{\\\\(?:['"\\\\bfnrt]|[^0-7xu]|[0-3]?[0-7]{1,2}|x[\da-fA-F]{2}|u[\da-fA-F]{4})};
+}
+sub parenleft () { '(' }
+sub parenright () { ')' }
+
+sub split_js ($) {
+ my ($s0) = @_;
+ my @it = ();
+ while (length $s0) {
+ if ($s0 =~ /^\s+/s) { # whitespace
+ push @it, $&;
+ $s0 = $';
+ } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
+ push @it, $&;
+ $s0 = $';
+ } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment
+ push @it, $&;
+ $s0 = $';
+ # Keyword or identifier, ECMA-262 p.13 (section 7.5)
+ } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName
+ push @it, $&;
+ $s0 = $';
+ # Punctuator, ECMA-262 p.13 (section 7.6)
+ } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
+ push @it, $&;
+ $s0 = $';
+ # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
+ } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
+ push @it, $&;
+ $s0 = $';
+ # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+ } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
+ push @it, $&;
+ $s0 = $';
+ # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+ } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
+ push @it, $&;
+ $s0 = $';
+ # StringLiteral, ECMA-262 p.17 (section 7.7.4)
+ # XXX SourceCharacter doesn't seem to be defined (?)
+ } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
+ push @it, $&;
+ $s0 = $';
+ } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!!
+ push @it, $&;
+ $s0 = $';
+ }
+ }
+ return @it;
+}
+
+sub STATE_UNDERSCORE () { 1 }
+sub STATE_PARENLEFT () { 2 }
+sub STATE_STRING_LITERAL () { 3 }
+
+# XXX This is a crazy hack. I don't want to write an ECMAScript parser.
+# XXX A scanner is one thing; a parser another thing.
+sub identify_js_translatables (@) {
+ my @input = @_;
+ my @output = ();
+ # We mark a JavaScript translatable string as in C, i.e., _("literal")
+ # For simplicity, we ONLY look for "_" "(" StringLiteral ")"
+ for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
+ my $reset_state_p = 0;
+ push @output, [0, $input[$i]];
+ if ($input[$i] !~ /\S/s) {
+ ;
+ } elsif ($state == 0) {
+ $state = STATE_UNDERSCORE if $input[$i] eq '_';
+ } elsif ($state == STATE_UNDERSCORE) {
+ $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
+ } elsif ($state == STATE_PARENLEFT) {
+ if ($input[$i] =~ /^(['"])(.*)\1$/s) {
+ ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
+ } else {
+ $state = 0;
+ }
+ } elsif ($state == STATE_STRING_LITERAL) {
+ if ($input[$i] eq parenright) {
+ $output[$j] = [1, $output[$j]->[1], $q, $s];
+ }
+ $state = 0;
+ } else {
+ die "identify_js_translatables internal error: Unknown state $state"
+ }
+ }
+ return \@output;
+}
+
+###############################################################################
+
sub _extract_attributes ($;$) {
my $this = shift;
my($s, $lc) = @_;
$this->_set_cdata_mode( 1 );
$this->_set_cdata_close( "</$1\\s*>" );
$this->_set_pcdata_mode( 0 );
+ $this->_set_js_mode( lc($1) eq 'script' );
# } elsif ($it->string =~ /^<(title)\b/is) {
# $this->_set_cdata_mode( 1 );
# $this->_set_cdata_close( "</$1\\s*>" );
$it = TmplToken->new( $it,
($this->pcdata_mode_p?
TmplTokenType::TEXT: TmplTokenType::CDATA),
- $this->line_number )
+ $this->line_number, $this->filename )
if defined $it;
+ if ($this->js_mode_p) {
+ my $s0 = $it->string;
+ my @head = ();
+ my @tail = ();
+ if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
+ push @head, $1;
+ push @tail, $3;
+ $s0 = $2;
+ }
+ push @head, split_js $s0;
+ $it->set_js_data( identify_js_translatables(@head, @tail) );
+ }
$this->_set_pcdata_mode, 0;
$this->_set_cdata_close, undef unless !defined $it;
}
my @structure = @_;
my $undo_trailing_blanks = sub {
for (my $i = $#structure; $i >= 0; $i -= 1) {
- last if $structure[$i]->type != TmplTokenType::TEXT;
- last if !blank_p($structure[$i]->string);
+ last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
# Queue element structure: [reanalysis-p, token]
push @{$this->{_queue}}, [1, pop @structure];
}
&$undo_trailing_blanks;
$something_done_p = 1;
}
+ # FIXME: If the first token is an open tag, but there is no
+ # FIXME: corresponding close tag, "drop the open tag", i.e.,
+ # FIXME: requeue everything for reanalysis, except the frist tag. :-(
+ if (@structure >= 2
+ && $structure[0]->type == TmplTokenType::TAG
+ && $structure[0]->string =~ /^<([a-z0-9]+)/is
+ && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
+ ) {
+ my $tag_open_count = 1;
+ for (my $i = 1; $i <= $#structure; $i += 1) {
+ if ($structure[$i]->type == TmplTokenType::TAG) {
+ if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
+ $tag_open_count += ($1? -1: +1);
+ }
+ }
+ }
+ if ($tag_open_count > 0) {
+ for (my $i = $#structure; $i; $i -= 1) {
+ push @{$this->{_queue}}, [1, pop @structure];
+ }
+ $something_done_p = 1;
+ }
+ }
# FIXME: If the first token is an open tag, the last token is the
# FIXME: corresponding close tag, and there are no other close tags
# FIXME: inbetween, requeue the tokens from the second token on,
# FIXME: flagged as ok for re-analysis
if (@structure >= 3
&& $structure[0]->type == TmplTokenType::TAG
- && $structure[0]->string =~ /^<([a-z0-9])/is && (my $tag = $1)
+ && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
&& $structure[$#structure]->type == TmplTokenType::TAG
&& $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
my $has_other_open_or_close_tags_p = 0;
my $error_p = 0;
for (my $i = 0; $i <= $#structure; $i += 1) {
if ($structure[$i]->type == TmplTokenType::TAG) {
- if ($structure[$i]->string =~ /^<([A-Z0-9]+)/is) {
+ my $form = $structure[$i]->string;
+ if ($form =~ /^<([A-Z0-9]+)/is) {
my $tag = lc($1);
- push @tags, $tag unless $tag =~ /^<(?:input)/is
- || $tag =~ /\/>$/is;
- } elsif ($structure[$i]->string =~ /^<\/([A-Z0-9]+)/is) {
+ if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
+ push @tags, $tag;
+ }
+ } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
if (@tags && lc($1) eq $tags[$#tags]) {
pop @tags;
} else {
} elsif ($it->type == TmplTokenType::DIRECTIVE) {
$parametrized_p = 1;
} elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
- push @tags, lc($1);
- $with_anchor_p = 1 if lc($1) eq 'a';
- $with_input_p = 1 if lc($1) eq 'input';
+ my $tag = lc($1);
+ push @tags, $tag if $tag !~ /^(?:br|input)$/i;
+ $with_anchor_p = 1 if $tag eq 'a';
+ $with_input_p = 1 if $tag eq 'input';
}
# We hate | and || in msgid strings, so we try to avoid them
for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
$parametrized_p = 1;
} elsif ($next->type == TmplTokenType::TAG) {
if ($next->string =~ /^<([A-Z0-9]+)/is) {
- my $candidate = lc($1);
- push @tags, $candidate
- unless $candidate =~ /^(?:input)$/is;
- $with_anchor_p = 1 if lc($1) eq 'a';
- $with_input_p = 1 if lc($1) eq 'input';
+ my $tag = lc($1);
+ push @tags, $tag if $tag !~ /^(?:br|input)$/i;
+ $with_anchor_p = 1 if $tag eq 'a';
+ $with_input_p = 1 if $tag eq 'input';
} elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
my $close = lc($1);
$quit_p = 1 unless @tags && $close eq $tags[$#tags];
sub blank_p ($) {
my($s) = @_;
- return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var)*$/os;
+ return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var|$re_xsl)*$/os;
}
sub trim ($) {
=item S
Specifies a text input field (<INPUT TYPE=TEXT>).
-This use of the o conversion specifier is somewhat reasonable,
+This use of the S conversion specifier is somewhat reasonable,
since text input fields contain values of undeterminable type,
which can be treated as strings.