use strict;
#use warnings; FIXME - Bug 2505
-use TmplTokenType;
-use TmplToken;
+use C4::TmplTokenType;
+use C4::TmplToken;
+use C4::TTParser;
use VerboseWarnings qw( pedantic_p error_normal warn_normal warn_pedantic );
require Exporter;
=head1 NAME
-TmplTokenizer.pm - Simple-minded tokenizer class for HTML::Template .tmpl files
+TmplTokenizer.pm - Simple-minded wrapper class for TTParser
=head1 DESCRIPTION
-Because .tmpl files contains HTML::Template directives
-that tend to confuse real parsers (e.g., HTML::Parse),
-it might be better to create a customized scanner
-to scan the template files for tokens.
-This module is a simple-minded attempt at such a scanner.
+A wrapper for the functionality found in TTParser to allow an easier transition to Template Toolkit
=cut
###############################################################################
# Hideous stuff
-use vars qw( $re_directive $re_tmpl_var $re_tmpl_var_escaped $re_tmpl_include );
-use vars qw( $re_directive_control $re_tmpl_endif_endloop $re_xsl);
+use vars qw( $re_xsl $re_end_entity $re_tmpl_var);
BEGIN {
- # $re_directive must not do any backreferences
- $re_directive = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:VAR|LOOP|INCLUDE|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
- # TMPL_VAR or TMPL_INCLUDE
- $re_tmpl_var = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
- $re_tmpl_include = q{<(?:(?i)(?:!--\s*)?TMPL_(?:INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
- # TMPL_VAR ESCAPE=1/HTML/URL
+ $re_tmpl_var = q{\[%\s*[get|set|default]?\s*[\w\.]+\s*[|.*?]?\s*%\]};
$re_xsl = q{<\/?(?:xsl:)(?:[\s\-a-zA-Z0-9"'\/\.\[\]\@\(\):=,$]+)\/?>};
- $re_tmpl_var_escaped = q{<(?:(?i)(?:!--\s*)?TMPL_(?:VAR|INCLUDE)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))\s+ESCAPE=(?:1|HTML|URL)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
- # Any control flow directive
- $re_directive_control = q{<(?:(?i)(?:!--\s*)?\/?TMPL_(?:LOOP|IF|ELSE|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
- # /LOOP or /IF or /UNLESS
- $re_tmpl_endif_endloop = q{<(?:(?i)(?:!--\s*)?\/TMPL_(?:LOOP|IF|UNLESS)(?:\s+(?:[a-zA-Z][-a-zA-Z0-9]*=)?(?:'[^']*'|"[^"]*"|[^\s<>]+))*\s*(?:--)?)>};
-}
-
-# Hideous stuff from subst.pl, slightly modified to use the above hideous stuff
-# Note: The $re_tag's set $1 (<tag), $2 (>), and $3 (rest of string)
-use vars qw( $re_comment $re_entity_name $re_end_entity $re_etag );
-use vars qw( $re_tag_strict $re_tag_compat @re_tag );
-sub re_tag ($) {
- my($compat) = @_;
- my $etag = $compat? '>': '<>\/';
- # This is no longer similar to the original regexp in subst.pl :-(
- # Note that we don't want <> in compat mode; Mozilla knows about <
- q{(<\/?(?:|(?:"(?:} . $re_directive . q{|[^"])*"|'(?:} . $re_directive . q{|[^'])*'|--(?:(?!--)(?:$re_directive)*.)*--|(?:}
- . $re_directive
- . q{|(?!--)[^"'<>} . $etag . q{]))+))([} . $etag . q{]|(?=<))(.*)};
-}
-BEGIN {
- $re_comment = '(?:--(?:[^-]|-[^-])*--)';
- $re_entity_name = '(?:[^&%#;<>\s]+)'; # NOTE: not really correct SGML
$re_end_entity = '(?:;|$|(?=\s))'; # semicolon or before-whitespace
- $re_etag = q{(?:<\/?(?:"[^"]*"|'[^']*'|[^"'>\/])*[>\/])}; # end-tag
- @re_tag = ($re_tag_strict, $re_tag_compat) = (re_tag(0), re_tag(1));
}
-
# End of the hideous stuff
use vars qw( $serial );
sub SYNTAXERROR_P () {'syntaxerror-p'}
sub FILENAME () {'input'}
-sub HANDLE () {'handle'}
+#sub HANDLE () {'handle'}
-sub READAHEAD () {'readahead'}
+#sub READAHEAD () {'readahead'}
sub LINENUM_START () {'lc_0'}
sub LINENUM () {'lc'}
sub CDATA_MODE_P () {'cdata-mode-p'}
sub CDATA_CLOSE () {'cdata-close'}
-sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
+#sub PCDATA_MODE_P () {'pcdata-mode-p'} # additional submode for CDATA
sub JS_MODE_P () {'js-mode-p'} # cdata-mode-p must also be true
sub ALLOW_CFORMAT_P () {'allow-cformat-p'}
sub new {
shift;
my ($filename) = @_;
- open my $handle,$filename or die "can't open $filename";
+ #open my $handle,$filename or die "can't open $filename";
+ my $parser = C4::TTParser->new;
+ $parser->build_tokens( $filename );
bless {
- filename => $filename
- , handle => $handle
- , readahead => []
+ filename => $filename,
+ _parser => $parser
+# , handle => $handle
+# , readahead => []
} , __PACKAGE__;
}
return $this->{filename};
}
-sub _handle {
- my $this = shift;
- return $this->{handle};
-}
-
sub fatal_p {
my $this = shift;
return $this->{+FATAL_P};
}
+# work around, currently not implemented
sub syntaxerror_p {
- my $this = shift;
- return $this->{+SYNTAXERROR_P};
-}
-
-sub has_readahead_p {
- my $this = shift;
- return @{$this->{readahead}};
-}
-
-sub _peek_readahead {
- my $this = shift;
- return $this->{readahead}->[$#{$this->{readahead}}];
-}
-
-sub line_number_start {
- my $this = shift;
- return $this->{+LINENUM_START};
-}
-
-sub line_number {
- my $this = shift;
- return $this->{+LINENUM};
-}
-
-sub cdata_mode_p {
- my $this = shift;
- return $this->{+CDATA_MODE_P};
-}
-
-sub pcdata_mode_p {
- my $this = shift;
- return $this->{+PCDATA_MODE_P};
+# my $this = shift;
+# return $this->{+SYNTAXERROR_P};
+ return 0;
}
sub js_mode_p {
return $this->{+JS_MODE_P};
}
-sub cdata_close {
- my $this = shift;
- return $this->{+CDATA_CLOSE};
-}
-
sub allow_cformat_p {
my $this = shift;
return $this->{+ALLOW_CFORMAT_P};
return $this;
}
-sub _set_syntaxerror {
- my $this = shift;
- $this->{+SYNTAXERROR_P} = $_[0];
- return $this;
-}
-
-sub _push_readahead {
- my $this = shift;
- push @{$this->{readahead}}, $_[0];
- return $this;
-}
-
-sub _pop_readahead {
- my $this = shift;
- return pop @{$this->{readahead}};
-}
-
-sub _append_readahead {
- my $this = shift;
- $this->{readahead}->[$#{$this->{readahead}}] .= $_[0];
- return $this;
-}
-
-sub _set_readahead {
- my $this = shift;
- $this->{readahead}->[$#{$this->{readahead}}] = $_[0];
- return $this;
-}
-
-sub _increment_line_number {
- my $this = shift;
- $this->{+LINENUM} += 1;
- return $this;
-}
-
-sub _set_line_number_start {
- my $this = shift;
- $this->{+LINENUM_START} = $_[0];
- return $this;
-}
-
-sub _set_cdata_mode {
- my $this = shift;
- $this->{+CDATA_MODE_P} = $_[0];
- return $this;
-}
-
-sub _set_pcdata_mode {
- my $this = shift;
- $this->{+PCDATA_MODE_P} = $_[0];
- return $this;
-}
-
sub _set_js_mode {
my $this = shift;
$this->{+JS_MODE_P} = $_[0];
return $this;
}
-sub _set_cdata_close {
- my $this = shift;
- $this->{+CDATA_CLOSE} = $_[0];
- return $this;
-}
-
+#used in xgettext, tmpl_process3 and text-extract2
sub set_allow_cformat {
my $this = shift;
$this->{+ALLOW_CFORMAT_P} = $_[0];
sub parenleft () { '(' }
sub parenright () { ')' }
-sub split_js ($) {
+sub _split_js ($) {
my ($s0) = @_;
my @it = ();
while (length $s0) {
- if ($s0 =~ /^\s+/s) { # whitespace
- push @it, $&;
- $s0 = $';
- } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
- push @it, $&;
- $s0 = $';
- } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment
- push @it, $&;
- $s0 = $';
- # Keyword or identifier, ECMA-262 p.13 (section 7.5)
- } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName
- push @it, $&;
- $s0 = $';
- # Punctuator, ECMA-262 p.13 (section 7.6)
- } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
- push @it, $&;
- $s0 = $';
- # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
- } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
- push @it, $&;
- $s0 = $';
- # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
- } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
- push @it, $&;
- $s0 = $';
- # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
- } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
- push @it, $&;
- $s0 = $';
- # StringLiteral, ECMA-262 p.17 (section 7.7.4)
- # XXX SourceCharacter doesn't seem to be defined (?)
- } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
- push @it, $&;
- $s0 = $';
- } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!!
- push @it, $&;
- $s0 = $';
- }
+ if ($s0 =~ /^\s+/s) { # whitespace
+ push @it, $&;
+ $s0 = $';
+ } elsif ($s0 =~ /^\/\/[^\r\n]*(?:[\r\n]|$)/s) { # C++-style comment
+ push @it, $&;
+ $s0 = $';
+ } elsif ($s0 =~ /^\/\*(?:(?!\*\/).)*\*\//s) { # C-style comment
+ push @it, $&;
+ $s0 = $';
+ # Keyword or identifier, ECMA-262 p.13 (section 7.5)
+ } elsif ($s0 =~ /^[A-Z_\$][A-Z\d_\$]*/is) { # IdentifierName
+ push @it, $&;
+ $s0 = $';
+ # Punctuator, ECMA-262 p.13 (section 7.6)
+ } elsif ($s0 =~ /^(?:[\(\){}\[\];]|>>>=|<<=|>>=|[-\+\*\/\&\|\^\%]=|>>>|<<|>>|--|\+\+|\|\||\&\&|==|<=|>=|!=|[=><,!~\?:\.\-\+\*\/\&\|\^\%])/s) {
+ push @it, $&;
+ $s0 = $';
+ # DecimalLiteral, ECMA-262 p.14 (section 7.7.3); note: bug in the spec
+ } elsif ($s0 =~ /^(?:0|[1-9]\d+(?:\.\d*(?:[eE][-\+]?\d+)?)?)/s) {
+ push @it, $&;
+ $s0 = $';
+ # HexIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+ } elsif ($s0 =~ /^0[xX][\da-fA-F]+/s) {
+ push @it, $&;
+ $s0 = $';
+ # OctalIntegerLiteral, ECMA-262 p.15 (section 7.7.3)
+ } elsif ($s0 =~ /^0[\da-fA-F]+/s) {
+ push @it, $&;
+ $s0 = $';
+ # StringLiteral, ECMA-262 p.17 (section 7.7.4)
+ # XXX SourceCharacter doesn't seem to be defined (?)
+ } elsif ($s0 =~ /^(?:"(?:(?!["\\\r\n]).|$js_EscapeSequence)*"|'(?:(?!['\\\r\n]).|$js_EscapeSequence)*')/os) {
+ push @it, $&;
+ $s0 = $';
+ } elsif ($s0 =~ /^./) { # UNKNOWN TOKEN !!!
+ push @it, $&;
+ $s0 = $';
+ }
}
return @it;
}
# XXX This is a crazy hack. I don't want to write an ECMAScript parser.
# XXX A scanner is one thing; a parser another thing.
-sub identify_js_translatables (@) {
+sub _identify_js_translatables (@) {
my @input = @_;
my @output = ();
# We mark a JavaScript translatable string as in C, i.e., _("literal")
# For simplicity, we ONLY look for "_" "(" StringLiteral ")"
for (my $i = 0, my $state = 0, my($j, $q, $s); $i <= $#input; $i += 1) {
- my $reset_state_p = 0;
- push @output, [0, $input[$i]];
- if ($input[$i] !~ /\S/s) {
- ;
- } elsif ($state == 0) {
- $state = STATE_UNDERSCORE if $input[$i] eq '_';
- } elsif ($state == STATE_UNDERSCORE) {
- $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
- } elsif ($state == STATE_PARENLEFT) {
- if ($input[$i] =~ /^(['"])(.*)\1$/s) {
- ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
- } else {
- $state = 0;
- }
- } elsif ($state == STATE_STRING_LITERAL) {
- if ($input[$i] eq parenright) {
- $output[$j] = [1, $output[$j]->[1], $q, $s];
- }
- $state = 0;
- } else {
- die "identify_js_translatables internal error: Unknown state $state"
- }
+# warn $input[$i];
+ my $reset_state_p = 0;
+ push @output, [0, $input[$i]];
+ if ($input[$i] !~ /\S/s) {
+ ;
+ } elsif ($state == 0) {
+ $state = STATE_UNDERSCORE if $input[$i] eq '_';
+ } elsif ($state == STATE_UNDERSCORE) {
+ $state = $input[$i] eq parenleft ? STATE_PARENLEFT : 0;
+ } elsif ($state == STATE_PARENLEFT) {
+ if ($input[$i] =~ /^(['"])(.*)\1$/s) {
+ ($state, $j, $q, $s) = (STATE_STRING_LITERAL, $#output, $1, $2);
+ } else {
+ $state = 0;
+ }
+ } elsif ($state == STATE_STRING_LITERAL) {
+ if ($input[$i] eq parenright) {
+ $output[$j] = [1, $output[$j]->[1], $q, $s];
+ }
+ $state = 0;
+ } else {
+ die "identify_js_translatables internal error: Unknown state $state"
+ }
}
+# use Data::Dumper;
+# warn Dumper \@output;
return \@output;
}
###############################################################################
-sub _extract_attributes ($;$) {
- my $this = shift;
- my($s, $lc) = @_;
- my %attr;
- $s = $1 if $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\/\S$/s # XML-style self-closing tags
- || $s =~ /^<(?:(?!$re_directive_control)\S)+(.*)\S$/s; # SGML-style tags
-
- for (my $i = 0; $s =~ /^(?:$re_directive_control)?\s+(?:$re_directive_control)?(?:([a-zA-Z][-a-zA-Z0-9]*)\s*=\s*)?('((?:$re_directive|[^'])*)'|"((?:$re_directive|[^"])*)"|((?:$re_directive|[^\s<>])+))/os;) {
- my($key, $val, $val_orig, $rest)
- = ($1, (defined $3? $3: defined $4? $4: $5), $2, $');
- $i += 1;
- $attr{+lc($key)} = [$key, $val, $val_orig, $i];
- $s = $rest;
- if ($val =~ /$re_tmpl_include/os) {
- warn_normal "TMPL_INCLUDE in attribute: $val_orig\n", $lc;
- } elsif ($val =~ /$re_tmpl_var/os && $val !~ /$re_tmpl_var_escaped/os) {
- # XXX: we probably should not warn if key is "onclick" etc
- # XXX: there's just no reasonable thing to suggest
- my $suggest = ($key =~ /^(?:action|archive|background|cite|classid|codebase|data|datasrc|for|href|longdesc|profile|src|usemap)$/i? 'URL': 'HTML');
- undef $suggest if $key =~ /^(?:onblur|onchange|onclick|ondblclick|onfocus|onkeydown|onkeypress|onkeyup|onload|onmousedown|onmousemove|onmouseout|onmouseover|onmouseup|onreset|onselect|onsubmit|onunload)$/i;
- warn_pedantic
- "Suggest ESCAPE=$suggest for TMPL_VAR in attribute \"$key\""
- . ": $val_orig",
- $lc, \$pedantic_tmpl_var_use_in_nonpedantic_mode_p
- if defined $suggest && (pedantic_p || !$pedantic_tmpl_var_use_in_nonpedantic_mode_p);
- } elsif ($val_orig !~ /^['"]/) {
- my $t = $val; $t =~ s/$re_directive_control//os;
- warn_pedantic
- "Unquoted attribute contains character(s) that should be quoted"
- . ": $val_orig",
- $lc, \$pedantic_attribute_error_in_nonpedantic_mode_p
- if $t =~ /[^-\.A-Za-z0-9]/s;
- }
- }
- my $s2 = $s; $s2 =~ s/$re_tmpl_endif_endloop//g; # for the next check
- if ($s2 =~ /\S/s) { # should never happen
- if ($s =~ /^([^\n]*)\n/s) { # this is even worse
- error_normal("Completely confused while extracting attributes: $1", $lc);
- error_normal((scalar(split(/\n/, $s)) - 1) . " more line(s) not shown.", undef);
- $this->_set_fatal( 1 );
- } else {
- # There's something wrong with the attribute syntax.
- # We might be able to deduce a likely cause by looking more.
- if ($s =~ /^[a-z0-9]/is && "<foo $s>" =~ /^$re_tag_compat$/s) {
- warn_normal "Probably missing whitespace before or missing quotation mark near: $s\n", $lc;
- } else {
- warn_normal "Strange attribute syntax: $s\n", $lc;
- }
- }
- }
- return \%attr;
-}
-
-sub _next_token_internal {
- my $this = shift;
- my($h) = @_;
- my($it, $kind);
- my $eof_p = 0;
- $this->_pop_readahead if $this->has_readahead_p
- && !ref $this->_peek_readahead
- && !length $this->_peek_readahead;
- if (!$this->has_readahead_p) {
- my $next = scalar <$h>;
- $eof_p = !defined $next;
- if (!$eof_p) {
- $this->_increment_line_number;
- $this->_push_readahead( $next );
- }
- }
- $this->_set_line_number_start( $this->line_number ); # remember 1st line num
- if ($this->has_readahead_p && ref $this->_peek_readahead) { # TmplToken obj.
- ($it, $kind) = ($this->_pop_readahead, undef);
- } elsif ($eof_p && !$this->has_readahead_p) { # nothing left to do
- ;
- } elsif ($this->_peek_readahead =~ /^\s+/s) { # whitespace
- ($kind, $it) = (TmplTokenType::TEXT, $&);
- $this->_set_readahead( $' );
- # FIXME the following (the [<\s] part) is an unreliable HACK :-(
- } elsif ($this->_peek_readahead =~ /^(?:[^<]|<[<\s])*(?:[^<\s])/s) { # non-space normal text
- ($kind, $it) = (TmplTokenType::TEXT, $&);
- $this->_set_readahead( $' );
- warn_normal "Unescaped < in $it\n", $this->line_number_start
- if !$this->cdata_mode_p && $it =~ /</s;
- } else { # tag/declaration/processing instruction
- my $ok_p = 0;
- my $bad_comment_p = 0;
- for (my $cdata_close = $this->cdata_close;;) {
- if ($this->cdata_mode_p) {
- my $next = $this->_pop_readahead;
- if ($next =~ /^$cdata_close/is) {
- ($kind, $it) = (TmplTokenType::TAG, $&);
- $this->_push_readahead( $' );
- $ok_p = 1;
- } elsif ($next =~ /^((?:(?!$cdata_close).)+)($cdata_close)/is) {
- ($kind, $it) = (TmplTokenType::TEXT, $1);
- $this->_push_readahead( "$2$'" );
- $ok_p = 1;
- } else {
- ($kind, $it) = (TmplTokenType::TEXT, $next);
- $ok_p = 1;
- }
- } elsif ($this->_peek_readahead =~ /^$re_tag_compat/os) {
- # If we detect a "closed start tag" but we know that the
- # following token looks like a TMPL_VAR, don't stop
- my($head, $tail, $post) = ($1, $2, $3);
- if ($tail eq '' && $post =~ $re_tmpl_var) {
- # Don't bother to show the warning if we're too confused
- # FIXME. There's no method for _closed_start_tag_warning
- if (!defined $this->{'_closed_start_tag_warning'}
- || ($this->{'_closed_start_tag_warning'}->[0] eq $head
- && $this->{'_closed_start_tag_warning'}->[1] != $this->line_number - 1)) {
- warn_normal "Possible SGML \"closed start tag\" notation: $head<\n", $this->line_number
- if split(/\n/, $head) < 10;
- }
- $this->{'_closed_start_tag_warning'} = [$head, $this->line_number];
- } else {
- ($kind, $it) = (TmplTokenType::TAG, "$head>");
- $this->_set_readahead( $post );
- $ok_p = 1;
- warn_normal "SGML \"closed start tag\" notation: $head<\n", $this->line_number if $tail eq ''
- and $head ne '<!DOCTYPE stylesheet ['; # another bit of temporary ugliness for bug 4472
- }
- } elsif ($this->_peek_readahead =~ /^<!--(?:(?!-->)$re_directive*.)*-->/os) {
- ($kind, $it) = (TmplTokenType::COMMENT, $&);
- $this->_set_readahead( $' );
- $ok_p = 1;
- $bad_comment_p = 1;
- }
- last if $ok_p;
- my $next = scalar <$h>;
- $eof_p = !defined $next;
- last if $eof_p;
- $this->_increment_line_number;
- $this->_append_readahead( $next );
- }
- if ($kind ne TmplTokenType::TAG) {
- ;
- } elsif ($it =~ /^<!/) {
- $kind = TmplTokenType::DECL;
- $kind = TmplTokenType::COMMENT if $it =~ /^<!--(?:(?!-->).)*-->/;
- if ($kind == TmplTokenType::COMMENT && $it =~ /^<!--\s*#include/s) {
- warn_normal "Apache #include directive found instead of HTML::Template <TMPL_INCLUDE> directive", $this->line_number_start;
- }
- } elsif ($it =~ /^<\?/) {
- $kind = TmplTokenType::PI;
- }
- if ($it =~ /^$re_directive/ios && !$this->cdata_mode_p) {
- $kind = TmplTokenType::DIRECTIVE;
- } elsif ($bad_comment_p) {
- warn_normal sprintf("Syntax error in comment: %s\n", $it),
- $this->line_number_start;
- $this->_set_syntaxerror( 1 );
- }
- if (!$ok_p && $eof_p) {
- ($kind, $it) = (TmplTokenType::UNKNOWN, $this->_peek_readahead);
- $this->_set_readahead, undef;
- $this->_set_syntaxerror( 1 );
- }
- }
- warn_normal "Unrecognizable token found: "
- . (split(/\n/, $it) < 10? $it: '(too confused to show details)')
- . "\n", $this->line_number_start
- if $kind == TmplTokenType::UNKNOWN;
- return defined $it? (ref $it? $it: TmplToken->new($it, $kind, $this->line_number, $this->filename)): undef;
-}
-
-sub _next_token_intermediate {
- my $this = shift;
- my $h = $this->_handle;
- my $it;
- if (!$this->cdata_mode_p) {
- $it = $this->_next_token_internal($h);
- if (defined $it && $it->type == TmplTokenType::TAG) {
- if ($it->string =~ /^<(script|style|textarea)\b/is ||
- ($this->filename =~ /(opensearch)|(opac-showreviews-rss)/ && $it->string =~ /^<(description)\b/) # FIXME special case to handle
- # a CDATA in opac-opensearch.tmpl and opac-showreviews-rss.tmpl
- ) {
- $this->_set_cdata_mode( 1 );
- $this->_set_cdata_close( "</$1\\s*>" );
- $this->_set_pcdata_mode( 0 );
- $this->_set_js_mode( lc($1) eq 'script' );
-# } elsif ($it->string =~ /^<(title)\b/is) {
-# $this->_set_cdata_mode( 1 );
-# $this->_set_cdata_close( "</$1\\s*>" );
-# $this->_set_pcdata_mode( 1 );
- }
- $it->set_attributes( $this->_extract_attributes($it->string, $it->line_number) );
- }
- } else {
- my $eof_p = 0;
- for ($it = '', my $cdata_close = $this->cdata_close;;) {
- my $next = $this->_next_token_internal($h);
- $eof_p = !defined $next;
- last if $eof_p;
- if (defined $next && $next->string =~ /$cdata_close/is) {
- $this->_push_readahead( $next ); # push entire TmplToken object
- $this->_set_cdata_mode( 0 );
- }
- last unless $this->cdata_mode_p;
- $it .= $next->string;
- }
- if ($eof_p) {
- $it = undef;
- error_normal "Unexpected end of file while looking for "
- . $this->cdata_close
- . "\n", $this->line_number_start;
- $this->_set_fatal( 1 );
- $this->_set_syntaxerror( 1 );
- }
- if ($this->pcdata_mode_p) {
- my $check = $it;
- $check =~ s/$re_directive//gos;
- warn_pedantic "Markup found in PCDATA\n", $this->line_number,
- \$pedantic_error_markup_in_pcdata_p
- if $check =~ /$re_tag_compat/s;
- }
- # PCDATA should be treated as text, not CDATA
- # Actually it should be treated as TEXT_PARAMETRIZED :-(
- $it = TmplToken->new( $it,
- ($this->pcdata_mode_p?
- TmplTokenType::TEXT: TmplTokenType::CDATA),
- $this->line_number, $this->filename )
- if defined $it;
- if ($this->js_mode_p) {
- my $s0 = $it->string;
- my @head = ();
- my @tail = ();
- if ($s0 =~ /^(\s*<!--\s*)(.*)(\s*--\s*>\s*)$/s) {
- push @head, $1;
- push @tail, $3;
- $s0 = $2;
- }
- push @head, split_js $s0;
- $it->set_js_data( identify_js_translatables(@head, @tail) );
- }
- $this->_set_pcdata_mode, 0;
- $this->_set_cdata_close, undef unless !defined $it;
- }
- return $it;
-}
-
-sub _token_groupable1_p ($) { # as first token, groupable into TEXT_PARAMETRIZED
- my($t) = @_;
- return ($t->type == TmplTokenType::TEXT && $t->string !~ /^[,\.:\|\s]+$/is)
- || ($t->type == TmplTokenType::DIRECTIVE
- && $t->string =~ /^(?:$re_tmpl_var)$/os)
- || ($t->type == TmplTokenType::TAG
- && ($t->string =~ /^<(?:a|b|em|h[123456]|i|u)\b/is
- || ($t->string =~ /^<input\b/is
- && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)
- ))
-}
-
-sub _token_groupable2_p ($) { # as other token, groupable into TEXT_PARAMETRIZED
- my($t) = @_;
- return ($t->type == TmplTokenType::TEXT && ($t->string =~ /^\s*$/s || $t->string !~ /^[\|\s]+$/is))
- || ($t->type == TmplTokenType::DIRECTIVE
- && $t->string =~ /^(?:$re_tmpl_var)$/os)
- || ($t->type == TmplTokenType::TAG
- && ($t->string =~ /^<\/?(?:a|b|em|h[123456]|i|u)\b/is
- || ($t->string =~ /^<input\b/is
- && $t->attributes->{'type'}->[1] =~ /^(?:radio|text)$/is)))
-}
-
-sub _quote_cformat ($) {
- my($s) = @_;
- $s =~ s/%/%%/g;
- return $s;
-}
-
sub string_canon ($) {
- my($s) = @_;
- if (1) { # FIXME
- # Fold all whitespace into single blanks
- $s =~ s/\s+/ /gs;
- }
- return $s;
-}
-
-sub _formalize_string_cformat ($) {
- my($s) = @_;
- return _quote_cformat string_canon $s;
-}
-
-sub _formalize ($) {
- my($t) = @_;
- return $t->type == TmplTokenType::DIRECTIVE? '%s':
- $t->type == TmplTokenType::TEXT?
- _formalize_string_cformat($t->string):
- $t->type == TmplTokenType::TAG?
- ($t->string =~ /^<a\b/is? '<a>':
- $t->string =~ /^<input\b/is? (
- lc $t->attributes->{'type'}->[1] eq 'text' ? '%S':
- '%p'):
- _quote_cformat($t->string)):
- _quote_cformat($t->string);
-}
-
-sub _optimize {
+ my $s = shift;
+ # Fold all whitespace into single blanks
+ $s =~ s/\s+/ /g;
+ $s =~ s/^\s+//g;
+ return $s;
+}
+
+# safer version used internally, preserves new lines
+sub string_canon_safe ($) {
+ my $s = shift;
+ # fold tabs and spaces into single spaces
+ $s =~ s/[\ \t]+/ /gs;
+ return $s;
+}
+
+
+sub _quote_cformat{
+ my $s = shift;
+ $s =~ s/%/%%/g;
+ return $s;
+}
+
+sub _formalize_string_cformat{
+ my $s = shift;
+ return _quote_cformat( string_canon_safe $s );
+}
+
+sub _formalize{
+ my $t = shift;
+ if( $t->type == C4::TmplTokenType::DIRECTIVE ){
+ return '%s';
+ } elsif( $t->type == C4::TmplTokenType::TEXT ){
+ return _formalize_string_cformat( $t->string );
+ } elsif( $t->type == C4::TmplTokenType::TAG ){
+ if( $t->string =~ m/^a\b/is ){
+ return '<a>';
+ } elsif( $t->string =~ m/^input\b/is ){
+ if( lc $t->attributes->{'type'}->[1] eq 'text' ){
+ return '%S';
+ } else{
+ return '%p';
+ }
+ } else{
+ return _quote_cformat $t->string;
+ }
+ } else{
+ return _quote_cformat $t->string;
+ }
+}
+
+# internal parametization, used within next_token
+# method that takes in an array of TEXT and DIRECTIVE tokens (DIRECTIVEs must be GET) and return a C4::TmplTokenType::TEXT_PARAMETRIZED
+sub _parametrize_internal{
my $this = shift;
- my @structure = @_;
- my $undo_trailing_blanks = sub {
- for (my $i = $#structure; $i >= 0; $i -= 1) {
- last unless ($structure[$i]->type == TmplTokenType::TEXT && blank_p($structure[$i]->string)) ;#|| ($structure[$i]->type == TmplTokenType::TAG && $structure[$i]->string =~ /^<br\b/is);
- # Queue element structure: [reanalysis-p, token]
- push @{$this->{_queue}}, [1, pop @structure];
- }
- };
- &$undo_trailing_blanks;
- while (@structure >= 2) {
- my $something_done_p = 0;
- # FIXME: If the last token is a close tag but there are no tags
- # FIXME: before it, drop the close tag back into the queue. This
- # FIXME: is an ugly hack to get rid of "foo %s</h1>" type mess.
- if (@structure >= 2
- && $structure[$#structure]->type == TmplTokenType::TAG
- && $structure[$#structure]->string =~ /^<\//s) {
- my $has_other_tags_p = 0;
- for (my $i = 0; $i < $#structure; $i += 1) {
- $has_other_tags_p = 1
- if $structure[$i]->type == TmplTokenType::TAG;
- last if $has_other_tags_p;
- }
- if (!$has_other_tags_p) {
- push @{$this->{_queue}}, [0, pop @structure]
- &$undo_trailing_blanks;
- $something_done_p = 1;
- }
- }
- # FIXME: Do the same ugly hack for the last token being a ( or [
- if (@structure >= 2
- && $structure[$#structure]->type == TmplTokenType::TEXT
- && $structure[$#structure]->string =~ /^[\(\[]$/) { # not )]
- push @{$this->{_queue}}, [1, pop @structure];
- &$undo_trailing_blanks;
- $something_done_p = 1;
- }
- # FIXME: If the first token is an open tag, but there is no
- # FIXME: corresponding close tag, "drop the open tag", i.e.,
- # FIXME: requeue everything for reanalysis, except the frist tag. :-(
- if (@structure >= 2
- && $structure[0]->type == TmplTokenType::TAG
- && $structure[0]->string =~ /^<([a-z0-9]+)/is
- && (my $tag = $1) !~ /^(?:br|hr|img|input)\b/is
- ) {
- my $tag_open_count = 1;
- for (my $i = 1; $i <= $#structure; $i += 1) {
- if ($structure[$i]->type == TmplTokenType::TAG) {
- if ($structure[$i]->string =~ /^<(\/?)$tag\b/is) {
- $tag_open_count += ($1? -1: +1);
- }
- }
- }
- if ($tag_open_count > 0) {
- for (my $i = $#structure; $i; $i -= 1) {
- push @{$this->{_queue}}, [1, pop @structure];
- }
- $something_done_p = 1;
- }
- }
- # FIXME: If the first token is an open tag, the last token is the
- # FIXME: corresponding close tag, and there are no other close tags
- # FIXME: inbetween, requeue the tokens from the second token on,
- # FIXME: flagged as ok for re-analysis
- if (@structure >= 3
- && $structure[0]->type == TmplTokenType::TAG
- && $structure[0]->string =~ /^<([a-z0-9]+)/is && (my $tag = $1)
- && $structure[$#structure]->type == TmplTokenType::TAG
- && $structure[$#structure]->string =~ /^<\/$1\s*>$/is) {
- my $has_other_open_or_close_tags_p = 0;
- for (my $i = 1; $i < $#structure; $i += 1) {
- $has_other_open_or_close_tags_p = 1
- if $structure[$i]->type == TmplTokenType::TAG
- && $structure[$i]->string =~ /^<\/?$tag\b/is;
- last if $has_other_open_or_close_tags_p;
- }
- if (!$has_other_open_or_close_tags_p) {
- for (my $i = $#structure; $i; $i -= 1) {
- push @{$this->{_queue}}, [1, pop @structure];
- }
- $something_done_p = 1;
- }
- }
- last if !$something_done_p;
- }
- return @structure;
-}
-
-sub looks_plausibly_like_groupable_text_p (@) {
- my @structure = @_;
- # The text would look plausibly groupable if all open tags are also closed.
- my @tags = ();
- my $error_p = 0;
- for (my $i = 0; $i <= $#structure; $i += 1) {
- if ($structure[$i]->type == TmplTokenType::TAG) {
- my $form = $structure[$i]->string;
- if ($form =~ /^<([A-Z0-9]+)/is) {
- my $tag = lc($1);
- if ($tag !~ /^(?:br|input)$/is && $form !~ /\/>$/is) {
- push @tags, $tag;
- }
- } elsif ($form =~ /^<\/([A-Z0-9]+)/is) {
- if (@tags && lc($1) eq $tags[$#tags]) {
- pop @tags;
- } else {
- $error_p = 1;
- }
- }
- } elsif ($structure[$i]->type != TmplTokenType::TEXT) {
- $error_p = 1;
- }
- last if $error_p;
- }
- return !$error_p && !@tags;
+ my @parts = @_;
+ # my $s = "";
+ # for my $item (@parts){
+ # if( $item->type == C4::TmplTokenType::TEXT ){
+ # $s .= $item->string;
+ # } else {
+ # #must be a variable directive
+ # $s .= "%s";
+ # }
+ # }
+ my $s = join( "", map { _formalize $_ } @parts );
+ # should both the string and form be $s? maybe only the later? posibly the former....
+ # used line number from first token, should suffice
+ my $t = C4::TmplToken->new( $s, C4::TmplTokenType::TEXT_PARAMETRIZED, $parts[0]->line_number, $this->filename );
+ $t->set_children(@parts);
+ $t->set_form($s);
+ return $t;
}
sub next_token {
- my $this = shift;
- my $h = $this->_handle;
- my $it;
- $this->{_queue} = [] unless defined $this->{_queue};
-
- # Elements in the queue are ordered pairs. The first in the ordered pair
- # specifies whether we are allowed to reanalysis; the second is the token.
- if (@{$this->{_queue}} && !$this->{_queue}->[$#{$this->{_queue}}]->[0]) {
- $it = (pop @{$this->{_queue}})->[1];
- } else {
- if (@{$this->{_queue}}) {
- $it = (pop @{$this->{_queue}})->[1];
- } else {
- $it = $this->_next_token_intermediate($h);
- }
- if (!$this->cdata_mode_p && $this->allow_cformat_p && defined $it
- && ($it->type == TmplTokenType::TEXT?
- !blank_p( $it->string ): _token_groupable1_p( $it ))) {
- my @structure = ( $it );
- my @tags = ();
- my $next = undef;
- my($nonblank_text_p, $parametrized_p, $with_anchor_p, $with_input_p) = (0, 0, 0, 0);
- if ($it->type == TmplTokenType::TEXT) {
- $nonblank_text_p = 1 if !blank_p( $it->string );
- } elsif ($it->type == TmplTokenType::DIRECTIVE) {
- $parametrized_p = 1;
- } elsif ($it->type == TmplTokenType::TAG && $it->string =~ /^<([A-Z0-9]+)/is) {
- my $tag = lc($1);
- push @tags, $tag if $tag !~ /^(?:br|input)$/i;
- $with_anchor_p = 1 if $tag eq 'a';
- $with_input_p = 1 if $tag eq 'input';
- }
- # We hate | and || in msgid strings, so we try to avoid them
- for (my $i = 1, my $quit_p = 0, my $quit_next_p = ($it->type == TmplTokenType::TEXT && $it->string =~ /^\|+$/s);; $i += 1) {
- if (@{$this->{_queue}}) {
- $next = (pop @{$this->{_queue}})->[1];
- } else {
- $next = $this->_next_token_intermediate($h);
- }
- push @structure, $next; # for consistency (with initialization)
- last unless defined $next && _token_groupable2_p( $next );
- last if $quit_next_p;
- if ($next->type == TmplTokenType::TEXT) {
- $nonblank_text_p = 1 if !blank_p( $next->string );
- $quit_p = 1 if $next->string =~ /^\|+$/s; # We hate | and ||
- } elsif ($next->type == TmplTokenType::DIRECTIVE) {
- $parametrized_p = 1;
- } elsif ($next->type == TmplTokenType::TAG) {
- if ($next->string =~ /^<([A-Z0-9]+)/is) {
- my $tag = lc($1);
- push @tags, $tag if $tag !~ /^(?:br|input)$/i;
- $with_anchor_p = 1 if $tag eq 'a';
- $with_input_p = 1 if $tag eq 'input';
- } elsif ($next->string =~ /^<\/([A-Z0-9]+)/is) {
- my $close = lc($1);
- $quit_p = 1 unless @tags && $close eq $tags[$#tags];
- $quit_next_p = 1 if $close =~ /^h\d$/;
- pop @tags;
- }
- }
- last if $quit_p;
- }
- # Undo the last token, allowing reanalysis
- push @{$this->{_queue}}, [1, pop @structure];
- # Simply it a bit more
- @structure = $this->_optimize( @structure );
- if (@structure < 2) {
- # Nothing to do
- ;
- } elsif ($nonblank_text_p && ($parametrized_p || $with_anchor_p || $with_input_p)) {
- # Create the corresponding c-format string
- my $string = join('', map { $_->string } @structure);
- my $form = join('', map { _formalize $_ } @structure);
- my($a_counter, $input_counter) = (0, 0);
- $form =~ s/<a>/ $a_counter += 1, "<a$a_counter>" /egs;
- $form =~ s/<input>/ $input_counter += 1, "<input$input_counter>" /egs;
- $it = TmplToken->new($string, TmplTokenType::TEXT_PARAMETRIZED,
- $it->line_number, $it->pathname);
- $it->set_form( $form );
- $it->set_children( @structure );
- } elsif ($nonblank_text_p
- && looks_plausibly_like_groupable_text_p( @structure )
- && $structure[$#structure]->type == TmplTokenType::TEXT) {
- # Combine the strings
- my $string = join('', map { $_->string } @structure);
- $it = TmplToken->new($string, TmplTokenType::TEXT,
- $it->line_number, $it->pathname);;
- } else {
- # Requeue the tokens thus seen for re-emitting, allow reanalysis
- for (;;) {
- push @{$this->{_queue}}, [1, pop @structure];
- last if !@structure;
- }
- $it = (pop @{$this->{_queue}})->[1];
- }
- }
- }
- if (defined $it && $it->type == TmplTokenType::TEXT) {
- my $form = string_canon $it->string;
- $it->set_form( $form );
+ my $self = shift;
+ my $next;
+# warn "in next_token";
+ # parts that make up a text_parametrized (future children of the token)
+ my @parts = ();
+ while(1){
+ $next = $self->{_parser}->next_token;
+ if (! $next){
+ if (@parts){
+ return $self->_parametrize_internal(@parts);
+ }
+ else {
+ return undef;
+ }
+ }
+ # if cformat mode is off, dont bother parametrizing, just return them as they come
+ return $next unless $self->allow_cformat_p;
+ if( $next->type == C4::TmplTokenType::TEXT ){
+ push @parts, $next;
+ }
+# elsif( $next->type == C4::TmplTokenType::DIRECTIVE && $next->string =~ m/\[%\s*\w+\s*%\]/ ){
+ elsif( $next->type == C4::TmplTokenType::DIRECTIVE ){
+ push @parts, $next;
+ }
+ elsif ( $next->type == C4::TmplTokenType::CDATA){
+ $self->_set_js_mode(1);
+ my $s0 = $next->string;
+ my @head = ();
+ my @tail = ();
+
+ if ($s0 =~ /^(\s*\[%\s*)(.*)(\s%=]\s*)$/s) {
+ push @head, $1;
+ push @tail, $3;
+ $s0 = $2;
+ }
+ push @head, _split_js $s0;
+ $next->set_js_data(_identify_js_translatables(@head, @tail) );
+ return $next unless @parts;
+ $self->{_parser}->unshift_token($next);
+ return $self->_parametrize_internal(@parts);
+ }
+ else {
+ # if there is nothing in parts, return this token
+ return $next unless @parts;
+
+ # OTHERWISE, put this token back and return the parametrized string of @parts
+ $self->{_parser}->unshift_token($next);
+ return $self->_parametrize_internal(@parts);
+ }
+
}
- return $it;
}
###############################################################################
-# Other simple functions (These are not methods)
-
-sub blank_p ($) {
- my($s) = @_;
- return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var|$re_xsl)*$/os;
-}
-
-sub trim ($) {
- my($s0) = @_;
- my $l0 = length $s0;
- my $s = $s0;
- $s =~ s/^(\s|\ $re_end_entity)+//os; my $l1 = $l0 - length $s;
- $s =~ s/(\s|\ $re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
- return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
-}
-
-sub quote_po ($) {
- my($s) = @_;
- # Locale::PO->quote is buggy, it doesn't quote newlines :-/
- $s =~ s/([\\"])/\\\1/gs;
- $s =~ s/\n/\\n/g;
- #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
- return "\"$s\"";
-}
-
-# Some functions that shouldn't be here... should be moved out some time
+# function taken from old version
+# used by tmpl_process3
sub parametrize ($$$$) {
my($fmt_0, $cformat_p, $t, $f) = @_;
my $it = '';
my $param = $params[$i - 1];
warn_normal "$fmt_0: $&: Expected a TMPL_VAR, but found a "
. $param->type->to_string . "\n", undef
- if $param->type != TmplTokenType::DIRECTIVE;
+ if $param->type != C4::TmplTokenType::DIRECTIVE;
warn_normal "$fmt_0: $&: Unsupported "
. "field width or precision\n", undef
if defined $width || defined $prec;
if (!defined $param) {
warn_normal "$fmt_0: $&: Parameter $i not known", undef;
} else {
- if ($param->type == TmplTokenType::TAG
+ if ($param->type == C4::TmplTokenType::TAG
&& $param->string =~ /^<input\b/is) {
my $type = defined $param->attributes?
lc($param->attributes->{'type'}->[1]): undef;
$it .= $&;
die "$&: Unknown or unsupported format specification\n"; #XXX
} else {
- die "$&: Completely confused parametrizing\n";#XXX
+ die "$&: Completely confused parametrizing -- msgid: $fmt_0\n";#XXX
}
}
}
return $it;
}
+
+# Other simple functions (These are not methods)
+
+sub blank_p ($) {
+ my($s) = @_;
+ return $s =~ /^(?:\s|\ $re_end_entity|$re_tmpl_var|$re_xsl)*$/osi;
+}
+
+sub trim ($) {
+ my($s0) = @_;
+ my $l0 = length $s0;
+ my $s = $s0;
+ $s =~ s/^(\s|\ $re_end_entity)+//os; my $l1 = $l0 - length $s;
+ $s =~ s/(\s|\ $re_end_entity)+$//os; my $l2 = $l0 - $l1 - length $s;
+ return wantarray? (substr($s0, 0, $l1), $s, substr($s0, $l0 - $l2)): $s;
+}
+
+sub quote_po ($) {
+ my($s) = @_;
+ # Locale::PO->quote is buggy, it doesn't quote newlines :-/
+ $s =~ s/([\\"])/\\\1/gs;
+ $s =~ s/\n/\\n/g;
+ #$s =~ s/[\177-\377]/ sprintf("\\%03o", ord($&)) /egs;
+ return "\"$s\"";
+}
+
sub charset_canon ($) {
my($charset) = @_;
$charset = uc($charset);
words will require certain inflectional suffixes in sentences.
Because this is an incompatible change, this mode must be explicitly
-turned on using the set_cformat(1) method call.
+turned on using the set_allow_cformat(1) method call.
=head2 The flag characters