Diffstat (limited to 'lib/spack/external/jinja2/lexer.py')
-rw-r--r-- | lib/spack/external/jinja2/lexer.py | 775 |
1 file changed, 442 insertions, 333 deletions
diff --git a/lib/spack/external/jinja2/lexer.py b/lib/spack/external/jinja2/lexer.py index 6fd135dd5b..552356a12d 100644 --- a/lib/spack/external/jinja2/lexer.py +++ b/lib/spack/external/jinja2/lexer.py @@ -1,185 +1,194 @@ # -*- coding: utf-8 -*- -""" - jinja2.lexer - ~~~~~~~~~~~~ - - This module implements a Jinja / Python combination lexer. The - `Lexer` class provided by this module is used to do some preprocessing - for Jinja. - - On the one hand it filters out invalid operators like the bitshift - operators we don't allow in templates. On the other hand it separates - template code and python code in expressions. - - :copyright: (c) 2017 by the Jinja Team. - :license: BSD, see LICENSE for more details. +"""Implements a Jinja / Python combination lexer. The ``Lexer`` class +is used to do some preprocessing. It filters out invalid operators like +the bitshift operators we don't allow in templates. It separates +template code and python code in expressions. """ import re +from ast import literal_eval from collections import deque from operator import itemgetter -from jinja2._compat import implements_iterator, intern, iteritems, text_type -from jinja2.exceptions import TemplateSyntaxError -from jinja2.utils import LRUCache +from ._compat import implements_iterator +from ._compat import intern +from ._compat import iteritems +from ._compat import text_type +from .exceptions import TemplateSyntaxError +from .utils import LRUCache # cache for the lexers. Exists in order to be able to have multiple # environments with the same lexer _lexer_cache = LRUCache(50) # static regular expressions -whitespace_re = re.compile(r'\s+', re.U) -string_re = re.compile(r"('([^'\\]*(?:\\.[^'\\]*)*)'" - r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S) -integer_re = re.compile(r'\d+') +whitespace_re = re.compile(r"\s+", re.U) +newline_re = re.compile(r"(\r\n|\r|\n)") +string_re = re.compile( + r"('([^'\\]*(?:\\.[^'\\]*)*)'" r'|"([^"\\]*(?:\\.[^"\\]*)*)")', re.S +) +integer_re = re.compile(r"(\d+_)*\d+") +float_re = re.compile( + r""" + (?<!\.) # doesn't start with a . + (\d+_)*\d+ # digits, possibly _ separated + ( + (\.(\d+_)*\d+)? # optional fractional part + e[+\-]?(\d+_)*\d+ # exponent part + | + \.(\d+_)*\d+ # required fractional part + ) + """, + re.IGNORECASE | re.VERBOSE, +) try: # check if this Python supports Unicode identifiers - compile('föö', '<unknown>', 'eval') + compile("föö", "<unknown>", "eval") except SyntaxError: - # no Unicode support, use ASCII identifiers - name_re = re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*') + # Python 2, no Unicode support, use ASCII identifiers + name_re = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*") check_ident = False else: - # Unicode support, build a pattern to match valid characters, and set flag - # to use str.isidentifier to validate during lexing - from jinja2 import _identifier - name_re = re.compile(r'[\w{0}]+'.format(_identifier.pattern)) - check_ident = True - # remove the pattern from memory after building the regex - import sys - del sys.modules['jinja2._identifier'] - import jinja2 - del jinja2._identifier - del _identifier + # Unicode support, import generated re pattern and set flag to use + # str.isidentifier to validate during lexing. 
+ from ._identifier import pattern as name_re -float_re = re.compile(r'(?<!\.)\d+\.\d+') -newline_re = re.compile(r'(\r\n|\r|\n)') + check_ident = True # internal the tokens and keep references to them -TOKEN_ADD = intern('add') -TOKEN_ASSIGN = intern('assign') -TOKEN_COLON = intern('colon') -TOKEN_COMMA = intern('comma') -TOKEN_DIV = intern('div') -TOKEN_DOT = intern('dot') -TOKEN_EQ = intern('eq') -TOKEN_FLOORDIV = intern('floordiv') -TOKEN_GT = intern('gt') -TOKEN_GTEQ = intern('gteq') -TOKEN_LBRACE = intern('lbrace') -TOKEN_LBRACKET = intern('lbracket') -TOKEN_LPAREN = intern('lparen') -TOKEN_LT = intern('lt') -TOKEN_LTEQ = intern('lteq') -TOKEN_MOD = intern('mod') -TOKEN_MUL = intern('mul') -TOKEN_NE = intern('ne') -TOKEN_PIPE = intern('pipe') -TOKEN_POW = intern('pow') -TOKEN_RBRACE = intern('rbrace') -TOKEN_RBRACKET = intern('rbracket') -TOKEN_RPAREN = intern('rparen') -TOKEN_SEMICOLON = intern('semicolon') -TOKEN_SUB = intern('sub') -TOKEN_TILDE = intern('tilde') -TOKEN_WHITESPACE = intern('whitespace') -TOKEN_FLOAT = intern('float') -TOKEN_INTEGER = intern('integer') -TOKEN_NAME = intern('name') -TOKEN_STRING = intern('string') -TOKEN_OPERATOR = intern('operator') -TOKEN_BLOCK_BEGIN = intern('block_begin') -TOKEN_BLOCK_END = intern('block_end') -TOKEN_VARIABLE_BEGIN = intern('variable_begin') -TOKEN_VARIABLE_END = intern('variable_end') -TOKEN_RAW_BEGIN = intern('raw_begin') -TOKEN_RAW_END = intern('raw_end') -TOKEN_COMMENT_BEGIN = intern('comment_begin') -TOKEN_COMMENT_END = intern('comment_end') -TOKEN_COMMENT = intern('comment') -TOKEN_LINESTATEMENT_BEGIN = intern('linestatement_begin') -TOKEN_LINESTATEMENT_END = intern('linestatement_end') -TOKEN_LINECOMMENT_BEGIN = intern('linecomment_begin') -TOKEN_LINECOMMENT_END = intern('linecomment_end') -TOKEN_LINECOMMENT = intern('linecomment') -TOKEN_DATA = intern('data') -TOKEN_INITIAL = intern('initial') -TOKEN_EOF = intern('eof') +TOKEN_ADD = intern("add") +TOKEN_ASSIGN = intern("assign") +TOKEN_COLON = intern("colon") +TOKEN_COMMA = intern("comma") +TOKEN_DIV = intern("div") +TOKEN_DOT = intern("dot") +TOKEN_EQ = intern("eq") +TOKEN_FLOORDIV = intern("floordiv") +TOKEN_GT = intern("gt") +TOKEN_GTEQ = intern("gteq") +TOKEN_LBRACE = intern("lbrace") +TOKEN_LBRACKET = intern("lbracket") +TOKEN_LPAREN = intern("lparen") +TOKEN_LT = intern("lt") +TOKEN_LTEQ = intern("lteq") +TOKEN_MOD = intern("mod") +TOKEN_MUL = intern("mul") +TOKEN_NE = intern("ne") +TOKEN_PIPE = intern("pipe") +TOKEN_POW = intern("pow") +TOKEN_RBRACE = intern("rbrace") +TOKEN_RBRACKET = intern("rbracket") +TOKEN_RPAREN = intern("rparen") +TOKEN_SEMICOLON = intern("semicolon") +TOKEN_SUB = intern("sub") +TOKEN_TILDE = intern("tilde") +TOKEN_WHITESPACE = intern("whitespace") +TOKEN_FLOAT = intern("float") +TOKEN_INTEGER = intern("integer") +TOKEN_NAME = intern("name") +TOKEN_STRING = intern("string") +TOKEN_OPERATOR = intern("operator") +TOKEN_BLOCK_BEGIN = intern("block_begin") +TOKEN_BLOCK_END = intern("block_end") +TOKEN_VARIABLE_BEGIN = intern("variable_begin") +TOKEN_VARIABLE_END = intern("variable_end") +TOKEN_RAW_BEGIN = intern("raw_begin") +TOKEN_RAW_END = intern("raw_end") +TOKEN_COMMENT_BEGIN = intern("comment_begin") +TOKEN_COMMENT_END = intern("comment_end") +TOKEN_COMMENT = intern("comment") +TOKEN_LINESTATEMENT_BEGIN = intern("linestatement_begin") +TOKEN_LINESTATEMENT_END = intern("linestatement_end") +TOKEN_LINECOMMENT_BEGIN = intern("linecomment_begin") +TOKEN_LINECOMMENT_END = intern("linecomment_end") +TOKEN_LINECOMMENT = intern("linecomment") 
+TOKEN_DATA = intern("data") +TOKEN_INITIAL = intern("initial") +TOKEN_EOF = intern("eof") # bind operators to token types operators = { - '+': TOKEN_ADD, - '-': TOKEN_SUB, - '/': TOKEN_DIV, - '//': TOKEN_FLOORDIV, - '*': TOKEN_MUL, - '%': TOKEN_MOD, - '**': TOKEN_POW, - '~': TOKEN_TILDE, - '[': TOKEN_LBRACKET, - ']': TOKEN_RBRACKET, - '(': TOKEN_LPAREN, - ')': TOKEN_RPAREN, - '{': TOKEN_LBRACE, - '}': TOKEN_RBRACE, - '==': TOKEN_EQ, - '!=': TOKEN_NE, - '>': TOKEN_GT, - '>=': TOKEN_GTEQ, - '<': TOKEN_LT, - '<=': TOKEN_LTEQ, - '=': TOKEN_ASSIGN, - '.': TOKEN_DOT, - ':': TOKEN_COLON, - '|': TOKEN_PIPE, - ',': TOKEN_COMMA, - ';': TOKEN_SEMICOLON + "+": TOKEN_ADD, + "-": TOKEN_SUB, + "/": TOKEN_DIV, + "//": TOKEN_FLOORDIV, + "*": TOKEN_MUL, + "%": TOKEN_MOD, + "**": TOKEN_POW, + "~": TOKEN_TILDE, + "[": TOKEN_LBRACKET, + "]": TOKEN_RBRACKET, + "(": TOKEN_LPAREN, + ")": TOKEN_RPAREN, + "{": TOKEN_LBRACE, + "}": TOKEN_RBRACE, + "==": TOKEN_EQ, + "!=": TOKEN_NE, + ">": TOKEN_GT, + ">=": TOKEN_GTEQ, + "<": TOKEN_LT, + "<=": TOKEN_LTEQ, + "=": TOKEN_ASSIGN, + ".": TOKEN_DOT, + ":": TOKEN_COLON, + "|": TOKEN_PIPE, + ",": TOKEN_COMMA, + ";": TOKEN_SEMICOLON, } reverse_operators = dict([(v, k) for k, v in iteritems(operators)]) -assert len(operators) == len(reverse_operators), 'operators dropped' -operator_re = re.compile('(%s)' % '|'.join(re.escape(x) for x in - sorted(operators, key=lambda x: -len(x)))) - -ignored_tokens = frozenset([TOKEN_COMMENT_BEGIN, TOKEN_COMMENT, - TOKEN_COMMENT_END, TOKEN_WHITESPACE, - TOKEN_LINECOMMENT_BEGIN, TOKEN_LINECOMMENT_END, - TOKEN_LINECOMMENT]) -ignore_if_empty = frozenset([TOKEN_WHITESPACE, TOKEN_DATA, - TOKEN_COMMENT, TOKEN_LINECOMMENT]) +assert len(operators) == len(reverse_operators), "operators dropped" +operator_re = re.compile( + "(%s)" % "|".join(re.escape(x) for x in sorted(operators, key=lambda x: -len(x))) +) + +ignored_tokens = frozenset( + [ + TOKEN_COMMENT_BEGIN, + TOKEN_COMMENT, + TOKEN_COMMENT_END, + TOKEN_WHITESPACE, + TOKEN_LINECOMMENT_BEGIN, + TOKEN_LINECOMMENT_END, + TOKEN_LINECOMMENT, + ] +) +ignore_if_empty = frozenset( + [TOKEN_WHITESPACE, TOKEN_DATA, TOKEN_COMMENT, TOKEN_LINECOMMENT] +) def _describe_token_type(token_type): if token_type in reverse_operators: return reverse_operators[token_type] return { - TOKEN_COMMENT_BEGIN: 'begin of comment', - TOKEN_COMMENT_END: 'end of comment', - TOKEN_COMMENT: 'comment', - TOKEN_LINECOMMENT: 'comment', - TOKEN_BLOCK_BEGIN: 'begin of statement block', - TOKEN_BLOCK_END: 'end of statement block', - TOKEN_VARIABLE_BEGIN: 'begin of print statement', - TOKEN_VARIABLE_END: 'end of print statement', - TOKEN_LINESTATEMENT_BEGIN: 'begin of line statement', - TOKEN_LINESTATEMENT_END: 'end of line statement', - TOKEN_DATA: 'template data / text', - TOKEN_EOF: 'end of template' + TOKEN_COMMENT_BEGIN: "begin of comment", + TOKEN_COMMENT_END: "end of comment", + TOKEN_COMMENT: "comment", + TOKEN_LINECOMMENT: "comment", + TOKEN_BLOCK_BEGIN: "begin of statement block", + TOKEN_BLOCK_END: "end of statement block", + TOKEN_VARIABLE_BEGIN: "begin of print statement", + TOKEN_VARIABLE_END: "end of print statement", + TOKEN_LINESTATEMENT_BEGIN: "begin of line statement", + TOKEN_LINESTATEMENT_END: "end of line statement", + TOKEN_DATA: "template data / text", + TOKEN_EOF: "end of template", }.get(token_type, token_type) def describe_token(token): """Returns a description of the token.""" - if token.type == 'name': + if token.type == TOKEN_NAME: return token.value return _describe_token_type(token.type) def 
describe_token_expr(expr): """Like `describe_token` but for token expressions.""" - if ':' in expr: - type, value = expr.split(':', 1) - if type == 'name': + if ":" in expr: + type, value = expr.split(":", 1) + if type == TOKEN_NAME: return value else: type = expr @@ -197,21 +206,39 @@ def compile_rules(environment): """Compiles all the rules from the environment into a list of rules.""" e = re.escape rules = [ - (len(environment.comment_start_string), 'comment', - e(environment.comment_start_string)), - (len(environment.block_start_string), 'block', - e(environment.block_start_string)), - (len(environment.variable_start_string), 'variable', - e(environment.variable_start_string)) + ( + len(environment.comment_start_string), + TOKEN_COMMENT_BEGIN, + e(environment.comment_start_string), + ), + ( + len(environment.block_start_string), + TOKEN_BLOCK_BEGIN, + e(environment.block_start_string), + ), + ( + len(environment.variable_start_string), + TOKEN_VARIABLE_BEGIN, + e(environment.variable_start_string), + ), ] if environment.line_statement_prefix is not None: - rules.append((len(environment.line_statement_prefix), 'linestatement', - r'^[ \t\v]*' + e(environment.line_statement_prefix))) + rules.append( + ( + len(environment.line_statement_prefix), + TOKEN_LINESTATEMENT_BEGIN, + r"^[ \t\v]*" + e(environment.line_statement_prefix), + ) + ) if environment.line_comment_prefix is not None: - rules.append((len(environment.line_comment_prefix), 'linecomment', - r'(?:^|(?<=\S))[^\S\r\n]*' + - e(environment.line_comment_prefix))) + rules.append( + ( + len(environment.line_comment_prefix), + TOKEN_LINECOMMENT_BEGIN, + r"(?:^|(?<=\S))[^\S\r\n]*" + e(environment.line_comment_prefix), + ) + ) return [x[1:] for x in sorted(rules, reverse=True)] @@ -231,6 +258,7 @@ class Failure(object): class Token(tuple): """Token class.""" + __slots__ = () lineno, type, value = (property(itemgetter(x)) for x in range(3)) @@ -240,7 +268,7 @@ class Token(tuple): def __str__(self): if self.type in reverse_operators: return reverse_operators[self.type] - elif self.type == 'name': + elif self.type == "name": return self.value return self.type @@ -253,8 +281,8 @@ class Token(tuple): # passed an iterable of not interned strings. 
if self.type == expr: return True - elif ':' in expr: - return expr.split(':', 1) == [self.type, self.value] + elif ":" in expr: + return expr.split(":", 1) == [self.type, self.value] return False def test_any(self, *iterable): @@ -265,11 +293,7 @@ class Token(tuple): return False def __repr__(self): - return 'Token(%r, %r, %r)' % ( - self.lineno, - self.type, - self.value - ) + return "Token(%r, %r, %r)" % (self.lineno, self.type, self.value) @implements_iterator @@ -306,7 +330,7 @@ class TokenStream(object): self.name = name self.filename = filename self.closed = False - self.current = Token(1, TOKEN_INITIAL, '') + self.current = Token(1, TOKEN_INITIAL, "") next(self) def __iter__(self): @@ -314,9 +338,13 @@ class TokenStream(object): def __bool__(self): return bool(self._pushed) or self.current.type is not TOKEN_EOF + __nonzero__ = __bool__ # py2 - eos = property(lambda x: not x, doc="Are we at the end of the stream?") + @property + def eos(self): + """Are we at the end of the stream?""" + return not self def push(self, token): """Push a token back to the stream.""" @@ -332,7 +360,7 @@ class TokenStream(object): def skip(self, n=1): """Got n tokens ahead.""" - for x in range(n): + for _ in range(n): next(self) def next_if(self, expr): @@ -363,7 +391,7 @@ class TokenStream(object): def close(self): """Close the stream.""" - self.current = Token(self.current.lineno, TOKEN_EOF, '') + self.current = Token(self.current.lineno, TOKEN_EOF, "") self._iter = None self.closed = True @@ -374,14 +402,18 @@ class TokenStream(object): if not self.current.test(expr): expr = describe_token_expr(expr) if self.current.type is TOKEN_EOF: - raise TemplateSyntaxError('unexpected end of template, ' - 'expected %r.' % expr, - self.current.lineno, - self.name, self.filename) - raise TemplateSyntaxError("expected token %r, got %r" % - (expr, describe_token(self.current)), - self.current.lineno, - self.name, self.filename) + raise TemplateSyntaxError( + "unexpected end of template, expected %r." % expr, + self.current.lineno, + self.name, + self.filename, + ) + raise TemplateSyntaxError( + "expected token %r, got %r" % (expr, describe_token(self.current)), + self.current.lineno, + self.name, + self.filename, + ) try: return self.current finally: @@ -390,18 +422,20 @@ class TokenStream(object): def get_lexer(environment): """Return a lexer which is probably cached.""" - key = (environment.block_start_string, - environment.block_end_string, - environment.variable_start_string, - environment.variable_end_string, - environment.comment_start_string, - environment.comment_end_string, - environment.line_statement_prefix, - environment.line_comment_prefix, - environment.trim_blocks, - environment.lstrip_blocks, - environment.newline_sequence, - environment.keep_trailing_newline) + key = ( + environment.block_start_string, + environment.block_end_string, + environment.variable_start_string, + environment.variable_end_string, + environment.comment_start_string, + environment.comment_end_string, + environment.line_statement_prefix, + environment.line_comment_prefix, + environment.trim_blocks, + environment.lstrip_blocks, + environment.newline_sequence, + environment.keep_trailing_newline, + ) lexer = _lexer_cache.get(key) if lexer is None: lexer = Lexer(environment) @@ -409,6 +443,19 @@ def get_lexer(environment): return lexer +class OptionalLStrip(tuple): + """A special tuple for marking a point in the state that can have + lstrip applied. 
+ """ + + __slots__ = () + + # Even though it looks like a no-op, creating instances fails + # without this. + def __new__(cls, *members, **kwargs): + return super(OptionalLStrip, cls).__new__(cls, members) + + class Lexer(object): """Class that implements a lexer for a given environment. Automatically created by the environment class, usually you don't have to do that. @@ -419,9 +466,11 @@ class Lexer(object): def __init__(self, environment): # shortcuts - c = lambda x: re.compile(x, re.M | re.S) e = re.escape + def c(x): + return re.compile(x, re.M | re.S) + # lexing rules for tags tag_rules = [ (whitespace_re, TOKEN_WHITESPACE, None), @@ -429,7 +478,7 @@ class Lexer(object): (integer_re, TOKEN_INTEGER, None), (name_re, TOKEN_NAME, None), (string_re, TOKEN_STRING, None), - (operator_re, TOKEN_OPERATOR, None) + (operator_re, TOKEN_OPERATOR, None), ] # assemble the root lexing rule. because "|" is ungreedy @@ -441,108 +490,120 @@ class Lexer(object): root_tag_rules = compile_rules(environment) # block suffix if trimming is enabled - block_suffix_re = environment.trim_blocks and '\\n?' or '' - - # strip leading spaces if lstrip_blocks is enabled - prefix_re = {} - if environment.lstrip_blocks: - # use '{%+' to manually disable lstrip_blocks behavior - no_lstrip_re = e('+') - # detect overlap between block and variable or comment strings - block_diff = c(r'^%s(.*)' % e(environment.block_start_string)) - # make sure we don't mistake a block for a variable or a comment - m = block_diff.match(environment.comment_start_string) - no_lstrip_re += m and r'|%s' % e(m.group(1)) or '' - m = block_diff.match(environment.variable_start_string) - no_lstrip_re += m and r'|%s' % e(m.group(1)) or '' - - # detect overlap between comment and variable strings - comment_diff = c(r'^%s(.*)' % e(environment.comment_start_string)) - m = comment_diff.match(environment.variable_start_string) - no_variable_re = m and r'(?!%s)' % e(m.group(1)) or '' - - lstrip_re = r'^[ \t]*' - block_prefix_re = r'%s%s(?!%s)|%s\+?' % ( - lstrip_re, - e(environment.block_start_string), - no_lstrip_re, - e(environment.block_start_string), - ) - comment_prefix_re = r'%s%s%s|%s\+?' % ( - lstrip_re, - e(environment.comment_start_string), - no_variable_re, - e(environment.comment_start_string), - ) - prefix_re['block'] = block_prefix_re - prefix_re['comment'] = comment_prefix_re - else: - block_prefix_re = '%s' % e(environment.block_start_string) + block_suffix_re = environment.trim_blocks and "\\n?" or "" + + # If lstrip is enabled, it should not be applied if there is any + # non-whitespace between the newline and block. 
+ self.lstrip_unless_re = c(r"[^ \t]") if environment.lstrip_blocks else None self.newline_sequence = environment.newline_sequence self.keep_trailing_newline = environment.keep_trailing_newline # global lexing rules self.rules = { - 'root': [ + "root": [ # directives - (c('(.*?)(?:%s)' % '|'.join( - [r'(?P<raw_begin>(?:\s*%s\-|%s)\s*raw\s*(?:\-%s\s*|%s))' % ( - e(environment.block_start_string), - block_prefix_re, - e(environment.block_end_string), - e(environment.block_end_string) - )] + [ - r'(?P<%s_begin>\s*%s\-|%s)' % (n, r, prefix_re.get(n,r)) - for n, r in root_tag_rules - ])), (TOKEN_DATA, '#bygroup'), '#bygroup'), + ( + c( + "(.*?)(?:%s)" + % "|".join( + [ + r"(?P<raw_begin>%s(\-|\+|)\s*raw\s*(?:\-%s\s*|%s))" + % ( + e(environment.block_start_string), + e(environment.block_end_string), + e(environment.block_end_string), + ) + ] + + [ + r"(?P<%s>%s(\-|\+|))" % (n, r) + for n, r in root_tag_rules + ] + ) + ), + OptionalLStrip(TOKEN_DATA, "#bygroup"), + "#bygroup", + ), # data - (c('.+'), TOKEN_DATA, None) + (c(".+"), TOKEN_DATA, None), ], # comments TOKEN_COMMENT_BEGIN: [ - (c(r'(.*?)((?:\-%s\s*|%s)%s)' % ( - e(environment.comment_end_string), - e(environment.comment_end_string), - block_suffix_re - )), (TOKEN_COMMENT, TOKEN_COMMENT_END), '#pop'), - (c('(.)'), (Failure('Missing end of comment tag'),), None) + ( + c( + r"(.*?)((?:\-%s\s*|%s)%s)" + % ( + e(environment.comment_end_string), + e(environment.comment_end_string), + block_suffix_re, + ) + ), + (TOKEN_COMMENT, TOKEN_COMMENT_END), + "#pop", + ), + (c("(.)"), (Failure("Missing end of comment tag"),), None), ], # blocks TOKEN_BLOCK_BEGIN: [ - (c(r'(?:\-%s\s*|%s)%s' % ( - e(environment.block_end_string), - e(environment.block_end_string), - block_suffix_re - )), TOKEN_BLOCK_END, '#pop'), - ] + tag_rules, + ( + c( + r"(?:\-%s\s*|%s)%s" + % ( + e(environment.block_end_string), + e(environment.block_end_string), + block_suffix_re, + ) + ), + TOKEN_BLOCK_END, + "#pop", + ), + ] + + tag_rules, # variables TOKEN_VARIABLE_BEGIN: [ - (c(r'\-%s\s*|%s' % ( - e(environment.variable_end_string), - e(environment.variable_end_string) - )), TOKEN_VARIABLE_END, '#pop') - ] + tag_rules, + ( + c( + r"\-%s\s*|%s" + % ( + e(environment.variable_end_string), + e(environment.variable_end_string), + ) + ), + TOKEN_VARIABLE_END, + "#pop", + ) + ] + + tag_rules, # raw block TOKEN_RAW_BEGIN: [ - (c(r'(.*?)((?:\s*%s\-|%s)\s*endraw\s*(?:\-%s\s*|%s%s))' % ( - e(environment.block_start_string), - block_prefix_re, - e(environment.block_end_string), - e(environment.block_end_string), - block_suffix_re - )), (TOKEN_DATA, TOKEN_RAW_END), '#pop'), - (c('(.)'), (Failure('Missing end of raw directive'),), None) + ( + c( + r"(.*?)((?:%s(\-|\+|))\s*endraw\s*(?:\-%s\s*|%s%s))" + % ( + e(environment.block_start_string), + e(environment.block_end_string), + e(environment.block_end_string), + block_suffix_re, + ) + ), + OptionalLStrip(TOKEN_DATA, TOKEN_RAW_END), + "#pop", + ), + (c("(.)"), (Failure("Missing end of raw directive"),), None), ], # line statements TOKEN_LINESTATEMENT_BEGIN: [ - (c(r'\s*(\n|$)'), TOKEN_LINESTATEMENT_END, '#pop') - ] + tag_rules, + (c(r"\s*(\n|$)"), TOKEN_LINESTATEMENT_END, "#pop") + ] + + tag_rules, # line comments TOKEN_LINECOMMENT_BEGIN: [ - (c(r'(.*?)()(?=\n|$)'), (TOKEN_LINECOMMENT, - TOKEN_LINECOMMENT_END), '#pop') - ] + ( + c(r"(.*?)()(?=\n|$)"), + (TOKEN_LINECOMMENT, TOKEN_LINECOMMENT_END), + "#pop", + ) + ], } def _normalize_newlines(self, value): @@ -550,8 +611,7 @@ class Lexer(object): return newline_re.sub(self.newline_sequence, 
value) def tokenize(self, source, name=None, filename=None, state=None): - """Calls tokeniter + tokenize and wraps it in a token stream. - """ + """Calls tokeniter + tokenize and wraps it in a token stream.""" stream = self.tokeniter(source, name, filename, state) return TokenStream(self.wrap(stream, name, filename), name, filename) @@ -562,37 +622,40 @@ class Lexer(object): for lineno, token, value in stream: if token in ignored_tokens: continue - elif token == 'linestatement_begin': - token = 'block_begin' - elif token == 'linestatement_end': - token = 'block_end' + elif token == TOKEN_LINESTATEMENT_BEGIN: + token = TOKEN_BLOCK_BEGIN + elif token == TOKEN_LINESTATEMENT_END: + token = TOKEN_BLOCK_END # we are not interested in those tokens in the parser - elif token in ('raw_begin', 'raw_end'): + elif token in (TOKEN_RAW_BEGIN, TOKEN_RAW_END): continue - elif token == 'data': + elif token == TOKEN_DATA: value = self._normalize_newlines(value) - elif token == 'keyword': + elif token == "keyword": token = value - elif token == 'name': + elif token == TOKEN_NAME: value = str(value) if check_ident and not value.isidentifier(): raise TemplateSyntaxError( - 'Invalid character in identifier', - lineno, name, filename) - elif token == 'string': + "Invalid character in identifier", lineno, name, filename + ) + elif token == TOKEN_STRING: # try to unescape string try: - value = self._normalize_newlines(value[1:-1]) \ - .encode('ascii', 'backslashreplace') \ - .decode('unicode-escape') + value = ( + self._normalize_newlines(value[1:-1]) + .encode("ascii", "backslashreplace") + .decode("unicode-escape") + ) except Exception as e: - msg = str(e).split(':')[-1].strip() + msg = str(e).split(":")[-1].strip() raise TemplateSyntaxError(msg, lineno, name, filename) - elif token == 'integer': - value = int(value) - elif token == 'float': - value = float(value) - elif token == 'operator': + elif token == TOKEN_INTEGER: + value = int(value.replace("_", "")) + elif token == TOKEN_FLOAT: + # remove all "_" first to support more Python versions + value = literal_eval(value.replace("_", "")) + elif token == TOKEN_OPERATOR: token = operators[value] yield Token(lineno, token, value) @@ -603,23 +666,23 @@ class Lexer(object): source = text_type(source) lines = source.splitlines() if self.keep_trailing_newline and source: - for newline in ('\r\n', '\r', '\n'): + for newline in ("\r\n", "\r", "\n"): if source.endswith(newline): - lines.append('') + lines.append("") break - source = '\n'.join(lines) + source = "\n".join(lines) pos = 0 lineno = 1 - stack = ['root'] - if state is not None and state != 'root': - assert state in ('variable', 'block'), 'invalid state' - stack.append(state + '_begin') - else: - state = 'root' + stack = ["root"] + if state is not None and state != "root": + assert state in ("variable", "block"), "invalid state" + stack.append(state + "_begin") statetokens = self.rules[stack[-1]] source_length = len(source) - balancing_stack = [] + lstrip_unless_re = self.lstrip_unless_re + newlines_stripped = 0 + line_starting = True while 1: # tokenizer loop @@ -633,13 +696,48 @@ class Lexer(object): # are balanced. continue parsing with the lower rule which # is the operator rule. 
do this only if the end tags look # like operators - if balancing_stack and \ - tokens in ('variable_end', 'block_end', - 'linestatement_end'): + if balancing_stack and tokens in ( + TOKEN_VARIABLE_END, + TOKEN_BLOCK_END, + TOKEN_LINESTATEMENT_END, + ): continue # tuples support more options if isinstance(tokens, tuple): + groups = m.groups() + + if isinstance(tokens, OptionalLStrip): + # Rule supports lstrip. Match will look like + # text, block type, whitespace control, type, control, ... + text = groups[0] + + # Skipping the text and first type, every other group is the + # whitespace control for each type. One of the groups will be + # -, +, or empty string instead of None. + strip_sign = next(g for g in groups[2::2] if g is not None) + + if strip_sign == "-": + # Strip all whitespace between the text and the tag. + stripped = text.rstrip() + newlines_stripped = text[len(stripped) :].count("\n") + groups = (stripped,) + groups[1:] + elif ( + # Not marked for preserving whitespace. + strip_sign != "+" + # lstrip is enabled. + and lstrip_unless_re is not None + # Not a variable expression. + and not m.groupdict().get(TOKEN_VARIABLE_BEGIN) + ): + # The start of text between the last newline and the tag. + l_pos = text.rfind("\n") + 1 + if l_pos > 0 or line_starting: + # If there's only whitespace between the newline and the + # tag, strip it. + if not lstrip_unless_re.search(text, l_pos): + groups = (text[:l_pos],) + groups[1:] + for idx, token in enumerate(tokens): # failure group if token.__class__ is Failure: @@ -647,51 +745,57 @@ class Lexer(object): # bygroup is a bit more complex, in that case we # yield for the current token the first named # group that matched - elif token == '#bygroup': + elif token == "#bygroup": for key, value in iteritems(m.groupdict()): if value is not None: yield lineno, key, value - lineno += value.count('\n') + lineno += value.count("\n") break else: - raise RuntimeError('%r wanted to resolve ' - 'the token dynamically' - ' but no group matched' - % regex) + raise RuntimeError( + "%r wanted to resolve " + "the token dynamically" + " but no group matched" % regex + ) # normal group else: - data = m.group(idx + 1) + data = groups[idx] if data or token not in ignore_if_empty: yield lineno, token, data - lineno += data.count('\n') + lineno += data.count("\n") + newlines_stripped + newlines_stripped = 0 # strings as token just are yielded as it. 
else: data = m.group() # update brace/parentheses balance - if tokens == 'operator': - if data == '{': - balancing_stack.append('}') - elif data == '(': - balancing_stack.append(')') - elif data == '[': - balancing_stack.append(']') - elif data in ('}', ')', ']'): + if tokens == TOKEN_OPERATOR: + if data == "{": + balancing_stack.append("}") + elif data == "(": + balancing_stack.append(")") + elif data == "[": + balancing_stack.append("]") + elif data in ("}", ")", "]"): if not balancing_stack: - raise TemplateSyntaxError('unexpected \'%s\'' % - data, lineno, name, - filename) + raise TemplateSyntaxError( + "unexpected '%s'" % data, lineno, name, filename + ) expected_op = balancing_stack.pop() if expected_op != data: - raise TemplateSyntaxError('unexpected \'%s\', ' - 'expected \'%s\'' % - (data, expected_op), - lineno, name, - filename) + raise TemplateSyntaxError( + "unexpected '%s', " + "expected '%s'" % (data, expected_op), + lineno, + name, + filename, + ) # yield items if data or tokens not in ignore_if_empty: yield lineno, tokens, data - lineno += data.count('\n') + lineno += data.count("\n") + + line_starting = m.group()[-1:] == "\n" # fetch new position into new variable so that we can check # if there is a internal parsing error which would result @@ -701,19 +805,20 @@ class Lexer(object): # handle state changes if new_state is not None: # remove the uppermost state - if new_state == '#pop': + if new_state == "#pop": stack.pop() # resolve the new state by group checking - elif new_state == '#bygroup': + elif new_state == "#bygroup": for key, value in iteritems(m.groupdict()): if value is not None: stack.append(key) break else: - raise RuntimeError('%r wanted to resolve the ' - 'new state dynamically but' - ' no group matched' % - regex) + raise RuntimeError( + "%r wanted to resolve the " + "new state dynamically but" + " no group matched" % regex + ) # direct state name given else: stack.append(new_state) @@ -722,8 +827,9 @@ class Lexer(object): # this means a loop without break condition, avoid that and # raise error elif pos2 == pos: - raise RuntimeError('%r yielded empty string without ' - 'stack change' % regex) + raise RuntimeError( + "%r yielded empty string without stack change" % regex + ) # publish new function and start again pos = pos2 break @@ -734,6 +840,9 @@ class Lexer(object): if pos >= source_length: return # something went wrong - raise TemplateSyntaxError('unexpected char %r at %d' % - (source[pos], pos), lineno, - name, filename) + raise TemplateSyntaxError( + "unexpected char %r at %d" % (source[pos], pos), + lineno, + name, + filename, + ) |
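
For context on the numeric-literal changes above: the new integer_re and float_re accept underscore-separated digit groups, and wrap() now strips the underscores before converting (int(value.replace("_", "")) for integers, ast.literal_eval for floats). A minimal sketch of that conversion, reusing the regex patterns copied from the new lexer.py; the helper parse_number is invented here for illustration:

    import re
    from ast import literal_eval

    # Patterns copied from the new lexer.py in this diff.
    integer_re = re.compile(r"(\d+_)*\d+")
    float_re = re.compile(
        r"""
        (?<!\.)              # doesn't start with a .
        (\d+_)*\d+           # digits, possibly _ separated
        (
            (\.(\d+_)*\d+)?      # optional fractional part
            e[+\-]?(\d+_)*\d+    # exponent part
            |
            \.(\d+_)*\d+         # required fractional part
        )
        """,
        re.IGNORECASE | re.VERBOSE,
    )

    def parse_number(text):
        # Hypothetical helper mirroring the conversions in Lexer.wrap(); in the
        # real lexer the tokenizer has already decided the token type.
        if float_re.match(text):
            return literal_eval(text.replace("_", ""))  # drop "_" first, then eval the literal
        if integer_re.match(text):
            return int(text.replace("_", ""))
        raise ValueError("not a numeric literal: %r" % text)

    print(parse_number("1_000"))        # 1000
    print(parse_number("12_34.5e1_0"))  # 12345000000000.0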
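
Token.test() and describe_token_expr() in the diff both understand "type:value" expressions such as "name:endfor". A small stand-alone mirror of that convention (not the real Token class):

    def token_matches(token_type, token_value, expr):
        # Mirrors Token.test() from the diff: expr is either a bare type
        # ("name") or a "type:value" pair ("name:endfor").
        if token_type == expr:
            return True
        if ":" in expr:
            return expr.split(":", 1) == [token_type, token_value]
        return False

    print(token_matches("name", "endfor", "name"))         # True
    print(token_matches("name", "endfor", "name:endfor"))  # True
    print(token_matches("name", "endfor", "name:endif"))   # False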
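
The TokenStream changes (the __bool__/eos rewrite, expect(), next_if(), skip()) are easiest to see in use. A short sketch assuming the Environment.lexer property and the Lexer.tokenize() signature shown in the diff; these are internal APIs, used here only for illustration:

    from jinja2 import Environment

    env = Environment()
    stream = env.lexer.tokenize("{{ user.name }}")  # TokenStream of wrapped tokens

    stream.expect("variable_begin")       # raises TemplateSyntaxError if absent
    obj = stream.expect("name")           # Token(1, 'name', 'user')
    if stream.next_if("dot"):             # consume the '.' only if it is there
        attr = stream.expect("name")      # Token(1, 'name', 'name')
    stream.expect("variable_end")
    print(stream.eos)                     # True: only the EOF token remains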
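
The new OptionalLStrip handling in tokeniter() boils down to one decision about the text preceding a tag: "-" strips all trailing whitespace, "+" preserves it, and a plain tag strips back to the last newline only when lstrip_blocks is enabled and nothing but spaces or tabs follow that newline. The function below is an invented stand-alone mirror of that decision, not the diff's code path:

    import re

    # Same pattern the diff compiles as lstrip_unless_re when lstrip_blocks is on.
    lstrip_unless_re = re.compile(r"[^ \t]")

    def strip_before_tag(text, strip_sign, lstrip_blocks=True, line_starting=False):
        if strip_sign == "-":
            return text.rstrip()              # {%- ... : drop all trailing whitespace
        if strip_sign == "+" or not lstrip_blocks:
            return text                       # {%+ ... : keep the text as written
        l_pos = text.rfind("\n") + 1
        if (l_pos > 0 or line_starting) and not lstrip_unless_re.search(text, l_pos):
            return text[:l_pos]               # only whitespace since the newline: strip it
        return text

    print(repr(strip_before_tag("hello\n    ", "")))   # 'hello\n'
    print(repr(strip_before_tag("hello\n    ", "+")))  # 'hello\n    '
    print(repr(strip_before_tag("hello\n    ", "-")))  # 'hello'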
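
At the template level this is the machinery behind the usual whitespace controls, and the relaxed integer_re is visible through the public API. A usage sketch (the template strings are made up; expected output assumes a Jinja2 version that includes these changes):

    from jinja2 import Environment

    env = Environment(lstrip_blocks=True, trim_blocks=True)

    # lstrip_blocks removes indentation before a block tag, trim_blocks the
    # newline after it; "{%+" opts a single tag out of lstrip_blocks.
    tmpl = env.from_string("  {% if True %}\n  hi\n  {% endif %}\n")
    print(repr(tmpl.render()))  # '  hi\n'

    # Raw token stream from tokeniter(); "1_000" now lexes as a single integer.
    print(list(env.lex("{{ 1_000 }}"))[:3])
    # [(1, 'variable_begin', '{{'), (1, 'whitespace', ' '), (1, 'integer', '1_000')]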