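# The fragments below are from patsy's parsing internals (patsy/constraint.py,
# patsy/parse_formula.py, patsy/infix_parser.py and their tests).  The imports
# follow the patsy source layout; _ops (the constraint operator table) is
# module-private to patsy.constraint and is assumed to be in scope.
import re

from patsy import PatsyError
from patsy.origin import Origin
from patsy.infix_parser import Token, Operator, ParseNode, infix_parse
from patsy.tokens import python_tokenize, pretty_untokenize
from patsy.util import PushbackAdapter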
def _token_maker(type, string):
    def make_token(scanner, token_string):
        if type == "__OP__":
            # Operator tokens use the matched text itself as their type.
            actual_type = token_string
        else:
            actual_type = type
        return Token(actual_type,
                     Origin(string, *scanner.match.span()),
                     token_string)
    return make_token
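# Note: _token_maker is a closure factory for re.Scanner callbacks.  Passing
# the pseudo-type "__OP__" makes the matched text itself become the token
# type, so a single lexicon entry (op_re below) can cover every operator.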
def _tokenize_constraint(string, variable_names):
    lparen_re = r"\("
    rparen_re = r"\)"
    op_re = "|".join([re.escape(op.token_type) for op in _ops])
    num_re = r"[-+]?[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?"
    whitespace_re = r"\s+"
    # Prefer long matches:
    variable_names = sorted(variable_names, key=len, reverse=True)
    variable_re = "|".join([re.escape(n) for n in variable_names])
    lexicon = [
        (lparen_re, _token_maker(Token.LPAREN, string)),
        (rparen_re, _token_maker(Token.RPAREN, string)),
        (op_re, _token_maker("__OP__", string)),
        (variable_re, _token_maker("VARIABLE", string)),
        (num_re, _token_maker("NUMBER", string)),
        (whitespace_re, None),
    ]
    scanner = re.Scanner(lexicon)
    tokens, leftover = scanner.scan(string)
    if leftover:
        offset = len(string) - len(leftover)
        raise PatsyError("unrecognized token in constraint",
                         Origin(string, offset, offset + 1))
    return tokens
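# Usage sketch (hypothetical input; assumes the _ops table from
# patsy.constraint, whose operators include "+", "-", "*", "/", "=", ","):
#
#     for tok in _tokenize_constraint("2 * a + b = 1", ["a", "b"]):
#         print(tok.type, tok.extra)
#
# yields NUMBER "2", "*" "*", VARIABLE "a", "+" "+", VARIABLE "b", "=" "=",
# NUMBER "1"; whitespace is dropped because its lexicon action is None.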
def test_infix_parse():
    ops = [Operator("+", 2, 10),
           Operator("*", 2, 20),
           Operator("-", 1, 30)]
    atomic = ["ATOM1", "ATOM2"]
    # a + -b * (c + d)
    mock_origin = Origin("asdf", 2, 3)
    tokens = [Token("ATOM1", mock_origin, "a"),
              Token("+", mock_origin, "+"),
              Token("-", mock_origin, "-"),
              Token("ATOM2", mock_origin, "b"),
              Token("*", mock_origin, "*"),
              Token(Token.LPAREN, mock_origin, "("),
              Token("ATOM1", mock_origin, "c"),
              Token("+", mock_origin, "+"),
              Token("ATOM2", mock_origin, "d"),
              Token(Token.RPAREN, mock_origin, ")")]
    tree = infix_parse(tokens, ops, atomic)
    def te(tree, type, extra):
        assert tree.type == type
        assert tree.token.extra == extra
    te(tree, "+", "+")
    te(tree.args[0], "ATOM1", "a")
    assert tree.args[0].args == []
    te(tree.args[1], "*", "*")
    te(tree.args[1].args[0], "-", "-")
    assert len(tree.args[1].args[0].args) == 1
    te(tree.args[1].args[0].args[0], "ATOM2", "b")
    te(tree.args[1].args[1], "+", "+")
    te(tree.args[1].args[1].args[0], "ATOM1", "c")
    te(tree.args[1].args[1].args[1], "ATOM2", "d")
from nose.tools import assert_raises
def test__tokenize_formula():
    # NB: the extra spaces inside foo(...) are significant: the Origin
    # offsets below index into this exact string.
    code = "y ~ a + (foo(b,c +   2)) + -1 + 0 + 10"
    tokens = list(_tokenize_formula(code, ["+", "-", "~"]))
    expecteds = [("PYTHON_EXPR", Origin(code, 0, 1), "y"),
                 ("~", Origin(code, 2, 3), None),
                 ("PYTHON_EXPR", Origin(code, 4, 5), "a"),
                 ("+", Origin(code, 6, 7), None),
                 (Token.LPAREN, Origin(code, 8, 9), None),
                 ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"),
                 (Token.RPAREN, Origin(code, 23, 24), None),
                 ("+", Origin(code, 25, 26), None),
                 ("-", Origin(code, 27, 28), None),
                 ("ONE", Origin(code, 28, 29), "1"),
                 ("+", Origin(code, 30, 31), None),
                 ("ZERO", Origin(code, 32, 33), "0"),
                 ("+", Origin(code, 34, 35), None),
                 ("NUMBER", Origin(code, 36, 38), "10"),
                 ]
    for got, expected in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == expected[0]
        assert got.origin == expected[1]
        assert got.extra == expected[2]
assert "(" not in operator_strings
assert ")" not in operator_strings
magic_token_types = {"(": Token.LPAREN,
")": Token.RPAREN,
}
for operator_string in operator_strings:
magic_token_types[operator_string] = operator_string
# Once we enter a Python expression, a ( does not end it, but any other
# "magic" token does:
end_tokens = set(magic_token_types)
end_tokens.remove("(")
it = PushbackAdapter(python_tokenize(code))
for pytype, token_string, origin in it:
if token_string in magic_token_types:
yield Token(magic_token_types[token_string], origin)
else:
it.push_back((pytype, token_string, origin))
yield _read_python_expr(it, end_tokens)
raise PatsyError("unmatched close bracket", origin)
pytypes.append(pytype)
token_strings.append(token_string)
origins.append(origin)
# Either we found an end_token, or we hit the end of the string
if bracket_level == 0:
expr_text = pretty_untokenize(zip(pytypes, token_strings))
if expr_text == "0":
token_type = "ZERO"
elif expr_text == "1":
token_type = "ONE"
elif _is_a(int, expr_text) or _is_a(float, expr_text):
token_type = "NUMBER"
else:
token_type = "PYTHON_EXPR"
return Token(token_type, Origin.combine(origins), extra=expr_text)
else:
raise PatsyError("unclosed bracket in embedded Python "
"expression",
Origin.combine(origins))
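# _is_a is not shown above; a minimal sketch consistent with its use here
# (does converting the string with f succeed without a ValueError?):
def _is_a(f, v):
    try:
        f(v)
    except ValueError:
        return False
    else:
        return True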
def _read_op_context(token, c):
    if token.type == Token.RPAREN:
        if c.trace:
            print("Found close-paren")
        while c.op_stack and c.op_stack[-1].op.token_type != Token.LPAREN:
            _run_op(c)
        if not c.op_stack:
            raise PatsyError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].op.token_type == Token.LPAREN
        # Expand the origin of the item on top of the noun stack to include
        # the open and close parens:
        combined = Origin.combine([c.op_stack[-1].token,
                                   c.noun_stack[-1].token,
                                   token])
        c.noun_stack[-1].origin = combined
        # Pop the open-paren
        c.op_stack.pop()
        return False
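# _run_op is called above but not shown; a plausible minimal sketch, assuming
# the parse context c carries op_stack (pending operators, each holding its
# Operator and source Token) and noun_stack (completed ParseNodes):
def _run_op(c):
    assert c.op_stack
    stackop = c.op_stack.pop()
    args = []
    # Pop one operand per arity, then reverse to restore source order.
    for i in range(stackop.op.arity):
        args.append(c.noun_stack.pop())
    args.reverse()
    node = ParseNode(stackop.op.token_type, stackop.token, args,
                     Origin.combine([stackop.token]
                                    + [arg.origin for arg in args]))
    c.noun_stack.append(node)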