From 6b2df208c209eb8f99898d42cd4fa64cee949b0b Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Wed, 13 Feb 2019 13:35:32 +0200
Subject: [PATCH] Fixed support for hex encoding (\xAA)

---
 lark/lexer.py        | 7 ++++++-
 tests/test_parser.py | 5 +++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/lark/lexer.py b/lark/lexer.py
index 3237b02..ace3206 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -72,7 +72,12 @@ class Token(Str):
     __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
 
     def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
-        self = super(Token, cls).__new__(cls, value)
+        try:
+            self = super(Token, cls).__new__(cls, value)
+        except UnicodeDecodeError:
+            value = value.decode('latin1')
+            self = super(Token, cls).__new__(cls, value)
+
         self.type = type_
         self.pos_in_stream = pos_in_stream
         self.value = value
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 7d99e20..68514a1 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -449,11 +449,12 @@ def _make_parser_test(LEXER, PARSER):
             g.parse(u'\xa3\u0101\u00a3\u0203\n')
 
         def test_hex_escape(self):
-            g = _Lark(r"""start: A B
+            g = _Lark(r"""start: A B C
                           A: "\x01"
                           B: /\x02/
+                          C: "\xABCD"
                           """)
-            g.parse('\x01\x02')
+            g.parse('\x01\x02\xABCD')
 
         @unittest.skipIf(PARSER == 'cyk', "Takes forever")
         def test_stack_for_ebnf(self):