Convert exceptions to Python list

This commit is contained in:
ines 2017-02-24 18:22:40 +01:00
parent 51eb190ef4
commit 0e2e331b58
3 changed files with 26312 additions and 26303 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -8,6 +8,7 @@ from ..language_data.tokenizer_exceptions import _URL_PATTERN
from ..language_data.punctuation import ALPHA_LOWER from ..language_data.punctuation import ALPHA_LOWER
from .punctuation import ELISION, HYPHENS from .punctuation import ELISION, HYPHENS
from ._tokenizer_exceptions_list import BASE_EXCEPTIONS
from ..symbols import * from ..symbols import *
@@ -16,11 +17,13 @@ import io
import re import re
def iter_exceptions(): def get_exceptions():
with io.open(os.path.join(os.path.dirname(__file__), 'resources/tokenizer_exceptions'), return BASE_EXCEPTIONS
'rt', encoding='utf8') as f:
for line in f: # with io.open(os.path.join(os.path.dirname(__file__), 'resources/tokenizer_exceptions'),
yield line.strip('\n') # 'rt', encoding='utf8') as f:
# for line in f:
# yield line.strip('\n')
def upper_first_letter(text): def upper_first_letter(text):
@@ -142,7 +145,7 @@ def get_tokenizer_exceptions():
HYPHEN = ['-', ''] HYPHEN = ['-', '']
base_exceptions = list(iter_exceptions()) base_exceptions = get_exceptions()
infixes_exceptions = [] infixes_exceptions = []
for elision_char in ELISION: for elision_char in ELISION: