mirror of https://github.com/explosion/spaCy.git
Convert exceptions to Python list
This commit is contained in:
parent
51eb190ef4
commit
0e2e331b58
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -8,6 +8,7 @@ from ..language_data.tokenizer_exceptions import _URL_PATTERN
|
||||||
from ..language_data.punctuation import ALPHA_LOWER
|
from ..language_data.punctuation import ALPHA_LOWER
|
||||||
|
|
||||||
from .punctuation import ELISION, HYPHENS
|
from .punctuation import ELISION, HYPHENS
|
||||||
|
from ._tokenizer_exceptions_list import BASE_EXCEPTIONS
|
||||||
|
|
||||||
from ..symbols import *
|
from ..symbols import *
|
||||||
|
|
||||||
|
@ -16,11 +17,13 @@ import io
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
def iter_exceptions():
|
def get_exceptions():
|
||||||
with io.open(os.path.join(os.path.dirname(__file__), 'resources/tokenizer_exceptions'),
|
return BASE_EXCEPTIONS
|
||||||
'rt', encoding='utf8') as f:
|
|
||||||
for line in f:
|
# with io.open(os.path.join(os.path.dirname(__file__), 'resources/tokenizer_exceptions'),
|
||||||
yield line.strip('\n')
|
# 'rt', encoding='utf8') as f:
|
||||||
|
# for line in f:
|
||||||
|
# yield line.strip('\n')
|
||||||
|
|
||||||
|
|
||||||
def upper_first_letter(text):
|
def upper_first_letter(text):
|
||||||
|
@ -142,7 +145,7 @@ def get_tokenizer_exceptions():
|
||||||
|
|
||||||
HYPHEN = ['-', '‐']
|
HYPHEN = ['-', '‐']
|
||||||
|
|
||||||
base_exceptions = list(iter_exceptions())
|
base_exceptions = get_exceptions()
|
||||||
infixes_exceptions = []
|
infixes_exceptions = []
|
||||||
|
|
||||||
for elision_char in ELISION:
|
for elision_char in ELISION:
|
||||||
|
|
Loading…
Reference in New Issue