mirror of https://github.com/explosion/spaCy.git
* Generalize tokenization rules to capitals
parent df0458001d
commit 25849fc926
@@ -36,6 +36,11 @@ def read_tokenization(lang):
             assert chunk not in seen, chunk
             seen.add(chunk)
             entries.append((chunk, lex, pieces))
+            if chunk[0].isalpha() and chunk[0].islower():
+                chunk = chunk[0].title() + chunk[1:]
+                lex = lex[0].title() + lex[1:]
+                seen.add(chunk)
+                entries.append((chunk, lex, pieces))
     return entries
 
 
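The added branch generalizes each lowercase rule to its Title-cased form: when a chunk starts with a lowercase letter, the loader also registers a copy with the chunk and the first lexeme title-cased, so "Ain't" expands through the same table as "ain't". A minimal self-contained sketch of that logic, assuming the same (chunk, lex, pieces) tuples the loader builds; the add_title_variants name and the sample rule are illustrative, not part of the repo:

# Sketch of the generalization above; add_title_variants and the sample
# rule are hypothetical stand-ins for spaCy's tokenization table loader.
def add_title_variants(rules):
    entries = []
    seen = set()
    for chunk, lex, pieces in rules:
        assert chunk not in seen, chunk
        seen.add(chunk)
        entries.append((chunk, lex, pieces))
        # Same check as the committed change: only lowercase-initial
        # alphabetic chunks get a title-cased twin.
        if chunk[0].isalpha() and chunk[0].islower():
            chunk = chunk[0].title() + chunk[1:]
            lex = lex[0].title() + lex[1:]
            seen.add(chunk)
            entries.append((chunk, lex, pieces))
    return entries

print(add_title_variants([("ain't", "are", ["not"])]))
# -> [("ain't", 'are', ['not']), ("Ain't", 'Are', ['not'])]

Note that only the first lexeme is title-cased; the trailing pieces (such as "not") keep their original form.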
@@ -32,3 +32,13 @@ def test_aint():
     assert len(tokens) == 2
     assert unhash(lex_of(tokens[0])) == "are"
     assert unhash(lex_of(tokens[1])) == "not"
+
+
+def test_capitalized():
+    tokens = expand_chunk(lookup("can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Can't"))
+    assert len(tokens) == 2
+    tokens = expand_chunk(lookup("Ain't"))
+    assert len(tokens) == 2
+    assert unhash(lex_of(tokens[0])) == "Are"
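The new test pins the behavior down only for "Ain't": two tokens, the first of which is "Are". Combined with the loader change, the implied expansion table looks like the following sketch (reconstructed from the diff, not copied from the data files):

# Expansions implied by the rule "ain't" -> "are" + "not" (from test_aint)
# plus the new title-casing pass; trailing pieces stay lowercase.
expansions = {
    "ain't": ["are", "not"],   # existing lowercase rule
    "Ain't": ["Are", "not"],   # generated Title-cased twin
}
assert expansions["Ain't"][0] == "Are"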