mirror of https://github.com/explosion/spaCy.git
* Filter out phrases that consist of common, lower-case words.
This commit is contained in:
parent
4bbc8f45c6
commit
5af4b62fe7
|
@ -45,6 +45,8 @@ def read_gazetteer(tokenizer, loc, n=-1):
|
||||||
if i >= n:
|
if i >= n:
|
||||||
break
|
break
|
||||||
phrase = tokenizer(phrase)
|
phrase = tokenizer(phrase)
|
||||||
|
if all((t.is_lower and t.prob >= -10) for t in phrase):
|
||||||
|
continue
|
||||||
if len(phrase) >= 2:
|
if len(phrase) >= 2:
|
||||||
yield phrase
|
yield phrase
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue