From 5af4b62fe731758ae2b20fbd737a558f457ea6b9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 9 Oct 2015 12:47:43 +1100
Subject: [PATCH] * Filter out phrases that consist of common, lower-case words.

---
 examples/multi_word_matches.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/multi_word_matches.py b/examples/multi_word_matches.py
index 3c715736e..73f48bf42 100644
--- a/examples/multi_word_matches.py
+++ b/examples/multi_word_matches.py
@@ -45,6 +45,8 @@ def read_gazetteer(tokenizer, loc, n=-1):
         if i >= n:
             break
         phrase = tokenizer(phrase)
+        if all((t.is_lower and t.prob >= -10) for t in phrase):
+            continue
         if len(phrase) >= 2:
             yield phrase