From a23504fe07c5d3d55b247e4aa0b185dd0a338ee7 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 5 Jan 2017 19:58:07 +0100
Subject: [PATCH] Move abbreviations below other exceptions

---
 spacy/en/tokenizer_exceptions.py | 135 +------------------------------
 1 file changed, 1 insertion(+), 134 deletions(-)

diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py
index 2c046c157..49b612d73 100644
--- a/spacy/en/tokenizer_exceptions.py
+++ b/spacy/en/tokenizer_exceptions.py
@@ -505,142 +505,9 @@ ABBREVIATIONS = {
 }
 
 
-# Other exceptions
-
-OTHER = {
-    " ": [
-        {ORTH: " ", TAG: "SP"}
-    ],
-
-    "\u00a0": [
-        {ORTH: "\u00a0", TAG: "SP", LEMMA: " "}
-    ],
-
-    "and/or": [
-        {ORTH: "and/or", LEMMA: "and/or", TAG: "CC"}
-    ],
-
-    "'cause": [
-        {ORTH: "'cause", LEMMA: "because"}
-    ],
-
-    "y'all": [
-        {ORTH: "y'", LEMMA: PRON_LEMMA, NORM: "you"},
-        {ORTH: "all"}
-    ],
-
-    "yall": [
-        {ORTH: "y", LEMMA: PRON_LEMMA, NORM: "you"},
-        {ORTH: "all"}
-    ],
-
-    "'em": [
-        {ORTH: "'em", LEMMA: PRON_LEMMA, NORM: "them"}
-    ],
-
-    "em": [
-        {ORTH: "em", LEMMA: PRON_LEMMA, NORM: "them"}
-    ],
-
-    "nothin'": [
-        {ORTH: "nothin'", LEMMA: "nothing"}
-    ],
-
-    "nuthin'": [
-        {ORTH: "nuthin'", LEMMA: "nothing"}
-    ],
-
-    "'nuff": [
-        {ORTH: "'nuff", LEMMA: "enough"}
-    ],
-
-    "ol'": [
-        {ORTH: "ol'", LEMMA: "old"}
-    ],
-
-    "not've": [
-        {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-    ],
-
-    "notve": [
-        {ORTH: "not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-    ],
-
-    "Not've": [
-        {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "'ve", LEMMA: "have", TAG: "VB"}
-    ],
-
-    "Notve": [
-        {ORTH: "Not", LEMMA: "not", TAG: "RB"},
-        {ORTH: "ve", LEMMA: "have", TAG: "VB"}
-    ],
-
-    "cannot": [
-        {ORTH: "can", LEMMA: "can", TAG: "MD"},
-        {ORTH: "not", LEMMA: "not", TAG: "RB"}
-    ],
-
-    "Cannot": [
-        {ORTH: "Can", LEMMA: "can", TAG: "MD"},
-        {ORTH: "not", LEMMA: "not", TAG: "RB"}
-    ],
-
-    "gonna": [
-        {ORTH: "gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}
-    ],
-
-    "Gonna": [
-        {ORTH: "Gon", LEMMA: "go", NORM: "going"},
-        {ORTH: "na", LEMMA: "to"}
-    ],
-
-    "let's": [
-        {ORTH: "let"},
-        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
-    ],
-
-    "Let's": [
-        {ORTH: "Let"},
-        {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"}
-    ],
-
-    "'S": [
-        {ORTH: "'S", LEMMA: "'s"}
-    ],
-
-    "'s": [
-        {ORTH: "'s", LEMMA: "'s"}
-    ],
-
-    "\u2018S": [
-        {ORTH: "\u2018S", LEMMA: "'s"}
-    ],
-
-    "\u2018s": [
-        {ORTH: "\u2018s", LEMMA: "'s"}
-    ],
-
-    "\u2014": [
-        {ORTH: "\u2014", TAG: ":", LEMMA: "--"}
-    ],
-
-    "\n": [
-        {ORTH: "\n", TAG: "SP"}
-    ],
-
-    "\t": [
-        {ORTH: "\t", TAG: "SP"}
-    ]
-}
-
-
 TOKENIZER_EXCEPTIONS = dict(EXC)
-TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
 TOKENIZER_EXCEPTIONS.update(OTHER)
+TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)
 
 
 # Remove EXCLUDE_EXC if in exceptions
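
Note: the practical effect of reordering the two update() calls is that entries
from ABBREVIATIONS are now merged in last, so they take precedence over entries
from OTHER (and EXC) whenever the same string key is defined in more than one
dict, because dict.update() overwrites existing keys. A minimal sketch of that
merge behaviour follows; the ORTH/LEMMA placeholders and the "e.g." entries are
made up for illustration and are not the real spaCy exception tables.

    # Simplified stand-ins; in spaCy, ORTH/LEMMA are attribute symbols and the
    # exception dicts are much larger.
    ORTH, LEMMA = "orth", "lemma"

    EXC = {"i'm": [{ORTH: "i"}, {ORTH: "'m"}]}
    OTHER = {"e.g.": [{ORTH: "e.g.", LEMMA: "for example"}]}            # hypothetical entry
    ABBREVIATIONS = {"e.g.": [{ORTH: "e.g.", LEMMA: "exempli gratia"}]}  # hypothetical entry

    TOKENIZER_EXCEPTIONS = dict(EXC)
    TOKENIZER_EXCEPTIONS.update(OTHER)
    TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)  # applied last, so it wins on key clashes

    # After this patch's ordering, "e.g." resolves to the ABBREVIATIONS entry;
    # with the previous ordering, OTHER would have overridden it.
    print(TOKENIZER_EXCEPTIONS["e.g."])  # [{'orth': 'e.g.', 'lemma': 'exempli gratia'}]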