From 5332c0b697f68eb7d25221dc722d1d5ee65a479e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 9 Oct 2015 18:44:21 +1100 Subject: [PATCH] * Add support for punctuation lemmatization, to handle unicode characters. This should help in addressing Issue #130 --- lang_data/en/lemma_rules.json | 6 ++++++ spacy/lemmatizer.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json index c45eb1df6..498240be1 100644 --- a/lang_data/en/lemma_rules.json +++ b/lang_data/en/lemma_rules.json @@ -27,5 +27,11 @@ ["est", ""], ["er", "e"], ["est", "e"] + ], + + "punct": [ + ["“", "``"], + ["”", "''"], + ["–", "--"] ] } diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index ed04e2d77..c1d296d7c 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function from os import path import codecs @@ -7,7 +7,7 @@ try: except ImportError: import json -from .parts_of_speech import NOUN, VERB, ADJ +from .parts_of_speech import NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @@ -36,6 +36,8 @@ class Lemmatizer(object): pos = 'verb' elif pos == ADJ: pos = 'adj' + elif pos == PUNCT: + pos = 'punct' lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, [])) return lemmas @@ -48,6 +50,9 @@ class Lemmatizer(object): def adj(self, string): return self(string, 'adj') + def punct(self, string): + return self(string, 'punct') + def lemmatize(string, index, exceptions, rules): string = string.lower() @@ -58,7 +63,7 @@ def lemmatize(string, index, exceptions, rules): for old, new in rules: if string.endswith(old): form = string[:len(string) - len(old)] + new - if form in index: + if form in index or not form.isalpha(): forms.append(form) if not forms: forms.append(string)