* Add support for lemmatizing punctuation, so that Unicode punctuation characters are handled. This should help address Issue #130.

This commit is contained in:
Matthew Honnibal 2015-10-09 18:44:21 +11:00
parent b71ba2eed5
commit 5332c0b697
2 changed files with 14 additions and 3 deletions

View File

@ -27,5 +27,11 @@
["est", ""], ["est", ""],
["er", "e"], ["er", "e"],
["est", "e"] ["est", "e"]
],
"punct": [
["“", "``"],
["”", "''"],
["—", "--"]
] ]
} }

View File

@ -1,4 +1,4 @@
from __future__ import unicode_literals from __future__ import unicode_literals, print_function
from os import path from os import path
import codecs import codecs
@ -7,7 +7,7 @@ try:
except ImportError: except ImportError:
import json import json
from .parts_of_speech import NOUN, VERB, ADJ from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
class Lemmatizer(object): class Lemmatizer(object):
@ -36,6 +36,8 @@ class Lemmatizer(object):
pos = 'verb' pos = 'verb'
elif pos == ADJ: elif pos == ADJ:
pos = 'adj' pos = 'adj'
elif pos == PUNCT:
pos = 'punct'
lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, [])) lemmas = lemmatize(string, self.index.get(pos, {}), self.exc.get(pos, {}), self.rules.get(pos, []))
return lemmas return lemmas
@ -48,6 +50,9 @@ class Lemmatizer(object):
def adj(self, string): def adj(self, string):
return self(string, 'adj') return self(string, 'adj')
def punct(self, string):
return self(string, 'punct')
def lemmatize(string, index, exceptions, rules): def lemmatize(string, index, exceptions, rules):
string = string.lower() string = string.lower()
@ -58,7 +63,7 @@ def lemmatize(string, index, exceptions, rules):
for old, new in rules: for old, new in rules:
if string.endswith(old): if string.endswith(old):
form = string[:len(string) - len(old)] + new form = string[:len(string) - len(old)] + new
if form in index: if form in index or not form.isalpha():
forms.append(form) forms.append(form)
if not forms: if not forms:
forms.append(string) forms.append(string)