* Add supersense sets to lexemes, from WordNet. Look-up via lemmatization.

This commit is contained in:
Matthew Honnibal 2015-07-01 18:48:59 +02:00
parent aa3d06857e
commit 62cfcd76fe
1 changed file with 36 additions and 1 deletion

View File

@ -21,11 +21,17 @@ from pathlib import Path
from shutil import copyfile
from shutil import copytree
import codecs
from collections import defaultdict
from spacy.en import get_lex_props
from spacy.en.lemmatizer import Lemmatizer
from spacy.vocab import Vocab
from spacy.vocab import write_binary_vectors
from spacy.parts_of_speech import NOUN, VERB, ADJ
import spacy.senses
def setup_tokenizer(lang_data_dir, tok_dir):
if not tok_dir.exists():
@ -72,6 +78,22 @@ def _read_probs(loc):
return probs
def _read_senses(loc):
    """Read a supersense file into a word -> POS -> sense-ID lookup.

    Each non-blank line is split on whitespace: the first field is the word
    and every following field is a sense tag.  After its first three
    characters are dropped, a tag must have the form ``<pos>.<sense>``, where
    ``<pos>`` is one of ``noun``, ``verb`` or ``adjective``.

    Args:
        loc: Location of the UTF-8 encoded file; it is passed through
            ``str()`` before being opened.

    Returns:
        A ``defaultdict`` mapping each word to a ``defaultdict`` that maps a
        part-of-speech constant (NOUN, VERB or ADJ) to a list of sense IDs.
        A sense ID is the index of the sense name in ``spacy.senses.STRINGS``.

    Raises:
        KeyError: If a tag names a part of speech other than the three
            above, or a sense name missing from ``spacy.senses.STRINGS``.
        ValueError: If a tag, minus its prefix, does not split into exactly
            two parts on '.'.
    """
    lexicon = defaultdict(lambda: defaultdict(list))
    # Invert STRINGS so a sense name can be mapped to its index.
    sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
    pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
    # Context manager guarantees the handle is closed, even if a line is
    # malformed and parsing raises.
    with codecs.open(str(loc), 'r', 'utf8') as file_:
        for line in file_:
            fields = line.split()
            if not fields:
                # Blank line: there is no word to read, so skip it.
                continue
            word = fields[0]
            for tag in fields[1:]:
                # Drop the fixed 3-character prefix, leaving "<pos>.<sense>".
                # NOTE(review): the prefix's meaning is not visible here --
                # confirm against the supersenses data file.
                pos, sense = tag[3:].split('.')
                sense_name = '%s_%s' % (pos[0].upper(), sense.lower())
                # 'N_tops' is skipped; presumably it has no entry in
                # spacy.senses.STRINGS -- TODO confirm.
                if sense_name != 'N_tops':
                    sense_id = sense_names[sense_name]
                    lexicon[word][pos_ids[pos]].append(sense_id)
    return lexicon
def setup_vocab(src_dir, dst_dir):
if not dst_dir.exists():
dst_dir.mkdir()
@ -81,10 +103,12 @@ def setup_vocab(src_dir, dst_dir):
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
clusters = _read_clusters(src_dir / 'clusters.txt')
senses = _read_senses(src_dir / 'supersenses.txt')
probs = _read_probs(src_dir / 'words.sgt.prob')
for word in clusters:
for word in set(clusters).union(set(senses)):
if word not in probs:
probs[word] = -17.0
lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
lexicon = []
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
entry = get_lex_props(word)
@ -94,6 +118,17 @@ def setup_vocab(src_dir, dst_dir):
# Decode as a little-endian string, so that we can do & 15 to get
# the first 4 bits. See _parse_features.pyx
entry['cluster'] = int(cluster[::-1], 2)
orth_senses = set()
lemmas = []
for pos in [NOUN, VERB, ADJ]:
for lemma in lemmatizer(word.lower(), pos):
lemmas.append(lemma)
orth_senses.update(senses[lemma][pos])
if word.lower() == 'dogging':
print word
print lemmas
print [spacy.senses.STRINGS[si] for si in orth_senses]
entry['senses'] = list(sorted(orth_senses))
vocab[word] = entry
vocab.dump(str(dst_dir / 'lexemes.bin'))
vocab.strings.dump(str(dst_dir / 'strings.txt'))