Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-06-04 20:19:05 -05:00
commit fc4dd62e84
2 changed files with 37 additions and 6 deletions

View File

@ -203,14 +203,16 @@ class GoldCorpus(object):
return n return n
def train_docs(self, nlp, gold_preproc=False, def train_docs(self, nlp, gold_preproc=False,
projectivize=False, max_length=None): projectivize=False, max_length=None,
noise_level=0.0):
train_tuples = self.train_tuples train_tuples = self.train_tuples
if projectivize: if projectivize:
train_tuples = nonproj.preprocess_training_data( train_tuples = nonproj.preprocess_training_data(
self.train_tuples) self.train_tuples)
random.shuffle(train_tuples) random.shuffle(train_tuples)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length) max_length=max_length,
noise_level=noise_level)
yield from gold_docs yield from gold_docs
def dev_docs(self, nlp, gold_preproc=False): def dev_docs(self, nlp, gold_preproc=False):
@ -219,7 +221,8 @@ class GoldCorpus(object):
yield from gold_docs yield from gold_docs
@classmethod @classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None): def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0):
for raw_text, paragraph_tuples in tuples: for raw_text, paragraph_tuples in tuples:
if gold_preproc: if gold_preproc:
raw_text = None raw_text = None
@ -227,18 +230,20 @@ class GoldCorpus(object):
paragraph_tuples = merge_sents(paragraph_tuples) paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples, docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc) gold_preproc, noise_level=noise_level)
golds = cls._make_golds(docs, paragraph_tuples) golds = cls._make_golds(docs, paragraph_tuples)
for doc, gold in zip(docs, golds): for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length: if (not max_length) or len(doc) < max_length:
yield doc, gold yield doc, gold
@classmethod @classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc): def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
noise_level=0.0):
if raw_text is not None: if raw_text is not None:
raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)] return [nlp.make_doc(raw_text)]
else: else:
return [Doc(nlp.vocab, words=sent_tuples[1]) return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples] for (sent_tuples, brackets) in paragraph_tuples]
@classmethod @classmethod
@ -270,6 +275,30 @@ class GoldCorpus(object):
return locs return locs
def add_noise(orig, noise_level):
if random.random() >= noise_level:
return orig
elif type(orig) == list:
corrupted = [_corrupt(word, noise_level) for word in orig]
corrupted = [w for w in corrupted if w]
return corrupted
else:
return ''.join(_corrupt(c, noise_level) for c in orig)
def _corrupt(c, noise_level):
if random.random() >= noise_level:
return c
elif c == ' ':
return '\n'
elif c == '\n':
return ' '
elif c in ['.', "'", "!", "?"]:
return ''
else:
return c.lower()
def read_json_file(loc, docs_filter=None, limit=None): def read_json_file(loc, docs_filter=None, limit=None):
loc = ensure_path(loc) loc = ensure_path(loc)
if loc.is_dir(): if loc.is_dir():

View File

@ -284,6 +284,8 @@ class NeuralTagger(object):
new_tag_map[tag] = orig_tag_map[tag] new_tag_map[tag] = orig_tag_map[tag]
else: else:
new_tag_map[tag] = {POS: X} new_tag_map[tag] = {POS: X}
if 'SP' not in new_tag_map:
new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X})
cdef Vocab vocab = self.vocab cdef Vocab vocab = self.vocab
if new_tag_map: if new_tag_map:
vocab.morphology = Morphology(vocab.strings, new_tag_map, vocab.morphology = Morphology(vocab.strings, new_tag_map,