From 7b2ede783d3fb97f61e370842ac4739ab5d90aa8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 4 Jun 2017 20:16:30 -0500
Subject: [PATCH 1/2] Add SP tag to tag map if missing

---
 spacy/pipeline.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index a838b3412..2df9b555b 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -281,6 +281,8 @@ class NeuralTagger(object):
                         new_tag_map[tag] = orig_tag_map[tag]
                     else:
                         new_tag_map[tag] = {POS: X}
+        if 'SP' not in new_tag_map:
+            new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X})
         cdef Vocab vocab = self.vocab
         if new_tag_map:
             vocab.morphology = Morphology(vocab.strings, new_tag_map,

From 9bc4a262139cb34d85c7624f2acb879341faecaa Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 4 Jun 2017 20:16:57 -0500
Subject: [PATCH 2/2] Add option of data augmentation noise

---
 spacy/gold.pyx | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 6b07592cc..57b5dc039 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -199,14 +199,16 @@ class GoldCorpus(object):
         return n

     def train_docs(self, nlp, gold_preproc=False,
-                   projectivize=False, max_length=None):
+                   projectivize=False, max_length=None,
+                   noise_level=0.0):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
                 self.train_tuples)
         random.shuffle(train_tuples)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
-                                        max_length=max_length)
+                                        max_length=max_length,
+                                        noise_level=noise_level)
         yield from gold_docs

     def dev_docs(self, nlp, gold_preproc=False):
@@ -215,7 +217,8 @@ class GoldCorpus(object):
         yield from gold_docs

     @classmethod
-    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None):
+    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
+                       noise_level=0.0):
         for raw_text, paragraph_tuples in tuples:
             if gold_preproc:
                 raw_text = None
@@ -223,18 +226,20 @@ class GoldCorpus(object):
                 paragraph_tuples = merge_sents(paragraph_tuples)

             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
-                                  gold_preproc)
+                                  gold_preproc, noise_level=noise_level)
             golds = cls._make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
                 if (not max_length) or len(doc) < max_length:
                     yield doc, gold

     @classmethod
-    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
+    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
+                   noise_level=0.0):
         if raw_text is not None:
+            raw_text = add_noise(raw_text, noise_level)
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=sent_tuples[1])
+            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
                     for (sent_tuples, brackets) in paragraph_tuples]

     @classmethod
@@ -266,6 +271,30 @@ class GoldCorpus(object):
         return locs


+def add_noise(orig, noise_level):
+    if random.random() >= noise_level:
+        return orig
+    elif type(orig) == list:
+        corrupted = [_corrupt(word, noise_level) for word in orig]
+        corrupted = [w for w in corrupted if w]
+        return corrupted
+    else:
+        return ''.join(_corrupt(c, noise_level) for c in orig)
+
+
+def _corrupt(c, noise_level):
+    if random.random() >= noise_level:
+        return c
+    elif c == ' ':
+        return '\n'
+    elif c == '\n':
+        return ' '
+    elif c in ['.', "'", "!", "?"]:
+        return ''
+    else:
+        return c.lower()
+
+
 def read_json_file(loc, docs_filter=None, limit=None):
     loc = ensure_path(loc)
     if loc.is_dir():
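
Usage note (not part of the patch): a minimal sketch of what the new noise_level option does, assuming the two commits above are applied and spaCy is rebuilt so that spacy.gold exposes add_noise. The sample strings, the pre-tokenized list, and the 0.1 value in the final comment are illustrative assumptions, not taken from the diff.

from spacy.gold import add_noise

# Character-level noise on raw text: spaces become newlines, the characters
# . ' ! ? are dropped, and everything else is lowercased. noise_level=1.0
# makes the corruption deterministic, which keeps this demo reproducible.
print(repr(add_noise("Hello world. This is a test!", 1.0)))
# 'hello\nworld\nthis\nis\na\ntest'

# Word-level noise on pre-tokenized input: words corrupted to the empty string
# (here the '.') are filtered out, so the token list can shrink.
print(add_noise(["Hello", "world", "."], 1.0))
# ['hello', 'world']

# At intermediate settings a text is corrupted only with probability
# noise_level, and each character (or word) is then corrupted independently
# with the same probability, e.g. corpus.train_docs(nlp, noise_level=0.1).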