diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 0e5db8329..51de7e160 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -203,14 +203,16 @@ class GoldCorpus(object):
         return n
 
     def train_docs(self, nlp, gold_preproc=False,
-                   projectivize=False, max_length=None):
+                   projectivize=False, max_length=None,
+                   noise_level=0.0):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
                 self.train_tuples)
         random.shuffle(train_tuples)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
-                                        max_length=max_length)
+                                        max_length=max_length,
+                                        noise_level=noise_level)
         yield from gold_docs
 
     def dev_docs(self, nlp, gold_preproc=False):
@@ -219,7 +221,8 @@ class GoldCorpus(object):
         yield from gold_docs
 
     @classmethod
-    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None):
+    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
+                       noise_level=0.0):
         for raw_text, paragraph_tuples in tuples:
             if gold_preproc:
                 raw_text = None
@@ -227,18 +230,20 @@ class GoldCorpus(object):
                 paragraph_tuples = merge_sents(paragraph_tuples)
 
             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
-                                  gold_preproc)
+                                  gold_preproc, noise_level=noise_level)
             golds = cls._make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
                 if (not max_length) or len(doc) < max_length:
                     yield doc, gold
 
     @classmethod
-    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
+    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
+                   noise_level=0.0):
         if raw_text is not None:
+            raw_text = add_noise(raw_text, noise_level)
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=sent_tuples[1])
+            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
                     for (sent_tuples, brackets) in paragraph_tuples]
 
     @classmethod
@@ -270,6 +275,30 @@ class GoldCorpus(object):
         return locs
 
 
+def add_noise(orig, noise_level):
+    if random.random() >= noise_level:
+        return orig
+    elif type(orig) == list:
+        corrupted = [_corrupt(word, noise_level) for word in orig]
+        corrupted = [w for w in corrupted if w]
+        return corrupted
+    else:
+        return ''.join(_corrupt(c, noise_level) for c in orig)
+
+
+def _corrupt(c, noise_level):
+    if random.random() >= noise_level:
+        return c
+    elif c == ' ':
+        return '\n'
+    elif c == '\n':
+        return ' '
+    elif c in ['.', "'", "!", "?"]:
+        return ''
+    else:
+        return c.lower()
+
+
 def read_json_file(loc, docs_filter=None, limit=None):
     loc = ensure_path(loc)
     if loc.is_dir():
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 29e9fb2aa..db8821b0e 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -284,6 +284,8 @@ class NeuralTagger(object):
                         new_tag_map[tag] = orig_tag_map[tag]
                     else:
                         new_tag_map[tag] = {POS: X}
+        if 'SP' not in new_tag_map:
+            new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X})
         cdef Vocab vocab = self.vocab
         if new_tag_map:
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
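
Not part of the patch itself, but a minimal sketch of what the new module-level add_noise() helper does once spacy/gold.pyx is rebuilt with this change; GoldCorpus.train_docs(..., noise_level=...) applies the same corruption to each raw training text (or word list, in the gold_preproc case) before the Docs are created. The example strings and the seed below are arbitrary.

    import random

    from spacy.gold import add_noise

    random.seed(0)

    # With probability noise_level a text is selected for corruption; each
    # character is then independently corrupted with probability noise_level
    # (space <-> newline swaps, some punctuation dropped, letters lowercased).
    print(add_noise("This is a sentence. Isn't it?", 0.5))

    # For a pre-tokenized sentence (a list of words), whole words go through
    # _corrupt(), and words that corrupt to the empty string are dropped.
    print(add_noise(["This", "is", "a", "sentence", "."], 0.5))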