From 7b2ede783d3fb97f61e370842ac4739ab5d90aa8 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 4 Jun 2017 20:16:30 -0500
Subject: [PATCH 1/2] Add SP tag to tag map if missing

---
 spacy/pipeline.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index a838b3412..2df9b555b 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -281,6 +281,8 @@ class NeuralTagger(object):
                         new_tag_map[tag] = orig_tag_map[tag]
                     else:
                         new_tag_map[tag] = {POS: X}
+        if 'SP' not in new_tag_map:
+            new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X})
         cdef Vocab vocab = self.vocab
         if new_tag_map:
             vocab.morphology = Morphology(vocab.strings, new_tag_map,

From 9bc4a262139cb34d85c7624f2acb879341faecaa Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 4 Jun 2017 20:16:57 -0500
Subject: [PATCH 2/2] Add option of data augmentation noise

---
 spacy/gold.pyx | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 6b07592cc..57b5dc039 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -199,14 +199,16 @@ class GoldCorpus(object):
         return n

     def train_docs(self, nlp, gold_preproc=False,
-                   projectivize=False, max_length=None):
+                   projectivize=False, max_length=None,
+                   noise_level=0.0):
         train_tuples = self.train_tuples
         if projectivize:
             train_tuples = nonproj.preprocess_training_data(
                 self.train_tuples)
         random.shuffle(train_tuples)
         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
-                                        max_length=max_length)
+                                        max_length=max_length,
+                                        noise_level=noise_level)
         yield from gold_docs

     def dev_docs(self, nlp, gold_preproc=False):
@@ -215,7 +217,8 @@ class GoldCorpus(object):
         yield from gold_docs

     @classmethod
-    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None):
+    def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
+                       noise_level=0.0):
         for raw_text, paragraph_tuples in tuples:
             if gold_preproc:
                 raw_text = None
@@ -223,18 +226,20 @@ class GoldCorpus(object):
                 paragraph_tuples = merge_sents(paragraph_tuples)

             docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
-                                  gold_preproc)
+                                  gold_preproc, noise_level=noise_level)
             golds = cls._make_golds(docs, paragraph_tuples)
             for doc, gold in zip(docs, golds):
                 if (not max_length) or len(doc) < max_length:
                     yield doc, gold

     @classmethod
-    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
+    def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
+                   noise_level=0.0):
         if raw_text is not None:
+            raw_text = add_noise(raw_text, noise_level)
             return [nlp.make_doc(raw_text)]
         else:
-            return [Doc(nlp.vocab, words=sent_tuples[1])
+            return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
                     for (sent_tuples, brackets) in paragraph_tuples]

     @classmethod
@@ -266,6 +271,30 @@ class GoldCorpus(object):
         return locs


+def add_noise(orig, noise_level):
+    if random.random() >= noise_level:
+        return orig
+    elif type(orig) == list:
+        corrupted = [_corrupt(word, noise_level) for word in orig]
+        corrupted = [w for w in corrupted if w]
+        return corrupted
+    else:
+        return ''.join(_corrupt(c, noise_level) for c in orig)
+
+
+def _corrupt(c, noise_level):
+    if random.random() >= noise_level:
+        return c
+    elif c == ' ':
+        return '\n'
+    elif c == '\n':
+        return ' '
+    elif c in ['.', "'", "!", "?"]:
+        return ''
+    else:
+        return c.lower()
+
+
 def read_json_file(loc, docs_filter=None, limit=None):
     loc = ensure_path(loc)
     if loc.is_dir():
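
Usage note (not part of the patch): a minimal sketch of what the new noise_level option does, assuming the two commits above are applied and spaCy is rebuilt so that spacy.gold exposes add_noise. The sample strings, the pre-tokenized list, and the 0.1 value in the final comment are illustrative assumptions, not taken from the diff.

from spacy.gold import add_noise

# Character-level noise on raw text: spaces become newlines, the characters
# . ' ! ? are dropped, and everything else is lowercased. noise_level=1.0
# makes the corruption deterministic, which keeps this demo reproducible.
print(repr(add_noise("Hello world. This is a test!", 1.0)))
# 'hello\nworld\nthis\nis\na\ntest'

# Word-level noise on pre-tokenized input: words corrupted to the empty string
# (here the '.') are filtered out, so the token list can shrink.
print(add_noise(["Hello", "world", "."], 1.0))
# ['hello', 'world']

# At intermediate settings a text is corrupted only with probability
# noise_level, and each character (or word) is then corrupted independently
# with the same probability, e.g. corpus.train_docs(nlp, noise_level=0.1).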