mirror of https://github.com/explosion/spaCy.git
Add option of data augmentation noise
This commit is contained in:
parent
7b2ede783d
commit
9bc4a26213
|
@ -199,14 +199,16 @@ class GoldCorpus(object):
|
||||||
return n
|
return n
|
||||||
|
|
||||||
def train_docs(self, nlp, gold_preproc=False,
|
def train_docs(self, nlp, gold_preproc=False,
|
||||||
projectivize=False, max_length=None):
|
projectivize=False, max_length=None,
|
||||||
|
noise_level=0.0):
|
||||||
train_tuples = self.train_tuples
|
train_tuples = self.train_tuples
|
||||||
if projectivize:
|
if projectivize:
|
||||||
train_tuples = nonproj.preprocess_training_data(
|
train_tuples = nonproj.preprocess_training_data(
|
||||||
self.train_tuples)
|
self.train_tuples)
|
||||||
random.shuffle(train_tuples)
|
random.shuffle(train_tuples)
|
||||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||||
max_length=max_length)
|
max_length=max_length,
|
||||||
|
noise_level=noise_level)
|
||||||
yield from gold_docs
|
yield from gold_docs
|
||||||
|
|
||||||
def dev_docs(self, nlp, gold_preproc=False):
|
def dev_docs(self, nlp, gold_preproc=False):
|
||||||
|
@ -215,7 +217,8 @@ class GoldCorpus(object):
|
||||||
yield from gold_docs
|
yield from gold_docs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None):
|
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
|
||||||
|
noise_level=0.0):
|
||||||
for raw_text, paragraph_tuples in tuples:
|
for raw_text, paragraph_tuples in tuples:
|
||||||
if gold_preproc:
|
if gold_preproc:
|
||||||
raw_text = None
|
raw_text = None
|
||||||
|
@ -223,18 +226,20 @@ class GoldCorpus(object):
|
||||||
paragraph_tuples = merge_sents(paragraph_tuples)
|
paragraph_tuples = merge_sents(paragraph_tuples)
|
||||||
|
|
||||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||||
gold_preproc)
|
gold_preproc, noise_level=noise_level)
|
||||||
golds = cls._make_golds(docs, paragraph_tuples)
|
golds = cls._make_golds(docs, paragraph_tuples)
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
if (not max_length) or len(doc) < max_length:
|
if (not max_length) or len(doc) < max_length:
|
||||||
yield doc, gold
|
yield doc, gold
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
|
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc,
|
||||||
|
noise_level=0.0):
|
||||||
if raw_text is not None:
|
if raw_text is not None:
|
||||||
|
raw_text = add_noise(raw_text, noise_level)
|
||||||
return [nlp.make_doc(raw_text)]
|
return [nlp.make_doc(raw_text)]
|
||||||
else:
|
else:
|
||||||
return [Doc(nlp.vocab, words=sent_tuples[1])
|
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
|
||||||
for (sent_tuples, brackets) in paragraph_tuples]
|
for (sent_tuples, brackets) in paragraph_tuples]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -266,6 +271,30 @@ class GoldCorpus(object):
|
||||||
return locs
|
return locs
|
||||||
|
|
||||||
|
|
||||||
|
def add_noise(orig, noise_level):
|
||||||
|
if random.random() >= noise_level:
|
||||||
|
return orig
|
||||||
|
elif type(orig) == list:
|
||||||
|
corrupted = [_corrupt(word, noise_level) for word in orig]
|
||||||
|
corrupted = [w for w in corrupted if w]
|
||||||
|
return corrupted
|
||||||
|
else:
|
||||||
|
return ''.join(_corrupt(c, noise_level) for c in orig)
|
||||||
|
|
||||||
|
|
||||||
|
def _corrupt(c, noise_level):
|
||||||
|
if random.random() >= noise_level:
|
||||||
|
return c
|
||||||
|
elif c == ' ':
|
||||||
|
return '\n'
|
||||||
|
elif c == '\n':
|
||||||
|
return ' '
|
||||||
|
elif c in ['.', "'", "!", "?"]:
|
||||||
|
return ''
|
||||||
|
else:
|
||||||
|
return c.lower()
|
||||||
|
|
||||||
|
|
||||||
def read_json_file(loc, docs_filter=None, limit=None):
|
def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
loc = ensure_path(loc)
|
loc = ensure_path(loc)
|
||||||
if loc.is_dir():
|
if loc.is_dir():
|
||||||
|
|
Loading…
Reference in New Issue