Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
ines 2017-11-01 21:49:59 +01:00
commit bc637b2433
3 changed files with 29 additions and 16 deletions

View File

@@ -18,7 +18,6 @@ import random
 from pathlib import Path
 import spacy
-from spacy.util import get_lang_class
 from spacy.tokens import Doc
 from spacy.gold import GoldParse
@@ -52,13 +51,13 @@ def main(lang='en', output_dir=None, n_iter=25):
     train the tagger with a custom tag map, we're creating a new Language
     instance with a custom vocab.
     """
-    lang_cls = get_lang_class(lang)  # get Language class
-    lang_cls.Defaults.tag_map.update(TAG_MAP) # add tag map to defaults
-    nlp = lang_cls()  # initialise Language class
+    nlp = spacy.blank(lang)
     # add the tagger to the pipeline
     # nlp.create_pipe works for built-ins that are registered with spaCy
     tagger = nlp.create_pipe('tagger')
+    # Add the tags. This needs to be done before you start training.
+    for tag, values in TAG_MAP.items():
+        tagger.add_label(tag, values)
     nlp.add_pipe(tagger)
     optimizer = nlp.begin_training()

View File

@@ -27,8 +27,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
             docs.append(doc)
             sentences = []
+    output_filename = input_path.parts[-1].replace(".conllu", ".json")
     output_filename = input_path.parts[-1].replace(".conll", ".json")
-    output_filename = input_path.parts[-1].replace(".conllu", ".json")
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
@@ -55,6 +55,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
                 id_ = int(id_) - 1
                 head = (int(head) - 1) if head != '0' else id_
                 dep = 'ROOT' if dep == 'root' else dep
+                tag = pos if tag == '_' else tag
                 tag = tag+'__'+morph if use_morphology else tag
                 tokens.append((id_, word, tag, head, dep, 'O'))
             except:

View File

@@ -431,18 +431,31 @@ class Tagger(Pipe):
     def Model(cls, n_tags, **cfg):
         return build_tagger_model(n_tags, **cfg)

-    def add_label(self, label):
+    def add_label(self, label, values=None):
         if label in self.labels:
             return 0
-        raise NotImplementedError
-        #if self.model not in (True, False, None):
-        #    smaller = self.model._layers[-1]
-        #    larger = Softmax(len(self.labels)+1, smaller.nI)
-        #    copy_array(larger.W[:smaller.nO], smaller.W)
-        #    copy_array(larger.b[:smaller.nO], smaller.b)
-        #    self.model._layers[-1] = larger
-        #self.labels.append(label)
-        #return 1
+        if self.model not in (True, False, None):
+            # Here's how the model resizing will work, once the
+            # neuron-to-tag mapping is no longer controlled by
+            # the Morphology class, which sorts the tag names.
+            # The sorting makes adding labels difficult.
+            # smaller = self.model._layers[-1]
+            # larger = Softmax(len(self.labels)+1, smaller.nI)
+            # copy_array(larger.W[:smaller.nO], smaller.W)
+            # copy_array(larger.b[:smaller.nO], smaller.b)
+            # self.model._layers[-1] = larger
+            raise ValueError(
+                "Resizing pre-trained Tagger models is not "
+                "currently supported.")
+        tag_map = dict(self.vocab.morphology.tag_map)
+        if values is None:
+            values = {POS: "X"}
+        tag_map[label] = values
+        self.vocab.morphology = Morphology(
+            self.vocab.strings, tag_map=tag_map,
+            lemmatizer=self.vocab.morphology.lemmatizer,
+            exc=self.vocab.morphology.exc)
+        return 1

     def use_params(self, params):
         with self.model.use_params(params):