mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
bc637b2433
|
@@ -18,7 +18,6 @@ import random
|
|||
from pathlib import Path
|
||||
|
||||
import spacy
|
||||
from spacy.util import get_lang_class
|
||||
from spacy.tokens import Doc
|
||||
from spacy.gold import GoldParse
|
||||
|
||||
|
@@ -52,13 +51,13 @@ def main(lang='en', output_dir=None, n_iter=25):
|
|||
train the tagger with a custom tag map, we're creating a new Language
|
||||
instance with a custom vocab.
|
||||
"""
|
||||
lang_cls = get_lang_class(lang) # get Language class
|
||||
lang_cls.Defaults.tag_map.update(TAG_MAP) # add tag map to defaults
|
||||
nlp = lang_cls() # initialise Language class
|
||||
|
||||
nlp = spacy.blank(lang)
|
||||
# add the tagger to the pipeline
|
||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||
tagger = nlp.create_pipe('tagger')
|
||||
# Add the tags. This needs to be done before you start training.
|
||||
for tag, values in TAG_MAP.items():
|
||||
tagger.add_label(tag, values)
|
||||
nlp.add_pipe(tagger)
|
||||
|
||||
optimizer = nlp.begin_training()
|
||||
|
|
|
@@ -27,8 +27,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
|||
docs.append(doc)
|
||||
sentences = []
|
||||
|
||||
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
||||
output_filename = input_path.parts[-1].replace(".conll", ".json")
|
||||
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
||||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
|
@@ -55,6 +55,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
|
|||
id_ = int(id_) - 1
|
||||
head = (int(head) - 1) if head != '0' else id_
|
||||
dep = 'ROOT' if dep == 'root' else dep
|
||||
tag = pos if tag == '_' else tag
|
||||
tag = tag+'__'+morph if use_morphology else tag
|
||||
tokens.append((id_, word, tag, head, dep, 'O'))
|
||||
except:
|
||||
|
|
|
@@ -431,18 +431,31 @@ class Tagger(Pipe):
|
|||
def Model(cls, n_tags, **cfg):
|
||||
return build_tagger_model(n_tags, **cfg)
|
||||
|
||||
def add_label(self, label):
|
||||
def add_label(self, label, values=None):
|
||||
if label in self.labels:
|
||||
return 0
|
||||
raise NotImplementedError
|
||||
#if self.model not in (True, False, None):
|
||||
if self.model not in (True, False, None):
|
||||
# Here's how the model resizing will work, once the
|
||||
# neuron-to-tag mapping is no longer controlled by
|
||||
# the Morphology class, which sorts the tag names.
|
||||
# The sorting makes adding labels difficult.
|
||||
# smaller = self.model._layers[-1]
|
||||
# larger = Softmax(len(self.labels)+1, smaller.nI)
|
||||
# copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
# copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
# self.model._layers[-1] = larger
|
||||
#self.labels.append(label)
|
||||
#return 1
|
||||
raise ValueError(
|
||||
"Resizing pre-trained Tagger models is not "
|
||||
"currently supported.")
|
||||
tag_map = dict(self.vocab.morphology.tag_map)
|
||||
if values is None:
|
||||
values = {POS: "X"}
|
||||
tag_map[label] = values
|
||||
self.vocab.morphology = Morphology(
|
||||
self.vocab.strings, tag_map=tag_map,
|
||||
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||
exc=self.vocab.morphology.exc)
|
||||
return 1
|
||||
|
||||
def use_params(self, params):
|
||||
with self.model.use_params(params):
|
||||
|
|
Loading…
Reference in New Issue