mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
bc637b2433
|
@ -18,7 +18,6 @@ import random
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.util import get_lang_class
|
|
||||||
from spacy.tokens import Doc
|
from spacy.tokens import Doc
|
||||||
from spacy.gold import GoldParse
|
from spacy.gold import GoldParse
|
||||||
|
|
||||||
|
@ -52,13 +51,13 @@ def main(lang='en', output_dir=None, n_iter=25):
|
||||||
train the tagger with a custom tag map, we're creating a new Language
|
train the tagger with a custom tag map, we're creating a new Language
|
||||||
instance with a custom vocab.
|
instance with a custom vocab.
|
||||||
"""
|
"""
|
||||||
lang_cls = get_lang_class(lang) # get Language class
|
nlp = spacy.blank(lang)
|
||||||
lang_cls.Defaults.tag_map.update(TAG_MAP) # add tag map to defaults
|
|
||||||
nlp = lang_cls() # initialise Language class
|
|
||||||
|
|
||||||
# add the tagger to the pipeline
|
# add the tagger to the pipeline
|
||||||
# nlp.create_pipe works for built-ins that are registered with spaCy
|
# nlp.create_pipe works for built-ins that are registered with spaCy
|
||||||
tagger = nlp.create_pipe('tagger')
|
tagger = nlp.create_pipe('tagger')
|
||||||
|
# Add the tags. This needs to be done before you start training.
|
||||||
|
for tag, values in TAG_MAP.items():
|
||||||
|
tagger.add_label(tag, values)
|
||||||
nlp.add_pipe(tagger)
|
nlp.add_pipe(tagger)
|
||||||
|
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
|
|
|
@ -27,8 +27,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
sentences = []
|
sentences = []
|
||||||
|
|
||||||
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
|
||||||
output_filename = input_path.parts[-1].replace(".conll", ".json")
|
output_filename = input_path.parts[-1].replace(".conll", ".json")
|
||||||
|
output_filename = input_path.parts[-1].replace(".conllu", ".json")
|
||||||
output_file = output_path / output_filename
|
output_file = output_path / output_filename
|
||||||
with output_file.open('w', encoding='utf-8') as f:
|
with output_file.open('w', encoding='utf-8') as f:
|
||||||
f.write(json_dumps(docs))
|
f.write(json_dumps(docs))
|
||||||
|
@ -55,6 +55,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
|
||||||
id_ = int(id_) - 1
|
id_ = int(id_) - 1
|
||||||
head = (int(head) - 1) if head != '0' else id_
|
head = (int(head) - 1) if head != '0' else id_
|
||||||
dep = 'ROOT' if dep == 'root' else dep
|
dep = 'ROOT' if dep == 'root' else dep
|
||||||
|
tag = pos if tag == '_' else tag
|
||||||
tag = tag+'__'+morph if use_morphology else tag
|
tag = tag+'__'+morph if use_morphology else tag
|
||||||
tokens.append((id_, word, tag, head, dep, 'O'))
|
tokens.append((id_, word, tag, head, dep, 'O'))
|
||||||
except:
|
except:
|
||||||
|
|
|
@ -431,18 +431,31 @@ class Tagger(Pipe):
|
||||||
def Model(cls, n_tags, **cfg):
|
def Model(cls, n_tags, **cfg):
|
||||||
return build_tagger_model(n_tags, **cfg)
|
return build_tagger_model(n_tags, **cfg)
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label, values=None):
|
||||||
if label in self.labels:
|
if label in self.labels:
|
||||||
return 0
|
return 0
|
||||||
raise NotImplementedError
|
if self.model not in (True, False, None):
|
||||||
#if self.model not in (True, False, None):
|
# Here's how the model resizing will work, once the
|
||||||
# smaller = self.model._layers[-1]
|
# neuron-to-tag mapping is no longer controlled by
|
||||||
# larger = Softmax(len(self.labels)+1, smaller.nI)
|
# the Morphology class, which sorts the tag names.
|
||||||
# copy_array(larger.W[:smaller.nO], smaller.W)
|
# The sorting makes adding labels difficult.
|
||||||
# copy_array(larger.b[:smaller.nO], smaller.b)
|
# smaller = self.model._layers[-1]
|
||||||
# self.model._layers[-1] = larger
|
# larger = Softmax(len(self.labels)+1, smaller.nI)
|
||||||
#self.labels.append(label)
|
# copy_array(larger.W[:smaller.nO], smaller.W)
|
||||||
#return 1
|
# copy_array(larger.b[:smaller.nO], smaller.b)
|
||||||
|
# self.model._layers[-1] = larger
|
||||||
|
raise ValueError(
|
||||||
|
"Resizing pre-trained Tagger models is not "
|
||||||
|
"currently supported.")
|
||||||
|
tag_map = dict(self.vocab.morphology.tag_map)
|
||||||
|
if values is None:
|
||||||
|
values = {POS: "X"}
|
||||||
|
tag_map[label] = values
|
||||||
|
self.vocab.morphology = Morphology(
|
||||||
|
self.vocab.strings, tag_map=tag_map,
|
||||||
|
lemmatizer=self.vocab.morphology.lemmatizer,
|
||||||
|
exc=self.vocab.morphology.exc)
|
||||||
|
return 1
|
||||||
|
|
||||||
def use_params(self, params):
|
def use_params(self, params):
|
||||||
with self.model.use_params(params):
|
with self.model.use_params(params):
|
||||||
|
|
Loading…
Reference in New Issue