From 7568cd6bf8a156a37e3c254ea65f5a479102c424 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 21 Mar 2017 23:00:13 +0100 Subject: [PATCH 1/2] Split CONLLX file using tabs and not default split separators --- bin/parser/train_ud.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py index c87f40680..98a93dd88 100644 --- a/bin/parser/train_ud.py +++ b/bin/parser/train_ud.py @@ -1,18 +1,13 @@ from __future__ import unicode_literals import plac import json -from os import path -import shutil -import os import random -import io import pathlib from spacy.tokens import Doc from spacy.syntax.nonproj import PseudoProjectivity from spacy.language import Language from spacy.gold import GoldParse -from spacy.vocab import Vocab from spacy.tagger import Tagger from spacy.pipeline import DependencyParser, BeamDependencyParser from spacy.syntax.parser import get_templates @@ -23,7 +18,6 @@ import spacy.attrs import io - def read_conllx(loc, n=0): with io.open(loc, 'r', encoding='utf8') as file_: text = file_.read() @@ -35,7 +29,8 @@ def read_conllx(loc, n=0): lines.pop(0) tokens = [] for line in lines: - id_, word, lemma, pos, tag, morph, head, dep, _1, _2 = line.split() + id_, word, lemma, pos, tag, morph, head, dep, _1, \ + _2 = line.split('\t') if '-' in id_ or '.' in id_: continue try: From 08346dba1a94989c6a286e51a122a0f2661592d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Bournhonesque?= Date: Tue, 21 Mar 2017 23:18:54 +0100 Subject: [PATCH 2/2] Use specific language class instead of base Language class --- bin/parser/train_ud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/parser/train_ud.py b/bin/parser/train_ud.py index 98a93dd88..afc4491cb 100644 --- a/bin/parser/train_ud.py +++ b/bin/parser/train_ud.py @@ -129,7 +129,7 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None): random.shuffle(train_sents) scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) print('%d:\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.tags_acc)) - nlp = Language(vocab=vocab, tagger=tagger, parser=parser) + nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser) nlp.end_training(model_dir) scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc)) print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))