spaCy/examples/training/conllu.py

'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes
.conllu format for development data, allowing the official scorer to be used.
'''
from __future__ import unicode_literals
import plac
import tqdm
import attr
from pathlib import Path
import re
import sys
import json

import spacy
import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.syntax.nonproj import projectivize
from collections import defaultdict, Counter
from timeit import default_timer as timer
from spacy.matcher import Matcher

import itertools
import random
import numpy.random
import cytoolz

import conll17_ud_eval

import spacy.lang.zh
import spacy.lang.ja

# Switch off the jieba / janome flags on the Chinese and Japanese language
# defaults. NOTE(review): presumably so these languages are not tokenized
# through the third-party segmenters during UD training -- confirm.
spacy.lang.zh.Chinese.Defaults.use_jieba = False
spacy.lang.ja.Japanese.Defaults.use_janome = False

# Seed both random number generators with a fixed value so that runs are
# repeatable (minibatch_by_words shuffles with the `random` module).
random.seed(0)
numpy.random.seed(0)

def minibatch_by_words(items, size=5000):
    '''Shuffle `items` in place and yield them in batches sized by word count.

    items: a mutable list of (doc, gold) pairs. `len(doc)` is taken as the
        number of words of each item. The list itself is shuffled in place.
    size: either an int (the same word budget for every batch) or an
        iterable of ints (one budget is drawn per batch).

    Yields lists of (doc, gold) pairs. A batch keeps taking items until its
    budget has dropped below zero, so every batch except possibly the last
    one holds more words than its budget, and a budget of 0 still yields
    batches of at least one item.

    Iteration ends when the items run out, when a negative budget is drawn
    (nothing can be batched), or when a finite `size` iterable is exhausted;
    in the latter two cases any remaining items are not yielded.
    '''
    random.shuffle(items)
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    else:
        size_ = iter(size)
    items = iter(items)
    while True:
        try:
            batch_size = next(size_)
        except StopIteration:
            # Under PEP 479 a StopIteration escaping a generator body turns
            # into a RuntimeError, so end the generator explicitly.
            return
        batch = []
        while batch_size >= 0:
            try:
                doc, gold = next(items)
            except StopIteration:
                # Out of data: emit the partial batch, if any.
                if batch:
                    yield batch
                return
            batch_size -= len(doc)
            batch.append((doc, gold))
        if not batch:
            # Only reachable with a negative budget.
            return
        yield batch

################
# Data reading #
################

# Matches any run of whitespace. Written as a raw string so that `\s` is
# not parsed as an (invalid) string escape sequence.
space_re = re.compile(r'\s+')
def split_text(text):
    '''Split raw text into paragraphs on blank lines.

    text: the full contents of a text file.

    Returns a list with one string per '\\n\\n'-separated chunk, in order.
    Each chunk is stripped and every internal run of whitespace is collapsed
    to a single space. Empty chunks are kept as empty strings.
    '''
    return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]
 

def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,
              max_doc_length=None, limit=None):
    '''Read CoNLL-U data into parallel lists of Doc and GoldParse objects.

    nlp: pipeline object; `nlp.vocab` and `nlp.make_doc` are used.
    conllu_file: open file (or iterable of lines) in CoNLL-U format.
    text_file: open file with the matching raw text. It is read and split
        into paragraphs, but only the number of paragraphs is used: documents
        beyond that count are ignored.
    raw_text: if True, add Docs built by `nlp.make_doc` from text that is
        reconstructed from the gold words and spacing, grouped into chunks
        of up to `max_doc_length` sentences (one chunk per document when
        `max_doc_length` is falsy).
    oracle_segments: if True, add one Doc per gold sentence, built directly
        from the gold words and spaces.
    max_doc_length: maximum number of sentences per raw-text Doc.
    limit: stop once at least this many Docs have been collected. Falsy
        means no limit.

    Returns (docs, golds), two lists of equal length.
    Raises ValueError if both raw_text and oracle_segments are False.
    '''
    if not raw_text and not oracle_segments:
        raise ValueError("At least one of raw_text or oracle_segments must be True")
    paragraphs = split_text(text_file.read())
    conllu = read_conllu(conllu_file)
    docs = []
    golds = []
    # cd is a conllu doc, cs a conllu sentence. The paragraph text itself is
    # not needed: zipping only caps the number of documents that are read.
    for _, cd in zip(paragraphs, conllu):
        sent_annots = []
        for cs in cd:
            sent = defaultdict(list)
            for id_, word, _lemma, _pos, tag, _morph, head, dep, _deps, misc in cs:
                # Skip empty nodes ("8.1") and multiword token ranges ("3-4").
                if '.' in id_ or '-' in id_:
                    continue
                id_ = int(id_)-1
                # The root is represented as a token that heads itself.
                head = int(head)-1 if head != '0' else id_
                sent['words'].append(word)
                sent['tags'].append(tag)
                sent['heads'].append(head)
                sent['deps'].append('ROOT' if dep == 'root' else dep)
                # MISC is a '|'-separated attribute list; a token is followed
                # by a space unless it carries SpaceAfter=No. Other MISC
                # attributes say nothing about spacing.
                sent['spaces'].append('SpaceAfter=No' not in misc.split('|'))
            sent['entities'] = ['-'] * len(sent['words'])
            sent['heads'], sent['deps'] = projectivize(sent['heads'],
                                                       sent['deps'])
            if oracle_segments:
                docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))
                golds.append(GoldParse(docs[-1], **sent))

            sent_annots.append(sent)
            if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
                doc, gold = _make_gold(nlp, None, sent_annots)
                sent_annots = []
                docs.append(doc)
                golds.append(gold)
                if limit and len(docs) >= limit:
                    return docs, golds

        # Flush the sentences left over at the end of the document.
        if raw_text and sent_annots:
            doc, gold = _make_gold(nlp, None, sent_annots)
            docs.append(doc)
            golds.append(gold)
        if limit and len(docs) >= limit:
            return docs, golds
    return docs, golds


def read_conllu(file_):
    '''Parse CoNLL-U lines into nested lists: documents > sentences > tokens.

    file_: an iterable of text lines (e.g. an open file).

    A line starting with '# newdoc' opens a new document, other '#' lines
    are ignored, and a blank line ends the current sentence. Every other
    line must hold exactly 10 tab-separated fields and becomes one token
    (a list of 10 strings). Empty sentences and empty documents are dropped.

    Returns a list of documents, each a list of sentences, each a list of
    token field lists.
    Raises ValueError if a token line does not have 10 fields.
    '''
    docs = []
    sent = []
    doc = []
    for line in file_:
        if line.startswith('# newdoc'):
            # A sentence that was not closed by a blank line still belongs
            # to the document that is ending here.
            if sent:
                doc.append(sent)
                sent = []
            if doc:
                docs.append(doc)
            doc = []
        elif line.startswith('#'):
            continue
        elif not line.strip():
            if sent:
                doc.append(sent)
            sent = []
        else:
            fields = line.strip().split('\t')
            if len(fields) != 10:
                msg = "Expected 10 tab-separated CoNLL-U fields, found %d: %r"
                raise ValueError(msg % (len(fields), line))
            sent.append(fields)
    if sent:
        doc.append(sent)
    if doc:
        docs.append(doc)
    return docs


def _make_gold(nlp, text, sent_annots):
    '''Merge per-sentence annotations into one Doc and one GoldParse.

    nlp: pipeline object; only `nlp.make_doc` is used.
    text: the raw text to tokenize, or None to rebuild it from the gold
        words and their trailing-space flags.
    sent_annots: list of per-sentence dicts with the keys 'words', 'tags',
        'heads', 'deps', 'entities' and 'spaces'.

    Returns (doc, gold).
    '''
    merged = defaultdict(list)
    for annot in sent_annots:
        # Heads are sentence-relative; shift them by the number of words
        # already collected so they index into the merged sequence.
        offset = len(merged['words'])
        merged['heads'].extend(offset + head for head in annot['heads'])
        for key in ('words', 'tags', 'deps', 'entities', 'spaces'):
            merged[key].extend(annot[key])
    assert len(merged['words']) == len(merged['spaces'])
    if text is None:
        # Rebuild the text: each word followed by a space when flagged.
        pieces = []
        for word, has_space in zip(merged['words'], merged['spaces']):
            pieces.append(word + ' ' * has_space)
        text = ''.join(pieces)
    doc = nlp.make_doc(text)
    # 'spaces' is dropped before the remaining fields go to GoldParse.
    del merged['spaces']
    return doc, GoldParse(doc, **merged)

#############################
# Data transforms for spaCy #
#############################

def golds_to_gold_tuples(docs, golds):
    '''Convert Docs and GoldParse objects to the 'tuples' training format.

    docs, golds: parallel sequences; `doc.text` and `gold.orig_annot` (rows
        of six values: id, word, tag, head, label, iob) are read.

    Returns a list with one (text, sentences) entry per pair, where
    sentences is a single-element list holding
    ((ids, words, tags, heads, labels, iob), []).
    '''
    result = []
    for doc, gold in zip(docs, golds):
        raw_text = doc.text
        # Transpose the per-token rows into six per-column tuples.
        ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)
        columns = (ids, words, tags, heads, labels, iob)
        result.append((raw_text, [(columns, [])]))
    return result


##############
# Evaluation #
##############

def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
    '''Parse the raw text and score the output against the gold CoNLL-U file.

    nlp: pipeline used to process the paragraphs (via `nlp.pipe`).
    text_loc: path of the raw text file.
    gold_loc: path of the gold-standard CoNLL-U file.
    sys_loc: path the system output is written to (and then read back from).
    limit: accepted but not used.

    Returns the result of `conll17_ud_eval.evaluate`.
    '''
    with text_loc.open('r', encoding='utf8') as text_file:
        paragraphs = split_text(text_file.read())
    parsed = list(nlp.pipe(paragraphs))
    # Write the predictions out so they can be loaded by the scorer.
    with sys_loc.open('w', encoding='utf8') as sys_out:
        write_conllu(parsed, sys_out)
    with gold_loc.open('r', encoding='utf8') as gold_file:
        gold_ud = conll17_ud_eval.load_conllu(gold_file)
        with sys_loc.open('r', encoding='utf8') as sys_in:
            sys_ud = conll17_ud_eval.load_conllu(sys_in)
        return conll17_ud_eval.evaluate(gold_ud, sys_ud)


def write_conllu(docs, file_):
    '''Write parsed docs to `file_` in CoNLL-U format.

    docs: non-empty sequence of parsed Doc objects; runs of tokens whose
        dependency label is 'subtok' are merged in place before writing.
    file_: writable text file object.
    '''
    subtok_matcher = Matcher(docs[0].vocab)
    subtok_matcher.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])
    for doc_idx, doc in enumerate(docs):
        # Collect character offsets for every match before merging anything.
        # Each match is widened by one token (end+1) -- presumably to take in
        # the token the subtok run attaches to.
        char_spans = []
        for _, start, end in subtok_matcher(doc):
            span = doc[start:end+1]
            char_spans.append((span.start_char, span.end_char))
        for start_char, end_char in char_spans:
            doc.merge(start_char, end_char)
        file_.write("# newdoc id = {i}\n".format(i=doc_idx))
        for sent_idx, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=doc_idx, j=sent_idx))
            file_.write("# text = {text}\n".format(text=sent.text))
            # Token lines are numbered relative to the sentence start.
            for tok_idx, token in enumerate(sent):
                file_.write(token._.get_conllu_lines(tok_idx) + '\n')
            file_.write('\n')


def print_progress(itn, losses, ud_scores):
    '''Print one tab-separated row of training progress.

    itn: integer epoch number; the column header is printed first when it
        is 0.
    losses: dict of accumulated losses; the 'parser' entry is shown
        (0.0 if missing).
    ud_scores: mapping with 'Words', 'Sentences', 'XPOS', 'UAS' and 'LAS'
        entries, each exposing an `f1` attribute (shown as a percentage).
    '''
    fields = {
        'dep_loss': losses.get('parser', 0.0),
        'tag_loss': losses.get('tagger', 0.0),
        'words': ud_scores['Words'].f1 * 100,
        'sents': ud_scores['Sentences'].f1 * 100,
        'tags': ud_scores['XPOS'].f1 * 100,
        'uas': ud_scores['UAS'].f1 * 100,
        'las': ud_scores['LAS'].f1 * 100,
    }
    if itn == 0:
        print('\t'.join(['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']))
    # Column order matches the header; every value gets one decimal place.
    columns = ('dep_loss', 'las', 'uas', 'tags', 'sents', 'words')
    cells = ['{:d}'] + ['{%s:.1f}' % name for name in columns]
    row_template = '\t'.join(cells)
    print(row_template.format(itn, **fields))

#def get_sent_conllu(sent, sent_id):
#    lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]

def get_token_conllu(token, i):
    '''Format one token as CoNLL-U line(s).

    token: the token to write; reads `text`, `lemma_`, `pos_`, `tag_`,
        `dep_`, `i`, `head.i` and the `begins_fused` / `inside_fused`
        extension attributes.
    i: 0-based position of the token within its sentence. The ID written
        is `i+1`.

    Returns a string without a trailing newline. If the token begins a
    fused span, a range line ("<first>-<last>", with every field after the
    form left as '_') comes first, followed by the regular token line. The
    HEAD column is 0 for a token that heads itself, otherwise the 1-based
    sentence-relative ID of the head.
    '''
    lines = []
    if token._.begins_fused:
        # n ends up as the number of tokens in the fused span.
        n = 1
        try:
            while token.nbor(n)._.inside_fused:
                n += 1
        except IndexError:
            # The fused span runs up to the last token of the doc.
            pass
        # IDs are 1-based, so the span covers i+1 .. i+n.
        id_ = '%d-%d' % (i+1, i+n)
        # The range line is a single tab-separated row, like any other line.
        lines.append('\t'.join([id_, token.text] + ['_'] * 8))
    if token.head.i == token.i:
        head = 0
    else:
        head = i + (token.head.i - token.i) + 1
    fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',
              str(head), token.dep_.lower(), '_', '_']
    lines.append('\t'.join(fields))
    return '\n'.join(lines)

# Register the custom Token extension attributes used for CoNLL-U output:
# `get_conllu_lines` is a method extension backed by get_token_conllu, and
# the two fused-token flags default to False.
Token.set_extension('get_conllu_lines', method=get_token_conllu)
Token.set_extension('begins_fused', default=False)
Token.set_extension('inside_fused', default=False)


##################
# Initialization #
##################


def load_nlp(corpus, config):
    '''Create a blank pipeline for the corpus language.

    corpus: corpus or language name; the part before the first '_' is used
        as the language code (e.g. 'es_ancora' -> 'es').
    config: object with a `vectors` attribute. If it is truthy, the vocab
        is loaded from the 'vocab' directory underneath that path.

    Returns the pipeline object created by `spacy.blank`.
    '''
    lang = corpus.split('_')[0]
    nlp = spacy.blank(lang)
    if config.vectors:
        # The value comes from a JSON config and is therefore a plain
        # string; wrap it in Path so that '/' joins the path components.
        nlp.vocab.from_disk(Path(config.vectors) / 'vocab')
    return nlp

def initialize_pipeline(nlp, docs, golds, config):
    '''Add a parser and a tagger to `nlp`, register labels, begin training.

    nlp: pipeline object; the pipes are added to it in place.
    docs, golds: parallel sequences of training Docs and GoldParse objects.
    config: object with boolean `multitask_tag` and `multitask_sent` flags.

    Returns the result of `nlp.begin_training` (main() passes it on as the
    optimizer). As a side effect, entries of `gold.labels` that the parser
    has no action for are rewritten in place.
    '''
    nlp.add_pipe(nlp.create_pipe('parser'))
    # Optional auxiliary objectives for the parser.
    if config.multitask_tag:
        nlp.parser.add_multitask_objective('tag')
    if config.multitask_sent:
        nlp.parser.add_multitask_objective('sent_start')
    # Register a 'subtok' label for move type 2; write_conllu later merges
    # tokens carrying this dependency label.
    # NOTE(review): 2 is presumably the left-arc move -- confirm against the
    # parser's transition system.
    nlp.parser.moves.add_action(2, 'subtok')
    nlp.add_pipe(nlp.create_pipe('tagger'))
    # Make sure the tagger knows every tag that occurs in the gold data.
    for gold in golds:
        for tag in gold.tags:
            if tag is not None:
                nlp.tagger.add_label(tag)
    # Replace labels that didn't make the frequency cutoff
    actions = set(nlp.parser.labels)
    # Entries containing '-' are split on it and the second part is taken
    # as the dependency label.
    label_set = set([act.split('-')[1] for act in actions if '-' in act])
    for gold in golds:
        for i, label in enumerate(gold.labels):
            if label is not None and label not in label_set:
                # Keep only the part before '||'.
                # NOTE(review): presumably '||' is the delimiter added by
                # projectivize to decorated labels -- confirm.
                gold.labels[i] = label.split('||')[0]
    return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))


########################
# Command line helpers #
########################

@attr.s
class Config(object):
    '''Training settings, loadable from a JSON file via Config.load.'''
    # Location of pre-trained vectors; load_nlp reads the vocab from the
    # 'vocab' entry underneath it when set.
    vectors = attr.ib(default=None)
    # Passed to read_data as the maximum number of sentences per Doc.
    max_doc_length = attr.ib(default=10)
    # Whether initialize_pipeline adds the 'tag' multitask objective.
    multitask_tag = attr.ib(default=True)
    # Whether initialize_pipeline adds the 'sent_start' multitask objective.
    multitask_sent = attr.ib(default=True)
    # Number of training epochs run by main().
    nr_epoch = attr.ib(default=30)
    # Passed to minibatch_by_words as the per-batch word budget.
    batch_size = attr.ib(default=1000)
    # Passed to nlp.update as `drop`.
    dropout = attr.ib(default=0.2)

    @classmethod
    def load(cls, loc):
        '''Build a Config from the JSON object stored in the file at `loc`.

        The keys of the JSON object are passed as keyword arguments, so
        they must match the attribute names above.
        '''
        with Path(loc).open('r', encoding='utf8') as file_:
            cfg = json.load(file_)
        return cls(**cfg)


class Dataset(object):
    '''Locate the .conllu and .txt files of one section of a treebank.

    Attributes set by the constructor: `path`, `section`, `conllu` (path of
    the CoNLL-U file), `text` (path of the raw text file) and `lang` (the
    part of the CoNLL-U file name before the first '-', cut at the first
    '_', e.g. 'en_ewt-ud-train.conllu' -> 'en').
    '''
    def __init__(self, path, section):
        '''Scan the directory `path` for the files of `section`.

        path: directory (a pathlib-style path) holding the treebank files.
        section: substring identifying the section, e.g. 'train' or 'dev'.

        Raises IOError if either the .conllu or the .txt file is missing.
        '''
        self.path = path
        self.section = section
        self.conllu = None
        self.text = None
        for file_path in self.path.iterdir():
            name = file_path.parts[-1]
            if section not in name:
                continue
            if name.endswith('conllu'):
                self.conllu = file_path
            elif name.endswith('txt'):
                self.text = file_path
        if self.conllu is None:
            msg = "Could not find .conllu file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        if self.text is None:
            msg = "Could not find .txt file in {path} for {section}"
            raise IOError(msg.format(section=section, path=path))
        self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]


class TreebankPaths(object):
    '''Train and dev Datasets of one treebank, plus its language code.'''
    def __init__(self, ud_path, treebank, **cfg):
        '''Look up both sections in the directory `ud_path / treebank`.

        Extra keyword arguments are accepted and ignored.
        '''
        treebank_dir = ud_path / treebank
        self.train = Dataset(treebank_dir, 'train')
        self.dev = Dataset(treebank_dir, 'dev')
        # The language is taken from the training section's file name.
        self.lang = self.train.lang


@plac.annotations(
    ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
    corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
            "positional", None, str),
    parses_dir=("Directory to write the development parses", "positional", None, Path),
    config=("Path to json formatted config file", "positional", None, Config.load),
    limit=("Size limit", "option", "n", int)
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
    '''Train a parser and tagger on a UD treebank, scoring the dev set
    after every epoch.

    ud_dir: directory containing one sub-directory per treebank.
    parses_dir: directory for the dev parses; one 'epoch-<i>.conllu' file
        per epoch is written to `parses_dir / corpus`.
    config: a Config instance with the training settings.
    corpus: name of the treebank sub-directory inside `ud_dir`.
    limit: maximum number of training Docs to read (0 means no limit).
    '''
    paths = TreebankPaths(ud_dir, corpus)
    out_dir = parses_dir / corpus
    if not out_dir.exists():
        # Also create parses_dir itself if it does not exist yet.
        out_dir.mkdir(parents=True)
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config)

    # Open the training files as UTF-8 (like evaluate() does) and make sure
    # the handles are closed once the data has been read.
    with paths.train.conllu.open('r', encoding='utf8') as conllu_file:
        with paths.train.text.open('r', encoding='utf8') as text_file:
            docs, golds = read_data(nlp, conllu_file, text_file,
                                    max_doc_length=config.max_doc_length,
                                    limit=limit)

    optimizer = initialize_pipeline(nlp, docs, golds, config)

    for i in range(config.nr_epoch):
        # Start every epoch from freshly tokenized Docs of the same texts.
        docs = [nlp.make_doc(doc.text) for doc in docs]
        batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)
        losses = {}
        n_train_words = sum(len(doc) for doc in docs)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
                batch_docs, batch_gold = zip(*batch)
                pbar.update(sum(len(doc) for doc in batch_docs))
                nlp.update(batch_docs, batch_gold, sgd=optimizer,
                           drop=config.dropout, losses=losses)

        out_path = out_dir / 'epoch-{i}.conllu'.format(i=i)
        # Evaluate with the optimizer's averaged parameters swapped in.
        with nlp.use_params(optimizer.averages):
            scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)
            print_progress(i, losses, scores)


# Script entry point: hand main() to plac, which supplies its arguments
# from the command line according to the plac annotations on main().
if __name__ == '__main__':
    plac.call(main)
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`'''Train for CONLL 2017 UD treebank evaluation. Takes .conllu files, writes`
			`.conllu format for development data, allowing the official scorer to be used.`
			`'''`
			`from __future__ import unicode_literals`
			`import plac`
			`import tqdm`
Refactor conllu script 2018-02-25 11:48:22 +00:00			`import attr`
			`from pathlib import Path`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`import re`
Refactor CoNLL training script 2018-02-22 15:00:34 +00:00			`import sys`
Refactor conllu script 2018-02-25 11:48:22 +00:00			`import json`

Add script to do conllu training 2018-02-21 12:53:59 +00:00			`import spacy`
			`import spacy.util`
Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00			`from spacy.tokens import Token, Doc`
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`from spacy.gold import GoldParse`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`from spacy.syntax.nonproj import projectivize`
Generalize conllu script. Now handling Chinese (maybe badly) 2018-02-24 15:04:27 +00:00			`from collections import defaultdict, Counter`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`from timeit import default_timer as timer`
Clean up conllu script 2018-02-24 09:31:53 +00:00			`from spacy.matcher import Matcher`
Add script to do conllu training 2018-02-21 12:53:59 +00:00
Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00			`import itertools`
CONLLU scoring 80.9% UAS with no oracle segments 2018-02-23 22:49:17 +00:00			`import random`
			`import numpy.random`
Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00			`import cytoolz`
CONLLU scoring 80.9% UAS with no oracle segments 2018-02-23 22:49:17 +00:00
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`import conll17_ud_eval`
Add script to do conllu training 2018-02-21 12:53:59 +00:00
Add Chinese.Defaults.use_jieba setting, for UD 2018-02-25 14:12:38 +00:00			`import spacy.lang.zh`
Fix Japanese tokenizer flag 2018-02-26 09:32:12 +00:00			`import spacy.lang.ja`
Add Chinese.Defaults.use_jieba setting, for UD 2018-02-25 14:12:38 +00:00
			`spacy.lang.zh.Chinese.Defaults.use_jieba = False`
Fix Japanese tokenizer flag 2018-02-26 09:32:12 +00:00			`spacy.lang.ja.Japanese.Defaults.use_janome = False`
Add Chinese.Defaults.use_jieba setting, for UD 2018-02-25 14:12:38 +00:00
CONLLU scoring 80.9% UAS with no oracle segments 2018-02-23 22:49:17 +00:00			`random.seed(0)`
			`numpy.random.seed(0)`

Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00			`def minibatch_by_words(items, size=5000):`
Shuffle data after each epoch. Improve script 2018-02-25 12:35:32 +00:00			`random.shuffle(items)`
Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00			`if isinstance(size, int):`
			`size_ = itertools.repeat(size)`
			`else:`
			`size_ = size`
			`items = iter(items)`
			`while True:`
			`batch_size = next(size_)`
			`batch = []`
			`while batch_size >= 0:`
Shuffle data after each epoch. Improve script 2018-02-25 12:35:32 +00:00			`try:`
			`doc, gold = next(items)`
			`except StopIteration:`
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`if batch:`
			`yield batch`
Shuffle data after each epoch. Improve script 2018-02-25 12:35:32 +00:00			`return`
Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00			`batch_size -= len(doc)`
			`batch.append((doc, gold))`
Shuffle data after each epoch. Improve script 2018-02-25 12:35:32 +00:00			`if batch:`
			`yield batch`
			`else:`
			`break`
Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00
Refactor conllu script 2018-02-25 11:48:22 +00:00			`################`
			`# Data reading #`
			`################`
Add script to do conllu training 2018-02-21 12:53:59 +00:00
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`space_re = re.compile('\s+')`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`def split_text(text):`
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`return [space_re.sub(' ', par.strip()) for par in text.split('\n\n')]`
Refactor CoNLL training script 2018-02-22 15:00:34 +00:00

			`def read_data(nlp, conllu_file, text_file, raw_text=True, oracle_segments=False,`
Generalize conllu script. Now handling Chinese (maybe badly) 2018-02-24 15:04:27 +00:00			`max_doc_length=None, limit=None):`
Refactor CoNLL training script 2018-02-22 15:00:34 +00:00			`'''Read the CONLLU format into (Doc, GoldParse) tuples. If raw_text=True,`
			`include Doc objects created using nlp.make_doc and then aligned against`
			`the gold-standard sequences. If oracle_segments=True, include Doc objects`
			`created from the gold-standard segments. At least one must be True.'''`
			`if not raw_text and not oracle_segments:`
			`raise ValueError("At least one of raw_text or oracle_segments must be True")`
			`paragraphs = split_text(text_file.read())`
			`conllu = read_conllu(conllu_file)`
			`# sd is spacy doc; cd is conllu doc`
			`# cs is conllu sent, ct is conllu token`
			`docs = []`
			`golds = []`
Update conllu script 2018-02-22 18:43:54 +00:00			`for doc_id, (text, cd) in enumerate(zip(paragraphs, conllu)):`
Generalize conllu script. Now handling Chinese (maybe badly) 2018-02-24 15:04:27 +00:00			`sent_annots = []`
Refactor CoNLL training script 2018-02-22 15:00:34 +00:00			`for cs in cd:`
Generalize conllu script. Now handling Chinese (maybe badly) 2018-02-24 15:04:27 +00:00			`sent = defaultdict(list)`
			`for id_, word, lemma, pos, tag, morph, head, dep, _, space_after in cs:`
Refactor CoNLL training script 2018-02-22 15:00:34 +00:00			`if '.' in id_:`
			`continue`
			`if '-' in id_:`
			`continue`
			`id_ = int(id_)-1`
			`head = int(head)-1 if head != '0' else id_`
Generalize conllu script. Now handling Chinese (maybe badly) 2018-02-24 15:04:27 +00:00			`sent['words'].append(word)`
			`sent['tags'].append(tag)`
			`sent['heads'].append(head)`
			`sent['deps'].append('ROOT' if dep == 'root' else dep)`
			`sent['spaces'].append(space_after == '_')`
			`sent['entities'] = ['-'] * len(sent['words'])`
			`sent['heads'], sent['deps'] = projectivize(sent['heads'],`
			`sent['deps'])`
Refactor CoNLL training script 2018-02-22 15:00:34 +00:00			`if oracle_segments:`
Generalize conllu script. Now handling Chinese (maybe badly) 2018-02-24 15:04:27 +00:00			`docs.append(Doc(nlp.vocab, words=sent['words'], spaces=sent['spaces']))`
			`golds.append(GoldParse(docs[-1], **sent))`

			`sent_annots.append(sent)`
			`if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:`
			`doc, gold = _make_gold(nlp, None, sent_annots)`
			`sent_annots = []`
			`docs.append(doc)`
			`golds.append(gold)`
			`if limit and len(docs) >= limit:`
			`return docs, golds`

			`if raw_text and sent_annots:`
			`doc, gold = _make_gold(nlp, None, sent_annots)`
			`docs.append(doc)`
			`golds.append(gold)`
			`if limit and len(docs) >= limit:`
			`return docs, golds`
Refactor CoNLL training script 2018-02-22 15:00:34 +00:00			`return docs, golds`


Refactor conllu script 2018-02-25 11:48:22 +00:00			`def read_conllu(file_):`
			`docs = []`
			`sent = []`
			`doc = []`
			`for line in file_:`
			`if line.startswith('# newdoc'):`
			`if doc:`
			`docs.append(doc)`
			`doc = []`
			`elif line.startswith('#'):`
			`continue`
			`elif not line.strip():`
			`if sent:`
			`doc.append(sent)`
			`sent = []`
			`else:`
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`sent.append(list(line.strip().split('\t')))`
			`if len(sent[-1]) != 10:`
			`print(repr(line))`
			`raise ValueError`
Refactor conllu script 2018-02-25 11:48:22 +00:00			`if sent:`
			`doc.append(sent)`
			`if doc:`
			`docs.append(doc)`
			`return docs`


Generalize conllu script. Now handling Chinese (maybe badly) 2018-02-24 15:04:27 +00:00			`def _make_gold(nlp, text, sent_annots):`
			`# Flatten the conll annotations, and adjust the head indices`
			`flat = defaultdict(list)`
			`for sent in sent_annots:`
			`flat['heads'].extend(len(flat['words'])+head for head in sent['heads'])`
			`for field in ['words', 'tags', 'deps', 'entities', 'spaces']:`
			`flat[field].extend(sent[field])`
			`# Construct text if necessary`
			`assert len(flat['words']) == len(flat['spaces'])`
			`if text is None:`
			`text = ''.join(word+' '*space for word, space in zip(flat['words'], flat['spaces']))`
			`doc = nlp.make_doc(text)`
			`flat.pop('spaces')`
			`gold = GoldParse(doc, **flat)`
			`return doc, gold`

Refactor conllu script 2018-02-25 11:48:22 +00:00			`#############################`
			`# Data transforms for spaCy #`
			`#############################`

			`def golds_to_gold_tuples(docs, golds):`
			`'''Get out the annoying 'tuples' format used by begin_training, given the`
			`GoldParse objects.'''`
			`tuples = []`
			`for doc, gold in zip(docs, golds):`
			`text = doc.text`
			`ids, words, tags, heads, labels, iob = zip(*gold.orig_annot)`
			`sents = [((ids, words, tags, heads, labels, iob), [])]`
			`tuples.append((text, sents))`
			`return tuples`

Generalize conllu script. Now handling Chinese (maybe badly) 2018-02-24 15:04:27 +00:00
Refactor conllu script 2018-02-25 11:48:22 +00:00			`##############`
			`# Evaluation #`
			`##############`
Add script to do conllu training 2018-02-21 12:53:59 +00:00
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):`
			`with text_loc.open('r', encoding='utf8') as text_file:`
			`texts = split_text(text_file.read())`
			`docs = list(nlp.pipe(texts))`
			`with sys_loc.open('w', encoding='utf8') as out_file:`
			`write_conllu(docs, out_file)`
			`with gold_loc.open('r', encoding='utf8') as gold_file:`
			`gold_ud = conll17_ud_eval.load_conllu(gold_file)`
			`with sys_loc.open('r', encoding='utf8') as sys_file:`
			`sys_ud = conll17_ud_eval.load_conllu(sys_file)`
			`scores = conll17_ud_eval.evaluate(gold_ud, sys_ud)`
			`return scores`


			`def write_conllu(docs, file_):`
Clean up conllu script 2018-02-24 09:31:53 +00:00			`merger = Matcher(docs[0].vocab)`
			`merger.add('SUBTOK', None, [{'DEP': 'subtok', 'op': '+'}])`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`for i, doc in enumerate(docs):`
Clean up conllu script 2018-02-24 09:31:53 +00:00			`matches = merger(doc)`
Generalize conllu script. Now handling Chinese (maybe badly) 2018-02-24 15:04:27 +00:00			`spans = [doc[start:end+1] for _, start, end in matches]`
			`offsets = [(span.start_char, span.end_char) for span in spans]`
			`for start_char, end_char in offsets:`
Clean up conllu script 2018-02-24 09:31:53 +00:00			`doc.merge(start_char, end_char)`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`file_.write("# newdoc id = {i}\n".format(i=i))`
			`for j, sent in enumerate(doc.sents):`
			`file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))`
			`file_.write("# text = {text}\n".format(text=sent.text))`
Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00			`for k, token in enumerate(sent):`
			`file_.write(token._.get_conllu_lines(k) + '\n')`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`file_.write('\n')`

Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00
			`def print_progress(itn, losses, ud_scores):`
			`fields = {`
			`'dep_loss': losses.get('parser', 0.0),`
			`'tag_loss': losses.get('tagger', 0.0),`
			`'words': ud_scores['Words'].f1 * 100,`
			`'sents': ud_scores['Sentences'].f1 * 100,`
			`'tags': ud_scores['XPOS'].f1 * 100,`
			`'uas': ud_scores['UAS'].f1 * 100,`
			`'las': ud_scores['LAS'].f1 * 100,`
			`}`
			`header = ['Epoch', 'Loss', 'LAS', 'UAS', 'TAG', 'SENT', 'WORD']`
			`if itn == 0:`
			`print('\t'.join(header))`
			`tpl = '\t'.join((`
			`'{:d}',`
			`'{dep_loss:.1f}',`
			`'{las:.1f}',`
			`'{uas:.1f}',`
			`'{tags:.1f}',`
			`'{sents:.1f}',`
			`'{words:.1f}',`
			`))`
			`print(tpl.format(itn, **fields))`

Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00			`#def get_sent_conllu(sent, sent_id):`
			`# lines = ["# sent_id = {sent_id}".format(sent_id=sent_id)]`

			`def get_token_conllu(token, i):`
			`if token._.begins_fused:`
			`n = 1`
			`while token.nbor(n)._.inside_fused:`
			`n += 1`
Fix undefined variable in conllu script 2018-02-26 13:59:56 +00:00			`id_ = '%d-%d' % (i, i+n)`
Minibatch by number of tokens, support other vectors, refactor CoNLL printing 2018-02-25 09:38:06 +00:00			`lines = [id_, token.text, '_', '_', '_', '_', '_', '_', '_', '_']`
			`else:`
			`lines = []`
			`if token.head.i == token.i:`
			`head = 0`
			`else:`
			`head = i + (token.head.i - token.i) + 1`
			`fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, '_',`
			`str(head), token.dep_.lower(), '_', '_']`
			`lines.append('\t'.join(fields))`
			`return '\n'.join(lines)`

			`Token.set_extension('get_conllu_lines', method=get_token_conllu)`
			`Token.set_extension('begins_fused', default=False)`
			`Token.set_extension('inside_fused', default=False)`

Add script to do conllu training 2018-02-21 12:53:59 +00:00
Refactor conllu script 2018-02-25 11:48:22 +00:00			`##################`
			`# Initialization #`
			`##################`


			`def load_nlp(corpus, config):`
			`lang = corpus.split('_')[0]`
			`nlp = spacy.blank(lang)`
			`if config.vectors:`
			`nlp.vocab.from_disk(config.vectors / 'vocab')`
			`return nlp`

			`def initialize_pipeline(nlp, docs, golds, config):`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`nlp.add_pipe(nlp.create_pipe('parser'))`
Refactor conllu script 2018-02-25 11:48:22 +00:00			`if config.multitask_tag:`
			`nlp.parser.add_multitask_objective('tag')`
			`if config.multitask_sent:`
			`nlp.parser.add_multitask_objective('sent_start')`
Clean up conllu script 2018-02-24 09:31:53 +00:00			`nlp.parser.moves.add_action(2, 'subtok')`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`nlp.add_pipe(nlp.create_pipe('tagger'))`
			`for gold in golds:`
			`for tag in gold.tags:`
			`if tag is not None:`
			`nlp.tagger.add_label(tag)`
Replace labels that didn't make freq cutoff 2018-02-21 14:59:22 +00:00			`# Replace labels that didn't make the frequency cutoff`
			`actions = set(nlp.parser.labels)`
			`label_set = set([act.split('-')[1] for act in actions if '-' in act])`
			`for gold in golds:`
			`for i, label in enumerate(gold.labels):`
			`if label is not None and label not in label_set:`
			`gold.labels[i] = label.split('\|\|')[0]`
Refactor conllu script 2018-02-25 11:48:22 +00:00			`return nlp.begin_training(lambda: golds_to_gold_tuples(docs, golds))`


			`########################`
			`# Command line helpers #`
			`########################`

			`@attr.s`
			`class Config(object):`
			`vectors = attr.ib(default=None)`
			`max_doc_length = attr.ib(default=10)`
			`multitask_tag = attr.ib(default=True)`
			`multitask_sent = attr.ib(default=True)`
			`nr_epoch = attr.ib(default=30)`
			`batch_size = attr.ib(default=1000)`
			`dropout = attr.ib(default=0.2)`

			`@classmethod`
			`def load(cls, loc):`
			`with Path(loc).open('r', encoding='utf8') as file_:`
			`cfg = json.load(file_)`
			`return cls(**cfg)`


			`class Dataset(object):`
			`def __init__(self, path, section):`
			`self.path = path`
			`self.section = section`
			`self.conllu = None`
			`self.text = None`
			`for file_path in self.path.iterdir():`
			`name = file_path.parts[-1]`
			`if section in name and name.endswith('conllu'):`
			`self.conllu = file_path`
			`elif section in name and name.endswith('txt'):`
			`self.text = file_path`
			`if self.conllu is None:`
			`msg = "Could not find .txt file in {path} for {section}"`
			`raise IOError(msg.format(section=section, path=path))`
			`if self.text is None:`
			`msg = "Could not find .txt file in {path} for {section}"`
			`self.lang = self.conllu.parts[-1].split('-')[0].split('_')[0]`


			`class TreebankPaths(object):`
			`def __init__(self, ud_path, treebank, **cfg):`
			`self.train = Dataset(ud_path / treebank, 'train')`
			`self.dev = Dataset(ud_path / treebank, 'dev')`
			`self.lang = self.train.lang`


			`@plac.annotations(`
			`ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),`
			`corpus=("UD corpus to train and evaluate on, e.g. en, es_ancora, etc",`
			`"positional", None, str),`
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`parses_dir=("Directory to write the development parses", "positional", None, Path),`
			`config=("Path to json formatted config file", "positional", None, Config.load),`
Update conllu training script 2018-02-25 12:12:39 +00:00			`limit=("Size limit", "option", "n", int)`
Refactor conllu script 2018-02-25 11:48:22 +00:00			`)`
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`def main(ud_dir, parses_dir, config, corpus, limit=0):`
Refactor conllu script 2018-02-25 11:48:22 +00:00			`paths = TreebankPaths(ud_dir, corpus)`
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`if not (parses_dir / corpus).exists():`
			`(parses_dir / corpus).mkdir()`
Shuffle data after each epoch. Improve script 2018-02-25 12:35:32 +00:00			`print("Train and evaluate", corpus, "using lang", paths.lang)`
Refactor conllu script 2018-02-25 11:48:22 +00:00			`nlp = load_nlp(paths.lang, config)`

			`docs, golds = read_data(nlp, paths.train.conllu.open(), paths.train.text.open(),`
Shuffle data after each epoch. Improve script 2018-02-25 12:35:32 +00:00			`max_doc_length=config.max_doc_length, limit=limit)`
Refactor conllu script 2018-02-25 11:48:22 +00:00
			`optimizer = initialize_pipeline(nlp, docs, golds, config)`
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00
Refactor conllu script 2018-02-25 11:48:22 +00:00			`for i in range(config.nr_epoch):`
Shuffle data after each epoch. Improve script 2018-02-25 12:35:32 +00:00			`docs = [nlp.make_doc(doc.text) for doc in docs]`
Refactor conllu script 2018-02-25 11:48:22 +00:00			`batches = minibatch_by_words(list(zip(docs, golds)), size=config.batch_size)`
			`losses = {}`
Shuffle data after each epoch. Improve script 2018-02-25 12:35:32 +00:00			`n_train_words = sum(len(doc) for doc in docs)`
			`with tqdm.tqdm(total=n_train_words, leave=False) as pbar:`
			`for batch in batches:`
			`batch_docs, batch_gold = zip(*batch)`
			`pbar.update(sum(len(doc) for doc in batch_docs))`
			`nlp.update(batch_docs, batch_gold, sgd=optimizer,`
			`drop=config.dropout, losses=losses)`
Add script to do conllu training 2018-02-21 12:53:59 +00:00
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`out_path = parses_dir / corpus / 'epoch-{i}.conllu'.format(i=i)`
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`with nlp.use_params(optimizer.averages):`
Refactor conllu script, fix interface, generalize 2018-02-25 13:54:47 +00:00			`scores = evaluate(nlp, paths.dev.text, paths.dev.conllu, out_path)`
			`print_progress(i, losses, scores)`
Add script to do conllu training 2018-02-21 12:53:59 +00:00
Refactor CoNLL training script 2018-02-22 15:00:34 +00:00
Add script to do conllu training 2018-02-21 12:53:59 +00:00			`if __name__ == '__main__':`
			`plac.call(main)`