From 00842d7f1b20cfc8994717098b06b5f0d9fa566d Mon Sep 17 00:00:00 2001 From: Ryan Ford Date: Fri, 15 Mar 2019 18:14:46 +0100 Subject: [PATCH] Merging conversion scripts for conll formats (#3405) * merging conllu/conll and conllubio scripts * tabs to spaces * removing conllubio2json from converters/__init__.py * Move not-really-CLI tests to misc * Add converter test using no-ud data * Fix test I broke * removing include_biluo parameter * fixing read_conllx * remove include_biluo from convert.py --- .github/contributors/Poluglottos.md | 106 +++++++++++++++++++++++++ spacy/cli/convert.py | 4 +- spacy/cli/converters/__init__.py | 1 - spacy/cli/converters/conllu2json.py | 1 + spacy/cli/converters/conllubio2json.py | 85 -------------------- spacy/tests/test_cli.py | 56 ++++++------- spacy/tests/test_misc.py | 31 ++++++++ 7 files changed, 163 insertions(+), 121 deletions(-) create mode 100644 .github/contributors/Poluglottos.md delete mode 100644 spacy/cli/converters/conllubio2json.py diff --git a/.github/contributors/Poluglottos.md b/.github/contributors/Poluglottos.md new file mode 100644 index 000000000..1ce6b732b --- /dev/null +++ b/.github/contributors/Poluglottos.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). 
The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. 
Please do NOT +mark both statements: + + * [X] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ryan Ford | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | Mar 13 2019 | +| GitHub username | Poluglottos | +| Website (optional) | | diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 12a3d2698..51423c46f 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -6,7 +6,7 @@ from pathlib import Path from wasabi import Printer import srsly -from .converters import conllu2json, conllubio2json, iob2json, conll_ner2json +from .converters import conllu2json, iob2json, conll_ner2json from .converters import ner_jsonl2json @@ -14,7 +14,7 @@ from .converters import ner_jsonl2json # entry to this dict with the file extension mapped to the converter function # imported from /converters. 
CONVERTERS = { - "conllubio": conllubio2json, + "conllubio": conllu2json, "conllu": conllu2json, "conll": conllu2json, "ner": conll_ner2json, diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py index c0be857a8..9dcbf5b13 100644 --- a/spacy/cli/converters/__init__.py +++ b/spacy/cli/converters/__init__.py @@ -1,5 +1,4 @@ from .conllu2json import conllu2json # noqa: F401 -from .conllubio2json import conllubio2json # noqa: F401 from .iob2json import iob2json # noqa: F401 from .conll_ner2json import conll_ner2json # noqa: F401 from .jsonl2json import ner_jsonl2json # noqa: F401 diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index f1102a94a..3a7a68e4a 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -71,6 +71,7 @@ def read_conllx(input_data, use_morphology=False, n=0): dep = "ROOT" if dep == "root" else dep tag = pos if tag == "_" else tag tag = tag + "__" + morph if use_morphology else tag + iob = iob if iob else "O" tokens.append((id_, word, tag, head, dep, iob)) except: # noqa: E722 print(line) diff --git a/spacy/cli/converters/conllubio2json.py b/spacy/cli/converters/conllubio2json.py deleted file mode 100644 index bd6ee7996..000000000 --- a/spacy/cli/converters/conllubio2json.py +++ /dev/null @@ -1,85 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ...gold import iob_to_biluo - - -def conllubio2json(input_data, n_sents=10, use_morphology=False, lang=None): - """ - Convert conllu files into JSON format for use with train cli. - use_morphology parameter enables appending morphology to tags, which is - useful for languages such as Spanish, where UD tags are not so rich. 
- """ - # by @dvsrepo, via #11 explosion/spacy-dev-resources - docs = [] - sentences = [] - conll_tuples = read_conllx(input_data, use_morphology=use_morphology) - for i, (raw_text, tokens) in enumerate(conll_tuples): - sentence, brackets = tokens[0] - sentences.append(generate_sentence(sentence)) - # Real-sized documents could be extracted using the comments on the - # conluu document - if len(sentences) % n_sents == 0: - doc = create_doc(sentences, i) - docs.append(doc) - sentences = [] - return docs - - -def read_conllx(input_data, use_morphology=False, n=0): - i = 0 - for sent in input_data.strip().split("\n\n"): - lines = sent.strip().split("\n") - if lines: - while lines[0].startswith("#"): - lines.pop(0) - tokens = [] - for line in lines: - - parts = line.split("\t") - id_, word, lemma, pos, tag, morph, head, dep, _1, ner = parts - if "-" in id_ or "." in id_: - continue - try: - id_ = int(id_) - 1 - head = (int(head) - 1) if head != "0" else id_ - dep = "ROOT" if dep == "root" else dep - tag = pos if tag == "_" else tag - tag = tag + "__" + morph if use_morphology else tag - ner = ner if ner else "O" - tokens.append((id_, word, tag, head, dep, ner)) - except: # noqa: E722 - print(line) - raise - tuples = [list(t) for t in zip(*tokens)] - yield (None, [[tuples, []]]) - i += 1 - if n >= 1 and i >= n: - break - - -def generate_sentence(sent): - (id_, word, tag, head, dep, ner) = sent - sentence = {} - tokens = [] - ner = iob_to_biluo(ner) - for i, id in enumerate(id_): - token = {} - token["orth"] = word[i] - token["tag"] = tag[i] - token["head"] = head[i] - id - token["dep"] = dep[i] - token["ner"] = ner[i] - tokens.append(token) - sentence["tokens"] = tokens - return sentence - - -def create_doc(sentences, id): - doc = {} - paragraph = {} - doc["id"] = id - doc["paragraphs"] = [] - paragraph["sentences"] = sentences - doc["paragraphs"].append(paragraph) - return doc diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index e0067c088..2afa6a71b 
100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,38 +1,28 @@ # coding: utf-8 from __future__ import unicode_literals -import pytest -import os -from pathlib import Path -from spacy.compat import symlink_to, symlink_remove, path2str +from spacy.cli.converters import conllu2json -@pytest.fixture -def target_local_path(): - return Path("./foo-target") - - -@pytest.fixture -def link_local_path(): - return Path("./foo-symlink") - - -@pytest.fixture(scope="function") -def setup_target(request, target_local_path, link_local_path): - if not target_local_path.exists(): - os.mkdir(path2str(target_local_path)) - - # yield -- need to cleanup even if assertion fails - # https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240 - def cleanup(): - symlink_remove(link_local_path) - os.rmdir(path2str(target_local_path)) - - request.addfinalizer(cleanup) - - -def test_create_symlink_windows(setup_target, target_local_path, link_local_path): - assert target_local_path.exists() - - symlink_to(link_local_path, target_local_path) - assert link_local_path.exists() +def test_cli_converters_conllu2json(): + # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu + lines = [ + "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO", + "2\tFinn\tFinn\tPROPN\t_\tGender=Masc\t4\tnsubj\t_\tB-PER", + "3\tEilertsen\tEilertsen\tPROPN\t_\t_\t2\tname\t_\tI-PER", + "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO", + ] + input_data = "\n".join(lines) + converted = conllu2json(input_data, n_sents=1) + assert len(converted) == 1 + assert converted[0]["id"] == 0 + assert len(converted[0]["paragraphs"]) == 1 + assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 + sent = converted[0]["paragraphs"][0]["sentences"][0] + assert len(sent["tokens"]) == 4 + tokens = sent["tokens"] + assert [t["orth"] for t in tokens] == ["Dommer", "Finn", "Eilertsen", "avstår"] + assert 
[t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"] + assert [t["head"] for t in tokens] == [1, 2, -1, 0] + assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"] + assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 64112923f..6472dc7e1 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -2,12 +2,37 @@ from __future__ import unicode_literals import pytest +import os from pathlib import Path from spacy import util from spacy import prefer_gpu, require_gpu +from spacy.compat import symlink_to, symlink_remove, path2str from spacy._ml import PrecomputableAffine +@pytest.fixture +def symlink_target(): + return Path("./foo-target") + + +@pytest.fixture +def symlink(): + return Path("./foo-symlink") + + +@pytest.fixture(scope="function") +def symlink_setup_target(request, symlink_target, symlink): + if not symlink_target.exists(): + os.mkdir(path2str(symlink_target)) + # yield -- need to cleanup even if assertion fails + # https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240 + def cleanup(): + symlink_remove(symlink) + os.rmdir(path2str(symlink_target)) + + request.addfinalizer(cleanup) + + @pytest.mark.parametrize("text", ["hello/world", "hello world"]) def test_util_ensure_path_succeeds(text): path = util.ensure_path(text) @@ -60,3 +85,9 @@ def test_prefer_gpu(): def test_require_gpu(): with pytest.raises(ValueError): require_gpu() + + +def test_create_symlink_windows(symlink_setup_target, symlink_target, symlink): + assert symlink_target.exists() + symlink_to(symlink, symlink_target) + assert symlink.exists()