Tidy up merge conflict leftovers

Ines Montani 2018-12-18 13:58:30 +01:00
parent 61d09c481b
commit ae880ef912
9 changed files with 3 additions and 113 deletions

View File

@@ -1,11 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
import re
from ...gold import iob_to_biluo
from ...util import minibatch
import re
def iob2json(input_data, n_sents=10, *args, **kwargs):
"""

View File

@@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
# stop words from HAZM package
# Stop words from HAZM package
STOP_WORDS = set(

View File

@@ -1,10 +1,3 @@
"""
Slang and abbreviations
Daftar kosakata yang sering salah dieja
https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
"""
# coding: utf8
from __future__ import unicode_literals

View File

@@ -1,6 +1,3 @@
"""
List of stop words in Bahasa Indonesia.
"""
# coding: utf8
from __future__ import unicode_literals

View File

@@ -1,7 +1,3 @@
"""
Daftar singkatan dan Akronim dari:
https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
"""
# coding: utf8
from __future__ import unicode_literals

View File

@@ -291,8 +291,6 @@ cdef char get_quantifier(PatternStateC state) nogil:
DEF PADDING = 5
DEF PADDING = 5
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
object token_specs) except NULL:

View File

@@ -53,24 +53,7 @@ def test_spans_merge_heads(en_tokenizer):
def test_spans_merge_non_disjoint(en_tokenizer):
text = "Los Angeles start."
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens])
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.merge(
doc[0:2],
attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
)
retokenizer.merge(
doc[0:1],
attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
)
def test_spans_merge_non_disjoint(en_tokenizer):
text = "Los Angeles start."
tokens = en_tokenizer(text)
doc = get_doc(tokens.vocab, [t.text for t in tokens])
doc = en_tokenizer(text)
with pytest.raises(ValueError):
with doc.retokenize() as retokenizer:
retokenizer.merge(

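The two copies of test_spans_merge_non_disjoint above differ only in how the Doc is built (get_doc vs. calling the tokenizer directly). For reference, a minimal standalone sketch of the behaviour the test checks, that merging overlapping spans raises a ValueError, written against the spaCy 2.x retokenizer API with a plain English() pipeline instead of the suite's en_tokenizer fixture:

import pytest
from spacy.lang.en import English

def test_merge_overlapping_spans_raises():
    nlp = English()
    doc = nlp("Los Angeles start.")
    # Overlapping merges inside a single retokenize() block are rejected.
    with pytest.raises(ValueError):
        with doc.retokenize() as retokenizer:
            retokenizer.merge(doc[0:2], attrs={"lemma": "Los Angeles"})
            retokenizer.merge(doc[0:1], attrs={"lemma": "Los Angeles"})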
View File

@@ -1,36 +0,0 @@
'''Test issue that arises when too many labels are added to NER model.'''
from __future__ import unicode_literals
import random
from ...lang.en import English
def train_model(train_data, entity_types):
nlp = English(pipeline=[])
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
for entity_type in list(entity_types):
ner.add_label(entity_type)
optimizer = nlp.begin_training()
# Start training
for i in range(20):
losses = {}
index = 0
random.shuffle(train_data)
for statement, entities in train_data:
nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)
return nlp
def test_train_with_many_entity_types():
train_data = []
train_data.extend([("One sentence", {"entities": []})])
entity_types = [str(i) for i in range(1000)]
model = train_model(train_data, entity_types)

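Condensed to its essentials, the removed test above adds a large number of labels to a fresh NER component and runs a short training loop. A sketch of that scenario against the spaCy 2.x API, with the label count and dummy training data taken from the removed file:

import random
from spacy.lang.en import English

def train_with_many_labels(n_labels=1000):
    # Blank English pipeline with a freshly created NER component.
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for i in range(n_labels):
        ner.add_label(str(i))
    optimizer = nlp.begin_training()
    train_data = [("One sentence", {"entities": []})]
    for _ in range(20):
        losses = {}
        random.shuffle(train_data)
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses, drop=0.5)
    return nlp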
View File

@@ -1,40 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
import os
from pathlib import Path
from ..compat import symlink_to, symlink_remove, path2str
def target_local_path():
return "./foo-target"
def link_local_path():
return "./foo-symlink"
@pytest.fixture(scope="function")
def setup_target(request):
target = Path(target_local_path())
if not target.exists():
os.mkdir(path2str(target))
# yield -- need to cleanup even if assertion fails
# https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240
def cleanup():
symlink_remove(Path(link_local_path()))
os.rmdir(target_local_path())
request.addfinalizer(cleanup)
def test_create_symlink_windows(setup_target):
target = Path(target_local_path())
link = Path(link_local_path())
assert target.exists()
symlink_to(link, target)
assert link.exists()
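The removed symlink test registers its cleanup with request.addfinalizer rather than after a yield, per the comment in the file, so the target directory and link are removed even if an assertion fails. A minimal self-contained illustration of that fixture pattern, using a hypothetical marker file under pytest's built-in tmp_path:

import pytest

@pytest.fixture
def marker_file(request, tmp_path):
    # Create the file up front and register a finalizer; pytest runs it
    # during teardown regardless of whether the test body passed.
    marker = tmp_path / "marker.txt"
    marker.touch()

    def cleanup():
        if marker.exists():
            marker.unlink()

    request.addfinalizer(cleanup)
    return marker

def test_marker_created(marker_file):
    assert marker_file.exists()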