mirror of https://github.com/explosion/spaCy.git
Tidy up merge conflict leftovers
parent 61d09c481b
commit ae880ef912
@@ -1,11 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals
 
+import re
+
 from ...gold import iob_to_biluo
 from ...util import minibatch
 
-import re
-
 
 def iob2json(input_data, n_sents=10, *args, **kwargs):
     """
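For context on the reordered imports: `iob_to_biluo` rewrites IOB entity tags as BILUO tags, and `minibatch` groups an iterable into fixed-size chunks, which is how the converter packs `n_sents` sentences per output document. A minimal sketch of both helpers, assuming spaCy v2.x import paths; the tag sequence and sentence list are made up:

    from spacy.gold import iob_to_biluo
    from spacy.util import minibatch

    # Made-up IOB tags for "Los Angeles is sunny": the final token of the
    # entity becomes L-GPE in the BILUO scheme.
    print(iob_to_biluo(["B-GPE", "I-GPE", "O", "O"]))  # ['B-GPE', 'L-GPE', 'O', 'O']

    # Group sentences into batches of n_sents, as the converter does per doc.
    sentences = ["sentence %d" % i for i in range(25)]
    for batch in minibatch(sentences, size=10):
        print(len(batch))  # 10, 10, 5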
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-# stop words from HAZM package
 
 # Stop words from HAZM package
 STOP_WORDS = set(
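Assuming this hunk is the Persian (`fa`) stop-word module, the set it defines is what spaCy exposes through the lexical `is_stop` flag. A minimal sketch of how the list is consumed; the example word is an assumption:

    from spacy.lang.fa.stop_words import STOP_WORDS

    # The vocabulary marks these entries via token.is_stop / lexeme.is_stop.
    print(len(STOP_WORDS))
    print("و" in STOP_WORDS)  # "and", a common Persian function word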
@@ -1,10 +1,3 @@
-"""
-Slang and abbreviations
-
-Daftar kosakata yang sering salah dieja
-https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja
-
-"""
 # coding: utf8
 from __future__ import unicode_literals
 
@@ -1,6 +1,3 @@
-"""
-List of stop words in Bahasa Indonesia.
-"""
 # coding: utf8
 from __future__ import unicode_literals
 
@@ -1,7 +1,3 @@
-"""
-Daftar singkatan dan Akronim dari:
-https://id.wiktionary.org/wiki/Wiktionary:Daftar_singkatan_dan_akronim_bahasa_Indonesia#A
-"""
 # coding: utf8
 from __future__ import unicode_literals
 
@@ -291,8 +291,6 @@ cdef char get_quantifier(PatternStateC state) nogil:
 
 DEF PADDING = 5
 
-DEF PADDING = 5
-
 
 cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
                                  object token_specs) except NULL:
@@ -53,24 +53,7 @@ def test_spans_merge_heads(en_tokenizer):
 
-def test_spans_merge_non_disjoint(en_tokenizer):
-    text = "Los Angeles start."
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens])
-    with pytest.raises(ValueError):
-        with doc.retokenize() as retokenizer:
-            retokenizer.merge(
-                doc[0:2],
-                attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
-            )
-            retokenizer.merge(
-                doc[0:1],
-                attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
-            )
-
-
 def test_spans_merge_non_disjoint(en_tokenizer):
     text = "Los Angeles start."
-    tokens = en_tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens])
+    doc = en_tokenizer(text)
     with pytest.raises(ValueError):
         with doc.retokenize() as retokenizer:
             retokenizer.merge(
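The copy of the test that survives asserts that two merges over overlapping spans inside one `doc.retokenize()` block raise a `ValueError`. A minimal standalone sketch of that behaviour, using a blank English pipeline instead of the `en_tokenizer` fixture (spaCy v2.1+ assumed):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("Los Angeles start.")
    try:
        with doc.retokenize() as retokenizer:
            retokenizer.merge(
                doc[0:2],
                attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
            )
            # doc[0:1] overlaps doc[0:2], so the retokenizer rejects this merge.
            retokenizer.merge(
                doc[0:1],
                attrs={"tag": "NNP", "lemma": "Los Angeles", "ent_type": "GPE"},
            )
    except ValueError as err:
        print("overlapping merges rejected:", err)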
@@ -1,36 +0,0 @@
-'''Test issue that arises when too many labels are added to NER model.'''
-from __future__ import unicode_literals
-
-import random
-from ...lang.en import English
-
-
-def train_model(train_data, entity_types):
-    nlp = English(pipeline=[])
-
-    ner = nlp.create_pipe("ner")
-    nlp.add_pipe(ner)
-
-    for entity_type in list(entity_types):
-        ner.add_label(entity_type)
-
-    optimizer = nlp.begin_training()
-
-    # Start training
-    for i in range(20):
-        losses = {}
-        index = 0
-        random.shuffle(train_data)
-
-        for statement, entities in train_data:
-            nlp.update([statement], [entities], sgd=optimizer, losses=losses, drop=0.5)
-    return nlp
-
-
-def test_train_with_many_entity_types():
-    train_data = []
-    train_data.extend([("One sentence", {"entities": []})])
-    entity_types = [str(i) for i in range(1000)]
-
-    model = train_model(train_data, entity_types)
-
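The deleted regression test follows the usual v2 add-label-then-train pattern. Condensed into a standalone form, with absolute imports and the `pipeline=[]` argument dropped in favour of the plain constructor (both assumptions on my part), the scenario it covered looks roughly like this:

    import random
    from spacy.lang.en import English

    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    for label in (str(i) for i in range(1000)):  # the "too many labels" case
        ner.add_label(label)

    optimizer = nlp.begin_training()
    train_data = [("One sentence", {"entities": []})]
    for _ in range(20):
        losses = {}
        random.shuffle(train_data)
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses, drop=0.5)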
@@ -1,40 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import pytest
-import os
-from pathlib import Path
-
-from ..compat import symlink_to, symlink_remove, path2str
-
-
-def target_local_path():
-    return "./foo-target"
-
-
-def link_local_path():
-    return "./foo-symlink"
-
-
-@pytest.fixture(scope="function")
-def setup_target(request):
-    target = Path(target_local_path())
-    if not target.exists():
-        os.mkdir(path2str(target))
-
-    # yield -- need to cleanup even if assertion fails
-    # https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240
-    def cleanup():
-        symlink_remove(Path(link_local_path()))
-        os.rmdir(target_local_path())
-
-    request.addfinalizer(cleanup)
-
-
-def test_create_symlink_windows(setup_target):
-    target = Path(target_local_path())
-    link = Path(link_local_path())
-    assert target.exists()
-
-    symlink_to(link, target)
-    assert link.exists()