2019-07-12 08:01:35 +00:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import spacy
|
|
|
|
from spacy.util import minibatch, compounding
|
|
|
|
|
|
|
|
|
|
|
|
def test_issue3611():
    """Regression test for issue 3611.

    Trains a bag-of-words text categorizer configured with ``ngram_size=2``
    on a tiny dataset that includes the very short text ``"inoff"``, i.e. a
    doc intended to be shorter than the n-gram size. The test contains no
    assertions: it passes as long as the training loop runs without raising.
    """
    labels = ["offensive", "inoffensive"]
    texts = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    gold_labels = ["offensive", "offensive", "inoffensive"]

    # Build (text, annotations) pairs; each annotation maps every known label
    # to True/False depending on whether it is the gold label for that text.
    train_data = [
        (text, {"cats": {label: label == gold for label in labels}})
        for text, gold in zip(texts, gold_labels)
    ]

    # Blank English pipeline with a single text categorizer component.
    nlp = spacy.blank("en")
    textcat_config = {
        "exclusive_classes": True,
        "architecture": "bow",
        "ngram_size": 2,
    }
    textcat = nlp.create_pipe("textcat", config=textcat_config)
    for label in labels:
        textcat.add_label(label)
    nlp.add_pipe(textcat, last=True)

    # Train only the textcat: every other pipe is disabled for the duration.
    other_pipes = [name for name in nlp.pipe_names if name != "textcat"]
    with nlp.disable_pipes(other_pipes):
        optimizer = nlp.begin_training()
        for _epoch in range(3):
            losses = {}
            # A fresh batch-size schedule is created for every epoch.
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(train_data, size=batch_sizes):
                nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
|