mirror of https://github.com/explosion/spaCy.git
101 lines
3.7 KiB
Python
101 lines
3.7 KiB
Python
from pathlib import Path
|
|
from typing import Optional, Callable, Iterable, List, Tuple
|
|
from thinc.types import Floats2d
|
|
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
|
|
from thinc.api import Model, Maxout, Linear, noop, tuplify, Ragged
|
|
|
|
from ...util import registry
|
|
from ...kb import KnowledgeBase, Candidate, get_candidates
|
|
from ...vocab import Vocab
|
|
from ...tokens import Span, Doc
|
|
from ..extract_spans import extract_spans
|
|
from ...errors import Errors
|
|
|
|
|
|
@registry.architectures("spacy.EntityLinker.v2")
|
|
def build_nel_encoder(
|
|
tok2vec: Model, nO: Optional[int] = None
|
|
) -> Model[List[Doc], Floats2d]:
|
|
with Model.define_operators({">>": chain, "&": tuplify}):
|
|
token_width = tok2vec.maybe_get_dim("nO")
|
|
output_layer = Linear(nO=nO, nI=token_width)
|
|
model = (
|
|
((tok2vec >> list2ragged()) & build_span_maker())
|
|
>> extract_spans()
|
|
>> reduce_mean()
|
|
>> residual(Maxout(nO=token_width, nI=token_width, nP=2, dropout=0.0)) # type: ignore[arg-type]
|
|
>> output_layer
|
|
)
|
|
model.set_ref("output_layer", output_layer)
|
|
model.set_ref("tok2vec", tok2vec)
|
|
# flag to show this isn't legacy
|
|
model.attrs["include_span_maker"] = True
|
|
return model
|
|
|
|
|
|
def build_span_maker(n_sents: int = 0) -> Model:
|
|
model: Model = Model("span_maker", forward=span_maker_forward)
|
|
model.attrs["n_sents"] = n_sents
|
|
return model
|
|
|
|
|
|
def span_maker_forward(model, docs: List[Doc], is_train) -> Tuple[Ragged, Callable]:
|
|
ops = model.ops
|
|
n_sents = model.attrs["n_sents"]
|
|
candidates = []
|
|
for doc in docs:
|
|
cands = []
|
|
try:
|
|
sentences = [s for s in doc.sents]
|
|
except ValueError:
|
|
# no sentence info, normal in initialization
|
|
for tok in doc:
|
|
tok.is_sent_start = tok.i == 0
|
|
sentences = [doc[:]]
|
|
for ent in doc.ents:
|
|
try:
|
|
# find the sentence in the list of sentences.
|
|
sent_index = sentences.index(ent.sent)
|
|
except AttributeError:
|
|
# Catch the exception when ent.sent is None and provide a user-friendly warning
|
|
raise RuntimeError(Errors.E030) from None
|
|
# get n previous sentences, if there are any
|
|
start_sentence = max(0, sent_index - n_sents)
|
|
# get n posterior sentences, or as many < n as there are
|
|
end_sentence = min(len(sentences) - 1, sent_index + n_sents)
|
|
# get token positions
|
|
start_token = sentences[start_sentence].start
|
|
end_token = sentences[end_sentence].end
|
|
# save positions for extraction
|
|
cands.append((start_token, end_token))
|
|
|
|
candidates.append(ops.asarray2i(cands))
|
|
candlens = ops.asarray1i([len(cands) for cands in candidates])
|
|
candidates = ops.xp.concatenate(candidates)
|
|
outputs = Ragged(candidates, candlens)
|
|
# because this is just rearranging docs, the backprop does nothing
|
|
return outputs, lambda x: []
|
|
|
|
|
|
@registry.misc("spacy.KBFromFile.v1")
|
|
def load_kb(kb_path: Path) -> Callable[[Vocab], KnowledgeBase]:
|
|
def kb_from_file(vocab):
|
|
kb = KnowledgeBase(vocab, entity_vector_length=1)
|
|
kb.from_disk(kb_path)
|
|
return kb
|
|
|
|
return kb_from_file
|
|
|
|
|
|
@registry.misc("spacy.EmptyKB.v1")
|
|
def empty_kb(entity_vector_length: int) -> Callable[[Vocab], KnowledgeBase]:
|
|
def empty_kb_factory(vocab):
|
|
return KnowledgeBase(vocab=vocab, entity_vector_length=entity_vector_length)
|
|
|
|
return empty_kb_factory
|
|
|
|
|
|
@registry.misc("spacy.CandidateGenerator.v1")
|
|
def create_candidates() -> Callable[[KnowledgeBase, Span], Iterable[Candidate]]:
|
|
return get_candidates
|