mirror of https://github.com/explosion/spaCy.git
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit c087a14380
@@ -87,11 +87,11 @@ U.S. Federal law. Any choice of law rules will not apply.
 7. Please place an “x” on one of the applicable statement below. Please do NOT
 mark both statements:

-* [ ] I am signing on behalf of myself as an individual and no other person
+* [x] I am signing on behalf of myself as an individual and no other person
 or entity, including my employer, has or will have rights with respect to my
 contributions.

-* [ ] I am signing on behalf of my employer or a legal entity and I have the
+* [x] I am signing on behalf of my employer or a legal entity and I have the
 actual authority to contractually bind that entity.

 ## Contributor Details
@@ -218,7 +218,7 @@ then call its ``load()`` method:
 import spacy
 import en_core_web_sm

-nlp = en_core_web_.load()
+nlp = en_core_web_sm.load()
 doc = nlp(u'This is a sentence.')

 📖 **For more info and examples, check out the**
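The corrected line calls the model package's own load() helper. A minimal sketch of the two equivalent ways to load the model, assuming the en_core_web_sm package has already been installed (e.g. with python -m spacy download en_core_web_sm):

    import spacy
    import en_core_web_sm

    # Load via the package's own load() helper ...
    nlp = en_core_web_sm.load()
    # ... or let spacy.load() resolve the installed package by name.
    nlp = spacy.load('en_core_web_sm')

    doc = nlp(u'This is a sentence.')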
setup.py
@@ -192,6 +192,7 @@ def setup_package():
 'thinc>=6.10.1,<6.11.0',
 'plac<1.0.0,>=0.9.6',
 'six',
 'html5lib==1.0b8',
 'pathlib',
 'ujson>=1.35',
 'dill>=0.2,<0.3',
@@ -36,7 +36,7 @@ def init_model(lang, output_dir, freqs_loc, clusters_loc=None, vectors_loc=None,
     vectors_loc = ensure_path(vectors_loc)

     probs, oov_prob = read_freqs(freqs_loc)
-    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else None, None
+    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
     clusters = read_clusters(clusters_loc) if clusters_loc else {}

     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
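The fix is a precedence issue: the conditional expression binds tighter than the trailing comma, so the unparenthesised version builds a 2-tuple whose second element is always None. A small standalone illustration (read_vectors here is a hypothetical stand-in, not the real CLI helper):

    def read_vectors(loc):
        # Hypothetical stand-in that returns a (data, keys) pair.
        return [[1.0, 2.0]], ['word']

    vectors_loc = 'vectors.txt'  # truthy, so read_vectors() gets called

    # Buggy: parsed as ((read_vectors(...) if vectors_loc else None), None), so the
    # whole (data, keys) pair lands in vectors_data and vector_keys is always None.
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else None, None
    assert vector_keys is None

    # Fixed: the parentheses make the else branch an explicit (None, None) pair, and
    # the if branch unpacks the pair returned by read_vectors() as intended.
    vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
    assert vector_keys == ['word']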
@@ -69,7 +69,7 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
         lex_added += 1
     nlp.vocab.cfg.update({'oov_prob': oov_prob})

-    if vectors_data:
+    if len(vectors_data):
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)
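The len() check matters because vectors_data is typically a numpy array at this point, and evaluating a multi-element array in a boolean context raises an error. A short illustration, assuming numpy is installed:

    import numpy

    vectors_data = numpy.zeros((3, 2), dtype='float32')

    try:
        if vectors_data:  # ValueError: truth value of an array with more than
            pass          # one element is ambiguous
    except ValueError:
        pass

    if len(vectors_data):  # fine: checks the number of vector rows instead
        print('have %d vectors' % len(vectors_data))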
@@ -13,6 +13,12 @@ from ...language import Language
 from ...attrs import LANG, NORM
 from ...util import update_exc, add_lookups

+# Borrowing french syntax parser because both languages use
+# universal dependencies for tagging/parsing.
+# Read here for more:
+# https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573
+from .syntax_iterators import SYNTAX_ITERATORS
+

 class NorwegianDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
@@ -22,6 +28,7 @@ class NorwegianDefaults(Language.Defaults):
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
     lemma_lookup = LOOKUP
+    syntax_iterators = SYNTAX_ITERATORS


 class Norwegian(Language):
@@ -0,0 +1,42 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+    """
+    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+    """
+    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
+    doc = obj.doc  # Ensure works on both Doc and Span.
+    np_deps = [doc.vocab.strings[label] for label in labels]
+    conj = doc.vocab.strings.add('conj')
+    np_label = doc.vocab.strings.add('NP')
+    seen = set()
+    for i, word in enumerate(obj):
+        if word.pos not in (NOUN, PROPN, PRON):
+            continue
+        # Prevent nested chunks from being produced
+        if word.i in seen:
+            continue
+        if word.dep in np_deps:
+            if any(w.i in seen for w in word.subtree):
+                continue
+            seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+            yield word.left_edge.i, word.right_edge.i+1, np_label
+        elif word.dep == conj:
+            head = word.head
+            while head.dep == conj and head.head.i < head.i:
+                head = head.head
+            # If the head is an NP, and we're coordinated to it, we're an NP
+            if head.dep in np_deps:
+                if any(w.i in seen for w in word.subtree):
+                    continue
+                seen.update(j for j in range(word.left_edge.i, word.right_edge.i+1))
+                yield word.left_edge.i, word.right_edge.i+1, np_label
+
+
+SYNTAX_ITERATORS = {
+    'noun_chunks': noun_chunks
+}
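Once the iterator is registered on the language defaults (the syntax_iterators = SYNTAX_ITERATORS line above), it is what backs Doc.noun_chunks. A minimal usage sketch, assuming nlp is a loaded pipeline whose dependency parser has run on the text (a trained Norwegian pipeline is an assumption here, not something this diff ships):

    # `nlp` is assumed to be a loaded pipeline with a dependency parser.
    doc = nlp(u'Jeg liker store hunder og katter.')
    for chunk in doc.noun_chunks:
        # Each chunk is a Span covering one base noun phrase.
        print(chunk.text, chunk.root.text, chunk.root.dep_)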
@@ -461,7 +461,8 @@ class Language(object):
             if hasattr(proc, 'begin_training'):
                 proc.begin_training(get_gold_tuples(),
                                     pipeline=self.pipeline,
-                                    sgd=self._optimizer)
+                                    sgd=self._optimizer,
+                                    **cfg)
         return self._optimizer

     def evaluate(self, docs_golds, verbose=False):
@@ -0,0 +1,19 @@
+# coding: utf8
+
+from __future__ import unicode_literals
+from ...language import Language
+
+
+def test_simple_ner():
+    cfg = {
+        'hidden_depth': 2,  # should error out
+    }
+
+    nlp = Language()
+    nlp.add_pipe(nlp.create_pipe('ner'))
+    nlp.get_pipe('ner').add_label('answer')
+    try:
+        nlp.begin_training(**cfg)
+        assert False  # should error out
+    except ValueError:
+        assert True
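The committed test asserts via try/except that an unsupported config key raises ValueError once **cfg is forwarded. An equivalent standalone sketch using the pytest.raises idiom (with an absolute import in place of the test suite's relative one):

    import pytest
    from spacy.language import Language

    def test_simple_ner_rejects_unknown_cfg():
        nlp = Language()
        nlp.add_pipe(nlp.create_pipe('ner'))
        nlp.get_pipe('ner').add_label('answer')
        with pytest.raises(ValueError):
            nlp.begin_training(hidden_depth=2)  # unsupported key should error out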
@@ -245,7 +245,9 @@ p Check whether an extension has been registered on the #[code Doc] class.
 +tag method
 +tag-new(2)

-p Create a #[code Span] object from the slice #[code doc.text[start : end]].
+p
+  | Create a #[code Span] object from the slice #[code doc.text[start : end]].
+  | Returns #[code None] if the character indices don't map to a valid span.

 +aside-code("Example").
     doc = nlp(u'I like New York')
@@ -276,7 +278,7 @@ p Create a #[code Span] object from the slice #[code doc.text[start : end]].
 +row("foot")
     +cell returns
     +cell #[code Span]
-    +cell The newly constructed object.
+    +cell The newly constructed object or #[code None].

 +h(2, "similarity") Doc.similarity
 +tag method
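A short sketch of the behaviour these two documentation changes describe: Doc.char_span() returns a Span when the character offsets line up with token boundaries and None otherwise (assuming nlp is a loaded English pipeline):

    # `nlp` is assumed to be a loaded English pipeline.
    doc = nlp(u'I like New York')
    span = doc.char_span(7, 15)           # aligns with the tokens 'New York'
    assert span.text == 'New York'
    assert doc.char_span(8, 15) is None   # offsets that cut into a token return None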
@@ -185,7 +185,7 @@ p

 p
     | Install a version of the
-    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Bulild Tools] or
+    | #[+a("http://landinghub.visualstudio.com/visual-cpp-build-tools") Visual C++ Build Tools] or
     | #[+a("https://www.visualstudio.com/vs/visual-studio-express/") Visual Studio Express]
     | that matches the version that was used to compile your Python
     | interpreter. For official distributions these are:
@@ -74,7 +74,8 @@ p
     | #[+a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet].
     | Simply convert the dependency parse or recognised entities to displaCy's
     | format and set #[code manual=True] on either #[code render()] or
-    | #[code serve()].
+    | #[code serve()]. When setting #[code ents] manually, make sure to supply
+    | them in the right order, i.e. starting with the lowest start position.

 +aside-code("Example").
     ex = [{'text': 'But Google is starting from behind.',
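A minimal sketch of the manual-rendering path the added sentence refers to: entities are passed as character-offset dicts and should be supplied sorted by their start position (the offsets and label below are illustrative, not taken from the docs example):

    from spacy import displacy

    ex = {
        'text': 'But Google is starting from behind.',
        'ents': [{'start': 4, 'end': 10, 'label': 'ORG'}],  # keep sorted by 'start'
        'title': None,
    }
    html = displacy.render(ex, style='ent', manual=True)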