mirror of https://github.com/explosion/spaCy.git
Remove trailing whitespace
This commit is contained in:
parent
5f0f940a1f
commit
3a8d9b37a6
|
@ -30,5 +30,3 @@ def main(text_loc):
|
|||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
||||
|
||||
|
||||
|
|
|
@ -132,7 +132,7 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, out_loc="", verbose=False,
|
|||
print 'NER P', scorer.ents_p
|
||||
print 'NER R', scorer.ents_r
|
||||
print 'NER F', scorer.ents_f
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
||||
|
|
|
@ -7,7 +7,7 @@ from spacy.vocab import write_binary_vectors
|
|||
|
||||
def main(in_loc, out_loc):
|
||||
write_binary_vectors(in_loc, out_loc)
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
plac.call(main)
|
||||
|
|
|
@ -11,5 +11,3 @@ The CLA must be signed on your first pull request. To do this, simply fill in th
|
|||
$ git add -A spaCy/contributors/<your GitHub username>.md
|
||||
|
||||
Now finish your pull request, and you're done.
|
||||
|
||||
|
||||
|
|
|
@ -102,7 +102,7 @@ exts = [
|
|||
Extension("spacy.syntax.arc_eager", ["spacy/syntax/arc_eager.pyx"], **ext_args),
|
||||
Extension("spacy.syntax._parse_features", ["spacy/syntax/_parse_features.pyx"],
|
||||
**ext_args)
|
||||
|
||||
|
||||
#Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
|
||||
#Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
|
||||
#Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
|
||||
|
|
|
@ -28,7 +28,7 @@ API
|
|||
|
||||
|
||||
.. autoclass:: spacy.tokens.Tokens
|
||||
|
||||
|
||||
+---------------+-------------+-------------+
|
||||
| Attribute | Type | Attr API |
|
||||
+===============+=============+=============+
|
||||
|
@ -48,7 +48,7 @@ API
|
|||
For faster access, the underlying C data can be accessed from Cython. You
|
||||
can also export the data to a numpy array, via `Tokens.to_array`, if pure Python
|
||||
access is required, and you need slightly better performance. However, this
|
||||
is both slower and has a worse API than Cython access.
|
||||
is both slower and has a worse API than Cython access.
|
||||
|
||||
|
||||
.. autoclass:: spacy.tokens.Token
|
||||
|
@ -119,7 +119,7 @@ API
|
|||
|
||||
shape
|
||||
A transform of the word's string, to show orthographic features. The
|
||||
characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d.
|
||||
characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d.
|
||||
After these mappings, sequences of 4 or more of the same character are
|
||||
truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
|
||||
:) --> :)
|
||||
|
@ -161,7 +161,7 @@ API
|
|||
pos
|
||||
A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB,
|
||||
ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech.
|
||||
|
||||
|
||||
dep
|
||||
The type of syntactic dependency relation between the word and its
|
||||
syntactic head.
|
||||
|
@ -185,10 +185,10 @@ API
|
|||
|
||||
rights
|
||||
An iterator for the immediate rightward syntactic children of the word.
|
||||
|
||||
|
||||
children
|
||||
An iterator that yields from lefts, and then yields from rights.
|
||||
|
||||
|
||||
subtree
|
||||
An iterator for the part of the sentence syntactically governed by the
|
||||
word, including the word itself.
|
||||
|
@ -205,15 +205,15 @@ API
|
|||
.. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None)
|
||||
|
||||
.. py:method:: __len__(self) --> int
|
||||
|
||||
|
||||
.. py:method:: __getitem__(self, id: int) --> unicode
|
||||
|
||||
|
||||
.. py:method:: __getitem__(self, string: unicode) --> int
|
||||
|
||||
|
||||
.. py:method:: __setitem__(self, py_str: unicode, props: Dict[str, int[float]) --> None
|
||||
|
||||
.. py:method:: dump(self, loc: unicode) --> None
|
||||
|
||||
|
||||
.. py:method:: load_lexemes(self, loc: unicode) --> None
|
||||
|
||||
.. py:method:: load_vectors(self, loc: unicode) --> None
|
||||
|
@ -223,9 +223,9 @@ API
|
|||
.. py:method:: __len__(self) --> int
|
||||
|
||||
.. py:method:: __getitem__(self, id: int) --> unicode
|
||||
|
||||
|
||||
.. py:method:: __getitem__(self, string: bytes) --> id
|
||||
|
||||
|
||||
.. py:method:: __getitem__(self, string: unicode) --> id
|
||||
|
||||
.. py:method:: dump(self, loc: unicode) --> None
|
||||
|
|
|
@ -75,4 +75,3 @@ Boolean features
|
|||
+-------------+--------------------------------------------------------------+
|
||||
| IN_LIST | Facility for loading arbitrary run-time word lists? |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
|
||||
|
|
|
@ -68,4 +68,3 @@ Cons:
|
|||
- Higher memory usage (up to 1gb)
|
||||
- More conceptually complicated
|
||||
- Tokenization rules expressed in code, not as data
|
||||
|
||||
|
|
|
@ -122,7 +122,7 @@ it is, we stop splitting, and return the tokenization at that point.
|
|||
The advantage of this design is that the prefixes, suffixes and special-cases
|
||||
can be declared separately, in easy-to-understand files. If a new entry is
|
||||
added to the special-cases, you can be sure that it won't have some unforeseen
|
||||
consequence to a complicated regular-expression grammar.
|
||||
consequence to a complicated regular-expression grammar.
|
||||
|
||||
Coupling the Tokenizer and Lexicon
|
||||
##################################
|
||||
|
@ -159,7 +159,7 @@ Dependency Parser
|
|||
|
||||
The parser uses the algorithm described in my `2014 blog post`_.
|
||||
This algorithm, shift-reduce dependency parsing, is becoming widely adopted due
|
||||
to its compelling speed/accuracy trade-off.
|
||||
to its compelling speed/accuracy trade-off.
|
||||
|
||||
Some quick details about spaCy's take on this, for those who happen to know
|
||||
these models well. I'll write up a better description shortly.
|
||||
|
@ -176,7 +176,7 @@ scored 91.0. So how have I gotten it to 92.4? The following tweaks:
|
|||
1. I use Brown cluster features --- these help a lot;
|
||||
2. I redesigned the feature set. I've long known that the Zhang and Nivre
|
||||
(2011) feature set was suboptimal, but a few features don't make a very
|
||||
compelling publication. Still, they're important.
|
||||
compelling publication. Still, they're important.
|
||||
3. When I do the dynamic oracle training, I also make
|
||||
the update cost-sensitive: if the oracle determines that the move the parser
|
||||
took has a cost of N, then the weights for the gold class are incremented by
|
||||
|
@ -253,12 +253,10 @@ the classes. In the case of the parser, this means the hash table is accessed
|
|||
2NKC times, instead of the 2NK times if you have a weights vector. You should
|
||||
also be careful to store the weights contiguously in memory --- you don't want
|
||||
a linked list here. I use a block-sparse format, because my problems tend to
|
||||
have a few dozen classes.
|
||||
have a few dozen classes.
|
||||
|
||||
I guess if I had to summarize my experience, I'd say that the efficiency of
|
||||
these models is really all about the data structures. We want to stay small,
|
||||
and stay contiguous. Minimize redundancy and minimize pointer chasing.
|
||||
That's why Cython is so well suited to this: we get to lay out our data
|
||||
structures, and manage the memory ourselves, with full C-level control.
|
||||
|
||||
|
||||
|
|
|
@ -65,7 +65,7 @@ it, link it, filter it, categorise it, generate it and correct it.
|
|||
|
||||
spaCy provides a library of utility functions that help programmers build such
|
||||
products. It's commercial open source software: you can either use it under
|
||||
the AGPL, or you can `buy a commercial license`_ for a one-time fee.
|
||||
the AGPL, or you can `buy a commercial license`_ for a one-time fee.
|
||||
|
||||
.. _buy a commercial license: license.html
|
||||
|
||||
|
@ -148,7 +148,7 @@ cosine metric:
|
|||
|
||||
>>> from numpy import dot
|
||||
>>> from numpy.linalg import norm
|
||||
|
||||
|
||||
>>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
>>> words = [w for w in nlp.vocab if w.has_repvec]
|
||||
>>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
|
||||
|
@ -200,7 +200,7 @@ this:
|
|||
|
||||
|
||||
|
||||
We wanted to refine the logic so that only adverbs modifying evocative verbs
|
||||
We wanted to refine the logic so that only adverbs modifying evocative verbs
|
||||
of communication, like "pleaded", were highlighted. We've now built a vector that
|
||||
represents that type of word, so now we can highlight adverbs based on very
|
||||
subtle logic, honing in on adverbs that seem the most stylistically
|
||||
|
@ -213,7 +213,7 @@ problematic, given our starting assumptions:
|
|||
>>> from spacy.parts_of_speech import ADV, VERB
|
||||
>>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
>>> def is_bad_adverb(token, target_verb, tol):
|
||||
... if token.pos != ADV:
|
||||
... if token.pos != ADV:
|
||||
... return False
|
||||
... elif token.head.pos != VERB:
|
||||
... return False
|
||||
|
@ -238,11 +238,11 @@ database, and processed with an NLP library, to one of three levels of detail
|
|||
--- tokenization, tagging, or parsing. The tasks are additive: to parse the
|
||||
text you have to tokenize and tag it. The pre-processing was not subtracted
|
||||
from the times --- I report the time required for the pipeline to complete.
|
||||
I report mean times per document, in milliseconds.
|
||||
I report mean times per document, in milliseconds.
|
||||
|
||||
**Hardware**: Intel i7-3770 (2012)
|
||||
|
||||
.. table:: Efficiency comparison. Lower is better.
|
||||
.. table:: Efficiency comparison. Lower is better.
|
||||
|
||||
+--------------+---------------------------+--------------------------------+
|
||||
| | Absolute (ms per doc) | Relative (to spaCy) |
|
||||
|
@ -287,7 +287,7 @@ representations.
|
|||
.. spaCy is based on science, not alchemy. It's open source, and I am happy to
|
||||
clarify any detail of the algorithms I've implemented.
|
||||
It's evaluated against the current best published systems, following the standard
|
||||
methodologies. These evaluations show that it performs extremely well.
|
||||
methodologies. These evaluations show that it performs extremely well.
|
||||
|
||||
Accuracy Comparison
|
||||
-------------------
|
||||
|
@ -299,7 +299,7 @@ Accuracy Comparison
|
|||
+--------------+----------+------------+
|
||||
| spaCy | 97.2 | 92.4 |
|
||||
+--------------+----------+------------+
|
||||
| CoreNLP | 96.9 | 92.2 |
|
||||
| CoreNLP | 96.9 | 92.2 |
|
||||
+--------------+----------+------------+
|
||||
| ZPar | 97.3 | 92.9 |
|
||||
+--------------+----------+------------+
|
||||
|
@ -329,5 +329,5 @@ previous fastest parser that I'm aware of.
|
|||
quickstart.rst
|
||||
api.rst
|
||||
howworks.rst
|
||||
license.rst
|
||||
license.rst
|
||||
updates.rst
|
||||
|
|
|
@ -97,7 +97,7 @@ like lead-text take a while to float up the priority list. This strategy also h
|
|||
the advantage of transparency: it's obvious to users how the decision is being
|
||||
made, so nobody is likely to complain about the feature if it works this way.
|
||||
|
||||
Instead of cutting off the text mid-word, we can tokenize the text, and
|
||||
Instead of cutting off the text mid-word, we can tokenize the text, and
|
||||
|
||||
+----------------+-----------+
|
||||
| System | Rouge-1 R |
|
||||
|
@ -116,7 +116,7 @@ A simple bag-of-words model can be created using the `count_by` method, which
|
|||
produces a dictionary of frequencies, keyed by string IDs:
|
||||
|
||||
.. code:: python
|
||||
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> from spacy.en.attrs import SIC
|
||||
>>> nlp = English()
|
||||
|
@ -148,7 +148,7 @@ from any token:
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/
|
||||
|
||||
|
@ -196,8 +196,8 @@ undirected --- so, it's natural to represent this as a matrix:
|
|||
|
||||
from scipy.spatial.distance import cosine
|
||||
import numpy
|
||||
|
||||
|
||||
|
||||
|
||||
def lexrank(sent_vectors):
|
||||
n = len(sent_vectors)
|
||||
# Build the cosine similarity matrix
|
||||
|
@ -205,7 +205,7 @@ undirected --- so, it's natural to represent this as a matrix:
|
|||
for i in range(n):
|
||||
for j in range(n):
|
||||
matrix[i, j] = cosine(sent_vectors[i], sent_vectors[j])
|
||||
# Normalize
|
||||
# Normalize
|
||||
for i in range(n):
|
||||
matrix[i] /= sum(matrix[i])
|
||||
return _pagerank(matrix)
|
||||
|
@ -278,6 +278,3 @@ sentence represents the document as a whole.
|
|||
|
||||
Document Model
|
||||
--------------
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@ I've been writing spaCy for six months now, and I'm very excited to release it.
|
|||
I think it's the most valuable thing I could have built. When I was in
|
||||
academia, I noticed that small companies couldn't really make use of our work.
|
||||
Meanwhile the tech giants have been hiring *everyone*, and putting this stuff
|
||||
into production. I think spaCy can change that.
|
||||
into production. I think spaCy can change that.
|
||||
|
||||
|
||||
+------------+-----------+----------+-------------------------------------+
|
||||
|
@ -52,14 +52,14 @@ Examples
|
|||
--------
|
||||
|
||||
In order to clarify how spaCy's license structure might apply to you, I've
|
||||
written a few examples, in the form of user-stories.
|
||||
written a few examples, in the form of user-stories.
|
||||
|
||||
Ashley and Casey: Seed stage start-up
|
||||
#####################################
|
||||
|
||||
Ashley and Casey have an idea for a start-up. To explore their idea, they want
|
||||
to build a minimum viable product they can put in front of potential users and
|
||||
investors.
|
||||
investors.
|
||||
|
||||
They have two options.
|
||||
|
||||
|
@ -75,7 +75,7 @@ They have two options.
|
|||
import a module that imports it, etc). They also cannot use spaCy as
|
||||
a network resource, by running it as a service --- this is the
|
||||
loophole that the "A" part of the AGPL is designed to close.
|
||||
|
||||
|
||||
Ashley and Casey find the AGPL license unattractive for commercial use.
|
||||
They decide to take up the trial commercial license.
|
||||
However, over the next 90 days, Ashley has to move house twice, and Casey gets
|
||||
|
|
|
@ -18,7 +18,7 @@ With Python 2.7 or Python 3, using Linux or OSX, run:
|
|||
.. _300 mb of data: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_data_all-0.4.tgz
|
||||
|
||||
|
||||
The download command fetches and installs about 300mb of data, for the
|
||||
The download command fetches and installs about 300mb of data, for the
|
||||
parser model and word vectors, which it installs within the spacy.en package directory.
|
||||
|
||||
If you're stuck using a server with an old version of Python, and you don't
|
||||
|
@ -88,7 +88,7 @@ the original orthographic form of the word.
|
|||
|
||||
.. py:class:: spacy.en.English(self, data_dir=join(dirname(__file__), 'data'))
|
||||
|
||||
.. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Tokens
|
||||
.. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Tokens
|
||||
|
||||
+-----------------+--------------+--------------+
|
||||
| Attribute | Type | Its API |
|
||||
|
@ -121,7 +121,7 @@ the original orthographic form of the word.
|
|||
**Get sentence or named entity spans**
|
||||
|
||||
.. py:attribute:: tokens.Tokens.sents --> Iterator[Span]
|
||||
|
||||
|
||||
.. py:attribute:: tokens.Tokens.ents --> Iterator[Span]
|
||||
|
||||
You can iterate over a Span to access individual Tokens, or access its
|
||||
|
@ -131,7 +131,7 @@ the original orthographic form of the word.
|
|||
**Embedded word representations**
|
||||
|
||||
.. py:attribute:: tokens.Token.repvec
|
||||
|
||||
|
||||
.. py:attribute:: lexeme.Lexeme.repvec
|
||||
|
||||
|
||||
|
@ -150,13 +150,13 @@ the original orthographic form of the word.
|
|||
**Align to original string**
|
||||
|
||||
.. py:attribute:: string: unicode
|
||||
|
||||
|
||||
Padded with original whitespace.
|
||||
|
||||
.. py:attribute:: length: int
|
||||
|
||||
Length, in unicode code-points. Equal to len(self.orth_).
|
||||
|
||||
|
||||
.. py:attribute:: idx: int
|
||||
|
||||
Starting offset of word in the original string.
|
||||
|
@ -234,4 +234,3 @@ Features
|
|||
+---------+-----------------------------------------------------------+
|
||||
| prob | Log probability of word, smoothed with Simple Good-Turing |
|
||||
+---------+-----------------------------------------------------------+
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ Bug Fixes
|
|||
all look-ups into the vocabulary failed on wide unicode builds, which
|
||||
further meant that the part-of-speech tagger and parser features were not
|
||||
computed correctly.
|
||||
|
||||
|
||||
The fix is simple: we already have to read in a list of all the strings, so
|
||||
just store an index into that list, instead of a hash.
|
||||
|
||||
|
@ -36,7 +36,7 @@ Bug Fixes
|
|||
and we want to freely navigate up and down it without creating reference
|
||||
cycles that inhibit garbage collection, and without doing a lot of copying,
|
||||
creating and deleting.
|
||||
|
||||
|
||||
I think I've got a promising solution to this, but I suspect there's
|
||||
currently a memory leak. Please get in touch on the tracker if you want to
|
||||
know more, especially if you think you can help.
|
||||
|
@ -60,7 +60,7 @@ Most English parsing research is performed on text with perfect pre-processing:
|
|||
one newline between every sentence, one space between every token.
|
||||
It's always been done this way, and it's good. It's a useful idealisation,
|
||||
because the pre-processing has few algorithmic implications.
|
||||
|
||||
|
||||
But, for practical performance, this stuff can matter a lot.
|
||||
Dridan and Oepen (2013) did a simple but rare thing: they actually ran a few
|
||||
parsers on raw text. Even on the standard Wall Street Journal corpus,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
{
|
||||
"PRP": {
|
||||
"I": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 1},
|
||||
"me": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3},
|
||||
"me": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 3},
|
||||
"mine": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 2},
|
||||
"myself": {"L": "-PRON-", "person": 1, "number": 1, "gender": 0, "case": 4},
|
||||
"you": {"L": "-PRON-", "person": 2, "number": 0, "gender": 0, "case": 0},
|
||||
|
|
2
setup.py
2
setup.py
|
@ -150,7 +150,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
|
|||
'spacy.morphology',
|
||||
'spacy._ml', 'spacy.tokenizer', 'spacy.en.attrs',
|
||||
'spacy.en.pos', 'spacy.syntax.parser', 'spacy.syntax._state',
|
||||
'spacy.syntax.transition_system',
|
||||
'spacy.syntax.transition_system',
|
||||
'spacy.syntax.arc_eager', 'spacy.syntax._parse_features',
|
||||
'spacy.syntax.conll', 'spacy.orth',
|
||||
'spacy.syntax.ner']
|
||||
|
|
|
@ -33,7 +33,7 @@ cdef class Model:
|
|||
cdef class HastyModel:
|
||||
cdef Pool mem
|
||||
cdef weight_t* _scores
|
||||
|
||||
|
||||
cdef const weight_t* score(self, atom_t* context) except NULL
|
||||
cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
|
||||
|
||||
|
|
|
@ -79,5 +79,3 @@ cpdef enum attr_id_t:
|
|||
POS
|
||||
TAG
|
||||
DEP
|
||||
|
||||
|
||||
|
|
|
@ -129,19 +129,19 @@ class English(object):
|
|||
entity=parse_if_model_present, merge_mwes=False):
|
||||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbitrary whitespace. Alignment into the original string
|
||||
|
||||
|
||||
The tagger and parser are lazy-loaded the first time they are required.
|
||||
Loading the parser model usually takes 5-10 seconds.
|
||||
|
||||
|
||||
Args:
|
||||
text (unicode): The text to be processed.
|
||||
|
||||
Keyword args:
|
||||
tag (bool): Whether to add part-of-speech tags to the text. Also
|
||||
sets morphological analysis and lemmas.
|
||||
|
||||
|
||||
parse (True, False, -1): Whether to add labelled syntactic dependencies.
|
||||
|
||||
|
||||
-1 (default) is "guess": It will guess True if tag=True and the
|
||||
model has been installed.
|
||||
|
||||
|
|
|
@ -39,7 +39,7 @@ def install_parser_model(url, dest_dir):
|
|||
def install_dep_vectors(url, dest_dir):
|
||||
if not os.path.exists(dest_dir):
|
||||
os.mkdir(dest_dir)
|
||||
|
||||
|
||||
filename = download_file(url, dest_dir)
|
||||
|
||||
|
||||
|
|
|
@ -22,4 +22,3 @@ cdef class EnPosTagger:
|
|||
|
||||
cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
|
||||
cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
|
||||
|
||||
|
|
|
@ -353,7 +353,7 @@ cdef class EnPosTagger:
|
|||
cached.lemma = self.strings[lemma_str]
|
||||
set_morph_from_dict(&cached.morph, props)
|
||||
self._morph_cache.set(pos, orth, <void*>cached)
|
||||
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
|
||||
_fill_from_token(&context[P2_orth], &tokens[i-2])
|
||||
|
@ -381,4 +381,3 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
|||
context[7] = 4
|
||||
else:
|
||||
context[7] = 0
|
||||
|
||||
|
|
|
@ -12,7 +12,7 @@ cdef LexemeC EMPTY_LEXEME
|
|||
|
||||
cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings,
|
||||
const float* empty_vec) except -1
|
||||
|
||||
|
||||
cdef class Lexeme:
|
||||
cdef readonly ndarray repvec
|
||||
|
||||
|
|
|
@ -17,12 +17,12 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
|
|||
const float* empty_vec) except -1:
|
||||
lex.length = props['length']
|
||||
lex.orth = string_store[props['orth']]
|
||||
lex.lower = string_store[props['lower']]
|
||||
lex.norm = string_store[props['norm']]
|
||||
lex.shape = string_store[props['shape']]
|
||||
lex.lower = string_store[props['lower']]
|
||||
lex.norm = string_store[props['norm']]
|
||||
lex.shape = string_store[props['shape']]
|
||||
lex.prefix = string_store[props['prefix']]
|
||||
lex.suffix = string_store[props['suffix']]
|
||||
|
||||
|
||||
lex.cluster = props['cluster']
|
||||
lex.prob = props['prob']
|
||||
lex.sentiment = props['sentiment']
|
||||
|
|
|
@ -58,10 +58,10 @@ LOCAL = (
|
|||
(N3.sic,),
|
||||
(P4.sic,),
|
||||
(N4.sic,),
|
||||
|
||||
|
||||
(P1.sic, N0.sic,),
|
||||
(N0.sic, N1.sic),
|
||||
|
||||
|
||||
(N0.prefix,),
|
||||
(N0.suffix,),
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ cdef class NERAnnotation:
|
|||
memset(self.starts, -1, sizeof(int) * length)
|
||||
memset(self.ends, -1, sizeof(int) * length)
|
||||
memset(self.labels, -1, sizeof(int) * length)
|
||||
|
||||
|
||||
cdef int start, end, label
|
||||
for start, end, label in entities:
|
||||
for i in range(start, end):
|
||||
|
|
|
@ -107,7 +107,7 @@ cdef bint is_oracle(ActionType act, int tag, ActionType g_act, int g_tag,
|
|||
# U, Gold L --> False
|
||||
# U, Gold O --> False
|
||||
return False
|
||||
|
||||
|
||||
|
||||
cdef int set_accept_if_valid(Move* moves, int n_classes, State* s) except 0:
|
||||
cdef int n_accept = 0
|
||||
|
@ -160,7 +160,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
|
|||
cdef int best = first_accept
|
||||
cdef weight_t score = scores[first_accept-1]
|
||||
cdef int i
|
||||
for i in range(first_accept+1, n):
|
||||
for i in range(first_accept+1, n):
|
||||
if moves[i].accept and scores[i-1] > score:
|
||||
best = i
|
||||
score = scores[i-1]
|
||||
|
@ -179,7 +179,7 @@ cdef int transition(State *s, Move* move) except -1:
|
|||
end_entity(s)
|
||||
elif move.action == OUT:
|
||||
pass
|
||||
s.tags[s.i] = move.clas
|
||||
s.tags[s.i] = move.clas
|
||||
s.i += 1
|
||||
|
||||
|
||||
|
|
|
@ -149,5 +149,3 @@ cpdef enum:
|
|||
|
||||
|
||||
cdef int fill_context(atom_t* context, State* s, Tokens tokens) except -1
|
||||
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ cdef int _fill_token(atom_t* c, Lexeme* lex, atom_t pos):
|
|||
c[T_postype] = lex.postype
|
||||
c[T_nertype] = 0
|
||||
c[T_sensetype] = 0
|
||||
|
||||
|
||||
c[T_is_alpha] = lex.flags & (1 << IS_ALPHA)
|
||||
c[T_is_digit] = lex.flags & (1 << IS_DIGIT)
|
||||
c[T_is_lower] = lex.flags & (1 << IS_LOWER)
|
||||
|
|
|
@ -7,10 +7,10 @@ LOCAL = (
|
|||
(N1_sic,),
|
||||
(P2_sic,),
|
||||
(N2_sic,),
|
||||
|
||||
|
||||
(P1_sic, W_sic,),
|
||||
(W_sic, N1_sic),
|
||||
|
||||
|
||||
(W_prefix,),
|
||||
(W_suffix,),
|
||||
|
||||
|
|
|
@ -92,7 +92,7 @@ cdef class NERParser:
|
|||
fill_context(self._context, s, tokens)
|
||||
self.extractor.extract(self._feats, self._values, self._context, NULL)
|
||||
self.model.score(self._scores, self._feats, self._values)
|
||||
|
||||
|
||||
set_accept_if_valid(self._moves, self.n_classes, s)
|
||||
guess = best_accepted(self._moves, self._scores, self.n_classes)
|
||||
assert guess.clas != 0
|
||||
|
|
|
@ -16,7 +16,7 @@ cpdef enum ActionType:
|
|||
|
||||
cdef int set_accept_if_oracle(Move* moves, int n, State* s,
|
||||
int* g_starts, int* g_ends, int* g_labels) except 0
|
||||
|
||||
|
||||
cdef int set_accept_if_valid(Move* moves, int n, State* s) except 0
|
||||
|
||||
cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL
|
||||
|
|
|
@ -97,7 +97,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
|
|||
cdef int best = first_accept
|
||||
cdef weight_t score = scores[first_accept-1]
|
||||
cdef int i
|
||||
for i in range(first_accept+1, n):
|
||||
for i in range(first_accept+1, n):
|
||||
if moves[i].accept and scores[i-1] > score:
|
||||
best = i
|
||||
score = scores[i-1]
|
||||
|
@ -105,7 +105,7 @@ cdef Move* best_accepted(Move* moves, weight_t* scores, int n) except NULL:
|
|||
|
||||
|
||||
cdef int transition(State *s, Move* move) except -1:
|
||||
s.tags[s.i] = move.clas
|
||||
s.tags[s.i] = move.clas
|
||||
if move.action == OUT:
|
||||
s.i += 1
|
||||
elif move.action == SHIFT:
|
||||
|
|
|
@ -8,7 +8,7 @@ cdef class PyState:
|
|||
cdef readonly list tag_names
|
||||
cdef readonly int n_classes
|
||||
cdef readonly dict moves_by_name
|
||||
|
||||
|
||||
cdef Move* _moves
|
||||
cdef Move* _golds
|
||||
cdef State* _s
|
||||
|
|
|
@ -33,7 +33,7 @@ class Scorer(object):
|
|||
@property
|
||||
def ents_r(self):
|
||||
return (self.ents_tp / (self.ents_tp + self.ents_fn + 1e-100)) * 100
|
||||
|
||||
|
||||
@property
|
||||
def ents_f(self):
|
||||
return (2 * self.ents_p * self.ents_r) / (self.ents_p + self.ents_r + 1e-100)
|
||||
|
|
|
@ -5,7 +5,7 @@ from .structs cimport Morphology, TokenC, LexemeC
|
|||
from .vocab cimport Vocab
|
||||
from .strings cimport StringStore
|
||||
|
||||
|
||||
|
||||
cdef class Span:
|
||||
cdef readonly Tokens _seq
|
||||
cdef public int i
|
||||
|
@ -15,4 +15,3 @@ cdef class Span:
|
|||
cdef public Span head
|
||||
cdef public list rights
|
||||
cdef public list lefts
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ cdef struct LexemeC:
|
|||
const float* repvec
|
||||
|
||||
flags_t flags
|
||||
|
||||
|
||||
attr_t id
|
||||
attr_t length
|
||||
|
||||
|
@ -18,7 +18,7 @@ cdef struct LexemeC:
|
|||
attr_t shape
|
||||
attr_t prefix
|
||||
attr_t suffix
|
||||
|
||||
|
||||
attr_t cluster
|
||||
|
||||
float prob
|
||||
|
|
|
@ -99,7 +99,7 @@ cpdef enum:
|
|||
S0_shape
|
||||
S0_ne_iob
|
||||
S0_ne_type
|
||||
|
||||
|
||||
S0r2w
|
||||
S0r2W
|
||||
S0r2p
|
||||
|
@ -164,7 +164,7 @@ cpdef enum:
|
|||
N0_shape
|
||||
N0_ne_iob
|
||||
N0_ne_type
|
||||
|
||||
|
||||
N1w
|
||||
N1W
|
||||
N1p
|
||||
|
@ -190,7 +190,7 @@ cpdef enum:
|
|||
N2_shape
|
||||
N2_ne_iob
|
||||
N2_ne_type
|
||||
|
||||
|
||||
P1w
|
||||
P1W
|
||||
P1p
|
||||
|
@ -203,7 +203,7 @@ cpdef enum:
|
|||
P1_shape
|
||||
P1_ne_iob
|
||||
P1_ne_type
|
||||
|
||||
|
||||
P2w
|
||||
P2W
|
||||
P2p
|
||||
|
@ -216,7 +216,7 @@ cpdef enum:
|
|||
P2_shape
|
||||
P2_ne_iob
|
||||
P2_ne_type
|
||||
|
||||
|
||||
E0w
|
||||
E0W
|
||||
E0p
|
||||
|
@ -229,7 +229,7 @@ cpdef enum:
|
|||
E0_shape
|
||||
E0_ne_iob
|
||||
E0_ne_type
|
||||
|
||||
|
||||
E1w
|
||||
E1W
|
||||
E1p
|
||||
|
@ -242,7 +242,7 @@ cpdef enum:
|
|||
E1_shape
|
||||
E1_ne_iob
|
||||
E1_ne_type
|
||||
|
||||
|
||||
# Misc features at the end
|
||||
dist
|
||||
N0lv
|
||||
|
|
|
@ -111,10 +111,10 @@ ner = (
|
|||
(N1W,),
|
||||
(P2W,),
|
||||
(N2W,),
|
||||
|
||||
|
||||
(P1W, N0W,),
|
||||
(N0W, N1W),
|
||||
|
||||
|
||||
(N0_prefix,),
|
||||
(N0_suffix,),
|
||||
|
||||
|
@ -205,22 +205,22 @@ ner = (
|
|||
unigrams = (
|
||||
(S2W, S2p),
|
||||
(S2c6, S2p),
|
||||
|
||||
|
||||
(S1W, S1p),
|
||||
(S1c6, S1p),
|
||||
|
||||
(S0W, S0p),
|
||||
(S0c6, S0p),
|
||||
|
||||
|
||||
(N0W, N0p),
|
||||
(N0p,),
|
||||
(N0c,),
|
||||
(N0c6, N0p),
|
||||
(N0L,),
|
||||
|
||||
|
||||
(N1W, N1p),
|
||||
(N1c6, N1p),
|
||||
|
||||
|
||||
(N2W, N2p),
|
||||
(N2c6, N2p),
|
||||
|
||||
|
@ -316,7 +316,7 @@ trigrams = (
|
|||
(S0p, S0lp, N0p),
|
||||
(S0p, N0p, N0lp),
|
||||
(N0p, N0lp, N0l2p),
|
||||
|
||||
|
||||
(S0W, S0p, S0rL, S0r2L),
|
||||
(S0p, S0rL, S0r2L),
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ cdef int pop_stack(State *s) except -1:
|
|||
s.stack -= 1
|
||||
if s.stack_len == 0 and not at_eol(s):
|
||||
push_stack(s)
|
||||
|
||||
|
||||
|
||||
cdef int push_stack(State *s) except -1:
|
||||
assert s.i < s.sent_len
|
||||
|
|
|
@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
|
|||
from thinc.typedefs cimport weight_t
|
||||
|
||||
|
||||
from ._state cimport State
|
||||
from ._state cimport State
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
|
||||
|
||||
|
|
|
@ -280,5 +280,3 @@ class OracleError(Exception):
|
|||
|
||||
class UnknownMove(Exception):
|
||||
pass
|
||||
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ from . import _parse_features
|
|||
from ._parse_features cimport fill_context, CONTEXT_SIZE
|
||||
|
||||
|
||||
DEBUG = False
|
||||
DEBUG = False
|
||||
def set_debug(val):
|
||||
global DEBUG
|
||||
DEBUG = val
|
||||
|
@ -112,7 +112,7 @@ cdef class GreedyParser:
|
|||
scores = self.model.score(context)
|
||||
guess = self.moves.best_valid(scores, state)
|
||||
best = self.moves.best_gold(scores, state, gold)
|
||||
|
||||
|
||||
cost = guess.get_cost(&guess, state, gold)
|
||||
self.model.update(context, guess.clas, best.clas, cost)
|
||||
|
||||
|
|
|
@ -33,16 +33,16 @@ cdef class TransitionSystem:
|
|||
cdef int first_state(self, State* state) except -1
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1
|
||||
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *
|
||||
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *
|
||||
|
||||
cdef Transition best_valid(self, const weight_t* scores, const State* state) except *
|
||||
|
||||
cdef Transition best_gold(self, const weight_t* scores, const State* state,
|
||||
GoldParse gold) except *
|
||||
|
||||
|
||||
|
||||
#cdef class PyState:
|
||||
# """Provide a Python class for testing purposes."""
|
||||
|
|
|
@ -13,5 +13,3 @@ class Config(object):
|
|||
@classmethod
|
||||
def read(cls, model_dir, name):
|
||||
return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
|
||||
|
||||
|
||||
|
|
|
@ -60,7 +60,7 @@ cdef class Tokenizer:
|
|||
split off a suffix, and repeat.
|
||||
|
||||
Args:
|
||||
string (unicode): The string to be tokenized.
|
||||
string (unicode): The string to be tokenized.
|
||||
|
||||
Returns:
|
||||
tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
|
||||
|
@ -213,7 +213,7 @@ cdef class Tokenizer:
|
|||
cdef unicode string = chars[:length]
|
||||
match = self._infix_re.search(string)
|
||||
return match.start() if match is not None else 0
|
||||
|
||||
|
||||
cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
|
||||
cdef unicode string = chars[:length]
|
||||
match = self._prefix_re.search(string)
|
||||
|
|
|
@ -31,9 +31,9 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
|
|||
cdef class Tokens:
|
||||
cdef Pool mem
|
||||
cdef Vocab vocab
|
||||
|
||||
|
||||
cdef TokenC* data
|
||||
|
||||
|
||||
|
||||
cdef list _py_tokens
|
||||
cdef unicode _string
|
||||
|
@ -61,7 +61,7 @@ cdef class Token:
|
|||
cdef int array_len
|
||||
cdef bint _owns_c_data
|
||||
|
||||
|
||||
|
||||
cdef Tokens _seq
|
||||
|
||||
@staticmethod
|
||||
|
|
|
@ -104,10 +104,10 @@ cdef class Tokens:
|
|||
|
||||
def __getitem__(self, object i):
|
||||
"""Retrieve a token.
|
||||
|
||||
|
||||
The Python Token objects are created lazily from internal C data, and
|
||||
cached in _py_tokens
|
||||
|
||||
|
||||
Returns:
|
||||
token (Token):
|
||||
"""
|
||||
|
@ -180,7 +180,7 @@ cdef class Tokens:
|
|||
yield Span(self, start, i+1)
|
||||
start = None
|
||||
if start is not None:
|
||||
yield Span(self, start, self.length)
|
||||
yield Span(self, start, self.length)
|
||||
|
||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||
if self.length == self.max_length:
|
||||
|
@ -298,7 +298,7 @@ cdef class Tokens:
|
|||
# What to do about morphology??
|
||||
# TODO: token.morph = ???
|
||||
token.tag = self.vocab.strings[tag]
|
||||
token.lemma = self.vocab.strings[lemma]
|
||||
token.lemma = self.vocab.strings[lemma]
|
||||
if ent_type == 'O':
|
||||
token.ent_iob = 2
|
||||
token.ent_type = 0
|
||||
|
@ -355,7 +355,7 @@ cdef class Tokens:
|
|||
self._py_tokens = [None] * self.length
|
||||
# Return the merged Python object
|
||||
return self[start]
|
||||
|
||||
|
||||
|
||||
cdef class Token:
|
||||
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||
|
@ -608,4 +608,3 @@ _parse_unset_error = """Text has not been parsed, so cannot be accessed.
|
|||
Check that the parser data is installed. Run "python -m spacy.en.download" if not.
|
||||
Check whether parse=False in the call to English.__call__
|
||||
"""
|
||||
|
||||
|
|
|
@ -94,5 +94,3 @@ ctypedef uint64_t flags_t
|
|||
ctypedef uint32_t id_t
|
||||
ctypedef uint16_t len_t
|
||||
ctypedef uint16_t tag_t
|
||||
|
||||
|
||||
|
|
|
@ -71,7 +71,7 @@ def read_detoken_rules(lang):
|
|||
for line in file_:
|
||||
entries.append(line.strip())
|
||||
return entries
|
||||
|
||||
|
||||
|
||||
def align_tokens(ref, indices):
|
||||
start = 0
|
||||
|
@ -87,7 +87,7 @@ def align_tokens(ref, indices):
|
|||
|
||||
|
||||
def detokenize(token_rules, words):
|
||||
"""To align with treebanks, return a list of "chunks", where a chunk is a
|
||||
"""To align with treebanks, return a list of "chunks", where a chunk is a
|
||||
sequence of tokens that are separated by whitespace in actual strings. Each
|
||||
chunk should be a tuple of token indices, e.g.
|
||||
|
||||
|
|
|
@ -31,6 +31,5 @@ cdef class Vocab:
|
|||
|
||||
cdef const LexemeC* get(self, Pool mem, UniStr* s) except NULL
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
|
||||
|
||||
|
||||
cdef PreshMap _map
|
||||
|
||||
|
|
|
@ -170,7 +170,7 @@ cdef class Vocab:
|
|||
self.lexemes[lexeme.id] = lexeme
|
||||
i += 1
|
||||
fclose(fp)
|
||||
|
||||
|
||||
def load_rep_vectors(self, loc):
|
||||
file_ = _CFile(loc, b'rb')
|
||||
cdef int32_t word_len
|
||||
|
@ -187,7 +187,7 @@ cdef class Vocab:
|
|||
except IOError:
|
||||
break
|
||||
file_.read(&vec_len, sizeof(vec_len), 1)
|
||||
|
||||
|
||||
mem = Address(word_len, sizeof(char))
|
||||
chars = <char*>mem.ptr
|
||||
vec = <float*>self.mem.alloc(vec_len, sizeof(float))
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
|
||||
The Sun is the star at the center of the Solar System. It is almost perfectly spherical and consists of hot plasma interwoven with magnetic fields. [ 12 ] [ 13 ] It has a diameter of about 1 , 392 , 684 km ( 865 , 374 mi ) , [ 5 ] around 109 times that of Earth , and its mass ( 1.989×1030 kilograms , approximately 330 , 000 times the mass of Earth ) accounts for about 99.86 % of the total mass of the Solar System. [ 14 ] Chemically , about three quarters of the Sun 's mass consists of hydrogen , while the rest is mostly helium. The remaining 1.69 % ( equal to 5 , 600 times the mass of Earth ) consists of heavier elements , including oxygen , carbon , neon and iron , among others. [ 15 ]
|
||||
|
||||
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
|
||||
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]
|
||||
The Sun formed about 4.567 billion [ a ] [ 16 ] years ago from the gravitational collapse of a region within a large molecular cloud. Most of the matter gathered in the center , while the rest flattened into an orbiting disk that would become the Solar System. The central mass became increasingly hot and dense , eventually initiating thermonuclear fusion in its core. It is thought that almost all stars form by this process. The Sun is a G-type main-sequence star ( G2V ) based on spectral class and it is informally designated as a yellow dwarf because its visible radiation is most intense in the yellow-green portion of the spectrum , and although it is actually white in color , from the surface of the Earth it may appear yellow because of atmospheric scattering of blue light. [ 17 ] In the spectral class label , G2 indicates its surface temperature , of approximately 5778 K ( 5505 °C ) , and V indicates that the Sun , like most stars , is a main-sequence star , and thus generates its energy by nuclear fusion of hydrogen nuclei into helium. In its core , the Sun fuses about 620 million metric tons of hydrogen each second. [ 18 ] [ 19 ]
|
||||
Once regarded by astronomers as a small and relatively insignificant star , the Sun is now thought to be brighter than about 85 % of the stars in the Milky Way , most of which are red dwarfs. [ 20 ] [ 21 ] The absolute magnitude of the Sun is +4.83 ; however , as the star closest to Earth , the Sun is by far the brightest object in the sky with an apparent magnitude of −26.74. [ 22 ] [ 23 ] This is about 13 billion times brighter than the next brightest star , Sirius , with an apparent magnitude of −1.46. The Sun 's hot corona continuously expands in space creating the solar wind , a stream of charged particles that extends to the heliopause at roughly 100 astronomical units. The bubble in the interstellar medium formed by the solar wind , the heliosphere , is the largest continuous structure in the Solar System. [ 24 ] [ 25 ]
|
||||
|
|
|
@ -30,6 +30,3 @@ def test_align_continue():
|
|||
assert aligned[2] == ('re-align', [(5, 7), (7, 8), (8, 13)])
|
||||
assert aligned[3] == ('and', [(13, 16)])
|
||||
assert aligned[4] == ('continue', [(16, 24)])
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -37,5 +37,3 @@ def test_dep():
|
|||
assert feats_array[1][1] == tokens[1].dep
|
||||
assert feats_array[2][1] == tokens[2].dep
|
||||
assert feats_array[3][1] == tokens[3].dep
|
||||
|
||||
|
||||
|
|
|
@ -57,7 +57,7 @@ def test3():
|
|||
assert sum(o) != 0
|
||||
from numpy import dot
|
||||
from numpy.linalg import norm
|
||||
|
||||
|
||||
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
words = [w for w in nlp.vocab if w.check(IS_LOWER) and w.has_repvec]
|
||||
words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
|
||||
|
|
|
@ -35,4 +35,3 @@ def test_merge_heads():
|
|||
def test_issue_54():
|
||||
text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
|
||||
tokens = NLU(text, merge_mwes=True)
|
||||
|
||||
|
|
|
@ -33,4 +33,3 @@ def test_word():
|
|||
def test_not_number():
|
||||
assert not like_number('dog')
|
||||
assert not like_number(',')
|
||||
|
||||
|
|
|
@ -58,4 +58,3 @@ def test_child_consistency(nlp, sun_text):
|
|||
assert not children
|
||||
for head_index, children in rights.items():
|
||||
assert not children
|
||||
|
||||
|
|
|
@ -49,4 +49,3 @@ def test_three_same_close(close_puncts, EN):
|
|||
def test_double_end_quote(EN):
|
||||
assert len(EN("Hello''")) == 2
|
||||
assert len(EN("''")) == 1
|
||||
|
||||
|
|
|
@ -16,6 +16,3 @@ def test_one(EN):
|
|||
assert tokens[0].orth_ == 'Betty'
|
||||
tokens2 = EN('Betty also bought a pound of butter.')
|
||||
assert tokens2[0].orth_ == 'Betty'
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -16,4 +16,3 @@ def test_subtrees():
|
|||
assert len(list(bus.children)) == 1
|
||||
|
||||
assert len(list(wheels.subtree)) == 6
|
||||
|
||||
|
|
|
@ -35,5 +35,3 @@ def test_single_token_string():
|
|||
nlp = English()
|
||||
tokens = nlp(u'foobar')
|
||||
assert tokens[0].string == 'foobar'
|
||||
|
||||
|
||||
|
|
|
@ -63,15 +63,15 @@ def test_contraction_punct(EN):
|
|||
def test_sample(EN):
|
||||
text = """Tributes pour in for late British Labour Party leader
|
||||
|
||||
Tributes poured in from around the world Thursday
|
||||
to the late Labour Party leader John Smith, who died earlier from a massive
|
||||
Tributes poured in from around the world Thursday
|
||||
to the late Labour Party leader John Smith, who died earlier from a massive
|
||||
heart attack aged 55.
|
||||
|
||||
In Washington, the US State Department issued a statement regretting "the
|
||||
In Washington, the US State Department issued a statement regretting "the
|
||||
untimely death" of the rapier-tongued Scottish barrister and parliamentarian.
|
||||
|
||||
"Mr. Smith, throughout his distinguished"""
|
||||
|
||||
|
||||
tokens = EN(text)
|
||||
assert len(tokens) > 5
|
||||
|
||||
|
|
|
@ -39,5 +39,3 @@ def test_newline_double_space(EN):
|
|||
def test_newline_space_wrap(EN):
|
||||
tokens = EN('hello \n possums')
|
||||
assert len(tokens) == 3
|
||||
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ s=\.\.\.= ... =g
|
|||
s=[,;:@#$%&]= & =g
|
||||
|
||||
# Assume sentence tokenization has been done first, so split FINAL periods
|
||||
# only.
|
||||
# only.
|
||||
s=\([^.]\)\([.]\)\([])}>"']*\)[ ]*$=\1 \2\3 =g
|
||||
# however, we may as well split ALL question marks and exclamation points,
|
||||
# since they shouldn't have the abbrev.-marker ambiguity problem
|
||||
|
|
Loading…
Reference in New Issue