* Further improvements to spacy docs, tweaks to code.

Matthew Honnibal 2014-08-22 04:20:24 +02:00
parent 4eb9c2b30f
commit 4bcdd6d31c
4 changed files with 9 additions and 13 deletions

View File

@@ -35,7 +35,8 @@ extensions = [
     'sphinx.ext.coverage',
     'sphinx.ext.viewcode',
     'sphinx.ext.autodoc',
-    'sphinxcontrib.napoleon'
+    'sphinxcontrib.napoleon',
+    'sphinx.ext.doctest'
 ]

 # Add any paths that contain templates here, relative to this directory.
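The 'sphinx.ext.doctest' entry being added here lets Sphinx execute the >>> examples embedded in the documentation and fail the build when their output drifts from the code (typically via the generated Makefile's doctest target). The same mechanism is available outside Sphinx through the standard-library doctest module; a minimal sketch, with a hypothetical is_lower helper standing in for the documented API:

    import doctest

    def is_lower(word):
        """Return True if the string is entirely lower-cased.

        >>> is_lower(u"apples")
        True
        >>> is_lower(u"Apples")
        False
        """
        return word.islower()

    if __name__ == '__main__':
        # Executes every >>> example above and reports mismatches, which
        # is what sphinx.ext.doctest does for examples in the docs.
        doctest.testmod()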

View File

@@ -19,18 +19,14 @@ an excellent set of pre-computed orthographic and distributional features:
 ::

     >>> from spacy import en
-    >>> apples, are, not, oranges, dots = en.tokenize(u"Apples aren't oranges...")
+    >>> apples, are, nt, oranges, dots = en.tokenize(u"Apples aren't oranges...")
     >>> en.is_lower(apples)
     False
-    # Distributional features calculated from large corpora
     # Smoothed unigram log probability
-    >>> en.prob_of(are) > en.prob_of(oranges)
+    >>> en.prob_of(are) >= en.prob_of(oranges)
     True
-    # After POS tagging lots of text, is this word ever a noun?
-    >>> en.can_tag(are, en.NOUN)
-    False
     # Is this word always title-cased?
-    >>> en.often_title(apples)
+    >>> en.is_often_titled(apples)
     False

 Accessing these properties is essentially free: the Lexeme IDs are actually
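The paragraph that the trailing context line opens (it is cut off at the hunk boundary) argues that property access is essentially free. The surrounding docs describe the features as pre-computed, which suggests the scheme: features are calculated once, when a string is first interned, and a Lexeme ID afterwards just indexes into those tables. A toy sketch of that idea; the names (ToyVocab, IS_LOWER, IS_TITLE) are illustrative, not spaCy's API:

    # Toy scheme, assuming features are computed once per vocabulary
    # entry and a lexeme ID is an index into the precomputed tables.
    IS_LOWER = 0
    IS_TITLE = 1

    class ToyVocab(object):
        def __init__(self):
            self.ids = {}     # string -> lexeme ID
            self.flags = []   # lexeme ID -> precomputed boolean features

        def lookup(self, string):
            if string not in self.ids:
                self.ids[string] = len(self.flags)
                # Orthographic features are computed here, exactly once.
                self.flags.append((string.islower(), string.istitle()))
            return self.ids[string]

        def check(self, lex_id, feature):
            # "Essentially free": one indexed read, no string processing.
            return self.flags[lex_id][feature]

    vocab = ToyVocab()
    apples = vocab.lookup(u"Apples")
    assert vocab.check(apples, IS_LOWER) is False
    assert vocab.check(apples, IS_TITLE) is True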
@@ -72,6 +68,7 @@ Pros:
 Cons:

 - It's new (released September 2014)
 - Security concerns, from memory management
+- Higher memory usage (up to 1gb)
 - More conceptually complicated
 - Tokenization rules expressed in code, not as data
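On the last point: "rules as data" keeps tokenizer exceptions in a table that can be shipped and edited independently, while "rules as code" hard-wires the same knowledge into control flow. A hypothetical side-by-side, purely for illustration:

    # Rules as data: the exception list is a plain mapping.
    EXCEPTIONS = {
        u"aren't": [u"are", u"n't"],
        u"won't": [u"will", u"n't"],
    }

    def tokenize_word(word):
        return EXCEPTIONS.get(word, [word])

    # Rules as code: the same knowledge buried in branches, so adding
    # a case means editing (and re-reviewing) the tokenizer itself.
    def tokenize_word_in_code(word):
        if word == u"won't":
            return [u"will", u"n't"]
        if word.endswith(u"n't"):
            return [word[:-3], u"n't"]
        return [word]

    assert tokenize_word(u"aren't") == tokenize_word_in_code(u"aren't")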

View File

@@ -7,7 +7,6 @@ from spacy.lexeme import lex_of
from spacy import LEX, NORM, SHAPE, LAST3
def test_group_by_lex():
tokens = en.tokenize("I like the red one and I like the blue one")
names, hashes, groups = tokens.group_by(LEX)
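The test above exercises a three-valued group_by. How the real method computes its return values is not shown in this hunk; as a sketch under that caveat, grouping token positions by a key and returning parallel (names, hashes, groups) lists might look like:

    from collections import OrderedDict

    def group_by(tokens, key):
        # Group token positions by an attribute, in first-seen order.
        # Illustrative only: the real tokens.group_by(LEX) operates on
        # lexeme attributes, not plain strings.
        grouped = OrderedDict()
        for i, token in enumerate(tokens):
            grouped.setdefault(key(token), []).append(i)
        names = list(grouped.keys())
        hashes = [hash(name) for name in names]  # stand-in for string hashes
        groups = list(grouped.values())
        return names, hashes, groups

    names, hashes, groups = group_by(
        "I like the red one and I like the blue one".split(),
        key=lambda t: t)
    assert names[0] == 'I' and groups[0] == [0, 6]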

View File

@@ -4,8 +4,7 @@ import pytest
 from spacy.en import lookup, unhash
-from spacy.lexeme import lex_of, norm_of, shape_of, first_of, length_of
-from spacy.lexeme import shape_of
+from spacy.en import lex_of, shape_of, norm_of, first_of, length_of

 @pytest.fixture
 def C3P0():
@@ -19,8 +18,8 @@ def test_shape(C3P0):
 def test_length():
     t = lookup('the')
     assert length_of(t) == 3
-    t = lookup('')
-    assert length_of(t) == 0
+    #t = lookup('')
+    #assert length_of(t) == 0
     t = lookup("n't")
     assert length_of(t) == 3
     t = lookup("'s")
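These assertions encode the contraction splits from the doctest earlier in this commit: "aren't" tokenizes to "are" plus "n't", and length_of reports each piece's character length, so "n't" is 3. A naive splitter that reproduces just these cases (not spaCy's tokenizer):

    SPECIAL_SUFFIXES = (u"n't", u"'s")

    def split_contraction(word):
        # Peel a known clitic off the end; real tokenization has many
        # more rules than this.
        for suffix in SPECIAL_SUFFIXES:
            if word.endswith(suffix) and len(word) > len(suffix):
                return [word[:-len(suffix)], suffix]
        return [word]

    assert split_contraction(u"aren't") == [u"are", u"n't"]
    assert [len(piece) for piece in split_contraction(u"aren't")] == [3, 3]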