* Make corrections to example code

This commit is contained in:
Matthew Honnibal 2015-02-07 08:45:09 -05:00
parent f0e0588833
commit a7e4f0a86c
1 changed files with 13 additions and 13 deletions

View File

@ -83,10 +83,9 @@ particularly egregious:
>>> from spacy.parts_of_speech import ADV >>> from spacy.parts_of_speech import ADV
>>> # Load the pipeline, and call it with some text. >>> # Load the pipeline, and call it with some text.
>>> nlp = spacy.en.English() >>> nlp = spacy.en.English()
>>> tokens = nlp("Give it back, he pleaded abjectly, its mine.", >>> tokens = nlp(u"Give it back, he pleaded abjectly, its mine.", tag=True, parse=False)
tag=True, parse=False) >>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
>>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)) Give it BACK, he pleaded ABJECTLY, its mine.
Give it BACK, he pleaded ABJECTLY, its mine.
Easy enough --- but the problem is that we've also highlighted "back". Easy enough --- but the problem is that we've also highlighted "back".
@ -103,11 +102,11 @@ manner adverbs that the style guides are worried about.
The :py:attr:`Lexeme.prob` and :py:attr:`Token.prob` attributes give a The :py:attr:`Lexeme.prob` and :py:attr:`Token.prob` attributes give a
log probability estimate of the word: log probability estimate of the word:
>>> nlp.vocab['back'].prob >>> nlp.vocab[u'back'].prob
-7.403977394104004 -7.403977394104004
>>> nlp.vocab['not'].prob >>> nlp.vocab[u'not'].prob
-5.407193660736084 -5.407193660736084
>>> nlp.vocab['quietly'].prob >>> nlp.vocab[u'quietly'].prob
-11.07155704498291 -11.07155704498291
(The probability estimate is based on counts from a 3 billion word corpus, (The probability estimate is based on counts from a 3 billion word corpus,
@ -125,8 +124,8 @@ marker. Let's try N=1000 for now:
>>> probs = [lex.prob for lex in nlp.vocab] >>> probs = [lex.prob for lex in nlp.vocab]
>>> probs.sort() >>> probs.sort()
>>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
>>> tokens = nlp("Give it back, he pleaded abjectly, its mine.") >>> tokens = nlp(u"Give it back, he pleaded abjectly, its mine.")
>>> print(''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)) >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
Give it back, he pleaded ABJECTLY, its mine. Give it back, he pleaded ABJECTLY, its mine.
There are lots of other ways we could refine the logic, depending on just what There are lots of other ways we could refine the logic, depending on just what
@ -136,7 +135,7 @@ representation for every word (by default, the vectors produced by
`Levy and Goldberg (2014)`_). Naturally, the vector is provided as a numpy `Levy and Goldberg (2014)`_). Naturally, the vector is provided as a numpy
array: array:
>>> pleaded = tokens[8] >>> pleaded = tokens[7]
>>> pleaded.repvec.shape >>> pleaded.repvec.shape
(300,) (300,)
>>> pleaded.repvec[:5] >>> pleaded.repvec[:5]
@ -150,9 +149,10 @@ cosine metric:
>>> from numpy import dot >>> from numpy import dot
>>> from numpy.linalg import norm >>> from numpy.linalg import norm
>>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1), norm(v2))
>>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
>>> words = [w for w in nlp.vocab if w.lower] >>> words = [w for w in nlp.vocab if w.lower]
>>> words.sort(key=lambda w: cosine(w, pleaded)) >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
>>> words.reverse() >>> words.reverse()
>>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading 1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading
@ -177,7 +177,7 @@ as our target:
>>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested'] >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested']
>>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs) >>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs)
>>> words.sort(key=lambda w: cosine(w.repvec, say_vector)) >>> words.sort(key=lambda w: cosine(w.repvec, say_vector))
>>> words.reverse() >>> words.reverse()
>>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired 1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired