From e28b224b80d1d654790ce11bf1bd2726ab9e0fd6 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 16 Jan 2015 07:08:35 +1100
Subject: [PATCH] * Improve index docs

---
 docs/source/index.rst | 219 ++++++++++++++++++++++++++++++------------
 1 file changed, 158 insertions(+), 61 deletions(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index e409fd88b..0da16c6e2 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -3,87 +3,184 @@
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
-===================================
-spaCy: Text-processing for products
-===================================
+==============================
+spaCy: Industrial-strength NLP
+==============================
 
 spaCy is a library for industrial-strength text processing in Python and Cython.
-Its core values are efficiency, accuracy and minimalism: you get a fast pipeline of
-state-of-the-art components, a nice API, and no clutter:
+It is commercial open source software, with a dual (AGPL or commercial)
+license.
 
-    >>> from spacy.en import English
-    >>> nlp = English()
-    >>> tokens = nlp(u'An example sentence', tag=True, parse=True)
-    >>> for token in tokens:
-    ...     print token.lemma, token.pos, bin(token.cluster)
-    an DT Xx 0b111011110
-    example NN xxxx 0b111110001
-    sentence NN xxxx 0b1101111110010
-
-spaCy is particularly good for feature extraction, because it pre-loads lexical
-resources, maps strings to integer IDs, and supports output of numpy arrays:
+If you're a small company doing NLP, spaCy might seem like a minor miracle.
+It's by far the fastest NLP software available. The full processing pipeline
+completes in 7ms per document, including state-of-the-art part-of-speech
+tagging and dependency parsing. All strings are mapped to integer IDs, tokens
+are linked to word vectors and other lexical resources, and a range of useful
+features are pre-calculated and cached.
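+
+For instance, because every string is interned as an integer ID, a parsed
+document can be exported straight to a numpy array of attribute IDs. The
+attribute constants live in ``spacy.en.attrs``; the integer values below are
+illustrative, as they depend on the vocabulary you have loaded:
+
+    >>> from spacy.en import English, attrs
+    >>> nlp = English()
+    >>> tokens = nlp(u'An example sentence', tag=True, parse=True)
+    >>> # One row per token, one column per requested attribute.
+    >>> tokens.to_array((attrs.LEMMA, attrs.POS, attrs.SHAPE, attrs.CLUSTER))
+    array([[ 1265,    14,    76,   478],
+           [ 1545,    24,   262,   497],
+           [ 3385,    24,   262, 14309]])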
 
-    >>> from spacy.en import attrs
-    >>> tokens.to_array((attrs.LEMMA, attrs.POS, attrs.SHAPE, attrs.CLUSTER))
-    array([[ 1265,    14,    76,   478],
-           [ 1545,    24,   262,   497],
-           [ 3385,    24,   262, 14309]])
+If none of that made any sense to you, here's the gist of it. Computers don't
+understand text. This is unfortunate, because that's what the web almost
+entirely consists of. We want to recommend text to people based on other text
+they liked. We want to shorten text to display it on a mobile screen. We want
+to aggregate it, link it, filter it, categorise it, generate it and correct it.
 
-spaCy also makes it easy to add in-line mark up. Let's say you're convinced by
-Stephen King's advice that `adverbs are not your friend `_, so you want to mark
-them in red. We'll use one of the examples he finds particularly egregious:
+spaCy provides a set of utility functions that help programmers build such
+products. It's an NLP engine, analogous to the 3D engines commonly licensed
+for game development.
 
-    >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
-    >>> red = lambda string: u'\033[91m{0}\033[0m'.format(string)
-    >>> red = lambda string: unicode(string).upper() # TODO -- make red work on website...
-    >>> print u''.join(red(t) if t.is_adverb else unicode(t) for t in tokens)
+Example functionality
+---------------------
+
+Let's say you're developing a proofreading tool, or possibly an IDE for
+writers. You're convinced by Stephen King's advice that `adverbs are not your
+friend `_, so
+you want to **mark adverbs in red**. We'll use one of the examples he finds
+particularly egregious:
+
+    >>> import spacy.en
+    >>> from spacy.enums import ADVERB
+    >>> # Load the pipeline, and call it with some text.
+    >>> nlp = spacy.en.English()
+    >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
+    ...              tag=True, parse=True)
+    >>> output = ''
+    >>> for tok in tokens:
+    ...     # Token.string preserves whitespace, making it easy to
+    ...     # reconstruct the original string.
+    ...     output += tok.string.upper() if tok.is_pos(ADVERB) else tok.string
+    >>> print(output)
     ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
 
-Easy --- except, "back" isn't the sort of word we're looking for, even though
-it's undeniably an adverb. Let's search refine the logic a little, and only
-highlight adverbs that modify verbs:
+Easy enough --- but the problem is that we've also highlighted "back", when we
+probably only wanted to highlight "abjectly". "Back" is undoubtedly an adverb,
+but it's not the sort of adverb King is talking about. This is a persistent
+problem when dealing with linguistic categories: the prototypical examples,
+the ones which spring to your mind, are often not the most common cases.
 
-    >>> print u''.join(red(t) if t.is_adverb and t.head.is_verb else unicode(t) for t in tokens)
+There are lots of ways we might refine our logic, depending on just what words
+we want to flag. The simplest way to filter out adverbs like "back" and "not"
+is by word frequency: these words are much more common than the manner adverbs
+the style guides are worried about.
+
+The ``prob`` attribute of a Lexeme or Token object gives a log probability
+estimate of the word, based on smoothed counts from a 3bn word corpus:
+
+    >>> nlp.vocab[u'back'].prob
+    -7.403977394104004
+    >>> nlp.vocab[u'not'].prob
+    -5.407193660736084
+    >>> nlp.vocab[u'quietly'].prob
+    -11.07155704498291
+
+So we can easily exclude the N most frequent words in English from our adverb
+marker. Let's try N=1000 for now:
+
+    >>> import spacy.en
+    >>> from spacy.enums import ADVERB
+    >>> nlp = spacy.en.English()
+    >>> # Find the log probability of the Nth most frequent word.
+    >>> probs = [lex.prob for lex in nlp.vocab]
+    >>> probs.sort()
+    >>> is_adverb = lambda tok: tok.is_pos(ADVERB) and tok.prob < probs[-1000]
+    >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
+    ...              tag=True, parse=True)
+    >>> print(''.join(tok.string.upper() if is_adverb(tok) else tok.string
+    ...               for tok in tokens))
     ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’
 
-spaCy is also very efficient --- much more efficient than any other language
-processing tools available. The table below compares the time to tokenize, POS
-tag and parse a document (amortized over 100k samples). It also shows accuracy
-on the standard evaluation, from the Wall Street Journal:
+We could also define the logic more narrowly, and only flag adverbs applied to
+verbs of communication or cognition:
 
-+----------+----------+---------+----------+----------+------------+
-| System   | Tokenize | POS Tag | Parse    | POS Acc. | Parse Acc. |
-+----------+----------+---------+----------+----------+------------+
-| spaCy    | 0.37ms   | 0.98ms  | 10ms     | 97.3%    | 92.4%      |
-+----------+----------+---------+----------+----------+------------+
-| NLTK     | 6.2ms    | 443ms   | n/a      | 94.0%    | n/a        |
-+----------+----------+---------+----------+----------+------------+
-| CoreNLP  | 4.2ms    | 13ms    | todo     | 96.97%   | 92.2%      |
-+----------+----------+---------+----------+----------+------------+
-| ZPar     | n/a      | 15ms    | 850ms    | 97.3%    | 92.9%      |
-+----------+----------+---------+----------+----------+------------+
+    >>> from spacy.enums import VERB, WN_V_COMMUNICATION, WN_V_COGNITION
+    >>> def is_say_verb(tok):
+    ...     return tok.is_pos(VERB) and (tok.check_flag(WN_V_COMMUNICATION) or
+    ...                                  tok.check_flag(WN_V_COGNITION))
+    >>> print(''.join(tok.string.upper() if is_adverb(tok) and is_say_verb(tok.head)
+    ...               else tok.string for tok in tokens))
     ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’
 
-(The CoreNLP results refer to their recently published shift-reduce neural
-network parser.)
+The two flags refer to the 45 top-level categories in the WordNet ontology.
+spaCy stores membership in these categories as a bit set, because words can
+have multiple senses. We only need one 64-bit flag variable per word in the
+vocabulary, so this useful data requires only 2.4MB of memory.
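+
+To make the memory arithmetic concrete: 45 category bits fit comfortably in a
+single 64-bit integer, and at 8 bytes per entry a vocabulary of roughly
+300,000 words costs about 2.4MB. Here's a minimal sketch of how such a bit set
+works, in plain Python. This is an illustration of the technique, not spaCy's
+internal code, and the bit positions are invented for the example:
+
+    >>> # Assign each WordNet category an (illustrative) bit position.
+    >>> V_COMMUNICATION, V_COGNITION = 32, 33
+    >>> # A word's flag variable sets one bit per category it belongs to.
+    >>> flags = (1 << V_COMMUNICATION) | (1 << V_COGNITION)
+    >>> # Membership testing is then a single AND and compare.
+    >>> bool(flags & (1 << V_COMMUNICATION))
+    True
+    >>> bool(flags & (1 << 7))
+    False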
 
-I wrote spaCy so that startups and other small companies could take advantage
-of the enormous progress being made by NLP academics. Academia is competitive,
-and what you're competing to do is write papers --- so it's very hard to write
-software useful to non-academics. Seeing this gap, I resigned from my post-doc,
-and wrote spaCy.
+spaCy packs all sorts of other goodies into its lexicon. Words are mapped to
+one of these rich lexical types immediately, during tokenization --- and
+spaCy's tokenizer is *fast*.
+
+Efficiency
+----------
+
+.. table:: Efficiency comparison. See `Benchmarks`_ for details.
+
+   +--------------+---------------------------+--------------------------------+
+   |              | Absolute (ms per doc)     | Relative (to spaCy)            |
+   +--------------+----------+--------+-------+----------+---------+-----------+
+   | System       | Tokenize | Tag    | Parse | Tokenize | Tag     | Parse     |
+   +--------------+----------+--------+-------+----------+---------+-----------+
+   | spaCy        | 0.2ms    | 1ms    | 7ms   | 1x       | 1x      | 1x        |
+   +--------------+----------+--------+-------+----------+---------+-----------+
+   | CoreNLP      | 2ms      | 10ms   | 49ms  | 10x      | 10x     | 7x        |
+   +--------------+----------+--------+-------+----------+---------+-----------+
+   | ZPar         | 1ms      | 8ms    | 850ms | 5x       | 8x      | 121x      |
+   +--------------+----------+--------+-------+----------+---------+-----------+
+   | NLTK         | 4ms      | 443ms  | n/a   | 20x      | 443x    | n/a       |
+   +--------------+----------+--------+-------+----------+---------+-----------+
+
+Efficiency is a major concern for NLP applications. It is very common to hear
+people say that they cannot afford more detailed processing because their
+datasets are too large. This is a bad position to be in. If you can't apply
+detailed processing, you generally have to cobble together various heuristics.
+This normally takes a few iterations, and what you come up with will usually
+be brittle and difficult to reason about.
+
+spaCy's parser is faster than most taggers, and its tokenizer is fast enough
+for truly web-scale processing. And the tokenizer doesn't just give you a list
+of strings. A spaCy token is a pointer to a Lexeme struct, from which you can
+access a wide range of pre-computed features.
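+
+As a sketch of what that buys you, here is the adverb example again, written
+as a batch job. Everything the inner loop touches (``is_pos``, ``prob``) is a
+pre-computed Lexeme feature, so no further string processing happens inside
+it. The function and the -10.0 cutoff are invented for this illustration:
+
+    >>> import spacy.en
+    >>> from spacy.enums import ADVERB
+    >>> nlp = spacy.en.English()
+    >>> def count_rare_adverbs(texts, cutoff=-10.0):
+    ...     # Count adverbs rarer than the cutoff log probability.
+    ...     n = 0
+    ...     for text in texts:
+    ...         for tok in nlp(text, tag=True, parse=True):
+    ...             if tok.is_pos(ADVERB) and tok.prob < cutoff:
+    ...                 n += 1
+    ...     return n
+    >>> count_rare_adverbs([u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’"])
+    1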
+
+.. I wrote spaCy because I think existing commercial NLP engines are crap.
+   AlchemyAPI is a typical example. Check out this part of their terms of
+   service:
+
+       publish or perform any benchmark or performance tests or analysis
+       relating to the Service or the use thereof without express
+       authorization from AlchemyAPI;
+
+.. Did you get that? You're not allowed to evaluate how well their system
+   works, unless you're granted a special exception. Their system must be
+   pretty terrible to motivate such an embarrassing restriction. They must
+   know this makes them look bad, but they apparently believe allowing you to
+   evaluate their product would make them look even worse!
+
+.. spaCy is based on science, not alchemy. It's open source, and I am happy
+   to clarify any detail of the algorithms I've implemented. It's evaluated
+   against the current best published systems, following the standard
+   methodologies. These evaluations show that it performs extremely well.
+
+Accuracy
+--------
+
+.. table:: Accuracy comparison, on the standard benchmark data from the Wall Street Journal. See `Benchmarks`_ for details.
+
+   +--------------+----------+------------+
+   | System       | POS acc. | Parse acc. |
+   +--------------+----------+------------+
+   | spaCy        | 97.2     | 92.4       |
+   +--------------+----------+------------+
+   | CoreNLP      | 96.9     | 92.2       |
+   +--------------+----------+------------+
+   | ZPar         | 97.3     | 92.9       |
+   +--------------+----------+------------+
+   | NLTK         | 94.3     | n/a        |
+   +--------------+----------+------------+
 
-spaCy is dual-licensed: you can either use it under the GPL, or pay a one-time
-fee of $5000 for a commercial license. I think this is excellent value:
-you'll find NLTK etc much more expensive, because what you save on license
-cost, you'll lose many times over in lost productivity. $5000 does not buy you
-much developer time.
 
 .. toctree::
-    :hidden:
     :maxdepth: 3
 
+    license.rst
+    quickstart.rst
     features.rst
-    license_stories.rst
     api.rst