From 0f2cb7443365505aff2468dccaec096ef262f64f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Aug 2015 08:56:30 +0200 Subject: [PATCH] * Work on docs --- docs/redesign/blog.jade | 64 ++-------- docs/redesign/blog_parser.jade | 2 +- docs/redesign/comparisons.jade | 191 ++++++++++++++++++++---------- docs/redesign/docs.jade | 2 +- docs/redesign/home.jade | 17 +-- docs/redesign/mixins.jade | 4 +- docs/redesign/online_demo.jade | 2 +- docs/redesign/tutorials.jade | 29 +++++ docs/redesign/usage_examples.jade | 111 ++++++++++++----- 9 files changed, 256 insertions(+), 166 deletions(-) diff --git a/docs/redesign/blog.jade b/docs/redesign/blog.jade index 119a5aad9..8a712267d 100644 --- a/docs/redesign/blog.jade +++ b/docs/redesign/blog.jade @@ -13,6 +13,7 @@ mixin Teaser(title, url, date_long, date_short, author, lede) a.readmore(href='#') ► + doctype html html(lang='en') head @@ -71,63 +72,22 @@ html(lang='en') "in syntactic parsing over the last few years. It’s now possible for a" + "tiny Python implementation to perform better than the widely-used Stanford " + "PCFG parser.") + +Teaser( + "A good Part-of-Speech tagger in about 200 lines of Python", + "blog_tagger.html", + "October 11, 2013", + "2013-09-11", + "Matthew Honnibal", + "There are a tonne of “best known techniques” for POS tagging, and you " + + "should ignore the others and just use greedy Averaged Perceptron." + ) - article.post - header - h2 - a(href='#') Another headline - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2013-12-18') December 18, 2013 - p - | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. 
- a.readmore(href='#') ► - article.post - header - h2 - a(href='#') Another headline - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2013-12-18') December 18, 2013 - p - | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. - a.readmore(href='#') ► - article.post - header - h2 - a(href='#') Another headline - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2013-12-18') December 18, 2013 - p - | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. - a.readmore(href='#') ► - .readmore - a.button(href='#') Read more posts section.intro h2 a.permalink(href='#tutorials', name='tutorials') Tutorials - p - | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est. + section.tutorials - details - summary - h4 Tutorial #1: How to do something cool - p - | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. 
- a.readmore(href='#') ► - details - summary - h4 Tutorial #2 - details - summary - h4 Tutorial #3 + include ./tutorials.jade footer(role="contentinfo") span.slogan.copyright © 2015 Syllogism Co. diff --git a/docs/redesign/blog_parser.jade b/docs/redesign/blog_parser.jade index 4930d8d26..e94376e32 100644 --- a/docs/redesign/blog_parser.jade +++ b/docs/redesign/blog_parser.jade @@ -15,7 +15,7 @@ block body_block article.post header - h2 Parsing English with 500 lines of Python + h2 Parsing English in 500 lines of Python .subhead | by a(href='#', rel='author') Matthew Honnibal diff --git a/docs/redesign/comparisons.jade b/docs/redesign/comparisons.jade index a80df8235..c4434db5c 100644 --- a/docs/redesign/comparisons.jade +++ b/docs/redesign/comparisons.jade @@ -1,78 +1,139 @@ +- var urls = {} +- urls.choi_paper = "http://aclweb.org/anthology/P/P15/P15-1038.pdf" +- urls.emnlp_paper = "honnibal_johnson_emnlp2015.pdf" + + +comparison("NLTK") + p spaCy is: + ul + li.pro 100x faster; + li.pro 50% more accurate; + li.pro Serializes TODO% smaller; + + p spaCy features: + ul + li.pro Integrated word vectors; + li.pro Efficient binary serialization; + + p NLTK features: + ul + li.con Multiple languages; + li.neutral Educational resources + + //+comparison("Pattern") +comparison("CoreNLP") + p spaCy is: + + ul + li.pro TODO% faster; + li.pro TODO% more accurate; + li.pro Not Java; + li.pro Well documented; + li.pro Cheaper to license commercially; + li.neutral + | Opinionated/Minimalist. spaCy avoids providing redundant or overlapping + | options. 
+ + p CoreNLP features: + + ul + li.con Multiple Languages; + li.con Sentiment analysis + li.con Coreference resolution + + +comparison("ClearNLP") -//+comparison("OpenNLP") -//+comparison("GATE") + p spaCy is: -+comparison("Accuracy Summary") + ul + li.pro Not Java; + li.pro TODO% faster; + li.pro Well documented; + li.neutral Slightly more accurate; + + p ClearNLP features: + + ul + li.con Semantic Role Labelling + li.con Multiple Languages + li.con Model for biology/life-science; + +//+comparison("Accuracy Summary") + +//+comparison("Speed Summary") +// table +// thead +// tr +// th. +// th(colspan=3) Absolute (ms per doc) +// th(colspan=3) Relative (to spaCy) +// +// tbody +// tr +// td: strong System +// td: strong Split +// td: strong Tag +// td: strong Parse +// td: strong Split +// td: strong Tag +// td: strong Parse +// +// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") +// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") +// +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") +// +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") +// +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") +// +// p +// | Set up: 100,000 plain-text documents were streamed +// | from an SQLite3 database, and processed with an NLP library, to one +// | of three levels of detail – tokenization, tagging, or parsing. +// | The tasks are additive: to parse the text you have to tokenize and +// | tag it. The pre-processing was not subtracted from the times – +// | I report the time required for the pipeline to complete. I report +// | mean times per document, in milliseconds. +// +// p +// | Hardware: Intel i7-3770 (2012) + + + + + ++comparison("Peer-reviewed Evaluations") + p. + spaCy is committed to rigorous evaluation under standard methodology. 
Two + papers in 2015 confirm that: + ol + li spaCy is the fastest syntactic parser in the world; + li Its accuracy is within 1% of the best available; + li The few systems that are more accurate are 20× slower or more. + + p + | spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University, + | as part of a survey paper benchmarking the current state-of-the-art dependency + | parsers + a(href=urls.choi_paper) (Choi et al., 2015) + | . -+comparison("Speed Summary") table thead - tr - th. - th(colspan=3) Absolute (ms per doc) - th(colspan=3) Relative (to spaCy) + +columns("System", "Language", "Accuracy", "Speed") tbody - tr - td: strong System - td: strong Split - td: strong Tag - td: strong Parse - td: strong Split - td: strong Tag - td: strong Parse - - +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") - +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") - +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") - +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") - +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") + +row("spaCy v0.84", "Cython", "90.6", "13,963") + +row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)") + +row("ClearNLP", "Java", "91.7", "10,271") + +row("CoreNLP", "Java", "89.6", "8,602") + +row("MATE", "Java", "92.5", "550") + +row("Turbo", "C++", "92.4", "349") + +row("Yara", "Java", "92.3", "340") p - | Set up: 100,000 plain-text documents were streamed - | from an SQLite3 database, and processed with an NLP library, to one - | of three levels of detail – tokenization, tagging, or parsing. - | The tasks are additive: to parse the text you have to tokenize and - | tag it. The pre-processing was not subtracted from the times – - | I report the time required for the pipeline to complete. I report - | mean times per document, in milliseconds. 
+ | Discussion with the authors led to accuracy improvements in spaCy, which + | have been accepted for publication in EMNLP, in joint work with Macquarie + | University + a(href=urls.emnlp_paper) (Honnibal and Johnson, 2015) + | . - p - | Hardware: Intel i7-3770 (2012) - - - +comparison("Independent Evaluation") - p - | Independent evaluation by Yahoo! Labs and Emory - | University, to appear at ACL 2015. Higher is better. - - table - thead - +columns("System", "Language", "Accuracy", "Speed") - - tbody - +row("spaCy v0.86", "Cython", "91.9", "13,963") - +row("spaCy v0.84", "Cython", "90.6", "13,963") - +row("ClearNLP", "Java", "91.7", "10,271") - +row("CoreNLP", "Java", "89.6", "8,602") - +row("MATE", "Java", "92.5", "550") - +row("Turbo", "C++", "92.4", "349") - +row("Yara", "Java", "92.3", "340") - - p - | Accuracy is % unlabelled arcs correct, speed is tokens per second. - - p - | Joel Tetreault and Amanda Stent (Yahoo! Labs) and Jin-ho Choi (Emory) - | performed a detailed comparison of the best parsers available. - | All numbers above are taken from the pre-print they kindly made - | available to me, except for spaCy v0.86. - - p - | I'm particularly grateful to the authors for discussion of their - | results, which led to the improvement in accuracy between v0.84 and - | v0.86. A tip from Jin-ho developer of ClearNLP) was particularly - | useful. 
diff --git a/docs/redesign/docs.jade b/docs/redesign/docs.jade index 2b5c88760..e098bb0c0 100644 --- a/docs/redesign/docs.jade +++ b/docs/redesign/docs.jade @@ -125,5 +125,5 @@ block body_block article +Section("API", "api", "api.jade") - +Section("Tutorals", "tutorials", "tutorials.jade") + +Section("Tutorials", "tutorials", "tutorials.jade") +Section("Annotation Specifications", "spec", "spec.jade") diff --git a/docs/redesign/home.jade b/docs/redesign/home.jade index a628da2db..66efd1455 100644 --- a/docs/redesign/home.jade +++ b/docs/redesign/home.jade @@ -28,17 +28,6 @@ mixin lede If you're a small company doing NLP, we want spaCy to seem like !{a_minor_miracle}. -mixin overview() - p. - Overview text - -mixin benchmarks() - p. - Benchmarks - -mixin get_started() - p. - Get Started mixin comparison(name) details @@ -78,20 +67,22 @@ block intro_block nav(role="navigation") ul li: a(href="#example-use" class="button") Examples - li: a(href="#online-demo" class="button") Demo li: a(href="#comparisons" class="button") Comparisons + li: a(href="#online-demo" class="button") Try Online li: a(href="#install" class="button") | Install v0.89 + block body_block article(class="page landing-page") +Section("Usage by Example", "example-use", "./usage_examples.jade") + +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") + +Section("Online Demo", "online-demo", "./online_demo.jade") - +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") +Section("Install", "install", "./install.jade") diff --git a/docs/redesign/mixins.jade b/docs/redesign/mixins.jade index 34ad293aa..005149a2b 100644 --- a/docs/redesign/mixins.jade +++ b/docs/redesign/mixins.jade @@ -1,5 +1,5 @@ mixin Section(title_text, link_name, include_file) - h3: a(name=link_name href=link_name) #{title_text} + h3: a(name=link_name) #{title_text} if (link_name == "example-use") include ./usage_examples.jade @@ -15,5 +15,3 @@ mixin Section(title_text, link_name, include_file) 
include ./tutorials.jade else if (link_name == "spec") include ./spec.jade - - diff --git a/docs/redesign/online_demo.jade b/docs/redesign/online_demo.jade index 0e2bbb331..92a61eefc 100644 --- a/docs/redesign/online_demo.jade +++ b/docs/redesign/online_demo.jade @@ -5,7 +5,7 @@ mixin Displacy(sentence, caption_text, height) iframe.displacy(src="displacy/displacy_demo.html" height=height) a.view-displacy(href=url) - | View on displaCy + | Interactive Visualizer p.caption. #{caption_text} diff --git a/docs/redesign/tutorials.jade b/docs/redesign/tutorials.jade index e69de29bb..ad1a4dbc9 100644 --- a/docs/redesign/tutorials.jade +++ b/docs/redesign/tutorials.jade @@ -0,0 +1,29 @@ +mixin Tutorial(title) + details + summary + h4= title + + block + ++Tutorial("Mark-up all manner adverbs, especially for verbs of speech") + | Let's say you're developing a proofreading tool, or possibly an IDE for + | writers. You're convinced by Stephen King's advice that + | adverbs are not your friend + | so you want to + a.readmore(href='tute_adverbs.html') + | highlight all adverbs. ► + ++Tutorial("Search Reddit for comments about Google doing something") + | Example use of the spaCy NLP tools for data exploration. + | Here we will look for Reddit comments that describe Google doing something, + | i.e. discuss the company's actions. This is difficult, because other + | senses of "Google" now dominate usage of the word in conversation, + | particularly references to using Google products. + a.readmore(href='tute_adverbs.html') + | ► + ++Tutorial("Use word vectors for semantic search of Twitter") + | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. + | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. 
+ a.readmore(href='tute_twitter.html') + | ► diff --git a/docs/redesign/usage_examples.jade b/docs/redesign/usage_examples.jade index 418ca9c57..04f29eeb9 100644 --- a/docs/redesign/usage_examples.jade +++ b/docs/redesign/usage_examples.jade @@ -23,7 +23,7 @@ mixin example(name) | hello_id = nlp.vocab.strings['Hello'] | hello_str = nlp.vocab.strings[hello_id] | - | assert token.orth == hello_id == 52 + | assert token.orth == hello_id == 52 | assert token.orth_ == hello_str == 'Hello' +example("Get and set string views and flags") @@ -66,51 +66,102 @@ mixin example(name) +example("Part-of-speech tags") pre.language-python: code - | doc[0].pos - | doc[0].tag + | from spacy.parts_of_speech import ADV + | + | def is_adverb(token): + | return token.pos == spacy.parts_of_speech.ADV + | + | # These are data-specific, so no constants are provided. You have to look + | # up the IDs from the StringStore. + | NNS = nlp.vocab.strings['NNS'] + | NNPS = nlp.vocab.strings['NNPS'] + | def is_plural_noun(token): + | return token.tag == NNS or token.tag == NNPS + | + | def print_coarse_pos(token): + | print(token.pos_) + | + | def print_fine_pos(token): + | print(token.tag_) +example("Syntactic dependencies") pre.language-python: code - | for head in tokens: - | for child in head.lefts: - | assert child.head is head - | for child in head.rights: - | assert child.head is head - | sent = nlp('The four wheels on the bus turned quickly.') - | wheels = sent[2] - | bus = sent[5] - | assert len(list(wheels.lefts)) == 2 - | assert len(list(wheels.rights)) == 1 - | assert len(list(wheels.children)) == 3 - | assert len(list(bus.lefts)) == 1 - | assert len(list(bus.rights)) == 0 - | assert len(list(bus.children)) == 1 - | - | assert len(list(wheels.subtree)) == 6 + | def dependency_labels_to_root(token): + | '''Walk up the syntactic tree, collecting the arc labels.''' + | dep_labels = [] + | while token.root is not token: + | dep_labels.append(token.dep) + | token = token.head + | return 
dep_labels +example("Named entities") pre.language-python: code - | doc.ents - | token.ent_type - | token.ent_iob + | def iter_products(docs): + | for doc in docs: + | for ent in doc.ents: + | if ent.label_ == 'PRODUCT': + | yield ent + | + | def word_is_in_entity(word): + | return word.ent_type != 0 + | + | def count_parent_verb_by_person(docs): + | counts = defaultdict(defaultdict(int)) + | for doc in docs: + | for ent in doc.ents: + | if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: + | counts[ent.orth_][ent.root.head.lemma_] += 1 + | return counts + + //+example("Define custom NER rules") + // pre.language-python: code + // | nlp.matcher -+example("Define custom NER rules") - pre.language-python: code - | nlp.matcher +example("Calculate inline mark-up on original string") pre.language-python: code - | token.string - | token.spacy - | token.whitespace_ + | def put_spans_around_tokens(doc, get_classes): + | '''Given some function to compute class names, put each token in a + | span element, with the appropriate classes computed. + | + | All whitespace is preserved, outside of the spans. (Yes, I know HTML + | won't display it. But the point is no information is lost, so you can + | calculate what you need, e.g.
<br /> tags, <p>
tags, etc.) + | ''' + | output = [] + | template = '<span classes="{classes}">{word}</span>{space}' + | for token in doc: + | if token.is_space: + | output.append(token.orth_) + | else: + | output.append( + | template.format( + | classes=' '.join(get_classes(token)), + | word=token.orth_, + | space=token.whitespace_)) + | string = ''.join(output) + | string = string.replace('\n', '
<br />') + | string = string.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;') + | return string + +example("Efficient binary serialization") pre.language-python: code - + | | byte_string = doc.as_bytes() | open('/tmp/moby_dick.bin', 'wb').write(byte_string) - + | | nlp = spacy.en.English() | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')): | doc = Doc(nlp.vocab) | doc.from_bytes(byte_string) + + +p + | See the + a(href="docs.html") docs page + | for + a(href="docs.html#api") API documentation, + a(href="docs.html#tutorials") tutorials, + | and + a(href="docs.html#spec") annotation specs.