diff --git a/docs/redesign/blog.jade b/docs/redesign/blog.jade
index 119a5aad9..8a712267d 100644
--- a/docs/redesign/blog.jade
+++ b/docs/redesign/blog.jade
@@ -13,6 +13,7 @@ mixin Teaser(title, url, date_long, date_short, author, lede)
a.readmore(href='#') ►
+
doctype html
html(lang='en')
head
@@ -71,63 +72,22 @@ html(lang='en')
"in syntactic parsing over the last few years. It’s now possible for a" +
"tiny Python implementation to perform better than the widely-used Stanford " +
"PCFG parser.")
+ +Teaser(
+ "A good Part-of-Speech tagger in about 200 lines of Python",
+ "blog_tagger.html",
+      "October 11, 2013",
+      "2013-10-11",
+ "Matthew Honnibal",
+ "There are a tonne of “best known techniques” for POS tagging, and you " +
+ "should ignore the others and just use greedy Averaged Perceptron."
+ )
- article.post
- header
- h2
- a(href='#') Another headline
- .subhead
- | by
- a(href='#', rel='author') Matthew Honnibal
- | on
- time(datetime='2013-12-18') December 18, 2013
- p
- | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
- a.readmore(href='#') ►
- article.post
- header
- h2
- a(href='#') Another headline
- .subhead
- | by
- a(href='#', rel='author') Matthew Honnibal
- | on
- time(datetime='2013-12-18') December 18, 2013
- p
- | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
- a.readmore(href='#') ►
- article.post
- header
- h2
- a(href='#') Another headline
- .subhead
- | by
- a(href='#', rel='author') Matthew Honnibal
- | on
- time(datetime='2013-12-18') December 18, 2013
- p
- | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
- a.readmore(href='#') ►
- .readmore
- a.button(href='#') Read more posts
section.intro
h2
a.permalink(href='#tutorials', name='tutorials') Tutorials
- p
- | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est.
+
section.tutorials
- details
- summary
- h4 Tutorial #1: How to do something cool
- p
- | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser.
- a.readmore(href='#') ►
- details
- summary
- h4 Tutorial #2
- details
- summary
- h4 Tutorial #3
+ include ./tutorials.jade
footer(role="contentinfo")
span.slogan.copyright © 2015 Syllogism Co.
diff --git a/docs/redesign/blog_parser.jade b/docs/redesign/blog_parser.jade
index 4930d8d26..e94376e32 100644
--- a/docs/redesign/blog_parser.jade
+++ b/docs/redesign/blog_parser.jade
@@ -15,7 +15,7 @@ block body_block
article.post
header
- h2 Parsing English with 500 lines of Python
+ h2 Parsing English in 500 lines of Python
.subhead
| by
a(href='#', rel='author') Matthew Honnibal
diff --git a/docs/redesign/comparisons.jade b/docs/redesign/comparisons.jade
index a80df8235..c4434db5c 100644
--- a/docs/redesign/comparisons.jade
+++ b/docs/redesign/comparisons.jade
@@ -1,78 +1,139 @@
+- var urls = {}
+- urls.choi_paper = "http://aclweb.org/anthology/P/P15/P15-1038.pdf"
+- urls.emnlp_paper = "honnibal_johnson_emnlp2015.pdf"
+
+
+comparison("NLTK")
+ p spaCy is:
+ ul
+ li.pro 100x faster;
+ li.pro 50% more accurate;
+ li.pro Serializes TODO% smaller;
+
+ p spaCy features:
+ ul
+ li.pro Integrated word vectors;
+ li.pro Efficient binary serialization;
+
+ p NLTK features:
+ ul
+ li.con Multiple languages;
+ li.neutral Educational resources
+
+
//+comparison("Pattern")
+comparison("CoreNLP")
+ p spaCy is:
+
+ ul
+ li.pro TODO% faster;
+ li.pro TODO% more accurate;
+ li.pro Not Java;
+ li.pro Well documented;
+ li.pro Cheaper to license commercially;
+ li.neutral
+ | Opinionated/Minimalist. spaCy avoids providing redundant or overlapping
+ | options.
+
+ p CoreNLP features:
+
+ ul
+ li.con Multiple Languages;
+ li.con Sentiment analysis
+ li.con Coreference resolution
+
+
+comparison("ClearNLP")
-//+comparison("OpenNLP")
-//+comparison("GATE")
+ p spaCy is:
-+comparison("Accuracy Summary")
+ ul
+ li.pro Not Java;
+ li.pro TODO% faster;
+ li.pro Well documented;
+ li.neutral Slightly more accurate;
+
+ p ClearNLP features:
+
+ ul
+ li.con Semantic Role Labelling
+ li.con Multiple Languages
+ li.con Model for biology/life-science;
+
+//+comparison("Accuracy Summary")
+
+//+comparison("Speed Summary")
+// table
+// thead
+// tr
+// th.
+// th(colspan=3) Absolute (ms per doc)
+// th(colspan=3) Relative (to spaCy)
+//
+// tbody
+// tr
+// td: strong System
+// td: strong Split
+// td: strong Tag
+// td: strong Parse
+// td: strong Split
+// td: strong Tag
+// td: strong Parse
+//
+// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
+// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
+// +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x")
+// +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x")
+// +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a")
+//
+// p
+// | Set up: 100,000 plain-text documents were streamed
+// | from an SQLite3 database, and processed with an NLP library, to one
+// | of three levels of detail – tokenization, tagging, or parsing.
+// | The tasks are additive: to parse the text you have to tokenize and
+// | tag it. The pre-processing was not subtracted from the times –
+// | I report the time required for the pipeline to complete. I report
+// | mean times per document, in milliseconds.
+//
+// p
+// | Hardware: Intel i7-3770 (2012)
+
+
+
+
+
++comparison("Peer-reviewed Evaluations")
+ p.
+ spaCy is committed to rigorous evaluation under standard methodology. Two
+ papers in 2015 confirm that:
+ ol
+ li spaCy is the fastest syntactic parser in the world;
+ li Its accuracy is within 1% of the best available;
+ li The few systems that are more accurate are 20× slower or more.
+
+ p
+ | spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University,
+ | as part of a survey paper benchmarking the current state-of-the-art dependency
+ | parsers
+ a(href=urls.choi_paper) (Choi et al., 2015)
+ | .
-+comparison("Speed Summary")
table
thead
- tr
- th.
- th(colspan=3) Absolute (ms per doc)
- th(colspan=3) Relative (to spaCy)
+ +columns("System", "Language", "Accuracy", "Speed")
tbody
- tr
- td: strong System
- td: strong Split
- td: strong Tag
- td: strong Parse
- td: strong Split
- td: strong Tag
- td: strong Parse
-
- +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
- +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
- +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x")
- +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x")
- +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a")
+ +row("spaCy v0.84", "Cython", "90.6", "13,963")
+ +row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)")
+ +row("ClearNLP", "Java", "91.7", "10,271")
+ +row("CoreNLP", "Java", "89.6", "8,602")
+ +row("MATE", "Java", "92.5", "550")
+ +row("Turbo", "C++", "92.4", "349")
+ +row("Yara", "Java", "92.3", "340")
p
- | Set up: 100,000 plain-text documents were streamed
- | from an SQLite3 database, and processed with an NLP library, to one
- | of three levels of detail – tokenization, tagging, or parsing.
- | The tasks are additive: to parse the text you have to tokenize and
- | tag it. The pre-processing was not subtracted from the times –
- | I report the time required for the pipeline to complete. I report
- | mean times per document, in milliseconds.
+ | Discussion with the authors led to accuracy improvements in spaCy, which
+ | have been accepted for publication in EMNLP, in joint work with Macquarie
+ | University
+ a(href=urls.emnlp_paper) (Honnibal and Johnson, 2015)
+ | .
- p
- | Hardware: Intel i7-3770 (2012)
-
-
- +comparison("Independent Evaluation")
- p
- | Independent evaluation by Yahoo! Labs and Emory
- | University, to appear at ACL 2015. Higher is better.
-
- table
- thead
- +columns("System", "Language", "Accuracy", "Speed")
-
- tbody
- +row("spaCy v0.86", "Cython", "91.9", "13,963")
- +row("spaCy v0.84", "Cython", "90.6", "13,963")
- +row("ClearNLP", "Java", "91.7", "10,271")
- +row("CoreNLP", "Java", "89.6", "8,602")
- +row("MATE", "Java", "92.5", "550")
- +row("Turbo", "C++", "92.4", "349")
- +row("Yara", "Java", "92.3", "340")
-
- p
- | Accuracy is % unlabelled arcs correct, speed is tokens per second.
-
- p
- | Joel Tetreault and Amanda Stent (Yahoo! Labs) and Jin-ho Choi (Emory)
- | performed a detailed comparison of the best parsers available.
- | All numbers above are taken from the pre-print they kindly made
- | available to me, except for spaCy v0.86.
-
- p
- | I'm particularly grateful to the authors for discussion of their
- | results, which led to the improvement in accuracy between v0.84 and
- | v0.86. A tip from Jin-ho developer of ClearNLP) was particularly
- | useful.
diff --git a/docs/redesign/docs.jade b/docs/redesign/docs.jade
index 2b5c88760..e098bb0c0 100644
--- a/docs/redesign/docs.jade
+++ b/docs/redesign/docs.jade
@@ -125,5 +125,5 @@ block body_block
article
+Section("API", "api", "api.jade")
- +Section("Tutorals", "tutorials", "tutorials.jade")
+ +Section("Tutorials", "tutorials", "tutorials.jade")
+Section("Annotation Specifications", "spec", "spec.jade")
diff --git a/docs/redesign/home.jade b/docs/redesign/home.jade
index a628da2db..66efd1455 100644
--- a/docs/redesign/home.jade
+++ b/docs/redesign/home.jade
@@ -28,17 +28,6 @@ mixin lede
If you're a small company doing NLP, we want spaCy to seem
like !{a_minor_miracle}.
-mixin overview()
- p.
- Overview text
-
-mixin benchmarks()
- p.
- Benchmarks
-
-mixin get_started()
- p.
- Get Started
mixin comparison(name)
details
@@ -78,20 +67,22 @@ block intro_block
nav(role="navigation")
ul
li: a(href="#example-use" class="button") Examples
- li: a(href="#online-demo" class="button") Demo
li: a(href="#comparisons" class="button") Comparisons
+ li: a(href="#online-demo" class="button") Try Online
li: a(href="#install" class="button")
| Install
tags,
tags, etc.)
+ | '''
+ | output = []
+      | template = '<span class="{classes}">{word}</span>{space}'
+ | for token in doc:
+ | if token.is_space:
+ | output.append(token.orth_)
+ | else:
+ | output.append(
+ | template.format(
+ | classes=' '.join(get_classes(token)),
+ | word=token.orth_,
+ | space=token.whitespace_))
+ | string = ''.join(output)
+      | string = string.replace('\n', '<br>')
+      | string = string.replace('\t', '&nbsp;&nbsp;')
+ | return string
+
+example("Efficient binary serialization")
pre.language-python: code
-
+ |
| byte_string = doc.as_bytes()
| open('/tmp/moby_dick.bin', 'wb').write(byte_string)
-
+ |
| nlp = spacy.en.English()
| for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')):
| doc = Doc(nlp.vocab)
| doc.from_bytes(byte_string)
+
+
+p
+ | See the
+ a(href="docs.html") docs page
+ | for
+ a(href="docs.html#api") API documentation,
+ a(href="docs.html#tutorials") tutorials,
+ | and
+ a(href="docs.html#spec") annotation specs.