mirror of https://github.com/explosion/spaCy.git
* More work on docs
This commit is contained in:
parent
c767ab9fdf
commit
1db080047b
|
@ -1,22 +1,33 @@
|
||||||
- var slogan = "Build Tomorrow's Language Technologies"
|
extends ./outline.jade
|
||||||
- var tag_line = "spaCy – #{slogan}"
|
|
||||||
- var a_minor_miracle = '<a href="">a minor miracle</a>'
|
// Notes
|
||||||
|
//
|
||||||
|
// 1. Where to put version notice? Should say something like
|
||||||
|
// 2015-08-12: v0.89
|
||||||
|
// and be a link
|
||||||
|
//
|
||||||
|
// Only needs to appear on home page.
|
||||||
|
|
||||||
|
|
||||||
|
- var slogan = "Build Tomorrow's Language Technologies"
|
||||||
|
- var tag_line = "spaCy – " + slogan
|
||||||
|
|
||||||
|
mixin lede
|
||||||
|
- var state_of_the_art = '<a href="#">state-of-the-art</a>'
|
||||||
|
- var a_minor_miracle = '<a href="">a minor miracle</a>'
|
||||||
|
- var great_documentation = '<a href="">great documentation</a>'
|
||||||
|
|
||||||
mixin lede()
|
|
||||||
p.
|
p.
|
||||||
<strong>spaCy</strong> is a library for industrial-strength NLP in Python and
|
<a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a
|
||||||
Cython. It features state-of-the-art speed and accuracy, a concise API, and
|
library for industrial-strength NLP in Python and Cython. It features
|
||||||
great documentation. If you're a small company doing NLP, we want spaCy to
|
!{state_of_the_art} speed and accuracy, a concise API, and great documentation.
|
||||||
seem like !{a_minor_miracle}.
|
If you're a small company doing NLP, we want <strong>spaCy</strong> to seem
|
||||||
|
like !{a_minor_miracle}.
|
||||||
|
|
||||||
mixin overview()
|
mixin overview()
|
||||||
p.
|
p.
|
||||||
Overview text
|
Overview text
|
||||||
|
|
||||||
mixin example()
|
|
||||||
p.
|
|
||||||
Example text
|
|
||||||
|
|
||||||
mixin benchmarks()
|
mixin benchmarks()
|
||||||
p.
|
p.
|
||||||
Benchmarks
|
Benchmarks
|
||||||
|
@ -25,18 +36,11 @@ mixin get_started()
|
||||||
p.
|
p.
|
||||||
Get Started
|
Get Started
|
||||||
|
|
||||||
mixin example(name)
|
|
||||||
details
|
|
||||||
summary
|
|
||||||
span(class="example-name")= name
|
|
||||||
|
|
||||||
block
|
|
||||||
|
|
||||||
mixin comparison(name)
|
mixin comparison(name)
|
||||||
details
|
details
|
||||||
summary
|
summary
|
||||||
h4
|
h4= name
|
||||||
name
|
|
||||||
|
|
||||||
block
|
block
|
||||||
|
|
||||||
|
@ -52,215 +56,51 @@ mixin row(...cells)
|
||||||
td= cell
|
td= cell
|
||||||
|
|
||||||
|
|
||||||
doctype html
|
mixin social
|
||||||
html(lang="en")
|
footer(role="contentinfo")
|
||||||
head
|
a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter
|
||||||
meta(charset="utf-8")
|
|
||||||
title!= tag_line
|
|
||||||
meta(name="description" content="")
|
|
||||||
meta(name="author" content="Matthew Honnibal")
|
|
||||||
link(rel="stylesheet" href="css/style.css")
|
|
||||||
<!--[if lt IE 9]>
|
|
||||||
script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
|
|
||||||
<![endif]-->
|
|
||||||
|
|
||||||
body(id="page" role="document")
|
div.discuss
|
||||||
header(role="banner")
|
a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn")
|
||||||
h1(class="logo")!= tag_line
|
| Discuss on Hacker News
|
||||||
div(class="slogan")!= slogan
|
|
||||||
|
|
||||||
nav(role="navigation")
|
a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit")
|
||||||
ul
|
| Discuss on Reddit
|
||||||
li: a(href="#") Home
|
|
||||||
li: a(href="#") Docs
|
|
||||||
li: a(href="#") License
|
|
||||||
li: a(href="#") Blog
|
|
||||||
|
|
||||||
main(id="content" role="main")
|
|
||||||
|
mixin Section(title_text, link_name, include_file)
|
||||||
|
a(name=link_name): h3 #{title_text}
|
||||||
|
|
||||||
|
if (link_name == "example-use")
|
||||||
|
include ./usage_examples.jade
|
||||||
|
else if (link_name == "online-demo")
|
||||||
|
include ./online_demo.jade
|
||||||
|
else if (link_name == "comparisons")
|
||||||
|
include ./comparisons.jade
|
||||||
|
else if (link_name == "install")
|
||||||
|
include ./installation.jade
|
||||||
|
|
||||||
|
|
||||||
|
block intro_block
|
||||||
section(class="intro")
|
section(class="intro")
|
||||||
+lede
|
+lede
|
||||||
|
|
||||||
nav(role="navigation")
|
nav(role="navigation")
|
||||||
ul
|
ul
|
||||||
li: a(href="#overview" class="button") Examples
|
li: a(href="#example-use" class="button") Examples
|
||||||
li: a(href="#overview" class="button") Comparisons
|
li: a(href="#online-demo" class="button") Demo
|
||||||
li: a(href="#example-use" class="button") Demo
|
li: a(href="#comparisons" class="button") Comparisons
|
||||||
li: a(href="#get-started" class="button") Install
|
li: a(href="#install" class="button") Install v0.89
|
||||||
|
|
||||||
|
|
||||||
|
block body_block
|
||||||
article(class="page landing-page")
|
article(class="page landing-page")
|
||||||
a(name="example-use"): h3 Usage by Example
|
|
||||||
|
|
||||||
+example("Load resources and process text")
|
+Section("Usage by Example", "example-use", "./usage_examples.jade")
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| from __future__ import unicode_literals, print_function
|
|
||||||
| from spacy.en import English
|
|
||||||
| nlp = English()
|
|
||||||
| doc = nlp('Hello, world. Here are two sentences.')
|
|
||||||
|
|
||||||
+example("Get tokens and sentences")
|
+Section("Online Demo", "online-demo", "./online_demo.jade")
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| token = doc[0]
|
|
||||||
| sentence = doc.sents[0]
|
|
||||||
| assert token[0] is sentence[0]
|
|
||||||
|
|
||||||
+example("Use integer IDs for any string")
|
+Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade")
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| hello_id = nlp.vocab.strings['Hello']
|
|
||||||
| hello_str = nlp.vocab.strings[hello_id]
|
|
||||||
|
|
|
||||||
| assert token.orth == hello_id == 52
|
|
||||||
| assert token.orth_ == hello_str == 'Hello'
|
|
||||||
|
|
||||||
+example("Get and set string views and flags")
|
+Section("Install", "install", "./install.jade")
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| assert token.shape_ == 'Xxxx'
|
|
||||||
| for lexeme in nlp.vocab:
|
|
||||||
| if lexeme.is_alpha:
|
|
||||||
| lexeme.shape_ = 'W'
|
|
||||||
| elif lexeme.is_digit:
|
|
||||||
| lexeme.shape_ = 'D'
|
|
||||||
| elif lexeme.is_punct:
|
|
||||||
| lexeme.shape_ = 'P'
|
|
||||||
| else:
|
|
||||||
| lexeme.shape_ = 'M'
|
|
||||||
| assert token.shape_ == 'W'
|
|
||||||
|
|
||||||
+example("Export to numpy arrays")
|
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| Do me
|
|
||||||
|
|
||||||
+example("Word vectors")
|
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| Do me
|
|
||||||
|
|
||||||
+example("Part-of-speech tags")
|
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| Do me
|
|
||||||
|
|
||||||
+example("Syntactic dependencies")
|
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| Do me
|
|
||||||
|
|
||||||
+example("Named entities")
|
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| Do me
|
|
||||||
|
|
||||||
+example("Define custom NER rules")
|
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| Do me
|
|
||||||
|
|
||||||
+example("Calculate inline mark-up on original string")
|
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| Do me
|
|
||||||
|
|
||||||
+example("Efficient binary serialization")
|
|
||||||
pre.language-python
|
|
||||||
code
|
|
||||||
| Do me
|
|
||||||
|
|
||||||
a(name="benchmarks"): h3 Benchmarks
|
|
||||||
|
|
||||||
+comparison("spaCy vs. NLTK")
|
|
||||||
+comparison("spaCy vs. Pattern")
|
|
||||||
+comparison("spaCy vs. CoreNLP")
|
|
||||||
+comparison("spaCy vs. ClearNLP")
|
|
||||||
+comparison("spaCy vs. OpenNLP")
|
|
||||||
+comparison("spaCy vs. GATE")
|
|
||||||
|
|
||||||
details
|
|
||||||
summary: h4 Independent Evaluation
|
|
||||||
|
|
||||||
p
|
|
||||||
| Independent evaluation by Yahoo! Labs and Emory
|
|
||||||
| University, to appear at ACL 2015. Higher is better.
|
|
||||||
|
|
||||||
table
|
|
||||||
thead
|
|
||||||
+columns("System", "Language", "Accuracy", "Speed")
|
|
||||||
|
|
||||||
tbody
|
|
||||||
+row("spaCy v0.86", "Cython", "91.9", "13,963")
|
|
||||||
+row("spaCy v0.84", "Cython", "90.6", "13,963")
|
|
||||||
+row("ClearNLP", "Java", "91.7", "10,271")
|
|
||||||
+row("CoreNLP", "Java", "89.6", "8,602")
|
|
||||||
+row("MATE", "Java", "92.5", "550")
|
|
||||||
+row("Turbo", "C++", "92.4", "349")
|
|
||||||
+row("Yara", "Java", "92.3", "340")
|
|
||||||
|
|
||||||
p
|
|
||||||
| Accuracy is % unlabelled arcs correct, speed is tokens per second.
|
|
||||||
|
|
||||||
p
|
|
||||||
| Joel Tetreault and Amanda Stent (Yahoo! Labs) and Jin-ho Choi (Emory)
|
|
||||||
| performed a detailed comparison of the best parsers available.
|
|
||||||
| All numbers above are taken from the pre-print they kindly made
|
|
||||||
| available to me, except for spaCy v0.86.
|
|
||||||
|
|
||||||
p
|
|
||||||
| I'm particularly grateful to the authors for discussion of their
|
|
||||||
| results, which led to the improvement in accuracy between v0.84 and
|
|
||||||
| v0.86. A tip from Jin-ho (developer of ClearNLP) was particularly
|
|
||||||
| useful.
|
|
||||||
|
|
||||||
details
|
|
||||||
summary: h4 Detailed Accuracy Comparison
|
|
||||||
|
|
||||||
details
|
|
||||||
summary: h4 Detailed Speed Comparison
|
|
||||||
|
|
||||||
table
|
|
||||||
thead
|
|
||||||
tr
|
|
||||||
th.
|
|
||||||
th(colspan=3) Absolute (ms per doc)
|
|
||||||
th(colspan=3) Relative (to spaCy)
|
|
||||||
|
|
||||||
tbody
|
|
||||||
tr
|
|
||||||
td: strong System
|
|
||||||
td: strong Split
|
|
||||||
td: strong Tag
|
|
||||||
td: strong Parse
|
|
||||||
td: strong Split
|
|
||||||
td: strong Tag
|
|
||||||
td: strong Parse
|
|
||||||
|
|
||||||
+row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
|
|
||||||
+row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x")
|
|
||||||
+row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x")
|
|
||||||
+row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x")
|
|
||||||
+row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a")
|
|
||||||
|
|
||||||
p
|
|
||||||
| <strong>Set up</strong>: 100,000 plain-text documents were streamed
|
|
||||||
| from an SQLite3 database, and processed with an NLP library, to one
|
|
||||||
| of three levels of detail – tokenization, tagging, or parsing.
|
|
||||||
| The tasks are additive: to parse the text you have to tokenize and
|
|
||||||
| tag it. The pre-processing was not subtracted from the times –
|
|
||||||
| I report the time required for the pipeline to complete. I report
|
|
||||||
| mean times per document, in milliseconds.
|
|
||||||
|
|
||||||
p
|
|
||||||
| <strong>Hardware</strong>: Intel i7-3770 (2012)
|
|
||||||
|
|
||||||
a(name="get-started"): h3 Get started
|
|
||||||
|
|
||||||
+get_started
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
footer(role="contentinfo")
|
|
||||||
|
|
||||||
script(src="js/prism.js")
|
|
||||||
|
|
Loading…
Reference in New Issue