From 005074c31eec65984570579b047dce000509c228 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 13 Aug 2015 15:49:33 +0200 Subject: [PATCH] * Add post introducing spaCy --- docs/redesign/blog_intro.jade | 93 +++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 docs/redesign/blog_intro.jade diff --git a/docs/redesign/blog_intro.jade b/docs/redesign/blog_intro.jade new file mode 100644 index 000000000..8b56d7daf --- /dev/null +++ b/docs/redesign/blog_intro.jade @@ -0,0 +1,93 @@ +- + var urls = { + 'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/', + 'google_ngrams': "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html", + 'implementation': 'https://gist.github.com/syllog1sm/10343947', + 'redshift': 'http://github.com/syllog1sm/redshift', + 'tasker': 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm', + 'acl_anthology': 'http://aclweb.org/anthology/', + 'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal' + } + + +- var my_research_software = 'my research software' + +- var how_to_write_a_POS_tagger = 'how to write a part-of-speech tagger' + +- var parser_lnk = 'parser' + +- var buy_a_commercial_license = 'buy a commercial license' + +doctype html +html(lang='en') + head + meta(charset='utf-8') + title spaCy Blog + meta(name='description', content='') + meta(name='author', content='Matthew Honnibal') + link(rel='stylesheet', href='css/style.css') + //if lt IE 9 + script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') + body#blog + header(role='banner') + h1.logo spaCy Blog + .slogan Blog + main#content(role='main') + article.post + p. + spaCy is a new library for text processing in Python + and Cython. I wrote it because I think small companies are terrible at + natural language processing (NLP). 
Or rather: small companies are using + terrible NLP technology. + + p. + To do great NLP, you have to know a little about linguistics, a lot + about machine learning, and almost everything about the latest research. + The people who fit this description seldom join small companies. + Most are broke – they've just finished grad school. + If they don't want to stay in academia, they join Google, IBM, etc. + + p. + The net result is that outside of the tech giants, commercial NLP has + changed little in the last ten years. In academia, it's changed entirely. + Amazing improvements in quality. Orders of magnitude faster. But the + academic code is always GPL, undocumented, unusable, or all three. + You could implement the ideas yourself, but the papers are hard to read, + and training data is exorbitantly expensive. So what are you left with? + A common answer is NLTK, which was written primarily as an educational resource. + Nothing past the tokenizer is suitable for production use. + + p. + I used to think that the NLP community just needed to do more to communicate + its findings to software engineers. So I wrote two blog posts, explaining + !{how_to_write_a_POS_tagger} and !{parser_lnk}. Both were well + received, and there's been a bit of interest in !{my_research_software} + – even though it's entirely undocumented, and mostly unusable to + anyone but me. + p. + So six months ago I quit my post-doc, and I've been working day and night + on spaCy since. I'm now pleased to announce an alpha release. + + p. + If you're a small company doing NLP, I think spaCy will seem like a minor + miracle. It's by far the fastest NLP software ever released. The + full processing pipeline completes in 20ms per document, including accurate + tagging and parsing. All strings are mapped to integer IDs, tokens are + linked to embedded word representations, and a range of useful features + are pre-calculated and cached. + + p. 
+ If none of that made any sense to you, here's the gist of it. Computers + don't understand text. This is unfortunate, because that's what the + web almost entirely consists of. We want to recommend people text based + on other text they liked. We want to shorten text to display it on a + mobile screen. We want to aggregate it, link it, filter it, categorise + it, generate it and correct it. + + p. + spaCy provides a library of utility functions that help programmers + build such products. It's commercial open source software: you can + either use it under the AGPL, or you can !{buy_a_commercial_license} + under generous terms. + + footer(role='contentinfo')