mirror of https://github.com/explosion/spaCy.git
* Remove docs
This commit is contained in:
parent
dcc8fadc7e
commit
ffbf9e9ca5
177
docs/Makefile
177
docs/Makefile
|
@ -1,177 +0,0 @@
|
|||
# Makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line.
|
||||
SPHINXOPTS =
|
||||
SPHINXBUILD = sphinx-build
|
||||
PAPER =
|
||||
BUILDDIR = ../../docs-spacy
|
||||
|
||||
# User-friendly check for sphinx-build
|
||||
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
|
||||
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
|
||||
endif
|
||||
|
||||
# Internal variables.
|
||||
PAPEROPT_a4 = -D latex_paper_size=a4
|
||||
PAPEROPT_letter = -D latex_paper_size=letter
|
||||
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
|
||||
# the i18n builder cannot share the environment and doctrees with the others
|
||||
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
|
||||
|
||||
# Every target in this file is a command rather than a file, so all of them are
# declared phony; otherwise a stray file or directory with the same name would
# make the target look up to date and silently skip its recipe.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info gettext changes linkcheck doctest xml pseudoxml
|
||||
|
||||
help:
|
||||
@echo "Please use \`make <target>' where <target> is one of"
|
||||
@echo " html to make standalone HTML files"
|
||||
@echo " dirhtml to make HTML files named index.html in directories"
|
||||
@echo " singlehtml to make a single large HTML file"
|
||||
@echo " pickle to make pickle files"
|
||||
@echo " json to make JSON files"
|
||||
@echo " htmlhelp to make HTML files and a HTML help project"
|
||||
@echo " qthelp to make HTML files and a qthelp project"
|
||||
@echo " devhelp to make HTML files and a Devhelp project"
|
||||
@echo " epub to make an epub"
|
||||
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
|
||||
@echo " latexpdf to make LaTeX files and run them through pdflatex"
|
||||
@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
|
||||
@echo " text to make text files"
|
||||
@echo " man to make manual pages"
|
||||
@echo " texinfo to make Texinfo files"
|
||||
@echo " info to make Texinfo files and run them through makeinfo"
|
||||
@echo " gettext to make PO message catalogs"
|
||||
@echo " changes to make an overview of all changed/added/deprecated items"
|
||||
@echo " xml to make Docutils-native XML files"
|
||||
@echo " pseudoxml to make pseudoxml-XML files for display purposes"
|
||||
@echo " linkcheck to check all external links for integrity"
|
||||
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
|
||||
|
||||
clean:
|
||||
# Guard the destructive glob: if BUILDDIR is overridden to an empty value the
# command would otherwise expand to `rm -rf /*`, so abort with an error instead.
$(if $(strip $(BUILDDIR)),rm -rf $(BUILDDIR)/*,$(error BUILDDIR is empty; refusing to clean))
|
||||
|
||||
html:
|
||||
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
|
||||
@echo
|
||||
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
|
||||
|
||||
dirhtml:
|
||||
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
|
||||
@echo
|
||||
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
|
||||
|
||||
singlehtml:
|
||||
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
|
||||
@echo
|
||||
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
|
||||
|
||||
pickle:
|
||||
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
|
||||
@echo
|
||||
@echo "Build finished; now you can process the pickle files."
|
||||
|
||||
json:
|
||||
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
|
||||
@echo
|
||||
@echo "Build finished; now you can process the JSON files."
|
||||
|
||||
htmlhelp:
|
||||
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
|
||||
@echo
|
||||
@echo "Build finished; now you can run HTML Help Workshop with the" \
|
||||
".hhp project file in $(BUILDDIR)/htmlhelp."
|
||||
|
||||
qthelp:
|
||||
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
|
||||
@echo
|
||||
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
|
||||
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
|
||||
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/spaCy.qhcp"
|
||||
@echo "To view the help file:"
|
||||
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/spaCy.qhc"
|
||||
|
||||
devhelp:
|
||||
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
|
||||
@echo
|
||||
@echo "Build finished."
|
||||
@echo "To view the help file:"
|
||||
@echo "# mkdir -p $$HOME/.local/share/devhelp/spaCy"
|
||||
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/spaCy"
|
||||
@echo "# devhelp"
|
||||
|
||||
epub:
|
||||
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
|
||||
@echo
|
||||
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
|
||||
|
||||
latex:
|
||||
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||
@echo
|
||||
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
|
||||
@echo "Run \`make' in that directory to run these through (pdf)latex" \
|
||||
"(use \`make latexpdf' here to do that automatically)."
|
||||
|
||||
latexpdf:
|
||||
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||
@echo "Running LaTeX files through pdflatex..."
|
||||
$(MAKE) -C $(BUILDDIR)/latex all-pdf
|
||||
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
||||
|
||||
latexpdfja:
|
||||
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
|
||||
@echo "Running LaTeX files through platex and dvipdfmx..."
|
||||
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
|
||||
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
|
||||
|
||||
text:
|
||||
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
|
||||
@echo
|
||||
@echo "Build finished. The text files are in $(BUILDDIR)/text."
|
||||
|
||||
man:
|
||||
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
|
||||
@echo
|
||||
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
|
||||
|
||||
texinfo:
|
||||
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
||||
@echo
|
||||
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
|
||||
@echo "Run \`make' in that directory to run these through makeinfo" \
|
||||
"(use \`make info' here to do that automatically)."
|
||||
|
||||
info:
|
||||
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
|
||||
@echo "Running Texinfo files through makeinfo..."
|
||||
# Recurse via $(MAKE), as the latexpdf rules do, so that flags such as -n and -j
# (and the jobserver) are passed on to the sub-make.
$(MAKE) -C $(BUILDDIR)/texinfo info
|
||||
@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
|
||||
|
||||
gettext:
|
||||
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
|
||||
@echo
|
||||
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
|
||||
|
||||
changes:
|
||||
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
|
||||
@echo
|
||||
@echo "The overview file is in $(BUILDDIR)/changes."
|
||||
|
||||
linkcheck:
|
||||
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
|
||||
@echo
|
||||
@echo "Link check complete; look for any errors in the above output " \
|
||||
"or in $(BUILDDIR)/linkcheck/output.txt."
|
||||
|
||||
doctest:
|
||||
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
|
||||
@echo "Testing of doctests in the sources finished, look at the " \
|
||||
"results in $(BUILDDIR)/doctest/output.txt."
|
||||
|
||||
xml:
|
||||
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
|
||||
@echo
|
||||
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
|
||||
|
||||
pseudoxml:
|
||||
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
|
||||
@echo
|
||||
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
|
|
@ -1,271 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# spaCy documentation build configuration file, created by
|
||||
# sphinx-quickstart on Thu Sep 25 17:47:15 2014.
|
||||
#
|
||||
# This file is execfile()d with the current directory set to its
|
||||
# containing dir.
|
||||
#
|
||||
# Note that not all possible configuration values are present in this
|
||||
# autogenerated file.
|
||||
#
|
||||
# All configuration values have a default; values that are commented out
|
||||
# serve to show the default.
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# If extensions (or modules to document with autodoc) are in another directory,
|
||||
# add these directories to sys.path here. If the directory is relative to the
|
||||
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
||||
#sys.path.insert(0, os.path.abspath('.'))
|
||||
|
||||
# -- General configuration ------------------------------------------------
|
||||
|
||||
# If your documentation needs a minimal Sphinx version, state it here.
|
||||
#needs_sphinx = '1.0'
|
||||
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = [
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.doctest',
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx.ext.viewcode',
|
||||
'sphinxcontrib.napoleon',
|
||||
]
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
|
||||
# The suffix of source filenames.
|
||||
source_suffix = '.rst'
|
||||
|
||||
# The encoding of source files.
|
||||
#source_encoding = 'utf-8-sig'
|
||||
|
||||
# The master toctree document.
|
||||
master_doc = 'index'
|
||||
|
||||
# General information about the project.
|
||||
project = u'spaCy'
|
||||
copyright = u'2015, Matthew Honnibal'
|
||||
|
||||
|
||||
# The version info for the project you're documenting, acts as replacement for
|
||||
# |version| and |release|, also used in various other places throughout the
|
||||
# built documents.
|
||||
#
|
||||
# The short X.Y version.
|
||||
version = '0.85'
|
||||
# The full version, including alpha/beta/rc tags.
|
||||
release = '0.85'
|
||||
|
||||
# The language for content autogenerated by Sphinx. Refer to documentation
|
||||
# for a list of supported languages.
|
||||
#language = None
|
||||
|
||||
# There are two options for replacing |today|: either, you set today to some
|
||||
# non-false value, then it is used:
|
||||
#today = ''
|
||||
# Else, today_fmt is used as the format for a strftime call.
|
||||
#today_fmt = '%B %d, %Y'
|
||||
|
||||
# List of patterns, relative to source directory, that match files and
|
||||
# directories to ignore when looking for source files.
|
||||
exclude_patterns = []
|
||||
|
||||
# The reST default role (used for this markup: `text`) to use for all
|
||||
# documents.
|
||||
#default_role = None
|
||||
|
||||
# If true, '()' will be appended to :func: etc. cross-reference text.
|
||||
#add_function_parentheses = True
|
||||
|
||||
# If true, the current module name will be prepended to all description
|
||||
# unit titles (such as .. function::).
|
||||
#add_module_names = True
|
||||
|
||||
# If true, sectionauthor and moduleauthor directives will be shown in the
|
||||
# output. They are ignored by default.
|
||||
#show_authors = False
|
||||
|
||||
# The name of the Pygments (syntax highlighting) style to use.
|
||||
pygments_style = 'sphinx'
|
||||
|
||||
# A list of ignored prefixes for module index sorting.
|
||||
#modindex_common_prefix = []
|
||||
|
||||
# If true, keep warnings as "system message" paragraphs in the built documents.
|
||||
#keep_warnings = False
|
||||
|
||||
|
||||
# -- Options for HTML output ----------------------------------------------
|
||||
|
||||
# The theme to use for HTML and HTML Help pages. See the documentation for
|
||||
# a list of builtin themes.
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
|
||||
# Theme options are theme-specific and customize the look and feel of a theme
|
||||
# further. For a list of options available for each theme, see the
|
||||
# documentation.
|
||||
html_theme_options = {
|
||||
'google_analytics_id': 'UA-58931649-1'
|
||||
}
|
||||
|
||||
# Add any paths that contain custom themes here, relative to this directory.
|
||||
html_theme_path = ["../_themes"]
|
||||
|
||||
# The name for this set of Sphinx documents. If None, it defaults to
|
||||
# "<project> v<release> documentation".
|
||||
#html_title = None
|
||||
|
||||
# A shorter title for the navigation bar. Default is the same as html_title.
|
||||
#html_short_title = None
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top
|
||||
# of the sidebar.
|
||||
#html_logo = None
|
||||
|
||||
# The name of an image file (within the static path) to use as favicon of the
|
||||
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
|
||||
# pixels large.
|
||||
#html_favicon = None
|
||||
|
||||
# Add any paths that contain custom static files (such as style sheets) here,
|
||||
# relative to this directory. They are copied after the builtin static files,
|
||||
# so a file named "default.css" will overwrite the builtin "default.css".
|
||||
html_static_path = ['_static']
|
||||
|
||||
# Add any extra paths that contain custom files (such as robots.txt or
|
||||
# .htaccess) here, relative to this directory. These files are copied
|
||||
# directly to the root of the documentation.
|
||||
#html_extra_path = []
|
||||
|
||||
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
|
||||
# using the given strftime format.
|
||||
#html_last_updated_fmt = '%b %d, %Y'
|
||||
|
||||
# If true, SmartyPants will be used to convert quotes and dashes to
|
||||
# typographically correct entities.
|
||||
#html_use_smartypants = True
|
||||
|
||||
# Custom sidebar templates, maps document names to template names.
|
||||
#html_sidebars = {}
|
||||
|
||||
# Additional templates that should be rendered to pages, maps page names to
|
||||
# template names.
|
||||
#html_additional_pages = {}
|
||||
|
||||
# If false, no module index is generated.
|
||||
#html_domain_indices = True
|
||||
|
||||
# If false, no index is generated.
|
||||
#html_use_index = True
|
||||
|
||||
# If true, the index is split into individual pages for each letter.
|
||||
#html_split_index = False
|
||||
|
||||
# If true, links to the reST sources are added to the pages.
|
||||
#html_show_sourcelink = True
|
||||
|
||||
# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
|
||||
#html_show_sphinx = True
|
||||
|
||||
# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
|
||||
#html_show_copyright = True
|
||||
|
||||
# If true, an OpenSearch description file will be output, and all pages will
|
||||
# contain a <link> tag referring to it. The value of this option must be the
|
||||
# base URL from which the finished HTML is served.
|
||||
#html_use_opensearch = ''
|
||||
|
||||
# This is the file name suffix for HTML files (e.g. ".xhtml").
|
||||
#html_file_suffix = None
|
||||
|
||||
# Output file base name for HTML help builder.
|
||||
htmlhelp_basename = 'spaCydoc'
|
||||
|
||||
|
||||
# -- Options for LaTeX output ---------------------------------------------
|
||||
|
||||
latex_elements = {
|
||||
# The paper size ('letterpaper' or 'a4paper').
|
||||
#'papersize': 'letterpaper',
|
||||
|
||||
# The font size ('10pt', '11pt' or '12pt').
|
||||
#'pointsize': '10pt',
|
||||
|
||||
# Additional stuff for the LaTeX preamble.
|
||||
#'preamble': '',
|
||||
}
|
||||
|
||||
# Grouping the document tree into LaTeX files. List of tuples
|
||||
# (source start file, target name, title,
|
||||
# author, documentclass [howto, manual, or own class]).
|
||||
latex_documents = [
|
||||
('index', 'spaCy.tex', u'spaCy Documentation',
|
||||
u'Matthew Honnibal', 'manual'),
|
||||
]
|
||||
|
||||
# The name of an image file (relative to this directory) to place at the top of
|
||||
# the title page.
|
||||
#latex_logo = None
|
||||
|
||||
# For "manual" documents, if this is true, then toplevel headings are parts,
|
||||
# not chapters.
|
||||
#latex_use_parts = False
|
||||
|
||||
# If true, show page references after internal links.
|
||||
#latex_show_pagerefs = False
|
||||
|
||||
# If true, show URL addresses after external links.
|
||||
#latex_show_urls = False
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
#latex_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
#latex_domain_indices = True
|
||||
|
||||
|
||||
# -- Options for manual page output ---------------------------------------
|
||||
|
||||
# One entry per manual page. List of tuples
|
||||
# (source start file, name, description, authors, manual section).
|
||||
man_pages = [
|
||||
('index', 'spacy', u'spaCy Documentation',
|
||||
[u'Matthew Honnibal'], 1)
|
||||
]
|
||||
|
||||
# If true, show URL addresses after external links.
|
||||
#man_show_urls = False
|
||||
|
||||
|
||||
# -- Options for Texinfo output -------------------------------------------
|
||||
|
||||
# Grouping the document tree into Texinfo files. List of tuples
|
||||
# (source start file, target name, title, author,
|
||||
# dir menu entry, description, category)
|
||||
texinfo_documents = [
|
||||
('index', 'spaCy', u'spaCy Documentation',
|
||||
u'Matthew Honnibal', 'spaCy', 'One line description of project.',
|
||||
'Miscellaneous'),
|
||||
]
|
||||
|
||||
# Documents to append as an appendix to all manuals.
|
||||
#texinfo_appendices = []
|
||||
|
||||
# If false, no module index is generated.
|
||||
#texinfo_domain_indices = True
|
||||
|
||||
# How to display URL addresses: 'footnote', 'no', or 'inline'.
|
||||
#texinfo_show_urls = 'footnote'
|
||||
|
||||
# If true, do not generate a @detailmenu in the "Top" node's menu.
|
||||
#texinfo_no_detailmenu = False
|
||||
|
||||
|
||||
# Example configuration for intersphinx: refer to the Python standard library.
|
||||
# Named-inventory form: {name: (base URL, inventory location)}. A location of
# None means the inventory is fetched from objects.inv under the base URL.
intersphinx_mapping = {'python': ('https://docs.python.org/3/', None)}
|
|
@ -1,116 +0,0 @@
|
|||
====================
|
||||
Annotation Standards
|
||||
====================
|
||||
|
||||
This document describes the target annotations spaCy is trained to predict.
|
||||
|
||||
This is currently a work in progress. Please ask questions on the issue tracker,
|
||||
so that the answers can be integrated here to improve the documentation.
|
||||
|
||||
https://github.com/honnibal/spaCy/issues
|
||||
|
||||
English
|
||||
=======
|
||||
|
||||
Tokenization
|
||||
------------
|
||||
|
||||
Tokenization standards are based on the OntoNotes 5 corpus.
|
||||
|
||||
The tokenizer differs from most by including tokens for significant whitespace.
|
||||
Any sequence of whitespace characters beyond a single space (' ') is included
|
||||
as a token. For instance:
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English(parse=False)
|
||||
>>> tokens = nlp(u'Some\nspaces  and\ttab characters')
|
||||
>>> print [t.orth_ for t in tokens]
|
||||
[u'Some', u'\n', u'spaces', u' ', u'and', u'\t', u'tab', u'characters']
|
||||
|
||||
The whitespace tokens are useful for much the same reason punctuation is --- it's
|
||||
often an important delimiter in the text. By preserving it in the token output,
|
||||
we are able to maintain a simple alignment between the tokens and the original
|
||||
string, and we ensure that the token stream does not lose information.
|
||||
|
||||
Sentence boundary detection
|
||||
---------------------------
|
||||
|
||||
Sentence boundaries are calculated from the syntactic parse tree, so features
|
||||
such as punctuation and capitalisation play an important but non-decisive role
|
||||
in determining the sentence boundaries. Usually this means that the sentence
|
||||
boundaries will at least coincide with clause boundaries, even given poorly
|
||||
punctuated text.
|
||||
|
||||
Part-of-speech Tagging
|
||||
----------------------
|
||||
|
||||
The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank
|
||||
tag set. We also map the tags to the simpler Google Universal POS Tag set.
|
||||
|
||||
Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124
|
||||
|
||||
Lemmatization
|
||||
-------------
|
||||
|
||||
A "lemma" is the uninflected form of a word. In English, this means:
|
||||
|
||||
* Adjectives: The form like "happy", not "happier" or "happiest"
|
||||
* Adverbs: The form like "badly", not "worse" or "worst"
|
||||
* Nouns: The form like "dog", not "dogs"; like "child", not "children"
|
||||
* Verbs: The form like "write", not "writes", "writing", "wrote" or "written"
|
||||
|
||||
The lemmatization data is taken from WordNet. However, we also add a special
|
||||
case for pronouns: all pronouns are lemmatized to the special token -PRON-.
|
||||
|
||||
Syntactic Dependency Parsing
|
||||
----------------------------
|
||||
|
||||
The parser is trained on data produced by the ClearNLP converter. Details of
|
||||
the annotation scheme can be found here:
|
||||
|
||||
http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
|
||||
|
||||
Named Entity Recognition
|
||||
------------------------
|
||||
|
||||
+--------------+-----------------------------------------------------+
|
||||
| PERSON | People, including fictional |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| NORP | Nationalities or religious or political groups |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| FACILITY | Buildings, airports, highways, bridges, etc. |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| ORGANIZATION | Companies, agencies, institutions, etc. |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| GPE | Countries, cities, states |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| LOCATION | Non-GPE locations, mountain ranges, bodies of water |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| PRODUCT | Vehicles, weapons, foods, etc. (Not services) |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| EVENT | Named hurricanes, battles, wars, sports events, etc.|
|
||||
+--------------+-----------------------------------------------------+
|
||||
| WORK OF ART | Titles of books, songs, etc. |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| LAW | Named documents made into laws |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| LANGUAGE | Any named language |
|
||||
+--------------+-----------------------------------------------------+
|
||||
|
||||
The following values are also annotated in a style similar to names:
|
||||
|
||||
+--------------+---------------------------------------------+
|
||||
| DATE | Absolute or relative dates or periods |
|
||||
+--------------+---------------------------------------------+
|
||||
| TIME | Times smaller than a day |
|
||||
+--------------+---------------------------------------------+
|
||||
| PERCENT | Percentage (including “%”) |
|
||||
+--------------+---------------------------------------------+
|
||||
| MONEY | Monetary values, including unit |
|
||||
+--------------+---------------------------------------------+
|
||||
| QUANTITY | Measurements, as of weight or distance |
|
||||
+--------------+---------------------------------------------+
|
||||
| ORDINAL | "first", "second" |
|
||||
+--------------+---------------------------------------------+
|
||||
| CARDINAL | Numerals that do not fall under another type|
|
||||
+--------------+---------------------------------------------+
|
|
@ -1 +0,0 @@
|
|||
|
|
@ -1,77 +0,0 @@
|
|||
Lexeme Features
|
||||
===============
|
||||
|
||||
A lexeme is an entry in the lexicon --- the vocabulary --- for a word, punctuation
|
||||
symbol, whitespace unit, etc. Lexemes come with lots of pre-computed information,
|
||||
which helps you write good feature functions. Features are integer-valued where
|
||||
possible --- instead of strings, spaCy refers to strings by consecutive ID numbers,
|
||||
which you can use to look up the string values if necessary.
|
||||
|
||||
String features
|
||||
---------------
|
||||
|
||||
+---------+-------------------------------------------------------------------+
|
||||
| SIC | The word as it appeared in the sentence, unaltered. |
|
||||
+---------+-------------------------------------------------------------------+
|
||||
| NORM | For frequent words, case normalization is applied. |
|
||||
| | Otherwise, back-off to SHAPE. |
|
||||
+---------+-------------------------------------------------------------------+
|
||||
| SHAPE | Remap the characters of the word as follows: |
|
||||
| | |
|
||||
| | a-z --> x, A-Z --> X, 0-9 --> d, ,.;:"'?!$- --> self, other --> \*|
|
||||
| | |
|
||||
| | Trim sequences of length 3+ to 3, e.g |
|
||||
| | |
|
||||
| | apples --> xxx, Apples --> Xxxx, app9LES@ --> xxxdXXX* |
|
||||
+---------+-------------------------------------------------------------------+
|
||||
| ASCIIED | Use unidecode.unidecode(sic) to approximate the word using the |
|
||||
| | ascii characters. |
|
||||
+---------+-------------------------------------------------------------------+
|
||||
| PREFIX | sic_unicode_string[:1] |
|
||||
+---------+-------------------------------------------------------------------+
|
||||
| SUFFIX | sic_unicode_string[-3:] |
|
||||
+---------+-------------------------------------------------------------------+
|
||||
|
||||
|
||||
Integer features
|
||||
----------------
|
||||
|
||||
+--------------+--------------------------------------------------------------+
|
||||
| LENGTH | Length of the string, in unicode |
|
||||
+--------------+--------------------------------------------------------------+
|
||||
| CLUSTER | Brown cluster |
|
||||
+--------------+--------------------------------------------------------------+
|
||||
| POS_TYPE | K-means cluster of word's tag affinities |
|
||||
+--------------+--------------------------------------------------------------+
|
||||
| SENSE_TYPE | K-means cluster of word's sense affinities |
|
||||
+--------------+--------------------------------------------------------------+
|
||||
|
||||
Boolean features
|
||||
----------------
|
||||
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| IS_ALPHA | The result of sic.isalpha() |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| IS_ASCII | Check whether all the word's characters are ascii characters |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| IS_DIGIT | The result of sic.isdigit() |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| IS_LOWER | The result of sic.islower() |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| IS_PUNCT | Check whether all characters are in the class TODO |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| IS_SPACE | The result of sic.isspace() |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| IS_TITLE | The result of sic.istitle() |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| IS_UPPER | The result of sic.isupper() |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| LIKE_URL | Check whether the string looks like it could be a URL. Aims |
|
||||
| | for low false negative rate. |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| LIKE_NUMBER | Check whether the string looks like it could be a numeric |
|
||||
| | entity, e.g. 10,000 10th .10 . Skews for low false negative |
|
||||
| | rate. |
|
||||
+-------------+--------------------------------------------------------------+
|
||||
| IN_LIST | Facility for loading arbitrary run-time word lists? |
|
||||
+-------------+--------------------------------------------------------------+
|
|
@ -1,337 +0,0 @@
|
|||
{
|
||||
"id": "wsj_0001",
|
||||
"paragraphs": [
|
||||
{
|
||||
"raw": "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.",
|
||||
|
||||
"segmented": "Pierre Vinken<SEP>, 61 years old<SEP>, will join the board as a nonexecutive director Nov. 29<SEP>.<SENT>Mr. Vinken is chairman of Elsevier N.V.<SEP>, the Dutch publishing group<SEP>.",
|
||||
|
||||
"sents": [
|
||||
0,
|
||||
85
|
||||
],
|
||||
|
||||
"tokens": [
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 0,
|
||||
"head": 7,
|
||||
"tag": "NNP",
|
||||
"orth": "Pierre"
|
||||
},
|
||||
{
|
||||
"dep": "SUB",
|
||||
"start": 7,
|
||||
"head": 29,
|
||||
"tag": "NNP",
|
||||
"orth": "Vinken"
|
||||
},
|
||||
{
|
||||
"dep": "P",
|
||||
"start": 13,
|
||||
"head": 7,
|
||||
"tag": ",",
|
||||
"orth": ","
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 15,
|
||||
"head": 18,
|
||||
"tag": "CD",
|
||||
"orth": "61"
|
||||
},
|
||||
{
|
||||
"dep": "AMOD",
|
||||
"start": 18,
|
||||
"head": 24,
|
||||
"tag": "NNS",
|
||||
"orth": "years"
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 24,
|
||||
"head": 7,
|
||||
"tag": "JJ",
|
||||
"orth": "old"
|
||||
},
|
||||
{
|
||||
"dep": "P",
|
||||
"start": 27,
|
||||
"head": 7,
|
||||
"tag": ",",
|
||||
"orth": ","
|
||||
},
|
||||
{
|
||||
"dep": "ROOT",
|
||||
"start": 29,
|
||||
"head": -1,
|
||||
"tag": "MD",
|
||||
"orth": "will"
|
||||
},
|
||||
{
|
||||
"dep": "VC",
|
||||
"start": 34,
|
||||
"head": 29,
|
||||
"tag": "VB",
|
||||
"orth": "join"
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 39,
|
||||
"head": 43,
|
||||
"tag": "DT",
|
||||
"orth": "the"
|
||||
},
|
||||
{
|
||||
"dep": "OBJ",
|
||||
"start": 43,
|
||||
"head": 34,
|
||||
"tag": "NN",
|
||||
"orth": "board"
|
||||
},
|
||||
{
|
||||
"dep": "VMOD",
|
||||
"start": 49,
|
||||
"head": 34,
|
||||
"tag": "IN",
|
||||
"orth": "as"
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 52,
|
||||
"head": 67,
|
||||
"tag": "DT",
|
||||
"orth": "a"
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 54,
|
||||
"head": 67,
|
||||
"tag": "JJ",
|
||||
"orth": "nonexecutive"
|
||||
},
|
||||
{
|
||||
"dep": "PMOD",
|
||||
"start": 67,
|
||||
"head": 49,
|
||||
"tag": "NN",
|
||||
"orth": "director"
|
||||
},
|
||||
{
|
||||
"dep": "VMOD",
|
||||
"start": 76,
|
||||
"head": 34,
|
||||
"tag": "NNP",
|
||||
"orth": "Nov."
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 81,
|
||||
"head": 76,
|
||||
"tag": "CD",
|
||||
"orth": "29"
|
||||
},
|
||||
{
|
||||
"dep": "P",
|
||||
"start": 83,
|
||||
"head": 29,
|
||||
"tag": ".",
|
||||
"orth": "."
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 85,
|
||||
"head": 89,
|
||||
"tag": "NNP",
|
||||
"orth": "Mr."
|
||||
},
|
||||
{
|
||||
"dep": "SUB",
|
||||
"start": 89,
|
||||
"head": 96,
|
||||
"tag": "NNP",
|
||||
"orth": "Vinken"
|
||||
},
|
||||
{
|
||||
"dep": "ROOT",
|
||||
"start": 96,
|
||||
"head": -1,
|
||||
"tag": "VBZ",
|
||||
"orth": "is"
|
||||
},
|
||||
{
|
||||
"dep": "PRD",
|
||||
"start": 99,
|
||||
"head": 96,
|
||||
"tag": "NN",
|
||||
"orth": "chairman"
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 108,
|
||||
"head": 99,
|
||||
"tag": "IN",
|
||||
"orth": "of"
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 111,
|
||||
"head": 120,
|
||||
"tag": "NNP",
|
||||
"orth": "Elsevier"
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 120,
|
||||
"head": 147,
|
||||
"tag": "NNP",
|
||||
"orth": "N.V."
|
||||
},
|
||||
{
|
||||
"dep": "P",
|
||||
"start": 124,
|
||||
"head": 147,
|
||||
"tag": ",",
|
||||
"orth": ","
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 126,
|
||||
"head": 147,
|
||||
"tag": "DT",
|
||||
"orth": "the"
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 130,
|
||||
"head": 147,
|
||||
"tag": "NNP",
|
||||
"orth": "Dutch"
|
||||
},
|
||||
{
|
||||
"dep": "NMOD",
|
||||
"start": 136,
|
||||
"head": 147,
|
||||
"tag": "VBG",
|
||||
"orth": "publishing"
|
||||
},
|
||||
{
|
||||
"dep": "PMOD",
|
||||
"start": 147,
|
||||
"head": 108,
|
||||
"tag": "NN",
|
||||
"orth": "group"
|
||||
},
|
||||
{
|
||||
"dep": "P",
|
||||
"start": 152,
|
||||
"head": 96,
|
||||
"tag": ".",
|
||||
"orth": "."
|
||||
}
|
||||
],
|
||||
"brackets": [
|
||||
{
|
||||
"start": 0,
|
||||
"end": 7,
|
||||
"label": "NP"
|
||||
},
|
||||
{
|
||||
"start": 15,
|
||||
"end": 18,
|
||||
"label": "NP"
|
||||
},
|
||||
{
|
||||
"start": 15,
|
||||
"end": 24,
|
||||
"label": "ADJP"
|
||||
},
|
||||
{
|
||||
"start": 0,
|
||||
"end": 27,
|
||||
"label": "NP-SBJ"
|
||||
},
|
||||
{
|
||||
"start": 39,
|
||||
"end": 43,
|
||||
"label": "NP"
|
||||
},
|
||||
{
|
||||
"start": 52,
|
||||
"end": 67,
|
||||
"label": "NP"
|
||||
},
|
||||
{
|
||||
"start": 49,
|
||||
"end": 67,
|
||||
"label": "PP-CLR"
|
||||
},
|
||||
{
|
||||
"start": 76,
|
||||
"end": 81,
|
||||
"label": "NP-TMP"
|
||||
},
|
||||
{
|
||||
"start": 34,
|
||||
"end": 81,
|
||||
"label": "VP"
|
||||
},
|
||||
{
|
||||
"start": 29,
|
||||
"end": 81,
|
||||
"label": "VP"
|
||||
},
|
||||
{
|
||||
"start": 0,
|
||||
"end": 83,
|
||||
"label": "S"
|
||||
},
|
||||
{
|
||||
"start": 85,
|
||||
"end": 89,
|
||||
"label": "NP-SBJ"
|
||||
},
|
||||
{
|
||||
"start": 99,
|
||||
"end": 99,
|
||||
"label": "NP"
|
||||
},
|
||||
{
|
||||
"start": 111,
|
||||
"end": 120,
|
||||
"label": "NP"
|
||||
},
|
||||
{
|
||||
"start": 126,
|
||||
"end": 147,
|
||||
"label": "NP"
|
||||
},
|
||||
{
|
||||
"start": 111,
|
||||
"end": 147,
|
||||
"label": "NP"
|
||||
},
|
||||
{
|
||||
"start": 108,
|
||||
"end": 147,
|
||||
"label": "PP"
|
||||
},
|
||||
{
|
||||
"start": 99,
|
||||
"end": 147,
|
||||
"label": "NP-PRD"
|
||||
},
|
||||
{
|
||||
"start": 96,
|
||||
"end": 147,
|
||||
"label": "VP"
|
||||
},
|
||||
{
|
||||
"start": 85,
|
||||
"end": 152,
|
||||
"label": "S"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
|
@ -1,262 +0,0 @@
|
|||
How spaCy Works
|
||||
===============
|
||||
|
||||
The following are some hasty preliminary notes on how spaCy works. The short
|
||||
story is, there are no new killer algorithms. The way that the tokenizer works
|
||||
is novel and a bit neat, and the parser has a new feature set, but otherwise
|
||||
the key algorithms are well known in the recent literature.
|
||||
|
||||
Some might also wonder how I get Python code to run so fast. I don't --- spaCy
|
||||
is written in `Cython`_, an optionally statically-typed language that compiles
|
||||
to C or C++, which is then loaded as a C extension module.
|
||||
This makes it `easy to achieve the performance of native C code`_, but allows the
|
||||
use of Python language features, via the Python C API. The Python unicode
|
||||
library was particularly useful to me. I think it would have been much more
|
||||
difficult to write spaCy in another language.
|
||||
|
||||
.. _Cython: http://cython.org/
|
||||
|
||||
.. _easy to achieve the performance of native C code: https://honnibal.wordpress.com/2014/10/21/writing-c-in-cython/
|
||||
|
||||
Tokenizer and Lexicon
|
||||
---------------------
|
||||
|
||||
Tokenization is the task of splitting a string into meaningful pieces, called
|
||||
tokens, which you can then compute with. In practice, the task is usually to
|
||||
match the tokenization performed in some treebank, or other corpus. If we want
|
||||
to apply a tagger, entity recogniser, parser etc, then we want our run-time
|
||||
text to match the training conventions. If we want to use a model that's been
|
||||
trained to expect "isn't" to be split into two tokens, ["is", "n't"], then that's
|
||||
how we need to prepare our data.
|
||||
|
||||
In order to train spaCy's models with the best data available, I therefore
|
||||
tokenize English according to the Penn Treebank scheme. It's not perfect, but
|
||||
it's what everybody is using, and it's good enough.
|
||||
|
||||
What we don't do
|
||||
################
|
||||
|
||||
The Penn Treebank was distributed with a script called tokenizer.sed, which
|
||||
tokenizes ASCII newswire text roughly according to the Penn Treebank standard.
|
||||
Almost all tokenizers are based on these regular expressions, with various
|
||||
updates to account for unicode characters, and the fact that it's no longer
|
||||
1986 --- today's text has URLs, emails, emoji, etc.
|
||||
|
||||
Usually, the resulting regular expressions are applied in multiple passes, which
|
||||
is quite inefficient. Often no care is taken to preserve indices into the original
|
||||
string. If you lose these indices, it'll be difficult to calculate mark-up based
|
||||
on your annotations.
|
||||
|
||||
Tokenizer Algorithm
|
||||
###################
|
||||
|
||||
spaCy's tokenizer assumes that no tokens will cross whitespace --- there will
|
||||
be no multi-word tokens. If we want these, we can post-process the
|
||||
token-stream later, merging as necessary. This assumption allows us to deal
|
||||
only with small chunks of text. We can cache the processing of these, and
|
||||
simplify our expressions somewhat.
|
||||
|
||||
Here is what the outer-loop would look like in Python. (You can see the
|
||||
production implementation, in Cython, here.)
|
||||
|
||||
.. code:: python
|
||||
|
||||
cache = {}
|
||||
def tokenize(text):
|
||||
tokens = []
|
||||
for substring in text.split(' '):
|
||||
if substring in cache:
|
||||
tokens.extend(cache[substring])
|
||||
else:
|
||||
subtokens = _tokenize_substring(substring)
|
||||
tokens.extend(subtokens)
|
||||
cache[substring] = subtokens
|
||||
return tokens
|
||||
|
||||
The actual work is performed in _tokenize_substring. For this, I divide the
|
||||
tokenization rules into three pieces:
|
||||
|
||||
1. A prefixes expression, which matches from the start of the string;
|
||||
2. A suffixes expression, which matches from the end of the string;
|
||||
3. A special-cases table, which matches the whole string.
|
||||
|
||||
The algorithm then proceeds roughly like this (consider this like pseudo-code;
|
||||
this was written quickly and has not been executed):
|
||||
|
||||
.. code:: python
|
||||
|
||||
# Tokens which can be attached at the beginning or end of another
|
||||
prefix_re = _make_re([",", '"', '(', ...])
|
||||
suffix_re = _make_re([",", "'", ":", "'s", ...])
|
||||
|
||||
# Contractions etc are simply enumerated, since they're a finite set. We
|
||||
# can also specify anything we like here, which is nice --- different data
|
||||
# has different quirks, so we want to be able to add ad hoc exceptions.
|
||||
special_cases = {
|
||||
"can't": ("ca", "n't"),
|
||||
"won't": ("wo", "n't"),
|
||||
"he'd've": ("he", "'d", "'ve"),
|
||||
...
|
||||
":)": (":)",) # We can add any arbitrary thing to this list.
|
||||
}
|
||||
|
||||
def _tokenize_substring(substring):
|
||||
prefixes = []
|
||||
suffixes = []
|
||||
while substring not in special_cases:
|
||||
prefix, substring = _apply_re(substring, prefix_re)
|
||||
if prefix:
|
||||
prefixes.append(prefix)
|
||||
else:
|
||||
suffix, substring = _apply_re(substring, suffix_re)
|
||||
if suffix:
|
||||
suffixes.append(suffix)
|
||||
else:
|
||||
break
|
||||
|
||||
|
||||
This procedure splits off tokens from the start and end of the string, at each
|
||||
point checking whether the remaining string is in our special-cases table. If
|
||||
it is, we stop splitting, and return the tokenization at that point.
|
||||
|
||||
The advantage of this design is that the prefixes, suffixes and special-cases
|
||||
can be declared separately, in easy-to-understand files. If a new entry is
|
||||
added to the special-cases, you can be sure that it won't have some unforeseen
|
||||
consequence to a complicated regular-expression grammar.
|
||||
|
||||
Coupling the Tokenizer and Lexicon
|
||||
##################################
|
||||
|
||||
As mentioned above, the tokenizer is designed to support easy caching. If all
|
||||
we were caching were the matched substrings, this would not be so advantageous.
|
||||
Instead, what we do is create a struct which houses all of our lexical
|
||||
features, and cache *that*. The tokens are then simply pointers to these rich
|
||||
lexical types.
|
||||
|
||||
In a sample of text, vocabulary size grows exponentially slower than word
|
||||
count. So any computations we can perform over the vocabulary and apply to the
|
||||
word count are efficient.
|
||||
|
||||
|
||||
Part-of-speech Tagger
|
||||
---------------------
|
||||
|
||||
.. _how to write a good part of speech tagger: https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
|
||||
|
||||
In 2013, I wrote a blog post describing `how to write a good part of speech
|
||||
tagger`_.
|
||||
My recommendation then was to use greedy decoding with the averaged perceptron.
|
||||
I think this is still the best approach, so it's what I implemented in spaCy.
|
||||
|
||||
The tutorial also recommends the use of Brown cluster features, and case
|
||||
normalization features, as these make the model more robust and domain
|
||||
independent. spaCy's tagger makes heavy use of these features.
|
||||
|
||||
Dependency Parser
|
||||
-----------------
|
||||
|
||||
.. _2014 blog post: https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/
|
||||
|
||||
The parser uses the algorithm described in my `2014 blog post`_.
|
||||
This algorithm, shift-reduce dependency parsing, is becoming widely adopted due
|
||||
to its compelling speed/accuracy trade-off.
|
||||
|
||||
Some quick details about spaCy's take on this, for those who happen to know
|
||||
these models well. I'll write up a better description shortly.
|
||||
|
||||
1. I use greedy decoding, not beam search;
|
||||
2. I use the arc-eager transition system;
|
||||
3. I use the Goldberg and Nivre (2012) dynamic oracle.
|
||||
4. I use the non-monotonic update from my CoNLL 2013 paper (Honnibal, Goldberg
|
||||
and Johnson 2013).
|
||||
|
||||
So far, this is exactly the configuration from the CoNLL 2013 paper, which
|
||||
scored 91.0. So how have I gotten it to 92.4? The following tweaks:
|
||||
|
||||
1. I use Brown cluster features --- these help a lot;
|
||||
2. I redesigned the feature set. I've long known that the Zhang and Nivre
|
||||
(2011) feature set was suboptimal, but a few features don't make a very
|
||||
compelling publication. Still, they're important.
|
||||
3. When I do the dynamic oracle training, I also make
|
||||
the update cost-sensitive: if the oracle determines that the move the parser
|
||||
took has a cost of N, then the weights for the gold class are incremented by
|
||||
+N, and the weights for the predicted class are incremented by -N. This
|
||||
only made a small (0.1-0.2%) difference.
|
||||
|
||||
Implementation
|
||||
##############
|
||||
|
||||
I don't do anything algorithmically novel to improve the efficiency of the
|
||||
parser. However, I was very careful in the implementation.
|
||||
|
||||
A greedy shift-reduce parser with a linear model boils down to the following
|
||||
loop:
|
||||
|
||||
.. code:: python
|
||||
|
||||
def parse(words, model, feature_funcs, n_classes):
|
||||
state = init_state(words)
|
||||
for _ in range(len(words) * 2):
|
||||
features = [templ(state) for templ in feature_funcs]
|
||||
scores = [0 for _ in range(n_classes)]
|
||||
for feat in features:
|
||||
weights = model[feat]
|
||||
for i, weight in enumerate(weights):
|
||||
scores[i] += weight
|
||||
class_, score = max(enumerate(scores), key=lambda item: item[1])
|
||||
transition(state, class_)
|
||||
|
||||
The parser makes 2N transitions for a sentence of length N. In order to select
|
||||
the transition, it extracts a vector of K features from the state. Each feature
|
||||
is used as a key into a hash table managed by the model. The features map to
|
||||
a vector of weights, of length C. We then add the feature weights to the
|
||||
scores vector we are building for that instance.
|
||||
|
||||
The inner-most loop here is not so bad: we only have a few dozen classes, so
|
||||
it's just a short vector addition. Both of the vectors are in the cache, so this
|
||||
is a snack to a modern CPU.
|
||||
|
||||
The bottle-neck in this algorithm is the 2NK look-ups into the hash-table that
|
||||
we must make, as these almost always have to hit main memory. The feature-set
|
||||
is enormously large, because all of our features are one-hot boolean
|
||||
indicators. Some of the features will be common, so they'll lurk around in the
|
||||
CPU's cache hierarchy. But a lot of them won't be, and accessing main memory
|
||||
takes a lot of cycles.
|
||||
|
||||
.. _Jeff Preshing's excellent post: http://preshing.com/20130107/this-hash-table-is-faster-than-a-judy-array/
|
||||
|
||||
I used to use the Google dense_hash_map implementation. This seemed a solid
|
||||
choice: it came from a big brand, it was in C++, and it seemed very
|
||||
complicated. Later, I read `Jeff Preshing's excellent post`_ on open-addressing
|
||||
with linear probing.
|
||||
This really spoke to me. I had assumed that a fast hash table implementation
|
||||
would necessarily be very complicated, but no --- this is another situation
|
||||
where the simple strategy wins.
|
||||
|
||||
I've packaged my Cython implementation separately from spaCy, in the package
|
||||
`preshed`_ --- for "pre-hashed", but also as a nod to Preshing. I've also taken
|
||||
great care over the feature extraction and perceptron code, which I'm distributing
|
||||
in a package named `thinc`_ (since it's for learning very sparse models with
|
||||
Cython).
|
||||
|
||||
.. _preshed: https://github.com/syllog1sm/preshed
|
||||
|
||||
.. _thinc: https://github.com/honnibal/thinc
|
||||
|
||||
By the way: from comparing notes with a few people, it seems common to
|
||||
implement linear models in a way that's suboptimal for multi-class
|
||||
classification. The mistake is to store in the hash-table one weight per
|
||||
(feature, class) pair, rather than mapping the feature to a vector of weights,
|
||||
for all of the classes. This is bad because it means you need to hit the table
|
||||
C times, one per class, as you always need to evaluate a feature against all of
|
||||
the classes. In the case of the parser, this means the hash table is accessed
|
||||
2NKC times, instead of the 2NK times if you have a weights vector. You should
|
||||
also be careful to store the weights contiguously in memory --- you don't want
|
||||
a linked list here. I use a block-sparse format, because my problems tend to
|
||||
have a few dozen classes.
|
||||
|
||||
I guess if I had to summarize my experience, I'd say that the efficiency of
|
||||
these models is really all about the data structures. We want to stay small,
|
||||
and stay contiguous. Minimize redundancy and minimize pointer chasing.
|
||||
That's why Cython is so well suited to this: we get to lay out our data
|
||||
structures, and manage the memory ourselves, with full C-level control.
|
|
@ -1,339 +0,0 @@
|
|||
.. spaCy documentation master file, created by
|
||||
sphinx-quickstart on Tue Aug 19 16:27:38 2014.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
==============================
|
||||
spaCy: Industrial-strength NLP
|
||||
==============================
|
||||
|
||||
|
||||
.. _Issue Tracker: https://github.com/honnibal/spaCy/issues
|
||||
|
||||
**2015-07-08**: `Version 0.88 released`_
|
||||
|
||||
.. _Version 0.88 released: updates.html
|
||||
|
||||
`spaCy`_ is a new library for text processing in Python and Cython.
|
||||
I wrote it because I think small companies are terrible at
|
||||
natural language processing (NLP). Or rather:
|
||||
small companies are using terrible NLP technology.
|
||||
|
||||
.. _spaCy: https://github.com/honnibal/spaCy/
|
||||
|
||||
To do great NLP, you have to know a little about linguistics, a lot
|
||||
about machine learning, and almost everything about the latest research.
|
||||
The people who fit this description seldom join small companies.
|
||||
Most are broke --- they've just finished grad school.
|
||||
If they don't want to stay in academia, they join Google, IBM, etc.
|
||||
|
||||
The net result is that outside of the tech giants, commercial NLP has changed
|
||||
little in the last ten years. In academia, it's changed entirely. Amazing
|
||||
improvements in quality. Orders of magnitude faster. But the
|
||||
academic code is always GPL, undocumented, unusable, or all three. You could
|
||||
implement the ideas yourself, but the papers are hard to read, and training
|
||||
data is exorbitantly expensive. So what are you left with? A common answer is
|
||||
NLTK, which was written primarily as an educational resource. Nothing past the
|
||||
tokenizer is suitable for production use.
|
||||
|
||||
I used to think that the NLP community just needed to do more to communicate
|
||||
its findings to software engineers. So I wrote two blog posts, explaining
|
||||
`how to write a part-of-speech tagger`_ and `parser`_. Both were well received,
|
||||
and there's been a bit of interest in `my research software`_ --- even though
|
||||
it's entirely undocumented, and mostly unusable to anyone but me.
|
||||
|
||||
.. _`my research software`: https://github.com/syllog1sm/redshift/tree/develop
|
||||
|
||||
.. _`how to write a part-of-speech tagger`: https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
|
||||
|
||||
.. _`parser`: https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/
|
||||
|
||||
So six months ago I quit my post-doc, and I've been working day and night on
|
||||
spaCy since. I'm now pleased to announce an alpha release.
|
||||
|
||||
If you're a small company doing NLP, I think spaCy will seem like a minor miracle.
|
||||
It's by far the fastest NLP software ever released.
|
||||
The full processing pipeline completes in 20ms per document, including accurate
|
||||
tagging and parsing. All strings are mapped to integer IDs, tokens are linked
|
||||
to embedded word representations, and a range of useful features are pre-calculated
|
||||
and cached.
|
||||
|
||||
If none of that made any sense to you, here's the gist of it. Computers don't
|
||||
understand text. This is unfortunate, because that's what the web almost entirely
|
||||
consists of. We want to recommend people text based on other text they liked.
|
||||
We want to shorten text to display it on a mobile screen. We want to aggregate
|
||||
it, link it, filter it, categorise it, generate it and correct it.
|
||||
|
||||
spaCy provides a library of utility functions that help programmers build such
|
||||
products. It's commercial open source software: you can either use it under
|
||||
the AGPL, or you can `buy a commercial license`_ for a one-time fee.
|
||||
|
||||
.. _buy a commercial license: license.html
|
||||
|
||||
Example functionality
|
||||
---------------------
|
||||
|
||||
Let's say you're developing a proofreading tool, or possibly an IDE for
|
||||
writers. You're convinced by Stephen King's advice that `adverbs are not your
|
||||
friend <http://www.brainpickings.org/2013/03/13/stephen-king-on-adverbs/>`_, so
|
||||
you want to **highlight all adverbs**. We'll use one of the examples he finds
|
||||
particularly egregious:
|
||||
|
||||
>>> import spacy.en
|
||||
>>> from spacy.parts_of_speech import ADV
|
||||
>>> # Load the pipeline, and call it with some text.
|
||||
>>> nlp = spacy.en.English()
|
||||
>>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False)
|
||||
>>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)
|
||||
‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
|
||||
|
||||
|
||||
Easy enough --- but the problem is that we've also highlighted "back".
|
||||
While "back" is undoubtedly an adverb, we probably don't want to highlight it.
|
||||
If what we're trying to do is flag dubious stylistic choices, we'll need to
|
||||
refine our logic. It turns out only a certain type of adverb is of interest to
|
||||
us.
|
||||
|
||||
There are lots of ways we might do this, depending on just what words
|
||||
we want to flag. The simplest way to exclude adverbs like "back" and "not"
|
||||
is by word frequency: these words are much more common than the prototypical
|
||||
manner adverbs that the style guides are worried about.
|
||||
|
||||
The :py:attr:`Lexeme.prob` and :py:attr:`Token.prob` attributes give a
|
||||
log probability estimate of the word:
|
||||
|
||||
>>> nlp.vocab[u'back'].prob
|
||||
-7.403977394104004
|
||||
>>> nlp.vocab[u'not'].prob
|
||||
-5.407193660736084
|
||||
>>> nlp.vocab[u'quietly'].prob
|
||||
-11.07155704498291
|
||||
|
||||
(The probability estimate is based on counts from a 3 billion word corpus,
|
||||
smoothed using the `Simple Good-Turing`_ method.)
|
||||
|
||||
.. _`Simple Good-Turing`: http://www.d.umn.edu/~tpederse/Courses/CS8761-FALL02/Code/sgt-gale.pdf
|
||||
|
||||
So we can easily exclude the N most frequent words in English from our adverb
|
||||
marker. Let's try N=1000 for now:
|
||||
|
||||
>>> import spacy.en
|
||||
>>> from spacy.parts_of_speech import ADV
|
||||
>>> nlp = spacy.en.English()
|
||||
>>> # Find log probability of Nth most frequent word
|
||||
>>> probs = [lex.prob for lex in nlp.vocab]
|
||||
>>> probs.sort()
|
||||
>>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000]
|
||||
>>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’")
|
||||
>>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens)
|
||||
‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’
|
||||
|
||||
There are lots of other ways we could refine the logic, depending on just what
|
||||
words we want to flag. Let's say we wanted to only flag adverbs that modified words
|
||||
similar to "pleaded". This is easy to do, as spaCy loads a vector-space
|
||||
representation for every word (by default, the vectors produced by
|
||||
`Levy and Goldberg (2014)`_). Naturally, the vector is provided as a numpy
|
||||
array:
|
||||
|
||||
>>> pleaded = tokens[7]
|
||||
>>> pleaded.repvec.shape
|
||||
(300,)
|
||||
>>> pleaded.repvec[:5]
|
||||
array([ 0.04229792, 0.07459262, 0.00820188, -0.02181299, 0.07519238], dtype=float32)
|
||||
|
||||
.. _Levy and Goldberg (2014): https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/
|
||||
|
||||
We want to sort the words in our vocabulary by their similarity to "pleaded".
|
||||
There are lots of ways to measure the similarity of two vectors. We'll use the
|
||||
cosine metric:
|
||||
|
||||
>>> from numpy import dot
|
||||
>>> from numpy.linalg import norm
|
||||
|
||||
>>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
>>> words = [w for w in nlp.vocab if w.has_repvec]
|
||||
>>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
|
||||
>>> words.reverse()
|
||||
>>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
|
||||
1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading
|
||||
>>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
|
||||
50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses
|
||||
>>> print('100-110', ', '.join(w.orth_ for w in words[100:110]))
|
||||
100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes
|
||||
>>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
|
||||
1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged
|
||||
>>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010]))
|
||||
50000-50010, fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists
|
||||
|
||||
As you can see, the similarity model that these vectors give us is excellent
|
||||
--- we're still getting meaningful results at 1000 words, off a single
|
||||
prototype! The only problem is that the list really contains two clusters of
|
||||
words: one associated with the legal meaning of "pleaded", and one for the more
|
||||
general sense. Sorting out these clusters is an area of active research.
|
||||
|
||||
|
||||
A simple work-around is to average the vectors of several words, and use that
|
||||
as our target:
|
||||
|
||||
>>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested']
|
||||
>>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs)
|
||||
>>> words.sort(key=lambda w: cosine(w.repvec, say_vector))
|
||||
>>> words.reverse()
|
||||
>>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
|
||||
1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired
|
||||
>>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
|
||||
50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed
|
||||
>>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
|
||||
1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate
|
||||
|
||||
These definitely look like words that King might scold a writer for attaching
|
||||
adverbs to. Recall that our original adverb highlighting function looked like
|
||||
this:
|
||||
|
||||
>>> import spacy.en
|
||||
>>> from spacy.parts_of_speech import ADV
|
||||
>>> # Load the pipeline, and call it with some text.
|
||||
>>> nlp = spacy.en.English()
|
||||
>>> tokens = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
|
||||
tag=True, parse=False)
|
||||
>>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens))
|
||||
‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
|
||||
|
||||
|
||||
|
||||
We wanted to refine the logic so that only adverbs modifying evocative verbs
|
||||
of communication, like "pleaded", were highlighted. We've now built a vector that
|
||||
represents that type of word, so now we can highlight adverbs based on
|
||||
subtle logic, honing in on adverbs that seem the most stylistically
|
||||
problematic, given our starting assumptions:
|
||||
|
||||
>>> import numpy
|
||||
>>> from numpy import dot
|
||||
>>> from numpy.linalg import norm
|
||||
>>> import spacy.en
|
||||
>>> from spacy.parts_of_speech import ADV, VERB
|
||||
>>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
|
||||
>>> def is_bad_adverb(token, target_verb, tol):
|
||||
... if token.pos != ADV:
|
||||
... return False
|
||||
... elif token.head.pos != VERB:
|
||||
... return False
|
||||
... elif cosine(token.head.repvec, target_verb) < tol:
|
||||
... return False
|
||||
... else:
|
||||
... return True
|
||||
|
||||
|
||||
This example was somewhat contrived --- and, truth be told, I've never really
|
||||
bought the idea that adverbs were a grave stylistic sin. But hopefully it got
|
||||
the message across: the state-of-the-art NLP technologies are very powerful.
|
||||
spaCy gives you easy and efficient access to them, which lets you build all
|
||||
sorts of useful products and features that were previously impossible.
|
||||
|
||||
|
||||
Independent Evaluation
|
||||
----------------------
|
||||
|
||||
.. table:: Independent evaluation by Yahoo! Labs and Emory
|
||||
University, to appear at ACL 2015. Higher is better.
|
||||
|
||||
+----------------+------------+------------+------------+
|
||||
| System | Language | Accuracy | Speed |
|
||||
+----------------+------------+------------+------------+
|
||||
| spaCy v0.86 | Cython | 91.9 | **13,963** |
|
||||
+----------------+------------+------------+------------+
|
||||
| ClearNLP | Java | 91.7 | 10,271 |
|
||||
+----------------+------------+------------+------------+
|
||||
| spaCy v0.84 | Cython | 90.9 | 13,963 |
|
||||
+----------------+------------+------------+------------+
|
||||
| CoreNLP | Java | 89.6 | 8,602 |
|
||||
+----------------+------------+------------+------------+
|
||||
| MATE | Java | **92.5** | 550 |
|
||||
+----------------+------------+------------+------------+
|
||||
| Turbo | C++ | 92.4 | 349 |
|
||||
+----------------+------------+------------+------------+
|
||||
| Yara | Java | 92.3 | 340 |
|
||||
+----------------+------------+------------+------------+
|
||||
|
||||
|
||||
Accuracy is % unlabelled arcs correct, speed is tokens per second.
|
||||
|
||||
Joel Tetreault and Amanda Stent (Yahoo! Labs) and Jin-ho Choi (Emory) performed
|
||||
a detailed comparison of the best parsers available. All numbers above
|
||||
are taken from the pre-print they kindly made available to me,
|
||||
except for spaCy v0.86.
|
||||
|
||||
I'm particularly grateful to the authors for discussion of their results, which
|
||||
led to the improvement in accuracy between v0.84 and v0.86. A tip from Jin-ho
|
||||
(developer of ClearNLP) was particularly useful.
|
||||
|
||||
|
||||
Detailed Speed Comparison
|
||||
-------------------------
|
||||
|
||||
**Set up**: 100,000 plain-text documents were streamed from an SQLite3
|
||||
database, and processed with an NLP library, to one of three levels of detail
|
||||
--- tokenization, tagging, or parsing. The tasks are additive: to parse the
|
||||
text you have to tokenize and tag it. The pre-processing was not subtracted
|
||||
from the times --- I report the time required for the pipeline to complete.
|
||||
I report mean times per document, in milliseconds.
|
||||
|
||||
**Hardware**: Intel i7-3770 (2012)
|
||||
|
||||
.. table:: Per-document processing times. Lower is better.
|
||||
|
||||
+--------------+---------------------------+--------------------------------+
|
||||
| | Absolute (ms per doc) | Relative (to spaCy) |
|
||||
+--------------+----------+--------+-------+----------+---------+-----------+
|
||||
| System | Tokenize | Tag | Parse | Tokenize | Tag | Parse |
|
||||
+--------------+----------+--------+-------+----------+---------+-----------+
|
||||
| spaCy | 0.2ms | 1ms | 19ms | 1x | 1x | 1x |
|
||||
+--------------+----------+--------+-------+----------+---------+-----------+
|
||||
| CoreNLP | 2ms | 10ms | 49ms | 10x | 10x | 2.6x |
|
||||
+--------------+----------+--------+-------+----------+---------+-----------+
|
||||
| ZPar | 1ms | 8ms | 850ms | 5x | 8x | 44.7x |
|
||||
+--------------+----------+--------+-------+----------+---------+-----------+
|
||||
| NLTK | 4ms | 443ms | n/a | 20x | 443x | n/a |
|
||||
+--------------+----------+--------+-------+----------+---------+-----------+
|
||||
|
||||
|
||||
Efficiency is a major concern for NLP applications. It is very common to hear
|
||||
people say that they cannot afford more detailed processing, because their
|
||||
datasets are too large. This is a bad position to be in. If you can't apply
|
||||
detailed processing, you generally have to cobble together various heuristics.
|
||||
This normally takes a few iterations, and what you come up with will usually be
|
||||
brittle and difficult to reason about.
|
||||
|
||||
spaCy's parser is faster than most taggers, and its tokenizer is fast enough
|
||||
for any workload. And the tokenizer doesn't just give you a list
|
||||
of strings. A spaCy token is a pointer to a Lexeme struct, from which you can
|
||||
access a wide range of pre-computed features, including embedded word
|
||||
representations.
|
||||
|
||||
.. I wrote spaCy because I think existing commercial NLP engines are crap.
|
||||
Alchemy API are a typical example. Check out this part of their terms of
|
||||
service:
|
||||
publish or perform any benchmark or performance tests or analysis relating to
|
||||
the Service or the use thereof without express authorization from AlchemyAPI;
|
||||
|
||||
.. Did you get that? You're not allowed to evaluate how well their system works,
|
||||
unless you're granted a special exception. Their system must be pretty
|
||||
terrible to motivate such an embarrassing restriction.
|
||||
They must know this makes them look bad, but they apparently believe allowing
|
||||
you to evaluate their product would make them look even worse!
|
||||
|
||||
.. spaCy is based on science, not alchemy. It's open source, and I am happy to
|
||||
clarify any detail of the algorithms I've implemented.
|
||||
It's evaluated against the current best published systems, following the standard
|
||||
methodologies. These evaluations show that it performs extremely well.
|
||||
.. See `Benchmarks`_ for details.
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
:hidden:
|
||||
|
||||
quickstart.rst
|
||||
reference/index.rst
|
||||
license.rst
|
||||
updates.rst
|
|
@ -1,126 +0,0 @@
|
|||
=======
|
||||
License
|
||||
=======
|
||||
|
||||
* Download the `license agreement`_
|
||||
* Get in touch: matt@spacy.io
|
||||
|
||||
.. _license agreement: spacy_trial_free.docx
|
||||
|
||||
|
||||
+------------+-----------+----------+-------------------------------------+
|
||||
| License | Price | Term | Suitable for |
|
||||
+============+===========+==========+=====================================+
|
||||
| Commercial | $5,000 | Life | Production use |
|
||||
+------------+-----------+----------+-------------------------------------+
|
||||
| Trial | $0 | 90 days | Evaluation, seed startup |
|
||||
+------------+-----------+----------+-------------------------------------+
|
||||
| AGPLv3 | Free | Life | Research, teaching, hobbyists, FOSS |
|
||||
+------------+-----------+----------+-------------------------------------+
|
||||
|
||||
|
||||
To make spaCy as valuable as possible, licenses to it are for life. You get
|
||||
complete transparency, certainty and control.
|
||||
If you need to use spaCy as an API, it's trivial to host it yourself --- and
|
||||
you don't need to worry about the service changing or disappearing.
|
||||
And if you're ever in acquisition or IPO talks, the story is simple.
|
||||
|
||||
spaCy can also be used as free open-source software, under the Affero GPL
|
||||
license. If you use it this way, you must comply with the AGPL license terms.
|
||||
When you distribute your project, or offer it as a network service, you must
|
||||
distribute the source-code and grant users an AGPL license to it.
|
||||
|
||||
|
||||
.. I left academia in June 2014, just when I should have been submitting my first
|
||||
grant proposal. Grant writing seemed a bad business model. I wasn't sure
|
||||
exactly what I would do instead, but I knew that the work I could do was
|
||||
valuable, and that it would make sense for people to pay me to do it, and that
|
||||
it's often easy to convince smart people of things that are true.
|
||||
|
||||
.. I left because I don't like the grant system. It's not the
|
||||
best way to create value, and it's not the best way to get paid.
|
||||
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
In order to clarify how spaCy's license structure might apply to you, I've
|
||||
written a few examples, in the form of user-stories.
|
||||
|
||||
Ashley and Casey: Seed stage start-up
|
||||
#####################################
|
||||
|
||||
Ashley and Casey have an idea for a start-up. To explore their idea, they want
|
||||
to build a minimum viable product they can put in front of potential users and
|
||||
investors.
|
||||
|
||||
They have two options.
|
||||
|
||||
1. **Trial commercial license.** With a simple form, they can use spaCy for 90
|
||||
days, for a nominal fee of $1. They are free to modify spaCy, and they
|
||||
will own the copyright to their modifications for the duration of the license.
|
||||
After the trial period elapses, they can either pay the license fee, stop
|
||||
using spaCy, or release their project under the AGPL.
|
||||
|
||||
2. **AGPL.** Ashley and Casey can instead use spaCy under the AGPL license.
|
||||
However, they must then release any code that statically or dynamically
|
||||
links to spaCy under the AGPL as well (e.g. if they import the module, or
|
||||
import a module that imports it, etc). They also cannot use spaCy as
|
||||
a network resource, by running it as a service --- this is the
|
||||
loophole that the "A" part of the AGPL is designed to close.
|
||||
|
||||
Ashley and Casey find the AGPL license unattractive for commercial use.
|
||||
They decide to take up the trial commercial license.
|
||||
However, over the next 90 days, Ashley has to move house twice, and Casey gets
|
||||
sick. By the time the trial expires, they still don't have a demo they can show
|
||||
investors. They send an email explaining the situation, and a 90 day extension
|
||||
to their trial license is granted.
|
||||
|
||||
By the time the extension period has elapsed, spaCy has helped them secure
|
||||
funding, and they even have a little revenue. They are glad to pay the $5,000
|
||||
commercial license fee.
|
||||
|
||||
spaCy is now permanently licensed for the product Ashley and Casey are
|
||||
developing. They own the copyright to any modifications they make to spaCy,
|
||||
but not to the original spaCy code.
|
||||
|
||||
No additional fees will be due when they hire new developers, run spaCy on
|
||||
additional internal servers, etc. If their company is acquired, the license will
|
||||
be transferred to the company acquiring them. However, to use spaCy in another
|
||||
product, they will have to buy a second license.
|
||||
|
||||
|
||||
Alex and Sasha: University Academics
|
||||
####################################
|
||||
|
||||
Alex and Sasha are post-doctoral researchers working for a university. Part of
|
||||
their funding comes from a grant from Google, but Google will not own any part
|
||||
of the work that they produce. Their mission is just to write papers.
|
||||
|
||||
Alex and Sasha find spaCy convenient, so they use it in their system under the
|
||||
AGPL. This means that their system must also be released under the AGPL, but they're
|
||||
cool with that --- they were going to release their code anyway, as it's the only
|
||||
way to ensure their experiments are properly repeatable.
|
||||
|
||||
Alex and Sasha find and fix a few bugs in spaCy. They must release these
|
||||
modifications, and they ask that they be accepted into the main spaCy repo.
|
||||
In order to do this, they must sign a contributor agreement, ceding their
|
||||
copyright. When commercial licenses to spaCy are sold, Alex and Sasha will
|
||||
not be able to claim any royalties from their contributions.
|
||||
|
||||
Later, Alex and Sasha implement new features into spaCy, for another paper. The
|
||||
code was quite rushed, and they don't want to take the time to put together a
|
||||
proper pull request. They must release their modifications under the AGPL, but
|
||||
they are not obliged to contribute them to the spaCy repository, or cede their
|
||||
copyright.
|
||||
|
||||
|
||||
Phuong and Jessie: Open Source developers
|
||||
#########################################
|
||||
|
||||
Phuong and Jessie use the open-source software Calibre to manage their e-book
|
||||
libraries. They have an idea for a search feature, and they want to use spaCy
|
||||
to implement it. Calibre is released under the GPLv3. The AGPL has additional
|
||||
restrictions for projects used as a network resource, but they don't apply to
|
||||
this project, so Phuong and Jessie can use spaCy to improve Calibre. They'll
|
||||
have to release their code, but that was always their intention anyway.
|
|
@ -1,236 +0,0 @@
|
|||
Quick Start
|
||||
===========
|
||||
|
||||
|
||||
Install
|
||||
-------
|
||||
|
||||
.. py:currentmodule:: spacy
|
||||
|
||||
|
||||
With Python 2.7 or Python 3, using Linux or OSX, run:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ pip install spacy
|
||||
$ python -m spacy.en.download
|
||||
|
||||
.. _300 mb of data: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_data_all-0.4.tgz
|
||||
|
||||
|
||||
The download command fetches and installs about 300mb of data, for the
|
||||
parser model and word vectors, which it installs within the spacy.en package directory.
|
||||
|
||||
If you're stuck using a server with an old version of Python, and you don't
|
||||
have root access, I've prepared a bootstrap script to help you compile a local
|
||||
Python install. Run:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
|
||||
|
||||
The other way to install the package is to clone the github repository, and
|
||||
build it from source. This installs an additional dependency, Cython.
|
||||
If you're using Python 2, I also recommend installing fabric and fabtools ---
|
||||
this is how I build the project.
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ git clone https://github.com/honnibal/spaCy.git
|
||||
$ cd spaCy
|
||||
$ virtualenv .env && source .env/bin/activate
|
||||
$ export PYTHONPATH=`pwd`
|
||||
$ pip install -r requirements.txt
|
||||
$ python setup.py build_ext --inplace
|
||||
$ python -m spacy.en.download
|
||||
$ pip install pytest
|
||||
$ py.test tests/
|
||||
|
||||
Python packaging is awkward at the best of times, and it's particularly tricky
|
||||
with C extensions, built via Cython, requiring large data files. So, please
|
||||
report issues as you encounter them, and bear with me :)
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
The main entry-point is :meth:`en.English.__call__`, which accepts a unicode string
|
||||
as an argument, and returns a :py:class:`tokens.Doc` object. You can
|
||||
iterate over it to get :py:class:`tokens.Token` objects, which provide
|
||||
a convenient API:
|
||||
|
||||
>>> from __future__ import unicode_literals # If Python 2
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
>>> tokens = nlp(u'I ate the pizza with anchovies.')
|
||||
>>> pizza = tokens[3]
|
||||
>>> (pizza.orth, pizza.orth_, pizza.head.lemma, pizza.head.lemma_)
|
||||
... (14702, u'pizza', 14702, u'eat')
|
||||
|
||||
spaCy maps all strings to sequential integer IDs --- a common trick in NLP.
|
||||
If an attribute `Token.foo` is an integer ID, then `Token.foo_` is the string,
|
||||
e.g. `pizza.orth` and `pizza.orth_` provide the integer ID and the string of
|
||||
the original orthographic form of the word.
|
||||
|
||||
.. note:: en.English.__call__ is stateful --- it has an important **side-effect**.
|
||||
|
||||
When it processes a previously unseen word, it increments the ID counter,
|
||||
assigns the ID to the string, and writes the mapping in
|
||||
:py:data:`English.vocab.strings` (instance of
|
||||
:py:class:`strings.StringStore`).
|
||||
Future releases will feature a way to reconcile mappings, but for now, you
|
||||
should only work with one instance of the pipeline at a time.
|
||||
|
||||
|
||||
(Most of the) API at a glance
|
||||
-----------------------------
|
||||
|
||||
**Process the string:**
|
||||
|
||||
.. py:class:: spacy.en.English(self, data_dir=join(dirname(__file__), 'data'))
|
||||
|
||||
.. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Doc
|
||||
|
||||
+-----------------+--------------+--------------+
|
||||
| Attribute | Type | Its API |
|
||||
+=================+==============+==============+
|
||||
| vocab | Vocab | __getitem__ |
|
||||
+-----------------+--------------+--------------+
|
||||
| vocab.strings | StringStore | __getitem__ |
|
||||
+-----------------+--------------+--------------+
|
||||
| tokenizer | Tokenizer | __call__ |
|
||||
+-----------------+--------------+--------------+
|
||||
| tagger | EnPosTagger | __call__ |
|
||||
+-----------------+--------------+--------------+
|
||||
| parser | GreedyParser | __call__ |
|
||||
+-----------------+--------------+--------------+
|
||||
| entity | GreedyParser | __call__ |
|
||||
+-----------------+--------------+--------------+
|
||||
|
||||
**Get dict or numpy array:**
|
||||
|
||||
.. py:method:: tokens.Doc.to_array(self, attr_ids: List[int]) --> ndarray[ndim=2, dtype=long]
|
||||
|
||||
.. py:method:: tokens.Doc.count_by(self, attr_id: int) --> Dict[int, int]
|
||||
|
||||
**Get Token objects**
|
||||
|
||||
.. py:method:: tokens.Doc.__getitem__(self, i) --> Token
|
||||
|
||||
.. py:method:: tokens.Doc.__iter__(self) --> Iterator[Token]
|
||||
|
||||
**Get sentence or named entity spans**
|
||||
|
||||
.. py:attribute:: tokens.Doc.sents --> Iterator[Span]
|
||||
|
||||
.. py:attribute:: tokens.Doc.ents --> Iterator[Span]
|
||||
|
||||
You can iterate over a Span to access its individual Token objects, or access its
|
||||
start, end or label.
|
||||
|
||||
|
||||
**Embedded word representations**
|
||||
|
||||
.. py:attribute:: tokens.Token.repvec
|
||||
|
||||
.. py:attribute:: lexeme.Lexeme.repvec
|
||||
|
||||
|
||||
**Navigate to tree- or string-neighbor tokens**
|
||||
|
||||
.. py:method:: nbor(self, i=1) --> Token
|
||||
|
||||
.. py:method:: child(self, i=1) --> Token
|
||||
|
||||
.. py:method:: sibling(self, i=1) --> Token
|
||||
|
||||
.. py:attribute:: head: Token
|
||||
|
||||
.. py:attribute:: dep: int
|
||||
|
||||
**Align to original string**
|
||||
|
||||
.. py:attribute:: string: unicode
|
||||
|
||||
Padded with original whitespace.
|
||||
|
||||
.. py:attribute:: length: int
|
||||
|
||||
Length, in unicode code-points. Equal to len(self.orth_).
|
||||
|
||||
.. py:attribute:: idx: int
|
||||
|
||||
Starting offset of word in the original string.
|
||||
|
||||
|
||||
Features
|
||||
--------
|
||||
|
||||
|
||||
**Boolean features**
|
||||
|
||||
>>> lexeme = nlp.vocab[u'Apple']
|
||||
>>> lexeme.is_alpha, lexeme.is_upper
|
||||
True, False
|
||||
>>> tokens = nlp('Apple computers')
|
||||
>>> tokens[0].is_alpha, tokens[0].is_upper
|
||||
True, False
|
||||
>>> from spacy.en.attrs import IS_ALPHA, IS_UPPER
|
||||
>>> tokens.to_array((IS_ALPHA, IS_UPPER))[0]
|
||||
array([1, 0])
|
||||
|
||||
+----------+---------------------------------------------------------------+
|
||||
| is_alpha | :py:meth:`str.isalpha` |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| is_digit | :py:meth:`str.isdigit` |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| is_lower | :py:meth:`str.islower` |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| is_title | :py:meth:`str.istitle` |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| is_upper | :py:meth:`str.isupper` |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| is_ascii | all(ord(c) < 128 for c in string) |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| is_punct | all(unicodedata.category(c).startswith('P') for c in string) |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| like_url | Using various heuristics, does the string resemble a URL? |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| like_num | "Two", "10", "1,000", "10.54", "1/2" etc all match |
|
||||
+----------+---------------------------------------------------------------+
|
||||
|
||||
**String-transform Features**
|
||||
|
||||
|
||||
+----------+---------------------------------------------------------------+
|
||||
| orth | The original string, unmodified. |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| lower | The original string, forced to lower-case |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| norm | The string after additional normalization |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| shape | Word shape, e.g. 10 --> dd, Garden --> Xxxx, Hi!5 --> Xx!d |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| prefix | A short slice from the start of the string. |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| suffix | A short slice from the end of the string. |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| lemma | The word's lemma, i.e. morphological suffixes removed |
|
||||
+----------+---------------------------------------------------------------+
|
||||
|
||||
**Syntactic labels**
|
||||
|
||||
+----------+---------------------------------------------------------------+
|
||||
| pos | The word's part-of-speech, from the Google Universal Tag Set |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| tag | A fine-grained morphosyntactic tag, e.g. VBZ, NNS, etc |
|
||||
+----------+---------------------------------------------------------------+
|
||||
| dep | Dependency type label between word and its head, e.g. subj |
|
||||
+----------+---------------------------------------------------------------+
|
||||
|
||||
**Distributional**
|
||||
|
||||
+---------+-----------------------------------------------------------+
|
||||
| cluster | Brown cluster ID of the word |
|
||||
+---------+-----------------------------------------------------------+
|
||||
| prob | Log probability of word, smoothed with Simple Good-Turing |
|
||||
+---------+-----------------------------------------------------------+
|
|
@ -1,116 +0,0 @@
|
|||
====================
|
||||
Annotation Standards
|
||||
====================
|
||||
|
||||
This document describes the target annotations spaCy is trained to predict.
|
||||
|
||||
This is currently a work in progress. Please ask questions on the issue tracker,
|
||||
so that the answers can be integrated here to improve the documentation.
|
||||
|
||||
https://github.com/honnibal/spaCy/issues
|
||||
|
||||
English
|
||||
=======
|
||||
|
||||
Tokenization
|
||||
------------
|
||||
|
||||
Tokenization standards are based on the OntoNotes 5 corpus.
|
||||
|
||||
The tokenizer differs from most by including tokens for significant whitespace.
|
||||
Any sequence of whitespace characters beyond a single space (' ') is included
|
||||
as a token. For instance:
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English(parse=False)
|
||||
>>> tokens = nlp(u'Some\nspaces and\ttab characters')
|
||||
>>> print [t.orth_ for t in tokens]
|
||||
[u'Some', u'\n', u'spaces', u' ', u'and', u'\t', u'tab', u'characters']
|
||||
|
||||
The whitespace tokens are useful for much the same reason punctuation is --- it's
|
||||
often an important delimiter in the text. By preserving it in the token output,
|
||||
we are able to maintain a simple alignment between the tokens and the original
|
||||
string, and we ensure that the token stream does not lose information.
|
||||
|
||||
Sentence boundary detection
|
||||
---------------------------
|
||||
|
||||
Sentence boundaries are calculated from the syntactic parse tree, so features
|
||||
such as punctuation and capitalisation play an important but non-decisive role
|
||||
in determining the sentence boundaries. Usually this means that the sentence
|
||||
boundaries will at least coincide with clause boundaries, even given poorly
|
||||
punctuated text.
|
||||
|
||||
Part-of-speech Tagging
|
||||
----------------------
|
||||
|
||||
The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank
|
||||
tag set. We also map the tags to the simpler Google Universal POS Tag set.
|
||||
|
||||
Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124
|
||||
|
||||
Lemmatization
|
||||
-------------
|
||||
|
||||
A "lemma" is the uninflected form of a word. In English, this means:
|
||||
|
||||
* Adjectives: The form like "happy", not "happier" or "happiest"
|
||||
* Adverbs: The form like "badly", not "worse" or "worst"
|
||||
* Nouns: The form like "dog", not "dogs"; like "child", not "children"
|
||||
* Verbs: The form like "write", not "writes", "writing", "wrote" or "written"
|
||||
|
||||
The lemmatization data is taken from WordNet. However, we also add a special
|
||||
case for pronouns: all pronouns are lemmatized to the special token -PRON-.
|
||||
|
||||
Syntactic Dependency Parsing
|
||||
----------------------------
|
||||
|
||||
The parser is trained on data produced by the ClearNLP converter. Details of
|
||||
the annotation scheme can be found here:
|
||||
|
||||
http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
|
||||
|
||||
Named Entity Recognition
|
||||
------------------------
|
||||
|
||||
+--------------+-----------------------------------------------------+
|
||||
| PERSON | People, including fictional |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| NORP | Nationalities or religious or political groups |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| FACILITY | Buildings, airports, highways, bridges, etc. |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| ORGANIZATION | Companies, agencies, institutions, etc. |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| GPE | Countries, cities, states |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| LOCATION | Non-GPE locations, mountain ranges, bodies of water |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| PRODUCT | Vehicles, weapons, foods, etc. (Not services) |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| EVENT | Named hurricanes, battles, wars, sports events, etc.|
|
||||
+--------------+-----------------------------------------------------+
|
||||
| WORK OF ART | Titles of books, songs, etc. |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| LAW | Named documents made into laws |
|
||||
+--------------+-----------------------------------------------------+
|
||||
| LANGUAGE | Any named language |
|
||||
+--------------+-----------------------------------------------------+
|
||||
|
||||
The following values are also annotated in a style similar to names:
|
||||
|
||||
+--------------+---------------------------------------------+
|
||||
| DATE | Absolute or relative dates or periods |
|
||||
+--------------+---------------------------------------------+
|
||||
| TIME | Times smaller than a day |
|
||||
+--------------+---------------------------------------------+
|
||||
| PERCENT | Percentage (including “%”) |
|
||||
+--------------+---------------------------------------------+
|
||||
| MONEY | Monetary values, including unit |
|
||||
+--------------+---------------------------------------------+
|
||||
| QUANTITY | Measurements, as of weight or distance |
|
||||
+--------------+---------------------------------------------+
|
||||
| ORDINAL | "first", "second" |
|
||||
+--------------+---------------------------------------------+
|
||||
| CARDINAL | Numerals that do not fall under another type|
|
||||
+--------------+---------------------------------------------+
|
|
@ -1,112 +0,0 @@
|
|||
=============
|
||||
Documentation
|
||||
=============
|
||||
|
||||
The table below shows every class in spaCy: a link to its documentation, implementation,
|
||||
and a small usage snippet.
|
||||
|
||||
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| Class name | Usage | Implementation |
|
||||
+================+==========================+================================+
|
||||
| `English`_ | doc = English() | `spacy/en/__init__.py`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| Data objects |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `Doc`_ | doc = nlp(text) | `spacy/doc.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `Token`_ | token = doc[10] | `spacy/token.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `Span`_ | sent = doc.sents.next() | `spacy/span.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `Lexeme`_ | lex = nlp.vocab[u'word'] | `spacy/lexeme.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| Lookup tables |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `Vocab`_ | nlp.vocab | `spacy/vocab.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `StringStore`_ | nlp.vocab.strings | `spacy/strings.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| Processing modules |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `Tokenizer`_ | nlp.tokenizer | `spacy/tokenizer.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `EnPosTagger`_ | nlp.tagger | `spacy/en/pos.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `Parser`_ | nlp.parser | `spacy/syntax/parser.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| Parser internals |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| ArcEager | | spacy/syntax/arc_eager.pyx |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| BiluoPushDown | | spacy/syntax/ner.pyx |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| StateClass | | spacy/syntax/stateclass.pyx |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| Research Utilities |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `GoldParse`_ | | `spacy/gold.pyx`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
| `Scorer`_ | | `spacy/scorer.py`_ |
|
||||
+----------------+--------------------------+--------------------------------+
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 4
|
||||
|
||||
processing.rst
|
||||
using/document.rst
|
||||
using/span.rst
|
||||
using/token.rst
|
||||
using/lexeme.rst
|
||||
lookup.rst
|
||||
|
||||
|
||||
.. _English: processing.html
|
||||
|
||||
.. _Doc: using/document.html
|
||||
|
||||
.. _Token: using/token.html
|
||||
|
||||
.. _Span: using/span.html
|
||||
|
||||
.. _Vocab: lookup.html
|
||||
|
||||
.. _StringStore: lookup.html
|
||||
|
||||
.. _Tokenizer: processing.html
|
||||
|
||||
.. _EnPosTagger: processing.html
|
||||
|
||||
.. _Parser: processing.html
|
||||
|
||||
.. _Lexeme: lookup.html
|
||||
|
||||
.. _Scorer: misc.html
|
||||
|
||||
.. _GoldParse: misc.html
|
||||
|
||||
|
||||
.. _spacy/en/__init__.py: https://github.com/honnibal/spaCy/tree/master/spacy/en/__init__.py
|
||||
|
||||
.. _spacy/doc.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokens.pyx
|
||||
|
||||
.. _spacy/token.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokens.pyx
|
||||
|
||||
.. _spacy/span.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/spans.pyx
|
||||
|
||||
.. _spacy/vocab.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/vocab.pyx
|
||||
|
||||
.. _spacy/strings.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/strings.pyx
|
||||
|
||||
.. _spacy/tokenizer.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokenizer.pyx
|
||||
|
||||
.. _spacy/en/pos.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/en/pos.pyx
|
||||
|
||||
.. _spacy/syntax/parser.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/syntax/parser.pyx
|
||||
|
||||
.. _spacy/lexeme.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/lexeme.pyx
|
||||
|
||||
.. _spacy/gold.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/gold.pyx
|
||||
|
||||
.. _spacy/scorer.py: https://github.com/honnibal/spaCy/tree/master/spacy/scorer.py
|
|
@ -1,41 +0,0 @@
|
|||
=================
|
||||
Loading Resources
|
||||
=================
|
||||
In more detail:
|
||||
|
||||
.. code::
|
||||
|
||||
class English(object):
|
||||
def __init__(self,
|
||||
data_dir=path.join(path.dirname(__file__), 'data'),
|
||||
Tokenizer=Tokenizer.from_dir,
|
||||
Tagger=EnPosTagger,
|
||||
Parser=CreateParser(ArcEager),
|
||||
Entity=CreateParser(BiluoNER),
|
||||
load_vectors=True
|
||||
):
|
||||
|
||||
:code:`data_dir`
|
||||
:code:`unicode path`
|
||||
|
||||
The data directory. May be None, to disable any data loading (including
|
||||
the vocabulary).
|
||||
|
||||
:code:`Tokenizer`
|
||||
:code:`(Vocab vocab, unicode data_dir)(unicode) --> Doc`
|
||||
|
||||
A class/function that creates the tokenizer.
|
||||
|
||||
:code:`Tagger` / :code:`Parser` / :code:`Entity`
|
||||
:code:`(Vocab vocab, unicode data_dir)(Doc) --> None`
|
||||
|
||||
A class/function that creates the part-of-speech tagger /
|
||||
syntactic dependency parser / named entity recogniser.
|
||||
May be None or False, to disable tagging.
|
||||
|
||||
:code:`load_vectors`
|
||||
:code:`bool`
|
||||
A boolean value to control whether the word vectors are loaded.
|
||||
|
||||
|
||||
|
|
@ -1,111 +0,0 @@
|
|||
Lexical Lookup
|
||||
--------------
|
||||
|
||||
Where possible, spaCy computes information over lexical *types*, rather than
|
||||
*tokens*. If you process a large batch of text, the number of unique types
|
||||
you will see will grow much more slowly than the number of tokens --- so
|
||||
it's much more efficient to compute over types. And, in small samples, we generally
|
||||
want to know about the distribution of a word in the language at large ---
|
||||
which again, is type-based information.
|
||||
|
||||
You can access the lexical features via the Token object, but you can also look them
|
||||
up in the vocabulary directly:
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
>>> lexeme = nlp.vocab[u'Amazon']
|
||||
|
||||
.. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None)
|
||||
|
||||
.. py:method:: __len__(self)
|
||||
|
||||
:returns: number of words in the vocabulary
|
||||
:rtype: int
|
||||
|
||||
.. py:method:: __getitem__(self, key_int)
|
||||
|
||||
:param int key_int:
|
||||
Integer ID
|
||||
|
||||
:returns: A Lexeme object
|
||||
|
||||
.. py:method:: __getitem__(self, key_str)
|
||||
|
||||
:param unicode key_str:
|
||||
A string in the vocabulary
|
||||
|
||||
:rtype: Lexeme
|
||||
|
||||
|
||||
.. py:method:: __setitem__(self, orth_str, props)
|
||||
|
||||
:param unicode orth_str:
|
||||
The orth key
|
||||
|
||||
:param dict props:
|
||||
A props dictionary
|
||||
|
||||
:returns: None
|
||||
|
||||
.. py:method:: dump(self, loc)
|
||||
|
||||
:param unicode loc:
|
||||
Path where the vocabulary should be saved
|
||||
|
||||
.. py:method:: load_lexemes(self, loc)
|
||||
|
||||
:param unicode loc:
|
||||
Path to load the lexemes.bin file from
|
||||
|
||||
.. py:method:: load_vectors(self, loc)
|
||||
|
||||
:param unicode loc:
|
||||
Path to load the vectors.bin from
|
||||
|
||||
|
||||
.. py:class:: strings.StringStore(self)
|
||||
|
||||
.. py:method:: __len__(self)
|
||||
|
||||
:returns:
|
||||
Number of strings in the string-store
|
||||
|
||||
.. py:method:: __getitem__(self, key_int)
|
||||
|
||||
:param int key_int: An integer key
|
||||
|
||||
:returns:
|
||||
The string that the integer key maps to
|
||||
|
||||
:rtype: unicode
|
||||
|
||||
.. py:method:: __getitem__(self, key_unicode)
|
||||
|
||||
:param unicode key_unicode:
|
||||
A key, as a unicode string
|
||||
|
||||
:returns:
|
||||
The integer ID of the string.
|
||||
|
||||
:rtype: int
|
||||
|
||||
.. py:method:: __getitem__(self, key_utf8_bytes)
|
||||
|
||||
:param bytes key_utf8_bytes:
|
||||
A key, as a UTF-8 encoded byte-string
|
||||
|
||||
:returns:
|
||||
The integer ID of the string.
|
||||
|
||||
:rtype:
|
||||
int
|
||||
|
||||
.. py:method:: dump(self, loc)
|
||||
|
||||
:param loc:
|
||||
File path to save the strings.txt to.
|
||||
|
||||
.. py:method:: load(self, loc)
|
||||
|
||||
:param loc:
|
||||
File path to load the strings.txt from.
|
|
@ -1,89 +0,0 @@
|
|||
================
|
||||
spacy.en.English
|
||||
================
|
||||
|
||||
|
||||
99\% of the time, you will load spaCy's resources using a language pipeline class,
|
||||
e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a
|
||||
specified directory. By default, spaCy installs data into each language's
|
||||
package directory, and loads it from there.
|
||||
|
||||
Usually, this is all you will need:
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
|
||||
If you need to replace some of the components, you may want to just make your
|
||||
own pipeline class --- the English class itself does almost no work; it just
|
||||
applies the modules in order. You can also provide a function or class that
|
||||
produces a tokenizer, tagger, parser or entity recognizer to :code:`English.__init__`,
|
||||
to customize the pipeline:
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> from my_module import MyTagger
|
||||
>>> nlp = English(Tagger=MyTagger)
|
||||
|
||||
The text processing API is very small and simple. Everything is a callable object,
|
||||
and you will almost always apply the pipeline all at once.
|
||||
|
||||
|
||||
.. py:class:: spacy.en.English
|
||||
|
||||
.. py:method:: __init__(self, data_dir=..., Tokenizer=..., Tagger=..., Parser=..., Entity=..., Matcher=..., Packer=None, load_vectors=True)
|
||||
|
||||
:param unicode data_dir:
|
||||
The data directory. May be None, to disable any data loading (including
|
||||
the vocabulary).
|
||||
|
||||
:param Tokenizer:
|
||||
A class/function that creates the tokenizer.
|
||||
|
||||
:param Tagger:
|
||||
A class/function that creates the part-of-speech tagger.
|
||||
|
||||
:param Parser:
|
||||
A class/function that creates the dependency parser.
|
||||
|
||||
:param Entity:
|
||||
A class/function that creates the named entity recogniser.
|
||||
|
||||
:param bool load_vectors:
|
||||
A boolean value to control whether the word vectors are loaded.
|
||||
|
||||
.. py:method:: __call__(text, tag=True, parse=True, entity=True) --> Doc
|
||||
|
||||
:param unicode text:
|
||||
The text to be processed. No pre-processing needs to be applied, and any
|
||||
length of text can be submitted. Usually you will submit a whole document.
|
||||
Text may be zero-length. An exception is raised if byte strings are supplied.
|
||||
|
||||
:param bool tag:
|
||||
Whether to apply the part-of-speech tagger. Required for parsing and entity
|
||||
recognition.
|
||||
|
||||
:param bool parse:
|
||||
Whether to apply the syntactic dependency parser.
|
||||
|
||||
:param bool entity:
|
||||
Whether to apply the named entity recognizer.
|
||||
|
||||
:return: A document
|
||||
:rtype: :py:class:`spacy.tokens.Doc`
|
||||
|
||||
:Example:
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
>>> doc = nlp(u'Some text.') # Applies tagger, parser, entity
|
||||
>>> doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
|
||||
>>> doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
|
||||
>>> doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
|
||||
>>> doc = nlp(u'') # Zero-length tokens, not an error
|
||||
>>> doc = nlp(b'Some text') # Error: need unicode
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
File "spacy/en/__init__.py", line 128, in __call__
|
||||
tokens = self.tokenizer(text)
|
||||
TypeError: Argument 'string' has incorrect type (expected unicode, got str)
|
||||
>>> doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
|
||||
>>>
|
|
@ -1,94 +0,0 @@
|
|||
==============
|
||||
The Doc Object
|
||||
==============
|
||||
|
||||
|
||||
.. py:class:: spacy.tokens.doc.Doc
|
||||
|
||||
.. py:method:: __init__(self, Vocab vocab, orths_and_spaces=None)
|
||||
|
||||
:param Vocab vocab: A vocabulary object.
|
||||
|
||||
:param list orths_and_spaces=None: Defaults to None.
|
||||
|
||||
.. py:method:: __getitem__(self, int i)
|
||||
|
||||
:returns: Token
|
||||
|
||||
.. py:method:: __getitem__(self, slice start_colon_end)
|
||||
|
||||
:returns: Span
|
||||
|
||||
.. py:method:: __iter__(self)
|
||||
|
||||
Iterate over tokens
|
||||
|
||||
.. code::
|
||||
|
||||
>>> tokens = nlp(u'Zero one two three four five six')
|
||||
>>> tokens[0].orth_
|
||||
u'Zero'
|
||||
>>> tokens[-1].orth_
|
||||
u'six'
|
||||
|
||||
.. py:method:: __len__(self)
|
||||
|
||||
Number of tokens
|
||||
|
||||
.. py:attribute:: sents
|
||||
|
||||
Iterate over sentences in the document.
|
||||
|
||||
:returns generator: Sentences
|
||||
|
||||
.. py:attribute:: ents
|
||||
|
||||
Iterate over named entities in the document.
|
||||
|
||||
:returns tuple: Named Entities
|
||||
|
||||
.. py:attribute:: noun_chunks
|
||||
|
||||
:returns generator:
|
||||
|
||||
.. py:method:: to_array(self, list attr_ids)
|
||||
|
||||
Given a list of M attribute IDs, export the tokens to a numpy ndarray
|
||||
of shape N*M, where N is the length of the sentence.
|
||||
|
||||
:param list[int] attr_ids: A list of attribute ID ints.
|
||||
|
||||
:returns feat_array:
|
||||
A feature matrix, with one row per word, and one column per attribute
|
||||
indicated in the input attr_ids.
|
||||
|
||||
.. py:method:: count_by(self, attr_id)
|
||||
|
||||
Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||
by the values of the given attribute ID.
|
||||
|
||||
.. code::
|
||||
|
||||
>>> from spacy.en import English, attrs
|
||||
>>> nlp = English()
|
||||
>>> tokens = nlp(u'apple apple orange banana')
|
||||
>>> tokens.count_by(attrs.ORTH)
|
||||
{12800L: 1, 11880L: 2, 7561L: 1}
|
||||
>>> tokens.to_array([attrs.ORTH])
|
||||
array([[11880],
|
||||
[11880],
|
||||
[ 7561],
|
||||
[12800]])
|
||||
|
||||
.. py:method:: from_array(self, attrs, array)
|
||||
|
||||
.. py:method:: to_bytes(self)
|
||||
|
||||
.. py:method:: from_bytes(self)
|
||||
|
||||
.. py:method:: read_bytes(self)
|
||||
|
||||
.. py:method:: merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type)
|
||||
|
||||
Merge a multi-word expression into a single token. Currently
|
||||
experimental; API is likely to change.
|
|
@ -1,11 +0,0 @@
|
|||
==================
|
||||
Annotation Objects
|
||||
==================
|
||||
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 3
|
||||
|
||||
document.rst
|
||||
token.rst
|
||||
span.rst
|
|
@ -1,58 +0,0 @@
|
|||
===============
|
||||
The Span Object
|
||||
===============
|
||||
|
||||
.. autoclass:: spacy.spans.Span
|
||||
|
||||
.. py:class:: Span
|
||||
|
||||
|
||||
.. py:method:: __getitem__
|
||||
|
||||
.. py:method:: __iter__
|
||||
|
||||
.. py:method:: __len__
|
||||
|
||||
.. py:attribute:: root
|
||||
|
||||
Syntactic head
|
||||
|
||||
.. py:attribute:: lefts
|
||||
|
||||
Tokens that are:
|
||||
|
||||
1. To the left of the span;
|
||||
2. Syntactic children of words within the span
|
||||
|
||||
i.e.
|
||||
|
||||
.. code::
|
||||
|
||||
lefts = [span.doc[i] for i in range(0, span.start) if span.doc[i].head in span]
|
||||
|
||||
.. py:attribute:: rights
|
||||
|
||||
Tokens that are:
|
||||
|
||||
1. To the right of the span;
|
||||
2. Syntactic children of words within the span
|
||||
|
||||
i.e.
|
||||
|
||||
.. code::
|
||||
|
||||
rights = [span.doc[i] for i in range(span.end, len(span.doc)) if span.doc[i].head in span]
|
||||
|
||||
Tokens that are:
|
||||
|
||||
1. To the right of the span;
|
||||
2. Syntactic children of words within the span
|
||||
|
||||
|
||||
.. py:attribute:: string
|
||||
|
||||
.. py:attribute:: lemma / lemma\_
|
||||
|
||||
.. py:attribute:: label / label\_
|
||||
|
||||
.. py:attribute:: subtree
|
|
@ -1,195 +0,0 @@
|
|||
================
|
||||
The Token Object
|
||||
================
|
||||
|
||||
A Token represents a single word, punctuation or significant whitespace symbol.
|
||||
|
||||
Integer IDs are provided for all string features. The (unicode) string is
|
||||
provided by an attribute of the same name followed by an underscore, e.g.
|
||||
token.orth is an integer ID, token.orth\_ is the unicode value.
|
||||
|
||||
The only exception is the Token.string attribute, which is (unicode)
|
||||
string-typed.
|
||||
|
||||
|
||||
.. py:class:: Token
|
||||
|
||||
.. py:method:: __init__(self, Vocab vocab, Doc doc, int offset)
|
||||
|
||||
**String Views**
|
||||
|
||||
.. py:attribute:: orth / orth\_
|
||||
|
||||
The form of the word with no string normalization or processing, as it
|
||||
appears in the string, without trailing whitespace.
|
||||
|
||||
.. py:attribute:: lemma / lemma\_
|
||||
|
||||
The "base" of the word, with no inflectional suffixes, e.g. the lemma of
|
||||
"developing" is "develop", the lemma of "geese" is "goose", etc. Note that
|
||||
*derivational* suffixes are not stripped, e.g. the lemma of "institutions"
|
||||
is "institution", not "institute". Lemmatization is performed using the
|
||||
WordNet data, but extended to also cover closed-class words such as
|
||||
pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his".
|
||||
We assign pronouns the lemma -PRON-.
|
||||
|
||||
.. py:attribute:: lower / lower\_
|
||||
|
||||
The form of the word, but forced to lower-case, i.e. lower = word.orth\_.lower()
|
||||
|
||||
.. py:attribute:: norm / norm\_
|
||||
|
||||
The form of the word, after language-specific normalizations have been
|
||||
applied.
|
||||
|
||||
.. py:attribute:: shape / shape\_
|
||||
|
||||
A transform of the word's string, to show orthographic features. The
|
||||
characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d.
|
||||
After these mappings, sequences of 4 or more of the same character are
|
||||
truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
|
||||
:) --> :)
|
||||
|
||||
.. py:attribute:: prefix / prefix\_
|
||||
|
||||
A length-N substring from the start of the word. Length may vary by
|
||||
language; currently for English n=1, i.e. prefix = word.orth\_[:1]
|
||||
|
||||
.. py:attribute:: suffix / suffix\_
|
||||
|
||||
A length-N substring from the end of the word. Length may vary by
|
||||
language; currently for English n=3, i.e. suffix = word.orth\_[-3:]
|
||||
|
||||
.. py:attribute:: lex_id
|
||||
|
||||
**Alignment and Output**
|
||||
|
||||
.. py:attribute:: idx
|
||||
|
||||
.. py:method:: __len__(self)
|
||||
|
||||
.. py:method:: __unicode__(self)
|
||||
|
||||
.. py:method:: __str__(self)
|
||||
|
||||
.. py:attribute:: string
|
||||
|
||||
The form of the word as it appears in the string, **including trailing
|
||||
whitespace**. This is useful when you need to use linguistic features to
|
||||
add inline mark-up to the string.
|
||||
|
||||
.. py:method:: nbor(self, int i=1)
|
||||
|
||||
**Distributional Features**
|
||||
|
||||
.. py:attribute:: repvec
|
||||
|
||||
A "word embedding" representation: a dense real-valued vector that supports
|
||||
similarity queries between words. By default, spaCy currently loads
|
||||
vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
|
||||
model.
|
||||
|
||||
.. py:attribute:: cluster
|
||||
|
||||
The Brown cluster ID of the word. These are often useful features for
|
||||
linear models. If you're using a non-linear model, particularly
|
||||
a neural net or random forest, consider using the real-valued word
|
||||
representation vector, in Token.repvec, instead.
|
||||
|
||||
.. py:attribute:: prob
|
||||
|
||||
The unigram log-probability of the word, estimated from counts from a
|
||||
large corpus, smoothed using Simple Good Turing estimation.
|
||||
|
||||
**Navigating the Dependency Tree**
|
||||
|
||||
.. py:attribute:: pos / pos\_
|
||||
|
||||
A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB,
|
||||
ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech.
|
||||
|
||||
.. py:attribute:: tag / tag\_
|
||||
|
||||
A morphosyntactic tag, e.g. NN, VBZ, DT, etc. These tags are
|
||||
language/corpus specific, and typically describe part-of-speech and some
|
||||
amount of morphological information. For instance, in the Penn Treebank
|
||||
tag set, VBZ is assigned to a present-tense singular verb.
|
||||
|
||||
.. py:attribute:: dep / dep\_
|
||||
|
||||
The type of syntactic dependency relation between the word and its
|
||||
syntactic head.
|
||||
|
||||
.. py:attribute:: head
|
||||
|
||||
The Token that is the immediate syntactic head of the word. If the word is
|
||||
the root of the dependency tree, the same word is returned.
|
||||
|
||||
.. py:attribute:: lefts
|
||||
|
||||
An iterator for the immediate leftward syntactic children of the word.
|
||||
|
||||
.. py:attribute:: rights
|
||||
|
||||
An iterator for the immediate rightward syntactic children of the word.
|
||||
|
||||
.. py:attribute:: n_lefts
|
||||
|
||||
The number of immediate syntactic children preceding the word in the
|
||||
string.
|
||||
|
||||
.. py:attribute:: n_rights
|
||||
|
||||
The number of immediate syntactic children following the word in the
|
||||
string.
|
||||
|
||||
.. py:attribute:: children
|
||||
|
||||
An iterator that yields from lefts, and then yields from rights.
|
||||
|
||||
.. py:attribute:: subtree
|
||||
|
||||
An iterator for the part of the sentence syntactically governed by the
|
||||
word, including the word itself.
|
||||
|
||||
.. py:attribute:: left_edge
|
||||
|
||||
.. py:attribute:: right_edge
|
||||
|
||||
.. py:attribute:: conjuncts
|
||||
|
||||
**Named Entities**
|
||||
|
||||
.. py:attribute:: ent_type
|
||||
|
||||
If the token is part of an entity, its entity type
|
||||
|
||||
.. py:attribute:: ent_iob
|
||||
|
||||
The IOB (inside, outside, begin) entity recognition tag for the token
|
||||
|
||||
**Lexeme Flags**
|
||||
|
||||
.. py:method:: check_flag(self, attr_id_t flag_id)
|
||||
|
||||
.. py:attribute:: is_oov
|
||||
|
||||
.. py:attribute:: is_alpha
|
||||
|
||||
.. py:attribute:: is_ascii
|
||||
|
||||
.. py:attribute:: is_digit
|
||||
|
||||
.. py:attribute:: is_lower
|
||||
|
||||
.. py:attribute:: is_title
|
||||
|
||||
.. py:attribute:: is_punct
|
||||
|
||||
.. py:attribute:: is_space
|
||||
|
||||
.. py:attribute:: like_url
|
||||
|
||||
.. py:attribute:: like_num
|
||||
|
||||
.. py:attribute:: like_email
|
|
@ -1,280 +0,0 @@
|
|||
===================================
|
||||
Tutorial: Extractive Summarization
|
||||
===================================
|
||||
|
||||
This tutorial will go through the implementation of several extractive
|
||||
summarization models with spaCy.
|
||||
|
||||
An *extractive* summarization system is a filter over the original document/s:
|
||||
most of the text is removed, and the remaining text is formatted as a summary.
|
||||
In contrast, an *abstractive* summarization system generates new text.
|
||||
|
||||
Application Context
|
||||
-------------------
|
||||
|
||||
Extractive summarization systems need an application context. We can't ask how
|
||||
to design the system without some concept of what sort of summary will be
|
||||
useful for a given application. (Contrast with speech recognition, where
|
||||
a notion of "correct" is much less application-sensitive.)
|
||||
|
||||
For this, I've adopted the application context that `Flipboard`_ discuss in a
|
||||
recent blog post: they want to display lead-text to readers on mobile devices,
|
||||
so that readers can easily choose interesting links.
|
||||
|
||||
I've chosen this application context for two reasons. First, `Flipboard`_ say
|
||||
they're putting something like this into production. Second, there's a ready
|
||||
source of evaluation data. We can look at the lead-text that human editors
|
||||
have chosen, and evaluate whether our automatic system chooses similar text.
|
||||
|
||||
Experimental Setup
|
||||
------------------
|
||||
|
||||
Instead of scraping data, I'm using articles from the New York Times Annotated
|
||||
Corpus, which is a handy dump of XML-annotated articles distributed by the LDC.
|
||||
The annotations come with a field named "online lead paragraph". Our
|
||||
summarization systems will be evaluated on their Rouge-1 overlap with this
|
||||
field.
|
||||
|
||||
Further details of the experimental setup can be found in the appendices.
|
||||
|
||||
.. _newyorktimes.com: http://newyorktimes.com
|
||||
|
||||
.. _Flipboard: http://engineering.flipboard.com/2014/10/summarization/
|
||||
|
||||
.. _vector-space model: https://en.wikipedia.org/wiki/Vector_space_model
|
||||
|
||||
.. _LexRank algorithm: https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
|
||||
|
||||
.. _PageRank: https://en.wikipedia.org/wiki/PageRank
|
||||
|
||||
Summarizer API
|
||||
--------------
|
||||
|
||||
Each summarization model will have the following API:
|
||||
|
||||
.. py:func:`summarize(nlp: spacy.en.English, headline: unicode, paragraphs: List[unicode],
|
||||
target_length: int) --> summary: unicode
|
||||
|
||||
We receive the headline and a list of paragraphs, and a target length. We have
|
||||
to produce a block of text where len(text) < target_length. We want summaries
|
||||
that users will click on, and not bounce back out of. Long-term, we want
|
||||
summaries that would keep people using the app.
|
||||
|
||||
Baselines: Truncate
|
||||
-------------------
|
||||
|
||||
.. code:: python
|
||||
|
||||
def truncate_chars(nlp, headline, paragraphs, target_length):
|
||||
text = ' '.join(paragraphs)
|
||||
return text[:target_length - 3] + '...'
|
||||
|
||||
def truncate_words(nlp, headline, paragraphs, target_length):
|
||||
text = ' '.join(paragraphs)
|
||||
tokens = text.split()
|
||||
summary = []
|
||||
n_words = 0
|
||||
n_chars = 0
|
||||
while n_chars < target_length - 3:
|
||||
n_chars += len(tokens[n_words])
|
||||
n_chars += 1 # Space
|
||||
n_words += 1
|
||||
return ' '.join(tokens[:n_words]) + '...'
|
||||
|
||||
def truncate_sentences(nlp, headline, paragraphs, target_length):
|
||||
sentences = []
|
||||
summary = ''
|
||||
for para in paragraphs:
|
||||
tokens = nlp(para)
|
||||
for sentence in tokens.sentences():
|
||||
if len(summary) + len(sentence) >= target_length:
|
||||
return summary
|
||||
summary += str(sentence)
|
||||
return summary
|
||||
|
||||
I'd be surprised if Flipboard never had something like this in production. Details
|
||||
like lead-text take a while to float up the priority list. This strategy also has
|
||||
the advantage of transparency: it's obvious to users how the decision is being
|
||||
made, so nobody is likely to complain about the feature if it works this way.
|
||||
|
||||
Instead of cutting off the text mid-word, we can tokenize the text, and truncate at a word or sentence boundary.
|
||||
|
||||
+----------------+-----------+
|
||||
| System | Rouge-1 R |
|
||||
+----------------+-----------+
|
||||
| Truncate chars | 69.3 |
|
||||
+----------------+-----------+
|
||||
| Truncate words | 69.8 |
|
||||
+----------------+-----------+
|
||||
| Truncate sents | 48.5 |
|
||||
+----------------+-----------+
|
||||
|
||||
Sentence Vectors
|
||||
----------------
|
||||
|
||||
A simple bag-of-words model can be created using the `count_by` method, which
|
||||
produces a dictionary of frequencies, keyed by string IDs:
|
||||
|
||||
.. code:: python
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> from spacy.en.attrs import SIC
|
||||
>>> nlp = English()
|
||||
>>> tokens = nlp(u'a a a. b b b b.')
|
||||
>>> tokens.count_by(SIC)
|
||||
{41L: 4, 11L: 3, 5L: 2}
|
||||
>>> [s.count_by(SIC) for s in tokens.sentences()]
|
||||
[{11L: 3, 5L: 1}, {41L: 4, 5L: 1}]
|
||||
|
||||
|
||||
Similar functionality is provided by `scikit-learn`_, but with a different
|
||||
style of API design. With spaCy, functions generally have more limited
|
||||
responsibility. The advantage of this is that spaCy's APIs are much simpler,
|
||||
and it's often easier to compose functions in a more flexible way.
|
||||
|
||||
One particularly powerful feature of spaCy is its support for
|
||||
`word embeddings`_ --- the dense vectors introduced by deep learning models, and
|
||||
now commonly produced by `word2vec`_ and related systems.
|
||||
|
||||
Once a set of word embeddings has been installed, the vectors are available
|
||||
from any token:
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> from spacy.en.attrs import SIC
|
||||
>>> from scipy.spatial.distance import cosine
|
||||
>>> nlp = English()
|
||||
>>> tokens = nlp(u'Apple banana Batman hero')
|
||||
>>> cosine(tokens[0].vec, tokens[1].vec)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/
|
||||
|
||||
.. _word2vec: https://code.google.com/p/word2vec/
|
||||
|
||||
.. code:: python
|
||||
|
||||
def main(db_loc, output_dir, feat_type="tfidf"):
|
||||
nlp = spacy.en.English()
|
||||
|
||||
# Read stop list and make TF-IDF weights --- data needed for the
|
||||
# feature extraction.
|
||||
with open(stops_loc) as file_:
|
||||
stop_words = set(nlp.vocab.strings[word.strip()] for word in file_)
|
||||
idf_weights = get_idf_weights(nlp, iter_docs(db_loc))
|
||||
if feat_type == 'tfidf':
|
||||
feature_extractor = tfidf_extractor(stop_words, idf_weights)
|
||||
elif feat_type == 'vec':
|
||||
feature_extractor = vec_extractor(stop_words, idf_weights)
|
||||
|
||||
for i, text in enumerate(iter_docs(db_loc)):
|
||||
tokens = nlp(body)
|
||||
sentences = tokens.sentences()
|
||||
summary = summarize(sentences, feature_extractor)
|
||||
write_output(summary, output_dir, i)
|
||||
|
||||
|
||||
|
||||
|
||||
.. _scikit-learn: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
The LexRank Algorithm
|
||||
----------------------
|
||||
|
||||
LexRank is described as a graph-based algorithm, derived from Google's `PageRank`_.
|
||||
The nodes are sentences, and the edges are the similarities between one
|
||||
sentence and another. The "graph" is fully-connected, and its edges are
|
||||
undirected --- so, it's natural to represent this as a matrix:
|
||||
|
||||
.. code:: python
|
||||
|
||||
from scipy.spatial.distance import cosine
|
||||
import numpy
|
||||
|
||||
|
||||
def lexrank(sent_vectors):
|
||||
n = len(sent_vectors)
|
||||
# Build the cosine similarity matrix
|
||||
matrix = numpy.ndarray(shape=(n, n))
|
||||
for i in range(n):
|
||||
for j in range(n):
|
||||
matrix[i, j] = cosine(sent_vectors[i], sent_vectors[j])
|
||||
# Normalize
|
||||
for i in range(n):
|
||||
matrix[i] /= sum(matrix[i])
|
||||
return _pagerank(matrix)
|
||||
|
||||
The rows are normalized (i.e. rows sum to 1), allowing the PageRank algorithm
|
||||
to be applied. Unfortunately the PageRank implementation is rather opaque ---
|
||||
it's easier to just read the Wikipedia page:
|
||||
|
||||
.. code:: python
|
||||
|
||||
def _pagerank(matrix, d=0.85):
|
||||
# This is admittedly opaque --- just read the Wikipedia page.
|
||||
n = len(matrix)
|
||||
rank = numpy.ones(shape=(n,)) / n
|
||||
new_rank = numpy.zeros(shape=(n,))
|
||||
while not _has_converged(rank, new_rank):
|
||||
rank, new_rank = new_rank, rank
|
||||
for i in range(n):
|
||||
new_rank[i] = ((1.0 - d) / n) + (d * sum(rank * matrix[i]))
|
||||
return rank
|
||||
|
||||
def _has_converged(x, y, epsilon=0.0001):
|
||||
return all(abs(x[i] - y[i]) < epsilon for i in range(n))
|
||||
|
||||
|
||||
Initial Processing
|
||||
------------------
|
||||
|
||||
|
||||
|
||||
|
||||
Feature Extraction
|
||||
------------------
|
||||
|
||||
.. code:: python
|
||||
def sentence_vectors(sentence, idf_weights):
|
||||
tf_idf = {}
|
||||
for term, freq in sent.count_by(LEMMA).items():
|
||||
tf_idf[term] = freq * idf_weights[term]
|
||||
vectors.append(tf_idf)
|
||||
return vectors
|
||||
|
||||
The LexRank paper models each sentence as a bag-of-words
|
||||
|
||||
This is simple and fairly standard, but often gives
|
||||
underwhelming results. My idea is to instead calculate vectors from
|
||||
`word embeddings`_, which have been one of the exciting outcomes of the recent
|
||||
work on deep-learning. I had a quick look at the literature, and found
|
||||
a `recent workshop paper`_ that suggested the idea was plausible.
|
||||
|
||||
|
||||
|
||||
|
||||
Taking the feature representation and similarity function as parameters, the
|
||||
LexRank function looks like this:
|
||||
|
||||
|
||||
Given a list of N sentences, a function that maps a sentence to a feature
|
||||
vector, and a function that computes a similarity measure of two feature
|
||||
vectors, this produces a vector of N floats, which indicate how well each
|
||||
sentence represents the document as a whole.
|
||||
|
||||
.. _Rouge: https://en.wikipedia.org/wiki/ROUGE_%28metric%29
|
||||
|
||||
|
||||
.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/
|
||||
|
||||
.. _recent workshop paper: https://www.aclweb.org/anthology/W/W14/W14-1504.pdf
|
||||
|
||||
|
||||
Document Model
|
||||
--------------
|
|
@ -1,233 +0,0 @@
|
|||
Updates
|
||||
=======
|
||||
|
||||
To update your installation:
|
||||
|
||||
.. code:: bash
|
||||
|
||||
$ pip install --upgrade spacy
|
||||
$ python -m spacy.en.download all
|
||||
|
||||
Most updates ship a new model, so you will usually have to redownload the data.
|
||||
|
||||
v0.89
|
||||
-----
|
||||
|
||||
* Fix regression in parse times on very long texts. Recent versions were
|
||||
calculating parse features in a way that was polynomial in input length.
|
||||
* Add tag SP (coarse tag SPACE) for whitespace tokens. Ensure entity recogniser
|
||||
does not assign entities to whitespace.
|
||||
* Rename :code:`Span.head` to :code:`Span.root`, fix its documentation, and make
|
||||
it more efficient. I considered adding Span.head, Span.dep and Span.dep\_ as
|
||||
well, but for now I leave these as accessible via :code:`Span.root.head`,
|
||||
:code:`Span.root.dep`, and :code:`Span.root.dep\_`, to keep the API smaller.
|
||||
|
||||
|
||||
2015-07-08 v0.88
|
||||
----------------
|
||||
|
||||
Refactoring release.
|
||||
|
||||
If you have the data for v0.87, you don't need to redownload the data for this
|
||||
release.
|
||||
|
||||
* You can now set tag=False, parse=False or entity=False when creating the pipeline,
|
||||
to disable some of the models. See the documentation for details.
|
||||
* Models no longer lazy-loaded.
|
||||
* Warning emitted when parse=True or entity=True but model not loaded.
|
||||
* Rename the tokens.Tokens class to tokens.Doc. An alias has been made to assist
|
||||
backwards compatibility, but you should update your code to refer to the new
|
||||
class name.
|
||||
* Various bits of internal refactoring
|
||||
|
||||
|
||||
2015-07-01 v0.87
|
||||
----------------
|
||||
|
||||
* Changed weights data structure. Memory use should be reduced 30-40%.
|
||||
* Fixed speed regressions introduced in the last few versions.
|
||||
* Models should now be slightly more robust to noise in the input text, as I'm
|
||||
now training on data with a small amount of noise added, e.g. I randomly corrupt
|
||||
capitalization, swap spaces for newlines, etc. This is bringing a small
|
||||
benefit on out-of-domain data. I think this strategy could yield better
|
||||
results with a better noise-generation function. If you think you have a good
|
||||
way to make clean text resemble the kind of noisy input you're seeing in your
|
||||
domain, get in touch.
|
||||
|
||||
2015-06-24 v0.86
|
||||
----------------
|
||||
|
||||
* Parser now more accurate, using a novel non-monotonic transition system that's
|
||||
currently under review.
|
||||
|
||||
|
||||
2015-05-12 v0.85
|
||||
----------------
|
||||
|
||||
* Parser produces richer dependency labels following the `ClearNLP scheme`_
|
||||
* Training data now includes text from a variety of genres.
|
||||
* Parser now uses more memory and the data is slightly larger, due to the additional
|
||||
labels. Impact on efficiency is minimal: entire process still takes
|
||||
<10ms per document.
|
||||
|
||||
Most users should see a substantial increase in accuracy from the new model.
|
||||
Long post on accuracy evaluation and model details coming soon.
|
||||
|
||||
.. _ClearNLP scheme: https://github.com/clir/clearnlp-guidelines/blob/master/md/dependency/dependency_guidelines.md
|
||||
|
||||
|
||||
2015-05-12 v0.84
|
||||
----------------
|
||||
|
||||
* Bug fixes for parsing
|
||||
* Bug fixes for named entity recognition
|
||||
|
||||
2015-04-13 v0.80
|
||||
----------------
|
||||
|
||||
* Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements.
|
||||
|
||||
* Better sentence boundary detection, drawn from the syntactic structure.
|
||||
|
||||
* Lots of bug fixes.
|
||||
|
||||
2015-03-05 v0.70
|
||||
----------------
|
||||
|
||||
* Improved parse navigation API
|
||||
* Bug fixes to labelled parsing
|
||||
|
||||
|
||||
2015-01-30 spaCy v0.4: Still alpha, improving quickly
|
||||
-----------------------------------------------------
|
||||
|
||||
Five days ago I presented the alpha release of spaCy, a natural language
|
||||
processing library that brings state-of-the-art technology to small companies.
|
||||
|
||||
spaCy has been well received, and there are now a lot of eyes on the project.
|
||||
Naturally, lots of issues have surfaced. I'm grateful to those who've reported
|
||||
them. I've worked hard to address them as quickly as I could.
|
||||
|
||||
Bug Fixes
|
||||
----------
|
||||
|
||||
* Lexemes.bin data file had a platform-specific encoding.
|
||||
This was a silly error: instead of the string, or an index into the
|
||||
list of strings, I was storing the 64-bit hash of the string. On
|
||||
wide-unicode builds, a unicode string hashes differently. This meant that
|
||||
all look-ups into the vocabulary failed on wide unicode builds, which
|
||||
further meant that the part-of-speech tagger and parser features were not
|
||||
computed correctly.
|
||||
|
||||
The fix is simple: we already have to read in a list of all the strings, so
|
||||
just store an index into that list, instead of a hash.
|
||||
|
||||
* Parse tree navigation API was rough, and buggy.
|
||||
The parse-tree navigation API was the last thing I added before v0.3. I've
|
||||
now replaced it with something better. The previous API design was flawed,
|
||||
and the implementation was buggy --- Token.child() and Token.head were
|
||||
sometimes inconsistent.
|
||||
|
||||
I've addressed the most immediate problems, but this part of the design is
|
||||
still a work in progress. It's a difficult problem. The parse is a tree,
|
||||
and we want to freely navigate up and down it without creating reference
|
||||
cycles that inhibit garbage collection, and without doing a lot of copying,
|
||||
creating and deleting.
|
||||
|
||||
I think I've got a promising solution to this, but I suspect there's
|
||||
currently a memory leak. Please get in touch on the tracker if you want to
|
||||
know more, especially if you think you can help.
|
||||
|
||||
Known Issues
|
||||
------------
|
||||
|
||||
Some systems are still experiencing memory errors, which I'm having trouble
|
||||
pinning down or reproducing. Please send details of your system to the
|
||||
`Issue Tracker`_ if this is happening to you.
|
||||
|
||||
.. _Issue Tracker: https://github.com/honnibal/spaCy/issues
|
||||
|
||||
Enhancements: Train and evaluate on whole paragraphs
|
||||
----------------------------------------------------
|
||||
|
||||
.. note:: tl;dr: I shipped the wrong parsing model with 0.3. That model expected input to be segmented into sentences. 0.4 ships the correct model, which uses some algorithmic tricks to minimize the impact of tokenization and sentence segmentation errors on the parser.
|
||||
|
||||
|
||||
Most English parsing research is performed on text with perfect pre-processing:
|
||||
one newline between every sentence, one space between every token.
|
||||
It's always been done this way, and it's good. It's a useful idealisation,
|
||||
because the pre-processing has few algorithmic implications.
|
||||
|
||||
But, for practical performance, this stuff can matter a lot.
|
||||
Dridan and Oepen (2013) did a simple but rare thing: they actually ran a few
|
||||
parsers on raw text. Even on the standard Wall Street Journal corpus,
|
||||
where pre-processing tools are quite good, the quality of pre-processing
|
||||
made a big difference:
|
||||
|
||||
+-------------+-------+----------+
|
||||
| Preprocess | BLLIP | Berkeley |
|
||||
+-------------+-------+----------+
|
||||
| Gold | 90.9 | 89.8 |
|
||||
+-------------+-------+----------+
|
||||
| Default | 86.4 | 88.4 |
|
||||
+-------------+-------+----------+
|
||||
| Corrected | 89.9 | 88.8 |
|
||||
+-------------+-------+----------+
|
||||
|
||||
.. note:: spaCy is evaluated on unlabelled dependencies, whereas the above accuracy figures refer to phrase-structure trees. Accuracies are non-comparable.
|
||||
|
||||
|
||||
|
||||
In the standard experimental condition --- gold pre-processing --- the
|
||||
BLLIP parser is better. But, it turns out it ships with lousy pre-processing
|
||||
tools: when you evaluate the parsers on raw text, the BLLIP parser falls way
|
||||
behind. To verify that this was due to the quality of the pre-processing
|
||||
tools, and not some particular algorithmic sensitivity, Dridan and Oepen ran
|
||||
both parsers with their high-quality tokenizer and sentence segmenter. This
|
||||
confirmed that with equal pre-processing, the BLLIP parser is better.
|
||||
|
||||
The Dridan and Oepen paper really convinced me to take pre-processing seriously
|
||||
in spaCy. In fact, spaCy started life as just a tokenizer --- hence the name.
|
||||
|
||||
The spaCy parser has a special trick up its sleeve. Because both the tagger
|
||||
and parser run in linear time, it doesn't require that the input be divided
|
||||
into sentences. This is nice because it avoids error-cascades: if you segment
|
||||
first, then the parser just has to live with whatever decision the segmenter
|
||||
made.
|
||||
|
||||
But, even though I designed the system with this consideration in mind,
|
||||
I decided to present the initial results using the standard methodology, using
|
||||
gold-standard inputs. But...then I made a mistake.
|
||||
|
||||
Unfortunately, with all the other things I was doing before launch, I forgot
|
||||
all about this problem. spaCy launched with a parsing model that expected the
|
||||
input to be segmented into sentences, but with no sentence segmenter. This
|
||||
caused a drop in parse accuracy of 4%!
|
||||
|
||||
Over the last five days, I've worked hard to correct this. I implemented the
|
||||
modifications to the parsing algorithm I had planned, from Dongdong Zhang et al.
|
||||
(2013), and trained and evaluated the parser on raw text, using the version of
|
||||
the WSJ distributed by Read et al. (2012), and used in Dridan and Oepen's
|
||||
experiments.
|
||||
|
||||
I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly
|
||||
as well on raw text as text with gold-standard tokenization and sentence
|
||||
boundary detection.
|
||||
|
||||
I still need to evaluate this on web text, and I need to compare against the
|
||||
Stanford CoreNLP and other parsers. I suspect that most other parsers will
|
||||
decline in accuracy by 1% --- we'll see.
|
||||
|
||||
|
||||
+-------------+---------+
|
||||
| Preprocess | spaCy |
|
||||
+-------------+---------+
|
||||
| Gold | 92.4% |
|
||||
+-------------+---------+
|
||||
| Default | 92.2% |
|
||||
+-------------+---------+
|
||||
|
||||
2015-01-25
|
||||
----------
|
||||
|
||||
spaCy v0.33 launched --- first alpha build.
|
Loading…
Reference in New Issue