diff --git a/README.rst b/README.rst
index bac46dec4..671801061 100644
--- a/README.rst
+++ b/README.rst
@@ -6,7 +6,7 @@ Cython. spaCy is built on the very latest research, but it isn't researchware.
It was designed from day 1 to be used in real products. It's commercial
open-source software, released under the MIT license.
-💫 **Version 1.2 out now!** `Read the release notes here. `_
+💫 **Version 1.3 out now!** `Read the release notes here. `_
.. image:: http://i.imgur.com/wFvLZyJ.png
:target: https://travis-ci.org/explosion/spaCy
@@ -241,8 +241,38 @@ calling ``spacy.load()``, or by passing a ``path`` argument to the ``spacy.en.En
Changelog
=========
-2016-11-04 `v1.2.0 `_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
--------------------------------------------------------------------------------------------------------------------------------------------
+2016-12-03 `v1.3.0 `_: *Improve API consistency*
+---------------------------------------------------------------------------------------------
+
+**✨ API improvements**
+
+* Add ``Span.sentiment`` attribute.
+* `#658 `_: Add ``Span.noun_chunks`` iterator (thanks `@pokey `_).
+* `#642 `_: Let ``--data-path`` be specified when running download.py scripts (thanks `@ExplodingCabbage `_).
+* `#638 `_: Add German stopwords (thanks `@souravsingh `_).
+* `#614 `_: Fix ``PhraseMatcher`` to work with new ``Matcher`` (thanks `@sadovnychyi `_).
+
+**🔴 Bug fixes**
+
+* Fix issue `#605 `_: ``accept`` argument to ``Matcher`` now rejects matches as expected.
+* Fix issue `#617 `_: ``Vocab.load()`` now works with string paths, as well as ``Path`` objects.
+* Fix issue `#639 `_: Stop words in ``Language`` class now used as expected.
+* Fix issues `#656 `_, `#624 `_: ``Tokenizer`` special-case rules now support arbitrary token attributes.
+
+
+**📖 Documentation and examples**
+
+* Add `"Customizing the tokenizer" `_ workflow.
+* Add `"Training the tagger, parser and entity recognizer" `_ workflow.
+* Add `"Entity recognition" `_ workflow.
+* Fix various typos and inconsistencies.
+
+**👥 Contributors**
+
+Thanks to `@pokey `_, `@ExplodingCabbage `_, `@souravsingh `_, `@sadovnychyi `_, `@manojsakhwar `_, `@TiagoMRodrigues `_, `@savkov `_, `@pspiegelhalter `_, `@chenb67 `_, `@kylepjohnson `_, `@YanhaoYang `_, `@tjrileywisc `_, `@dechov `_, `@wjt `_, `@jsmootiv `_ and `@blarghmatey `_ for the pull requests!
+
+2016-11-04 `v1.2.0 `_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
+------------------------------------------------------------------------------------------------------------------------------------------------------
 **✨ Major features and improvements**
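
A minimal sketch of the new ``Span.sentiment`` attribute announced above, based on the ``spacy/tokens/span.pyx`` change and the tests added later in this diff; it assumes an installed English model, and the sentiment values are made up for illustration::

    import spacy

    nlp = spacy.load('en')
    # Illustrative lexeme-level sentiment values (not shipped with the model).
    nlp.vocab[u'good'].sentiment = 3.0
    nlp.vocab[u'bad'].sentiment = -2.0

    doc = nlp(u'good stuff bad stuff')
    print(doc[:2].sentiment)    # default: average over tokens, (3.0 + 0.0) / 2

    # A hook registered on the Doc takes precedence over the default average.
    doc.user_span_hooks['sentiment'] = lambda span: 10.0
    print(doc[:2].sentiment)    # 10.0
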
diff --git a/examples/training/load_ner.py b/examples/training/load_ner.py
new file mode 100644
index 000000000..bf81cee50
--- /dev/null
+++ b/examples/training/load_ner.py
@@ -0,0 +1,22 @@
+# Load a previously trained named entity recognizer and run it over a sample text.
+from __future__ import unicode_literals
+import spacy
+import pathlib
+from spacy.pipeline import EntityRecognizer
+from spacy.vocab import Vocab
+
+def load_model(model_dir):
+ model_dir = pathlib.Path(model_dir)
+ nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
+ with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
+ nlp.vocab.strings.load(file_)
+ nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
+ ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
+ return (nlp, ner)
+
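+# The model directory passed to load_model() is the one written by save_model()
+# in train_ner.py: config.json, model, vocab/lexemes.bin and vocab/strings.json.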
+(nlp, ner) = load_model('ner')
+doc = nlp.make_doc('Who is Shaka Khan?')
+nlp.tagger(doc)
+ner(doc)
+for word in doc:
+ print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
diff --git a/examples/training/train_ner.py b/examples/training/train_ner.py
index 8c96dc0a4..220244b93 100644
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@@ -10,6 +10,13 @@ from spacy.tagger import Tagger
def train_ner(nlp, train_data, entity_types):
+ # Add new words to vocab.
+ for raw_text, _ in train_data:
+ doc = nlp.make_doc(raw_text)
+ for word in doc:
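+            # Looking the word up in the vocab adds its lexeme if it's missing.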
+ _ = nlp.vocab[word.orth]
+
+ # Train NER.
ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
for itn in range(5):
random.shuffle(train_data)
@@ -20,21 +27,30 @@ def train_ner(nlp, train_data, entity_types):
ner.model.end_training()
return ner
+def save_model(ner, model_dir):
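+    # Write config.json, model, vocab/lexemes.bin and vocab/strings.json
+    # under model_dir; load_ner.py reads the same layout back.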
+ model_dir = pathlib.Path(model_dir)
+ if not model_dir.exists():
+ model_dir.mkdir()
+ assert model_dir.is_dir()
+
+ with (model_dir / 'config.json').open('w') as file_:
+ json.dump(ner.cfg, file_)
+ ner.model.dump(str(model_dir / 'model'))
+ if not (model_dir / 'vocab').exists():
+ (model_dir / 'vocab').mkdir()
+ ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
+ with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
+ ner.vocab.strings.dump(file_)
+
def main(model_dir=None):
- if model_dir is not None:
- model_dir = pathlib.Path(model_dir)
- if not model_dir.exists():
- model_dir.mkdir()
- assert model_dir.is_dir()
-
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
# v1.1.2 onwards
if nlp.tagger is None:
print('---- WARNING ----')
print('Data directory not found')
- print('please run: `python -m spacy.en.download βforce all` for better performance')
+ print('please run: `python -m spacy.en.download --force all` for better performance')
print('Using feature templates for tagging')
print('-----------------')
nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
@@ -56,16 +72,17 @@ def main(model_dir=None):
nlp.tagger(doc)
ner(doc)
for word in doc:
- print(word.text, word.tag_, word.ent_type_, word.ent_iob)
+ print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
if model_dir is not None:
- with (model_dir / 'config.json').open('w') as file_:
- json.dump(ner.cfg, file_)
- ner.model.dump(str(model_dir / 'model'))
+ save_model(ner, model_dir)
+
+
+
if __name__ == '__main__':
- main()
+ main('ner')
# Who "" 2
# is "" 2
# Shaka "" PERSON 3
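
A short sketch of how ``train_ner()`` and ``save_model()`` above would be driven together, assuming ``nlp`` is loaded as in ``main()``. The annotation format ``(text, [(start_char, end_char, label), ...])`` matches the entity-recognition workflow shown at the end of this diff; the sentences are illustrative only:

    train_data = [
        ('Who is Shaka Khan?', [(7, 17, 'PERSON')]),
        ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')]),
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])
    save_model(ner, 'ner')   # 'ner' is the directory load_ner.py expects
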
diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index 6d8f66630..d5a519942 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -69,7 +69,7 @@ def main(output_dir=None):
print(word.text, word.tag_, word.pos_)
if output_dir is not None:
tagger.model.dump(str(output_dir / 'pos' / 'model'))
- with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
+ with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
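+            # strings.dump() writes JSON text, so the file is opened in text mode.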
tagger.vocab.strings.dump(file_)
diff --git a/spacy/about.py b/spacy/about.py
index 3d5909d2c..1336000f4 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -4,7 +4,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
__title__ = 'spacy'
-__version__ = '1.2.0'
+__version__ = '1.3.0'
__summary__ = 'Industrial-strength NLP'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'
diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd
index 942d8aa9c..ca1d1ed79 100644
--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@@ -426,3 +426,9 @@ cpdef enum symbol_t:
#IS_QUOTE
#IS_LEFT_PUNCT
#IS_RIGHT_PUNCT
+
+# These symbols are currently missing. However, if we add them now,
+# we'll throw off the integer index and the model will have to be retrained.
+# We therefore wait until the next data version to add them.
+# acl
+
diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py
index 27a88a61b..aee869c5b 100644
--- a/spacy/tests/spans/test_span.py
+++ b/spacy/tests/spans/test_span.py
@@ -1,6 +1,7 @@
from __future__ import unicode_literals
from spacy.attrs import HEAD
from spacy.en import English
+from spacy.tokens.doc import Doc
import numpy as np
import pytest
@@ -49,3 +50,44 @@ def test_sent(doc):
assert span.sent.text == 'This is a sentence.'
span = doc[6:7]
assert span.sent.root.left_edge.text == 'This'
+
+
+def test_default_sentiment(EN):
+ '''Test new span.sentiment property's default averaging behaviour'''
+ good = EN.vocab[u'good']
+ good.sentiment = 3.0
+ bad = EN.vocab[u'bad']
+ bad.sentiment = -2.0
+
+ doc = Doc(EN.vocab, [u'good', 'stuff', u'bad', u'stuff'])
+
+ good_stuff = doc[:2]
+ assert good_stuff.sentiment == 3.0 / 2
+
+ bad_stuff = doc[-2:]
+ assert bad_stuff.sentiment == -2. / 2
+
+ good_stuff_bad = doc[:-1]
+    assert good_stuff_bad.sentiment == (3.0 - 2.0) / 3.0
+
+
+
+def test_override_sentiment(EN):
+    '''Test overriding span.sentiment via doc.user_span_hooks'''
+ good = EN.vocab[u'good']
+ good.sentiment = 3.0
+ bad = EN.vocab[u'bad']
+ bad.sentiment = -2.0
+
+ doc = Doc(EN.vocab, [u'good', 'stuff', u'bad', u'stuff'])
+
+ doc.user_span_hooks['sentiment'] = lambda span: 10.0
+
+ good_stuff = doc[:2]
+ assert good_stuff.sentiment == 10.0
+
+ bad_stuff = doc[-2:]
+ assert bad_stuff.sentiment == 10.0
+
+ good_stuff_bad = doc[:-1]
+ assert good_stuff_bad.sentiment == 10.0
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index a4f49555a..903ef26d1 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -179,6 +179,13 @@ cdef class Span:
self._vector_norm = sqrt(norm) if norm != 0 else 0
return self._vector_norm
+ property sentiment:
+ def __get__(self):
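+            # A hook registered in doc.user_span_hooks takes precedence;
+            # otherwise the per-token sentiment is averaged over the span.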
+ if 'sentiment' in self.doc.user_span_hooks:
+ return self.doc.user_span_hooks['sentiment'](self)
+ else:
+ return sum([token.sentiment for token in self]) / len(self)
+
property text:
def __get__(self):
text = self.text_with_ws
diff --git a/website/docs/usage/data-model.jade b/website/docs/usage/data-model.jade
index 3275d878f..beac4b60b 100644
--- a/website/docs/usage/data-model.jade
+++ b/website/docs/usage/data-model.jade
@@ -14,7 +14,7 @@ p After reading this page, you should be able to:
+h(3, "no-job-too-big") No job too big
p
- | When writing spaCy, one of my motos was #[em no job too big]. I wanted
+ | When writing spaCy, one of my mottos was #[em no job too big]. I wanted
| to make sure that if Google or Facebook were founded tomorrow, spaCy
| would be the obvious choice for them. I wanted spaCy to be the obvious
| choice for web-scale NLP. This meant sweating about performance, because
diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade
index 1b65f3331..ed29142f4 100644
--- a/website/docs/usage/entity-recognition.jade
+++ b/website/docs/usage/entity-recognition.jade
@@ -217,7 +217,7 @@ p
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
]
- nlp = spacy.load(entity=False, parser=False)
+ nlp = spacy.load('en', entity=False, parser=False)
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
for itn in range(5):