Merge github.com:explosion/spaCy into dutch

This commit is contained in:
Janneke van der Zwaan 2016-12-13 09:25:23 +01:00
commit 4a3fdcce8a
10 changed files with 143 additions and 19 deletions

View File

@@ -6,7 +6,7 @@ Cython. spaCy is built on the very latest research, but it isn't researchware.
It was designed from day 1 to be used in real products. It's commercial
open-source software, released under the MIT license.

-💫 **Version 1.2 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
+💫 **Version 1.3 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_

.. image:: http://i.imgur.com/wFvLZyJ.png
    :target: https://travis-ci.org/explosion/spaCy
@@ -241,8 +241,38 @@ calling ``spacy.load()``, or by passing a ``path`` argument to the ``spacy.en.En
Changelog
=========

-2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
--------------------------------------------------------------------------------------------------------------------------------------------
+2016-12-03 `v1.3.0 <https://github.com/explosion/spaCy/releases>`_: *Improve API consistency*
+---------------------------------------------------------------------------------------------
**✨ API improvements**
* Add ``Span.sentiment`` attribute.
* `#658 <https://github.com/explosion/spaCy/pull/658>`_: Add ``Span.noun_chunks`` iterator (thanks `@pokey <https://github.com/pokey>`_).
* `#642 <https://github.com/explosion/spaCy/pull/642>`_: Let ``--data-path`` be specified when running download.py scripts (thanks `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_).
* `#638 <https://github.com/explosion/spaCy/pull/638>`_: Add German stopwords (thanks `@souravsingh <https://github.com/souravsingh>`_).
* `#614 <https://github.com/explosion/spaCy/pull/614>`_: Fix ``PhraseMatcher`` to work with new ``Matcher`` (thanks `@sadovnychyi <https://github.com/sadovnychyi>`_).
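A minimal sketch of the new ``Span.sentiment`` attribute (illustrative values only; the lexeme-level score is assigned by hand here, mirroring this release's tests — by default the span averages its tokens' sentiment)::

    import spacy

    nlp = spacy.load('en')
    nlp.vocab[u'good'].sentiment = 3.0  # lexeme-level score, as in the tests
    doc = nlp(u'This is good stuff.')
    span = doc[2:4]  # "good stuff"
    # With no hook installed, Span.sentiment averages token.sentiment:
    assert span.sentiment == 3.0 / 2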
**🔴 Bug fixes**
* Fix issue `#605 <https://github.com/explosion/spaCy/issues/605>`_: ``accept`` argument to ``Matcher`` now rejects matches as expected.
* Fix issue `#617 <https://github.com/explosion/spaCy/issues/617>`_: ``Vocab.load()`` now works with string paths, as well as ``Path`` objects.
* Fix issue `#639 <https://github.com/explosion/spaCy/issues/639>`_: Stop words in ``Language`` class now used as expected.
* Fix issues `#656 <https://github.com/explosion/spaCy/issues/656>`_, `#624 <https://github.com/explosion/spaCy/issues/624>`_: ``Tokenizer`` special-case rules now support arbitrary token attributes.
**📖 Documentation and examples**
* Add `"Customizing the tokenizer" <https://spacy.io/docs/usage/customizing-tokenizer>`_ workflow.
* Add `"Training the tagger, parser and entity recognizer" <https://spacy.io/docs/usage/training>`_ workflow.
* Add `"Entity recognition" <https://spacy.io/docs/usage/entity-recognition>`_ workflow.
* Fix various typos and inconsistencies.
**👥 Contributors**
Thanks to `@pokey <https://github.com/pokey>`_, `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_, `@souravsingh <https://github.com/souravsingh>`_, `@sadovnychyi <https://github.com/sadovnychyi>`_, `@manojsakhwar <https://github.com/manojsakhwar>`_, `@TiagoMRodrigues <https://github.com/TiagoMRodrigues>`_, `@savkov <https://github.com/savkov>`_, `@pspiegelhalter <https://github.com/pspiegelhalter>`_, `@chenb67 <https://github.com/chenb67>`_, `@kylepjohnson <https://github.com/kylepjohnson>`_, `@YanhaoYang <https://github.com/YanhaoYang>`_, `@tjrileywisc <https://github.com/tjrileywisc>`_, `@dechov <https://github.com/dechov>`_, `@wjt <https://github.com/wjt>`_, `@jsmootiv <https://github.com/jsmootiv>`_ and `@blarghmatey <https://github.com/blarghmatey>`_ for the pull requests!
2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases/tag/v1.2.0>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
------------------------------------------------------------------------------------------------------------------------------------------------------
**✨ Major features and improvements**

View File

@@ -0,0 +1,22 @@
# Load NER
from __future__ import unicode_literals
import spacy
import pathlib
from spacy.pipeline import EntityRecognizer
from spacy.vocab import Vocab

def load_model(model_dir):
    model_dir = pathlib.Path(model_dir)
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    # Restore the string store and lexemes saved by the training script.
    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return (nlp, ner)

(nlp, ner) = load_model('ner')
doc = nlp.make_doc('Who is Shaka Khan?')
nlp.tagger(doc)
ner(doc)
for word in doc:
    print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)

View File

@@ -10,6 +10,13 @@ from spacy.tagger import Tagger

def train_ner(nlp, train_data, entity_types):
    # Add new words to vocab.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]
    # Train NER.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
@@ -20,21 +27,30 @@ def train_ner(nlp, train_data, entity_types):
    ner.model.end_training()
    return ner
def save_model(ner, model_dir):
    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    assert model_dir.is_dir()
    with (model_dir / 'config.json').open('w') as file_:
        json.dump(ner.cfg, file_)
    ner.model.dump(str(model_dir / 'model'))
    if not (model_dir / 'vocab').exists():
        (model_dir / 'vocab').mkdir()
    ner.vocab.dump(str(model_dir / 'vocab' / 'lexemes.bin'))
    with (model_dir / 'vocab' / 'strings.json').open('w', encoding='utf8') as file_:
        ner.vocab.strings.dump(file_)
def main(model_dir=None):
-    if model_dir is not None:
-        model_dir = pathlib.Path(model_dir)
-        if not model_dir.exists():
-            model_dir.mkdir()
-        assert model_dir.is_dir()
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
-        print('please run: `python -m spacy.en.download force all` for better performance')
+        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
@@ -56,16 +72,17 @@ def main(model_dir=None):
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
-        print(word.text, word.tag_, word.ent_type_, word.ent_iob)
+        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
    if model_dir is not None:
-        with (model_dir / 'config.json').open('w') as file_:
-            json.dump(ner.cfg, file_)
-        ner.model.dump(str(model_dir / 'model'))
+        save_model(ner, model_dir)

if __name__ == '__main__':
-    main()
+    main('ner')
    # Who "" 2
    # is "" 2
    # Shaka "" PERSON 3
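As a hypothetical end-to-end sketch of the refactor above — using train_ner and save_model from this file plus load_model from the new load NER example; the training data and directory name are illustrative assumptions:

    import spacy

    train_data = [('Who is Shaka Khan?', [(7, 17, 'PERSON')])]  # illustrative
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    ner = train_ner(nlp, train_data, ['PERSON'])
    save_model(ner, 'ner')           # writes config.json, model and vocab/
    nlp2, ner2 = load_model('ner')   # restore in a fresh session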

View File

@@ -69,7 +69,7 @@ def main(output_dir=None):
        print(word.text, word.tag_, word.pos_)
    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
-        with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
+        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
            tagger.vocab.strings.dump(file_)

View File

@@ -4,7 +4,7 @@
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py

__title__ = 'spacy'
-__version__ = '1.2.0'
+__version__ = '1.3.0'
__summary__ = 'Industrial-strength NLP'
__uri__ = 'https://spacy.io'
__author__ = 'Matthew Honnibal'

View File

@@ -426,3 +426,9 @@ cpdef enum symbol_t:
    #IS_QUOTE
    #IS_LEFT_PUNCT
    #IS_RIGHT_PUNCT

# These symbols are currently missing. However, if we add them now,
# we'll throw off the integer index and the model will have to be retrained.
# We therefore wait until the next data version to add them.
# acl

View File

@ -1,6 +1,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from spacy.attrs import HEAD from spacy.attrs import HEAD
from spacy.en import English from spacy.en import English
from spacy.tokens.doc import Doc
import numpy as np import numpy as np
import pytest import pytest
@@ -49,3 +50,44 @@ def test_sent(doc):
    assert span.sent.text == 'This is a sentence.'
    span = doc[6:7]
    assert span.sent.root.left_edge.text == 'This'

def test_default_sentiment(EN):
    '''Test new span.sentiment property's default averaging behaviour'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0
    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
    good_stuff = doc[:2]
    assert good_stuff.sentiment == 3.0 / 2
    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == -2.0 / 2
    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == (3.0 - 2.0) / 3.0
def test_override_sentiment(EN):
    '''Test that a user_span_hooks entry overrides span.sentiment's default behaviour'''
    good = EN.vocab[u'good']
    good.sentiment = 3.0
    bad = EN.vocab[u'bad']
    bad.sentiment = -2.0
    doc = Doc(EN.vocab, [u'good', u'stuff', u'bad', u'stuff'])
    doc.user_span_hooks['sentiment'] = lambda span: 10.0
    good_stuff = doc[:2]
    assert good_stuff.sentiment == 10.0
    bad_stuff = doc[-2:]
    assert bad_stuff.sentiment == 10.0
    good_stuff_bad = doc[:-1]
    assert good_stuff_bad.sentiment == 10.0

View File

@@ -179,6 +179,13 @@ cdef class Span:
            self._vector_norm = sqrt(norm) if norm != 0 else 0
            return self._vector_norm

    property sentiment:
        def __get__(self):
            if 'sentiment' in self.doc.user_span_hooks:
                return self.doc.user_span_hooks['sentiment'](self)
            else:
                return sum([token.sentiment for token in self]) / len(self)

    property text:
        def __get__(self):
            text = self.text_with_ws
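As the property shows, a 'sentiment' entry in doc.user_span_hooks takes precedence over the default token average. A minimal usage sketch, mirroring this commit's tests (the English model and the constant hook value are illustrative assumptions):

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'This is good stuff.')
    # Install a per-Doc hook; every Span of this Doc now reports 10.0:
    doc.user_span_hooks['sentiment'] = lambda span: 10.0
    assert doc[2:4].sentiment == 10.0  # the hook wins over the token average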

View File

@@ -14,7 +14,7 @@ p After reading this page, you should be able to:

+h(3, "no-job-too-big") No job too big

p
-    | When writing spaCy, one of my motos was #[em no job too big]. I wanted
+    | When writing spaCy, one of my mottos was #[em no job too big]. I wanted
    | to make sure that if Google or Facebook were founded tomorrow, spaCy
    | would be the obvious choice for them. I wanted spaCy to be the obvious
    | choice for web-scale NLP. This meant sweating about performance, because

View File

@@ -217,7 +217,7 @@ p
        ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
    ]
-    nlp = spacy.load(entity=False, parser=False)
+    nlp = spacy.load('en', entity=False, parser=False)
    ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
    for itn in range(5):