mirror of https://github.com/explosion/spaCy.git
Merge github.com:explosion/spaCy into dutch
This commit is contained in:
commit
4a3fdcce8a
36
README.rst
36
README.rst
|
@ -6,7 +6,7 @@ Cython. spaCy is built on the very latest research, but it isn't researchware.
|
||||||
It was designed from day 1 to be used in real products. It's commercial
|
It was designed from day 1 to be used in real products. It's commercial
|
||||||
open-source software, released under the MIT license.
|
open-source software, released under the MIT license.
|
||||||
|
|
||||||
💫 **Version 1.2 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
|
💫 **Version 1.3 out now!** `Read the release notes here. <https://github.com/explosion/spaCy/releases/>`_
|
||||||
|
|
||||||
.. image:: http://i.imgur.com/wFvLZyJ.png
|
.. image:: http://i.imgur.com/wFvLZyJ.png
|
||||||
:target: https://travis-ci.org/explosion/spaCy
|
:target: https://travis-ci.org/explosion/spaCy
|
||||||
|
@ -241,8 +241,38 @@ calling ``spacy.load()``, or by passing a ``path`` argument to the ``spacy.en.En
|
||||||
Changelog
|
Changelog
|
||||||
=========
|
=========
|
||||||
|
|
||||||
2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
|
2016-12-03 `v1.3.0 <https://github.com/explosion/spaCy/releases>`_: *Improve API consistency*
|
||||||
-------------------------------------------------------------------------------------------------------------------------------------------
|
---------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
**✨ API improvements**
|
||||||
|
|
||||||
|
* Add ``Span.sentiment`` attribute.
|
||||||
|
* `#658 <https://github.com/explosion/spaCy/pull/658>`_: Add ``Span.noun_chunks`` iterator (thanks `@pokey <https://github.com/pokey>`_).
|
||||||
|
* `#642 <https://github.com/explosion/spaCy/pull/642>`_: Let ``--data-path`` be specified when running download.py scripts (thanks `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_).
|
||||||
|
* `#638 <https://github.com/explosion/spaCy/pull/638>`_: Add German stopwords (thanks `@souravsingh <https://github.com/souravsingh>`_).
|
||||||
|
* `#614 <https://github.com/explosion/spaCy/pull/614>`_: Fix ``PhraseMatcher`` to work with new ``Matcher`` (thanks `@sadovnychyi <https://github.com/sadovnychyi>`_).
|
||||||
|
|
||||||
|
**🔴 Bug fixes**
|
||||||
|
|
||||||
|
* Fix issue `#605 <https://github.com/explosion/spaCy/issues/605>`_: ``accept`` argument to ``Matcher`` now rejects matches as expected.
|
||||||
|
* Fix issue `#617 <https://github.com/explosion/spaCy/issues/617>`_: ``Vocab.load()`` now works with string paths, as well as ``Path`` objects.
|
||||||
|
* Fix issue `#639 <https://github.com/explosion/spaCy/issues/639>`_: Stop words in ``Language`` class now used as expected.
|
||||||
|
* Fix issues `#656 <https://github.com/explosion/spaCy/issues/656>`_, `#624 <https://github.com/explosion/spaCy/issues/624>`_: ``Tokenizer`` special-case rules now support arbitrary token attributes.
|
||||||
|
|
||||||
|
|
||||||
|
**📖 Documentation and examples**
|
||||||
|
|
||||||
|
* Add `"Customizing the tokenizer" <https://spacy.io/docs/usage/customizing-tokenizer>`_ workflow.
|
||||||
|
* Add `"Training the tagger, parser and entity recognizer" <https://spacy.io/docs/usage/training>`_ workflow.
|
||||||
|
* Add `"Entity recognition" <https://spacy.io/docs/usage/entity-recognition>`_ workflow.
|
||||||
|
* Fix various typos and inconsistencies.
|
||||||
|
|
||||||
|
**👥 Contributors**
|
||||||
|
|
||||||
|
Thanks to `@pokey <https://github.com/pokey>`_, `@ExplodingCabbage <https://github.com/ExplodingCabbage>`_, `@souravsingh <https://github.com/souravsingh>`_, `@sadovnychyi <https://github.com/sadovnychyi>`_, `@manojsakhwar <https://github.com/manojsakhwar>`_, `@TiagoMRodrigues <https://github.com/TiagoMRodrigues>`_, `@savkov <https://github.com/savkov>`_, `@pspiegelhalter <https://github.com/pspiegelhalter>`_, `@chenb67 <https://github.com/chenb67>`_, `@kylepjohnson <https://github.com/kylepjohnson>`_, `@YanhaoYang <https://github.com/YanhaoYang>`_, `@tjrileywisc <https://github.com/tjrileywisc>`_, `@dechov <https://github.com/dechov>`_, `@wjt <https://github.com/wjt>`_, `@jsmootiv <https://github.com/jsmootiv>`_ and `@blarghmatey <https://github.com/blarghmatey>`_ for the pull requests!
|
||||||
|
|
||||||
|
2016-11-04 `v1.2.0 <https://github.com/explosion/spaCy/releases/tag/v1.2.0>`_: *Alpha tokenizers for Chinese, French, Spanish, Italian and Portuguese*
|
||||||
|
------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
**✨ Major features and improvements**
|
**✨ Major features and improvements**
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
# Load NER
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import spacy
|
||||||
|
import pathlib
|
||||||
|
from spacy.pipeline import EntityRecognizer
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
def load_model(model_dir):
    """Load the 'en' pipeline together with a saved entity recognizer.

    The pipeline is loaded with the parser, entity recognizer and vectors
    switched off. The strings and lexemes stored under
    ``model_dir / 'vocab'`` are then read into the pipeline's vocab, and the
    entity recognizer is loaded from ``model_dir`` against that vocab.

    model_dir: Directory (string or path) containing the saved model.
    Returns: A ``(nlp, ner)`` tuple.
    """
    model_dir = pathlib.Path(model_dir)
    vocab_dir = model_dir / 'vocab'

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # The strings must be read before the lexemes, as in the saved layout.
    strings_path = vocab_dir / 'strings.json'
    with strings_path.open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(vocab_dir / 'lexemes.bin')

    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return nlp, ner
|
||||||
|
|
||||||
|
# Load the pipeline and the entity recognizer saved in the 'ner' directory.
nlp, ner = load_model('ner')

# Tokenize, tag, then run the entity recognizer over the same doc.
doc = nlp.make_doc('Who is Shaka Khan?')
nlp.tagger(doc)
ner(doc)

# Show the per-token attributes, including the entity annotations.
for token in doc:
    print(token.text, token.orth, token.lower, token.tag_, token.ent_type_, token.ent_iob)
|
|
@ -10,6 +10,13 @@ from spacy.tagger import Tagger
|
||||||
|
|
||||||
|
|
||||||
def train_ner(nlp, train_data, entity_types):
|
def train_ner(nlp, train_data, entity_types):
|
||||||
|
# Add new words to vocab.
|
||||||
|
for raw_text, _ in train_data:
|
||||||
|
doc = nlp.make_doc(raw_text)
|
||||||
|
for word in doc:
|
||||||
|
_ = nlp.vocab[word.orth]
|
||||||
|
|
||||||
|
# Train NER.
|
||||||
ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
|
ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
|
||||||
for itn in range(5):
|
for itn in range(5):
|
||||||
random.shuffle(train_data)
|
random.shuffle(train_data)
|
||||||
|
@ -20,21 +27,30 @@ def train_ner(nlp, train_data, entity_types):
|
||||||
ner.model.end_training()
|
ner.model.end_training()
|
||||||
return ner
|
return ner
|
||||||
|
|
||||||
|
def save_model(ner, model_dir):
    """Write an entity recognizer and its vocab to ``model_dir``.

    ``model_dir`` and ``model_dir / 'vocab'`` are created when they do not
    exist yet. The files written are ``config.json``, ``model``,
    ``vocab/lexemes.bin`` and ``vocab/strings.json``.

    ner: Object providing ``cfg``, ``model`` and ``vocab`` attributes.
    model_dir: Target directory (string or path).
    """
    model_dir = pathlib.Path(model_dir)
    if not model_dir.exists():
        model_dir.mkdir()
    assert model_dir.is_dir()

    # Recognizer configuration and model weights.
    config_path = model_dir / 'config.json'
    with config_path.open('w') as file_:
        json.dump(ner.cfg, file_)
    ner.model.dump(str(model_dir / 'model'))

    # Vocab data lives in its own sub-directory.
    vocab_dir = model_dir / 'vocab'
    if not vocab_dir.exists():
        vocab_dir.mkdir()
    ner.vocab.dump(str(vocab_dir / 'lexemes.bin'))
    strings_path = vocab_dir / 'strings.json'
    with strings_path.open('w', encoding='utf8') as file_:
        ner.vocab.strings.dump(file_)
|
||||||
|
|
||||||
|
|
||||||
def main(model_dir=None):
|
def main(model_dir=None):
|
||||||
if model_dir is not None:
|
|
||||||
model_dir = pathlib.Path(model_dir)
|
|
||||||
if not model_dir.exists():
|
|
||||||
model_dir.mkdir()
|
|
||||||
assert model_dir.is_dir()
|
|
||||||
|
|
||||||
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
|
nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
|
||||||
|
|
||||||
# v1.1.2 onwards
|
# v1.1.2 onwards
|
||||||
if nlp.tagger is None:
|
if nlp.tagger is None:
|
||||||
print('---- WARNING ----')
|
print('---- WARNING ----')
|
||||||
print('Data directory not found')
|
print('Data directory not found')
|
||||||
print('please run: `python -m spacy.en.download –force all` for better performance')
|
print('please run: `python -m spacy.en.download --force all` for better performance')
|
||||||
print('Using feature templates for tagging')
|
print('Using feature templates for tagging')
|
||||||
print('-----------------')
|
print('-----------------')
|
||||||
nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
|
nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)
|
||||||
|
@ -56,16 +72,17 @@ def main(model_dir=None):
|
||||||
nlp.tagger(doc)
|
nlp.tagger(doc)
|
||||||
ner(doc)
|
ner(doc)
|
||||||
for word in doc:
|
for word in doc:
|
||||||
print(word.text, word.tag_, word.ent_type_, word.ent_iob)
|
print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
|
||||||
|
|
||||||
if model_dir is not None:
|
if model_dir is not None:
|
||||||
with (model_dir / 'config.json').open('w') as file_:
|
save_model(ner, model_dir)
|
||||||
json.dump(ner.cfg, file_)
|
|
||||||
ner.model.dump(str(model_dir / 'model'))
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
main('ner')
|
||||||
# Who "" 2
|
# Who "" 2
|
||||||
# is "" 2
|
# is "" 2
|
||||||
# Shaka "" PERSON 3
|
# Shaka "" PERSON 3
|
||||||
|
|
|
@ -69,7 +69,7 @@ def main(output_dir=None):
|
||||||
print(word.text, word.tag_, word.pos_)
|
print(word.text, word.tag_, word.pos_)
|
||||||
if output_dir is not None:
|
if output_dir is not None:
|
||||||
tagger.model.dump(str(output_dir / 'pos' / 'model'))
|
tagger.model.dump(str(output_dir / 'pos' / 'model'))
|
||||||
with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
|
with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
|
||||||
tagger.vocab.strings.dump(file_)
|
tagger.vocab.strings.dump(file_)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
|
||||||
|
|
||||||
__title__ = 'spacy'
|
__title__ = 'spacy'
|
||||||
__version__ = '1.2.0'
|
__version__ = '1.3.0'
|
||||||
__summary__ = 'Industrial-strength NLP'
|
__summary__ = 'Industrial-strength NLP'
|
||||||
__uri__ = 'https://spacy.io'
|
__uri__ = 'https://spacy.io'
|
||||||
__author__ = 'Matthew Honnibal'
|
__author__ = 'Matthew Honnibal'
|
||||||
|
|
|
@ -426,3 +426,9 @@ cpdef enum symbol_t:
|
||||||
#IS_QUOTE
|
#IS_QUOTE
|
||||||
#IS_LEFT_PUNCT
|
#IS_LEFT_PUNCT
|
||||||
#IS_RIGHT_PUNCT
|
#IS_RIGHT_PUNCT
|
||||||
|
|
||||||
|
# These symbols are currently missing. However, if we add them now,
|
||||||
|
# we'll throw off the integer index and the model will have to be retrained.
|
||||||
|
# We therefore wait until the next data version to add them.
|
||||||
|
# acl
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
from spacy.attrs import HEAD
|
from spacy.attrs import HEAD
|
||||||
from spacy.en import English
|
from spacy.en import English
|
||||||
|
from spacy.tokens.doc import Doc
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
@ -49,3 +50,44 @@ def test_sent(doc):
|
||||||
assert span.sent.text == 'This is a sentence.'
|
assert span.sent.text == 'This is a sentence.'
|
||||||
span = doc[6:7]
|
span = doc[6:7]
|
||||||
assert span.sent.root.left_edge.text == 'This'
|
assert span.sent.root.left_edge.text == 'This'
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_sentiment(EN):
    '''Without a user hook, span.sentiment is the mean of the token sentiments.'''
    vocab = EN.vocab
    positive = vocab[u'good']
    positive.sentiment = 3.0
    negative = vocab[u'bad']
    negative.sentiment = -2.0

    doc = Doc(vocab, [u'good', 'stuff', u'bad', u'stuff'])

    # The expected values below count 'stuff' as contributing 0 to the sum.
    assert doc[:2].sentiment == 3.0 / 2
    assert doc[-2:].sentiment == -2. / 2
    assert doc[:-1].sentiment == (3.+-2) / 3.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_override_sentiment(EN):
    '''A 'sentiment' entry in doc.user_span_hooks replaces the default span.sentiment.'''
    vocab = EN.vocab
    positive = vocab[u'good']
    positive.sentiment = 3.0
    negative = vocab[u'bad']
    negative.sentiment = -2.0

    doc = Doc(vocab, [u'good', 'stuff', u'bad', u'stuff'])

    # The hook ignores the span contents, so every span must report 10.0
    # regardless of the lexeme sentiments set above.
    doc.user_span_hooks['sentiment'] = lambda span: 10.0

    for span in (doc[:2], doc[-2:], doc[:-1]):
        assert span.sentiment == 10.0
|
||||||
|
|
|
@ -179,6 +179,13 @@ cdef class Span:
|
||||||
self._vector_norm = sqrt(norm) if norm != 0 else 0
|
self._vector_norm = sqrt(norm) if norm != 0 else 0
|
||||||
return self._vector_norm
|
return self._vector_norm
|
||||||
|
|
||||||
|
property sentiment:
    """Sentiment value for the span.

    If ``doc.user_span_hooks`` has a 'sentiment' entry, that callable is
    invoked with the span and its result is returned. Otherwise the result
    is the sum of ``token.sentiment`` over the span divided by the span
    length.
    """
    def __get__(self):
        hooks = self.doc.user_span_hooks
        if 'sentiment' in hooks:
            return hooks['sentiment'](self)
        # NOTE(review): a zero-length span makes len(self) == 0, so the
        # division below raises ZeroDivisionError.
        total = sum([token.sentiment for token in self])
        return total / len(self)
|
||||||
|
|
||||||
property text:
|
property text:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
text = self.text_with_ws
|
text = self.text_with_ws
|
||||||
|
|
|
@ -14,7 +14,7 @@ p After reading this page, you should be able to:
|
||||||
+h(3, "no-job-too-big") No job too big
|
+h(3, "no-job-too-big") No job too big
|
||||||
|
|
||||||
p
|
p
|
||||||
| When writing spaCy, one of my motos was #[em no job too big]. I wanted
|
| When writing spaCy, one of my mottos was #[em no job too big]. I wanted
|
||||||
| to make sure that if Google or Facebook were founded tomorrow, spaCy
|
| to make sure that if Google or Facebook were founded tomorrow, spaCy
|
||||||
| would be the obvious choice for them. I wanted spaCy to be the obvious
|
| would be the obvious choice for them. I wanted spaCy to be the obvious
|
||||||
| choice for web-scale NLP. This meant sweating about performance, because
|
| choice for web-scale NLP. This meant sweating about performance, because
|
||||||
|
|
|
@ -217,7 +217,7 @@ p
|
||||||
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
|
('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
|
||||||
]
|
]
|
||||||
|
|
||||||
nlp = spacy.load(entity=False, parser=False)
|
nlp = spacy.load('en', entity=False, parser=False)
|
||||||
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
|
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])
|
||||||
|
|
||||||
for itn in range(5):
|
for itn in range(5):
|
||||||
|
|
Loading…
Reference in New Issue