mirror of https://github.com/explosion/spaCy.git
💫 Add better and serializable sentencizer (#3471)
* Add better serializable sentencizer component * Replace default factory * Add tests * Tidy up * Pass test * Update docs
This commit is contained in:
parent
d9a07a7f6e
commit
06bf130890
|
@ -43,8 +43,9 @@ redirects = [
|
|||
{from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"},
|
||||
{from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"},
|
||||
{from = "/models/comparison", to = "/models"},
|
||||
{from = "/api/#section-cython", to = "/api/cython"},
|
||||
{from = "/api/#cython", to = "/api/cython"},
|
||||
{from = "/api/#section-cython", to = "/api/cython", force = true},
|
||||
{from = "/api/#cython", to = "/api/cython", force = true},
|
||||
{from = "/api/sentencesegmenter", to="/api/sentencizer"},
|
||||
{from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
|
||||
{from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
|
||||
]
|
||||
|
|
|
@ -15,7 +15,7 @@ from .tokenizer import Tokenizer
|
|||
from .vocab import Vocab
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
||||
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
|
||||
from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
|
||||
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
||||
from .pipeline import EntityRuler
|
||||
from .compat import izip, basestring_
|
||||
|
@ -119,7 +119,7 @@ class Language(object):
|
|||
"ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
|
||||
"similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
|
||||
"textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
|
||||
"sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
||||
"sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
|
||||
"merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
|
||||
"merge_entities": lambda nlp, **cfg: merge_entities,
|
||||
"merge_subtokens": lambda nlp, **cfg: merge_subtokens,
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from .pipes import Tagger, DependencyParser, EntityRecognizer
|
||||
from .pipes import TextCategorizer, Tensorizer, Pipe
|
||||
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
|
||||
from .entityruler import EntityRuler
|
||||
from .hooks import SentenceSegmenter, SimilarityHook
|
||||
from .functions import merge_entities, merge_noun_chunks, merge_subtokens
|
||||
|
@ -15,6 +15,7 @@ __all__ = [
|
|||
"Tensorizer",
|
||||
"Pipe",
|
||||
"EntityRuler",
|
||||
"Sentencizer",
|
||||
"SentenceSegmenter",
|
||||
"SimilarityHook",
|
||||
"merge_entities",
|
||||
|
|
|
@ -191,7 +191,7 @@ class EntityRuler(object):
|
|||
**kwargs: Other config parameters, mostly for consistency.
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler
|
||||
DOCS: https://spacy.io/api/entityruler#to_disk
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
path = path.with_suffix(".jsonl")
|
||||
|
|
|
@ -15,8 +15,6 @@ class SentenceSegmenter(object):
|
|||
initialization, or assign a new strategy to the .strategy attribute.
|
||||
Sentence detection strategies should be generators that take `Doc` objects
|
||||
and yield `Span` objects for each sentence.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencesegmenter
|
||||
"""
|
||||
|
||||
name = "sentencizer"
|
||||
|
@ -35,12 +33,12 @@ class SentenceSegmenter(object):
|
|||
def split_on_punct(doc):
|
||||
start = 0
|
||||
seen_period = False
|
||||
for i, word in enumerate(doc):
|
||||
if seen_period and not word.is_punct:
|
||||
yield doc[start : word.i]
|
||||
start = word.i
|
||||
for i, token in enumerate(doc):
|
||||
if seen_period and not token.is_punct:
|
||||
yield doc[start : token.i]
|
||||
start = token.i
|
||||
seen_period = False
|
||||
elif word.text in [".", "!", "?"]:
|
||||
elif token.text in [".", "!", "?"]:
|
||||
seen_period = True
|
||||
if start < len(doc):
|
||||
yield doc[start : len(doc)]
|
||||
|
|
|
@ -1058,4 +1058,90 @@ cdef class EntityRecognizer(Parser):
|
|||
if move[0] in ("B", "I", "L", "U")))
|
||||
|
||||
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"]
|
||||
class Sentencizer(object):
    """Segment the Doc into sentences using a rule-based strategy.

    A token starts a new sentence if it follows one of the configured
    punctuation characters and is neither punctuation itself
    (`Token.is_punct`) nor one of the configured characters. Trailing
    punctuation such as closing quotes or repeated "!" therefore stays
    attached to the preceding sentence.

    DOCS: https://spacy.io/api/sentencizer
    """

    name = "sentencizer"
    # Shared class-level default. Instances always receive a *copy* so that
    # mutating `instance.punct_chars` can never alter the default for others.
    default_punct_chars = [".", "!", "?"]

    def __init__(self, punct_chars=None, **kwargs):
        """Initialize the sentencizer.

        punct_chars (list): Punctuation characters to split on. Will be
            serialized with the nlp object. If not set (or empty), a copy of
            `default_punct_chars` is used.
        **kwargs: Accepted for consistency with other components; unused.
        RETURNS (Sentencizer): The sentencizer component.

        DOCS: https://spacy.io/api/sentencizer#init
        """
        if punct_chars:
            self.punct_chars = punct_chars
        else:
            self.punct_chars = list(self.default_punct_chars)

    def __call__(self, doc):
        """Apply the sentencizer to a Doc and set Token.is_sent_start.

        Every token is assigned an explicit value: True for the first token
        and for each token that opens a new sentence, False otherwise.

        doc (Doc): The document to process.
        RETURNS (Doc): The processed Doc.

        DOCS: https://spacy.io/api/sentencizer#call
        """
        seen_period = False
        for i, token in enumerate(doc):
            is_in_punct_chars = token.text in self.punct_chars
            starts_sentence = i == 0
            if seen_period and not token.is_punct and not is_in_punct_chars:
                # First "real" token after sentence-final punctuation.
                starts_sentence = True
                seen_period = False
            elif is_in_punct_chars:
                seen_period = True
            token.is_sent_start = starts_sentence
        return doc

    def to_bytes(self, **kwargs):
        """Serialize the sentencizer to a bytestring.

        **kwargs: Accepted for consistency with other components; unused.
        RETURNS (bytes): The serialized object.

        DOCS: https://spacy.io/api/sentencizer#to_bytes
        """
        return srsly.msgpack_dumps({"punct_chars": self.punct_chars})

    def from_bytes(self, bytes_data, **kwargs):
        """Load the sentencizer from a bytestring.

        bytes_data (bytes): The data to load.
        **kwargs: Accepted for consistency with other components; unused.
        RETURNS (Sentencizer): The loaded object.

        DOCS: https://spacy.io/api/sentencizer#from_bytes
        """
        cfg = srsly.msgpack_loads(bytes_data)
        punct_chars = cfg.get("punct_chars")
        if punct_chars is None:
            # Copy so the instance never aliases the class-level default.
            punct_chars = list(self.default_punct_chars)
        self.punct_chars = punct_chars
        return self

    def to_disk(self, path, exclude=tuple(), **kwargs):
        """Serialize the sentencizer to disk.

        path (unicode / Path): Location to write to. The suffix is replaced
            with ".json" before writing.
        exclude (tuple): Accepted for consistency with other components;
            unused.

        DOCS: https://spacy.io/api/sentencizer#to_disk
        """
        path = util.ensure_path(path)
        path = path.with_suffix(".json")
        srsly.write_json(path, {"punct_chars": self.punct_chars})

    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Load the sentencizer from disk.

        path (unicode / Path): Location to read from. The suffix is replaced
            with ".json" before reading.
        exclude (tuple): Accepted for consistency with other components;
            unused.
        RETURNS (Sentencizer): The loaded object.

        DOCS: https://spacy.io/api/sentencizer#from_disk
        """
        path = util.ensure_path(path)
        path = path.with_suffix(".json")
        cfg = srsly.read_json(path)
        punct_chars = cfg.get("punct_chars")
        if punct_chars is None:
            # Copy so the instance never aliases the class-level default.
            punct_chars = list(self.default_punct_chars)
        self.punct_chars = punct_chars
        return self
|
||||
|
||||
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "Sentencizer"]
|
||||
|
|
|
@ -0,0 +1,87 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
from spacy.pipeline import Sentencizer
|
||||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
def test_sentencizer(en_vocab):
    """The default sentencizer starts a new sentence after "!" and "."."""
    words = ["Hello", "!", "This", "is", "a", "test", "."]
    expected_starts = [True, False, True, False, False, False, False]
    doc = Sentencizer()(Doc(en_vocab, words=words))
    assert doc.is_sentenced
    assert [token.is_sent_start for token in doc] == expected_starts
    assert len(list(doc.sents)) == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "words,sent_starts,n_sents",
    [
        # Repeated sentence-final punctuation stays attached to the sentence
        # it closes instead of forming one-token sentences of its own.
        (
            ["Hello", "!", ".", "Test", ".", ".", "ok"],
            [True, False, False, True, False, False, True],
            3,
        ),
        # Inverted marks (¡ and ¿) are punctuation, but must not be treated
        # as sentence end markers.
        (
            ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"],
            [True, False, False, False, True, False, False, False, False, False],
            2,
        ),
        # Thanks to the Token.is_punct check, a closing quote after "!" does
        # not open the next sentence.
        (
            ['"', "Nice", "!", '"', "I", "am", "happy", "."],
            [True, False, False, False, True, False, False, False],
            2,
        ),
    ],
)
def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents):
    """Check boundary flags and sentence counts for trickier punctuation."""
    processed = Sentencizer()(Doc(en_vocab, words=words))
    assert processed.is_sentenced
    observed_starts = [token.is_sent_start for token in processed]
    assert observed_starts == sent_starts
    assert len(list(processed.sents)) == n_sents
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "punct_chars,words,sent_starts,n_sents",
    [
        # With a custom list, "." no longer ends a sentence but "~" does.
        (
            ["~", "?"],
            ["Hello", "world", "~", "A", ".", "B", "."],
            [True, False, False, True, False, False, False],
            2,
        ),
        # Even though it's not common, punct_chars should be able to handle
        # any token text, not only real punctuation.
        (
            [".", "ö"],
            ["Hello", ".", "Test", "ö", "Ok", "."],
            [True, False, True, False, True, False],
            3,
        ),
    ],
)
def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents):
    """Check that user-supplied punct_chars replace the default characters."""
    component = Sentencizer(punct_chars=punct_chars)
    processed = component(Doc(en_vocab, words=words))
    assert processed.is_sentenced
    observed_starts = [token.is_sent_start for token in processed]
    assert observed_starts == sent_starts
    assert len(list(processed.sents)) == n_sents
|
||||
|
||||
|
||||
def test_sentencizer_serialize_bytes(en_vocab):
    """Custom punct_chars survive a to_bytes / from_bytes round trip."""
    punct_chars = [".", "~", "+"]
    original = Sentencizer(punct_chars=punct_chars)
    assert original.punct_chars == punct_chars
    serialized = original.to_bytes()
    restored = Sentencizer().from_bytes(serialized)
    assert restored.punct_chars == punct_chars
|
|
@ -6,10 +6,9 @@ from spacy.lang.en import English
|
|||
from spacy.tokens import Doc
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
def test_issue3468():
|
||||
"""Test that sentence boundaries are serialized if they're not set by the
|
||||
dependency parser."""
|
||||
"""Test that sentence boundaries are set correctly so Doc.is_sentenced can
|
||||
be restored after serialization."""
|
||||
nlp = English()
|
||||
nlp.add_pipe(nlp.create_pipe("sentencizer"))
|
||||
doc = nlp("Hello world")
|
||||
|
|
|
@ -230,7 +230,7 @@ cdef class Doc:
|
|||
defined as having at least one of the following:
|
||||
|
||||
a) An entry "sents" in doc.user_hooks;
|
||||
b) sent.is_parsed is set to True;
|
||||
b) Doc.is_parsed is set to True;
|
||||
c) At least one token other than the first where sent_start is not None.
|
||||
"""
|
||||
if "sents" in self.user_hooks:
|
||||
|
|
|
@ -441,6 +441,7 @@ cdef class Token:
|
|||
|
||||
property sent_start:
|
||||
def __get__(self):
|
||||
"""Deprecated: use Token.is_sent_start instead."""
|
||||
# Raising a deprecation warning here causes errors for autocomplete
|
||||
# Handle broken backwards compatibility case: doc[0].sent_start
|
||||
# was False.
|
||||
|
|
|
@ -1,78 +0,0 @@
|
|||
---
|
||||
title: SentenceSegmenter
|
||||
tag: class
|
||||
source: spacy/pipeline/hooks.py
|
||||
---
|
||||
|
||||
A simple spaCy hook, to allow custom sentence boundary detection logic that
|
||||
doesn't require the dependency parse. By default, sentence segmentation is
|
||||
performed by the [`DependencyParser`](/api/dependencyparser), so the
|
||||
`SentenceSegmenter` lets you implement a simpler, rule-based strategy that
|
||||
doesn't require a statistical model to be loaded. The component is also
|
||||
available via the string name `"sentencizer"`. After initialization, it is
|
||||
typically added to the processing pipeline using
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
## SentenceSegmenter.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Initialize the sentence segmenter. To change the sentence boundary detection
|
||||
strategy, pass a generator function `strategy` on initialization, or assign a
|
||||
new strategy to the `.strategy` attribute. Sentence detection strategies should
|
||||
be generators that take `Doc` objects and yield `Span` objects for each
|
||||
sentence.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> # Construction via create_pipe
|
||||
> sentencizer = nlp.create_pipe("sentencizer")
|
||||
>
|
||||
> # Construction from class
|
||||
> from spacy.pipeline import SentenceSegmenter
|
||||
> sentencizer = SentenceSegmenter(nlp.vocab)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | ----------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `strategy` | unicode / callable | The segmentation strategy to use. Defaults to `"on_punct"`. |
|
||||
| **RETURNS** | `SentenceSegmenter` | The newly constructed object. |
|
||||
|
||||
## SentenceSegmenter.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the sentence segmenter on a `Doc`. Typically, this happens automatically
|
||||
after the component has been added to the pipeline using
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.lang.en import English
|
||||
>
|
||||
> nlp = English()
|
||||
> sentencizer = nlp.create_pipe("sentencizer")
|
||||
> nlp.add_pipe(sentencizer)
|
||||
> doc = nlp(u"This is a sentence. This is another sentence.")
|
||||
> assert list(doc.sents) == 2
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. |
|
||||
|
||||
## SentenceSegmenter.split_on_punct {#split_on_punct tag="staticmethod"}
|
||||
|
||||
Split the `Doc` on punctuation characters `.`, `!` and `?`. This is the default
|
||||
strategy used by the `SentenceSegmenter.`
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------ | ------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process. |
|
||||
| **YIELDS** | `Span` | The sentences in the document. |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | -------- | ------------------------------------------------------------------- |
|
||||
| `strategy` | callable | The segmentation strategy. Can be overwritten after initialization. |
|
|
@ -0,0 +1,136 @@
|
|||
---
|
||||
title: Sentencizer
|
||||
tag: class
|
||||
source: spacy/pipeline/pipes.pyx
|
||||
---
|
||||
|
||||
A simple pipeline component, to allow custom sentence boundary detection logic
|
||||
that doesn't require the dependency parse. By default, sentence segmentation is
|
||||
performed by the [`DependencyParser`](/api/dependencyparser), so the
|
||||
`Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
|
||||
require a statistical model to be loaded. The component is also available via
|
||||
the string name `"sentencizer"`. After initialization, it is typically added to
|
||||
the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
<Infobox title="Important note" variant="warning">
|
||||
|
||||
Compared to the previous `SentenceSegmenter` class, the `Sentencizer` component
|
||||
doesn't add a hook to `doc.user_hooks["sents"]`. Instead, it iterates over the
|
||||
tokens in the `Doc` and sets the `Token.is_sent_start` property. The
|
||||
`SentenceSegmenter` is still available if you import it directly:
|
||||
|
||||
```python
|
||||
from spacy.pipeline import SentenceSegmenter
|
||||
```
|
||||
|
||||
</Infobox>
|
||||
|
||||
## Sentencizer.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Initialize the sentencizer.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> # Construction via create_pipe
|
||||
> sentencizer = nlp.create_pipe("sentencizer")
|
||||
>
|
||||
> # Construction from class
|
||||
> from spacy.pipeline import Sentencizer
|
||||
> sentencizer = Sentencizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `[".", "!", "?"]`. |
|
||||
| **RETURNS** | `Sentencizer` | The newly constructed object. |
|
||||
|
||||
## Sentencizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the sentencizer on a `Doc`. Typically, this happens automatically after
|
||||
the component has been added to the pipeline using
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.lang.en import English
|
||||
>
|
||||
> nlp = English()
|
||||
> sentencizer = nlp.create_pipe("sentencizer")
|
||||
> nlp.add_pipe(sentencizer)
|
||||
> doc = nlp(u"This is a sentence. This is another sentence.")
|
||||
> assert len(list(doc.sents)) == 2
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. |
|
||||
|
||||
## Sentencizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
Save the sentencizer settings (punctuation characters) to a directory. Will create
|
||||
a file `sentencizer.json`. This also happens automatically when you save an
|
||||
`nlp` object with a sentencizer added to its pipeline.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"])
|
||||
> sentencizer.to_disk("/path/to/sentencizer.json")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
|
||||
## Sentencizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
Load the sentencizer settings from a file. Expects a JSON file. This also
|
||||
happens automatically when you load an `nlp` object or model with a sentencizer
|
||||
added to its pipeline.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> sentencizer = Sentencizer()
|
||||
> sentencizer.from_disk("/path/to/sentencizer.json")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
|
||||
|
||||
## Sentencizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
Serialize the sentencizer settings to a bytestring.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"])
|
||||
> sentencizer_bytes = sentencizer.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------- |
|
||||
| **RETURNS** | bytes | The serialized data. |
|
||||
|
||||
## Sentencizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
Load the pipe from a bytestring. Modifies the object in place and returns it.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> sentencizer_bytes = sentencizer.to_bytes()
|
||||
> sentencizer = Sentencizer()
|
||||
> sentencizer.from_bytes(sentencizer_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | ---------------------------------- |
|
||||
| `bytes_data` | bytes | The bytestring to load. |
|
||||
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
|
|
@ -26,7 +26,7 @@ an **annotated document**. It also orchestrates training and serialization.
|
|||
### Processing pipeline {#architecture-pipeline}
|
||||
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||
| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||
| [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. |
|
||||
| [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. |
|
||||
| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. |
|
||||
|
@ -38,7 +38,7 @@ an **annotated document**. It also orchestrates training and serialization.
|
|||
| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. |
|
||||
| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. |
|
||||
| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. |
|
||||
| [`SentenceSegmenter`](/api/sentencesegmenter) | Implement custom sentence boundary detection logic that doesn't require the dependency parse. |
|
||||
| [`Sentencizer`](/api/sentencizer) | Implement custom sentence boundary detection logic that doesn't require the dependency parse. |
|
||||
| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. |
|
||||
|
||||
### Other classes {#architecture-other}
|
||||
|
|
|
@ -1149,9 +1149,14 @@ but it also means you'll need a **statistical model** and accurate predictions.
|
|||
If your texts are closer to general-purpose news or web text, this should work
|
||||
well out-of-the-box. For social media or conversational text that doesn't follow
|
||||
the same rules, your application may benefit from a custom rule-based
|
||||
implementation. You can either plug a rule-based component into your
|
||||
[processing pipeline](/usage/processing-pipelines) or use the
|
||||
`SentenceSegmenter` component with a custom strategy.
|
||||
implementation. You can either use the built-in
|
||||
[`Sentencizer`](/api/sentencizer) or plug an entirely custom rule-based function
|
||||
into your [processing pipeline](/usage/processing-pipelines).
|
||||
|
||||
spaCy's dependency parser respects already set boundaries, so you can preprocess
|
||||
your `Doc` using custom rules _before_ it's parsed. Depending on your text, this
|
||||
may also improve accuracy, since the parser is constrained to predict parses
|
||||
consistent with the sentence boundaries.
|
||||
|
||||
### Default: Using the dependency parse {#sbd-parser model="parser"}
|
||||
|
||||
|
@ -1168,13 +1173,35 @@ for sent in doc.sents:
|
|||
print(sent.text)
|
||||
```
|
||||
|
||||
### Setting boundaries manually {#sbd-manual}
|
||||
### Rule-based pipeline component {#sbd-component}
|
||||
|
||||
spaCy's dependency parser respects already set boundaries, so you can preprocess
|
||||
your `Doc` using custom rules _before_ it's parsed. This can be done by adding a
|
||||
[custom pipeline component](/usage/processing-pipelines). Depending on your
|
||||
text, this may also improve accuracy, since the parser is constrained to predict
|
||||
parses consistent with the sentence boundaries.
|
||||
The [`Sentencizer`](/api/sentencizer) component is a
|
||||
[pipeline component](/usage/processing-pipelines) that splits sentences on
|
||||
punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only
|
||||
need sentence boundaries without the dependency parse.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
|
||||
nlp = English() # just the language with no model
|
||||
sentencizer = nlp.create_pipe("sentencizer")
|
||||
nlp.add_pipe(sentencizer)
|
||||
doc = nlp(u"This is a sentence. This is another sentence.")
|
||||
for sent in doc.sents:
|
||||
print(sent.text)
|
||||
```
|
||||
|
||||
### Custom rule-based strategy {#sbd-custom}
|
||||
|
||||
If you want to implement your own strategy that differs from the default
|
||||
rule-based approach of splitting on sentences, you can also create a
|
||||
[custom pipeline component](/usage/processing-pipelines#custom-components) that
|
||||
takes a `Doc` object and sets the `Token.is_sent_start` attribute on each
|
||||
individual token. If set to `False`, the token is explicitly marked as _not_ the
|
||||
start of a sentence. If set to `None` (default), it's treated as a missing value
|
||||
and can still be overwritten by the parser.
|
||||
|
||||
<Infobox title="Important note" variant="warning">
|
||||
|
||||
|
@ -1187,9 +1214,11 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
|
|||
|
||||
Here's an example of a component that implements a pre-processing rule for
|
||||
splitting on `'...'` tokens. The component is added before the parser, which is
|
||||
then used to further segment the text. This approach can be useful if you want
|
||||
to implement **additional** rules specific to your data, while still being able
|
||||
to take advantage of dependency-based sentence segmentation.
|
||||
then used to further segment the text. That's possible, because `is_sent_start`
|
||||
is only set to `True` for some of the tokens – all others still specify `None`
|
||||
for unset sentence boundaries. This approach can be useful if you want to
|
||||
implement **additional** rules specific to your data, while still being able to
|
||||
take advantage of dependency-based sentence segmentation.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
|
@ -1212,62 +1241,6 @@ doc = nlp(text)
|
|||
print("After:", [sent.text for sent in doc.sents])
|
||||
```
|
||||
|
||||
### Rule-based pipeline component {#sbd-component}
|
||||
|
||||
The `sentencizer` component is a
|
||||
[pipeline component](/usage/processing-pipelines) that splits sentences on
|
||||
punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only
|
||||
need sentence boundaries without the dependency parse. Note that `Doc.sents`
|
||||
will **raise an error** if no sentence boundaries are set.
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
import spacy
|
||||
from spacy.lang.en import English
|
||||
|
||||
nlp = English() # just the language with no model
|
||||
sentencizer = nlp.create_pipe("sentencizer")
|
||||
nlp.add_pipe(sentencizer)
|
||||
doc = nlp(u"This is a sentence. This is another sentence.")
|
||||
for sent in doc.sents:
|
||||
print(sent.text)
|
||||
```
|
||||
|
||||
### Custom rule-based strategy {#sbd-custom}
|
||||
|
||||
If you want to implement your own strategy that differs from the default
|
||||
rule-based approach of splitting on sentences, you can also instantiate the
|
||||
`SentenceSegmenter` directly and pass in your own strategy. The strategy should
|
||||
be a function that takes a `Doc` object and yields a `Span` for each sentence.
|
||||
Here's an example of a custom segmentation strategy for splitting on newlines
|
||||
only:
|
||||
|
||||
```python
|
||||
### {executable="true"}
|
||||
from spacy.lang.en import English
|
||||
from spacy.pipeline import SentenceSegmenter
|
||||
|
||||
def split_on_newlines(doc):
|
||||
start = 0
|
||||
seen_newline = False
|
||||
for word in doc:
|
||||
if seen_newline and not word.is_space:
|
||||
yield doc[start:word.i]
|
||||
start = word.i
|
||||
seen_newline = False
|
||||
elif word.text == '\\n':
|
||||
seen_newline = True
|
||||
if start < len(doc):
|
||||
yield doc[start:len(doc)]
|
||||
|
||||
nlp = English() # Just the language with no model
|
||||
sentencizer = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)
|
||||
nlp.add_pipe(sentencizer)
|
||||
doc = nlp(u"This is a sentence\\n\\nThis is another sentence\\nAnd more")
|
||||
for sent in doc.sents:
|
||||
print([token.text for token in sent])
|
||||
```
|
||||
|
||||
## Rule-based matching {#rule-based-matching hidden="true"}
|
||||
|
||||
<div id="rule-based-matching">
|
||||
|
|
|
@ -138,7 +138,7 @@ require them in the pipeline settings in your model's `meta.json`.
|
|||
| `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. |
|
||||
| `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. |
|
||||
| `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules. |
|
||||
| `sentencizer` | [`SentenceSegmenter`](/api/sentencesegmenter) | Add rule-based sentence segmentation without the dependency parse. |
|
||||
| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
|
||||
| `merge_noun_chunks` | [`merge_noun_chunks`](/api/pipeline-functions#merge_noun_chunks) | Merge all noun chunks into a single token. Should be added after the tagger and parser. |
|
||||
| `merge_entities` | [`merge_entities`](/api/pipeline-functions#merge_entities) | Merge all entities into a single token. Should be added after the entity recognizer. |
|
||||
| `merge_subtokens` | [`merge_subtokens`](/api/pipeline-functions#merge_subtokens) | Merge subtokens predicted by the parser into single tokens. Should be added after the parser. |
|
||||
|
|
|
@ -195,7 +195,7 @@ the existing pages and added some new content:
|
|||
- **Universe:** [Videos](/universe/category/videos) and
|
||||
[Podcasts](/universe/category/podcasts)
|
||||
- **API:** [`EntityRuler`](/api/entityruler)
|
||||
- **API:** [`SentenceSegmenter`](/api/sentencesegmenter)
|
||||
- **API:** [`Sentencizer`](/api/sentencizer)
|
||||
- **API:** [Pipeline functions](/api/pipeline-functions)
|
||||
|
||||
## Backwards incompatibilities {#incompat}
|
||||
|
|
|
@ -79,7 +79,7 @@
|
|||
{ "text": "Matcher", "url": "/api/matcher" },
|
||||
{ "text": "PhraseMatcher", "url": "/api/phrasematcher" },
|
||||
{ "text": "EntityRuler", "url": "/api/entityruler" },
|
||||
{ "text": "SentenceSegmenter", "url": "/api/sentencesegmenter" },
|
||||
{ "text": "Sentencizer", "url": "/api/sentencizer" },
|
||||
{ "text": "Other Functions", "url": "/api/pipeline-functions" }
|
||||
]
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue