From 06bf1308902e46dee34751cecd6e0b1f6840ceca Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 23 Mar 2019 15:45:02 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Add=20better=20and=20serializabl?= =?UTF-8?q?e=20sentencizer=20(#3471)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add better serializable sentencizer component * Replace default factory * Add tests * Tidy up * Pass test * Update docs --- netlify.toml | 5 +- spacy/language.py | 4 +- spacy/pipeline/__init__.py | 3 +- spacy/pipeline/entityruler.py | 2 +- spacy/pipeline/hooks.py | 12 +- spacy/pipeline/pipes.pyx | 88 ++++++++++++- spacy/tests/pipeline/test_sentencizer.py | 87 +++++++++++++ spacy/tests/regression/test_issue3468.py | 5 +- spacy/tokens/doc.pyx | 2 +- spacy/tokens/token.pyx | 1 + website/docs/api/sentencesegmenter.md | 78 ------------ website/docs/api/sentencizer.md | 136 +++++++++++++++++++++ website/docs/usage/101/_architecture.md | 30 ++--- website/docs/usage/linguistic-features.md | 109 +++++++---------- website/docs/usage/processing-pipelines.md | 2 +- website/docs/usage/v2-1.md | 2 +- website/meta/sidebars.json | 2 +- 17 files changed, 386 insertions(+), 182 deletions(-) create mode 100644 spacy/tests/pipeline/test_sentencizer.py delete mode 100644 website/docs/api/sentencesegmenter.md create mode 100644 website/docs/api/sentencizer.md diff --git a/netlify.toml b/netlify.toml index 9941cf285..c116eb49b 100644 --- a/netlify.toml +++ b/netlify.toml @@ -43,8 +43,9 @@ redirects = [ {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"}, {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"}, {from = "/models/comparison", to = "/models"}, - {from = "/api/#section-cython", to = "/api/cython"}, - {from = "/api/#cython", to = "/api/cython"}, + {from = "/api/#section-cython", to = "/api/cython", force = true}, + {from = "/api/#cython", to = "/api/cython", force = true}, + {from = "/api/sentencesegmenter", to="/api/sentencizer"}, {from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true}, {from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true}, ] diff --git a/spacy/language.py b/spacy/language.py index d47ec3f83..c1642f631 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -15,7 +15,7 @@ from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer -from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter +from .pipeline import SimilarityHook, TextCategorizer, Sentencizer from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens from .pipeline import EntityRuler from .compat import izip, basestring_ @@ -119,7 +119,7 @@ class Language(object): "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), - "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), + "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg), "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks, "merge_entities": lambda nlp, **cfg: merge_entities, "merge_subtokens": lambda nlp, **cfg: merge_subtokens, diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index 64286832f..eaadd977d 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ 
-2,7 +2,7 @@ from __future__ import unicode_literals from .pipes import Tagger, DependencyParser, EntityRecognizer -from .pipes import TextCategorizer, Tensorizer, Pipe +from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer from .entityruler import EntityRuler from .hooks import SentenceSegmenter, SimilarityHook from .functions import merge_entities, merge_noun_chunks, merge_subtokens @@ -15,6 +15,7 @@ __all__ = [ "Tensorizer", "Pipe", "EntityRuler", + "Sentencizer", "SentenceSegmenter", "SimilarityHook", "merge_entities", diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 09a0c0491..cd399a4fe 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -191,7 +191,7 @@ class EntityRuler(object): **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. - DOCS: https://spacy.io/api/entityruler + DOCS: https://spacy.io/api/entityruler#to_disk """ path = ensure_path(path) path = path.with_suffix(".jsonl") diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index e998ee0cb..38672cde0 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -15,8 +15,6 @@ class SentenceSegmenter(object): initialization, or assign a new strategy to the .strategy attribute. Sentence detection strategies should be generators that take `Doc` objects and yield `Span` objects for each sentence. - - DOCS: https://spacy.io/api/sentencesegmenter """ name = "sentencizer" @@ -35,12 +33,12 @@ class SentenceSegmenter(object): def split_on_punct(doc): start = 0 seen_period = False - for i, word in enumerate(doc): - if seen_period and not word.is_punct: - yield doc[start : word.i] - start = word.i + for i, token in enumerate(doc): + if seen_period and not token.is_punct: + yield doc[start : token.i] + start = token.i seen_period = False - elif word.text in [".", "!", "?"]: + elif token.text in [".", "!", "?"]: seen_period = True if start < len(doc): yield doc[start : len(doc)] diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2544570ad..7ad67cb5a 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1058,4 +1058,90 @@ cdef class EntityRecognizer(Parser): if move[0] in ("B", "I", "L", "U"))) -__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"] +class Sentencizer(object): + """Segment the Doc into sentences using a rule-based strategy. + + DOCS: https://spacy.io/api/sentencizer + """ + + name = "sentencizer" + default_punct_chars = [".", "!", "?"] + + def __init__(self, punct_chars=None, **kwargs): + """Initialize the sentencizer. + + punct_chars (list): Punctuation characters to split on. Will be + serialized with the nlp object. + RETURNS (Sentencizer): The sentencizer component. + + DOCS: https://spacy.io/api/sentencizer#init + """ + self.punct_chars = punct_chars or self.default_punct_chars + + def __call__(self, doc): + """Apply the sentencizer to a Doc and set Token.is_sent_start. + + doc (Doc): The document to process. + RETURNS (Doc): The processed Doc. 
+ + DOCS: https://spacy.io/api/sentencizer#call + """ + start = 0 + seen_period = False + for i, token in enumerate(doc): + is_in_punct_chars = token.text in self.punct_chars + token.is_sent_start = i == 0 + if seen_period and not token.is_punct and not is_in_punct_chars: + doc[start].is_sent_start = True + start = token.i + seen_period = False + elif is_in_punct_chars: + seen_period = True + if start < len(doc): + doc[start].is_sent_start = True + return doc + + def to_bytes(self, **kwargs): + """Serialize the sentencizer to a bytestring. + + RETURNS (bytes): The serialized object. + + DOCS: https://spacy.io/api/sentencizer#to_bytes + """ + return srsly.msgpack_dumps({"punct_chars": self.punct_chars}) + + def from_bytes(self, bytes_data, **kwargs): + """Load the sentencizer from a bytestring. + + bytes_data (bytes): The data to load. + returns (Sentencizer): The loaded object. + + DOCS: https://spacy.io/api/sentencizer#from_bytes + """ + cfg = srsly.msgpack_loads(bytes_data) + self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + """Serialize the sentencizer to disk. + + DOCS: https://spacy.io/api/sentencizer#to_disk + """ + path = util.ensure_path(path) + path = path.with_suffix(".json") + srsly.write_json(path, {"punct_chars": self.punct_chars}) + + + def from_disk(self, path, exclude=tuple(), **kwargs): + """Load the sentencizer from disk. + + DOCS: https://spacy.io/api/sentencizer#from_disk + """ + path = util.ensure_path(path) + path = path.with_suffix(".json") + cfg = srsly.read_json(path) + self.punct_chars = cfg.get("punct_chars", self.default_punct_chars) + return self + + +__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "Sentencizer"] diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py new file mode 100644 index 000000000..c1b3eba45 --- /dev/null +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -0,0 +1,87 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.pipeline import Sentencizer +from spacy.tokens import Doc + + +def test_sentencizer(en_vocab): + doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."]) + sentencizer = Sentencizer() + doc = sentencizer(doc) + assert doc.is_sentenced + sent_starts = [t.is_sent_start for t in doc] + assert sent_starts == [True, False, True, False, False, False, False] + assert len(list(doc.sents)) == 2 + + +@pytest.mark.parametrize( + "words,sent_starts,n_sents", + [ + # The expected result here is that the duplicate punctuation gets merged + # onto the same sentence and no one-token sentence is created for them. 
+ ( + ["Hello", "!", ".", "Test", ".", ".", "ok"], + [True, False, False, True, False, False, True], + 3, + ), + # We also want to make sure ¡ and ¿ aren't treated as sentence end + # markers, even though they're punctuation + ( + ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"], + [True, False, False, False, True, False, False, False, False, False], + 2, + ), + # The Token.is_punct check ensures that quotes are handled as well + ( + ['"', "Nice", "!", '"', "I", "am", "happy", "."], + [True, False, False, False, True, False, False, False], + 2, + ), + ], +) +def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents): + doc = Doc(en_vocab, words=words) + sentencizer = Sentencizer() + doc = sentencizer(doc) + assert doc.is_sentenced + assert [t.is_sent_start for t in doc] == sent_starts + assert len(list(doc.sents)) == n_sents + + +@pytest.mark.parametrize( + "punct_chars,words,sent_starts,n_sents", + [ + ( + ["~", "?"], + ["Hello", "world", "~", "A", ".", "B", "."], + [True, False, False, True, False, False, False], + 2, + ), + # Even thought it's not common, the punct_chars should be able to + # handle any tokens + ( + [".", "ö"], + ["Hello", ".", "Test", "ö", "Ok", "."], + [True, False, True, False, True, False], + 3, + ), + ], +) +def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents): + doc = Doc(en_vocab, words=words) + sentencizer = Sentencizer(punct_chars=punct_chars) + doc = sentencizer(doc) + assert doc.is_sentenced + assert [t.is_sent_start for t in doc] == sent_starts + assert len(list(doc.sents)) == n_sents + + +def test_sentencizer_serialize_bytes(en_vocab): + punct_chars = [".", "~", "+"] + sentencizer = Sentencizer(punct_chars=punct_chars) + assert sentencizer.punct_chars == punct_chars + bytes_data = sentencizer.to_bytes() + new_sentencizer = Sentencizer().from_bytes(bytes_data) + assert new_sentencizer.punct_chars == punct_chars diff --git a/spacy/tests/regression/test_issue3468.py b/spacy/tests/regression/test_issue3468.py index fc9ab0783..02cd01e17 100644 --- a/spacy/tests/regression/test_issue3468.py +++ b/spacy/tests/regression/test_issue3468.py @@ -6,10 +6,9 @@ from spacy.lang.en import English from spacy.tokens import Doc -@pytest.mark.xfail def test_issue3468(): - """Test that sentence boundaries are serialized if they're not set by the - dependency parser.""" + """Test that sentence boundaries are set correctly so Doc.is_sentenced can + be restored after serialization.""" nlp = English() nlp.add_pipe(nlp.create_pipe("sentencizer")) doc = nlp("Hello world") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d4d7e5fa4..e433002f2 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -230,7 +230,7 @@ cdef class Doc: defined as having at least one of the following: a) An entry "sents" in doc.user_hooks"; - b) sent.is_parsed is set to True; + b) Doc.is_parsed is set to True; c) At least one token other than the first where sent_start is not None. """ if "sents" in self.user_hooks: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 66728d35c..bdf6a8dd5 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -441,6 +441,7 @@ cdef class Token: property sent_start: def __get__(self): + """Deprecated: use Token.is_sent_start instead.""" # Raising a deprecation warning here causes errors for autocomplete # Handle broken backwards compatibility case: doc[0].sent_start # was False. 
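For reference, a minimal usage sketch of the `Sentencizer` component introduced above, based on the `pipes.pyx` implementation, the `"sentencizer"` factory in `language.py`, and the new tests; the sample text and the extra `。` punctuation character are illustrative assumptions, not part of the patch:

```python
from spacy.lang.en import English
from spacy.pipeline import Sentencizer

# Blank pipeline: no statistical model is needed for rule-based sentence boundaries
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
doc = nlp(u"Hello world! This is a test.")
print([sent.text for sent in doc.sents])  # ['Hello world!', 'This is a test.']

# The punct_chars setting survives a to_bytes/from_bytes round-trip
sentencizer = Sentencizer(punct_chars=[".", "!", "?", "。"])
restored = Sentencizer().from_bytes(sentencizer.to_bytes())
assert restored.punct_chars == [".", "!", "?", "。"]
```

Because the component writes `Token.is_sent_start` directly instead of registering a `doc.user_hooks["sents"]` hook, the boundaries survive `Doc` serialization, which is what the updated `test_issue3468` regression test exercises.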
diff --git a/website/docs/api/sentencesegmenter.md b/website/docs/api/sentencesegmenter.md deleted file mode 100644 index d4055536d..000000000 --- a/website/docs/api/sentencesegmenter.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -title: SentenceSegmenter -tag: class -source: spacy/pipeline/hooks.py ---- - -A simple spaCy hook, to allow custom sentence boundary detection logic that -doesn't require the dependency parse. By default, sentence segmentation is -performed by the [`DependencyParser`](/api/dependencyparser), so the -`SentenceSegmenter` lets you implement a simpler, rule-based strategy that -doesn't require a statistical model to be loaded. The component is also -available via the string name `"sentencizer"`. After initialization, it is -typically added to the processing pipeline using -[`nlp.add_pipe`](/api/language#add_pipe). - -## SentenceSegmenter.\_\_init\_\_ {#init tag="method"} - -Initialize the sentence segmenter. To change the sentence boundary detection -strategy, pass a generator function `strategy` on initialization, or assign a -new strategy to the `.strategy` attribute. Sentence detection strategies should -be generators that take `Doc` objects and yield `Span` objects for each -sentence. - -> #### Example -> -> ```python -> # Construction via create_pipe -> sentencizer = nlp.create_pipe("sentencizer") -> -> # Construction from class -> from spacy.pipeline import SentenceSegmenter -> sentencizer = SentenceSegmenter(nlp.vocab) -> ``` - -| Name | Type | Description | -| ----------- | ------------------- | ----------------------------------------------------------- | -| `vocab` | `Vocab` | The shared vocabulary. | -| `strategy` | unicode / callable | The segmentation strategy to use. Defaults to `"on_punct"`. | -| **RETURNS** | `SentenceSegmenter` | The newly constructed object. | - -## SentenceSegmenter.\_\_call\_\_ {#call tag="method"} - -Apply the sentence segmenter on a `Doc`. Typically, this happens automatically -after the component has been added to the pipeline using -[`nlp.add_pipe`](/api/language#add_pipe). - -> #### Example -> -> ```python -> from spacy.lang.en import English -> -> nlp = English() -> sentencizer = nlp.create_pipe("sentencizer") -> nlp.add_pipe(sentencizer) -> doc = nlp(u"This is a sentence. This is another sentence.") -> assert list(doc.sents) == 2 -> ``` - -| Name | Type | Description | -| ----------- | ----- | ------------------------------------------------------------ | -| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | -| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. | - -## SentenceSegmenter.split_on_punct {#split_on_punct tag="staticmethod"} - -Split the `Doc` on punctuation characters `.`, `!` and `?`. This is the default -strategy used by the `SentenceSegmenter.` - -| Name | Type | Description | -| ---------- | ------ | ------------------------------ | -| `doc` | `Doc` | The `Doc` object to process. | -| **YIELDS** | `Span` | The sentences in the document. | - -## Attributes {#attributes} - -| Name | Type | Description | -| ---------- | -------- | ------------------------------------------------------------------- | -| `strategy` | callable | The segmentation strategy. Can be overwritten after initialization. 
|
diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md
new file mode 100644
index 000000000..26d205c24
--- /dev/null
+++ b/website/docs/api/sentencizer.md
@@ -0,0 +1,136 @@
+---
+title: Sentencizer
+tag: class
+source: spacy/pipeline/pipes.pyx
+---
+
+A simple pipeline component, to allow custom sentence boundary detection logic
+that doesn't require the dependency parse. By default, sentence segmentation is
+performed by the [`DependencyParser`](/api/dependencyparser), so the
+`Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
+require a statistical model to be loaded. The component is also available via
+the string name `"sentencizer"`. After initialization, it is typically added to
+the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
+
+Compared to the previous `SentenceSegmenter` class, the `Sentencizer` component
+doesn't add a hook to `doc.user_hooks["sents"]`. Instead, it iterates over the
+tokens in the `Doc` and sets the `Token.is_sent_start` property. The
+`SentenceSegmenter` is still available if you import it directly:
+
+```python
+from spacy.pipeline import SentenceSegmenter
+```
+
+## Sentencizer.\_\_init\_\_ {#init tag="method"}
+
+Initialize the sentencizer.
+
+> #### Example
+>
+> ```python
+> # Construction via create_pipe
+> sentencizer = nlp.create_pipe("sentencizer")
+>
+> # Construction from class
+> from spacy.pipeline import Sentencizer
+> sentencizer = Sentencizer()
+> ```
+
+| Name          | Type          | Description                                                                                            |
+| ------------- | ------------- | ------------------------------------------------------------------------------------------------------ |
+| `punct_chars` | list          | Optional custom list of punctuation characters that mark sentence ends. Defaults to `[".", "!", "?"]`.  |
+| **RETURNS**   | `Sentencizer` | The newly constructed object.                                                                            |
+
+## Sentencizer.\_\_call\_\_ {#call tag="method"}
+
+Apply the sentencizer on a `Doc`. Typically, this happens automatically after
+the component has been added to the pipeline using
+[`nlp.add_pipe`](/api/language#add_pipe).
+
+> #### Example
+>
+> ```python
+> from spacy.lang.en import English
+>
+> nlp = English()
+> sentencizer = nlp.create_pipe("sentencizer")
+> nlp.add_pipe(sentencizer)
+> doc = nlp(u"This is a sentence. This is another sentence.")
+> assert len(list(doc.sents)) == 2
+> ```
+
+| Name        | Type  | Description                                                   |
+| ----------- | ----- | ------------------------------------------------------------- |
+| `doc`       | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline.  |
+| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries.            |
+
+## Sentencizer.to_disk {#to_disk tag="method"}
+
+Save the sentencizer settings (punctuation characters) to a directory. Will
+create a file `sentencizer.json`. This also happens automatically when you save
+an `nlp` object with a sentencizer added to its pipeline.
+
+> #### Example
+>
+> ```python
+> sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"])
+> sentencizer.to_disk("/path/to/sentencizer.json")
+> ```
+
+| Name   | Type             | Description                                                                                                      |
+| ------ | ---------------- | ------------------------------------------------------------------------------------------------------------------ |
+| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.  |
+
+## Sentencizer.from_disk {#from_disk tag="method"}
+
+Load the sentencizer settings from a file. Expects a JSON file.
This also +happens automatically when you load an `nlp` object or model with a sentencizer +added to its pipeline. + +> #### Example +> +> ```python +> sentencizer = Sentencizer() +> sentencizer.from_disk("/path/to/sentencizer.json") +> ``` + +| Name | Type | Description | +| ----------- | ---------------- | -------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | + +## Sentencizer.to_bytes {#to_bytes tag="method"} + +Serialize the sentencizer settings to a bytestring. + +> #### Example +> +> ```python +> sentencizer = Sentencizer(punct_chars=[".", "?", "!", "。"]) +> sentencizer_bytes = sentencizer.to_bytes() +> ``` + +| Name | Type | Description | +| ----------- | ----- | -------------------- | +| **RETURNS** | bytes | The serialized data. | + +## Sentencizer.from_bytes {#from_bytes tag="method"} + +Load the pipe from a bytestring. Modifies the object in place and returns it. + +> #### Example +> +> ```python +> sentencizer_bytes = sentencizer.to_bytes() +> sentencizer = Sentencizer() +> sentencizer.from_bytes(sentencizer_bytes) +> ``` + +| Name | Type | Description | +| ------------ | ------------- | ---------------------------------- | +| `bytes_data` | bytes | The bytestring to load. | +| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index d797250f9..7cd749521 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -25,21 +25,21 @@ an **annotated document**. It also orchestrates training and serialization. ### Processing pipeline {#architecture-pipeline} -| Name | Description | -| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | -| [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. | -| [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. | -| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. | -| `Morphology` | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | -| [`Tagger`](/api/tagger) | Annotate part-of-speech tags on `Doc` objects. | -| [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects. | -| [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects. | -| [`TextCategorizer`](/api/textcategorizer) | Assign categories or labels to `Doc` objects. | -| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. | -| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. | -| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | -| [`SentenceSegmenter`](/api/sentencesegmenter) | Implement custom sentence boundary detection logic that doesn't require the dependency parse. | -| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. 
| +| Name | Description | +| ------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- | +| [`Language`](/api/language) | A text-processing pipeline. Usually you'll load this once per process as `nlp` and pass the instance around your application. | +| [`Tokenizer`](/api/tokenizer) | Segment text, and create `Doc` objects with the discovered segment boundaries. | +| [`Lemmatizer`](/api/lemmatizer) | Determine the base forms of words. | +| `Morphology` | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | +| [`Tagger`](/api/tagger) | Annotate part-of-speech tags on `Doc` objects. | +| [`DependencyParser`](/api/dependencyparser) | Annotate syntactic dependencies on `Doc` objects. | +| [`EntityRecognizer`](/api/entityrecognizer) | Annotate named entities, e.g. persons or products, on `Doc` objects. | +| [`TextCategorizer`](/api/textcategorizer) | Assign categories or labels to `Doc` objects. | +| [`Matcher`](/api/matcher) | Match sequences of tokens, based on pattern rules, similar to regular expressions. | +| [`PhraseMatcher`](/api/phrasematcher) | Match sequences of tokens based on phrases. | +| [`EntityRuler`](/api/entityruler) | Add entity spans to the `Doc` using token-based rules or exact phrase matches. | +| [`Sentencizer`](/api/sentencizer) | Implement custom sentence boundary detection logic that doesn't require the dependency parse. | +| [Other functions](/api/pipeline-functions) | Automatically apply something to the `Doc`, e.g. to merge spans of tokens. | ### Other classes {#architecture-other} diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index db2d06e0f..538a9f205 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1149,9 +1149,14 @@ but it also means you'll need a **statistical model** and accurate predictions. If your texts are closer to general-purpose news or web text, this should work well out-of-the-box. For social media or conversational text that doesn't follow the same rules, your application may benefit from a custom rule-based -implementation. You can either plug a rule-based component into your -[processing pipeline](/usage/processing-pipelines) or use the -`SentenceSegmenter` component with a custom strategy. +implementation. You can either use the built-in +[`Sentencizer`](/api/sentencizer) or plug an entirely custom rule-based function +into your [processing pipeline](/usage/processing-pipelines). + +spaCy's dependency parser respects already set boundaries, so you can preprocess +your `Doc` using custom rules _before_ it's parsed. Depending on your text, this +may also improve accuracy, since the parser is constrained to predict parses +consistent with the sentence boundaries. ### Default: Using the dependency parse {#sbd-parser model="parser"} @@ -1168,13 +1173,35 @@ for sent in doc.sents: print(sent.text) ``` -### Setting boundaries manually {#sbd-manual} +### Rule-based pipeline component {#sbd-component} -spaCy's dependency parser respects already set boundaries, so you can preprocess -your `Doc` using custom rules _before_ it's parsed. This can be done by adding a -[custom pipeline component](/usage/processing-pipelines). Depending on your -text, this may also improve accuracy, since the parser is constrained to predict -parses consistent with the sentence boundaries. 
+The [`Sentencizer`](/api/sentencizer) component is a
+[pipeline component](/usage/processing-pipelines) that splits sentences on
+punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only
+need sentence boundaries without the dependency parse.
+
+```python
+### {executable="true"}
+import spacy
+from spacy.lang.en import English
+
+nlp = English()  # just the language with no model
+sentencizer = nlp.create_pipe("sentencizer")
+nlp.add_pipe(sentencizer)
+doc = nlp(u"This is a sentence. This is another sentence.")
+for sent in doc.sents:
+    print(sent.text)
+```
+
+### Custom rule-based strategy {#sbd-custom}
+
+If you want to implement your own strategy that differs from the default
+rule-based approach of splitting on punctuation, you can also create a
+[custom pipeline component](/usage/processing-pipelines#custom-components) that
+takes a `Doc` object and sets the `Token.is_sent_start` attribute on each
+individual token. If set to `False`, the token is explicitly marked as _not_ the
+start of a sentence. If set to `None` (default), it's treated as a missing value
+and can still be overwritten by the parser.
@@ -1187,9 +1214,11 @@ adding it to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe).
 
 Here's an example of a component that implements a pre-processing rule for
 splitting on `'...'` tokens. The component is added before the parser, which is
-then used to further segment the text. This approach can be useful if you want
-to implement **additional** rules specific to your data, while still being able
-to take advantage of dependency-based sentence segmentation.
+then used to further segment the text. That's possible because `is_sent_start`
+is only set to `True` for some of the tokens – all others still specify `None`
+for unset sentence boundaries. This approach can be useful if you want to
+implement **additional** rules specific to your data, while still being able to
+take advantage of dependency-based sentence segmentation.
 
 ```python
 ### {executable="true"}
@@ -1212,62 +1241,6 @@ doc = nlp(text)
 print("After:", [sent.text for sent in doc.sents])
 ```
 
-### Rule-based pipeline component {#sbd-component}
-
-The `sentencizer` component is a
-[pipeline component](/usage/processing-pipelines) that splits sentences on
-punctuation like `.`, `!` or `?`. You can plug it into your pipeline if you only
-need sentence boundaries without the dependency parse. Note that `Doc.sents`
-will **raise an error** if no sentence boundaries are set.
-
-```python
-### {executable="true"}
-import spacy
-from spacy.lang.en import English
-
-nlp = English()  # just the language with no model
-sentencizer = nlp.create_pipe("sentencizer")
-nlp.add_pipe(sentencizer)
-doc = nlp(u"This is a sentence. This is another sentence.")
-for sent in doc.sents:
-    print(sent.text)
-```
-
-### Custom rule-based strategy {#sbd-custom}
-
-If you want to implement your own strategy that differs from the default
-rule-based approach of splitting on sentences, you can also instantiate the
-`SentenceSegmenter` directly and pass in your own strategy. The strategy should
-be a function that takes a `Doc` object and yields a `Span` for each sentence.
-Here's an example of a custom segmentation strategy for splitting on newlines -only: - -```python -### {executable="true"} -from spacy.lang.en import English -from spacy.pipeline import SentenceSegmenter - -def split_on_newlines(doc): - start = 0 - seen_newline = False - for word in doc: - if seen_newline and not word.is_space: - yield doc[start:word.i] - start = word.i - seen_newline = False - elif word.text == '\\n': - seen_newline = True - if start < len(doc): - yield doc[start:len(doc)] - -nlp = English() # Just the language with no model -sentencizer = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines) -nlp.add_pipe(sentencizer) -doc = nlp(u"This is a sentence\\n\\nThis is another sentence\\nAnd more") -for sent in doc.sents: - print([token.text for token in sent]) -``` - ## Rule-based matching {#rule-based-matching hidden="true"}
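The updated "Custom rule-based strategy" section above describes setting `Token.is_sent_start` from a custom component instead of passing a strategy generator to `SentenceSegmenter`. A hedged sketch of what the removed newline example might look like under that approach; the function name and sample text are illustrative only:

```python
from spacy.lang.en import English

def set_newline_boundaries(doc):
    # Mark the token that follows a newline token as a sentence start
    for token in doc[:-1]:
        if token.text == "\n":
            doc[token.i + 1].is_sent_start = True
    return doc

nlp = English()
nlp.add_pipe(set_newline_boundaries, first=True)  # add before the parser, if one is present
doc = nlp(u"This is a sentence\nThis is another sentence")
print([sent.text for sent in doc.sents])
```

Since only `is_sent_start` values of `None` can still be overwritten, a parser added later in the pipeline would respect these newline boundaries, matching the behaviour described in the docs.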
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 16bedce50..8eaf81652 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -138,7 +138,7 @@ require them in the pipeline settings in your model's `meta.json`. | `ner` | [`EntityRecognizer`](/api/entityrecognizer) | Assign named entities. | | `textcat` | [`TextCategorizer`](/api/textcategorizer) | Assign text categories. | | `entity_ruler` | [`EntityRuler`](/api/entityruler) | Assign named entities based on pattern rules. | -| `sentencizer` | [`SentenceSegmenter`](/api/sentencesegmenter) | Add rule-based sentence segmentation without the dependency parse. | +| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | | `merge_noun_chunks` | [`merge_noun_chunks`](/api/pipeline-functions#merge_noun_chunks) | Merge all noun chunks into a single token. Should be added after the tagger and parser. | | `merge_entities` | [`merge_entities`](/api/pipeline-functions#merge_entities) | Merge all entities into a single token. Should be added after the entity recognizer. | | `merge_subtokens` | [`merge_subtokens`](/api/pipeline-functions#merge_subtokens) | Merge subtokens predicted by the parser into single tokens. Should be added after the parser. | diff --git a/website/docs/usage/v2-1.md b/website/docs/usage/v2-1.md index 271440dba..0ba6fa407 100644 --- a/website/docs/usage/v2-1.md +++ b/website/docs/usage/v2-1.md @@ -195,7 +195,7 @@ the existing pages and added some new content: - **Universe:** [Videos](/universe/category/videos) and [Podcasts](/universe/category/podcasts) - **API:** [`EntityRuler`](/api/entityruler) -- **API:** [`SentenceSegmenter`](/api/sentencesegmenter) +- **API:** [`Sentencizer`](/api/sentencizer) - **API:** [Pipeline functions](/api/pipeline-functions) ## Backwards incompatibilities {#incompat} diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index fb4075ee5..bc8a70ea0 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -79,7 +79,7 @@ { "text": "Matcher", "url": "/api/matcher" }, { "text": "PhraseMatcher", "url": "/api/phrasematcher" }, { "text": "EntityRuler", "url": "/api/entityruler" }, - { "text": "SentenceSegmenter", "url": "/api/sentencesegmenter" }, + { "text": "Sentencizer", "url": "/api/sentencizer" }, { "text": "Other Functions", "url": "/api/pipeline-functions" } ] },
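To round out the serialization behaviour documented in `sentencizer.md`, a small sketch (Python 3) of the disk round-trip; the temporary directory and the `。` punctuation character are illustrative assumptions:

```python
import tempfile
from pathlib import Path

from spacy.pipeline import Sentencizer

sentencizer = Sentencizer(punct_chars=[".", "!", "?", "。"])

with tempfile.TemporaryDirectory() as tmp_dir:
    # Sentencizer.to_disk() normalizes the suffix to .json, so this writes sentencizer.json
    file_path = Path(tmp_dir) / "sentencizer"
    sentencizer.to_disk(file_path)
    restored = Sentencizer().from_disk(file_path)

assert restored.punct_chars == [".", "!", "?", "。"]
```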