From d122bbc9084adcb9aa0e6af57f5df828d0753ffb Mon Sep 17 00:00:00 2001
From: ines
Date: Thu, 25 May 2017 00:30:21 +0200
Subject: [PATCH] Rewrite custom tokenizer docs

---
 website/docs/usage/customizing-tokenizer.jade | 101 +++++++++++-------
 1 file changed, 60 insertions(+), 41 deletions(-)

diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade
index 5871e1655..86040a4eb 100644
--- a/website/docs/usage/customizing-tokenizer.jade
+++ b/website/docs/usage/customizing-tokenizer.jade
@@ -11,16 +11,10 @@ p
     | #[code spaces] booleans, which allow you to maintain alignment of the
     | tokens into the original string.
 
-+aside("spaCy's data model")
-    | The main point to keep in mind is that spaCy's #[code Doc] doesn't
-    | copy or refer to the original string. The string is reconstructed from
-    | the tokens when required.
-
 +h(2, "101") Tokenizer 101
 
 include _spacy-101/_tokenization
 
-
 +h(3, "101-data") Tokenizer data
 
 p
@@ -221,27 +215,68 @@ p
 +h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
 
 p
-    | You can pass a custom tokenizer using the #[code make_doc] keyword, when
-    | you're creating the pipeline:
+    | The tokenizer is the first component of the processing pipeline and the
+    | only one that can't be replaced by writing to #[code nlp.pipeline]. This
+    | is because it has a different signature from all the other components:
+    | it takes a text and returns a #[code Doc], whereas all other components
+    | expect to already receive a tokenized #[code Doc].
+
++image
+    include ../../assets/img/docs/pipeline.svg
+    .u-text-right
+        +button("/assets/img/docs/pipeline.svg", false, "secondary").u-text-tag View large graphic
 
-+code.
-    nlp = spacy.load('en', make_doc=my_tokenizer)
 p
-    | However, this approach often leaves us with a chicken-and-egg problem.
-    | To construct the tokenizer, we usually want attributes of the #[code nlp]
-    | pipeline. Specifically, we want the tokenizer to hold a reference to the
-    | pipeline's vocabulary object. Let's say we have the following class as
-    | our tokenizer:
-
+    | To overwrite the existing tokenizer, you need to replace
+    | #[code nlp.tokenizer] with a custom function that takes a text and
+    | returns a #[code Doc].
+
++code.
+    nlp = spacy.load('en')
+    nlp.tokenizer = my_tokenizer
+
++table(["Argument", "Type", "Description"])
+    +row
+        +cell #[code text]
+        +cell unicode
+        +cell The raw text to tokenize.
+
+    +footrow
+        +cell returns
+        +cell #[code Doc]
+        +cell The tokenized document.
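+
+p
+    | For example, here's a minimal sketch of a function with this signature.
+    | The comma-splitting rule is purely illustrative and not meant as a
+    | useful tokenizer:
+
++code.
+    import spacy
+    from spacy.tokens import Doc
+
+    nlp = spacy.load('en')
+
+    def my_tokenizer(text):
+        # illustrative only: treat commas as the only token boundaries
+        words = text.split(',')
+        return Doc(nlp.vocab, words=words)
+
+    nlp.tokenizer = my_tokenizer
+    doc = nlp('one,two,three')
+    assert [t.text for t in doc] == ['one', 'two', 'three']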
+
++infobox("Important note: using a custom tokenizer")
+    .o-block
+        | In spaCy v1.x, you had to add a custom tokenizer by passing it to the
+        | #[code make_doc] keyword argument, or by passing a tokenizer "factory"
+        | to #[code create_make_doc]. This was unnecessarily complicated. Since
+        | spaCy v2.0, you can simply write to #[code nlp.tokenizer]. If your
+        | tokenizer needs the vocab, you can write a function and use
+        | #[code nlp.vocab].
+
+    +code-new.
+        nlp.tokenizer = my_tokenizer
+        nlp.tokenizer = my_tokenizer_factory(nlp.vocab)
+    +code-old.
+        nlp = spacy.load('en', make_doc=my_tokenizer)
+        nlp = spacy.load('en', create_make_doc=my_tokenizer_factory)
+
++h(3, "custom-tokenizer-example") Example: A custom whitespace tokenizer
+
+p
+    | To construct the tokenizer, we usually want attributes of the #[code nlp]
+    | pipeline. Specifically, we want the tokenizer to hold a reference to the
+    | vocabulary object. Let's say we have the following class as
+    | our tokenizer:
 
 +code.
-    import spacy
     from spacy.tokens import Doc
 
     class WhitespaceTokenizer(object):
-        def __init__(self, nlp):
-            self.vocab = nlp.vocab
+        def __init__(self, vocab):
+            self.vocab = vocab
 
         def __call__(self, text):
             words = text.split(' ')
@@ -250,28 +285,12 @@ p
         return Doc(self.vocab, words=words, spaces=spaces)
 
 p
-    | As you can see, we need a #[code vocab] instance to construct this — but
-    | we won't get the #[code vocab] instance until we get back the #[code nlp]
-    | object from #[code spacy.load()]. The simplest solution is to build the
-    | object in two steps:
+    | As you can see, we need a #[code Vocab] instance to construct this — but
+    | we won't have it until we get back the loaded #[code nlp] object. The
+    | simplest solution is to build the tokenizer in two steps. This also means
+    | that you can reuse the "tokenizer factory" and initialise it with
+    | different instances of #[code Vocab].
 
 +code.
     nlp = spacy.load('en')
-    nlp.make_doc = WhitespaceTokenizer(nlp)
-
-p
-    | You can instead pass the class to the #[code create_make_doc] keyword,
-    | which is invoked as callback once the #[code nlp] object is ready:
-
-+code.
-    nlp = spacy.load('en', create_make_doc=WhitespaceTokenizer)
-
-p
-    | Finally, you can of course create your own subclasses, and create a bound
-    | #[code make_doc] method. The disadvantage of this approach is that spaCy
-    | uses inheritance to give each language-specific pipeline its own class.
-    | If you're working with multiple languages, a naive solution will
-    | therefore require one custom class per language you're working with.
-    | This might be at least annoying. You may be able to do something more
-    | generic by doing some clever magic with metaclasses or mixins, if that's
-    | the sort of thing you're into.
+    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
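+
+p
+    | To check that the whitespace tokenizer is being used, inspect the token
+    | texts it produces. The example sentence is purely illustrative:
+
++code.
+    doc = nlp("What's happened to me? he thought.")
+    assert [t.text for t in doc] == ["What's", 'happened', 'to', 'me?',
+                                     'he', 'thought.']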