From c20abc8a6d11aa9d7cdee2b60d34c7bd4ef8e66f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 5 Nov 2016 20:40:11 +0100 Subject: [PATCH] Add customizing tokenizer and training workflow --- website/docs/usage/_data.json | 16 +- website/docs/usage/customizing-tokenizer.jade | 242 ++++++++++++++++++ website/docs/usage/training.jade | 118 +++++++++ 3 files changed, 374 insertions(+), 2 deletions(-) create mode 100644 website/docs/usage/customizing-tokenizer.jade create mode 100644 website/docs/usage/training.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index d73703a17..c67119796 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -12,7 +12,9 @@ "Custom pipelines": "customizing-pipeline", "Rule-based matching": "rule-based-matching", "Word vectors": "word-vectors-similarities", - "Deep learning": "deep-learning" + "Deep learning": "deep-learning", + "Custom tokenization": "customizing-tokenizer", + "Training": "training" }, "Examples": { "Tutorials": "tutorials", @@ -35,7 +37,8 @@ }, "customizing-pipeline": { - "title": "Customizing the pipeline" + "title": "Customizing the pipeline", + "next": "customizing-tokenizer" }, "processing-text": { @@ -63,6 +66,15 @@ "title": "Hooking a deep learning model into spaCy" }, + "customizing-tokenizer": { + "title": "Customizing the tokenizer", + "next": "training" + }, + + "training": { + "title": "Training the tagger, parser and entity recognizer" + }, + "showcase": { "title": "Showcase", diff --git a/website/docs/usage/customizing-tokenizer.jade b/website/docs/usage/customizing-tokenizer.jade new file mode 100644 index 000000000..306f76106 --- /dev/null +++ b/website/docs/usage/customizing-tokenizer.jade @@ -0,0 +1,242 @@ +//- DOCS > USAGE > TOKENIZER + +include ../../_includes/_mixins + +p + | Tokenization is the task of splitting a text into meaningful segments, + | called #[em tokens]. 
The input to the tokenizer is a unicode text, and + | the output is a #[+api("doc") #[code Doc]] object. To construct a + | #[code Doc] object, you need a #[+api("vocab") #[code Vocab]] instance, + | a sequence of #[code word] strings, and optionally a sequence of + | #[code spaces] booleans, which allow you to maintain alignment of the + | tokens into the original string. + +aside("See Also") + | If you haven't read up on spaCy's #[+a("data-model") data model] yet, + | you should probably have a look. The main point to keep in mind is that + | spaCy's #[code Doc] doesn't copy or refer to the original string. The + | string is reconstructed from the tokens when required. + + +h(2, "special-cases") Adding special case tokenization rules + +p + | Most domains have at least some idiosyncrasies that require custom + | tokenization rules. Here's how to add a special case rule to an existing + | #[+api("tokenizer") #[code Tokenizer]] instance: + +code. + nlp = spacy.load('en') + assert [w.text for w in nlp(u'gimme that')] == [u'gimme', u'that'] + nlp.tokenizer.add_special_case(u'gimme', + [ + { + ORTH: u'gim', + LEMMA: u'give', + POS: u'VERB'}, + { + ORTH: u'me'}]) + assert [w.text for w in nlp(u'gimme that')] == [u'gim', u'me', u'that'] + assert [w.lemma_ for w in nlp(u'gimme that')] == [u'give', u'-PRON-', u'that'] + +p + | The special case doesn't have to match an entire whitespace-delimited + | substring. The tokenizer will incrementally split off punctuation, and + | keep looking up the remaining substring: + +code. + assert 'gimme' not in [w.text for w in nlp(u'gimme!')] + assert 'gimme' not in [w.text for w in nlp(u'("...gimme...?")')] + +p + | The special case rules have precedence over the punctuation splitting: + +code. 
+ nlp.tokenizer.add_special_case(u"...gimme...?", + [{ + ORTH: u'...gimme...?', LEMMA: "give", TAG: "VB"}]) + assert len(nlp(u'...gimme...?')) == 1 + +p + | Because the special-case rules allow you to set arbitrary token + | attributes, such as the part-of-speech, lemma, etc., they make a good + | mechanism for arbitrary fix-up rules. Having this logic live in the + | tokenizer isn't very satisfying from a design perspective, however, so + | the API may eventually be exposed on the + | #[+api("language") #[code Language]] class itself. + + +h(2, "how-tokenizer-works") How spaCy's tokenizer works + +p + | spaCy introduces a novel tokenization algorithm, that gives a better + | balance between performance, ease of definition, and ease of alignment + | into the original string. + +p + | After consuming a prefix or suffix, we consult the special cases again. + | We want the special cases to handle things like "don't" in English, and + | we want the same rule to work for "(don't)!". We do this by splitting + | off the open bracket, then the exclamation, then the close bracket, and + | finally matching the special-case. Here's an implementation of the + | algorithm in Python, optimized for readability rather than performance: + +code. 
+ def tokenizer_pseudo_code(text, find_prefix, find_suffix, + find_infixes, special_cases): + tokens = [] + for substring in text.split(' '): + suffixes = [] + while substring: + if substring in special_cases: + tokens.extend(special_cases[substring]) + substring = '' + elif find_prefix(substring) is not None: + split = find_prefix(substring) + tokens.append(substring[:split]) + substring = substring[split:] + elif find_suffix(substring) is not None: + split = find_suffix(substring) + suffixes.append(substring[split:]) + substring = substring[:split] + elif find_infixes(substring): + infixes = find_infixes(substring) + offset = 0 + for match in infixes: + tokens.append(substring[offset : match.start()]) + tokens.append(substring[match.start() : match.end()]) + offset = match.end() + substring = substring[offset:] + else: + tokens.append(substring) + substring = '' + tokens.extend(reversed(suffixes)) + return tokens + +p + | The algorithm can be summarized as follows: + +list("numbers") + +item Iterate over space-separated substrings + +item + | Check whether we have an explicitly defined rule for this substring. + | If we do, use it. + +item Otherwise, try to consume a prefix. + +item + | If we consumed a prefix, go back to the beginning of the loop, so + | that special-cases always get priority. + +item If we didn't consume a prefix, try to consume a suffix. + +item + | If we can't consume a prefix or suffix, look for "infixes" — stuff + | like hyphens etc. + +item Once we can't consume any more of the string, handle it as a single token. + +h(2, "native-tokenizers") Customizing spaCy's Tokenizer class + +p + | Let's imagine you wanted to create a tokenizer for a new language. There + | are four things you would need to define: + +list("numbers") + +item + | A dictionary of #[strong special cases]. This handles things like + | contractions, units of measurement, emoticons, certain + | abbreviations, etc. 
+ + +item + | A function #[code prefix_search], to handle + | #[strong preceding punctuation], such as open quotes, open brackets, + | etc. + + +item + | A function #[code suffix_search], to handle + | #[strong succeeding punctuation], such as commas, periods, close + | quotes, etc. + + +item + | A function #[code infixes_finditer], to handle non-whitespace + | separators, such as hyphens etc. + +p + | You shouldn't usually need to create a #[code Tokenizer] subclass. + | Standard usage is to use #[code re.compile()] to build a regular + | expression object, and pass its #[code .search()] and + | #[code .finditer()] methods: + +code. + import re + from spacy.tokenizer import Tokenizer + + prefix_re = re.compile(r'''[\[\("']''') + suffix_re = re.compile(r'''[\]\)"']''') + def create_tokenizer(nlp): + return Tokenizer(nlp.vocab, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search) + + nlp = spacy.load('en', create_make_doc=create_tokenizer) + +p + | If you need to subclass the tokenizer instead, the relevant methods to + | specialize are #[code find_prefix], #[code find_suffix] and + | #[code find_infix]. + +h(2, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline + +p + | You can pass a custom tokenizer using the #[code make_doc] keyword, when + | you're creating the pipeline: + +code. + import spacy + + nlp = spacy.load('en', make_doc=my_tokenizer) + +p + | However, this approach often leaves us with a chicken-and-egg problem. + | To construct the tokenizer, we usually want attributes of the #[code nlp] + | pipeline. Specifically, we want the tokenizer to hold a reference to the + | pipeline's vocabulary object. Let's say we have the following class as + | our tokenizer: + + +code. 
+ import spacy + from spacy.tokens import Doc + + class WhitespaceTokenizer(object): + def __init__(self, nlp): + self.vocab = nlp.vocab + + def __call__(self, text): + words = text.split(' ') + # All tokens 'own' a subsequent space character in this tokenizer + spaces = [True] * len(words) + return Doc(self.vocab, words=words, spaces=spaces) + +p + | As you can see, we need a #[code vocab] instance to construct this — but + | we won't get the #[code vocab] instance until we get back the #[code nlp] + | object from #[code spacy.load()]. The simplest solution is to build the + | object in two steps: + +code. + nlp = spacy.load('en') + nlp.make_doc = WhitespaceTokenizer(nlp) + +p + | You can instead pass the class to the #[code create_make_doc] keyword, + | which is invoked as a callback once the #[code nlp] object is ready: + +code. + nlp = spacy.load('en', create_make_doc=WhitespaceTokenizer) + +p + | Finally, you can of course create your own subclasses, and create a bound + | #[code make_doc] method. The disadvantage of this approach is that spaCy + | uses inheritance to give each language-specific pipeline its own class. + | If you're working with multiple languages, a naive solution will + | therefore require one custom class per language you're working with. + | This might be at least annoying. You may be able to do something more + | generic by doing some clever magic with metaclasses or mixins, if that's + | the sort of thing you're into. diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade new file mode 100644 index 000000000..98afef36b --- /dev/null +++ b/website/docs/usage/training.jade @@ -0,0 +1,118 @@ +include ../../_includes/_mixins + +p + | This tutorial describes how to train new statistical models for spaCy's + | part-of-speech tagger, named entity recognizer and dependency parser. + +p + | I'll start with some quick code examples, that describe how to train + | each model. 
I'll then provide a bit of background about the algorithms, + | and explain how the data and feature templates work. + +h(2, "train-pos-tagger") Training the part-of-speech tagger + +code. + from spacy.vocab import Vocab + from spacy.pipeline import Tagger + from spacy.tokens import Doc + + vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}}) + tagger = Tagger(vocab) + + doc = Doc(vocab, words=['I', 'like', 'stuff']) + tagger.update(doc, ['N', 'V', 'N']) + + tagger.model.end_training() + +p + +button(gh("spaCy", "examples/training/train_tagger.py"), false, "secondary") Full example + +h(2, "train-entity") Training the named entity recognizer + +code. + from spacy.vocab import Vocab + from spacy.pipeline import EntityRecognizer + from spacy.tokens import Doc + + vocab = Vocab() + entity = EntityRecognizer(vocab, entity_types=['PERSON', 'LOC']) + + doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) + entity.update(doc, ['O', 'O', 'B-PERSON', 'L-PERSON', 'O']) + + entity.model.end_training() + +p + +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example + +h(2, "train-parser") Training the dependency parser + +code. + from spacy.vocab import Vocab + from spacy.pipeline import DependencyParser + from spacy.tokens import Doc + + vocab = Vocab() + parser = DependencyParser(vocab, labels=['nsubj', 'compound', 'dobj', 'punct']) + + doc = Doc(vocab, words=['Who', 'is', 'Shaka', 'Khan', '?']) + parser.update(doc, [(1, 'nsubj'), (1, 'ROOT'), (3, 'compound'), (1, 'dobj'), + (1, 'punct')]) + + parser.model.end_training() + +p + +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example + +h(2, 'feature-templates') Customizing the feature extraction + +p + | spaCy currently uses linear models for the tagger, parser and entity + | recognizer, with weights learned using the + | #[+a("https://explosion.ai/blog/part-of-speech-pos-tagger-in-python") Averaged Perceptron algorithm]. 
+ +p + | Because it's a linear model, it's important for accuracy to build + | conjunction features out of the atomic predictors. Let's say you have + | two atomic predictors asking, "What is the part-of-speech of the + | previous token?", and "What is the part-of-speech of the previous + | previous token?". These predictors will introduce a number of features, + | e.g. #[code Prev-pos=NN], #[code Prev-pos=VBZ], etc. A conjunction + | template introduces features such as #[code Prev-pos=NN&Prev-pos=VBZ]. + +p + | The feature extraction proceeds in two passes. In the first pass, we + | fill an array with the values of all of the atomic predictors. In the + | second pass, we iterate over the feature templates, and fill a small + | temporary array with the predictors that will be combined into a + | conjunction feature. Finally, we hash this array into a 64-bit integer, + | using the MurmurHash algorithm. You can see this at work in the + | #[+a(gh("thinc", "thinc/linear/features.pyx", "94dbe06fd3c8f24d86ab0f5c7984e52dbfcdc6cb")) #[code thinc.linear.features]] module. + +p + | It's very easy to change the feature templates, to create novel + | combinations of the existing atomic predictors. There's currently no API + | available to add new atomic predictors, though. You'll have to create a + | subclass of the model, and write your own #[code set_featuresC] method. + +p + | The feature templates are passed in using the #[code features] keyword + | argument to the constructors of the #[+api("tagger") #[code Tagger]], + | #[+api("dependencyparser") #[code DependencyParser]] and + | #[+api("entityrecognizer") #[code EntityRecognizer]]: + +code. 
+ from spacy.vocab import Vocab + from spacy.pipeline import Tagger + from spacy.tagger import P2_orth, P1_orth + from spacy.tagger import P2_cluster, P1_cluster, W_orth, N1_orth, N2_orth + + vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}}) + tagger = Tagger(vocab, features=[(P2_orth, P2_cluster), (P1_orth, P1_cluster), + (P2_orth,), (P1_orth,), (W_orth,), + (N1_orth,), (N2_orth,)]) + +p + | Custom feature templates can be passed to the #[code DependencyParser] + | and #[code EntityRecognizer] as well, also using the #[code features] + | keyword argument of the constructor.