spaCy/website/api/lemmatizer.jade

//- 💫 DOCS > API > LEMMATIZER

include ../_includes/_mixins

p
    |  The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix
    |  rules and lookup tables.

+h(2, "init") Lemmatizer.__init__
    +tag method

p Create a #[code Lemmatizer].

+aside-code("Example").
    from spacy.lemmatizer import Lemmatizer
    lemmatizer = Lemmatizer()

+table(["Name", "Type", "Description"])
    +row
        +cell #[code index]
        +cell dict / #[code None]
        +cell Inventory of lemmas in the language.

    +row
        +cell #[code exceptions]
        +cell dict / #[code None]
        +cell Mapping of string forms to lemmas that bypass the #[code rules].

    +row
        +cell #[code rules]
        +cell dict / #[code None]
        +cell Mapping of part-of-speech tags to lists of suffix rewrite rules.

    +row
        +cell #[code lookup]
        +cell dict / #[code None]
        +cell Lookup table mapping strings to their lemmas.

    +row("foot")
        +cell returns
        +cell #[code Lemmatizer]
        +cell The newly created object.

+h(2, "call") Lemmatizer.__call__
    +tag method

p Lemmatize a string.

+aside-code("Example").
    from spacy.lemmatizer import Lemmatizer
    from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
    lemmas = lemmatizer(u'ducks', u'NOUN')
    assert lemmas == [u'duck']

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to lemmatize, e.g. the token text.

    +row
        +cell #[code univ_pos]
        +cell unicode / int
        +cell The token's universal part-of-speech tag.

    +row
        +cell #[code morphology]
        +cell dict / #[code None]
        +cell
            |  Morphological features following the
            |  #[+a("http://universaldependencies.org/") Universal Dependencies]
            |  scheme.

    +row("foot")
        +cell returns
        +cell list
        +cell The available lemmas for the string.

+h(2, "lookup") Lemmatizer.lookup
    +tag method
    +tag-new(2)

p
    |  Look up a lemma in the lookup table, if available. If no lemma is found,
    |  the original string is returned. Languages can provide a
    |  #[+a("/usage/adding-languages#lemmatizer") lookup table] via the
    |  #[code lemma_lookup] variable, set on the individual #[code Language]
    |  class.

+aside-code("Example").
    lookup = {u'going': u'go'}
    lemmatizer = Lemmatizer(lookup=lookup)
    assert lemmatizer.lookup(u'going') == u'go'

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to look up.

    +row("foot")
        +cell returns
        +cell unicode
        +cell The lemma if the string was found, otherwise the original string.

+h(2, "is_base_form") Lemmatizer.is_base_form
    +tag method

p
    |  Check whether we're dealing with an uninflected paradigm, so we can
    |  avoid lemmatization entirely.

+aside-code("Example").
    pos = 'verb'
    morph = {'VerbForm': 'inf'}
    is_base_form = lemmatizer.is_base_form(pos, morph)
    assert is_base_form == True

+table(["Name", "Type", "Description"])
    +row
        +cell #[code univ_pos]
        +cell unicode / int
        +cell The token's universal part-of-speech tag.

    +row
        +cell #[code morphology]
        +cell dict
        +cell The token's morphological features.

    +row("foot")
        +cell returns
        +cell bool
        +cell
            |  Whether the token's part-of-speech tag and morphological features
            |  describe a base form.

+h(2, "attributes") Attributes

+table(["Name", "Type", "Description"])
    +row
        +cell #[code index]
        +cell dict / #[code None]
        +cell Inventory of lemmas in the language.

    +row
        +cell #[code exc]
        +cell dict / #[code None]
        +cell Mapping of string forms to lemmas that bypass the #[code rules].

    +row
        +cell #[code rules]
        +cell dict / #[code None]
        +cell Mapping of part-of-speech tags to lists of suffix rewrite rules.

    +row
        +cell #[code lookup_table]
            +tag-new(2)
        +cell dict / #[code None]
        +cell The lemma lookup table, if available.