2019-04-03 12:13:26 +00:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2019-09-30 22:01:27 +00:00
|
|
|
from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV
|
Reduce size of language data (#4141)
* Move Turkish lemmas to a json file
Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.
This focuses on Turkish specifically because it has the most language
data in core.
* Transition all lemmatizer.py files to json
This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.
None of the affected files have logic, so this was pretty
straightforward.
One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.
* Move large lang data to json for fr/nb/nl/sv
These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.
For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.
* Fix id lemmas.json
The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.
* Add .json.gz to gitignore
This covers the json.gz files built as part of distribution.
* Add language data gzip to build process
Currently this gzips the data on every build; it works, but it should be
changed to only gzip when the source file has been updated.
* Remove Danish lemmatizer.py
Missed this when I added the json.
* Update to match latest explosion/srsly#9
The way gzipped json is loaded/saved in srsly changed a bit.
* Only compress language data if necessary
If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.
* Move en/el language data to json
This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.
* Remove empty files in Norwegian tokenizer
It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.
This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.
* Remove dubious entries in English lookup.json
" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.
* Fix small issues with en/fr lemmatizers
The en tokenizer was including the removed _nouns.py file, so that's
removed.
The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.
* Auto-format
* Auto-format
* Update srsly pin
* Consistently use pathlib paths
2019-08-20 12:54:11 +00:00
|
|
|
|
2019-04-03 12:13:26 +00:00
|
|
|
|
2019-08-22 12:21:32 +00:00
|
|
|
class DutchLemmatizer(object):
    """Lemmatizer for Dutch combining an index, exceptions, rules and a lookup.

    The instance holds four resources:

    * ``index``: per-POS collection of known lemmas (tested with ``in``).
    * ``exc``: per-POS mapping from an irregular form to a sequence whose
      first element is used as the lemma.
    * ``rules``: per-POS sequence of ``(old_suffix, new_suffix)`` pairs that
      is handed to the module-level ``lemmatize`` function.
    * ``lookup_table``: mapping from a (lowercased) form to a lemma.

    All results are lowercased, because the input string is lowercased
    before any resource is consulted.
    """

    # Note: CGN does not distinguish AUX verbs, so we treat AUX as VERB.
    univ_pos_name_variants = {
        NOUN: "noun",
        "NOUN": "noun",
        "noun": "noun",
        VERB: "verb",
        "VERB": "verb",
        "verb": "verb",
        AUX: "verb",
        "AUX": "verb",
        "aux": "verb",
        ADJ: "adj",
        "ADJ": "adj",
        "adj": "adj",
        ADV: "adv",
        "ADV": "adv",
        "adv": "adv",
        PRON: "pron",
        "PRON": "pron",
        "pron": "pron",
        DET: "det",
        "DET": "det",
        "det": "det",
        ADP: "adp",
        "ADP": "adp",
        "adp": "adp",
        NUM: "num",
        "NUM": "num",
        "num": "num",
    }

    @classmethod
    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
        """Build a lemmatizer from already-loaded resources.

        ``path`` is accepted for signature compatibility but is not used;
        the remaining arguments are forwarded positionally to ``__init__``.
        """
        return cls(index, exc, rules, lookup)

    def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
        """Store the lemmatization resources.

        ``index`` and ``exceptions`` are stored exactly as given (possibly
        ``None``). ``rules`` falls back to an empty dict when falsy and
        ``lookup`` falls back to an empty dict only when it is ``None``.
        """
        self.index = index
        self.exc = exceptions
        self.rules = rules or {}
        self.lookup_table = lookup if lookup is not None else {}

    def __call__(self, string, univ_pos, morphology=None):
        """Return a list of candidate lemmas for ``string``.

        string: the surface form; it is lowercased before processing.
        univ_pos: a key of ``univ_pos_name_variants`` (symbol, upper- or
            lowercase name). Any other value, e.g. PROPN, makes the method
            return the lowercased string unchanged.
        morphology: accepted for signature compatibility; not used here.
        RETURNS (list): the candidate lemma(s).
        """
        # Difference 1: self.rules is assumed to be non-None, so no
        # 'is None' check required.
        # String lowercased from the get-go. All lemmatization results in
        # lowercased strings. For most applications, this shouldn't pose
        # any problems, and it keeps the exceptions indexes small. If this
        # creates problems for proper nouns, we can introduce a check for
        # univ_pos == "PROPN".
        string = string.lower()
        try:
            univ_pos = self.univ_pos_name_variants[univ_pos]
        except KeyError:
            # Because PROPN not in self.univ_pos_name_variants, proper names
            # are not lemmatized. They are lowercased, however.
            return [string]

        # The constructor allows index/exceptions to be None; treat a missing
        # resource as empty instead of failing with an AttributeError.
        index_by_pos = self.index if self.index is not None else {}
        exc_by_pos = self.exc if self.exc is not None else {}

        lemma_index = index_by_pos.get(univ_pos, {})
        # string is already lemma
        if string in lemma_index:
            return [string]

        exceptions = exc_by_pos.get(univ_pos, {})
        # string is irregular token contained in exceptions index.
        try:
            lemma = exceptions[string]
        except KeyError:
            pass
        else:
            return [lemma[0]]

        # string corresponds to key in lookup table
        looked_up_lemma = self.lookup_table.get(string)
        if looked_up_lemma and looked_up_lemma in lemma_index:
            return [looked_up_lemma]

        forms, is_known = lemmatize(
            string, lemma_index, exceptions, self.rules.get(univ_pos, [])
        )

        # Back-off through remaining return value candidates.
        if forms and is_known:
            return forms
        # Prefer a rule-derived form that is itself a key of the exceptions.
        for form in forms:
            if form in exceptions:
                return [form]
        if looked_up_lemma:
            return [looked_up_lemma]
        # Unverified rule output if there is any, else the lowercased input.
        return forms or [string]

    def lookup(self, string, orth=None):
        """Look up a lemma in the lookup table, defaulting to the string.

        The string is lowercased first; per the original note on this method
        the lookup table consists entirely of lowercase keys. When ``orth``
        is given it is used as the table key instead of the lowercased
        string. In both cases the lowercased string is the fallback value.
        """
        string = string.lower()
        key = string if orth is None else orth
        return self.lookup_table.get(key, string)

    def noun(self, string, morphology=None):
        """Lemmatize ``string`` with the "noun" part-of-speech."""
        return self(string, "noun", morphology)

    def verb(self, string, morphology=None):
        """Lemmatize ``string`` with the "verb" part-of-speech."""
        return self(string, "verb", morphology)

    def adj(self, string, morphology=None):
        """Lemmatize ``string`` with the "adj" part-of-speech."""
        return self(string, "adj", morphology)

    def det(self, string, morphology=None):
        """Lemmatize ``string`` with the "det" part-of-speech."""
        return self(string, "det", morphology)

    def pron(self, string, morphology=None):
        """Lemmatize ``string`` with the "pron" part-of-speech."""
        return self(string, "pron", morphology)

    def adp(self, string, morphology=None):
        """Lemmatize ``string`` with the "adp" part-of-speech."""
        return self(string, "adp", morphology)

    def punct(self, string, morphology=None):
        """Call the lemmatizer with "punct".

        "punct" is not a key of ``univ_pos_name_variants``, so this returns
        the lowercased string in a one-element list.
        """
        return self(string, "punct", morphology)
|
|
|
|
|
|
|
|
|
|
|
|
# Reimplemented to focus more on application of suffix rules and to return
|
|
|
|
# as early as possible.
|
|
|
|
def lemmatize(string, index, exceptions, rules):
    """Apply suffix rules to ``string`` and return as early as possible.

    string: the form to lemmatize.
    index: collection of known lemmas; only membership (``in``) is used.
    exceptions: accepted for signature compatibility; not used here.
    rules: iterable of ``(old_suffix, new_suffix)`` pairs, tried in order.
    RETURNS (tuple): ``(forms, is_known)``. As soon as a rule produces a form
        found in ``index`` the result is ``([form], True)``. Otherwise it is
        ``(candidates, False)``, where ``candidates`` holds every distinct
        non-empty form produced, in rule order (possibly empty).
    """
    oov_forms = []
    # Tracks candidates already collected so duplicates are dropped while the
    # rule order is kept. Going through set() alone would make the order of
    # the returned list depend on string hashing.
    seen = set()
    for old, new in rules:
        if not string.endswith(old):
            continue
        form = string[: len(string) - len(old)] + new
        if not form:
            # The rule consumed the whole string; nothing useful to return.
            continue
        if form in index:
            return [form], True  # True = Is known (is lemma)
        if form not in seen:
            seen.add(form)
            oov_forms.append(form)
    return oov_forms, False
|