spaCy/spacy/lang/nl/lex_attrs.py

# coding: utf8
from __future__ import unicode_literals

from ...attrs import LIKE_NUM


# Dutch cardinal number words recognised by like_num. Only standalone words
# are listed: 0-20, the tens, and the large powers of ten. Compound numerals
# such as "drieëntwintig" are not covered.
_num_words = set(
    """
nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien
veertien vijftien zestien zeventien achttien negentien twintig dertig veertig
vijftig zestig zeventig tachtig negentig honderd duizend miljoen miljard
biljoen biljard triljoen triljard
""".split()
)

# Dutch ordinal number words recognised by like_num. Only standalone words
# are listed: 1st-20th, the tens, and the large powers of ten. Compound
# ordinals are not covered.
_ordinal_words = set(
    """
eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde
twaalfde dertiende veertiende vijftiende zestiende zeventiende achttiende
negentiende twintigste dertigste veertigste vijftigste zestigste zeventigste
tachtigste negentigste honderdste duizendste miljoenste miljardste biljoenste
biljardste triljoenste triljardste
""".split()
)


def like_num(text):
    """Return True if ``text`` looks like a number.

    A token counts as number-like when, after dropping one leading sign or
    approximation marker (``+``, ``-``, ``±``, ``~``) and all ``,`` / ``.``
    separators, it is either:

    * made up of digits only,
    * a simple fraction ``<digits>/<digits>``, or
    * (case-insensitively) one of the words in ``_num_words`` or
      ``_ordinal_words``.

    text: the token text to inspect.
    RETURNS (bool): whether the text resembles a number.
    """
    # This only does the most basic check for whether a token is a digit
    # or matches one of the number words. In order to handle numbers like
    # "drieëntwintig", more work is required.
    # See this discussion: https://github.com/explosion/spaCy/pull/1177
    # Drop a single leading sign so that "-5" or "+3,5" are still recognised.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Thousands and decimal separators are irrelevant for the digit check.
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    # Lower-case once and reuse the result for both word-list lookups.
    lowered = text.lower()
    return lowered in _num_words or lowered in _ordinal_words


# Lexical attribute getters exported by this module: maps the LIKE_NUM
# attribute key to the Dutch like_num function defined in this module.
LEX_ATTRS = {
    LIKE_NUM: like_num,
}
Implement like_num getter for Dutch (via #1177) 2017-09-26 14:39:15 +00:00			`# coding: utf8`
			`from __future__ import unicode_literals`

			`from ...attrs import LIKE_NUM`


Tidy up and auto-format 2019-08-20 15:36:34 +00:00			`_num_words = set(`
			`"""`
Implement like_num getter for Dutch (via #1177) 2017-09-26 14:39:15 +00:00			`nul een één twee drie vier vijf zes zeven acht negen tien elf twaalf dertien`
			`veertien twintig dertig veertig vijftig zestig zeventig tachtig negentig honderd`
			`duizend miljoen miljard biljoen biljard triljoen triljard`
Tidy up and auto-format 2019-08-20 15:36:34 +00:00			`""".split()`
			`)`
Implement like_num getter for Dutch (via #1177) 2017-09-26 14:39:15 +00:00
Tidy up and auto-format 2019-08-20 15:36:34 +00:00			`_ordinal_words = set(`
			`"""`
Implement like_num getter for Dutch (via #1177) 2017-09-26 14:39:15 +00:00			`eerste tweede derde vierde vijfde zesde zevende achtste negende tiende elfde`
			`twaalfde dertiende veertiende twintigste dertigste veertigste vijftigste`
			`zestigste zeventigste tachtigste negentigste honderdste duizendste miljoenste`
			`miljardste biljoenste biljardste triljoenste triljardste`
Tidy up and auto-format 2019-08-20 15:36:34 +00:00			`""".split()`
			`)`
Implement like_num getter for Dutch (via #1177) 2017-09-26 14:39:15 +00:00

			`def like_num(text):`
Add comment to like_num re: future work 2017-09-26 14:43:28 +00:00			`# This only does the most basic check for whether a token is a digit`
			`# or matches one of the number words. In order to handle numbers like`
			`# "drieëntwintig", more work is required.`
			`# See this discussion: https://github.com/explosion/spaCy/pull/1177`
Tidy up and auto-format 2019-08-20 15:36:34 +00:00			`text = text.replace(",", "").replace(".", "")`
Implement like_num getter for Dutch (via #1177) 2017-09-26 14:39:15 +00:00			`if text.isdigit():`
			`return True`
Tidy up and auto-format 2019-08-20 15:36:34 +00:00			`if text.count("/") == 1:`
			`num, denom = text.split("/")`
Implement like_num getter for Dutch (via #1177) 2017-09-26 14:39:15 +00:00			`if num.isdigit() and denom.isdigit():`
			`return True`
Find lowercased forms of numeric words 2018-01-08 02:25:08 +00:00			`if text.lower() in _num_words:`
Implement like_num getter for Dutch (via #1177) 2017-09-26 14:39:15 +00:00			`return True`
Find lowercased forms of ordinal words, where possible 2018-01-08 02:28:50 +00:00			`if text.lower() in _ordinal_words:`
			`return True`
Implement like_num getter for Dutch (via #1177) 2017-09-26 14:39:15 +00:00			`return False`


Tidy up and auto-format 2019-08-20 15:36:34 +00:00			`LEX_ATTRS = {LIKE_NUM: like_num}`