# spaCy/spacy/lang/zh/lex_attrs.py
# coding: utf8
from __future__ import unicode_literals
import re
from ...attrs import LIKE_NUM
# Tokens that are accepted as a number on their own (exact match only).
# Two parallel series are listed: the everyday numerals (zero through
# nineteen, plus the shorthand characters for 20, 30, 40 and 200) and the
# formal "financial" numerals (zero through nineteen).
# NOTE(review): the single-character entries were restored from the standard
# numeral sequences -- confirm against the upstream spaCy file.
_single_num_words = [
    "〇",
    "一",
    "二",
    "三",
    "四",
    "五",
    "六",
    "七",
    "八",
    "九",
    "十",
    "十一",
    "十二",
    "十三",
    "十四",
    "十五",
    "十六",
    "十七",
    "十八",
    "十九",
    "廿",
    "卅",
    "卌",
    "皕",
    "零",
    "壹",
    "贰",
    "叁",
    "肆",
    "伍",
    "陆",
    "柒",
    "捌",
    "玖",
    "拾",
    "拾壹",
    "拾贰",
    "拾叁",
    "拾肆",
    "拾伍",
    "拾陆",
    "拾柒",
    "拾捌",
    "拾玖",
]
# Digits one through nine, in everyday form followed by formal ("financial")
# form. like_num() uses these as the multiplier in front of a base unit and
# as the optional trailing units digit.
# NOTE(review): entries were restored from the standard numeral sequences --
# confirm against the upstream spaCy file.
_count_num_words = [
    "一",
    "二",
    "三",
    "四",
    "五",
    "六",
    "七",
    "八",
    "九",
    "壹",
    "贰",
    "叁",
    "肆",
    "伍",
    "陆",
    "柒",
    "捌",
    "玖",
]
# Unit characters that follow a count digit inside a compound numeral
# (ten, hundred, thousand, ten-thousand, hundred-million, trillion, then the
# formal forms of ten, hundred and thousand).
# NOTE(review): all entries except "亿" were restored from the standard unit
# sequence -- confirm against the upstream spaCy file.
_base_num_words = ["十", "百", "千", "万", "亿", "兆", "拾", "佰", "仟"]
def like_num(text):
    """Return True if *text* looks like a number.

    A token is number-like when, after removing one leading sign character
    and all separator characters, it is any of:

    * a string of digits (``str.isdigit``);
    * a fraction ``<digits>/<digits>`` with exactly one slash;
    * an exact entry of ``_single_num_words``;
    * one or more (count word + base word) pairs, optionally followed by a
      single trailing count word, built from ``_count_num_words`` and
      ``_base_num_words``.

    text: the token text to test.
    RETURNS (bool): whether the text resembles a number.
    """
    # Only one leading sign / approximation marker is dropped.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]
    # Remove ASCII and full-width separators before the digit checks.
    # NOTE(review): the two full-width characters were restored from empty
    # literals (which made the calls no-ops) -- confirm against upstream.
    for separator in (",", ".", ",", "。"):
        text = text.replace(separator, "")
    # Nothing left (empty input, or only a sign / separators): not a number.
    if not text:
        return False
    if text.isdigit():
        return True
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _single_num_words:
        return True
    # Compound numerals: (count base)+ count?  The whole token must match.
    counts = "|".join(_count_num_words)
    bases = "|".join(_base_num_words)
    pattern = "^((" + counts + ")(" + bases + "))+(" + counts + ")?$"
    return re.match(pattern, text) is not None
# Lexical attribute getters exported by this module: the LIKE_NUM attribute
# is computed by like_num().
LEX_ATTRS = {
    LIKE_NUM: like_num,
}