diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 4a5ee6d67..cb2e817d5 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -215,7 +215,7 @@ _punct = ( r"… …… , : ; \! \? ¿ ؟ ¡ \( \) \[ \] \{ \} < > _ # \* & 。 ? ! , 、 ; : ~ · । ، ؛ ٪" ) _quotes = r'\' " ” “ ` ‘ ´ ’ ‚ , „ » « 「 」 『 』 ( ) 〔 〕 【 】 《 》 〈 〉' -_hyphens = r"\- – — \-\- \-\-\- —— ~" +_hyphens = "- – — -- --- —— ~" # Various symbols like dingbats, but also emoji # Details: https://www.compart.com/en/unicode/category/So diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index eda4c1593..1422b4194 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -7,7 +7,7 @@ from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") -HYPHENS = r"\- – — ‐ ‑".strip().replace(" ", "").replace("\n", "") +HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "").replace("\n", "") _suffixes = (