From 07407e07ab8de5864bb61c5fa6c857b3373922b6 Mon Sep 17 00:00:00 2001 From: Arman Mohammadi <45389988+arplusman@users.noreply.github.com> Date: Wed, 2 Aug 2023 18:22:26 +0330 Subject: [PATCH] fix the regular expression matching on the full text (#12883) There was a mistake in the regex pattern which caused not matching all the desired tokens. The problem was that when we use r string literal prefix to suppose a raw text, we should not use two backslashes to demonstrate a backslash. --- website/docs/usage/rule-based-matching.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 39be5f47b..4f54415cb 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -311,7 +311,7 @@ import re nlp = spacy.load("en_core_web_sm") doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.") -expression = r"[Uu](nited|\\.?) ?[Ss](tates|\\.?)" +expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)" for match in re.finditer(expression, doc.text): start, end = match.span() span = doc.char_span(start, end)