mirror of https://github.com/explosion/spaCy.git
fix the regular expression matching on the full text (#12883)
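There was a mistake in the regex pattern that prevented it from matching all of the desired tokens. The problem was that, when the r string-literal prefix is used to mark a raw string, a backslash must not be doubled: in a raw string, `\\.` makes the regex match a literal backslash instead of a literal period, so the pattern should use a single backslash (`\.`).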
This commit is contained in:
parent
222bd3c5b1
commit
07407e07ab
|
@ -311,7 +311,7 @@ import re
|
||||||
nlp = spacy.load("en_core_web_sm")
|
nlp = spacy.load("en_core_web_sm")
|
||||||
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")
|
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")
|
||||||
|
|
||||||
expression = r"[Uu](nited|\\.?) ?[Ss](tates|\\.?)"
|
expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
|
||||||
for match in re.finditer(expression, doc.text):
|
for match in re.finditer(expression, doc.text):
|
||||||
start, end = match.span()
|
start, end = match.span()
|
||||||
span = doc.char_span(start, end)
|
span = doc.char_span(start, end)
|
||||||
|
|
Loading…
Reference in New Issue