mirror of https://github.com/explosion/spaCy.git
bugfix in span similarity (#5155)
* bugfix in span similarity * also rewrite doc.pyx for clarity * formatting
This commit is contained in:
parent
1f9852abc3
commit
d6d95674c1
|
@ -0,0 +1,18 @@
|
||||||
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
|
||||||
|
def test_issue5152():
    """Regression test for issue 5152: similarity between a Span and a Token.

    The original bug showed up when the number of tokens in the span was
    equal to the number of characters in the token (here: a three-token
    span compared against the three-character token "Let").
    """
    nlp = English()
    doc_about = nlp("Talk about being boring!")
    doc_of = nlp("Talk of being boring!")
    doc_let = nlp("Let")

    # Two separately sliced but textually identical spans, plus one span
    # from the variant sentence that differs in its middle token.
    first_span = doc_about[0:3]  # Talk about being
    same_span = doc_about[0:3]  # Talk about being
    other_span = doc_of[0:3]  # Talk of being
    let_token = doc_let[0]  # Let

    assert first_span.similarity(let_token) == 0.0
    assert first_span.similarity(same_span) == 1.0
    assert same_span.similarity(other_span) < 1.0
|
|
@ -380,13 +380,14 @@ cdef class Doc:
|
||||||
if isinstance(other, (Lexeme, Token)) and self.length == 1:
|
if isinstance(other, (Lexeme, Token)) and self.length == 1:
|
||||||
if self.c[0].lex.orth == other.orth:
|
if self.c[0].lex.orth == other.orth:
|
||||||
return 1.0
|
return 1.0
|
||||||
elif isinstance(other, (Span, Doc)):
|
elif isinstance(other, (Span, Doc)) and len(self) == len(other):
|
||||||
if len(self) == len(other):
|
similar = True
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
if self[i].orth != other[i].orth:
|
if self[i].orth != other[i].orth:
|
||||||
break
|
similar = False
|
||||||
else:
|
break
|
||||||
return 1.0
|
if similar:
|
||||||
|
return 1.0
|
||||||
if self.vocab.vectors.n_keys == 0:
|
if self.vocab.vectors.n_keys == 0:
|
||||||
warnings.warn(Warnings.W007.format(obj="Doc"))
|
warnings.warn(Warnings.W007.format(obj="Doc"))
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
|
|
|
@ -320,11 +320,13 @@ cdef class Span:
|
||||||
if len(self) == 1 and hasattr(other, "orth"):
|
if len(self) == 1 and hasattr(other, "orth"):
|
||||||
if self[0].orth == other.orth:
|
if self[0].orth == other.orth:
|
||||||
return 1.0
|
return 1.0
|
||||||
elif hasattr(other, "__len__") and len(self) == len(other):
|
elif isinstance(other, (Doc, Span)) and len(self) == len(other):
|
||||||
|
similar = True
|
||||||
for i in range(len(self)):
|
for i in range(len(self)):
|
||||||
if self[i].orth != getattr(other[i], "orth", None):
|
if self[i].orth != getattr(other[i], "orth", None):
|
||||||
|
similar = False
|
||||||
break
|
break
|
||||||
else:
|
if similar:
|
||||||
return 1.0
|
return 1.0
|
||||||
if self.vocab.vectors.n_keys == 0:
|
if self.vocab.vectors.n_keys == 0:
|
||||||
warnings.warn(Warnings.W007.format(obj="Span"))
|
warnings.warn(Warnings.W007.format(obj="Span"))
|
||||||
|
|
Loading…
Reference in New Issue