From 73566899bf3bde655a9437af601fe5744f700a66 Mon Sep 17 00:00:00 2001
From: "Yubing (Tom) Dong"
Date: Tue, 6 Oct 2015 00:51:25 -0700
Subject: [PATCH 1/7] Add Doc slicing tests

---
 tests/tokens/test_tokens_api.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py
index e1238373f..a7311932f 100644
--- a/tests/tokens/test_tokens_api.py
+++ b/tests/tokens/test_tokens_api.py
@@ -12,6 +12,15 @@ def test_getitem(EN):
     with pytest.raises(IndexError):
         tokens[len(tokens)]
 
+    span = tokens[1:1]
+    assert not '/'.join(token.orth_ for token in span)
+    span = tokens[1:4]
+    assert '/'.join(token.orth_ for token in span) == 'it/back/!'
+    with pytest.raises(ValueError):
+        tokens[1:4:2]
+    with pytest.raises(ValueError):
+        tokens[1:4:-1]
+
 
 @pytest.mark.models
 def test_serialize(EN):

From 2fc33e8024487974c6fbc6941026b75f8e89a07b Mon Sep 17 00:00:00 2001
From: "Yubing (Tom) Dong"
Date: Tue, 6 Oct 2015 00:56:33 -0700
Subject: [PATCH 2/7] Allow step=1 when slicing a Doc

---
 spacy/tokens/doc.pyx            | 2 +-
 tests/tokens/test_tokens_api.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 8a7d12555..ce278d868 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -87,7 +87,7 @@ cdef class Doc:
         token (Token):
         """
         if isinstance(i, slice):
-            if i.step is not None:
+            if not (i.step is None or i.step == 1):
                 raise ValueError("Stepped slices not supported in Span objects."
                                  "Try: list(doc)[start:stop:step] instead.")
             if i.start is None:

diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py
index a7311932f..fc1b52143 100644
--- a/tests/tokens/test_tokens_api.py
+++ b/tests/tokens/test_tokens_api.py
@@ -16,6 +16,8 @@ def test_getitem(EN):
     assert not '/'.join(token.orth_ for token in span)
     span = tokens[1:4]
     assert '/'.join(token.orth_ for token in span) == 'it/back/!'
+    span = tokens[1:4:1]
+    assert '/'.join(token.orth_ for token in span) == 'it/back/!'
     with pytest.raises(ValueError):
         tokens[1:4:2]
     with pytest.raises(ValueError):
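
A minimal sketch of the Doc slicing behaviour these first two patches pin
down, assuming the 2015-era spacy.en English pipeline that the tests' EN
fixture presumably wraps (model data must be installed; the sentence is the
one used throughout test_tokens_api.py):

    import pytest
    from spacy.en import English

    nlp = English()
    tokens = nlp(u'Give it back! He pleaded.')

    # A plain slice returns a Span; an explicit step of 1 is now accepted
    # and behaves identically.
    assert '/'.join(t.orth_ for t in tokens[1:4]) == 'it/back/!'
    assert '/'.join(t.orth_ for t in tokens[1:4:1]) == 'it/back/!'

    # Any other step still raises; materialize a list for stepped access.
    with pytest.raises(ValueError):
        tokens[1:4:2]
    every_other = list(tokens)[1:4:2]
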
From ef2af20cd373583b6d4ee6cc06ce8ca8406fba8c Mon Sep 17 00:00:00 2001
From: "Yubing (Tom) Dong"
Date: Tue, 6 Oct 2015 01:59:11 -0700
Subject: [PATCH 3/7] Make Doc's slicing behavior conform to Python conventions

---
 spacy/tokens/spans.pyx          |  8 +++++--
 tests/tokens/test_tokens_api.py | 40 ++++++++++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx
index c39f8976c..99efad4b9 100644
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@@ -16,9 +16,13 @@ cdef class Span:
     def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
                   vector_norm=None):
         if start < 0:
-            start = tokens.length - start
+            start = tokens.length + start
+        start = min(tokens.length, max(0, start))
+
         if end < 0:
-            end = tokens.length - end
+            end = tokens.length + end
+        end = min(tokens.length, max(start, end))
+
         self.doc = tokens
         self.start = start
         self.end = end

diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py
index fc1b52143..a272a8e3b 100644
--- a/tests/tokens/test_tokens_api.py
+++ b/tests/tokens/test_tokens_api.py
@@ -12,17 +12,51 @@ def test_getitem(EN):
     with pytest.raises(IndexError):
         tokens[len(tokens)]
 
+    def to_str(span):
+        return '/'.join(token.orth_ for token in span)
+
     span = tokens[1:1]
-    assert not '/'.join(token.orth_ for token in span)
+    assert not to_str(span)
     span = tokens[1:4]
-    assert '/'.join(token.orth_ for token in span) == 'it/back/!'
+    assert to_str(span) == 'it/back/!'
     span = tokens[1:4:1]
-    assert '/'.join(token.orth_ for token in span) == 'it/back/!'
+    assert to_str(span) == 'it/back/!'
     with pytest.raises(ValueError):
         tokens[1:4:2]
     with pytest.raises(ValueError):
         tokens[1:4:-1]
 
+    span = tokens[-3:6]
+    assert to_str(span) == 'He/pleaded'
+    span = tokens[4:-1]
+    assert to_str(span) == 'He/pleaded'
+    span = tokens[-5:-3]
+    assert to_str(span) == 'back/!'
+    span = tokens[5:4]
+    assert span.start == span.end == 5 and not to_str(span)
+    span = tokens[4:-3]
+    assert span.start == span.end == 4 and not to_str(span)
+
+    span = tokens[:]
+    assert to_str(span) == 'Give/it/back/!/He/pleaded/.'
+    span = tokens[4:]
+    assert to_str(span) == 'He/pleaded/.'
+    span = tokens[:4]
+    assert to_str(span) == 'Give/it/back/!'
+    span = tokens[:-3]
+    assert to_str(span) == 'Give/it/back/!'
+    span = tokens[-3:]
+    assert to_str(span) == 'He/pleaded/.'
+
+    span = tokens[4:50]
+    assert to_str(span) == 'He/pleaded/.'
+    span = tokens[-50:4]
+    assert to_str(span) == 'Give/it/back/!'
+    span = tokens[-50:-40]
+    assert span.start == span.end == 0 and not to_str(span)
+    span = tokens[40:50]
+    assert span.start == span.end == 7 and not to_str(span)
+
 
 @pytest.mark.models
 def test_serialize(EN):
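
A short sketch of the Python-list conventions this patch adopts for Doc
slices, under the same spacy.en / English() assumptions as above:

    from spacy.en import English

    nlp = English()
    tokens = nlp(u'Give it back! He pleaded.')   # 7 tokens

    # Negative indices count from the end, as for lists.
    span = tokens[-3:]
    assert [t.orth_ for t in span] == [u'He', u'pleaded', u'.']

    # Out-of-range bounds are clamped rather than raising IndexError.
    span = tokens[4:50]
    assert (span.start, span.end) == (4, 7)

    # A backwards range collapses to an empty Span at the start index.
    assert len(tokens[5:4]) == 0
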
From 5cc2f2b01ab26e313a7035f998fc1b4373cb6cc5 Mon Sep 17 00:00:00 2001
From: "Yubing (Tom) Dong"
Date: Tue, 6 Oct 2015 02:08:39 -0700
Subject: [PATCH 4/7] Test simple indexing for Span

---
 tests/tokens/test_tokens_api.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py
index a272a8e3b..34e54a2af 100644
--- a/tests/tokens/test_tokens_api.py
+++ b/tests/tokens/test_tokens_api.py
@@ -57,6 +57,9 @@ def test_getitem(EN):
     span = tokens[40:50]
     assert span.start == span.end == 7 and not to_str(span)
 
+    span = tokens[1:4]
+    assert span[0].orth_ == 'it'
+
 
 @pytest.mark.models
 def test_serialize(EN):

From 97685aecb735289de32c992e3659e503412aeeb5 Mon Sep 17 00:00:00 2001
From: "Yubing (Tom) Dong"
Date: Tue, 6 Oct 2015 02:45:49 -0700
Subject: [PATCH 5/7] Add slicing support to Span

---
 spacy/tokens/spans.pyx          | 21 ++++++++++++++++++++-
 tests/tokens/test_tokens_api.py | 18 ++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx
index 99efad4b9..955d24ad4 100644
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@@ -50,7 +50,26 @@ cdef class Span:
             return 0
         return self.end - self.start
 
-    def __getitem__(self, int i):
+    def __getitem__(self, object i):
+        if isinstance(i, slice):
+            start, end, step = i.start, i.stop, i.step
+            if start is None:
+                start = 0
+            elif start < 0:
+                start += len(self)
+            start = min(len(self), max(0, start))
+
+            if end is None:
+                end = len(self)
+            elif end < 0:
+                end += len(self)
+            end = min(len(self), max(start, end))
+
+            start += self.start
+            end += self.start
+
+            return self.doc[start:end:i.step]
+
         if i < 0:
             return self.doc[self.end + i]
         else:

diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py
index 34e54a2af..675f00235 100644
--- a/tests/tokens/test_tokens_api.py
+++ b/tests/tokens/test_tokens_api.py
@@ -59,6 +59,24 @@ def test_getitem(EN):
     span = tokens[1:4]
     assert span[0].orth_ == 'it'
 
+    subspan = span[:]
+    assert to_str(subspan) == 'it/back/!'
+    subspan = span[:2]
+    assert to_str(subspan) == 'it/back'
+    subspan = span[1:]
+    assert to_str(subspan) == 'back/!'
+    subspan = span[:-1]
+    assert to_str(subspan) == 'it/back'
+    subspan = span[-2:]
+    assert to_str(subspan) == 'back/!'
+    subspan = span[1:2]
+    assert to_str(subspan) == 'back'
+    subspan = span[-2:-1]
+    assert to_str(subspan) == 'back'
+    subspan = span[-50:50]
+    assert to_str(subspan) == 'it/back/!'
+    subspan = span[50:-50]
+    assert subspan.start == subspan.end == 4 and not to_str(subspan)
 
 @pytest.mark.models
 def test_serialize(EN):
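
A sketch of the new Span slicing, same English() assumptions as above.
Indices are relative to the span, but the returned Span still addresses the
parent Doc:

    from spacy.en import English

    nlp = English()
    tokens = nlp(u'Give it back! He pleaded.')
    span = tokens[1:4]                   # 'it back !'

    subspan = span[1:]                   # relative to the span...
    assert [t.orth_ for t in subspan] == [u'back', u'!']
    assert (subspan.start, subspan.end) == (2, 4)   # ...but in Doc offsets
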
From 3fd3bc79aa7fa5c1c1ae360b49b3d2a1da6b0f36 Mon Sep 17 00:00:00 2001
From: "Yubing (Tom) Dong"
Date: Wed, 7 Oct 2015 01:25:35 -0700
Subject: [PATCH 6/7] Refactor to remove duplicate slicing logic

---
 spacy/tokens/doc.pyx   | 11 +++--------
 spacy/tokens/spans.pyx | 27 +++++----------------------
 spacy/util.py          | 20 ++++++++++++++++++++
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ce278d868..b78214ba9 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -21,6 +21,7 @@ from ..lexeme cimport Lexeme
 from .spans cimport Span
 from .token cimport Token
 from ..serialize.bits cimport BitArray
+from ..util import normalize_slice
 
 
 DEF PADDING = 5
@@ -87,14 +88,8 @@ cdef class Doc:
         token (Token):
         """
         if isinstance(i, slice):
-            if not (i.step is None or i.step == 1):
-                raise ValueError("Stepped slices not supported in Span objects."
-                                 "Try: list(doc)[start:stop:step] instead.")
-            if i.start is None:
-                i = slice(0, i.stop)
-            if i.stop is None:
-                i = slice(i.start, len(self))
-            return Span(self, i.start, i.stop, label=0)
+            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
+            return Span(self, start, stop, label=0)
 
         if i < 0:
             i = self.length + i

diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx
index 955d24ad4..e8d2f2e59 100644
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@@ -9,19 +9,15 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
+from ..util import normalize_slice
 
 
 cdef class Span:
     """A slice from a Doc object."""
     def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
                   vector_norm=None):
-        if start < 0:
-            start = tokens.length + start
-        start = min(tokens.length, max(0, start))
-
-        if end < 0:
-            end = tokens.length + end
-        end = min(tokens.length, max(start, end))
+        if not (0 <= start <= end <= len(tokens)):
+            raise IndexError
 
         self.doc = tokens
         self.start = start
@@ -52,23 +48,10 @@ cdef class Span:
 
     def __getitem__(self, object i):
         if isinstance(i, slice):
-            start, end, step = i.start, i.stop, i.step
-            if start is None:
-                start = 0
-            elif start < 0:
-                start += len(self)
-            start = min(len(self), max(0, start))
-
-            if end is None:
-                end = len(self)
-            elif end < 0:
-                end += len(self)
-            end = min(len(self), max(start, end))
-
+            start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             start += self.start
             end += self.start
-
-            return self.doc[start:end:i.step]
+            return Span(self.doc, start, end)
 
         if i < 0:
             return self.doc[self.end + i]

diff --git a/spacy/util.py b/spacy/util.py
index 9f5b4fe04..449b06399 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -7,6 +7,26 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
 
 
+def normalize_slice(length, start, stop, step=None):
+    if not (step is None or step == 1):
+        raise ValueError("Stepped slices not supported in Span objects."
+                         "Try: list(tokens)[start:stop:step] instead.")
+    if start is None:
+        start = 0
+    elif start < 0:
+        start += length
+    start = min(length, max(0, start))
+
+    if stop is None:
+        stop = length
+    elif stop < 0:
+        stop += length
+    stop = min(length, max(start, stop))
+
+    assert 0 <= start <= stop <= length
+    return start, stop
+
+
 def utf8open(loc, mode='r'):
     return codecs.open(loc, mode, 'utf8')
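
With the logic centralized in spacy.util, the slicing rules can be checked
in isolation; a quick sketch against the helper exactly as it lands in this
patch:

    from spacy.util import normalize_slice

    # normalize_slice(length, start, stop, step=None) -> clamped (start, stop)
    assert normalize_slice(7, None, None) == (0, 7)   # tokens[:]
    assert normalize_slice(7, -3, None) == (4, 7)     # tokens[-3:]
    assert normalize_slice(7, -50, 4) == (0, 4)       # clamped at 0
    assert normalize_slice(7, 5, 4) == (5, 5)         # backwards -> empty
    try:
        normalize_slice(7, 1, 4, 2)                   # steps != 1 rejected
    except ValueError:
        pass
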
+ "Try: list(tokens)[start:stop:step] instead.") + if start is None: + start = 0 + elif start < 0: + start += length + start = min(length, max(0, start)) + + if stop is None: + stop = length + elif stop < 0: + stop += length + stop = min(length, max(start, stop)) + + assert 0 <= start <= stop <= length + return start, stop + + def utf8open(loc, mode='r'): return codecs.open(loc, mode, 'utf8') From 0f601b8b750a8991d333a7a95f97b74b80b46846 Mon Sep 17 00:00:00 2001 From: "Yubing (Tom) Dong" Date: Wed, 7 Oct 2015 01:27:28 -0700 Subject: [PATCH 7/7] Update docstring of Doc.__getitem__ --- spacy/tokens/doc.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index b78214ba9..eab6c044e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -82,10 +82,10 @@ cdef class Doc: self._vector = None def __getitem__(self, object i): - """Get a token. + """Get a Token or a Span from the Doc. Returns: - token (Token): + token (Token) or span (Span): """ if isinstance(i, slice): start, stop = normalize_slice(len(self), i.start, i.stop, i.step)