From 3fd3bc79aa7fa5c1c1ae360b49b3d2a1da6b0f36 Mon Sep 17 00:00:00 2001
From: "Yubing (Tom) Dong"
Date: Wed, 7 Oct 2015 01:25:35 -0700
Subject: [PATCH] Refactor to remove duplicate slicing logic

---
 spacy/tokens/doc.pyx   | 11 +++--------
 spacy/tokens/spans.pyx | 27 +++++----------------------
 spacy/util.py          | 20 ++++++++++++++++++++
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index ce278d868..b78214ba9 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -21,6 +21,7 @@ from ..lexeme cimport Lexeme
 from .spans cimport Span
 from .token cimport Token
 from ..serialize.bits cimport BitArray
+from ..util import normalize_slice
 
 
 DEF PADDING = 5
@@ -87,14 +88,8 @@ cdef class Doc:
         token (Token):
         """
         if isinstance(i, slice):
-            if not (i.step is None or i.step == 1):
-                raise ValueError("Stepped slices not supported in Span objects."
-                                 "Try: list(doc)[start:stop:step] instead.")
-            if i.start is None:
-                i = slice(0, i.stop)
-            if i.stop is None:
-                i = slice(i.start, len(self))
-            return Span(self, i.start, i.stop, label=0)
+            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
+            return Span(self, start, stop, label=0)
 
         if i < 0:
             i = self.length + i
diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx
index 955d24ad4..e8d2f2e59 100644
--- a/spacy/tokens/spans.pyx
+++ b/spacy/tokens/spans.pyx
@@ -9,19 +9,15 @@ from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
+from ..util import normalize_slice
 
 
 cdef class Span:
     """A slice from a Doc object."""
     def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
                   vector_norm=None):
-        if start < 0:
-            start = tokens.length + start
-        start = min(tokens.length, max(0, start))
-
-        if end < 0:
-            end = tokens.length + end
-        end = min(tokens.length, max(start, end))
+        if not (0 <= start <= end <= len(tokens)):
+            raise IndexError
 
         self.doc = tokens
         self.start = start
@@ -52,23 +48,10 @@ cdef class Span:
 
     def __getitem__(self, object i):
         if isinstance(i, slice):
-            start, end, step = i.start, i.stop, i.step
-            if start is None:
-                start = 0
-            elif start < 0:
-                start += len(self)
-            start = min(len(self), max(0, start))
-
-            if end is None:
-                end = len(self)
-            elif end < 0:
-                end += len(self)
-            end = min(len(self), max(start, end))
-
+            start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             start += self.start
             end += self.start
-
-            return self.doc[start:end:i.step]
+            return Span(self.doc, start, end)
 
         if i < 0:
             return self.doc[self.end + i]
diff --git a/spacy/util.py b/spacy/util.py
index 9f5b4fe04..449b06399 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -7,6 +7,26 @@ from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
 DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
 
 
+def normalize_slice(length, start, stop, step=None):
+    if not (step is None or step == 1):
+        raise ValueError("Stepped slices not supported in Span objects."
+                         "Try: list(tokens)[start:stop:step] instead.")
+    if start is None:
+        start = 0
+    elif start < 0:
+        start += length
+    start = min(length, max(0, start))
+
+    if stop is None:
+        stop = length
+    elif stop < 0:
+        stop += length
+    stop = min(length, max(start, stop))
+
+    assert 0 <= start <= stop <= length
+    return start, stop
+
+
 def utf8open(loc, mode='r'):
     return codecs.open(loc, mode, 'utf8')