From 426d17167f7373812f5a27d14343b256e5498dfa Mon Sep 17 00:00:00 2001
From: Em
Date: Fri, 10 Mar 2017 16:50:02 -0800
Subject: [PATCH] Added string manipulation for spans

---
 .gitignore                     |  5 +++++
 spacy/tests/spans/test_span.py |  7 +++++++
 spacy/tokens/span.pyx          | 19 +++++++++++++++++++
 3 files changed, 31 insertions(+)

diff --git a/.gitignore b/.gitignore
index 64f24a487..3ad6bfd96 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,8 @@ website/package.json
 website/announcement.jade
 website/www/
 website/.gitignore
+
+# Personal (Eric)
+venv
+venv/*
+.gitignore
diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py
index 79505f1cb..6e8114ad4 100644
--- a/spacy/tests/spans/test_span.py
+++ b/spacy/tests/spans/test_span.py
@@ -31,6 +31,13 @@ def test_spans_root(doc):
     assert span.root.text == 'sentence'
     assert span.root.head.text == 'is'
 
+def test_spans_string_fn(doc):
+    span = doc[0:4]
+    assert len(span) == 4
+    assert span.text == 'This is a sentence'
+    assert span.mapStr((lambda x, i, arg="_": x + i + arg), "y", "z") == 'This yzis yza yzsentence yz'
+    assert span.upper_ == 'THIS IS A SENTENCE'
+    assert span.lower_ == 'this is a sentence'
 
 def test_spans_root2(en_tokenizer):
     text = "through North and South Carolina"
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 903ef26d1..08f8aea62 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -118,6 +118,17 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    def mapStr(self, fn, *argv, **kargs):
+        '''Perform a function on the string representation of each token in this span.
+
+        Arguments:
+            fn (function): First argument will always be string of a token. Additional arguments
+                           will be defined according to *argv and **kargs passed to this mapStr() method.
+            *argv (unpacked tuple): Arguments to be passed to fn
+            **kargs (unpacked dict): Arguments to be passed to fn
+        '''
+        return ''.join([fn(t.string, *argv, **kargs) for t in self]).strip()
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
         or self.doc.c[self.start].idx != self.start_char \
@@ -365,6 +376,14 @@ cdef class Span:
         def __get__(self):
             return ' '.join([t.lemma_ for t in self]).strip()
 
+    property upper_:
+        def __get__(self):
+            return ''.join([t.string.upper() for t in self]).strip()
+
+    property lower_:
+        def __get__(self):
+            return ''.join([t.string.lower() for t in self]).strip()
+
     property string:
         def __get__(self):
             return ''.join([t.string for t in self])