From 426d17167f7373812f5a27d14343b256e5498dfa Mon Sep 17 00:00:00 2001 From: Em Date: Fri, 10 Mar 2017 16:50:02 -0800 Subject: [PATCH 1/3] Added string manipulation for spans --- .gitignore | 5 +++++ spacy/tests/spans/test_span.py | 7 +++++++ spacy/tokens/span.pyx | 19 +++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/.gitignore b/.gitignore index 64f24a487..3ad6bfd96 100644 --- a/.gitignore +++ b/.gitignore @@ -105,3 +105,8 @@ website/package.json website/announcement.jade website/www/ website/.gitignore + +# Personal (Eric) +venv +venv/* +.gitignore diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index 79505f1cb..6e8114ad4 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -31,6 +31,13 @@ def test_spans_root(doc): assert span.root.text == 'sentence' assert span.root.head.text == 'is' +def test_spans_string_fn(doc): + span = doc[0:4] + assert len(span) == 4 + assert span.text == 'This is a sentence' + assert span.mapStr((lambda x, i, arg="_": x + i + arg), "y", "z") == 'This yzis yza yzsentence yz' + assert span.upper_ == 'THIS IS A SENTENCE' + assert span.lower_ == 'this is a sentence' def test_spans_root2(en_tokenizer): text = "through North and South Carolina" diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 903ef26d1..08f8aea62 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -118,6 +118,17 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + def mapStr(self, fn, *argv, **kargs): + '''Perform a function on the string representation of each token in this span. + + Arguments: + fn (function): First argument will always be string of a token. Additional arguments + will be defined according to *argv and **kargs passed to this mapStr() method. + *argv (unpacked tuple): Arguments to be passed to fn + **kargs (unpacked dict): Arguments to be passed to fn + ''' + return ''.join([fn(t.string, *argv, **kargs) for t in self]).strip() + cpdef int _recalculate_indices(self) except -1: if self.end > self.doc.length \ or self.doc.c[self.start].idx != self.start_char \ @@ -365,6 +376,14 @@ cdef class Span: def __get__(self): return ' '.join([t.lemma_ for t in self]).strip() + property upper_: + def __get__(self): + return ''.join([t.string.upper() for t in self]).strip() + + property lower_: + def __get__(self): + return ''.join([t.string.lower() for t in self]).strip() + property string: def __get__(self): return ''.join([t.string for t in self]) From 1bb364a3b563d776337bda9d82d593a1ad9637a4 Mon Sep 17 00:00:00 2001 From: Em Date: Fri, 10 Mar 2017 16:52:04 -0800 Subject: [PATCH 2/3] Adding venv to .gitignore --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 3ad6bfd96..8716a8ef0 100644 --- a/.gitignore +++ b/.gitignore @@ -106,7 +106,6 @@ website/announcement.jade website/www/ website/.gitignore -# Personal (Eric) +# Python virtualenv venv venv/* -.gitignore From 9c809efc25808a82bedadd48739f1d0fcd567d7a Mon Sep 17 00:00:00 2001 From: Em Date: Sat, 11 Mar 2017 16:23:26 -0800 Subject: [PATCH 3/3] Removed mapStr --- spacy/tests/spans/test_span.py | 1 - spacy/tokens/span.pyx | 11 ----------- 2 files changed, 12 deletions(-) diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index 6e8114ad4..14c176edc 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -35,7 +35,6 @@ def test_spans_string_fn(doc): span = doc[0:4] assert len(span) == 4 assert span.text == 'This is a sentence' - assert span.mapStr((lambda x, i, arg="_": x + i + arg), "y", "z") == 'This yzis yza yzsentence yz' assert span.upper_ == 'THIS IS A SENTENCE' assert span.lower_ == 'this is a sentence' diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 08f8aea62..fc5d26174 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -118,17 +118,6 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) - def mapStr(self, fn, *argv, **kargs): - '''Perform a function on the string representation of each token in this span. - - Arguments: - fn (function): First argument will always be string of a token. Additional arguments - will be defined according to *argv and **kargs passed to this mapStr() method. - *argv (unpacked tuple): Arguments to be passed to fn - **kargs (unpacked dict): Arguments to be passed to fn - ''' - return ''.join([fn(t.string, *argv, **kargs) for t in self]).strip() - cpdef int _recalculate_indices(self) except -1: if self.end > self.doc.length \ or self.doc.c[self.start].idx != self.start_char \