Added string manipulation for spans

This commit is contained in:
Em 2017-03-10 16:50:02 -08:00
parent a16aff17aa
commit 426d17167f
3 changed files with 31 additions and 0 deletions

5
.gitignore vendored
View File

@ -105,3 +105,8 @@ website/package.json
website/announcement.jade
website/www/
website/.gitignore
# Personal (Eric)
venv
venv/*
.gitignore

View File

@ -31,6 +31,13 @@ def test_spans_root(doc):
assert span.root.text == 'sentence'
assert span.root.head.text == 'is'
def test_spans_string_fn(doc):
    """Exercise `mapStr`, `upper_` and `lower_` on the slice `doc[0:4]`."""

    def append_args(token_string, suffix, arg="_"):
        # Same contract as the inline lambda it replaces: token string,
        # then the positional extra, then the optional trailing argument.
        return token_string + suffix + arg

    span = doc[0:4]
    assert len(span) == 4
    assert span.text == 'This is a sentence'

    # "y" and "z" are forwarded positionally, so "z" overrides the "_" default.
    mapped = span.mapStr(append_args, "y", "z")
    assert mapped == 'This yzis yza yzsentence yz'

    assert span.upper_ == 'THIS IS A SENTENCE'
    assert span.lower_ == 'this is a sentence'
def test_spans_root2(en_tokenizer):
text = "through North and South Carolina"

View File

@ -118,6 +118,17 @@ cdef class Span:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
def mapStr(self, fn, *argv, **kargs):
    """Apply `fn` to the string of each token in this span and join the results.

    Arguments:
        fn (callable): Called once per token as
            `fn(token.string, *argv, **kargs)`; the token's string is
            always the first argument.
        *argv (unpacked tuple): Extra positional arguments forwarded to fn.
        **kargs (unpacked dict): Extra keyword arguments forwarded to fn.
    Returns:
        The per-token results concatenated with no separator, with leading
        and trailing whitespace stripped from the final string.
    """
    pieces = []
    for token in self:
        pieces.append(fn(token.string, *argv, **kargs))
    joined = ''.join(pieces)
    return joined.strip()
cpdef int _recalculate_indices(self) except -1:
if self.end > self.doc.length \
or self.doc.c[self.start].idx != self.start_char \
@ -365,6 +376,14 @@ cdef class Span:
def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip()
property upper_:
    def __get__(self):
        # Upper-case the `string` of every token, concatenate the pieces
        # with no separator, then strip whitespace from both ends.
        pieces = []
        for token in self:
            pieces.append(token.string.upper())
        joined = ''.join(pieces)
        return joined.strip()
property lower_:
    def __get__(self):
        # Lower-case the `string` of every token, concatenate the pieces
        # with no separator, then strip whitespace from both ends.
        pieces = []
        for token in self:
            pieces.append(token.string.lower())
        joined = ''.join(pieces)
        return joined.strip()
property string:
    def __get__(self):
        # Concatenate the `string` of every token in the span, in order,
        # with no separator; unlike upper_/lower_, nothing is stripped.
        pieces = []
        for token in self:
            pieces.append(token.string)
        return ''.join(pieces)