2014-07-07 05:36:43 +00:00
|
|
|
# cython: profile=True
|
2014-08-20 11:39:39 +00:00
|
|
|
# cython: embedsignature=True
|
|
|
|
'''Tokenize English text, allowing some differences from the Penn Treebank
|
|
|
|
tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
|
|
|
|
compatibility is the priority.
|
2014-07-05 18:51:42 +00:00
|
|
|
'''
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
from libc.stdlib cimport malloc, calloc, free
|
|
|
|
from libc.stdint cimport uint64_t
|
2014-07-06 23:15:59 +00:00
|
|
|
from libcpp.vector cimport vector
|
2014-07-05 18:51:42 +00:00
|
|
|
|
2014-08-20 11:39:39 +00:00
|
|
|
cimport spacy
|
2014-07-07 10:47:21 +00:00
|
|
|
|
2014-07-05 18:51:42 +00:00
|
|
|
|
2014-08-20 11:39:39 +00:00
|
|
|
from spacy.orthography.latin cimport *
|
2014-08-21 01:29:15 +00:00
|
|
|
from spacy.lexeme cimport *
|
|
|
|
|
|
|
|
from .orthography.latin import *
|
|
|
|
from .lexeme import *
|
2014-08-20 11:39:39 +00:00
|
|
|
|
2014-07-05 18:51:42 +00:00
|
|
|
|
|
|
|
|
2014-07-07 10:47:21 +00:00
|
|
|
cdef class English(spacy.Language):
    # English-specific hooks on top of spacy.Language: set_orth is currently a
    # no-op, and find_split decides where a chunk of text should be cut.

    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
        # Intentionally empty: no orthographic features are set here yet.
        # The `except -1` clause reserves -1 as the error return value.
        pass

    cdef int find_split(self, unicode word):
        # Return the index at which `word` should be split.
        #
        # Rules, in priority order:
        #   1. empty input            -> 0 (nothing to split)
        #   2. leading "'s" / "'S"    -> 2 (the clitic is its own piece)
        #   3. trailing "'s"          -> length - 2 (cut just before the clitic)
        #   4. leading punctuation    -> 1 (peel off one character)
        #   5. otherwise              -> index of the first punctuation
        #      character as judged by check_punct, or `length` if none.
        #
        # NOTE(review): the caller presumably treats the result as the length
        # of the first piece -- confirm against spacy.Language.
        cdef size_t length = len(word)
        cdef int i = 0

        # Guard: check_punct indexes word[0], which is out of range for an
        # empty string. 0 is also what the unguarded code fell through to.
        if length == 0:
            return 0

        if word.startswith("'s") or word.startswith("'S"):
            return 2

        # Contractions: require at least one character before the "'s".
        if word.endswith("'s") and length >= 3:
            return length - 2

        # Leading punctuation
        if check_punct(word, 0, length):
            return 1

        # Advance past the leading run of non-punctuation characters; the
        # split point is the first punctuation character (or the end).
        while i < length and not check_punct(word, i, length):
            i += 1
        return i
|
2014-07-06 16:35:55 +00:00
|
|
|
|
2014-07-06 22:02:55 +00:00
|
|
|
|
2014-08-20 11:39:39 +00:00
|
|
|
cdef bint check_punct(unicode word, size_t i, size_t length):
    # Decide whether word[i] should be treated as a punctuation character.
    #
    # Args:
    #   word:   the string being examined.
    #   i:      index of the character to classify.
    #   length: expected to be len(word); used for the look-ahead bounds.
    #
    # Returns True when word[i] is punctuation, False otherwise. Several
    # characters are exempted depending on the character that follows them.

    # Out-of-range index (including the empty string): nothing to classify.
    # This also keeps the look-ahead tests below from relying on `length - 1`,
    # which wraps around for an unsigned size_t when length == 0.
    if i >= length:
        return False

    # Don't count apostrophes as punct if the next char is a letter,
    # unless the apostrophe is the very first character of the word.
    if word[i] == "'" and i + 1 < length and word[i+1].isalpha():
        return i == 0

    # A hyphen directly followed by another hyphen is not punct.
    if word[i] == "-" and i + 1 < length and word[i+1] == '-':
        return False

    # Don't count commas as punct if the next char is a number
    if word[i] == "," and i + 1 < length and word[i+1].isdigit():
        return False

    # Don't count periods as punct if the next char is not whitespace
    if word[i] == "." and i + 1 < length and not word[i+1].isspace():
        return False

    # Default: anything that is not alphanumeric is punctuation.
    return not word[i].isalnum()
|
2014-07-07 05:36:43 +00:00
|
|
|
|
|
|
|
|
2014-07-07 10:47:21 +00:00
|
|
|
# Shared English instance; tokenize(), lookup() and unhash() in this module
# all delegate to it.
EN = English('en')
|
|
|
|
|
|
|
|
|
|
|
|
cpdef Tokens tokenize(unicode string):
    """Split a unicode string into tokens via the module-level English instance.

    Two sources of rules determine the result:

    * the data/en/tokenization table, which covers special cases such as
      contractions;
    * the `spacy.en.English.find_split` function, which is used to split off
      punctuation etc.

    Args:
        string (unicode): The text to tokenize.

    Returns:
        tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
    """
    cdef Tokens tokens = EN.tokenize(string)
    return tokens
|
|
|
|
|
2014-08-20 11:39:39 +00:00
|
|
|
|
2014-08-21 01:29:15 +00:00
|
|
|
# +49 151 4336 2587
|
|
|
|
|
|
|
|
|
|
|
|
cpdef LexID lookup(unicode string) except 0:
    """Fetch the Lexeme ID for a string, creating the entry if it is missing.

    A LexID is really a memory address, which makes dereferencing it
    essentially free.

    Args:
        string (unicode): The string to look up. Must be unicode, not bytes.

    Returns:
        lexeme (LexID): A reference to a lexical type.
    """
    # NOTE(review): the cast names Lexeme_addr while the signature declares
    # LexID -- confirm both names refer to the same underlying type.
    cdef Lexeme_addr address = <Lexeme_addr>EN.lookup(string)
    return address
|
2014-07-07 10:47:21 +00:00
|
|
|
|
|
|
|
|
|
|
|
cpdef unicode unhash(StringHash hash_value):
    """Recover the string behind a hash value. Chiefly a testing aid.

    Working with strings is slower than the intended ID-based usage, so it
    is best avoided in general. Strings can still be recovered when needed,
    but hash collisions are not controlled for.

    Args:
        hash_value (StringHash): The hash of a string, returned by Python's
            hash() function.

    Returns:
        string (unicode): A unicode string that hashes to the hash_value.
    """
    cdef unicode recovered = EN.unhash(hash_value)
    return recovered
|