From 4e79446dc298633379fc77cd27a07d3d9b67e2e0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 7 Jul 2014 00:02:55 +0200 Subject: [PATCH] * Reading in tokenization rules correctly. Passing tests. --- spacy/en.cpp | 1395 +++++++++++++++++++++++++++++++++---------- spacy/en.pyx | 27 +- spacy/lexeme.cpp | 2 +- spacy/spacy.cpp | 2 +- spacy/util.py | 27 +- tests/test_vocab.py | 7 + 6 files changed, 1120 insertions(+), 340 deletions(-) diff --git a/spacy/en.cpp b/spacy/en.cpp index c5d249e9d..3eadb456e 100644 --- a/spacy/en.cpp +++ b/spacy/en.cpp @@ -1,4 +1,4 @@ -/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */ +/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */ #define PY_SSIZE_T_CLEAN #ifndef CYTHON_USE_PYLONG_INTERNALS @@ -701,6 +701,14 @@ static CYTHON_INLINE PyObject* __Pyx_PyObject_GetAttrStr(PyObject* obj, PyObject static PyObject *__Pyx_GetBuiltinName(PyObject *name); /*proto*/ +static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected); + +static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index); + +static CYTHON_INLINE int __Pyx_IterFinish(void); /*proto*/ + +static int __Pyx_IternextUnpackEndCheck(PyObject *retval, Py_ssize_t expected); /*proto*/ + #include static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals); /*proto*/ @@ -772,10 +780,38 @@ static CYTHON_INLINE int __Pyx_SetItemInt_Generic(PyObject *o, PyObject *j, PyOb static CYTHON_INLINE int __Pyx_SetItemInt_Fast(PyObject *o, Py_ssize_t i, PyObject *v, int is_list, int wraparound, int boundscheck); +static int __Pyx_PyUnicode_Tailmatch(PyObject* s, PyObject* substr, + Py_ssize_t start, Py_ssize_t end, int direction) { + if (unlikely(PyTuple_Check(substr))) { + Py_ssize_t i, count = PyTuple_GET_SIZE(substr); + for (i = 0; i < count; i++) { + int result; +#if CYTHON_COMPILING_IN_CPYTHON + result = PyUnicode_Tailmatch(s, PyTuple_GET_ITEM(substr, i), + start, end, direction); +#else + PyObject* sub = PySequence_GetItem(substr, i); + if (unlikely(!sub)) return -1; + result = PyUnicode_Tailmatch(s, sub, start, end, direction); + Py_DECREF(sub); +#endif + if (result) { + return result; + } + } + return 0; + } + return PyUnicode_Tailmatch(s, substr, start, end, direction); +} + static void __Pyx_WriteUnraisable(const char *name, int clineno, int lineno, const char *filename, int full_traceback); /*proto*/ +static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name); /*proto*/ + +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level); /*proto*/ + static CYTHON_INLINE int __Pyx_PyInt_As_int(PyObject *); static CYTHON_INLINE uint64_t __Pyx_PyInt_As_uint64_t(PyObject *); @@ -859,16 +895,22 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *, int, int, size_t, int __ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5spacy_6lexeme_StringHash, PyObject *, int, size_t); /*proto*/ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyObject *, __pyx_t_5spacy_6lexeme_StringHash, int, size_t); /*proto*/ static size_t __pyx_f_5spacy_2en__find_split(PyObject *, size_t); /*proto*/ +static int __pyx_f_5spacy_2en_is_punct(PyObject *, size_t, size_t); /*proto*/ #define __Pyx_MODULE_NAME "spacy.en" int __pyx_module_is_main_spacy__en = 0; /* Implementation of 'spacy.en' */ +static PyObject *__pyx_builtin_enumerate; static PyObject *__pyx_builtin_ValueError; -static PyObject *__pyx_pf_5spacy_2en_lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string); /* proto */ -static PyObject *__pyx_pf_5spacy_2en_2lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end); /* proto */ -static PyObject *__pyx_pf_5spacy_2en_4unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value); /* proto */ -static PyObject *__pyx_pf_5spacy_2en_6_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_token_rules); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_2lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_4lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_6unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value); /* proto */ +static PyObject *__pyx_pf_5spacy_2en_8_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length); /* proto */ static char __pyx_k_[] = ""; +static char __pyx_k_i[] = "i"; +static char __pyx_k_s[] = "'s"; +static char __pyx_k_en[] = "en"; static char __pyx_k_end[] = "end"; static char __pyx_k_lex[] = "lex"; static char __pyx_k_sic[] = "sic"; @@ -877,36 +919,59 @@ static char __pyx_k_main[] = "__main__"; static char __pyx_k_prob[] = "prob"; static char __pyx_k_tail[] = "tail"; static char __pyx_k_test[] = "__test__"; +static char __pyx_k_util[] = "util"; +static char __pyx_k_word[] = "word"; +static char __pyx_k_chunk[] = "chunk"; static char __pyx_k_first[] = "first"; static char __pyx_k_last3[] = "last3"; static char __pyx_k_lower[] = "lower"; +static char __pyx_k_s_d_s[] = "%s:@:%d:@:%s"; static char __pyx_k_start[] = "start"; static char __pyx_k_DIGITS[] = "!DIGITS"; +static char __pyx_k_hashed[] = "hashed"; +static char __pyx_k_import[] = "__import__"; static char __pyx_k_length[] = "length"; static char __pyx_k_normed[] = "normed"; static char __pyx_k_string[] = "string"; +static char __pyx_k_tokens[] = "tokens"; static char __pyx_k_LEXEMES[] = "LEXEMES"; static char __pyx_k_STRINGS[] = "STRINGS"; static char __pyx_k_cluster[] = "cluster"; static char __pyx_k_isdigit[] = "isdigit"; static char __pyx_k_pyx_capi[] = "__pyx_capi__"; +static char __pyx_k_spacy_en[] = "spacy.en"; +static char __pyx_k_enumerate[] = "enumerate"; static char __pyx_k_oft_title[] = "oft_title"; static char __pyx_k_oft_upper[] = "oft_upper"; static char __pyx_k_ValueError[] = "ValueError"; +static char __pyx_k_token_rules[] = "token_rules"; +static char __pyx_k_token_string[] = "token_string"; +static char __pyx_k_load_tokenization[] = "load_tokenization"; +static char __pyx_k_read_tokenization[] = "read_tokenization"; +static char __pyx_k_Users_matt_repos_spaCy_spacy_en[] = "/Users/matt/repos/spaCy/spacy/en.pyx"; static char __pyx_k_Serve_pointers_to_Lexeme_structs[] = "Serve pointers to Lexeme structs, given strings. Maintain a reverse index,\nso that strings can be retrieved from hashes. Use 64-bit hash values and\nboldly assume no collisions.\n"; +static PyObject *__pyx_n_s_; static PyObject *__pyx_kp_u_; static PyObject *__pyx_kp_u_DIGITS; static PyObject *__pyx_n_s_LEXEMES; static PyObject *__pyx_n_s_STRINGS; +static PyObject *__pyx_kp_s_Users_matt_repos_spaCy_spacy_en; static PyObject *__pyx_n_s_ValueError; static PyObject *__pyx_kp_u_YEAR; +static PyObject *__pyx_n_s_chunk; static PyObject *__pyx_n_s_cluster; +static PyObject *__pyx_n_u_en; static PyObject *__pyx_n_s_end; +static PyObject *__pyx_n_s_enumerate; static PyObject *__pyx_n_s_first; +static PyObject *__pyx_n_s_hashed; +static PyObject *__pyx_n_s_i; +static PyObject *__pyx_n_s_import; static PyObject *__pyx_n_s_isdigit; static PyObject *__pyx_n_s_last3; static PyObject *__pyx_n_s_length; static PyObject *__pyx_n_s_lex; +static PyObject *__pyx_n_s_load_tokenization; static PyObject *__pyx_n_s_lower; static PyObject *__pyx_n_s_main; static PyObject *__pyx_n_s_normed; @@ -914,21 +979,390 @@ static PyObject *__pyx_n_s_oft_title; static PyObject *__pyx_n_s_oft_upper; static PyObject *__pyx_n_s_prob; static PyObject *__pyx_n_s_pyx_capi; +static PyObject *__pyx_n_s_read_tokenization; +static PyObject *__pyx_kp_u_s; +static PyObject *__pyx_kp_u_s_d_s; static PyObject *__pyx_n_s_sic; +static PyObject *__pyx_n_s_spacy_en; static PyObject *__pyx_n_s_start; static PyObject *__pyx_n_s_string; static PyObject *__pyx_n_s_tail; static PyObject *__pyx_n_s_test; +static PyObject *__pyx_n_s_token_rules; +static PyObject *__pyx_n_s_token_string; +static PyObject *__pyx_n_s_tokens; +static PyObject *__pyx_n_s_util; +static PyObject *__pyx_n_s_word; +static PyObject *__pyx_int_0; +static PyObject *__pyx_int_1; +static PyObject *__pyx_tuple__2; +static PyObject *__pyx_tuple__4; +static PyObject *__pyx_codeobj__3; -/* "spacy/en.pyx":23 +/* "spacy/en.pyx":24 * * + * def load_tokenization(token_rules): # <<<<<<<<<<<<<< + * cdef Lexeme* word + * cdef StringHash hashed + */ + +/* Python wrapper */ +static PyObject *__pyx_pw_5spacy_2en_1load_tokenization(PyObject *__pyx_self, PyObject *__pyx_v_token_rules); /*proto*/ +static PyMethodDef __pyx_mdef_5spacy_2en_1load_tokenization = {__Pyx_NAMESTR("load_tokenization"), (PyCFunction)__pyx_pw_5spacy_2en_1load_tokenization, METH_O, __Pyx_DOCSTR(0)}; +static PyObject *__pyx_pw_5spacy_2en_1load_tokenization(PyObject *__pyx_self, PyObject *__pyx_v_token_rules) { + PyObject *__pyx_r = 0; + __Pyx_RefNannyDeclarations + __Pyx_RefNannySetupContext("load_tokenization (wrapper)", 0); + __pyx_r = __pyx_pf_5spacy_2en_load_tokenization(__pyx_self, ((PyObject *)__pyx_v_token_rules)); + + /* function exit code */ + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +static PyObject *__pyx_pf_5spacy_2en_load_tokenization(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_token_rules) { + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_v_word; + __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hashed; + PyObject *__pyx_v_chunk = NULL; + PyObject *__pyx_v_lex = NULL; + PyObject *__pyx_v_tokens = NULL; + PyObject *__pyx_v_i = NULL; + PyObject *__pyx_v_token_string = NULL; + Py_ssize_t __pyx_v_length; + PyObject *__pyx_r = NULL; + __Pyx_RefNannyDeclarations + PyObject *__pyx_t_1 = NULL; + Py_ssize_t __pyx_t_2; + PyObject *(*__pyx_t_3)(PyObject *); + PyObject *__pyx_t_4 = NULL; + PyObject *__pyx_t_5 = NULL; + PyObject *__pyx_t_6 = NULL; + PyObject *__pyx_t_7 = NULL; + PyObject *__pyx_t_8 = NULL; + PyObject *(*__pyx_t_9)(PyObject *); + Py_ssize_t __pyx_t_10; + __pyx_t_5spacy_6lexeme_StringHash __pyx_t_11; + Py_ssize_t __pyx_t_12; + struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_t_13; + PyObject *(*__pyx_t_14)(PyObject *); + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("load_tokenization", 0); + + /* "spacy/en.pyx":27 + * cdef Lexeme* word + * cdef StringHash hashed + * for chunk, lex, tokens in token_rules: # <<<<<<<<<<<<<< + * hashed = hash_string(chunk, len(chunk)) + * assert LEXEMES[hashed] == NULL + */ + if (PyList_CheckExact(__pyx_v_token_rules) || PyTuple_CheckExact(__pyx_v_token_rules)) { + __pyx_t_1 = __pyx_v_token_rules; __Pyx_INCREF(__pyx_t_1); __pyx_t_2 = 0; + __pyx_t_3 = NULL; + } else { + __pyx_t_2 = -1; __pyx_t_1 = PyObject_GetIter(__pyx_v_token_rules); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_3 = Py_TYPE(__pyx_t_1)->tp_iternext; + } + for (;;) { + if (!__pyx_t_3 && PyList_CheckExact(__pyx_t_1)) { + if (__pyx_t_2 >= PyList_GET_SIZE(__pyx_t_1)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_4 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #else + __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + } else if (!__pyx_t_3 && PyTuple_CheckExact(__pyx_t_1)) { + if (__pyx_t_2 >= PyTuple_GET_SIZE(__pyx_t_1)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_4 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_2); __Pyx_INCREF(__pyx_t_4); __pyx_t_2++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #else + __pyx_t_4 = PySequence_ITEM(__pyx_t_1, __pyx_t_2); __pyx_t_2++; if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + } else { + __pyx_t_4 = __pyx_t_3(__pyx_t_1); + if (unlikely(!__pyx_t_4)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + break; + } + __Pyx_GOTREF(__pyx_t_4); + } + if ((likely(PyTuple_CheckExact(__pyx_t_4))) || (PyList_CheckExact(__pyx_t_4))) { + PyObject* sequence = __pyx_t_4; + #if CYTHON_COMPILING_IN_CPYTHON + Py_ssize_t size = Py_SIZE(sequence); + #else + Py_ssize_t size = PySequence_Size(sequence); + #endif + if (unlikely(size != 3)) { + if (size > 3) __Pyx_RaiseTooManyValuesError(3); + else if (size >= 0) __Pyx_RaiseNeedMoreValuesError(size); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + #if CYTHON_COMPILING_IN_CPYTHON + if (likely(PyTuple_CheckExact(sequence))) { + __pyx_t_5 = PyTuple_GET_ITEM(sequence, 0); + __pyx_t_6 = PyTuple_GET_ITEM(sequence, 1); + __pyx_t_7 = PyTuple_GET_ITEM(sequence, 2); + } else { + __pyx_t_5 = PyList_GET_ITEM(sequence, 0); + __pyx_t_6 = PyList_GET_ITEM(sequence, 1); + __pyx_t_7 = PyList_GET_ITEM(sequence, 2); + } + __Pyx_INCREF(__pyx_t_5); + __Pyx_INCREF(__pyx_t_6); + __Pyx_INCREF(__pyx_t_7); + #else + __pyx_t_5 = PySequence_ITEM(sequence, 0); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_5); + __pyx_t_6 = PySequence_ITEM(sequence, 1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + __pyx_t_7 = PySequence_ITEM(sequence, 2); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_7); + #endif + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } else { + Py_ssize_t index = -1; + __pyx_t_8 = PyObject_GetIter(__pyx_t_4); if (unlikely(!__pyx_t_8)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_8); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_9 = Py_TYPE(__pyx_t_8)->tp_iternext; + index = 0; __pyx_t_5 = __pyx_t_9(__pyx_t_8); if (unlikely(!__pyx_t_5)) goto __pyx_L5_unpacking_failed; + __Pyx_GOTREF(__pyx_t_5); + index = 1; __pyx_t_6 = __pyx_t_9(__pyx_t_8); if (unlikely(!__pyx_t_6)) goto __pyx_L5_unpacking_failed; + __Pyx_GOTREF(__pyx_t_6); + index = 2; __pyx_t_7 = __pyx_t_9(__pyx_t_8); if (unlikely(!__pyx_t_7)) goto __pyx_L5_unpacking_failed; + __Pyx_GOTREF(__pyx_t_7); + if (__Pyx_IternextUnpackEndCheck(__pyx_t_9(__pyx_t_8), 3) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_9 = NULL; + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + goto __pyx_L6_unpacking_done; + __pyx_L5_unpacking_failed:; + __Pyx_DECREF(__pyx_t_8); __pyx_t_8 = 0; + __pyx_t_9 = NULL; + if (__Pyx_IterFinish() == 0) __Pyx_RaiseNeedMoreValuesError(index); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 27; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_L6_unpacking_done:; + } + __Pyx_XDECREF_SET(__pyx_v_chunk, __pyx_t_5); + __pyx_t_5 = 0; + __Pyx_XDECREF_SET(__pyx_v_lex, __pyx_t_6); + __pyx_t_6 = 0; + __Pyx_XDECREF_SET(__pyx_v_tokens, __pyx_t_7); + __pyx_t_7 = 0; + + /* "spacy/en.pyx":28 + * cdef StringHash hashed + * for chunk, lex, tokens in token_rules: + * hashed = hash_string(chunk, len(chunk)) # <<<<<<<<<<<<<< + * assert LEXEMES[hashed] == NULL + * word = _add(hashed, lex, len(lex), len(lex)) + */ + if (!(likely(PyUnicode_CheckExact(__pyx_v_chunk))||((__pyx_v_chunk) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_v_chunk)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_10 = PyObject_Length(__pyx_v_chunk); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_11 = __pyx_f_5spacy_2en_hash_string(((PyObject*)__pyx_v_chunk), __pyx_t_10); if (unlikely(__pyx_t_11 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 28; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_hashed = __pyx_t_11; + + /* "spacy/en.pyx":29 + * for chunk, lex, tokens in token_rules: + * hashed = hash_string(chunk, len(chunk)) + * assert LEXEMES[hashed] == NULL # <<<<<<<<<<<<<< + * word = _add(hashed, lex, len(lex), len(lex)) + * for i, lex in enumerate(tokens): + */ + #ifndef CYTHON_WITHOUT_ASSERTIONS + if (unlikely(!Py_OptimizeFlag)) { + if (unlikely(!(((__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]) == NULL) != 0))) { + PyErr_SetNone(PyExc_AssertionError); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 29; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + } + #endif + + /* "spacy/en.pyx":30 + * hashed = hash_string(chunk, len(chunk)) + * assert LEXEMES[hashed] == NULL + * word = _add(hashed, lex, len(lex), len(lex)) # <<<<<<<<<<<<<< + * for i, lex in enumerate(tokens): + * token_string = '%s:@:%d:@:%s' % (chunk, i, lex) + */ + if (!(likely(PyUnicode_CheckExact(__pyx_v_lex))||((__pyx_v_lex) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_v_lex)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_10 = PyObject_Length(__pyx_v_lex); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_12 = PyObject_Length(__pyx_v_lex); if (unlikely(__pyx_t_12 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_13 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, ((PyObject*)__pyx_v_lex), __pyx_t_10, __pyx_t_12); if (unlikely(__pyx_t_13 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 30; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_word = __pyx_t_13; + + /* "spacy/en.pyx":31 + * assert LEXEMES[hashed] == NULL + * word = _add(hashed, lex, len(lex), len(lex)) + * for i, lex in enumerate(tokens): # <<<<<<<<<<<<<< + * token_string = '%s:@:%d:@:%s' % (chunk, i, lex) + * length = len(token_string) + */ + __Pyx_INCREF(__pyx_int_0); + __pyx_t_4 = __pyx_int_0; + if (PyList_CheckExact(__pyx_v_tokens) || PyTuple_CheckExact(__pyx_v_tokens)) { + __pyx_t_7 = __pyx_v_tokens; __Pyx_INCREF(__pyx_t_7); __pyx_t_12 = 0; + __pyx_t_14 = NULL; + } else { + __pyx_t_12 = -1; __pyx_t_7 = PyObject_GetIter(__pyx_v_tokens); if (unlikely(!__pyx_t_7)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_7); + __pyx_t_14 = Py_TYPE(__pyx_t_7)->tp_iternext; + } + for (;;) { + if (!__pyx_t_14 && PyList_CheckExact(__pyx_t_7)) { + if (__pyx_t_12 >= PyList_GET_SIZE(__pyx_t_7)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_6 = PyList_GET_ITEM(__pyx_t_7, __pyx_t_12); __Pyx_INCREF(__pyx_t_6); __pyx_t_12++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #else + __pyx_t_6 = PySequence_ITEM(__pyx_t_7, __pyx_t_12); __pyx_t_12++; if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + } else if (!__pyx_t_14 && PyTuple_CheckExact(__pyx_t_7)) { + if (__pyx_t_12 >= PyTuple_GET_SIZE(__pyx_t_7)) break; + #if CYTHON_COMPILING_IN_CPYTHON + __pyx_t_6 = PyTuple_GET_ITEM(__pyx_t_7, __pyx_t_12); __Pyx_INCREF(__pyx_t_6); __pyx_t_12++; if (unlikely(0 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #else + __pyx_t_6 = PySequence_ITEM(__pyx_t_7, __pyx_t_12); __pyx_t_12++; if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + #endif + } else { + __pyx_t_6 = __pyx_t_14(__pyx_t_7); + if (unlikely(!__pyx_t_6)) { + PyObject* exc_type = PyErr_Occurred(); + if (exc_type) { + if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear(); + else {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + } + break; + } + __Pyx_GOTREF(__pyx_t_6); + } + __Pyx_DECREF_SET(__pyx_v_lex, __pyx_t_6); + __pyx_t_6 = 0; + __Pyx_INCREF(__pyx_t_4); + __Pyx_XDECREF_SET(__pyx_v_i, __pyx_t_4); + __pyx_t_6 = PyNumber_Add(__pyx_t_4, __pyx_int_1); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + __Pyx_DECREF(__pyx_t_4); + __pyx_t_4 = __pyx_t_6; + __pyx_t_6 = 0; + + /* "spacy/en.pyx":32 + * word = _add(hashed, lex, len(lex), len(lex)) + * for i, lex in enumerate(tokens): + * token_string = '%s:@:%d:@:%s' % (chunk, i, lex) # <<<<<<<<<<<<<< + * length = len(token_string) + * hashed = hash_string(token_string, length) + */ + __pyx_t_6 = PyTuple_New(3); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_6); + __Pyx_INCREF(__pyx_v_chunk); + PyTuple_SET_ITEM(__pyx_t_6, 0, __pyx_v_chunk); + __Pyx_GIVEREF(__pyx_v_chunk); + __Pyx_INCREF(__pyx_v_i); + PyTuple_SET_ITEM(__pyx_t_6, 1, __pyx_v_i); + __Pyx_GIVEREF(__pyx_v_i); + __Pyx_INCREF(__pyx_v_lex); + PyTuple_SET_ITEM(__pyx_t_6, 2, __pyx_v_lex); + __Pyx_GIVEREF(__pyx_v_lex); + __pyx_t_5 = PyUnicode_Format(__pyx_kp_u_s_d_s, __pyx_t_6); if (unlikely(!__pyx_t_5)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 32; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_5); + __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; + __Pyx_XDECREF_SET(__pyx_v_token_string, ((PyObject*)__pyx_t_5)); + __pyx_t_5 = 0; + + /* "spacy/en.pyx":33 + * for i, lex in enumerate(tokens): + * token_string = '%s:@:%d:@:%s' % (chunk, i, lex) + * length = len(token_string) # <<<<<<<<<<<<<< + * hashed = hash_string(token_string, length) + * word.tail = _add(hashed, lex, 0, len(lex)) + */ + __pyx_t_10 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_token_string); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_length = __pyx_t_10; + + /* "spacy/en.pyx":34 + * token_string = '%s:@:%d:@:%s' % (chunk, i, lex) + * length = len(token_string) + * hashed = hash_string(token_string, length) # <<<<<<<<<<<<<< + * word.tail = _add(hashed, lex, 0, len(lex)) + * word = word.tail + */ + __pyx_t_11 = __pyx_f_5spacy_2en_hash_string(__pyx_v_token_string, __pyx_v_length); if (unlikely(__pyx_t_11 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_hashed = __pyx_t_11; + + /* "spacy/en.pyx":35 + * length = len(token_string) + * hashed = hash_string(token_string, length) + * word.tail = _add(hashed, lex, 0, len(lex)) # <<<<<<<<<<<<<< + * word = word.tail + * + */ + if (!(likely(PyUnicode_CheckExact(__pyx_v_lex))||((__pyx_v_lex) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_v_lex)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_10 = PyObject_Length(__pyx_v_lex); if (unlikely(__pyx_t_10 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_13 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, ((PyObject*)__pyx_v_lex), 0, __pyx_t_10); if (unlikely(__pyx_t_13 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 35; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_v_word->tail = __pyx_t_13; + + /* "spacy/en.pyx":36 + * hashed = hash_string(token_string, length) + * word.tail = _add(hashed, lex, 0, len(lex)) + * word = word.tail # <<<<<<<<<<<<<< + * + * + */ + __pyx_t_13 = __pyx_v_word->tail; + __pyx_v_word = __pyx_t_13; + } + __Pyx_DECREF(__pyx_t_7); __pyx_t_7 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + } + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + + /* "spacy/en.pyx":24 + * + * + * def load_tokenization(token_rules): # <<<<<<<<<<<<<< + * cdef Lexeme* word + * cdef StringHash hashed + */ + + /* function exit code */ + __pyx_r = Py_None; __Pyx_INCREF(Py_None); + goto __pyx_L0; + __pyx_L1_error:; + __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_4); + __Pyx_XDECREF(__pyx_t_5); + __Pyx_XDECREF(__pyx_t_6); + __Pyx_XDECREF(__pyx_t_7); + __Pyx_XDECREF(__pyx_t_8); + __Pyx_AddTraceback("spacy.en.load_tokenization", __pyx_clineno, __pyx_lineno, __pyx_filename); + __pyx_r = NULL; + __pyx_L0:; + __Pyx_XDECREF(__pyx_v_chunk); + __Pyx_XDECREF(__pyx_v_lex); + __Pyx_XDECREF(__pyx_v_tokens); + __Pyx_XDECREF(__pyx_v_i); + __Pyx_XDECREF(__pyx_v_token_string); + __Pyx_XGIVEREF(__pyx_r); + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + +/* "spacy/en.pyx":41 + * load_tokenization(util.read_tokenization('en')) + * * cpdef Lexeme_addr lookup(unicode string) except 0: # <<<<<<<<<<<<<< * '''.. function:: enumerate(sequence[, start=0]) * Fetch a Lexeme representing a word string. If the word has not been seen, */ -static PyObject *__pyx_pw_5spacy_2en_1lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_3lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_v_string, CYTHON_UNUSED int __pyx_skip_dispatch) { size_t __pyx_v_length; __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hashed; @@ -945,18 +1379,18 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("lookup", 0); - /* "spacy/en.pyx":31 + /* "spacy/en.pyx":49 * To specify the boundaries of the word if it has not been seen, use lookup_chunk. * ''' * if string == '': # <<<<<<<<<<<<<< * return &BLANK_WORD * cdef size_t length = len(string) */ - __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 49; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_2 = (__pyx_t_1 != 0); if (__pyx_t_2) { - /* "spacy/en.pyx":32 + /* "spacy/en.pyx":50 * ''' * if string == '': * return &BLANK_WORD # <<<<<<<<<<<<<< @@ -967,7 +1401,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ goto __pyx_L0; } - /* "spacy/en.pyx":33 + /* "spacy/en.pyx":51 * if string == '': * return &BLANK_WORD * cdef size_t length = len(string) # <<<<<<<<<<<<<< @@ -976,22 +1410,22 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ */ if (unlikely(__pyx_v_string == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 33; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 51; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_length = __pyx_t_3; - /* "spacy/en.pyx":34 + /* "spacy/en.pyx":52 * return &BLANK_WORD * cdef size_t length = len(string) * cdef StringHash hashed = hash_string(string, length) # <<<<<<<<<<<<<< * cdef Lexeme* word_ptr = LEXEMES[hashed] * cdef size_t n */ - __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 34; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_hashed = __pyx_t_4; - /* "spacy/en.pyx":35 + /* "spacy/en.pyx":53 * cdef size_t length = len(string) * cdef StringHash hashed = hash_string(string, length) * cdef Lexeme* word_ptr = LEXEMES[hashed] # <<<<<<<<<<<<<< @@ -1000,7 +1434,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ */ __pyx_v_word_ptr = (__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]); - /* "spacy/en.pyx":37 + /* "spacy/en.pyx":55 * cdef Lexeme* word_ptr = LEXEMES[hashed] * cdef size_t n * if word_ptr == NULL: # <<<<<<<<<<<<<< @@ -1010,20 +1444,20 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ __pyx_t_2 = ((__pyx_v_word_ptr == NULL) != 0); if (__pyx_t_2) { - /* "spacy/en.pyx":38 + /* "spacy/en.pyx":56 * cdef size_t n * if word_ptr == NULL: * word_ptr = _add(hashed, string, _find_split(string, length), length) # <<<<<<<<<<<<<< * return word_ptr * */ - __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_f_5spacy_2en__find_split(__pyx_v_string, __pyx_v_length), __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 38; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_f_5spacy_2en__find_split(__pyx_v_string, __pyx_v_length), __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word_ptr = __pyx_t_5; goto __pyx_L4; } __pyx_L4:; - /* "spacy/en.pyx":39 + /* "spacy/en.pyx":57 * if word_ptr == NULL: * word_ptr = _add(hashed, string, _find_split(string, length), length) * return word_ptr # <<<<<<<<<<<<<< @@ -1033,8 +1467,8 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ __pyx_r = ((__pyx_t_5spacy_2en_Lexeme_addr)__pyx_v_word_ptr); goto __pyx_L0; - /* "spacy/en.pyx":23 - * + /* "spacy/en.pyx":41 + * load_tokenization(util.read_tokenization('en')) * * cpdef Lexeme_addr lookup(unicode string) except 0: # <<<<<<<<<<<<<< * '''.. function:: enumerate(sequence[, start=0]) @@ -1051,17 +1485,17 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup(PyObject *__pyx_ } /* Python wrapper */ -static PyObject *__pyx_pw_5spacy_2en_1lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ -static char __pyx_doc_5spacy_2en_lookup[] = ".. function:: enumerate(sequence[, start=0])\n Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, splitting off any attached punctuation or clitics. A\n reference to BLANK_WORD is returned for the empty string.\n \n To specify the boundaries of the word if it has not been seen, use lookup_chunk.\n "; -static PyObject *__pyx_pw_5spacy_2en_1lookup(PyObject *__pyx_self, PyObject *__pyx_v_string) { +static PyObject *__pyx_pw_5spacy_2en_3lookup(PyObject *__pyx_self, PyObject *__pyx_v_string); /*proto*/ +static char __pyx_doc_5spacy_2en_2lookup[] = ".. function:: enumerate(sequence[, start=0])\n Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, splitting off any attached punctuation or clitics. A\n reference to BLANK_WORD is returned for the empty string.\n \n To specify the boundaries of the word if it has not been seen, use lookup_chunk.\n "; +static PyObject *__pyx_pw_5spacy_2en_3lookup(PyObject *__pyx_self, PyObject *__pyx_v_string) { CYTHON_UNUSED int __pyx_lineno = 0; CYTHON_UNUSED const char *__pyx_filename = NULL; CYTHON_UNUSED int __pyx_clineno = 0; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("lookup (wrapper)", 0); - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_r = __pyx_pf_5spacy_2en_lookup(__pyx_self, ((PyObject*)__pyx_v_string)); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_2lookup(__pyx_self, ((PyObject*)__pyx_v_string)); /* function exit code */ goto __pyx_L0; @@ -1072,7 +1506,7 @@ static PyObject *__pyx_pw_5spacy_2en_1lookup(PyObject *__pyx_self, PyObject *__p return __pyx_r; } -static PyObject *__pyx_pf_5spacy_2en_lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string) { +static PyObject *__pyx_pf_5spacy_2en_2lookup(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __pyx_t_5spacy_2en_Lexeme_addr __pyx_t_1; @@ -1082,8 +1516,8 @@ static PyObject *__pyx_pf_5spacy_2en_lookup(CYTHON_UNUSED PyObject *__pyx_self, int __pyx_clineno = 0; __Pyx_RefNannySetupContext("lookup", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_f_5spacy_2en_lookup(__pyx_v_string, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 23; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __pyx_f_5spacy_2en_lookup(__pyx_v_string, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 41; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_r = __pyx_t_2; __pyx_t_2 = 0; @@ -1100,7 +1534,7 @@ static PyObject *__pyx_pf_5spacy_2en_lookup(CYTHON_UNUSED PyObject *__pyx_self, return __pyx_r; } -/* "spacy/en.pyx":42 +/* "spacy/en.pyx":60 * * * cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0: # <<<<<<<<<<<<<< @@ -1108,7 +1542,7 @@ static PyObject *__pyx_pf_5spacy_2en_lookup(CYTHON_UNUSED PyObject *__pyx_self, * construct one, given the specified start and end indices. A negative index */ -static PyObject *__pyx_pw_5spacy_2en_3lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_5lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject *__pyx_v_string, int __pyx_v_start, CYTHON_UNUSED int __pyx_v_end, CYTHON_UNUSED int __pyx_skip_dispatch) { size_t __pyx_v_length; __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hashed; @@ -1125,18 +1559,18 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * int __pyx_clineno = 0; __Pyx_RefNannySetupContext("lookup_chunk", 0); - /* "spacy/en.pyx":50 + /* "spacy/en.pyx":68 * A reference to BLANK_WORD is returned for the empty string. * ''' * if string == '': # <<<<<<<<<<<<<< * return &BLANK_WORD * cdef size_t length = len(string) */ - __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 50; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = (__Pyx_PyUnicode_Equals(__pyx_v_string, __pyx_kp_u_, Py_EQ)); if (unlikely(__pyx_t_1 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 68; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_2 = (__pyx_t_1 != 0); if (__pyx_t_2) { - /* "spacy/en.pyx":51 + /* "spacy/en.pyx":69 * ''' * if string == '': * return &BLANK_WORD # <<<<<<<<<<<<<< @@ -1147,7 +1581,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * goto __pyx_L0; } - /* "spacy/en.pyx":52 + /* "spacy/en.pyx":70 * if string == '': * return &BLANK_WORD * cdef size_t length = len(string) # <<<<<<<<<<<<<< @@ -1156,22 +1590,22 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * */ if (unlikely(__pyx_v_string == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_string); if (unlikely(__pyx_t_3 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_length = __pyx_t_3; - /* "spacy/en.pyx":53 + /* "spacy/en.pyx":71 * return &BLANK_WORD * cdef size_t length = len(string) * cdef StringHash hashed = hash_string(string, length) # <<<<<<<<<<<<<< * cdef Lexeme* chunk_ptr = LEXEMES[hashed] * if chunk_ptr == NULL: */ - __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 53; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __pyx_f_5spacy_2en_hash_string(__pyx_v_string, __pyx_v_length); if (unlikely(__pyx_t_4 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 71; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_hashed = __pyx_t_4; - /* "spacy/en.pyx":54 + /* "spacy/en.pyx":72 * cdef size_t length = len(string) * cdef StringHash hashed = hash_string(string, length) * cdef Lexeme* chunk_ptr = LEXEMES[hashed] # <<<<<<<<<<<<<< @@ -1180,7 +1614,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * */ __pyx_v_chunk_ptr = (__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]); - /* "spacy/en.pyx":55 + /* "spacy/en.pyx":73 * cdef StringHash hashed = hash_string(string, length) * cdef Lexeme* chunk_ptr = LEXEMES[hashed] * if chunk_ptr == NULL: # <<<<<<<<<<<<<< @@ -1190,20 +1624,20 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * __pyx_t_2 = ((__pyx_v_chunk_ptr == NULL) != 0); if (__pyx_t_2) { - /* "spacy/en.pyx":56 + /* "spacy/en.pyx":74 * cdef Lexeme* chunk_ptr = LEXEMES[hashed] * if chunk_ptr == NULL: * chunk_ptr = _add(hashed, string, start, length) # <<<<<<<<<<<<<< * return chunk_ptr * */ - __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_v_start, __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_5 = __pyx_f_5spacy_2en__add(__pyx_v_hashed, __pyx_v_string, __pyx_v_start, __pyx_v_length); if (unlikely(__pyx_t_5 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 74; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_chunk_ptr = __pyx_t_5; goto __pyx_L4; } __pyx_L4:; - /* "spacy/en.pyx":57 + /* "spacy/en.pyx":75 * if chunk_ptr == NULL: * chunk_ptr = _add(hashed, string, start, length) * return chunk_ptr # <<<<<<<<<<<<<< @@ -1213,7 +1647,7 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * __pyx_r = ((__pyx_t_5spacy_2en_Lexeme_addr)__pyx_v_chunk_ptr); goto __pyx_L0; - /* "spacy/en.pyx":42 + /* "spacy/en.pyx":60 * * * cpdef Lexeme_addr lookup_chunk(unicode string, int start, int end) except 0: # <<<<<<<<<<<<<< @@ -1231,9 +1665,9 @@ static __pyx_t_5spacy_2en_Lexeme_addr __pyx_f_5spacy_2en_lookup_chunk(PyObject * } /* Python wrapper */ -static PyObject *__pyx_pw_5spacy_2en_3lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static char __pyx_doc_5spacy_2en_2lookup_chunk[] = "Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, given the specified start and end indices. A negative index\n significes 0 for start, and the string length for end --- i.e. the string\n will not be sliced if start == -1 and end == -1.\n \n A reference to BLANK_WORD is returned for the empty string.\n "; -static PyObject *__pyx_pw_5spacy_2en_3lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { +static PyObject *__pyx_pw_5spacy_2en_5lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static char __pyx_doc_5spacy_2en_4lookup_chunk[] = "Fetch a Lexeme representing a word string. If the word has not been seen,\n construct one, given the specified start and end indices. A negative index\n significes 0 for start, and the string length for end --- i.e. the string\n will not be sliced if start == -1 and end == -1.\n \n A reference to BLANK_WORD is returned for the empty string.\n "; +static PyObject *__pyx_pw_5spacy_2en_5lookup_chunk(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_string = 0; int __pyx_v_start; int __pyx_v_end; @@ -1264,16 +1698,16 @@ static PyObject *__pyx_pw_5spacy_2en_3lookup_chunk(PyObject *__pyx_self, PyObjec case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_start)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_end)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "lookup_chunk") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "lookup_chunk") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 3) { goto __pyx_L5_argtuple_error; @@ -1283,19 +1717,19 @@ static PyObject *__pyx_pw_5spacy_2en_3lookup_chunk(PyObject *__pyx_self, PyObjec values[2] = PyTuple_GET_ITEM(__pyx_args, 2); } __pyx_v_string = ((PyObject*)values[0]); - __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("lookup_chunk", 1, 3, 3, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("spacy.en.lookup_chunk", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_r = __pyx_pf_5spacy_2en_2lookup_chunk(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_4lookup_chunk(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end); /* function exit code */ goto __pyx_L0; @@ -1306,7 +1740,7 @@ static PyObject *__pyx_pw_5spacy_2en_3lookup_chunk(PyObject *__pyx_self, PyObjec return __pyx_r; } -static PyObject *__pyx_pf_5spacy_2en_2lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end) { +static PyObject *__pyx_pf_5spacy_2en_4lookup_chunk(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations __pyx_t_5spacy_2en_Lexeme_addr __pyx_t_1; @@ -1316,8 +1750,8 @@ static PyObject *__pyx_pf_5spacy_2en_2lookup_chunk(CYTHON_UNUSED PyObject *__pyx int __pyx_clineno = 0; __Pyx_RefNannySetupContext("lookup_chunk", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_f_5spacy_2en_lookup_chunk(__pyx_v_string, __pyx_v_start, __pyx_v_end, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 42; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __pyx_f_5spacy_2en_lookup_chunk(__pyx_v_string, __pyx_v_start, __pyx_v_end, 0); if (unlikely(__pyx_t_1 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_FromSize_t(__pyx_t_1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 60; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __pyx_r = __pyx_t_2; __pyx_t_2 = 0; @@ -1334,7 +1768,7 @@ static PyObject *__pyx_pf_5spacy_2en_2lookup_chunk(CYTHON_UNUSED PyObject *__pyx return __pyx_r; } -/* "spacy/en.pyx":60 +/* "spacy/en.pyx":78 * * * cdef StringHash hash_string(unicode s, size_t length) except 0: # <<<<<<<<<<<<<< @@ -1351,7 +1785,7 @@ static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject int __pyx_clineno = 0; __Pyx_RefNannySetupContext("hash_string", 0); - /* "spacy/en.pyx":62 + /* "spacy/en.pyx":80 * cdef StringHash hash_string(unicode s, size_t length) except 0: * '''Hash unicode with MurmurHash64A''' * assert length # <<<<<<<<<<<<<< @@ -1362,23 +1796,23 @@ static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!(__pyx_v_length != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 62; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 80; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":63 + /* "spacy/en.pyx":81 * '''Hash unicode with MurmurHash64A''' * assert length * return MurmurHash64A(s, length * sizeof(Py_UNICODE), 0) # <<<<<<<<<<<<<< * * */ - __pyx_t_1 = __Pyx_PyUnicode_AsUnicode(__pyx_v_s); if (unlikely((!__pyx_t_1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 63; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyUnicode_AsUnicode(__pyx_v_s); if (unlikely((!__pyx_t_1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 81; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = MurmurHash64A(((__pyx_t_5spacy_2en_string_ptr)__pyx_t_1), (__pyx_v_length * (sizeof(Py_UNICODE))), 0); goto __pyx_L0; - /* "spacy/en.pyx":60 + /* "spacy/en.pyx":78 * * * cdef StringHash hash_string(unicode s, size_t length) except 0: # <<<<<<<<<<<<<< @@ -1395,7 +1829,7 @@ static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject return __pyx_r; } -/* "spacy/en.pyx":66 +/* "spacy/en.pyx":84 * * * cpdef unicode unhash(StringHash hash_value): # <<<<<<<<<<<<<< @@ -1403,7 +1837,7 @@ static __pyx_t_5spacy_6lexeme_StringHash __pyx_f_5spacy_2en_hash_string(PyObject * cdef string_ptr string = STRINGS[hash_value] */ -static PyObject *__pyx_pw_5spacy_2en_5unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_7unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value, CYTHON_UNUSED int __pyx_skip_dispatch) { __pyx_t_5spacy_2en_string_ptr __pyx_v_string; PyObject *__pyx_r = NULL; @@ -1417,23 +1851,23 @@ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __p int __pyx_clineno = 0; __Pyx_RefNannySetupContext("unhash", 0); - /* "spacy/en.pyx":68 + /* "spacy/en.pyx":86 * cpdef unicode unhash(StringHash hash_value): * '''Fetch a string from the reverse index, given its hash value.''' * cdef string_ptr string = STRINGS[hash_value] # <<<<<<<<<<<<<< * if string == NULL: * raise ValueError(hash_value) */ - __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 68; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 86; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_GetItemInt(__pyx_t_1, __pyx_v_hash_value, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 68; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_2 = __Pyx_GetItemInt(__pyx_t_1, __pyx_v_hash_value, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 86; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_3 = __Pyx_PyUnicode_AsUnicode(__pyx_t_2); if (unlikely((!__pyx_t_3) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 68; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyUnicode_AsUnicode(__pyx_t_2); if (unlikely((!__pyx_t_3) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 86; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; __pyx_v_string = __pyx_t_3; - /* "spacy/en.pyx":69 + /* "spacy/en.pyx":87 * '''Fetch a string from the reverse index, given its hash value.''' * cdef string_ptr string = STRINGS[hash_value] * if string == NULL: # <<<<<<<<<<<<<< @@ -1443,29 +1877,29 @@ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __p __pyx_t_4 = ((__pyx_v_string == NULL) != 0); if (__pyx_t_4) { - /* "spacy/en.pyx":70 + /* "spacy/en.pyx":88 * cdef string_ptr string = STRINGS[hash_value] * if string == NULL: * raise ValueError(hash_value) # <<<<<<<<<<<<<< * * return string */ - __pyx_t_2 = __Pyx_PyInt_From_uint64_t(__pyx_v_hash_value); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyInt_From_uint64_t(__pyx_v_hash_value); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_t_2); __Pyx_GIVEREF(__pyx_t_2); __pyx_t_2 = 0; - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_builtin_ValueError, __pyx_t_1, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; __Pyx_Raise(__pyx_t_2, 0, 0, 0); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /* "spacy/en.pyx":72 + /* "spacy/en.pyx":90 * raise ValueError(hash_value) * * return string # <<<<<<<<<<<<<< @@ -1473,14 +1907,14 @@ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __p * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_2 = __Pyx_PyUnicode_FromUnicode(__pyx_v_string); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 72; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyUnicode_FromUnicode(__pyx_v_string); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - if (!(likely(PyUnicode_CheckExact(__pyx_t_2))||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_2)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 72; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(PyUnicode_CheckExact(__pyx_t_2))||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_2)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = ((PyObject*)__pyx_t_2); __pyx_t_2 = 0; goto __pyx_L0; - /* "spacy/en.pyx":66 + /* "spacy/en.pyx":84 * * * cpdef unicode unhash(StringHash hash_value): # <<<<<<<<<<<<<< @@ -1501,9 +1935,9 @@ static PyObject *__pyx_f_5spacy_2en_unhash(__pyx_t_5spacy_6lexeme_StringHash __p } /* Python wrapper */ -static PyObject *__pyx_pw_5spacy_2en_5unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ -static char __pyx_doc_5spacy_2en_4unhash[] = "Fetch a string from the reverse index, given its hash value."; -static PyObject *__pyx_pw_5spacy_2en_5unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value) { +static PyObject *__pyx_pw_5spacy_2en_7unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value); /*proto*/ +static char __pyx_doc_5spacy_2en_6unhash[] = "Fetch a string from the reverse index, given its hash value."; +static PyObject *__pyx_pw_5spacy_2en_7unhash(PyObject *__pyx_self, PyObject *__pyx_arg_hash_value) { __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value; int __pyx_lineno = 0; const char *__pyx_filename = NULL; @@ -1512,7 +1946,7 @@ static PyObject *__pyx_pw_5spacy_2en_5unhash(PyObject *__pyx_self, PyObject *__p __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("unhash (wrapper)", 0); assert(__pyx_arg_hash_value); { - __pyx_v_hash_value = __Pyx_PyInt_As_uint64_t(__pyx_arg_hash_value); if (unlikely((__pyx_v_hash_value == (uint64_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 66; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_hash_value = __Pyx_PyInt_As_uint64_t(__pyx_arg_hash_value); if (unlikely((__pyx_v_hash_value == (uint64_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L3_error:; @@ -1520,14 +1954,14 @@ static PyObject *__pyx_pw_5spacy_2en_5unhash(PyObject *__pyx_self, PyObject *__p __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - __pyx_r = __pyx_pf_5spacy_2en_4unhash(__pyx_self, ((__pyx_t_5spacy_6lexeme_StringHash)__pyx_v_hash_value)); + __pyx_r = __pyx_pf_5spacy_2en_6unhash(__pyx_self, ((__pyx_t_5spacy_6lexeme_StringHash)__pyx_v_hash_value)); /* function exit code */ __Pyx_RefNannyFinishContext(); return __pyx_r; } -static PyObject *__pyx_pf_5spacy_2en_4unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value) { +static PyObject *__pyx_pf_5spacy_2en_6unhash(CYTHON_UNUSED PyObject *__pyx_self, __pyx_t_5spacy_6lexeme_StringHash __pyx_v_hash_value) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; @@ -1536,7 +1970,7 @@ static PyObject *__pyx_pf_5spacy_2en_4unhash(CYTHON_UNUSED PyObject *__pyx_self, int __pyx_clineno = 0; __Pyx_RefNannySetupContext("unhash", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_f_5spacy_2en_unhash(__pyx_v_hash_value, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 66; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __pyx_f_5spacy_2en_unhash(__pyx_v_hash_value, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -1553,7 +1987,7 @@ static PyObject *__pyx_pf_5spacy_2en_4unhash(CYTHON_UNUSED PyObject *__pyx_self, return __pyx_r; } -/* "spacy/en.pyx":75 +/* "spacy/en.pyx":93 * * * cdef unicode normalize_word_string(unicode word): # <<<<<<<<<<<<<< @@ -1576,26 +2010,26 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word int __pyx_clineno = 0; __Pyx_RefNannySetupContext("normalize_word_string", 0); - /* "spacy/en.pyx":82 + /* "spacy/en.pyx":100 * ''' * cdef unicode s * if word.isdigit() and len(word) == 4: # <<<<<<<<<<<<<< * return '!YEAR' * elif word[0].isdigit(): */ - __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_isdigit); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_isdigit); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); - __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_Call(__pyx_t_1, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; - __pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_3 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_PyObject_IsTrue(__pyx_t_2); if (unlikely(__pyx_t_3 < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; if (__pyx_t_3) { if (unlikely(__pyx_v_word == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_4 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_word); if (unlikely(__pyx_t_4 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 82; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_word); if (unlikely(__pyx_t_4 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 100; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_t_5 = (__pyx_t_4 == 4); __pyx_t_6 = __pyx_t_5; } else { @@ -1603,7 +2037,7 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word } if (__pyx_t_6) { - /* "spacy/en.pyx":83 + /* "spacy/en.pyx":101 * cdef unicode s * if word.isdigit() and len(word) == 4: * return '!YEAR' # <<<<<<<<<<<<<< @@ -1616,18 +2050,18 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word goto __pyx_L0; } - /* "spacy/en.pyx":84 + /* "spacy/en.pyx":102 * if word.isdigit() and len(word) == 4: * return '!YEAR' * elif word[0].isdigit(): # <<<<<<<<<<<<<< * return '!DIGITS' * else: */ - __pyx_t_7 = __Pyx_GetItemInt_Unicode(__pyx_v_word, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_7 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 84; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_7 = __Pyx_GetItemInt_Unicode(__pyx_v_word, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_7 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 102; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; __pyx_t_6 = Py_UNICODE_ISDIGIT(__pyx_t_7); if ((__pyx_t_6 != 0)) { - /* "spacy/en.pyx":85 + /* "spacy/en.pyx":103 * return '!YEAR' * elif word[0].isdigit(): * return '!DIGITS' # <<<<<<<<<<<<<< @@ -1641,7 +2075,7 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word } /*else*/ { - /* "spacy/en.pyx":87 + /* "spacy/en.pyx":105 * return '!DIGITS' * else: * return word.lower() # <<<<<<<<<<<<<< @@ -1649,18 +2083,18 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word * */ __Pyx_XDECREF(__pyx_r); - __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_lower); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __Pyx_PyObject_GetAttrStr(__pyx_v_word, __pyx_n_s_lower); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); - __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_empty_tuple, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - if (!(likely(PyUnicode_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 87; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (!(likely(PyUnicode_CheckExact(__pyx_t_1))||((__pyx_t_1) == Py_None)||(PyErr_Format(PyExc_TypeError, "Expected %.16s, got %.200s", "unicode", Py_TYPE(__pyx_t_1)->tp_name), 0))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_r = ((PyObject*)__pyx_t_1); __pyx_t_1 = 0; goto __pyx_L0; } - /* "spacy/en.pyx":75 + /* "spacy/en.pyx":93 * * * cdef unicode normalize_word_string(unicode word): # <<<<<<<<<<<<<< @@ -1680,7 +2114,7 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word return __pyx_r; } -/* "spacy/en.pyx":90 +/* "spacy/en.pyx":108 * * * cpdef unicode _substr(unicode string, int start, int end, size_t length): # <<<<<<<<<<<<<< @@ -1688,7 +2122,7 @@ static PyObject *__pyx_f_5spacy_2en_normalize_word_string(PyObject *__pyx_v_word * end = -1 */ -static PyObject *__pyx_pw_5spacy_2en_7_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_9_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length, CYTHON_UNUSED int __pyx_skip_dispatch) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations @@ -1701,7 +2135,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_substr", 0); - /* "spacy/en.pyx":91 + /* "spacy/en.pyx":109 * * cpdef unicode _substr(unicode string, int start, int end, size_t length): * if end >= length: # <<<<<<<<<<<<<< @@ -1711,7 +2145,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __pyx_t_1 = ((__pyx_v_end >= __pyx_v_length) != 0); if (__pyx_t_1) { - /* "spacy/en.pyx":92 + /* "spacy/en.pyx":110 * cpdef unicode _substr(unicode string, int start, int end, size_t length): * if end >= length: * end = -1 # <<<<<<<<<<<<<< @@ -1723,7 +2157,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } __pyx_L3:; - /* "spacy/en.pyx":93 + /* "spacy/en.pyx":111 * if end >= length: * end = -1 * if start >= length: # <<<<<<<<<<<<<< @@ -1733,7 +2167,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __pyx_t_1 = ((__pyx_v_start >= __pyx_v_length) != 0); if (__pyx_t_1) { - /* "spacy/en.pyx":94 + /* "spacy/en.pyx":112 * end = -1 * if start >= length: * start = 0 # <<<<<<<<<<<<<< @@ -1745,7 +2179,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } __pyx_L4:; - /* "spacy/en.pyx":95 + /* "spacy/en.pyx":113 * if start >= length: * start = 0 * if start <= 0 and end < 0: # <<<<<<<<<<<<<< @@ -1761,7 +2195,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } if (__pyx_t_3) { - /* "spacy/en.pyx":96 + /* "spacy/en.pyx":114 * start = 0 * if start <= 0 and end < 0: * return string # <<<<<<<<<<<<<< @@ -1774,7 +2208,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ goto __pyx_L0; } - /* "spacy/en.pyx":97 + /* "spacy/en.pyx":115 * if start <= 0 and end < 0: * return string * elif start < 0: # <<<<<<<<<<<<<< @@ -1784,7 +2218,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __pyx_t_3 = ((__pyx_v_start < 0) != 0); if (__pyx_t_3) { - /* "spacy/en.pyx":98 + /* "spacy/en.pyx":116 * return string * elif start < 0: * start = 0 # <<<<<<<<<<<<<< @@ -1795,7 +2229,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ goto __pyx_L5; } - /* "spacy/en.pyx":99 + /* "spacy/en.pyx":117 * elif start < 0: * start = 0 * elif end < 0: # <<<<<<<<<<<<<< @@ -1805,7 +2239,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __pyx_t_3 = ((__pyx_v_end < 0) != 0); if (__pyx_t_3) { - /* "spacy/en.pyx":100 + /* "spacy/en.pyx":118 * start = 0 * elif end < 0: * end = length # <<<<<<<<<<<<<< @@ -1817,7 +2251,7 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } __pyx_L5:; - /* "spacy/en.pyx":101 + /* "spacy/en.pyx":119 * elif end < 0: * end = length * return string[start:end] # <<<<<<<<<<<<<< @@ -1827,15 +2261,15 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ __Pyx_XDECREF(__pyx_r); if (unlikely(__pyx_v_string == Py_None)) { PyErr_SetString(PyExc_TypeError, "'NoneType' object is not subscriptable"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 119; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_4 = __Pyx_PyUnicode_Substring(__pyx_v_string, __pyx_v_start, __pyx_v_end); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 101; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_4 = __Pyx_PyUnicode_Substring(__pyx_v_string, __pyx_v_start, __pyx_v_end); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 119; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_4); __pyx_r = ((PyObject*)__pyx_t_4); __pyx_t_4 = 0; goto __pyx_L0; - /* "spacy/en.pyx":90 + /* "spacy/en.pyx":108 * * * cpdef unicode _substr(unicode string, int start, int end, size_t length): # <<<<<<<<<<<<<< @@ -1855,8 +2289,8 @@ static PyObject *__pyx_f_5spacy_2en__substr(PyObject *__pyx_v_string, int __pyx_ } /* Python wrapper */ -static PyObject *__pyx_pw_5spacy_2en_7_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static PyObject *__pyx_pw_5spacy_2en_7_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { +static PyObject *__pyx_pw_5spacy_2en_9_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ +static PyObject *__pyx_pw_5spacy_2en_9_substr(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject *__pyx_v_string = 0; int __pyx_v_start; int __pyx_v_end; @@ -1889,21 +2323,21 @@ static PyObject *__pyx_pw_5spacy_2en_7_substr(PyObject *__pyx_self, PyObject *__ case 1: if (likely((values[1] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_start)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 1); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 2: if (likely((values[2] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_end)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 2); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } case 3: if (likely((values[3] = PyDict_GetItem(__pyx_kwds, __pyx_n_s_length)) != 0)) kw_args--; else { - __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 3); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, 3); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } if (unlikely(kw_args > 0)) { - if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "_substr") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + if (unlikely(__Pyx_ParseOptionalKeywords(__pyx_kwds, __pyx_pyargnames, 0, values, pos_args, "_substr") < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } } else if (PyTuple_GET_SIZE(__pyx_args) != 4) { goto __pyx_L5_argtuple_error; @@ -1914,20 +2348,20 @@ static PyObject *__pyx_pw_5spacy_2en_7_substr(PyObject *__pyx_self, PyObject *__ values[3] = PyTuple_GET_ITEM(__pyx_args, 3); } __pyx_v_string = ((PyObject*)values[0]); - __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} - __pyx_v_length = __Pyx_PyInt_As_size_t(values[3]); if (unlikely((__pyx_v_length == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_start = __Pyx_PyInt_As_int(values[1]); if (unlikely((__pyx_v_start == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_end = __Pyx_PyInt_As_int(values[2]); if (unlikely((__pyx_v_end == (int)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __pyx_v_length = __Pyx_PyInt_As_size_t(values[3]); if (unlikely((__pyx_v_length == (size_t)-1) && PyErr_Occurred())) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; - __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L3_error;} + __Pyx_RaiseArgtupleInvalid("_substr", 1, 4, 4, PyTuple_GET_SIZE(__pyx_args)); {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L3_error;} __pyx_L3_error:; __Pyx_AddTraceback("spacy.en._substr", __pyx_clineno, __pyx_lineno, __pyx_filename); __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_r = __pyx_pf_5spacy_2en_6_substr(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length); + if (unlikely(!__Pyx_ArgTypeTest(((PyObject *)__pyx_v_string), (&PyUnicode_Type), 1, "string", 1))) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_r = __pyx_pf_5spacy_2en_8_substr(__pyx_self, __pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length); /* function exit code */ goto __pyx_L0; @@ -1938,7 +2372,7 @@ static PyObject *__pyx_pw_5spacy_2en_7_substr(PyObject *__pyx_self, PyObject *__ return __pyx_r; } -static PyObject *__pyx_pf_5spacy_2en_6_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length) { +static PyObject *__pyx_pf_5spacy_2en_8_substr(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_string, int __pyx_v_start, int __pyx_v_end, size_t __pyx_v_length) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; @@ -1947,7 +2381,7 @@ static PyObject *__pyx_pf_5spacy_2en_6_substr(CYTHON_UNUSED PyObject *__pyx_self int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_substr", 0); __Pyx_XDECREF(__pyx_r); - __pyx_t_1 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 90; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_1 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_start, __pyx_v_end, __pyx_v_length, 0); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 108; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); __pyx_r = __pyx_t_1; __pyx_t_1 = 0; @@ -1964,7 +2398,7 @@ static PyObject *__pyx_pf_5spacy_2en_6_substr(CYTHON_UNUSED PyObject *__pyx_self return __pyx_r; } -/* "spacy/en.pyx":104 +/* "spacy/en.pyx":122 * * * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: # <<<<<<<<<<<<<< @@ -1984,7 +2418,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_add", 0); - /* "spacy/en.pyx":105 + /* "spacy/en.pyx":123 * * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: * assert string # <<<<<<<<<<<<<< @@ -1996,12 +2430,12 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp __pyx_t_1 = (__pyx_v_string != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_string) != 0); if (unlikely(!__pyx_t_1)) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 105; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 123; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":106 + /* "spacy/en.pyx":124 * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: * assert string * assert split <= length # <<<<<<<<<<<<<< @@ -2012,22 +2446,22 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!((__pyx_v_split <= __pyx_v_length) != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 106; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 124; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":107 + /* "spacy/en.pyx":125 * assert string * assert split <= length * word = _init_lexeme(string, hashed, split, length) # <<<<<<<<<<<<<< * LEXEMES[hashed] = word * STRINGS[hashed] = string */ - __pyx_t_2 = __pyx_f_5spacy_2en__init_lexeme(__pyx_v_string, __pyx_v_hashed, __pyx_v_split, __pyx_v_length); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 107; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = __pyx_f_5spacy_2en__init_lexeme(__pyx_v_string, __pyx_v_hashed, __pyx_v_split, __pyx_v_length); if (unlikely(__pyx_t_2 == NULL)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 125; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word = __pyx_t_2; - /* "spacy/en.pyx":108 + /* "spacy/en.pyx":126 * assert split <= length * word = _init_lexeme(string, hashed, split, length) * LEXEMES[hashed] = word # <<<<<<<<<<<<<< @@ -2036,19 +2470,19 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp */ (__pyx_v_5spacy_2en_LEXEMES[__pyx_v_hashed]) = __pyx_v_word; - /* "spacy/en.pyx":109 + /* "spacy/en.pyx":127 * word = _init_lexeme(string, hashed, split, length) * LEXEMES[hashed] = word * STRINGS[hashed] = string # <<<<<<<<<<<<<< * return word * */ - __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 109; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 127; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); - if (unlikely(__Pyx_SetItemInt(__pyx_t_3, __pyx_v_hashed, __pyx_v_string, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 109; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(__pyx_t_3, __pyx_v_hashed, __pyx_v_string, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 127; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0; - /* "spacy/en.pyx":110 + /* "spacy/en.pyx":128 * LEXEMES[hashed] = word * STRINGS[hashed] = string * return word # <<<<<<<<<<<<<< @@ -2058,7 +2492,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp __pyx_r = __pyx_v_word; goto __pyx_L0; - /* "spacy/en.pyx":104 + /* "spacy/en.pyx":122 * * * cdef Lexeme* _add(StringHash hashed, unicode string, int split, size_t length) except NULL: # <<<<<<<<<<<<<< @@ -2076,7 +2510,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__add(__pyx_t_5sp return __pyx_r; } -/* "spacy/en.pyx":113 +/* "spacy/en.pyx":131 * * * cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, # <<<<<<<<<<<<<< @@ -2106,7 +2540,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_init_lexeme", 0); - /* "spacy/en.pyx":115 + /* "spacy/en.pyx":133 * cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, * int split, size_t length) except NULL: * assert split <= length # <<<<<<<<<<<<<< @@ -2117,12 +2551,12 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO if (unlikely(!Py_OptimizeFlag)) { if (unlikely(!((__pyx_v_split <= __pyx_v_length) != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 115; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 133; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":116 + /* "spacy/en.pyx":134 * int split, size_t length) except NULL: * assert split <= length * cdef Lexeme* word = calloc(1, sizeof(Lexeme)) # <<<<<<<<<<<<<< @@ -2131,7 +2565,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)calloc(1, (sizeof(struct __pyx_t_5spacy_6lexeme_Lexeme)))); - /* "spacy/en.pyx":118 + /* "spacy/en.pyx":136 * cdef Lexeme* word = calloc(1, sizeof(Lexeme)) * * word.first = (string[0] if string else 0) # <<<<<<<<<<<<<< @@ -2140,14 +2574,14 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_t_2 = (__pyx_v_string != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_string) != 0); if (__pyx_t_2) { - __pyx_t_3 = __Pyx_GetItemInt_Unicode(__pyx_v_string, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_3 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 118; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_3 = __Pyx_GetItemInt_Unicode(__pyx_v_string, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_3 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 136; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; __pyx_t_1 = __pyx_t_3; } else { __pyx_t_1 = 0; } __pyx_v_word->first = ((Py_UNICODE)__pyx_t_1); - /* "spacy/en.pyx":119 + /* "spacy/en.pyx":137 * * word.first = (string[0] if string else 0) * word.sic = hashed # <<<<<<<<<<<<<< @@ -2156,7 +2590,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->sic = __pyx_v_hashed; - /* "spacy/en.pyx":123 + /* "spacy/en.pyx":141 * cdef unicode tail_string * cdef unicode lex * if split != 0 and split < length: # <<<<<<<<<<<<<< @@ -2172,26 +2606,26 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO } if (__pyx_t_5) { - /* "spacy/en.pyx":124 + /* "spacy/en.pyx":142 * cdef unicode lex * if split != 0 and split < length: * lex = _substr(string, 0, split, length) # <<<<<<<<<<<<<< * tail_string = _substr(string, split, length, length) * else: */ - __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, 0, __pyx_v_split, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 124; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, 0, __pyx_v_split, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __pyx_v_lex = ((PyObject*)__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":125 + /* "spacy/en.pyx":143 * if split != 0 and split < length: * lex = _substr(string, 0, split, length) * tail_string = _substr(string, split, length, length) # <<<<<<<<<<<<<< * else: * lex = string */ - __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_split, __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 125; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, __pyx_v_split, __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 143; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __pyx_v_tail_string = ((PyObject*)__pyx_t_6); __pyx_t_6 = 0; @@ -2199,7 +2633,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO } /*else*/ { - /* "spacy/en.pyx":127 + /* "spacy/en.pyx":145 * tail_string = _substr(string, split, length, length) * else: * lex = string # <<<<<<<<<<<<<< @@ -2209,7 +2643,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __Pyx_INCREF(__pyx_v_string); __pyx_v_lex = __pyx_v_string; - /* "spacy/en.pyx":128 + /* "spacy/en.pyx":146 * else: * lex = string * tail_string = '' # <<<<<<<<<<<<<< @@ -2221,7 +2655,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO } __pyx_L3:; - /* "spacy/en.pyx":129 + /* "spacy/en.pyx":147 * lex = string * tail_string = '' * assert lex # <<<<<<<<<<<<<< @@ -2233,36 +2667,36 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __pyx_t_5 = (__pyx_v_lex != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_lex) != 0); if (unlikely(!__pyx_t_5)) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 129; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 147; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":130 + /* "spacy/en.pyx":148 * tail_string = '' * assert lex * cdef unicode normed = normalize_word_string(lex) # <<<<<<<<<<<<<< * cdef unicode last3 = _substr(string, length - 3, length, length) * */ - __pyx_t_6 = __pyx_f_5spacy_2en_normalize_word_string(__pyx_v_lex); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 130; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_f_5spacy_2en_normalize_word_string(__pyx_v_lex); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 148; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __pyx_v_normed = ((PyObject*)__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":131 + /* "spacy/en.pyx":149 * assert lex * cdef unicode normed = normalize_word_string(lex) * cdef unicode last3 = _substr(string, length - 3, length, length) # <<<<<<<<<<<<<< * * assert normed */ - __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, (__pyx_v_length - 3), __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 131; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __pyx_f_5spacy_2en__substr(__pyx_v_string, (__pyx_v_length - 3), __pyx_v_length, __pyx_v_length, 0); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 149; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); __pyx_v_last3 = ((PyObject*)__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":133 + /* "spacy/en.pyx":151 * cdef unicode last3 = _substr(string, length - 3, length, length) * * assert normed # <<<<<<<<<<<<<< @@ -2274,12 +2708,12 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __pyx_t_5 = (__pyx_v_normed != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_normed) != 0); if (unlikely(!__pyx_t_5)) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 133; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 151; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":134 + /* "spacy/en.pyx":152 * * assert normed * assert len(normed) # <<<<<<<<<<<<<< @@ -2290,17 +2724,17 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO if (unlikely(!Py_OptimizeFlag)) { if (unlikely(__pyx_v_normed == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 134; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 152; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 134; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 152; __pyx_clineno = __LINE__; goto __pyx_L1_error;} if (unlikely(!(__pyx_t_7 != 0))) { PyErr_SetNone(PyExc_AssertionError); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 134; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 152; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } } #endif - /* "spacy/en.pyx":136 + /* "spacy/en.pyx":154 * assert len(normed) * * word.lex = hash_string(lex, len(lex)) # <<<<<<<<<<<<<< @@ -2309,13 +2743,13 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ if (unlikely(__pyx_v_lex == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 136; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 154; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_lex); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 136; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_lex, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 136; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_lex); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 154; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_lex, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 154; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word->lex = __pyx_t_8; - /* "spacy/en.pyx":137 + /* "spacy/en.pyx":155 * * word.lex = hash_string(lex, len(lex)) * word.normed = hash_string(normed, len(normed)) # <<<<<<<<<<<<<< @@ -2324,13 +2758,13 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ if (unlikely(__pyx_v_normed == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 137; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 155; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 137; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_normed, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 137; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_normed); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 155; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_normed, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 155; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word->normed = __pyx_t_8; - /* "spacy/en.pyx":138 + /* "spacy/en.pyx":156 * word.lex = hash_string(lex, len(lex)) * word.normed = hash_string(normed, len(normed)) * word.last3 = hash_string(last3, len(last3)) # <<<<<<<<<<<<<< @@ -2339,49 +2773,49 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ if (unlikely(__pyx_v_last3 == Py_None)) { PyErr_SetString(PyExc_TypeError, "object of type 'NoneType' has no len()"); - {__pyx_filename = __pyx_f[0]; __pyx_lineno = 138; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 156; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_last3); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 138; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_last3, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 138; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_7 = __Pyx_PyUnicode_GET_LENGTH(__pyx_v_last3); if (unlikely(__pyx_t_7 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 156; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_8 = __pyx_f_5spacy_2en_hash_string(__pyx_v_last3, __pyx_t_7); if (unlikely(__pyx_t_8 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 156; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word->last3 = __pyx_t_8; - /* "spacy/en.pyx":140 + /* "spacy/en.pyx":158 * word.last3 = hash_string(last3, len(last3)) * * STRINGS[word.lex] = lex # <<<<<<<<<<<<<< * STRINGS[word.normed] = normed * STRINGS[word.last3] = last3 */ - __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 140; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 158; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); - if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->lex, __pyx_v_lex, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 140; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->lex, __pyx_v_lex, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 158; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":141 + /* "spacy/en.pyx":159 * * STRINGS[word.lex] = lex * STRINGS[word.normed] = normed # <<<<<<<<<<<<<< * STRINGS[word.last3] = last3 * */ - __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 141; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 159; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); - if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->normed, __pyx_v_normed, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 141; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->normed, __pyx_v_normed, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 159; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":142 + /* "spacy/en.pyx":160 * STRINGS[word.lex] = lex * STRINGS[word.normed] = normed * STRINGS[word.last3] = last3 # <<<<<<<<<<<<<< * * # These are loaded later */ - __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_6 = __Pyx_GetModuleGlobalName(__pyx_n_s_STRINGS); if (unlikely(!__pyx_t_6)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 160; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_6); - if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->last3, __pyx_v_last3, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 142; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if (unlikely(__Pyx_SetItemInt(__pyx_t_6, __pyx_v_word->last3, __pyx_v_last3, __pyx_t_5spacy_6lexeme_StringHash, 0, __Pyx_PyInt_From_uint64_t, 0, 0, 1) < 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 160; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_DECREF(__pyx_t_6); __pyx_t_6 = 0; - /* "spacy/en.pyx":145 + /* "spacy/en.pyx":163 * * # These are loaded later * word.prob = 0 # <<<<<<<<<<<<<< @@ -2390,7 +2824,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->prob = 0.0; - /* "spacy/en.pyx":146 + /* "spacy/en.pyx":164 * # These are loaded later * word.prob = 0 * word.cluster = 0 # <<<<<<<<<<<<<< @@ -2399,7 +2833,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->cluster = 0; - /* "spacy/en.pyx":147 + /* "spacy/en.pyx":165 * word.prob = 0 * word.cluster = 0 * word.oft_upper = False # <<<<<<<<<<<<<< @@ -2408,7 +2842,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->oft_upper = 0; - /* "spacy/en.pyx":148 + /* "spacy/en.pyx":166 * word.cluster = 0 * word.oft_upper = False * word.oft_title = False # <<<<<<<<<<<<<< @@ -2417,7 +2851,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO */ __pyx_v_word->oft_title = 0; - /* "spacy/en.pyx":151 + /* "spacy/en.pyx":169 * * # Now recurse, and deal with the tail * if tail_string: # <<<<<<<<<<<<<< @@ -2427,20 +2861,20 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __pyx_t_5 = (__pyx_v_tail_string != Py_None) && (PyUnicode_GET_SIZE(__pyx_v_tail_string) != 0); if (__pyx_t_5) { - /* "spacy/en.pyx":152 + /* "spacy/en.pyx":170 * # Now recurse, and deal with the tail * if tail_string: * word.tail = lookup(tail_string) # <<<<<<<<<<<<<< * return word * */ - __pyx_t_9 = __pyx_f_5spacy_2en_lookup(__pyx_v_tail_string, 0); if (unlikely(__pyx_t_9 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 152; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_9 = __pyx_f_5spacy_2en_lookup(__pyx_v_tail_string, 0); if (unlikely(__pyx_t_9 == 0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 170; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __pyx_v_word->tail = ((struct __pyx_t_5spacy_6lexeme_Lexeme *)__pyx_t_9); goto __pyx_L4; } __pyx_L4:; - /* "spacy/en.pyx":153 + /* "spacy/en.pyx":171 * if tail_string: * word.tail = lookup(tail_string) * return word # <<<<<<<<<<<<<< @@ -2450,7 +2884,7 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO __pyx_r = __pyx_v_word; goto __pyx_L0; - /* "spacy/en.pyx":113 + /* "spacy/en.pyx":131 * * * cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, # <<<<<<<<<<<<<< @@ -2472,141 +2906,152 @@ static struct __pyx_t_5spacy_6lexeme_Lexeme *__pyx_f_5spacy_2en__init_lexeme(PyO return __pyx_r; } -/* "spacy/en.pyx":156 +/* "spacy/en.pyx":174 * * * cdef size_t _find_split(unicode word, size_t length): # <<<<<<<<<<<<<< - * cdef size_t i = 0 - * if word[0].isalnum(): + * cdef int i = 0 + * # Contractions */ static size_t __pyx_f_5spacy_2en__find_split(PyObject *__pyx_v_word, size_t __pyx_v_length) { - size_t __pyx_v_i; + int __pyx_v_i; size_t __pyx_r; __Pyx_RefNannyDeclarations - Py_UCS4 __pyx_t_1; + int __pyx_t_1; int __pyx_t_2; int __pyx_t_3; - int __pyx_t_4; - size_t __pyx_t_5; - Py_UCS4 __pyx_t_6; - int __pyx_t_7; - int __pyx_t_8; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; __Pyx_RefNannySetupContext("_find_split", 0); - /* "spacy/en.pyx":157 + /* "spacy/en.pyx":175 * * cdef size_t _find_split(unicode word, size_t length): - * cdef size_t i = 0 # <<<<<<<<<<<<<< - * if word[0].isalnum(): - * while i < length and word[i].isalnum(): + * cdef int i = 0 # <<<<<<<<<<<<<< + * # Contractions + * if word.endswith("'s"): */ __pyx_v_i = 0; - /* "spacy/en.pyx":158 - * cdef size_t _find_split(unicode word, size_t length): - * cdef size_t i = 0 - * if word[0].isalnum(): # <<<<<<<<<<<<<< - * while i < length and word[i].isalnum(): - * i += 1 + /* "spacy/en.pyx":177 + * cdef int i = 0 + * # Contractions + * if word.endswith("'s"): # <<<<<<<<<<<<<< + * return length - 2 + * # Leading punctuation */ - __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, 0, long, 1, __Pyx_PyInt_From_long, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 158; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; - __pyx_t_2 = Py_UNICODE_ISALNUM(__pyx_t_1); - if ((__pyx_t_2 != 0)) { - - /* "spacy/en.pyx":159 - * cdef size_t i = 0 - * if word[0].isalnum(): - * while i < length and word[i].isalnum(): # <<<<<<<<<<<<<< - * i += 1 - * else: - */ - while (1) { - __pyx_t_2 = (__pyx_v_i < __pyx_v_length); - if (__pyx_t_2) { - __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 159; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; - __pyx_t_3 = Py_UNICODE_ISALNUM(__pyx_t_1); - __pyx_t_4 = (__pyx_t_3 != 0); - } else { - __pyx_t_4 = __pyx_t_2; - } - if (!__pyx_t_4) break; - - /* "spacy/en.pyx":160 - * if word[0].isalnum(): - * while i < length and word[i].isalnum(): - * i += 1 # <<<<<<<<<<<<<< - * else: - * # Split off a punctuation character, or a sequence of the same punctuation character - */ - __pyx_v_i = (__pyx_v_i + 1); - } - goto __pyx_L3; + if (unlikely(__pyx_v_word == Py_None)) { + PyErr_Format(PyExc_AttributeError, "'NoneType' object has no attribute '%s'", "endswith"); + {__pyx_filename = __pyx_f[0]; __pyx_lineno = 177; __pyx_clineno = __LINE__; goto __pyx_L1_error;} } - /*else*/ { + __pyx_t_1 = __Pyx_PyUnicode_Tailmatch(__pyx_v_word, __pyx_kp_u_s, 0, PY_SSIZE_T_MAX, 1); if (unlikely(__pyx_t_1 == -1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 177; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + if ((__pyx_t_1 != 0)) { - /* "spacy/en.pyx":163 - * else: - * # Split off a punctuation character, or a sequence of the same punctuation character - * while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]): # <<<<<<<<<<<<<< - * i += 1 + /* "spacy/en.pyx":178 + * # Contractions + * if word.endswith("'s"): + * return length - 2 # <<<<<<<<<<<<<< + * # Leading punctuation + * if is_punct(word, 0, length): + */ + __pyx_r = (__pyx_v_length - 2); + goto __pyx_L0; + } + + /* "spacy/en.pyx":180 + * return length - 2 + * # Leading punctuation + * if is_punct(word, 0, length): # <<<<<<<<<<<<<< + * return 1 + * elif length >= 1 and is_punct(word, length - 1, length): + */ + __pyx_t_1 = (__pyx_f_5spacy_2en_is_punct(__pyx_v_word, 0, __pyx_v_length) != 0); + if (__pyx_t_1) { + + /* "spacy/en.pyx":181 + * # Leading punctuation + * if is_punct(word, 0, length): + * return 1 # <<<<<<<<<<<<<< + * elif length >= 1 and is_punct(word, length - 1, length): + * # Split off all trailing punctuation characters + */ + __pyx_r = 1; + goto __pyx_L0; + } + + /* "spacy/en.pyx":182 + * if is_punct(word, 0, length): + * return 1 + * elif length >= 1 and is_punct(word, length - 1, length): # <<<<<<<<<<<<<< + * # Split off all trailing punctuation characters + * i = length - 1 + */ + __pyx_t_1 = ((__pyx_v_length >= 1) != 0); + if (__pyx_t_1) { + __pyx_t_2 = (__pyx_f_5spacy_2en_is_punct(__pyx_v_word, (__pyx_v_length - 1), __pyx_v_length) != 0); + __pyx_t_3 = __pyx_t_2; + } else { + __pyx_t_3 = __pyx_t_1; + } + if (__pyx_t_3) { + + /* "spacy/en.pyx":184 + * elif length >= 1 and is_punct(word, length - 1, length): + * # Split off all trailing punctuation characters + * i = length - 1 # <<<<<<<<<<<<<< + * while i >= 2 and is_punct(word, i-1, length): + * i -= 1 + */ + __pyx_v_i = (__pyx_v_length - 1); + + /* "spacy/en.pyx":185 + * # Split off all trailing punctuation characters + * i = length - 1 + * while i >= 2 and is_punct(word, i-1, length): # <<<<<<<<<<<<<< + * i -= 1 * return i */ while (1) { - __pyx_t_4 = ((__pyx_v_i < __pyx_v_length) != 0); - if (__pyx_t_4) { - __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 163; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; - __pyx_t_2 = Py_UNICODE_ISALNUM(__pyx_t_1); - __pyx_t_3 = ((!(__pyx_t_2 != 0)) != 0); - if (__pyx_t_3) { - __pyx_t_2 = ((__pyx_v_i == 0) != 0); - if (!__pyx_t_2) { - __pyx_t_5 = (__pyx_v_i - 1); - __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_t_5, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 163; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; - __pyx_t_6 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_6 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 163; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; - __pyx_t_7 = ((__pyx_t_1 == __pyx_t_6) != 0); - __pyx_t_8 = __pyx_t_7; - } else { - __pyx_t_8 = __pyx_t_2; - } - __pyx_t_2 = __pyx_t_8; - } else { - __pyx_t_2 = __pyx_t_3; - } - __pyx_t_3 = __pyx_t_2; + __pyx_t_3 = ((__pyx_v_i >= 2) != 0); + if (__pyx_t_3) { + __pyx_t_1 = (__pyx_f_5spacy_2en_is_punct(__pyx_v_word, (__pyx_v_i - 1), __pyx_v_length) != 0); + __pyx_t_2 = __pyx_t_1; } else { - __pyx_t_3 = __pyx_t_4; + __pyx_t_2 = __pyx_t_3; } - if (!__pyx_t_3) break; + if (!__pyx_t_2) break; - /* "spacy/en.pyx":164 - * # Split off a punctuation character, or a sequence of the same punctuation character - * while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]): - * i += 1 # <<<<<<<<<<<<<< + /* "spacy/en.pyx":186 + * i = length - 1 + * while i >= 2 and is_punct(word, i-1, length): + * i -= 1 # <<<<<<<<<<<<<< * return i + * */ - __pyx_v_i = (__pyx_v_i + 1); + __pyx_v_i = (__pyx_v_i - 1); } + goto __pyx_L4; } - __pyx_L3:; + __pyx_L4:; - /* "spacy/en.pyx":165 - * while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]): - * i += 1 + /* "spacy/en.pyx":187 + * while i >= 2 and is_punct(word, i-1, length): + * i -= 1 * return i # <<<<<<<<<<<<<< + * + * */ __pyx_r = __pyx_v_i; goto __pyx_L0; - /* "spacy/en.pyx":156 + /* "spacy/en.pyx":174 * * * cdef size_t _find_split(unicode word, size_t length): # <<<<<<<<<<<<<< - * cdef size_t i = 0 - * if word[0].isalnum(): + * cdef int i = 0 + * # Contractions */ /* function exit code */ @@ -2618,11 +3063,54 @@ static size_t __pyx_f_5spacy_2en__find_split(PyObject *__pyx_v_word, size_t __py return __pyx_r; } +/* "spacy/en.pyx":190 + * + * + * cdef bint is_punct(unicode word, size_t i, size_t length): # <<<<<<<<<<<<<< + * return not word[i].isalnum() + */ + +static int __pyx_f_5spacy_2en_is_punct(PyObject *__pyx_v_word, size_t __pyx_v_i, CYTHON_UNUSED size_t __pyx_v_length) { + int __pyx_r; + __Pyx_RefNannyDeclarations + Py_UCS4 __pyx_t_1; + int __pyx_t_2; + int __pyx_lineno = 0; + const char *__pyx_filename = NULL; + int __pyx_clineno = 0; + __Pyx_RefNannySetupContext("is_punct", 0); + + /* "spacy/en.pyx":191 + * + * cdef bint is_punct(unicode word, size_t i, size_t length): + * return not word[i].isalnum() # <<<<<<<<<<<<<< + */ + __pyx_t_1 = __Pyx_GetItemInt_Unicode(__pyx_v_word, __pyx_v_i, size_t, 0, __Pyx_PyInt_FromSize_t, 0, 0, 1); if (unlikely(__pyx_t_1 == (Py_UCS4)-1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 191; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_t_2 = Py_UNICODE_ISALNUM(__pyx_t_1); + __pyx_r = (!(__pyx_t_2 != 0)); + goto __pyx_L0; + + /* "spacy/en.pyx":190 + * + * + * cdef bint is_punct(unicode word, size_t i, size_t length): # <<<<<<<<<<<<<< + * return not word[i].isalnum() + */ + + /* function exit code */ + __pyx_L1_error:; + __Pyx_WriteUnraisable("spacy.en.is_punct", __pyx_clineno, __pyx_lineno, __pyx_filename, 0); + __pyx_r = 0; + __pyx_L0:; + __Pyx_RefNannyFinishContext(); + return __pyx_r; +} + static PyMethodDef __pyx_methods[] = { - {__Pyx_NAMESTR("lookup"), (PyCFunction)__pyx_pw_5spacy_2en_1lookup, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_lookup)}, - {__Pyx_NAMESTR("lookup_chunk"), (PyCFunction)__pyx_pw_5spacy_2en_3lookup_chunk, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_2lookup_chunk)}, - {__Pyx_NAMESTR("unhash"), (PyCFunction)__pyx_pw_5spacy_2en_5unhash, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_4unhash)}, - {__Pyx_NAMESTR("_substr"), (PyCFunction)__pyx_pw_5spacy_2en_7_substr, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(0)}, + {__Pyx_NAMESTR("lookup"), (PyCFunction)__pyx_pw_5spacy_2en_3lookup, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_2lookup)}, + {__Pyx_NAMESTR("lookup_chunk"), (PyCFunction)__pyx_pw_5spacy_2en_5lookup_chunk, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_4lookup_chunk)}, + {__Pyx_NAMESTR("unhash"), (PyCFunction)__pyx_pw_5spacy_2en_7unhash, METH_O, __Pyx_DOCSTR(__pyx_doc_5spacy_2en_6unhash)}, + {__Pyx_NAMESTR("_substr"), (PyCFunction)__pyx_pw_5spacy_2en_9_substr, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(0)}, {0, 0, 0, 0} }; @@ -2645,19 +3133,28 @@ static struct PyModuleDef __pyx_moduledef = { #endif static __Pyx_StringTabEntry __pyx_string_tab[] = { + {&__pyx_n_s_, __pyx_k_, sizeof(__pyx_k_), 0, 0, 1, 1}, {&__pyx_kp_u_, __pyx_k_, sizeof(__pyx_k_), 0, 1, 0, 0}, {&__pyx_kp_u_DIGITS, __pyx_k_DIGITS, sizeof(__pyx_k_DIGITS), 0, 1, 0, 0}, {&__pyx_n_s_LEXEMES, __pyx_k_LEXEMES, sizeof(__pyx_k_LEXEMES), 0, 0, 1, 1}, {&__pyx_n_s_STRINGS, __pyx_k_STRINGS, sizeof(__pyx_k_STRINGS), 0, 0, 1, 1}, + {&__pyx_kp_s_Users_matt_repos_spaCy_spacy_en, __pyx_k_Users_matt_repos_spaCy_spacy_en, sizeof(__pyx_k_Users_matt_repos_spaCy_spacy_en), 0, 0, 1, 0}, {&__pyx_n_s_ValueError, __pyx_k_ValueError, sizeof(__pyx_k_ValueError), 0, 0, 1, 1}, {&__pyx_kp_u_YEAR, __pyx_k_YEAR, sizeof(__pyx_k_YEAR), 0, 1, 0, 0}, + {&__pyx_n_s_chunk, __pyx_k_chunk, sizeof(__pyx_k_chunk), 0, 0, 1, 1}, {&__pyx_n_s_cluster, __pyx_k_cluster, sizeof(__pyx_k_cluster), 0, 0, 1, 1}, + {&__pyx_n_u_en, __pyx_k_en, sizeof(__pyx_k_en), 0, 1, 0, 1}, {&__pyx_n_s_end, __pyx_k_end, sizeof(__pyx_k_end), 0, 0, 1, 1}, + {&__pyx_n_s_enumerate, __pyx_k_enumerate, sizeof(__pyx_k_enumerate), 0, 0, 1, 1}, {&__pyx_n_s_first, __pyx_k_first, sizeof(__pyx_k_first), 0, 0, 1, 1}, + {&__pyx_n_s_hashed, __pyx_k_hashed, sizeof(__pyx_k_hashed), 0, 0, 1, 1}, + {&__pyx_n_s_i, __pyx_k_i, sizeof(__pyx_k_i), 0, 0, 1, 1}, + {&__pyx_n_s_import, __pyx_k_import, sizeof(__pyx_k_import), 0, 0, 1, 1}, {&__pyx_n_s_isdigit, __pyx_k_isdigit, sizeof(__pyx_k_isdigit), 0, 0, 1, 1}, {&__pyx_n_s_last3, __pyx_k_last3, sizeof(__pyx_k_last3), 0, 0, 1, 1}, {&__pyx_n_s_length, __pyx_k_length, sizeof(__pyx_k_length), 0, 0, 1, 1}, {&__pyx_n_s_lex, __pyx_k_lex, sizeof(__pyx_k_lex), 0, 0, 1, 1}, + {&__pyx_n_s_load_tokenization, __pyx_k_load_tokenization, sizeof(__pyx_k_load_tokenization), 0, 0, 1, 1}, {&__pyx_n_s_lower, __pyx_k_lower, sizeof(__pyx_k_lower), 0, 0, 1, 1}, {&__pyx_n_s_main, __pyx_k_main, sizeof(__pyx_k_main), 0, 0, 1, 1}, {&__pyx_n_s_normed, __pyx_k_normed, sizeof(__pyx_k_normed), 0, 0, 1, 1}, @@ -2665,15 +3162,25 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s_oft_upper, __pyx_k_oft_upper, sizeof(__pyx_k_oft_upper), 0, 0, 1, 1}, {&__pyx_n_s_prob, __pyx_k_prob, sizeof(__pyx_k_prob), 0, 0, 1, 1}, {&__pyx_n_s_pyx_capi, __pyx_k_pyx_capi, sizeof(__pyx_k_pyx_capi), 0, 0, 1, 1}, + {&__pyx_n_s_read_tokenization, __pyx_k_read_tokenization, sizeof(__pyx_k_read_tokenization), 0, 0, 1, 1}, + {&__pyx_kp_u_s, __pyx_k_s, sizeof(__pyx_k_s), 0, 1, 0, 0}, + {&__pyx_kp_u_s_d_s, __pyx_k_s_d_s, sizeof(__pyx_k_s_d_s), 0, 1, 0, 0}, {&__pyx_n_s_sic, __pyx_k_sic, sizeof(__pyx_k_sic), 0, 0, 1, 1}, + {&__pyx_n_s_spacy_en, __pyx_k_spacy_en, sizeof(__pyx_k_spacy_en), 0, 0, 1, 1}, {&__pyx_n_s_start, __pyx_k_start, sizeof(__pyx_k_start), 0, 0, 1, 1}, {&__pyx_n_s_string, __pyx_k_string, sizeof(__pyx_k_string), 0, 0, 1, 1}, {&__pyx_n_s_tail, __pyx_k_tail, sizeof(__pyx_k_tail), 0, 0, 1, 1}, {&__pyx_n_s_test, __pyx_k_test, sizeof(__pyx_k_test), 0, 0, 1, 1}, + {&__pyx_n_s_token_rules, __pyx_k_token_rules, sizeof(__pyx_k_token_rules), 0, 0, 1, 1}, + {&__pyx_n_s_token_string, __pyx_k_token_string, sizeof(__pyx_k_token_string), 0, 0, 1, 1}, + {&__pyx_n_s_tokens, __pyx_k_tokens, sizeof(__pyx_k_tokens), 0, 0, 1, 1}, + {&__pyx_n_s_util, __pyx_k_util, sizeof(__pyx_k_util), 0, 0, 1, 1}, + {&__pyx_n_s_word, __pyx_k_word, sizeof(__pyx_k_word), 0, 0, 1, 1}, {0, 0, 0, 0, 0, 0, 0} }; static int __Pyx_InitCachedBuiltins(void) { - __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 70; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_builtin_enumerate = __Pyx_GetBuiltinName(__pyx_n_s_enumerate); if (!__pyx_builtin_enumerate) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 31; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_builtin_ValueError = __Pyx_GetBuiltinName(__pyx_n_s_ValueError); if (!__pyx_builtin_ValueError) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 88; __pyx_clineno = __LINE__; goto __pyx_L1_error;} return 0; __pyx_L1_error:; return -1; @@ -2682,12 +3189,40 @@ static int __Pyx_InitCachedBuiltins(void) { static int __Pyx_InitCachedConstants(void) { __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("__Pyx_InitCachedConstants", 0); + + /* "spacy/en.pyx":24 + * + * + * def load_tokenization(token_rules): # <<<<<<<<<<<<<< + * cdef Lexeme* word + * cdef StringHash hashed + */ + __pyx_tuple__2 = PyTuple_Pack(9, __pyx_n_s_token_rules, __pyx_n_s_word, __pyx_n_s_hashed, __pyx_n_s_chunk, __pyx_n_s_lex, __pyx_n_s_tokens, __pyx_n_s_i, __pyx_n_s_token_string, __pyx_n_s_length); if (unlikely(!__pyx_tuple__2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__2); + __Pyx_GIVEREF(__pyx_tuple__2); + __pyx_codeobj__3 = (PyObject*)__Pyx_PyCode_New(1, 0, 9, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_tuple__2, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_Users_matt_repos_spaCy_spacy_en, __pyx_n_s_load_tokenization, 24, __pyx_empty_bytes); if (unlikely(!__pyx_codeobj__3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + + /* "spacy/en.pyx":39 + * + * + * load_tokenization(util.read_tokenization('en')) # <<<<<<<<<<<<<< + * + * cpdef Lexeme_addr lookup(unicode string) except 0: + */ + __pyx_tuple__4 = PyTuple_Pack(1, __pyx_n_u_en); if (unlikely(!__pyx_tuple__4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_tuple__4); + __Pyx_GIVEREF(__pyx_tuple__4); __Pyx_RefNannyFinishContext(); return 0; + __pyx_L1_error:; + __Pyx_RefNannyFinishContext(); + return -1; } static int __Pyx_InitGlobals(void) { if (__Pyx_InitStrings(__pyx_string_tab) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;}; + __pyx_int_0 = PyInt_FromLong(0); if (unlikely(!__pyx_int_0)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_int_1 = PyInt_FromLong(1); if (unlikely(!__pyx_int_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 1; __pyx_clineno = __LINE__; goto __pyx_L1_error;} return 0; __pyx_L1_error:; return -1; @@ -2702,7 +3237,9 @@ PyMODINIT_FUNC PyInit_en(void) #endif { PyObject *__pyx_t_1 = NULL; - struct __pyx_t_5spacy_6lexeme_Lexeme __pyx_t_2; + PyObject *__pyx_t_2 = NULL; + struct __pyx_t_5spacy_6lexeme_Lexeme __pyx_t_3; + PyObject *__pyx_t_4 = NULL; int __pyx_lineno = 0; const char *__pyx_filename = NULL; int __pyx_clineno = 0; @@ -2784,19 +3321,40 @@ PyMODINIT_FUNC PyInit_en(void) /*--- Function import code ---*/ /*--- Execution code ---*/ - /* "spacy/en.pyx":15 + /* "spacy/en.pyx":13 + * from ext.murmurhash cimport MurmurHash64A + * from ext.murmurhash cimport MurmurHash64B + * from . import util # <<<<<<<<<<<<<< + * + * + */ + __pyx_t_1 = PyList_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __Pyx_INCREF(__pyx_n_s_util); + PyList_SET_ITEM(__pyx_t_1, 0, __pyx_n_s_util); + __Pyx_GIVEREF(__pyx_n_s_util); + __pyx_t_2 = __Pyx_Import(__pyx_n_s_, __pyx_t_1, 1); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_ImportFrom(__pyx_t_2, __pyx_n_s_util); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_util, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 13; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "spacy/en.pyx":16 * * * STRINGS = {} # <<<<<<<<<<<<<< * LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() * LEXEMES.set_empty_key(0) */ - __pyx_t_1 = PyDict_New(); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_GOTREF(__pyx_t_1); - if (PyDict_SetItem(__pyx_d, __pyx_n_s_STRINGS, __pyx_t_1) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 15; __pyx_clineno = __LINE__; goto __pyx_L1_error;} - __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_2 = PyDict_New(); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_STRINGS, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 16; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; - /* "spacy/en.pyx":16 + /* "spacy/en.pyx":17 * * STRINGS = {} * LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() # <<<<<<<<<<<<<< @@ -2805,7 +3363,7 @@ PyMODINIT_FUNC PyInit_en(void) */ __pyx_v_5spacy_2en_LEXEMES = google::dense_hash_map<__pyx_t_5spacy_6lexeme_StringHash,__pyx_t_5spacy_2en_Lexeme_ptr>(); - /* "spacy/en.pyx":17 + /* "spacy/en.pyx":18 * STRINGS = {} * LEXEMES = dense_hash_map[StringHash, Lexeme_ptr]() * LEXEMES.set_empty_key(0) # <<<<<<<<<<<<<< @@ -2814,24 +3372,64 @@ PyMODINIT_FUNC PyInit_en(void) */ __pyx_v_5spacy_2en_LEXEMES.set_empty_key(0); - /* "spacy/en.pyx":20 + /* "spacy/en.pyx":21 * * * cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL) # <<<<<<<<<<<<<< * * */ - __pyx_t_2.sic = 0; - __pyx_t_2.lex = 0; - __pyx_t_2.normed = 0; - __pyx_t_2.last3 = 0; - __pyx_t_2.first = 0; - __pyx_t_2.prob = 0.0; - __pyx_t_2.cluster = 0; - __pyx_t_2.oft_upper = 0; - __pyx_t_2.oft_title = 0; - __pyx_t_2.tail = NULL; - __pyx_v_5spacy_2en_BLANK_WORD = __pyx_t_2; + __pyx_t_3.sic = 0; + __pyx_t_3.lex = 0; + __pyx_t_3.normed = 0; + __pyx_t_3.last3 = 0; + __pyx_t_3.first = 0; + __pyx_t_3.prob = 0.0; + __pyx_t_3.cluster = 0; + __pyx_t_3.oft_upper = 0; + __pyx_t_3.oft_title = 0; + __pyx_t_3.tail = NULL; + __pyx_v_5spacy_2en_BLANK_WORD = __pyx_t_3; + + /* "spacy/en.pyx":24 + * + * + * def load_tokenization(token_rules): # <<<<<<<<<<<<<< + * cdef Lexeme* word + * cdef StringHash hashed + */ + __pyx_t_2 = PyCFunction_NewEx(&__pyx_mdef_5spacy_2en_1load_tokenization, NULL, __pyx_n_s_spacy_en); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + if (PyDict_SetItem(__pyx_d, __pyx_n_s_load_tokenization, __pyx_t_2) < 0) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 24; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + + /* "spacy/en.pyx":39 + * + * + * load_tokenization(util.read_tokenization('en')) # <<<<<<<<<<<<<< + * + * cpdef Lexeme_addr lookup(unicode string) except 0: + */ + __pyx_t_2 = __Pyx_GetModuleGlobalName(__pyx_n_s_load_tokenization); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_2); + __pyx_t_1 = __Pyx_GetModuleGlobalName(__pyx_n_s_util); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __pyx_t_4 = __Pyx_PyObject_GetAttrStr(__pyx_t_1, __pyx_n_s_read_tokenization); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_4); + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_4, __pyx_tuple__4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __pyx_t_4 = PyTuple_New(1); if (unlikely(!__pyx_t_4)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_4); + PyTuple_SET_ITEM(__pyx_t_4, 0, __pyx_t_1); + __Pyx_GIVEREF(__pyx_t_1); + __pyx_t_1 = 0; + __pyx_t_1 = __Pyx_PyObject_Call(__pyx_t_2, __pyx_t_4, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 39; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __Pyx_GOTREF(__pyx_t_1); + __Pyx_DECREF(__pyx_t_2); __pyx_t_2 = 0; + __Pyx_DECREF(__pyx_t_4); __pyx_t_4 = 0; + __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0; /* "spacy/en.pyx":1 * '''Serve pointers to Lexeme structs, given strings. Maintain a reverse index, # <<<<<<<<<<<<<< @@ -2845,6 +3443,8 @@ PyMODINIT_FUNC PyInit_en(void) goto __pyx_L0; __pyx_L1_error:; __Pyx_XDECREF(__pyx_t_1); + __Pyx_XDECREF(__pyx_t_2); + __Pyx_XDECREF(__pyx_t_4); if (__pyx_m) { __Pyx_AddTraceback("init spacy.en", __pyx_clineno, __pyx_lineno, __pyx_filename); Py_DECREF(__pyx_m); __pyx_m = 0; @@ -2890,6 +3490,62 @@ static PyObject *__Pyx_GetBuiltinName(PyObject *name) { return result; } +static CYTHON_INLINE void __Pyx_RaiseTooManyValuesError(Py_ssize_t expected) { + PyErr_Format(PyExc_ValueError, + "too many values to unpack (expected %" CYTHON_FORMAT_SSIZE_T "d)", expected); +} + +static CYTHON_INLINE void __Pyx_RaiseNeedMoreValuesError(Py_ssize_t index) { + PyErr_Format(PyExc_ValueError, + "need more than %" CYTHON_FORMAT_SSIZE_T "d value%.1s to unpack", + index, (index == 1) ? "" : "s"); +} + +static CYTHON_INLINE int __Pyx_IterFinish(void) { +#if CYTHON_COMPILING_IN_CPYTHON + PyThreadState *tstate = PyThreadState_GET(); + PyObject* exc_type = tstate->curexc_type; + if (unlikely(exc_type)) { + if (likely(exc_type == PyExc_StopIteration) || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration)) { + PyObject *exc_value, *exc_tb; + exc_value = tstate->curexc_value; + exc_tb = tstate->curexc_traceback; + tstate->curexc_type = 0; + tstate->curexc_value = 0; + tstate->curexc_traceback = 0; + Py_DECREF(exc_type); + Py_XDECREF(exc_value); + Py_XDECREF(exc_tb); + return 0; + } else { + return -1; + } + } + return 0; +#else + if (unlikely(PyErr_Occurred())) { + if (likely(PyErr_ExceptionMatches(PyExc_StopIteration))) { + PyErr_Clear(); + return 0; + } else { + return -1; + } + } + return 0; +#endif +} + +static int __Pyx_IternextUnpackEndCheck(PyObject *retval, Py_ssize_t expected) { + if (unlikely(retval)) { + Py_DECREF(retval); + __Pyx_RaiseTooManyValuesError(expected); + return -1; + } else { + return __Pyx_IterFinish(); + } + return 0; +} + static CYTHON_INLINE int __Pyx_PyBytes_Equals(PyObject* s1, PyObject* s2, int equals) { #if CYTHON_COMPILING_IN_PYPY return PyObject_RichCompareBool(s1, s2, equals); @@ -3615,6 +4271,101 @@ static void __Pyx_WriteUnraisable(const char *name, CYTHON_UNUSED int clineno, } } +static PyObject* __Pyx_ImportFrom(PyObject* module, PyObject* name) { + PyObject* value = __Pyx_PyObject_GetAttrStr(module, name); + if (unlikely(!value) && PyErr_ExceptionMatches(PyExc_AttributeError)) { + PyErr_Format(PyExc_ImportError, + #if PY_MAJOR_VERSION < 3 + "cannot import name %.230s", PyString_AS_STRING(name)); + #else + "cannot import name %S", name); + #endif + } + return value; +} + +static PyObject *__Pyx_Import(PyObject *name, PyObject *from_list, int level) { + PyObject *empty_list = 0; + PyObject *module = 0; + PyObject *global_dict = 0; + PyObject *empty_dict = 0; + PyObject *list; + #if PY_VERSION_HEX < 0x03030000 + PyObject *py_import; + py_import = __Pyx_PyObject_GetAttrStr(__pyx_b, __pyx_n_s_import); + if (!py_import) + goto bad; + #endif + if (from_list) + list = from_list; + else { + empty_list = PyList_New(0); + if (!empty_list) + goto bad; + list = empty_list; + } + global_dict = PyModule_GetDict(__pyx_m); + if (!global_dict) + goto bad; + empty_dict = PyDict_New(); + if (!empty_dict) + goto bad; + #if PY_VERSION_HEX >= 0x02050000 + { + #if PY_MAJOR_VERSION >= 3 + if (level == -1) { + if (strchr(__Pyx_MODULE_NAME, '.')) { + #if PY_VERSION_HEX < 0x03030000 + PyObject *py_level = PyInt_FromLong(1); + if (!py_level) + goto bad; + module = PyObject_CallFunctionObjArgs(py_import, + name, global_dict, empty_dict, list, py_level, NULL); + Py_DECREF(py_level); + #else + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, 1); + #endif + if (!module) { + if (!PyErr_ExceptionMatches(PyExc_ImportError)) + goto bad; + PyErr_Clear(); + } + } + level = 0; /* try absolute import on failure */ + } + #endif + if (!module) { + #if PY_VERSION_HEX < 0x03030000 + PyObject *py_level = PyInt_FromLong(level); + if (!py_level) + goto bad; + module = PyObject_CallFunctionObjArgs(py_import, + name, global_dict, empty_dict, list, py_level, NULL); + Py_DECREF(py_level); + #else + module = PyImport_ImportModuleLevelObject( + name, global_dict, empty_dict, list, level); + #endif + } + } + #else + if (level>0) { + PyErr_SetString(PyExc_RuntimeError, "Relative import is not supported for Python <=2.4."); + goto bad; + } + module = PyObject_CallFunctionObjArgs(py_import, + name, global_dict, empty_dict, list, NULL); + #endif +bad: + #if PY_VERSION_HEX < 0x03030000 + Py_XDECREF(py_import); + #endif + Py_XDECREF(empty_list); + Py_XDECREF(empty_dict); + return module; +} + #define __PYX_VERIFY_RETURN_INT(target_type, func_type, func) \ { \ func_type value = func(x); \ diff --git a/spacy/en.pyx b/spacy/en.pyx index 06923bc89..1fc2f7102 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -10,6 +10,7 @@ from libc.stdint cimport uint64_t from spacy.lexeme cimport Lexeme from ext.murmurhash cimport MurmurHash64A from ext.murmurhash cimport MurmurHash64B +from . import util STRINGS = {} @@ -20,6 +21,23 @@ LEXEMES.set_empty_key(0) cdef Lexeme BLANK_WORD = Lexeme(0, 0, 0, 0, 0, 0.0, 0, False, False, NULL) +def load_tokenization(token_rules): + cdef Lexeme* word + cdef StringHash hashed + for chunk, lex, tokens in token_rules: + hashed = hash_string(chunk, len(chunk)) + assert LEXEMES[hashed] == NULL + word = _add(hashed, lex, len(lex), len(lex)) + for i, lex in enumerate(tokens): + token_string = '%s:@:%d:@:%s' % (chunk, i, lex) + length = len(token_string) + hashed = hash_string(token_string, length) + word.tail = _add(hashed, lex, 0, len(lex)) + word = word.tail + + +load_tokenization(util.read_tokenization('en')) + cpdef Lexeme_addr lookup(unicode string) except 0: '''.. function:: enumerate(sequence[, start=0]) Fetch a Lexeme representing a word string. If the word has not been seen, @@ -156,8 +174,8 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed, cdef size_t _find_split(unicode word, size_t length): cdef int i = 0 # Contractions - if word == "'s": - return 2 + if word.endswith("'s"): + return length - 2 # Leading punctuation if is_punct(word, 0, length): return 1 @@ -166,11 +184,8 @@ cdef size_t _find_split(unicode word, size_t length): i = length - 1 while i >= 2 and is_punct(word, i-1, length): i -= 1 - else: - # Doesn't start or end with the punct - while i < length and not is_punct(word, i, length): - i += 1 return i + cdef bint is_punct(unicode word, size_t i, size_t length): return not word[i].isalnum() diff --git a/spacy/lexeme.cpp b/spacy/lexeme.cpp index 72140c838..1d8806510 100644 --- a/spacy/lexeme.cpp +++ b/spacy/lexeme.cpp @@ -1,4 +1,4 @@ -/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */ +/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */ #define PY_SSIZE_T_CLEAN #ifndef CYTHON_USE_PYLONG_INTERNALS diff --git a/spacy/spacy.cpp b/spacy/spacy.cpp index dff7c9ba8..1f91607e5 100644 --- a/spacy/spacy.cpp +++ b/spacy/spacy.cpp @@ -1,4 +1,4 @@ -/* Generated by Cython 0.20.1 on Sat Jul 5 20:44:26 2014 */ +/* Generated by Cython 0.20.1 on Mon Jul 7 00:02:26 2014 */ #define PY_SSIZE_T_CLEAN #ifndef CYTHON_USE_PYLONG_INTERNALS diff --git a/spacy/util.py b/spacy/util.py index 7cb50b82b..449bad876 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,3 +1,10 @@ +import os +from os import path +import codecs + +DATA_DIR = path.join(path.dirname(__file__), '..', 'data') + + def utf8open(loc, mode='r'): return codecs.open(loc, mode, 'utf8') @@ -12,23 +19,23 @@ def load_case_stats(data_dir): return case_stats -def load_clitics(data_dir): - clitics_loc = path.join(data_dir, 'clitics.txt') +def read_tokenization(lang): + loc = path.join(DATA_DIR, lang, 'tokenization') entries = [] seen = set() - with utf8open(clitics_loc) as clitics_file: - for line in clitics_file: + with utf8open(loc) as file_: + for line in file_: line = line.strip() if line.startswith('#'): continue if not line: continue - clitics = line.split() - word = clitics.pop(0) - norm_form = clitics.pop(0) - assert word not in seen, word - seen.add(word) - entries.append((word, norm_form, clitics)) + pieces = line.split() + chunk = pieces.pop(0) + lex = pieces.pop(0) + assert chunk not in seen, chunk + seen.add(chunk) + entries.append((chunk, lex, pieces)) return entries diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 9987a561b..01290a10b 100644 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -28,3 +28,10 @@ def test_case_neq(): def test_punct_neq(): addr = lookup('Hello') assert lookup('Hello,') != addr + + +def test_short(): + addr = lookup('I') + assert unhash(lex_of(addr)) == 'I' + addr = lookup('not') + assert unhash(lex_of(addr)) == 'not'