diff --git a/docs/index.rst b/docs/index.rst index d5b068567..ca7dbee40 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,10 +9,14 @@ spaCy NLP Tokenizer and Lexicon .. toctree:: :maxdepth: 3 - guide/overview - guide/install + guide/overview.rst + guide/install.rst + api/index.rst + modules/index.rst + + Source (GitHub) ---------------- diff --git a/spacy/en.pxd b/spacy/en.pxd index 1a08834ec..8f514ec9d 100644 --- a/spacy/en.pxd +++ b/spacy/en.pxd @@ -4,4 +4,4 @@ cimport cython cdef class English(Language): - cpdef int _split_one(self, unicode word) + cdef int _split_one(self, unicode word) diff --git a/spacy/en.pyx b/spacy/en.pyx index ebfbff8d2..342981b6a 100644 --- a/spacy/en.pyx +++ b/spacy/en.pyx @@ -5,22 +5,21 @@ scheme in several important respects: * Whitespace is added as tokens, except for single spaces. e.g., - >>> [w.string for w in tokenize(u'\\nHello \\tThere')] + >>> [w.string for w in EN.tokenize(u'\\nHello \\tThere')] [u'\\n', u'Hello', u' ', u'\\t', u'There'] * Contractions are normalized, e.g. - >>> [w.string for w in u"isn't ain't won't he's")] + >>> [w.string for w in EN.tokenize(u"isn't ain't won't he's")] [u'is', u'not', u'are', u'not', u'will', u'not', u'he', u"__s"] * Hyphenated words are split, with the hyphen preserved, e.g.: - >>> [w.string for w in tokenize(u'New York-based')] + >>> [w.string for w in EN.tokenize(u'New York-based')] [u'New', u'York', u'-', u'based'] Other improvements: -* Full unicode support * Email addresses, URLs, European-formatted dates and other numeric entities not found in the PTB are tokenized correctly * Heuristic handling of word-final periods (PTB expects sentence boundary detection @@ -81,6 +80,13 @@ CAN_PRT = NR_FLAGS; NR_FLAGS += 1 cdef class English(Language): + """English tokenizer, tightly coupled to lexicon. + + Attributes: + name (unicode): The two letter code used by Wikipedia for the language. + lexicon (Lexicon): The lexicon. Exposes the lookup method. 
+ """ + def __cinit__(self, name): flag_funcs = [0 for _ in range(NR_FLAGS)] @@ -110,7 +116,7 @@ cdef class English(Language): Language.__init__(self, name, flag_funcs) - cpdef int _split_one(self, unicode word): + cdef int _split_one(self, unicode word): cdef size_t length = len(word) cdef int i = 0 if word.startswith("'s") or word.startswith("'S"): diff --git a/spacy/lang.pxd b/spacy/lang.pxd index 43e21577b..6224f9fb5 100644 --- a/spacy/lang.pxd +++ b/spacy/lang.pxd @@ -4,21 +4,22 @@ from spacy.word cimport Lexeme cdef class Lexicon: - cdef list string_features - cdef list flag_features - - cdef dict _dict - cpdef Lexeme lookup(self, unicode string) + + cdef dict _dict + + cdef list _string_features + cdef list _flag_features cdef class Language: - cdef object name + cdef unicode name cdef dict cache cpdef readonly Lexicon lexicon cpdef list tokenize(self, unicode text) + cpdef Lexeme lookup(self, unicode text) cdef list _tokenize(self, unicode string) - cpdef list _split(self, unicode string) - cpdef int _split_one(self, unicode word) + cdef list _split(self, unicode string) + cdef int _split_one(self, unicode word) diff --git a/spacy/lang.pyx b/spacy/lang.pyx index 221e25b6e..a572811bf 100644 --- a/spacy/lang.pyx +++ b/spacy/lang.pyx @@ -41,7 +41,7 @@ cdef class Language: rules, words, probs, clusters, case_stats, tag_stats = lang_data self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats, string_features, flag_features) - self.load_special_tokenization(rules) + self._load_special_tokenization(rules) cpdef list tokenize(self, unicode string): """Tokenize a string. @@ -75,6 +75,17 @@ cdef class Language: assert tokens return tokens + cpdef Lexeme lookup(self, unicode string): + """Retrieve (or create, if not found) a Lexeme for a string, and return it. + + Args: + string (unicode): The string to be looked up. Must be unicode, not bytes. + + Returns: + lexeme (Lexeme): A reference to a lexical type. + """ + return self.lexicon.lookup(string) + cdef list _tokenize(self, unicode string): if string in self.cache: return self.cache[string] @@ -85,7 +96,7 @@ cdef class Language: self.cache[string] = lexemes return lexemes - cpdef list _split(self, unicode string): + cdef list _split(self, unicode string): """Find how to split a contiguous span of non-space characters into substrings. This method calls find_split repeatedly. Most languages will want to @@ -107,10 +118,10 @@ cdef class Language: string = string[split:] return substrings - cpdef int _split_one(self, unicode word): + cdef int _split_one(self, unicode word): return len(word) - def load_special_tokenization(self, token_rules): + def _load_special_tokenization(self, token_rules): '''Load special-case tokenization rules. 
Loads special-case tokenization rules into the Language.cache cache, @@ -132,14 +143,14 @@ cdef class Language: cdef class Lexicon: def __cinit__(self, words, probs, clusters, case_stats, tag_stats, string_features, flag_features): - self.flag_features = flag_features - self.string_features = string_features + self._flag_features = flag_features + self._string_features = string_features self._dict = {} cdef Lexeme word for string in words: word = Lexeme(string, probs.get(string, 0.0), clusters.get(string, 0), case_stats.get(string, {}), tag_stats.get(string, {}), - self.string_features, self.flag_features) + self._string_features, self._flag_features) self._dict[string] = word cpdef Lexeme lookup(self, unicode string): @@ -155,7 +166,7 @@ cdef class Lexicon: if string in self._dict: return self._dict[string] - cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self.string_features, - self.flag_features) + cdef Lexeme word = Lexeme(string, 0, 0, {}, {}, self._string_features, + self._flag_features) self._dict[string] = word return word
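
Usage sketch (not part of the patch): after this change the Python-visible surface is `EN.tokenize()`, the new `Language.lookup()` shim, and the readonly `lexicon` attribute, while `_split()`, `_split_one()` and the Lexicon feature lists become `cdef`/underscore-prefixed internals. The import path `from spacy.en import EN` and the `.string` attribute on `Lexeme` are assumed from the doctests above, so treat this as an illustration rather than a tested example.

    # Assumes the module-level `EN` English instance referenced in the doctests above.
    from spacy.en import EN

    # tokenize() stays cpdef, so it remains callable from Python.
    tokens = EN.tokenize(u"isn't New York-based")
    print([w.string for w in tokens])

    # The new Language.lookup() delegates to Lexicon.lookup(); both return the
    # same cached Lexeme, per the Lexicon.lookup() caching shown above, so
    # callers no longer need to reach through EN.lexicon directly.
    lexeme = EN.lookup(u"York")
    assert lexeme is EN.lexicon.lookup(u"York")

    # _split(), _split_one() and the _string_features/_flag_features lists are
    # now cdef / underscore-prefixed internals and are not reachable from here.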