diff --git a/spacy/en.pxd b/spacy/en.pxd
index 9f0edb791..ed08a144d 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -10,6 +10,7 @@ from spacy.tokens cimport Tokens
 
 cdef class English(spacy.Language):
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1
 
 
 cdef English EN
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 3b7d506fa..bead7205d 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -1,7 +1,8 @@
 # cython: profile=True
-'''Serve pointers to Lexeme structs, given strings. Maintain a reverse index,
-so that strings can be retrieved from hashes. Use 64-bit hash values and
-boldly assume no collisions.
+# cython: embedsignature=True
+'''Tokenize English text, allowing some differences from the Penn Treebank
+tokenization, e.g. for email addresses, URLs, etc. Use en_ptb if full PTB
+compatibility is the priority.
 '''
 from __future__ import unicode_literals
 
@@ -9,14 +10,17 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 from libcpp.vector cimport vector
 
-from spacy.string_tools cimport substr
-
-from . import util
-
 cimport spacy
+from spacy.orthography.latin cimport *
+
+
+
 
 
 cdef class English(spacy.Language):
+    cdef int set_orth(self, unicode word, Lexeme* lex) except -1:
+        pass
+
     cdef int find_split(self, unicode word):
         cdef size_t length = len(word)
         cdef int i = 0
@@ -26,17 +30,17 @@ cdef class English(spacy.Language):
         if word.endswith("'s") and length >= 3:
             return length - 2
         # Leading punctuation
-        if is_punct(word, 0, length):
+        if check_punct(word, 0, length):
            return 1
         elif length >= 1:
             # Split off all trailing punctuation characters
             i = 0
-            while i < length and not is_punct(word, i, length):
+            while i < length and not check_punct(word, i, length):
                 i += 1
             return i
 
 
-cdef bint is_punct(unicode word, size_t i, size_t length):
+cdef bint check_punct(unicode word, size_t i, size_t length):
     # Don't count appostrophes as punct if the next char is a letter
     if word[i] == "'" and i < (length - 1) and word[i+1].isalpha():
         return i == 0
@@ -55,14 +59,52 @@ EN = English('en')
 
 
 cpdef Tokens tokenize(unicode string):
+    """Tokenize a string.
+
+    Wraps EN.tokenize, where EN is an instance of the class English. The global
+    variable manages the vocabulary, and memoizes tokenization rules.
+
+    Args:
+        string (unicode): The string to be split. Must be unicode, not bytes.
+
+    Returns:
+        tokens (Tokens): A Tokens instance, managing a vector of pointers to
+            Lexeme structs. The Tokens instance supports sequence interfaces,
+            but also offers a range of sequence-level operations, which are computed
+            efficiently in Cython-space.
+    """
     return EN.tokenize(string)
 
 
 cpdef Lexeme_addr lookup(unicode string) except 0:
+    """Retrieve (or create) a Lexeme for a string.
+
+    Returns a Lexeme ID, which can be used via the accessor
+    methods in spacy.lexeme
+
+    Args:
+        string (unicode): The string to be looked up. Must be unicode, not bytes.
+
+    Returns:
+        LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved.
+            The LexemeID is really a memory address, making dereferencing it essentially
+            free.
+    """
     return EN.lookup(string)
 
 
 cpdef unicode unhash(StringHash hash_value):
+    """Retrieve a string from a hash value. Mostly used for testing.
+
+    In general you should avoid computing with strings, as they are slower than
+    the intended ID-based usage. However, strings can be recovered if necessary,
+    although no control is taken for hash collisions.
+
+    Args:
+        hash_value (uint32_t): The hash of a string, returned by Python's hash()
+            function.
+
+    Returns:
+        string (unicode): A unicode string that hashes to the hash_value.
+    """
     return EN.unhash(hash_value)
-
-
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 6d944eb25..91f5b0884 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -1,83 +1,34 @@
 from libc.stdint cimport uint32_t
 from libc.stdint cimport uint64_t
 
-# Put these above import to avoid circular import problem
+
 ctypedef int ClusterID
 ctypedef uint32_t StringHash
-ctypedef size_t Lexeme_addr
-ctypedef char Bits8
-ctypedef uint64_t Bits64
-
-
-cdef enum OrthFlag:
-    IS_ALPHA
-    IS_DIGIT
-    IS_PUNCT
-    IS_WHITE
-    IS_LOWER
-    IS_UPPER
-    IS_TITLE
-    IS_ASCII
-
-
-cdef enum DistFlag:
-    OFT_UPPER
-    OFT_TITLE
-    DIST_FLAG3
-    DIST_FLAG4
-    DIST_FLAG5
-    DIST_FLAG6
-    DIST_FLAG7
-    DIST_FLAG8
-
-
-cdef struct Orthography:
-    StringHash shape
-    StringHash norm
-    StringHash last3
-    Bits8 flags
-
-
-cdef struct Distribution:
-    double prob
-    ClusterID cluster
-    Bits64 tagdict
-    Bits8 flags
+ctypedef size_t LexID
+ctypedef char OrthFlags
+ctypedef char DistFlags
+ctypedef uint64_t TagFlags
 
 
 cdef struct Lexeme:
+    StringHash lex
     char* string
     size_t length
-    StringHash lex
-    Orthography orth        # Extra orthographic views
-    Distribution dist       # Distribution info
+    double prob
+    ClusterID cluster
+    TagFlags possible_tags
+    DistFlags dist_flags
+    OrthFlags orth_flags
+    StringHash* string_views
 
-cdef Lexeme BLANK_WORD = Lexeme(NULL, 0, 0,
-    Orthography(0, 0, 0, 0),
-    Distribution(0.0, 0, 0, 0)
-)
+cpdef char first_of(LexID lex_id) except 0
+cpdef size_t length_of(LexID lex_id) except 0
+cpdef double prob_of(LexID lex_id) except 0
+cpdef ClusterID cluster_of(LexID lex_id) except 0
+cpdef bint check_tag_flag(LexID lex, TagFlags flag) except *
+cpdef bint check_dist_flag(LexID lex, DistFlags flag) except *
+cpdef bint check_orth_flag(LexID lex, OrthFlags flag) except *
 
-cdef enum StringAttr:
-    LEX
-    NORM
-    SHAPE
-    LAST3
-    LENGTH
-
-
-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0
-
-cpdef StringHash lex_of(size_t lex_id) except 0
-cpdef StringHash norm_of(size_t lex_id) except 0
-cpdef StringHash shape_of(size_t lex_id) except 0
-cpdef StringHash last3_of(size_t lex_id) except 0
-
-cpdef size_t length_of(size_t lex_id) except *
-
-cpdef double prob_of(size_t lex_id) except 0
-cpdef ClusterID cluster_of(size_t lex_id) except 0
-
-cpdef bint check_orth_flag(size_t lex, OrthFlag flag) except *
-cpdef bint check_dist_flag(size_t lex, DistFlag flag) except *
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 37392637b..78c98d045 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,32 +1,32 @@
 # cython: profile=True
+# cython: embedsignature=True
 '''Accessors for Lexeme properties, given a lex_id, which is cast to a Lexeme*.
 Mostly useful from Python-space. From Cython-space, you can just cast to
 Lexeme* yourself.
 '''
 from __future__ import unicode_literals
 
-from spacy.string_tools cimport substr
-
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
-from libcpp.vector cimport vector
 
 from spacy.spacy cimport StringHash
 
 
-cpdef StringHash attr_of(size_t lex_id, StringAttr attr) except 0:
-    if attr == LEX:
-        return lex_of(lex_id)
-    elif attr == NORM:
-        return norm_of(lex_id)
-    elif attr == SHAPE:
-        return shape_of(lex_id)
-    elif attr == LAST3:
-        return last3_of(lex_id)
-    elif attr == LENGTH:
-        return length_of(lex_id)
-    else:
-        raise StandardError
+cpdef int set_flags(LexID lex_id, object active_flags) except *:
+    """Set orthographic bit flags for a Lexeme.
+
+    Args:
+        lex_id (LexemeID): A reference ID for a Lexeme.
+        active_flags: A sequence of bits to set as True.
+    """
+    cdef size_t flag
+    cdef Lexeme* w = <Lexeme*>lex_id
+    for flag in active_flags:
+        w.orth_flags |= 1 << flag
+
+
+cpdef StringHash view_of(LexID lex_id, size_t view) except 0:
+    return (<Lexeme*>lex_id).string_views[view]
 
 
 cpdef StringHash lex_of(size_t lex_id) except 0:
@@ -37,42 +37,14 @@ cpdef StringHash lex_of(size_t lex_id) except 0:
     delimited tokens split off. The other fields refer to properties of the
     string that the lex field stores a hash of, except sic and tail.
 
-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
+    >>> from spacy import en
+    >>> [en.unhash(lex_of(lex_id) for lex_id in en.tokenize(u'Hi! world')]
     [u'Hi', u'!', u'world']
     '''
     return (<Lexeme*>lex_id).lex
 
 
-cpdef StringHash norm_of(size_t lex_id) except 0:
-    '''Access the `lex' field of the Lexeme pointed to by lex_id.
-
-    The lex field is the hash of the string you would expect to get back from
-    a standard tokenizer, i.e. the word with punctuation and other non-whitespace
-    delimited tokens split off. The other fields refer to properties of the
-    string that the lex field stores a hash of, except sic and tail.
-
-    >>> [unhash(lex_of(lex_id) for lex_id in from_string(u'Hi! world')]
-    [u'Hi', u'!', u'world']
-    '''
-    return (<Lexeme*>lex_id).orth.norm
-
-
-cpdef StringHash shape_of(size_t lex_id) except 0:
-    return (<Lexeme*>lex_id).orth.shape
-
-
-cpdef StringHash last3_of(size_t lex_id) except 0:
-    '''Access the `last3' field of the Lexeme pointed to by lex_id, which stores
-    the hash of the last three characters of the word:
-
-    >>> lex_ids = [lookup(w) for w in (u'Hello', u'!')]
-    >>> [unhash(last3_of(lex_id)) for lex_id in lex_ids]
-    [u'llo', u'!']
-    '''
-    return (<Lexeme*>lex_id).orth.last3
-
-
-cpdef ClusterID cluster_of(size_t lex_id) except 0:
+cpdef ClusterID cluster_of(LexID lex_id) except 0:
     '''Access the `cluster' field of the Lexeme pointed to by lex_id, which
     gives an integer representation of the cluster ID of the word, which should
     be understood as a binary address:
@@ -88,10 +60,10 @@ cpdef ClusterID cluster_of(size_t lex_id) except 0:
     while "dapple" is totally different. On the other hand, "scalable" receives
     the same cluster ID as "pineapple", which is not what we'd like.
     '''
-    return (<Lexeme*>lex_id).dist.cluster
+    return (<Lexeme*>lex_id).cluster
 
 
-cpdef Py_UNICODE first_of(size_t lex_id):
+cpdef char first_of(size_t lex_id) except 0:
     '''Access the `first' field of the Lexeme pointed to by lex_id, which stores
     the first character of the lex string of the word.
 
@@ -99,10 +71,10 @@ cpdef Py_UNICODE first_of(size_t lex_id):
     >>> unhash(first_of(lex_id))
     u'H'
     '''
-    return (<Lexeme*>lex_id).orth.first
+    return (<Lexeme*>lex_id).string[0]
 
 
-cpdef size_t length_of(size_t lex_id) except *:
+cpdef size_t length_of(size_t lex_id) except 0:
     '''Access the `length' field of the Lexeme pointed to by lex_id, which stores
     the length of the string hashed by lex_of.'''
     cdef Lexeme* word = <Lexeme*>lex_id
@@ -119,8 +91,10 @@ cpdef double prob_of(size_t lex_id) except 0:
     >>> prob_of(lookup(u'world'))
     -20.10340371976182
     '''
-    return (<Lexeme*>lex_id).dist.prob
+    return (<Lexeme*>lex_id).prob
 
+DEF OFT_UPPER = 1
+DEF OFT_TITLE = 2
 
 cpdef bint is_oft_upper(size_t lex_id):
     '''Access the `oft_upper' field of the Lexeme pointed to by lex_id, which
@@ -134,7 +108,7 @@ cpdef bint is_oft_upper(size_t lex_id):
     >>> is_oft_upper(lookup(u'aBc')) # This must get the same answer
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_UPPER
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_UPPER)
 
 
 cpdef bint is_oft_title(size_t lex_id):
@@ -149,11 +123,15 @@ cpdef bint is_oft_title(size_t lex_id):
     >>> is_oft_title(lookup(u'MARCUS')) # This must get the same value
     True
     '''
-    return (<Lexeme*>lex_id).dist.flags & OFT_TITLE
+    return (<Lexeme*>lex_id).dist_flags & (1 << OFT_TITLE)
 
 
-cpdef bint check_orth_flag(size_t lex_id, OrthFlag flag) except *:
-    return (<Lexeme*>lex_id).orth.flags & (1 << flag)
+cpdef bint check_orth_flag(size_t lex_id, OrthFlags flag) except *:
+    return (<Lexeme*>lex_id).orth_flags & (1 << flag)
 
-cpdef bint check_dist_flag(size_t lex_id, DistFlag flag) except *:
-    return (<Lexeme*>lex_id).dist.flags & (1 << flag)
+cpdef bint check_dist_flag(size_t lex_id, DistFlags flag) except *:
+    return (<Lexeme*>lex_id).dist_flags & (1 << flag)
+
+
+cpdef bint check_tag_flag(LexID lex_id, TagFlags flag) except *:
+    return (<Lexeme*>lex_id).possible_tags & (1 << flag)
diff --git a/spacy/spacy.pxd b/spacy/spacy.pxd
index b9caac34f..e1e7a8c7f 100644
--- a/spacy/spacy.pxd
+++ b/spacy/spacy.pxd
@@ -19,8 +19,6 @@ ctypedef int ClusterID
 
 
 from spacy.lexeme cimport Lexeme
-from spacy.lexeme cimport Distribution
-from spacy.lexeme cimport Orthography
 
 
 cdef class Language:
@@ -29,7 +27,7 @@ cdef class Language:
     cdef dense_hash_map[StringHash, size_t] vocab
     cdef dict bacov
 
-    cdef Tokens tokenize(self, unicode text)
+    cpdef Tokens tokenize(self, unicode text)
 
     cdef Lexeme* lookup(self, unicode string) except NULL
     cdef Lexeme** lookup_chunk(self, unicode chunk) except NULL
@@ -37,7 +35,8 @@ cdef class Language:
     cdef Lexeme** new_chunk(self, unicode string, list substrings) except NULL
     cdef Lexeme* new_lexeme(self, unicode lex) except NULL
 
-    cdef unicode unhash(self, StringHash hashed)
+    cpdef unicode unhash(self, StringHash hashed)
 
-    cpdef list find_substrings(self, unicode word)
+    cpdef list find_substrings(self, unicode chunk)
     cdef int find_split(self, unicode word)
+    cdef int set_orth(self, unicode string, Lexeme* word)
diff --git a/spacy/spacy.pyx b/spacy/spacy.pyx
index a8b4ebe74..addb76b39 100644
--- a/spacy/spacy.pyx
+++ b/spacy/spacy.pyx
@@ -1,4 +1,13 @@
 # cython: profile=True
+# cython: embedsignature=True
+"""Common classes and utilities across languages.
+
+Provides the main implementation for the spacy tokenizer. Specific languages
+subclass the Language class, over-writing the tokenization rules as necessary.
+Special-case tokenization rules are read from data/<lang>/tokenization .
+""" + + from __future__ import unicode_literals from libc.stdlib cimport calloc, free @@ -6,54 +15,13 @@ from libcpp.pair cimport pair from cython.operator cimport dereference as deref from spacy.lexeme cimport Lexeme -from spacy.lexeme cimport BLANK_WORD - -from spacy.string_tools cimport substr +from spacy.lexeme cimport LexID from . import util from os import path -DIST_FLAGS = {} TAGS = {} - - -def get_normalized(unicode lex): - if lex.isalpha() and lex.islower(): - return lex - else: - return get_word_shape(lex) - - -def get_word_shape(unicode lex): - cdef size_t length = len(lex) - shape = "" - last = "" - shape_char = "" - seq = 0 - for c in lex: - if c.isalpha(): - if c.isupper(): - shape_char = "X" - else: - shape_char = "x" - elif c.isdigit(): - shape_char = "d" - else: - shape_char = c - if shape_char == last: - seq += 1 - else: - seq = 0 - last = shape_char - if seq < 3: - shape += shape_char - assert shape - return shape - - -def set_orth_flags(lex): - return 0 - +DIST_FLAGS = {} cdef class Language: def __cinit__(self, name): @@ -64,9 +32,19 @@ cdef class Language: self.chunks.set_empty_key(0) self.vocab.set_empty_key(0) self.load_tokenization(util.read_tokenization(name)) - #self.load_dist_info(util.read_dist_info(name)) + self.load_dist_info(util.read_dist_info(name)) - cdef Tokens tokenize(self, unicode string): + cpdef Tokens tokenize(self, unicode string): + """Tokenize. + + Split the string into tokens. + + Args: + string (unicode): The string to split. + + Returns: + tokens (Tokens): A Tokens object. + """ cdef Lexeme** chunk cdef Tokens tokens = Tokens(self) cdef size_t length = len(string) @@ -85,8 +63,7 @@ cdef class Language: return tokens cdef Lexeme* lookup(self, unicode string) except NULL: - if len(string) == 0: - return &BLANK_WORD + assert len(string) != 0 cdef Lexeme* word = self.vocab[hash(string)] if word == NULL: word = self.new_lexeme(string) @@ -113,56 +90,79 @@ cdef class Language: cdef bytes byte_string = string.encode('utf8') word.string = byte_string word.length = len(byte_string) - word.orth.flags = set_orth_flags(string) - cdef unicode norm = get_normalized(string) - cdef unicode shape = get_word_shape(string) - cdef unicode last3 = string[-3:] - word.lex = hash(string) - word.orth.norm = hash(norm) - word.orth.shape = hash(shape) - word.orth.last3 = hash(last3) - self.bacov[word.lex] = string - self.bacov[word.orth.norm] = norm - self.bacov[word.orth.shape] = shape - self.bacov[word.orth.last3] = last3 + self.set_orth(string, word) - self.vocab[hash(string)] = word + word.lex = hash(string) + self.bacov[word.lex] = string + self.vocab[word.lex] = word return word - cdef unicode unhash(self, StringHash hash_value): + cpdef unicode unhash(self, StringHash hash_value): '''Fetch a string from the reverse index, given its hash value.''' return self.bacov[hash_value] - cpdef list find_substrings(self, unicode word): + cpdef list find_substrings(self, unicode chunk): + """Find how to split a chunk into substrings. + + This method calls find_split repeatedly. Most languages will want to + override find_split, but it may be useful to override this instead. + + Args: + chunk (unicode): The string to be split, e.g. u"Mike's!" + + Returns: + substrings (list): The component substrings, e.g. [u"Mike", "'s", "!"]. 
+ """ substrings = [] - while word: - split = self.find_split(word) + while chunk: + split = self.find_split(chunk) if split == 0: - substrings.append(word) + substrings.append(chunk) break - substrings.append(word[:split]) - word = word[split:] + substrings.append(chunk[:split]) + chunk = chunk[split:] return substrings cdef int find_split(self, unicode word): return len(word) - def load_tokenization(self, token_rules=None): + cdef int set_orth(self, unicode string, Lexeme* word): + pass + + def load_tokenization(self, token_rules): + '''Load special-case tokenization rules. + + Loads special-case tokenization rules into the Language.chunk cache, + read from data//tokenization . The special cases are loaded before + any language data is tokenized, giving these priority. For instance, + the English tokenization rules map "ain't" to ["are", "not"]. + + Args: + token_rules (list): A list of (chunk, tokens) pairs, where chunk is + a string and tokens is a list of strings. + ''' for chunk, tokens in token_rules: self.new_chunk(chunk, tokens) def load_dist_info(self, dist_info): + '''Load distributional information for the known lexemes of the language. + + The distributional information is read from data//dist_info.json . + It contains information like the (smoothed) unigram log probability of + the word, how often the word is found upper-cased, how often the word + is found title-cased, etc. + ''' cdef unicode string cdef dict word_dist cdef Lexeme* w for string, word_dist in dist_info.items(): w = self.lookup(string) - w.dist.prob = word_dist.prob - w.dist.cluster = word_dist.cluster + w.prob = word_dist.prob + w.cluster = word_dist.cluster for flag in word_dist.flags: - w.dist.flags |= DIST_FLAGS[flag] + w.dist_flags |= DIST_FLAGS[flag] for tag in word_dist.tagdict: - w.dist.tagdict |= TAGS[tag] + w.possible_tags |= TAGS[tag] cdef inline bint _is_whitespace(Py_UNICODE c) nogil: diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index ba692280f..ad3fd8e7b 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -4,7 +4,6 @@ from spacy.lexeme cimport Lexeme from cython.operator cimport dereference as deref from spacy.spacy cimport Language -from spacy.lexeme cimport StringAttr cdef class Tokens: @@ -15,5 +14,5 @@ cdef class Tokens: cpdef int append(self, Lexeme_addr token) cpdef int extend(self, Tokens other) except -1 - cpdef object group_by(self, StringAttr attr) - cpdef dict count_by(self, StringAttr attr) + cpdef object group_by(self, size_t attr) + cpdef dict count_by(self, size_t attr) diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx index 1b0d42981..9aaf08106 100644 --- a/spacy/tokens.pyx +++ b/spacy/tokens.pyx @@ -3,7 +3,7 @@ from cython.operator cimport preincrement as inc from spacy.lexeme cimport Lexeme -from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of +#from spacy.lexeme cimport attr_of, lex_of, norm_of, shape_of from spacy.spacy cimport StringHash @@ -37,7 +37,7 @@ cdef class Tokens: for el in other: self.append(el) - cpdef object group_by(self, StringAttr attr): + cpdef object group_by(self, size_t attr): '''Group tokens that share the property attr into Tokens instances, and return a list of them. 
         Returns a tuple of three lists:
         cdef StringHash key
         cdef Lexeme_addr t
         for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key in indices:
                 groups[indices[key]].append(t)
             else:
@@ -77,12 +78,13 @@ cdef class Tokens:
                 groups[-1].append(t)
         return names, hashes, groups
 
-    cpdef dict count_by(self, StringAttr attr):
+    cpdef dict count_by(self, size_t attr):
         counts = {}
         cdef Lexeme_addr t
         cdef StringHash key
        for t in self.vctr[0]:
-            key = attr_of(t, attr)
+            #key = attr_of(t, attr)
+            key = 0
             if key not in counts:
                 counts[key] = 0
             counts[key] += 1
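
A note on the bit-flag scheme this patch moves to: distributional and orthographic properties now live in integer fields on the Lexeme struct (dist_flags, orth_flags, possible_tags), set with flags |= 1 << flag and tested with flags & (1 << flag), as in set_flags, check_dist_flag and check_tag_flag above. The sketch below is a plain-Python illustration of that pattern, not part of the patch itself; OFT_UPPER and OFT_TITLE mirror the DEF constants in lexeme.pyx, while the helper names and the standalone usage are hypothetical.

# Plain-Python sketch of the flag handling in lexeme.pyx (illustrative only).
# An ordinary int stands in for a Lexeme's dist_flags field.

OFT_UPPER = 1   # mirrors DEF OFT_UPPER = 1 in lexeme.pyx
OFT_TITLE = 2   # mirrors DEF OFT_TITLE = 2 in lexeme.pyx


def set_flags(flags, active_flags):
    # Same pattern as cpdef set_flags: turn on one bit per active flag.
    for flag in active_flags:
        flags |= 1 << flag
    return flags


def check_flag(flags, flag):
    # Same pattern as check_dist_flag / check_orth_flag / check_tag_flag.
    return bool(flags & (1 << flag))


if __name__ == '__main__':
    dist_flags = set_flags(0, [OFT_UPPER])
    assert check_flag(dist_flags, OFT_UPPER)
    assert not check_flag(dist_flags, OFT_TITLE)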