Matthew Honnibal 2015-07-25 21:14:07 +02:00
commit 2e6a60eaec
52 changed files with 317092 additions and 195 deletions

View File

@ -11,11 +11,18 @@ python:
# install dependencies
install:
- "pip install --upgrade setuptools"
- "rm -rf spacy/"
- "pip install spacy"
- "pip install cython fabric fabtools"
- "pip install -r requirements.txt"
- "python setup.py build_ext --inplace"
- "mkdir -p corpora/en"
- "cd corpora/en"
- "wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz"
- "tar -xzf WordNet-3.0.tar.gz"
- "mv WordNet-3.0 wordnet"
- "cd ../../"
- "export PYTHONPATH=`pwd`"
- "python bin/init_model.py lang_data/en corpora/en spacy/en/data"
# run tests
script:
- py.test tests/tokenizer/
- py.test tests/vocab/
- py.test tests/tagger/
- "py.test tests/ -x"

bin/gather_freqs.py (new file, 27 lines)
View File

@ -0,0 +1,27 @@
import plac
def main(in_loc, out_loc):
out_file = open(out_loc, 'w')
this_key = None
this_freq = 0
df = 0
for line in open(in_loc):
line = line.strip()
if not line:
continue
freq, key = line.split('\t', 1)
freq = int(freq)
if this_key is not None and key != this_key:
out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
this_key = key
this_freq = freq
df = 1
else:
this_freq += freq
df += 1
out_file.write('%d\t%d\t%s\n' % (this_freq, df, this_key))
out_file.close()
if __name__ == '__main__':
plac.call(main)
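The new script collapses consecutive per-document counts for the same key into a single record: the input is expected to be sorted by key, one "freq<tab>key" pair per line, and each output line is "total_freq<tab>doc_freq<tab>key". A minimal sketch of the same aggregation (an illustration, not the script itself) using itertools.groupby:

import sys
from itertools import groupby

def gather(lines):
    # Each line is "<freq>\t<key>"; lines sharing a key are assumed adjacent,
    # i.e. the input is already sorted by key, as gather_freqs.py expects.
    pairs = (line.rstrip('\n').split('\t', 1) for line in lines if line.strip())
    for key, group in groupby(pairs, key=lambda p: p[1]):
        counts = [int(freq) for freq, _ in group]
        # total frequency, document frequency, key
        yield '%d\t%d\t%s' % (sum(counts), len(counts), key)

if __name__ == '__main__':
    for row in gather(sys.stdin):
        print(row)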

View File

@ -15,6 +15,8 @@ Requires:
* clusters.txt --- Output of hierarchical clustering, e.g. Brown clusters
* vectors.tgz --- output of something like word2vec
"""
from __future__ import unicode_literals
import plac
from pathlib import Path
@ -45,7 +47,7 @@ def setup_tokenizer(lang_data_dir, tok_dir):
def _read_clusters(loc):
if not loc.exists():
print "Warning: Clusters file not found"
print("Warning: Clusters file not found")
return {}
clusters = {}
for line in codecs.open(str(loc), 'r', 'utf8'):
@ -60,7 +62,7 @@ def _read_clusters(loc):
else:
clusters[word] = '0'
# Expand clusters with re-casing
for word, cluster in clusters.items():
for word, cluster in list(clusters.items()):
if word.lower() not in clusters:
clusters[word.lower()] = cluster
if word.title() not in clusters:
@ -72,7 +74,7 @@ def _read_clusters(loc):
def _read_probs(loc):
if not loc.exists():
print "Warning: Probabilities file not found"
print("Warning: Probabilities file not found")
return {}
probs = {}
for i, line in enumerate(codecs.open(str(loc), 'r', 'utf8')):
@ -85,7 +87,7 @@ def _read_probs(loc):
def _read_senses(loc):
lexicon = defaultdict(lambda: defaultdict(list))
if not loc.exists():
print "Warning: WordNet senses not found"
print("Warning: WordNet senses not found")
return lexicon
sense_names = dict((s, i) for i, s in enumerate(spacy.senses.STRINGS))
pos_ids = {'noun': NOUN, 'verb': VERB, 'adjective': ADJ}
@ -109,13 +111,20 @@ def setup_vocab(src_dir, dst_dir):
if vectors_src.exists():
write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin'))
else:
print "Warning: Word vectors file not found"
print("Warning: Word vectors file not found")
vocab = Vocab(data_dir=None, get_lex_props=get_lex_props)
clusters = _read_clusters(src_dir / 'clusters.txt')
probs = _read_probs(src_dir / 'words.sgt.prob')
lemmatizer = Lemmatizer(str(src_dir / 'wordnet'), NOUN, VERB, ADJ)
if not probs:
min_prob = 0.0
else:
min_prob = min(probs.values())
for word in clusters:
if word not in probs:
probs[word] = min_prob
lexicon = []
for word, prob in reversed(sorted(probs.items(), key=lambda item: item[1])):
for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])):
entry = get_lex_props(word)
if word in clusters or float(prob) >= -17:
entry['prob'] = float(prob)
@ -144,7 +153,7 @@ def main(lang_data_dir, corpora_dir, model_dir):
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
setup_vocab(corpora_dir, model_dir / 'vocab')
if not (model_dir / 'wordnet').exists():
copytree(str(corpora_dir / 'wordnet'), str(model_dir / 'wordnet'))
copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet'))
if __name__ == '__main__':

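The added lines give every word from clusters.txt a probability even when it is missing from words.sgt.prob, falling back to the smallest probability seen, so clustered words still make it into the lexicon. An illustration with made-up words and log-probabilities:

# Made-up data; the real values come from words.sgt.prob and clusters.txt.
probs = {'the': -3.5, 'dog': -9.1}
clusters = {'the': '1111', 'dog': '10110', 'frobnicate': '00101'}
min_prob = min(probs.values()) if probs else 0.0
for word in clusters:
    if word not in probs:
        probs[word] = min_prob
assert probs['frobnicate'] == -9.1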
View File

@ -1,6 +1,7 @@
#!/usr/bin/env python
from __future__ import division
from __future__ import unicode_literals
from __future__ import print_function
import os
from os import path
@ -107,7 +108,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp = Language(data_dir=model_dir)
print "Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %"
print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %")
for itn in range(n_iter):
scorer = Scorer()
loss = 0
@ -138,9 +139,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
nlp.entity.train(tokens, gold)
nlp.tagger.train(tokens, gold.tags)
random.shuffle(gold_tuples)
print '%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc)
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
scorer.tags_acc,
scorer.token_acc))
nlp.end_training()
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
@ -219,14 +220,14 @@ def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbos
# write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width)
scorer = evaluate(English, list(read_json_file(dev_loc)),
model_dir, gold_preproc=gold_preproc, verbose=verbose)
print 'TOK', scorer.token_acc
print 'POS', scorer.tags_acc
print 'UAS', scorer.uas
print 'LAS', scorer.las
print('TOK', scorer.token_acc)
print('POS', scorer.tags_acc)
print('UAS', scorer.uas)
print('LAS', scorer.las)
print 'NER P', scorer.ents_p
print 'NER R', scorer.ents_r
print 'NER F', scorer.ents_f
print('NER P', scorer.ents_p)
print('NER R', scorer.ents_r)
print('NER F', scorer.ents_f)
if __name__ == '__main__':

corpora/en/clusters.txt (new file, 316709 lines)

File diff suppressed because it is too large

View File

@ -2,7 +2,7 @@ cython
cymem == 1.11
pathlib
preshed == 0.37
thinc == 3.2
thinc == 3.3
murmurhash == 0.24
unidecode
numpy

View File

@ -120,7 +120,7 @@ def run_setup(exts):
ext_modules=exts,
license="Dual: Commercial or AGPL",
install_requires=['numpy', 'murmurhash', 'cymem >= 1.11', 'preshed == 0.37',
'thinc == 3.2', "unidecode", 'wget', 'plac', 'six',
'thinc == 3.3', "unidecode", 'wget', 'plac', 'six',
'ujson'],
setup_requires=["headers_workaround"],
)
@ -162,6 +162,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
'spacy.gold', 'spacy.orth',
'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
'spacy.cfile',
'spacy.syntax.ner']

spacy/cfile.pxd (new file, 12 lines)
View File

@ -0,0 +1,12 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
cdef class CFile:
cdef FILE* fp
cdef bint is_open
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *

spacy/cfile.pyx (new file, 40 lines)
View File

@ -0,0 +1,40 @@
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
cdef class CFile:
def __init__(self, loc, mode):
if isinstance(mode, unicode):
mode_str = mode.encode('ascii')
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode_str)
if self.fp == NULL:
raise IOError("Could not open binary file %s" % bytes_loc)
self.is_open = True
def __dealloc__(self):
if self.is_open:
fclose(self.fp)
def close(self):
fclose(self.fp)
self.is_open = False
cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
st = fread(dest, elem_size, number, self.fp)
if st != number:
raise IOError
cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
st = fwrite(src, elem_size, number, self.fp)
if st != number:
raise IOError
cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
cdef void* dest = mem.alloc(number, elem_size)
self.read_into(dest, number, elem_size)
return dest
def write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)

View File

@ -95,15 +95,15 @@ class English(object):
self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
if Tagger:
if Tagger and path.exists(path.join(data_dir, 'pos')):
self.tagger = Tagger(self.vocab.strings, data_dir)
else:
self.tagger = None
if Parser:
if Parser and path.exists(path.join(data_dir, 'deps')):
self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
else:
self.parser = None
if Entity:
if Entity and path.exists(path.join(data_dir, 'ner')):
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
else:
self.entity = None
@ -153,15 +153,14 @@ class English(object):
self.tagger.model.end_training()
self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
packer = Packer(self.vocab, [
(TAG, self.tagger.moves.freqs[TAG].items()),
(HEAD, self.parser.moves.freqs[HEAD].items()),
(DEP, self.parser.moves.freqs[DEP].items()),
(ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
(ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items())
])
packer.dump(path.join(data_dir, 'vocab'))
with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
file_.write(
json.dumps([
(TAG, self.tagger.freqs[TAG].items()),
(DEP, self.parser.moves.freqs[DEP].items()),
(ENT_IOB, self.entity.moves.freqs[ENT_IOB].items()),
(ENT_TYPE, self.entity.moves.freqs[ENT_TYPE].items()),
(HEAD, self.parser.moves.freqs[HEAD].items())]))
@property
def tags(self):

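The pipeline components are now optional: each one is only constructed if its data directory ('pos', 'deps', 'ner') actually exists, and is left as None otherwise. A usage sketch, assuming the model directory the Travis configuration builds:

import os
from spacy.en import English

# 'spacy/en/data' is the directory bin/init_model.py writes in .travis.yml;
# override it with SPACY_DATA, as the tests do.
data_dir = os.environ.get('SPACY_DATA', 'spacy/en/data')
nlp = English(data_dir=data_dir)
if nlp.parser is None:
    print("No 'deps' data found; dependency heads won't be set.")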
View File

@ -14,6 +14,9 @@ from ..attrs cimport LEMMA as _LEMMA
from ..attrs cimport POS as _POS
from ..attrs cimport TAG as _TAG
from ..attrs cimport DEP as _DEP
from ..attrs cimport HEAD as _HEAD
from ..attrs cimport ENT_IOB as _ENT_IOB
from ..attrs cimport ENT_TYPE as _ENT_TYPE
cpdef enum:

View File

@ -262,6 +262,9 @@ cdef class EnPosTagger:
'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
self.freqs = {TAG: defaultdict(int)}
for tag in self.tag_names:
self.freqs[TAG][self.strings[tag]] = 1
self.freqs[TAG][0] = 1
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.

View File

@ -3,8 +3,6 @@ from cymem.cymem cimport Pool
from .structs cimport TokenC
from .syntax.transition_system cimport Transition
cimport numpy
cdef struct GoldParseC:
int* tags

View File

@ -1,7 +1,5 @@
import numpy
import codecs
import json
import ujson
import random
import re
import os
@ -9,6 +7,11 @@ from os import path
from libc.string cimport memset
try:
import ujson as json
except ImportError:
import json
def tags_to_entities(tags):
entities = []
@ -128,7 +131,7 @@ def read_json_file(loc, docs_filter=None):
yield from read_json_file(path.join(loc, filename))
else:
with open(loc) as file_:
docs = ujson.load(file_)
docs = json.load(file_)
for doc in docs:
if docs_filter is not None and not docs_filter(doc):
continue

View File

@ -13,7 +13,7 @@ cdef Code bit_append(Code code, bint bit) nogil
cdef class BitArray:
cdef bytes data
cdef bytearray data
cdef uchar byte
cdef uchar bit_of_byte
cdef uint32_t i

View File

@ -1,3 +1,5 @@
from __future__ import unicode_literals
from libc.string cimport memcpy
# Note that we're setting the most significant bits here first, when in practice
@ -15,7 +17,7 @@ cdef Code bit_append(Code code, bint bit) nogil:
cdef class BitArray:
def __init__(self, data=b''):
self.data = data
self.data = bytearray(data)
self.byte = 0
self.bit_of_byte = 0
self.i = 0
@ -45,7 +47,7 @@ cdef class BitArray:
start_bit = self.i % 8
if start_bit != 0 and start_byte < len(self.data):
byte = ord(self.data[start_byte])
byte = self.data[start_byte]
for i in range(start_bit, 8):
self.i += 1
yield 1 if (byte & (one << i)) else 0
@ -68,18 +70,24 @@ cdef class BitArray:
# TODO portability
cdef uchar[4] chars
chars[0] = <uchar>ord(self.data[start_byte])
chars[1] = <uchar>ord(self.data[start_byte+1])
chars[2] = <uchar>ord(self.data[start_byte+2])
chars[3] = <uchar>ord(self.data[start_byte+3])
chars[0] = self.data[start_byte]
chars[1] = self.data[start_byte+1]
chars[2] = self.data[start_byte+2]
chars[3] = self.data[start_byte+3]
cdef uint32_t output
memcpy(&output, chars, 4)
self.i += 32
return output
def as_bytes(self):
cdef unsigned char byte_char
if self.bit_of_byte != 0:
return self.data + chr(self.byte)
byte = chr(self.byte)
# Jump through some hoops for Python3
if isinstance(byte, unicode):
return self.data + <bytes>(&self.byte)[:1]
else:
return self.data + chr(self.byte)
else:
return self.data
@ -92,7 +100,7 @@ cdef class BitArray:
self.bit_of_byte += 1
self.i += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.data += bytearray((self.byte,))
self.byte = 0
self.bit_of_byte = 0
@ -106,7 +114,7 @@ cdef class BitArray:
self.byte &= ~(one << self.bit_of_byte)
self.bit_of_byte += 1
if self.bit_of_byte == 8:
self.data += chr(self.byte)
self.data += <bytes>self.byte
self.byte = 0
self.bit_of_byte = 0
self.i += 1
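The switch from bytes to bytearray sidesteps a Python 2/3 difference: indexing a bytes object gives a one-character string on Python 2 but an int on Python 3, while a bytearray gives an int on both, and single byte values can be appended without chr(). A quick illustration:

buf = bytearray(b'\x05\xff')
assert buf[0] == 5             # an int on both Python 2 and Python 3
buf += bytearray((170,))       # append one byte value without chr()
assert list(buf) == [5, 255, 170]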

View File

@ -1,4 +1,5 @@
# cython: profile=True
from __future__ import unicode_literals
cimport cython
from libcpp.queue cimport priority_queue
from libcpp.pair cimport pair
@ -110,14 +111,14 @@ cdef class HuffmanCodec:
cdef int branch
cdef int n_msg = msg.shape[0]
cdef bytes bytes_ = bits.as_bytes()
cdef bytearray bytes_ = bits.as_bytes()
cdef unsigned char byte
cdef int i_msg = 0
cdef int i_byte = bits.i // 8
cdef unsigned char i_bit = 0
cdef unsigned char one = 1
while i_msg < n_msg:
byte = ord(bytes_[i_byte])
byte = bytes_[i_byte]
i_byte += 1
for i_bit in range(8):
branch = node.right if (byte & (one << i_bit)) else node.left
@ -138,11 +139,11 @@ cdef class HuffmanCodec:
def __get__(self):
output = []
cdef int i, j
cdef bytes string
cdef unicode string
cdef Code code
for i in range(self.codes.size()):
code = self.codes[i]
string = b'{0:b}'.format(code.bits).rjust(code.length, '0')
string = '{0:b}'.format(code.bits).rjust(code.length, '0')
string = string[::-1]
output.append(string)
return output
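The code strings are now unicode rather than bytes; the construction itself is unchanged: the binary form of code.bits is left-padded with zeros to code.length and then reversed. For a concrete, made-up code:

bits, length = 0b1011, 6
string = '{0:b}'.format(bits).rjust(length, '0')[::-1]
assert string == '110100'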

View File

@ -10,6 +10,7 @@ from libcpp.pair cimport pair
from cymem.cymem cimport Address, Pool
from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter
import json
from ..attrs cimport ORTH, ID, SPACY, TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
from ..tokens.doc cimport Doc
@ -65,7 +66,7 @@ def _gen_orths(Vocab vocab):
def _gen_chars(Vocab vocab):
cdef attr_t orth
cdef size_t addr
char_weights = {chr(i): 1e-20 for i in range(256)}
char_weights = {i: 1e-20 for i in range(256)}
cdef unicode string
cdef bytes char
cdef bytes utf8_str
@ -74,9 +75,9 @@ def _gen_chars(Vocab vocab):
string = vocab.strings[lex.orth]
utf8_str = string.encode('utf8')
for char in utf8_str:
char_weights.setdefault(char, 0.0)
char_weights[char] += c_exp(lex.prob)
char_weights[b' '] += c_exp(lex.prob)
char_weights.setdefault(ord(char), 0.0)
char_weights[ord(char)] += c_exp(lex.prob)
char_weights[ord(' ')] += c_exp(lex.prob)
return char_weights.items()
@ -98,33 +99,34 @@ cdef class Packer:
self._codecs = tuple(codecs)
self.attrs = tuple(attrs)
@classmethod
def from_dir(cls, Vocab vocab, data_dir):
return cls(vocab, util.read_encoding_freqs(data_dir))
def pack(self, Doc doc):
bits = self._orth_encode(doc)
if bits is None:
bits = self._char_encode(doc)
cdef int i
if self.attrs:
array = doc.to_array(self.attrs)
for i, codec in enumerate(self._codecs):
codec.encode_int32(array[:, i], bits)
return bits
codec.encode(array[:, i], bits)
return bits.as_bytes()
def unpack(self, BitArray bits):
def unpack(self, data):
doc = Doc(self.vocab)
self.unpack_into(data, doc)
return doc
def unpack_into(self, byte_string, Doc doc):
bits = BitArray(byte_string)
bits.seek(0)
cdef int32_t length = bits.read32()
if length >= 0:
doc = self._orth_decode(bits, length)
self._orth_decode(bits, length, doc)
else:
doc = self._char_decode(bits, -length)
self._char_decode(bits, -length, doc)
array = numpy.zeros(shape=(len(doc), len(self._codecs)), dtype=numpy.int32)
for i, codec in enumerate(self._codecs):
codec.decode_int32(bits, array[:, i])
codec.decode(bits, array[:, i])
doc.from_array(self.attrs, array)
return doc
@ -141,20 +143,13 @@ cdef class Packer:
bits.append(bool(token.whitespace_))
return bits
def _orth_decode(self, BitArray bits, n):
orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
self.orth_codec.decode_int32(bits, orths)
orths_and_spaces = zip(orths, bits)
cdef Doc doc = Doc(self.vocab, orths_and_spaces)
return doc
def _char_encode(self, Doc doc):
cdef bytes utf8_str = doc.string.encode('utf8')
cdef BitArray bits = BitArray()
cdef int32_t length = len(utf8_str)
# Signal chars with negative length
bits.extend(-length, 32)
self.char_codec.encode(utf8_str, bits)
self.char_codec.encode(bytearray(utf8_str), bits)
cdef int i, j
for i in range(doc.length):
for j in range(doc.data[i].lex.length-1):
@ -164,12 +159,24 @@ cdef class Packer:
bits.append(False)
return bits
def _char_decode(self, BitArray bits, n):
def _orth_decode(self, BitArray bits, int32_t n, Doc doc):
cdef attr_t[:] orths = numpy.ndarray(shape=(n,), dtype=numpy.int32)
self.orth_codec.decode_int32(bits, orths)
cdef int i
cdef bint space
spaces = iter(bits)
for i in range(n):
orth = orths[i]
space = next(spaces)
lex = self.vocab.get_by_orth(doc.mem, orth)
doc.push_back(lex, space)
return doc
def _char_decode(self, BitArray bits, int32_t n, Doc doc):
cdef bytearray utf8_str = bytearray(n)
self.char_codec.decode(bits, utf8_str)
cdef unicode string = utf8_str.decode('utf8')
cdef Doc tokens = Doc(self.vocab)
cdef int start = 0
cdef bint is_spacy
cdef int length = len(string)
@ -178,11 +185,11 @@ cdef class Packer:
for is_end_token in bits:
if is_end_token:
span = string[start:i+1]
lex = self.vocab.get(tokens.mem, span)
lex = self.vocab.get(doc.mem, span)
is_spacy = (i+1) < length and string[i+1] == u' '
tokens.push_back(lex, is_spacy)
doc.push_back(lex, is_spacy)
start = i + 1 + is_spacy
i += 1
if i >= n:
break
return tokens
return doc
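With these changes Packer.pack() returns a plain byte string and unpack() accepts one, so a round trip through the vocab's serializer looks roughly like this (a sketch assuming a fully installed English data directory; the sample text is made up):

from spacy.en import English

nlp = English()
doc = nlp(u'This is a sentence to round-trip.')
byte_string = nlp.vocab.serializer.pack(doc)   # bytes, not a BitArray
doc2 = nlp.vocab.serializer.unpack(byte_string)
assert doc2.string == doc.string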

View File

@ -81,6 +81,7 @@ cdef class StringStore:
def __getitem__(self, object string_or_id):
cdef bytes byte_string
cdef const Utf8Str* utf8str
cdef int id_
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
if string_or_id == 0:
return u''

View File

@ -1,4 +1,3 @@
# cython: profile=True
"""
Fill an array, context, with every _atomic_ value our features reference.
We then write the _actual features_ as tuples of the atoms. The machinery

View File

@ -1,4 +1,3 @@
# cython: profile=True
from __future__ import unicode_literals
import ctypes

View File

@ -85,6 +85,9 @@ cdef class BiluoPushDown(TransitionSystem):
elif gold.c.ner[i].move == OUT:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
else:
self.freqs[ENT_IOB][1] += 1
self.freqs[ENT_TYPE][0] += 1
cdef Transition lookup_transition(self, object name) except *:
if name == '-':

View File

@ -1,4 +1,3 @@
# cython: profile=True
"""
MALT-style dependency parser
"""
@ -85,18 +84,17 @@ cdef class Parser:
cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
self.model.n_feats, self.model.n_feats)
self.parse(stcls, eg.c)
with nogil:
self.parse(stcls, eg.c)
tokens.set_parse(stcls._sent)
cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
while not stcls.is_final():
memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
self.moves.set_valid(eg.is_valid, stcls)
fill_context(eg.atoms, stcls)
self.model.set_scores(eg.scores, eg.atoms)
eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
self.moves.finalize_state(stcls)

View File

@ -1,4 +1,3 @@
# cython: profile=True
from libc.string cimport memcpy, memset
from libc.stdint cimport uint32_t
from ..vocab cimport EMPTY_LEXEME

View File

@ -33,6 +33,11 @@ cdef class TransitionSystem:
self.freqs = {}
for attr in (TAG, HEAD, DEP, ENT_TYPE, ENT_IOB):
self.freqs[attr] = defaultdict(int)
self.freqs[attr][0] = 1
# Ensure we've seen heads. Need an official dependency length limit...
for i in range(512):
self.freqs[HEAD][i] = 1
self.freqs[HEAD][-i] = 1
cdef int initialize_state(self, StateClass state) except -1:
pass

View File

@ -71,17 +71,6 @@ cdef class Doc:
self.is_tagged = False
self.is_parsed = False
self._py_tokens = []
cdef const LexemeC* lex
cdef attr_t orth
cdef bint space
if orths_and_spaces is not None:
for orth, space in orths_and_spaces:
lex = <LexemeC*>self.vocab._by_orth.get(orth)
if lex != NULL:
assert lex.orth == orth
self.push_back(lex, space)
else:
raise Exception('Lexeme not found: %d' % orth)
def __getitem__(self, object i):
"""Get a token.
@ -122,9 +111,12 @@ cdef class Doc:
def __unicode__(self):
return u''.join([t.string for t in self])
def __str__(self):
return u''.join([t.string for t in self])
@property
def string(self):
return unicode(self)
return u''.join([t.string for t in self])
@property
def ents(self):
@ -303,12 +295,11 @@ cdef class Doc:
return self
def to_bytes(self):
bits = self.vocab.packer.pack(self)
return struct.pack('I', len(bits)) + bits.as_bytes()
byte_string = self.vocab.serializer.pack(self)
return struct.pack('I', len(byte_string)) + byte_string
def from_bytes(self, data):
bits = BitArray(data)
self.vocab.packer.unpack_into(bits, self)
self.vocab.serializer.unpack_into(data[4:], self)
return self
@staticmethod
@ -316,15 +307,14 @@ cdef class Doc:
keep_reading = True
while keep_reading:
try:
n_bits_str = file_.read(4)
if len(n_bits_str) < 4:
n_bytes_str = file_.read(4)
if len(n_bytes_str) < 4:
break
n_bits = struct.unpack('I', n_bits_str)[0]
n_bytes = n_bits // 8 + bool(n_bits % 8)
n_bytes = struct.unpack('I', n_bytes_str)[0]
data = file_.read(n_bytes)
except StopIteration:
keep_reading = False
yield data
yield n_bytes_str + data
# This function is terrible --- need to fix this.
def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,

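Doc.to_bytes() now writes a 4-byte unsigned length header in front of the packed payload, and read_bytes() yields the header together with the payload, so from_bytes() only has to skip the first four bytes. A minimal sketch of that framing, independent of spaCy:

import struct

def frame(payload):
    # 4-byte native-endian unsigned length, then the payload itself,
    # mirroring Doc.to_bytes().
    return struct.pack('I', len(payload)) + payload

def read_frames(file_):
    # Mirrors Doc.read_bytes(): yield header plus payload, so a consumer
    # that strips the first 4 bytes is left with just the payload.
    while True:
        header = file_.read(4)
        if len(header) < 4:
            break
        payload = file_.read(struct.unpack('I', header)[0])
        yield header + payload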
View File

@ -34,6 +34,9 @@ cdef class Token:
def __unicode__(self):
return self.string
def __str__(self):
return self.string
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
return check_flag(self.c.lex, flag_id)

View File

@ -65,16 +65,6 @@ def read_tokenization(lang):
return entries
def read_encoding_freqs(data_dir):
tags = json.load(open(path.join(data_dir, '..', 'pos', 'tag_freqs.json')))
heads = json.load(open(path.join(data_dir, '..', 'deps', 'head_freqs.json')))
deps = json.load(open(path.join(data_dir, '..', 'deps', 'dep_freqs.json')))
iob = json.load(open(path.join(data_dir, '..', 'ner', 'iob_freqs.json')))
ne_types = json.load(open(path.join(data_dir, '..', 'ner', 'ne_freqs.json')))
return [(TAG, tags), (HEAD, heads), (DEP, deps), (ENT_IOB, iob),
(ENT_TYPE, ne_types)]
def read_detoken_rules(lang): # Deprecated?
loc = path.join(DATA_DIR, lang, 'detokenize')
entries = []

View File

@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from .structs cimport LexemeC, TokenC
from .typedefs cimport utf8_t, hash_t
from .typedefs cimport utf8_t, attr_t, hash_t
from .strings cimport StringStore
@ -29,9 +29,12 @@ cdef class Vocab:
cpdef readonly StringStore strings
cdef readonly object pos_tags
cdef readonly int length
cdef public object packer
cdef public object _serializer
cdef public object data_dir
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef PreshMap _by_hash

View File

@ -1,3 +1,6 @@
from __future__ import unicode_literals
from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from libc.string cimport memset
from libc.stdint cimport int32_t
@ -6,6 +9,7 @@ import bz2
from os import path
import codecs
import math
import json
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport set_lex_struct_props
@ -13,6 +17,7 @@ from .lexeme cimport Lexeme
from .strings cimport hash_string
from .orth cimport word_shape
from .typedefs cimport attr_t
from .cfile cimport CFile
from cymem.cymem cimport Address
from . import util
@ -54,8 +59,19 @@ cdef class Vocab:
if load_vectors and path.exists(path.join(data_dir, 'vec.bin')):
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
#self.packer = Packer(self, util.read_encoding_freqs(data_dir))
self.packer = None
self._serializer = None
self.data_dir = data_dir
property serializer:
def __get__(self):
if self._serializer is None:
freqs = []
if self.data_dir is not None:
freqs_loc = path.join(self.data_dir, 'serializer.json')
if path.exists(freqs_loc):
freqs = json.load(open(freqs_loc))
self._serializer = Packer(self, freqs)
return self._serializer
def __len__(self):
"""The current number of lexemes stored."""
@ -82,6 +98,27 @@ cdef class Vocab:
self._add_lex_to_vocab(key, lex)
return lex
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
cdef LexemeC* lex
lex = <LexemeC*>self._by_orth.get(orth)
if lex != NULL:
return lex
cdef unicode string = self.strings[orth]
cdef bint is_oov = mem is not self.mem
if len(string) < 3:
mem = self.mem
lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
props = self.lexeme_props_getter(string)
set_lex_struct_props(lex, props, self.strings, EMPTY_VEC)
if is_oov:
lex.id = 0
else:
self._add_lex_to_vocab(hash_string(string), lex)
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
self._by_hash.set(key, <void*>lex)
self._by_orth.set(lex.orth, <void*>lex)
@ -138,19 +175,16 @@ cdef class Vocab:
if path.exists(loc):
assert not path.isdir(loc)
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
cdef FILE* fp = fopen(<char*>bytes_loc, 'wb')
assert fp != NULL
cdef CFile fp = CFile(bytes_loc, 'wb')
cdef size_t st
cdef size_t addr
cdef hash_t key
for key, addr in self._by_hash.items():
lexeme = <LexemeC*>addr
st = fwrite(&lexeme.orth, sizeof(lexeme.orth), 1, fp)
assert st == 1
st = fwrite(lexeme, sizeof(LexemeC), 1, fp)
assert st == 1
st = fclose(fp)
assert st == 0
fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1)
fp.write_from(lexeme, sizeof(LexemeC), 1)
fp.close()
def load_lexemes(self, strings_loc, loc):
self.strings.load(strings_loc)
@ -188,7 +222,7 @@ cdef class Vocab:
fclose(fp)
def load_rep_vectors(self, loc):
file_ = _CFile(loc, b'rb')
cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len
cdef int32_t vec_len
cdef int32_t prev_vec_len = 0
@ -198,22 +232,20 @@ cdef class Vocab:
cdef bytes py_word
cdef vector[float*] vectors
cdef int i
cdef Pool tmp_mem = Pool()
while True:
try:
file_.read(&word_len, sizeof(word_len), 1)
file_.read_into(&word_len, sizeof(word_len), 1)
except IOError:
break
file_.read(&vec_len, sizeof(vec_len), 1)
file_.read_into(&vec_len, sizeof(vec_len), 1)
if prev_vec_len != 0 and vec_len != prev_vec_len:
raise VectorReadError.mismatched_sizes(loc, vec_len, prev_vec_len)
if 0 >= vec_len >= MAX_VEC_SIZE:
raise VectorReadError.bad_size(loc, vec_len)
mem = Address(word_len, sizeof(char))
chars = <char*>mem.ptr
vec = <float*>self.mem.alloc(vec_len, sizeof(float))
file_.read(chars, sizeof(char), word_len)
file_.read(vec, sizeof(float), vec_len)
chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
string_id = self.strings[chars[:word_len]]
while string_id >= vectors.size():
@ -235,7 +267,7 @@ cdef class Vocab:
def write_binary_vectors(in_loc, out_loc):
cdef _CFile out_file = _CFile(out_loc, 'wb')
cdef CFile out_file = CFile(out_loc, 'wb')
cdef Address mem
cdef int32_t word_len
cdef int32_t vec_len
@ -252,42 +284,12 @@ def write_binary_vectors(in_loc, out_loc):
word_len = len(word)
vec_len = len(pieces)
out_file.write(sizeof(word_len), 1, &word_len)
out_file.write(sizeof(vec_len), 1, &vec_len)
out_file.write_from(&word_len, 1, sizeof(word_len))
out_file.write_from(&vec_len, 1, sizeof(vec_len))
chars = <char*>word
out_file.write(sizeof(char), len(word), chars)
out_file.write(sizeof(float), vec_len, vec)
cdef class _CFile:
cdef FILE* fp
def __init__(self, loc, bytes mode):
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self.fp = fopen(<char*>bytes_loc, mode)
if self.fp == NULL:
raise IOError
def __dealloc__(self):
fclose(self.fp)
def close(self):
fclose(self.fp)
cdef int read(self, void* dest, size_t elem_size, size_t n) except -1:
st = fread(dest, elem_size, n, self.fp)
if st != n:
raise IOError
cdef int write(self, size_t elem_size, size_t n, void* data) except -1:
st = fwrite(data, elem_size, n, self.fp)
if st != n:
raise IOError
cdef int write_unicode(self, unicode value):
cdef bytes py_bytes = value.encode('utf8')
cdef char* chars = <char*>py_bytes
self.write(sizeof(char), len(py_bytes), chars)
out_file.write_from(chars, len(word), sizeof(char))
out_file.write_from(vec, vec_len, sizeof(float))
class VectorReadError(Exception):

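write_binary_vectors() and load_rep_vectors() agree on a simple record layout: an int32 word length, an int32 vector length, the word's bytes, then vec_len float32 values. A pure-Python reader sketch for that layout (assuming native byte order and utf8-encoded words):

import struct

def read_binary_vectors(loc):
    vectors = {}
    with open(loc, 'rb') as f:
        while True:
            header = f.read(8)
            if len(header) < 8:
                break
            word_len, vec_len = struct.unpack('ii', header)
            word = f.read(word_len).decode('utf8')
            values = struct.unpack('%df' % vec_len, f.read(vec_len * 4))
            vectors[word] = values
    return vectors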
View File

@ -7,3 +7,19 @@ import os
def EN():
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
return English(data_dir=data_dir)
def pytest_addoption(parser):
parser.addoption("--models", action="store_true",
help="include tests that require full models")
parser.addoption("--vectors", action="store_true",
help="include word vectors tests")
parser.addoption("--slow", action="store_true",
help="include slow tests")
def pytest_runtest_setup(item):
for opt in ['models', 'vectors', 'slow']:
if opt in item.keywords and not item.config.getoption("--%s" % opt):
pytest.skip("need --%s option to run" % opt)
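These hooks skip any test carrying one of the three markers unless the matching flag is passed, so a plain py.test run stays fast and 'py.test tests/ --models --vectors --slow' exercises everything. A marked test looks like this (illustrative, using the EN fixture defined above):

import pytest

@pytest.mark.models
def test_needs_full_model(EN):
    # Skipped unless py.test is invoked with --models.
    doc = EN(u'A short sentence.')
    assert len(doc) > 0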

View File

@ -1,4 +1,6 @@
import pytest
@pytest.mark.models
def test_simple_types(EN):
tokens = EN(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)

View File

@ -1,6 +1,7 @@
import pytest
@pytest.mark.models
def test_root(EN):
tokens = EN(u"i don't have other assistance")
for t in tokens:

View File

@ -12,6 +12,7 @@ def sun_text():
return text
@pytest.mark.models
def test_consistency(EN, sun_text):
tokens = EN(sun_text)
for head in tokens:
@ -21,6 +22,7 @@ def test_consistency(EN, sun_text):
assert child.head is head
@pytest.mark.models
def test_child_consistency(EN, sun_text):
tokens = EN(sun_text)
@ -53,6 +55,7 @@ def test_child_consistency(EN, sun_text):
assert not children
@pytest.mark.models
def test_edges(EN):
sun_text = u"Chemically, about three quarters of the Sun's mass consists of hydrogen, while the rest is mostly helium."
tokens = EN(sun_text)

View File

@ -1,6 +1,8 @@
from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_subtrees(EN):
sent = EN('The four wheels on the bus turned quickly')
wheels = sent[2]

View File

@ -45,7 +45,7 @@ def test1():
codec = HuffmanCodec(list(enumerate(probs)))
py_codes = py_encode(dict(enumerate(probs)))
py_codes = py_codes.items()
py_codes = list(py_codes.items())
py_codes.sort()
assert codec.strings == [c for i, c in py_codes]
@ -60,7 +60,7 @@ def test_round_trip():
strings = list(codec.strings)
codes = {codec.leaves[i]: strings[i] for i in range(len(codec.leaves))}
bits = codec.encode(message)
string = b''.join(b'{0:b}'.format(ord(c)).rjust(8, b'0')[::-1] for c in bits.as_bytes())
string = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1] for c in bits.as_bytes())
for word in message:
code = codes[word]
assert string[:len(code)] == code
@ -76,7 +76,7 @@ def test_rosetta():
symb2freq = defaultdict(int)
for ch in txt:
symb2freq[ch] += 1
by_freq = symb2freq.items()
by_freq = list(symb2freq.items())
by_freq.sort(reverse=True, key=lambda item: item[1])
symbols = [sym for sym, prob in by_freq]
@ -96,6 +96,7 @@ def test_rosetta():
assert my_exp_len == py_exp_len
@pytest.mark.slow
def test_vocab(EN):
codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab])
expected_length = 0
@ -105,6 +106,7 @@ def test_vocab(EN):
assert 8 < expected_length < 15
@pytest.mark.slow
def test_freqs():
freqs = []
words = []

View File

@ -0,0 +1,23 @@
import pytest
from spacy.serialize.packer import Packer
from spacy.attrs import ORTH, SPACY
from spacy.tokens import Doc
import math
def test_read_write(EN):
doc1 = EN(u'This is a simple test. With a couple of sentences.')
doc2 = EN(u'This is another test document.')
with open('/tmp/spacy_docs.bin', 'wb') as file_:
file_.write(doc1.to_bytes())
file_.write(doc2.to_bytes())
with open('/tmp/spacy_docs.bin', 'rb') as file_:
bytes1, bytes2 = Doc.read_bytes(file_)
r1 = Doc(EN.vocab).from_bytes(bytes1)
r2 = Doc(EN.vocab).from_bytes(bytes2)
assert r1.string == doc1.string
assert r2.string == doc2.string

View File

@ -56,12 +56,12 @@ def test_char_packer(vocab):
bits = BitArray()
bits.seek(0)
byte_str = b'the dog jumped'
byte_str = bytearray(b'the dog jumped')
packer.char_codec.encode(byte_str, bits)
bits.seek(0)
result = [b''] * len(byte_str)
packer.char_codec.decode(bits, result)
assert b''.join(result) == byte_str
assert bytearray(result) == byte_str
def test_packer_unannotated(tokenizer):
@ -120,5 +120,3 @@ def test_packer_annotated(tokenizer):
assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
assert [(t.head.i - t.i) for t in result] == [1, 1, 0]

View File

@ -1,6 +1,8 @@
from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_merge_tokens(EN):
tokens = EN(u'Los Angeles start.')
assert len(tokens) == 4
@ -12,6 +14,7 @@ def test_merge_tokens(EN):
assert tokens[0].head.orth_ == 'start'
@pytest.mark.models
def test_merge_heads(EN):
tokens = EN(u'I found a pilates class near work.')
assert len(tokens) == 8

View File

@ -9,6 +9,7 @@ def doc(EN):
return EN('This is a sentence. This is another sentence. And a third.')
@pytest.mark.models
def test_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0
@ -17,6 +18,7 @@ def test_sent_spans(doc):
assert sum(len(sent) for sent in sents) == len(doc)
@pytest.mark.models
def test_root(doc):
np = doc[2:4]
assert len(np) == 2

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_am_pm(en_nlp):
numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
variants = ['a.m.', 'am', 'p.m.', 'pm']
@ -14,7 +15,7 @@ def test_am_pm(en_nlp):
tokens = en_nlp(string, merge_mwes=True)
assert tokens[4].orth_ == '%s%s%s' % (num, space, var)
ents = list(tokens.ents)
assert len(ents) == 1
assert len(ents) == 1, ents
assert ents[0].label_ == 'TIME', string
if ents[0].start == 4 and ents[0].end == 5:
assert ents[0].orth_ == '%s%s%s' % (num, space, var)

View File

@ -17,6 +17,7 @@ def lemmas(tagged):
return [t.lemma_ for t in tagged]
@pytest.mark.models
def test_lemmas(lemmas, tagged):
assert lemmas[0] == 'banana'
assert lemmas[1] == 'in'

View File

@ -12,6 +12,7 @@ def morph_exc():
}
@pytest.mark.models
def test_load_exc(morph_exc):
# Do this local as we want to modify it
nlp = English()

View File

@ -1,7 +1,9 @@
from spacy.en import English
import six
import pytest
@pytest.mark.models
def test_tag_names(EN):
tokens = EN(u'I ate pizzas with anchovies.', parse=False, tag=True)
pizza = tokens[2]

View File

@ -1,8 +1,9 @@
# -*- coding: utf-8 -*-
"""Sphinx doctest is just too hard. Manually paste doctest examples here"""
from spacy.en.attrs import IS_LOWER
import pytest
@pytest.mark.models
def test_1():
import spacy.en
from spacy.parts_of_speech import ADV
@ -21,6 +22,7 @@ def test_1():
assert o == -11.07155704498291
@pytest.mark.models
def test2():
import spacy.en
from spacy.parts_of_speech import ADV
@ -41,6 +43,7 @@ def test2():
-11.07155704498291
@pytest.mark.models
def test3():
import spacy.en
from spacy.parts_of_speech import ADV

View File

@ -15,6 +15,7 @@ def test_attr_of_token(EN):
assert feats_array[0][0] != feats_array[0][1]
@pytest.mark.models
def test_tag(EN):
text = u'A nice sentence.'
tokens = EN(text)
@ -26,6 +27,7 @@ def test_tag(EN):
assert feats_array[3][1] == tokens[3].tag
@pytest.mark.models
def test_dep(EN):
text = u'A nice sentence.'
tokens = EN(text)

View File

@ -4,6 +4,7 @@ import pytest
from spacy.parts_of_speech import ADV
@pytest.mark.models
def test_prob(EN):
tokens = EN(u'Give it back', parse=False)
give = tokens[0]

View File

@ -7,6 +7,7 @@ from spacy.en.attrs import IS_STOP
import pytest
@pytest.mark.models
def test_strings(EN):
tokens = EN(u'Give it back! He pleaded.')
token = tokens[0]

View File

@ -9,6 +9,7 @@ data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
# Let this have its own instances, as we have to be careful about memory here
# that's the point, after all
@pytest.mark.models
def get_orphan_token(text, i):
nlp = English(load_vectors=False, data_dir=data_dir)
tokens = nlp(text)
@ -18,6 +19,7 @@ def get_orphan_token(text, i):
return token
@pytest.mark.models
def test_orphan():
orphan = get_orphan_token('An orphan token', 1)
gc.collect()
@ -36,6 +38,7 @@ def _orphan_from_list(toks):
return lst
@pytest.mark.models
def test_list_orphans():
# Test case from NSchrading
nlp = English(load_vectors=False, data_dir=data_dir)

View File

@ -5,7 +5,7 @@ from spacy.tokens import Doc
import pytest
def test_getitem(EN):
def mest_getitem(EN):
tokens = EN(u'Give it back! He pleaded.')
assert tokens[0].orth_ == 'Give'
assert tokens[-1].orth_ == '.'
@ -13,10 +13,19 @@ def test_getitem(EN):
tokens[len(tokens)]
def test_serialize(EN):
tokens = EN(u' Give it back! He pleaded. ')
packed = tokens.serialize()
new_tokens = Doc.deserialize(EN.vocab, packed)
def mest_serialize(EN):
tokens = EN(u'Give it back! He pleaded.')
packed = tokens.to_bytes()
new_tokens = Doc(EN.vocab).from_bytes(packed)
assert tokens.string == new_tokens.string
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]
def test_serialize_whitespace(EN):
tokens = EN(u' Give it back! He pleaded. ')
packed = tokens.to_bytes()
new_tokens = Doc(EN.vocab).from_bytes(packed)
assert tokens.string == new_tokens.string
assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens]
assert [t.orth for t in tokens] == [t.orth for t in new_tokens]

View File

@ -4,13 +4,14 @@ from spacy.en import English
import pytest
@pytest.mark.vectors
def test_vec(EN):
hype = EN.vocab['hype']
assert hype.orth_ == 'hype'
assert 0.08 >= hype.repvec[0] > 0.07
@pytest.mark.vectors
def test_capitalized(EN):
hype = EN.vocab['Hype']
assert hype.orth_ == 'Hype'

View File

@ -39,7 +39,7 @@ def test_retrieve_id(sstore):
def test_med_string(sstore):
nine_char_string = sstore[b'0123456789']
assert sstore[nine_char_string] == b'0123456789'
assert sstore[nine_char_string] == u'0123456789'
dummy = sstore[b'A']
assert sstore[b'0123456789'] == nine_char_string