diff --git a/spacy/tests/serialize/test_codecs.py b/spacy/tests/serialize/test_codecs.py
deleted file mode 100644
index 4e080f600..000000000
--- a/spacy/tests/serialize/test_codecs.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...serialize.packer import _BinaryCodec
-from ...serialize.huffman import HuffmanCodec
-from ...serialize.bits import BitArray
-
-import numpy
-import pytest
-
-
-def test_serialize_codecs_binary():
-    codec = _BinaryCodec()
-    bits = BitArray()
-    array = numpy.array([0, 1, 0, 1, 1], numpy.int32)
-    codec.encode(array, bits)
-    result = numpy.array([0, 0, 0, 0, 0], numpy.int32)
-    bits.seek(0)
-    codec.decode(bits, result)
-    assert list(array) == list(result)
-
-
-def test_serialize_codecs_attribute():
-    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5,
-             'over': 8, 'lazy': 1, 'dog': 2, '.': 9}
-    int_map = {'the': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumped': 4,
-               'over': 5, 'lazy': 6, 'dog': 7, '.': 8}
-
-    codec = HuffmanCodec([(int_map[string], freq) for string, freq in freqs.items()])
-    bits = BitArray()
-    array = numpy.array([1, 7], dtype=numpy.int32)
-    codec.encode(array, bits)
-    result = numpy.array([0, 0], dtype=numpy.int32)
-    bits.seek(0)
-    codec.decode(bits, result)
-    assert list(array) == list(result)
-
-
-def test_serialize_codecs_vocab(en_vocab):
-    words = ["the", "dog", "jumped"]
-    for word in words:
-        _ = en_vocab[word]
-    codec = HuffmanCodec([(lex.orth, lex.prob) for lex in en_vocab])
-    bits = BitArray()
-    ids = [en_vocab[s].orth for s in words]
-    array = numpy.array(ids, dtype=numpy.int32)
-    codec.encode(array, bits)
-    result = numpy.array(range(len(array)), dtype=numpy.int32)
-    bits.seek(0)
-    codec.decode(bits, result)
-    assert list(array) == list(result)
diff --git a/spacy/tests/serialize/test_huffman.py b/spacy/tests/serialize/test_huffman.py
deleted file mode 100644
index 51a9fa64d..000000000
--- a/spacy/tests/serialize/test_huffman.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-from __future__ import division
-
-from ...serialize.huffman import HuffmanCodec
-from ...serialize.bits import BitArray
-
-
-from heapq import heappush, heappop, heapify
-from collections import defaultdict
-import numpy
-import pytest
-
-
-def py_encode(symb2freq):
-    """Huffman encode the given dict mapping symbols to weights
-    From Rosetta Code
-    """
-    heap = [[wt, [sym, ""]] for sym, wt in symb2freq.items()]
-    heapify(heap)
-    while len(heap) > 1:
-        lo = heappop(heap)
-        hi = heappop(heap)
-        for pair in lo[1:]:
-            pair[1] = '0' + pair[1]
-        for pair in hi[1:]:
-            pair[1] = '1' + pair[1]
-        heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])
-    return dict(heappop(heap)[1:])
-
-
-def test_serialize_huffman_1():
-    probs = numpy.zeros(shape=(10,), dtype=numpy.float32)
-    probs[0] = 0.3
-    probs[1] = 0.2
-    probs[2] = 0.15
-    probs[3] = 0.1
-    probs[4] = 0.06
-    probs[5] = 0.02
-    probs[6] = 0.01
-    probs[7] = 0.005
-    probs[8] = 0.0001
-    probs[9] = 0.000001
-
-    codec = HuffmanCodec(list(enumerate(probs)))
-    py_codes = py_encode(dict(enumerate(probs)))
-    py_codes = list(py_codes.items())
-    py_codes.sort()
-    assert codec.strings == [c for i, c in py_codes]
-
-
-def test_serialize_huffman_empty():
-    codec = HuffmanCodec({})
-    assert codec.strings == []
-
-
-def test_serialize_huffman_round_trip():
-    words = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'the',
-             'lazy', 'dog', '.']
-    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5,
-             'over': 8, 'lazy': 1, 'dog': 2, '.': 9}
-
-    codec = HuffmanCodec(freqs.items())
-    strings = list(codec.strings)
-    codes = dict([(codec.leaves[i], strings[i]) for i in range(len(codec.leaves))])
-    bits = codec.encode(words)
-    string = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1] for c in bits.as_bytes())
-    for word in words:
-        code = codes[word]
-        assert string[:len(code)] == code
-        string = string[len(code):]
-    unpacked = [0] * len(words)
-    bits.seek(0)
-    codec.decode(bits, unpacked)
-    assert words == unpacked
-
-
-def test_serialize_huffman_rosetta():
-    text = "this is an example for huffman encoding"
-    symb2freq = defaultdict(int)
-    for ch in text:
-        symb2freq[ch] += 1
-    by_freq = list(symb2freq.items())
-    by_freq.sort(reverse=True, key=lambda item: item[1])
-    symbols = [sym for sym, prob in by_freq]
-
-    codec = HuffmanCodec(symb2freq.items())
-    py_codec = py_encode(symb2freq)
-
-    codes = dict([(codec.leaves[i], codec.strings[i]) for i in range(len(codec.leaves))])
-
-    my_lengths = defaultdict(int)
-    py_lengths = defaultdict(int)
-    for symb, freq in symb2freq.items():
-        my = codes[symb]
-        my_lengths[len(my)] += freq
-        py_lengths[len(py_codec[symb])] += freq
-    my_exp_len = sum(length * weight for length, weight in my_lengths.items())
-    py_exp_len = sum(length * weight for length, weight in py_lengths.items())
-    assert my_exp_len == py_exp_len
-
-
-@pytest.mark.models
-def test_vocab(EN):
-    codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab])
-    expected_length = 0
-    for i, code in enumerate(codec.strings):
-        leaf = codec.leaves[i]
-        expected_length += len(code) * numpy.exp(EN.vocab[leaf].prob)
-    assert 8 < expected_length < 15
diff --git a/spacy/tests/serialize/test_io.py b/spacy/tests/serialize/test_io.py
deleted file mode 100644
index 16c7d326d..000000000
--- a/spacy/tests/serialize/test_io.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...tokens import Doc
-from ..util import get_doc
-
-import pytest
-
-
-def test_serialize_io_read_write(en_vocab, text_file_b):
-    text1 = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
-    text2 = ["This", "is", "another", "test", "document", "."]
-
-    doc1 = get_doc(en_vocab, text1)
-    doc2 = get_doc(en_vocab, text2)
-    text_file_b.write(doc1.to_bytes())
-    text_file_b.write(doc2.to_bytes())
-    text_file_b.seek(0)
-    bytes1, bytes2 = Doc.read_bytes(text_file_b)
-    result1 = get_doc(en_vocab).from_bytes(bytes1)
-    result2 = get_doc(en_vocab).from_bytes(bytes2)
-    assert result1.text_with_ws == doc1.text_with_ws
-    assert result2.text_with_ws == doc2.text_with_ws
-
-
-def test_serialize_io_left_right(en_vocab):
-    text = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
-    doc = get_doc(en_vocab, text)
-    result = Doc(en_vocab).from_bytes(doc.to_bytes())
-
-    for token in result:
-        assert token.head.i == doc[token.i].head.i
-        if token.head is not token:
-            assert token.i in [w.i for w in token.head.children]
-        for child in token.lefts:
-            assert child.head.i == token.i
-        for child in token.rights:
-            assert child.head.i == token.i
-
-
-@pytest.mark.models
-def test_lemmas(EN):
-    text = "The geese are flying"
-    doc = EN(text)
-    result = Doc(doc.vocab).from_bytes(doc.to_bytes())
-    assert result[1].lemma_ == 'goose'
-    assert result[2].lemma_ == 'be'
-    assert result[3].lemma_ == 'fly'
diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py
deleted file mode 100644
index e1bd4aecf..000000000
--- a/spacy/tests/serialize/test_packer.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from ...attrs import TAG, DEP, HEAD
-from ...serialize.packer import Packer
-from ...serialize.bits import BitArray
-
-from ..util import get_doc
-
-import pytest
-
-
-@pytest.fixture
-def text():
-    return "the dog jumped"
-
-
-@pytest.fixture
-def text_b():
-    return b"the dog jumped"
-
-
-def test_serialize_char_packer(en_vocab, text_b):
-    packer = Packer(en_vocab, [])
-    bits = BitArray()
-    bits.seek(0)
-    byte_str = bytearray(text_b)
-    packer.char_codec.encode(byte_str, bits)
-    bits.seek(0)
-    result = [b''] * len(byte_str)
-    packer.char_codec.decode(bits, result)
-    assert bytearray(result) == byte_str
-
-
-def test_serialize_packer_unannotated(en_tokenizer, text):
-    packer = Packer(en_tokenizer.vocab, [])
-    tokens = en_tokenizer(text)
-    assert tokens.text_with_ws == text
-    bits = packer.pack(tokens)
-    result = packer.unpack(bits)
-    assert result.text_with_ws == text
-
-
-def test_packer_annotated(en_vocab, text):
-    heads = [1, 1, 0]
-    deps = ['det', 'nsubj', 'ROOT']
-    tags = ['DT', 'NN', 'VBD']
-
-    attr_freqs = [
-        (TAG, [(en_vocab.strings['NN'], 0.1),
-               (en_vocab.strings['DT'], 0.2),
-               (en_vocab.strings['JJ'], 0.01),
-               (en_vocab.strings['VBD'], 0.05)]),
-        (DEP, {en_vocab.strings['det']: 0.2,
-               en_vocab.strings['nsubj']: 0.1,
-               en_vocab.strings['adj']: 0.05,
-               en_vocab.strings['ROOT']: 0.1}.items()),
-        (HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items())
-    ]
-
-    packer = Packer(en_vocab, attr_freqs)
-    doc = get_doc(en_vocab, [t for t in text.split()], tags=tags, deps=deps, heads=heads)
-
-    # assert doc.text_with_ws == text
-    assert [t.tag_ for t in doc] == tags
-    assert [t.dep_ for t in doc] == deps
-    assert [(t.head.i-t.i) for t in doc] == heads
-
-    bits = packer.pack(doc)
-    result = packer.unpack(bits)
-
-    # assert result.text_with_ws == text
-    assert [t.tag_ for t in result] == tags
-    assert [t.dep_ for t in result] == deps
-    assert [(t.head.i-t.i) for t in result] == heads
-
-
-def test_packer_bad_chars(en_tokenizer):
-    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
-    packer = Packer(en_tokenizer.vocab, [])
-
-    doc = en_tokenizer(text)
-    bits = packer.pack(doc)
-    result = packer.unpack(bits)
-    assert result.string == doc.string
-
-
-@pytest.mark.models
-def test_packer_bad_chars_tags(EN):
-    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
-    tags = ['JJ', 'NN', ',', 'VBZ', 'DT', 'NN', 'JJ', 'NN', 'NN',
-            'ADD', 'NN', ':', 'NN', 'NN', 'NN', 'NN', 'NN']
-
-    tokens = EN.tokenizer(text)
-    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags)
-    byte_string = doc.to_bytes()
-    result = get_doc(tokens.vocab).from_bytes(byte_string)
-    assert [t.tag_ for t in result] == [t.tag_ for t in doc]
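Note: the deleted tests above exercise the old spacy.serialize machinery (Packer, HuffmanCodec, BitArray), but several of them (test_io.py, test_packer_bad_chars_tags) already rely on the plain Doc.to_bytes()/from_bytes() round-trip, which is the API that remains. Below is a minimal self-contained sketch of that round-trip, assuming a modern spaCy install where spacy.blank() is available; the example text mirrors the "the dog jumped" fixture, and no trained models are needed.

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")          # blank English pipeline, no models required
    doc = nlp("the dog jumped")

    byte_string = doc.to_bytes()     # serialize the Doc to a byte string
    restored = Doc(nlp.vocab).from_bytes(byte_string)  # rebuild a Doc from bytes

    # the surface text round-trips exactly, as the deleted tests asserted
    assert restored.text_with_ws == doc.text_with_ws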