2018-06-13 15:09:33 +00:00
|
|
|
# =============================================================================
|
|
|
|
# Fog MinHash LSH Unit Tests
|
|
|
|
# =============================================================================
|
2018-06-21 12:21:08 +00:00
|
|
|
import numpy as np
|
2018-06-13 15:09:33 +00:00
|
|
|
from pytest import approx
|
2018-06-21 12:42:41 +00:00
|
|
|
from fog.lsh import (
|
|
|
|
MinHash,
|
|
|
|
LSBMinHash,
|
|
|
|
SuperMinHash,
|
|
|
|
minhash_similarity,
|
|
|
|
lsb_minhash_similarity
|
|
|
|
)
|
2018-06-13 15:09:33 +00:00
|
|
|
|
|
|
|
# Shared fixtures: (first string, second string, expected similarity).
# The fractional expectations equal the Jaccard index of the two strings'
# character sets, e.g. 'night' and 'nacht' share {n, h, t} out of the 7
# distinct letters they use together, hence 3 / 7.
TESTS = [
    # Exactly one side is empty: nothing can be shared.
    ('abc', '', 0),
    ('', 'abc', 0),

    # Two empty strings are expected to count as identical.
    ('', '', 1),

    # Identical strings, then strings with no letter in common.
    ('abc', 'abc', 1),
    ('abc', 'xyz', 0),

    # Partial overlaps.
    ('night', 'nacht', 3 / 7),
    ('context', 'contact', 4 / 7),
    ('ht', 'nacht', 2 / 5),
]
|
|
|
|
|
|
|
|
|
|
|
|
class TestLSBMinHash(object):
    """Compare MinHash-family similarity estimates with the TESTS fixtures.

    Each test creates a signature for both strings of every fixture row and
    asserts that the estimated similarity lies within an absolute tolerance
    of 0.1 of the expected value stored in the row.
    """

    def test_basics(self):
        """MinHash(512, seed=123) signatures scored by minhash_similarity."""
        hasher = MinHash(512, seed=123)

        for left, right, expected in TESTS:
            left_signature = hasher.create_signature(left)
            right_signature = hasher.create_signature(right)

            estimate = minhash_similarity(left_signature, right_signature)
            assert estimate == approx(expected, abs=1e-1)

    def test_numpy(self):
        """Same as test_basics with use_numpy=True, plus array checks."""
        hasher = MinHash(512, seed=123, use_numpy=True)

        for left, right, expected in TESTS:
            left_signature = hasher.create_signature(left)
            right_signature = hasher.create_signature(right)

            # With use_numpy=True the signature must be a flat uint32 array
            # holding 512 values.
            assert left_signature.shape == (512, )
            assert left_signature.dtype == np.uint32

            estimate = minhash_similarity(left_signature, right_signature)
            assert estimate == approx(expected, abs=1e-1)

    def test_lsb(self):
        """LSBMinHash(precision=16, seed=123) scored by its own similarity."""
        hasher = LSBMinHash(precision=16, seed=123)

        for left, right, expected in TESTS:
            left_signature = hasher.create_signature(left)
            right_signature = hasher.create_signature(right)

            estimate = lsb_minhash_similarity(left_signature, right_signature)
            assert estimate == approx(expected, abs=1e-1)

    def test_super_minhash(self):
        """SuperMinHash(512) signatures scored by minhash_similarity."""
        hasher = SuperMinHash(512)

        for left, right, expected in TESTS:
            left_signature = hasher.create_signature(left)
            right_signature = hasher.create_signature(right)

            estimate = minhash_similarity(left_signature, right_signature)
            assert estimate == approx(expected, abs=1e-1)
|