mirror of https://github.com/Yomguithereal/fog.git
33 lines
1.3 KiB
Python
33 lines
1.3 KiB
Python
# =============================================================================
|
|
# Fog MinHash Clustering Unit Tests
|
|
# =============================================================================
|
|
import csv
|
|
from test.clustering.utils import Clusters
|
|
from fog.clustering import minhash
|
|
from fog.tokenizers import ngrams
|
|
|
|
with open('./data/universities.csv', 'r') as f:
|
|
UNIVERSITIES = set([line['university'] for line in csv.DictReader(f)])
|
|
|
|
CLUSTERS = Clusters([
|
|
['Kansas State University', 'Arkansas State University'],
|
|
['Taylor University', 'Baylor University'],
|
|
['La Salle University', 'De La Salle University'],
|
|
['Eastern Kentucky University', 'Western Kentucky University'],
|
|
['Eastern Michigan University', 'Western Michigan University'],
|
|
['Western Washington University', 'Eastern Washington University'],
|
|
['Iran University of Science and Technology', 'Jordan University of Science and Technology'],
|
|
['University of Western Australia', 'The University of Western Australia'],
|
|
['University of Western Ontario', 'The University of Western Ontario']
|
|
])
|
|
|
|
|
|
class TestMinHash(object):
|
|
def test_basics(self):
|
|
def key(x):
|
|
return list(ngrams(5, x))
|
|
|
|
clusters = Clusters(minhash(UNIVERSITIES, key=key, radius=0.8, seed=123))
|
|
|
|
assert clusters == CLUSTERS
|