# =============================================================================
# Fog Overlap Coefficient
# =============================================================================
#
# Functions computing the overlap coefficient.
#
# [Urls]:
# https://en.wikipedia.org/wiki/Overlap_coefficient
#
# NOTE: the patch this was extracted from touches three files:
#   - fog/metrics/__init__.py, which gains the re-export
#     `from fog.metrics.overlap import overlap_coefficient`
#   - fog/metrics/overlap.py (first section below)
#   - test/metrics/overlap_test.py (second section below)
#

# Containers used as-is; any other input is converted to a set first
ACCEPTABLE_TYPES = (set, frozenset, dict)


def overlap_coefficient(A, B):
    """
    Function computing the overlap coefficient of the given sets, i.e. the size
    of their intersection divided by the size of the smallest set.

    Inputs that are not sets, frozensets or dicts are first converted to sets,
    which is linear in their size. The intersection is then counted by
    iterating over the smallest set only. Dicts are compared by key.

    By convention the coefficient is 0.0 as soon as one of the inputs is
    empty, including when the very same empty object is given twice.

    Args:
        A (iterable): First sequence.
        B (iterable): Second sequence.

    Returns:
        float: overlap coefficient between A & B.

    """
    if A is B:
        # Same object given twice: the overlap is total, unless the object is
        # empty. Answering 0.0 in that case keeps the result independent from
        # object identity (two distinct empty inputs also yield 0.0).
        size = len(A) if hasattr(A, '__len__') else len(set(A))
        return 1.0 if size else 0.0

    if not isinstance(A, ACCEPTABLE_TYPES):
        A = set(A)

    if not isinstance(B, ACCEPTABLE_TYPES):
        B = set(B)

    if not A or not B:
        return 0.0

    # Swapping to iterate over smaller set and minimize lookups
    if len(A) > len(B):
        A, B = B, A

    # Counting intersection
    intersection = sum(1 for v in A if v in B)

    # A is the smallest set at this point, no need to compute the min again
    return intersection / len(A)


# =============================================================================
# Fog Overlap Coefficient Unit Tests
# =============================================================================
#
# NOTE: in test/metrics/overlap_test.py the tested function comes from
# `from fog.metrics import overlap_coefficient`.
#
TESTS = [
    ('abc', 'abc', 1.0),
    ('abc', 'def', 0.0),
    ('abc', 'abd', 2 / 3),
    ('abc', 'abcde', 1),
    ('abcdefij', 'abc', 1),
    (list('abcdefij'), list('abc'), 1),
    ((1, 2, 3), (1, 2), 1),
    ('aaaaaaabc', 'aaabbbbbbc', 1.0),
    ('', 'abc', 0.0),
    ([], [], 0.0),
    ((), (), 0.0),
]


class TestOverlapCoefficient(object):
    def test_basics(self):
        from pytest import approx

        # Coefficients are floats, hence the approximate comparison
        for A, B, coefficient in TESTS:
            assert overlap_coefficient(A, B) == approx(coefficient)