mirror of https://github.com/Yomguithereal/fog.git

parent 891f57d1d4
commit 1e10b3b462

README.md · 10 changed lines
@@ -15,7 +15,7 @@ pip install fog
 ## Usage

 * [Evaluation](#evaluation)
-  * [best_matching](#best_matching)
+  * [best_matching_macro_average](#best_matching_macro_average)
 * [Graph](#graph)
   * [floatsam_sparsification](#floatsam_sparsification)
   * [monopartite_projection](#monopartite_projection)
@@ -35,10 +35,10 @@ pip install fog
 ### Evaluation

-#### best_matching
+#### best_matching_macro_average

-Efficient implementation of the "best matching F1" evaluation metric for
-clusters.
+Efficient implementation of the "macro average best matching F1" evaluation
+metric for clusters.

 Note that this metric is not symmetric and will match truth -> predicted.
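A minimal usage sketch of the renamed function, assuming only the signature visible further down in this diff (toy clusters, illustrative data):

    from fog.evaluation import best_matching_macro_average

    truth = [['A1', 'A2', 'A3'], ['B1', 'B2']]
    predicted = [['A1', 'A2'], ['A3', 'B1', 'B2']]

    # Matches each truth cluster to its best predicted cluster, then
    # macro-averages precision, recall and F1 over the truth clusters.
    precision, recall, f1 = best_matching_macro_average(truth, predicted)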
@@ -49,8 +49,6 @@ Note that this metric is not symmetric and will match truth -> predicted.
   that don't exist in truth clusters to be found in predicted ones. Those
   additional items will then be ignored when computing the metrics instead
   of raising an error when found.
-* **micro** *?bool* [`False`]: Whether to compute the micro average instead of the macro
-  average of the evaluation metric.

 ### Graph
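For context on what the removed option computed, here is a hedged sketch (hypothetical helper names, not fog's API) contrasting the macro average fog keeps with the micro average this commit drops:

    def macro_average(per_cluster):
        # Average per-cluster (precision, recall, f1) tuples directly:
        # every truth cluster weighs the same, regardless of its size.
        n = len(per_cluster)
        return tuple(sum(t[i] for t in per_cluster) / n for i in range(3))

    def micro_average(counts):
        # Pool raw (tp, fp, fn) counts across all clusters first, then compute
        # a single precision/recall/F1: large clusters dominate the result.
        tp = sum(c[0] for c in counts)
        fp = sum(c[1] for c in counts)
        fn = sum(c[2] for c in counts)
        p = tp / (tp + fp)
        r = tp / (tp + fn)
        return p, r, 2 * p * r / (p + r)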
@@ -18,7 +18,7 @@ DOCS = [
     {
         'title': 'Evaluation',
         'fns': [
-            evaluation.best_matching
+            evaluation.best_matching_macro_average
         ]
     },
     {
fog/evaluation/__init__.py

@@ -1,2 +1,2 @@
-from fog.evaluation.best_matching import best_matching
+from fog.evaluation.best_matching import best_matching_macro_average
 from fog.evaluation.utils import labels_to_clusters, clusters_to_labels
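Since the old symbol is replaced rather than aliased, the rename is a breaking change for downstream imports:

    # Before this commit:
    from fog.evaluation import best_matching

    # After this commit (the old name is gone, not aliased):
    from fog.evaluation import best_matching_macro_average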
fog/evaluation/best_matching.py

@@ -18,15 +18,14 @@ from typing import Hashable, Iterable, Tuple
 from fog.utils import OnlineMean


-def best_matching(
+def best_matching_macro_average(
     truth: Iterable[Iterable[Hashable]],
     predicted: Iterable[Iterable[Hashable]],
-    allow_additional_items: bool = False,
-    micro: bool = False
+    allow_additional_items: bool = False
 ) -> Tuple[float, float, float]:
     """
-    Efficient implementation of the "best matching F1" evaluation metric for
-    clusters.
+    Efficient implementation of the "macro average best matching F1" evaluation
+    metric for clusters.

     Note that this metric is not symmetric and will match truth -> predicted.
@@ -37,8 +36,6 @@ def best_matching(
             that don't exist in truth clusters to be found in predicted ones. Those
             additional items will then be ignored when computing the metrics instead
             of raising an error when found. Defaults to False.
-        micro (bool, optional): Whether to compute the micro average instead of the macro
-            average of the evaluation metric. Defaults to False.

     Returns:
         tuple of floats: precision, recall and f1 score.
@@ -89,10 +86,6 @@ def best_matching(
     R = OnlineMean()
     F = OnlineMean()

-    micro_true_positives = 0
-    micro_false_positives = 0
-    micro_false_negatives = 0
-
     # Matching truth
     for cluster in truth:
        if not cluster:
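The macro average itself is accumulated with `fog.utils.OnlineMean`, whose implementation is not shown in this diff; a plausible minimal sketch of such an accumulator (fog's real class may differ):

    class OnlineMean:
        """Running mean that never stores the individual values."""

        def __init__(self):
            self.n = 0
            self.mean = 0.0

        def add(self, value):
            # Incremental mean update: mean += (x - mean) / n
            self.n += 1
            self.mean += (value - self.mean) / self.n

        def __float__(self):
            return self.mean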
@@ -111,38 +104,32 @@ def best_matching(
             candidates[candidate_cluster_index] += 1
             cluster_size += 1

-        matching_cluster_index, true_positives = candidates.most_common(1)[0]
-        matching_cluster_size = predicted_cluster_sizes[matching_cluster_index]
+        best_f1 = -1.0
+        best = None

-        false_positives = matching_cluster_size - true_positives
-        false_negatives = cluster_size - true_positives
+        # Finding a matching cluster that maximizes F1 score
+        for matching_cluster_index, true_positives in candidates.items():
+            matching_cluster_size = predicted_cluster_sizes[matching_cluster_index]

-        if not micro:
-            precision = true_positives / (true_positives + false_positives)
-            recall = true_positives / (true_positives + false_negatives)
-            f1 = 2 * precision * recall / (precision + recall)
+            false_positives = matching_cluster_size - true_positives
+            false_negatives = cluster_size - true_positives

-            P.add(precision)
-            R.add(recall)
-            F.add(f1)
+            precision = true_positives / (true_positives + false_positives)
+            recall = true_positives / (true_positives + false_negatives)
+            f1 = 2 * precision * recall / (precision + recall)

-        else:
-            micro_true_positives += true_positives
-            micro_false_positives += false_positives
-            micro_false_negatives += false_negatives
+            if f1 > best_f1:
+                best_f1 = f1
+                best = (precision, recall, f1)

-    if not micro:
-        return (
-            float(P),
-            float(R),
-            float(F)
-        )
+        assert best is not None

-    micro_precision = micro_true_positives / (micro_true_positives + micro_false_positives)
-    micro_recall = micro_true_positives / (micro_true_positives + micro_false_negatives)
+        P.add(best[0])
+        R.add(best[1])
+        F.add(best[2])

     return (
-        micro_precision,
-        micro_recall,
-        2 * micro_precision * micro_recall / (micro_precision + micro_recall)
+        float(P),
+        float(R),
+        float(F)
     )
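The core change in this hunk: instead of greedily taking the candidate with the largest overlap (`candidates.most_common(1)[0]`), the new code scans every candidate cluster and keeps the one maximizing F1. A minimal sketch with toy numbers (not fog internals) of why the two choices can differ:

    # Toy illustration: the largest-overlap candidate is not always the
    # best-F1 candidate for a given truth cluster of 10 items.

    def prf(true_positives, truth_size, predicted_size):
        precision = true_positives / predicted_size
        recall = true_positives / truth_size
        f1 = 2 * precision * recall / (precision + recall)
        return precision, recall, f1

    # Candidate A: overlap of 6, but diluted in a 20-item predicted cluster.
    print(prf(6, 10, 20))  # (0.3, 0.6, 0.4)

    # Candidate B: overlap of only 4, but a pure 4-item predicted cluster.
    print(prf(4, 10, 4))   # (1.0, 0.4, 0.571...)

    # candidates.most_common(1)[0] picked A; the new loop keeps B.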
@@ -2,8 +2,9 @@
 # Fog Best Matching Cluster Evaluation Unit Tests
 # =============================================================================
 from pytest import approx, raises
+from random import shuffle

-from fog.evaluation import best_matching
+from fog.evaluation import best_matching_macro_average


 TRUTH = [
@@ -30,57 +31,50 @@ CLUSTERS_WITH_ADDITIONAL_ITEMS = [
 class TestBestMatching(object):
     def test_exceptions(self):
         with raises(TypeError, match='cannot be found'):
-            best_matching([['A1']], [['A2']])
+            best_matching_macro_average([['A1']], [['A2']])

         with raises(TypeError, match='fuzzy'):
-            best_matching([['A1', 'B1']], [['A1'], ['B1'], ['A1']])
+            best_matching_macro_average([['A1', 'B1']], [['A1'], ['B1'], ['A1']])

         with raises(TypeError, match='empty'):
-            best_matching([['A1'], []], [['A1']])
+            best_matching_macro_average([['A1'], []], [['A1']])

         with raises(TypeError, match='empty'):
-            best_matching([['A1']], [['A1'], []])
+            best_matching_macro_average([['A1']], [['A1'], []])

         with raises(TypeError, match='truth is empty'):
-            best_matching([], [['A1']])
+            best_matching_macro_average([], [['A1']])

         with raises(TypeError, match='predicted is empty'):
-            best_matching([['A1']], [])
+            best_matching_macro_average([['A1']], [])

         with raises(TypeError, match='cannot be found'):
-            best_matching([['A1']], [['A1', 'B1']])
+            best_matching_macro_average([['A1']], [['A1', 'B1']])

     def test_basics(self):
-        result = best_matching(TRUTH, CLUSTERS)
+        result = best_matching_macro_average(TRUTH, CLUSTERS)

         assert result == approx((
-            0.625,
+            0.687,
             0.875,
-            0.714
+            0.756
         ), rel=1e-2)

-        assert best_matching(TRUTH, CLUSTERS) == best_matching(TRUTH, CLUSTERS_WITH_ADDITIONAL_ITEMS, allow_additional_items=True)
+        assert best_matching_macro_average(TRUTH, CLUSTERS) == best_matching_macro_average(TRUTH, CLUSTERS_WITH_ADDITIONAL_ITEMS, allow_additional_items=True)

-    def test_micro(self):
-        result = best_matching(TRUTH, CLUSTERS, micro=True)
+    def test_deterministic(self):
+        shuffled_clusters = CLUSTERS.copy()
+        shuffled_truth = TRUTH.copy()

-        assert result == approx((
-            0.642,
-            0.9,
-            0.75
-        ), rel=1e-2)
+        for _ in range(10):
+            shuffle(shuffled_clusters)
+            shuffle(shuffled_truth)

-        assert best_matching(TRUTH, CLUSTERS, micro=True) == best_matching(TRUTH, CLUSTERS_WITH_ADDITIONAL_ITEMS, micro=True, allow_additional_items=True)
+            assert best_matching_macro_average(shuffled_truth, shuffled_clusters) == best_matching_macro_average(TRUTH, CLUSTERS)

     def test_identity(self):
-        result = best_matching(TRUTH, TRUTH)
+        result = best_matching_macro_average(TRUTH, TRUTH)
         assert result == approx((1.0, 1.0, 1.0))

-        result = best_matching(CLUSTERS, CLUSTERS)
-        assert result == approx((1.0, 1.0, 1.0))
-
-        result = best_matching(TRUTH, TRUTH, micro=True)
-        assert result == approx((1.0, 1.0, 1.0))
-
-        result = best_matching(CLUSTERS, CLUSTERS, micro=True)
+        result = best_matching_macro_average(CLUSTERS, CLUSTERS)
         assert result == approx((1.0, 1.0, 1.0))