From 1e10b3b4621ab0100a33accc8763e82be6e77c45 Mon Sep 17 00:00:00 2001
From: Yomguithereal
Date: Mon, 17 May 2021 20:24:08 +0200
Subject: [PATCH] Fixing best_matching to choose the cluster maximizing F1

Fixes #19
---
 README.md                             | 10 ++---
 docs/build.py                         |  2 +-
 fog/evaluation/__init__.py            |  2 +-
 fog/evaluation/best_matching.py       | 57 +++++++++++----------
 test/evaluation/best_matching_test.py | 50 +++++++++++------------
 5 files changed, 50 insertions(+), 71 deletions(-)

diff --git a/README.md b/README.md
index ab33079..778b535 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ pip install fog
 ## Usage
 
 * [Evaluation](#evaluation)
-  * [best_matching](#best_matching)
+  * [best_matching_macro_average](#best_matching_macro_average)
 * [Graph](#graph)
   * [floatsam_sparsification](#floatsam_sparsification)
   * [monopartite_projection](#monopartite_projection)
@@ -35,10 +35,10 @@ pip install fog
 
 ### Evaluation
 
-#### best_matching
+#### best_matching_macro_average
 
-Efficient implementation of the "best matching F1" evaluation metric for
-clusters.
+Efficient implementation of the "macro average best matching F1" evaluation
+metric for clusters.
 
 Note that this metric is not symmetric and will match truth -> predicted.
 
@@ -49,8 +49,6 @@ Note that this metric is not symmetric and will match truth -> predicted.
 that don't exist in truth clusters to be found in predicted ones. Those
 additional items will then be ignored when computing the metrics instead of
 raising an error when found.
-* **micro** *?bool* [`False`]: Whether to compute the micro average instead of the macro
-average of the evaluation metric.
 
 ### Graph
 
diff --git a/docs/build.py b/docs/build.py
index 5870c1a..e03a29e 100644
--- a/docs/build.py
+++ b/docs/build.py
@@ -18,7 +18,7 @@ DOCS = [
     {
         'title': 'Evaluation',
         'fns': [
-            evaluation.best_matching
+            evaluation.best_matching_macro_average
         ]
     },
     {
diff --git a/fog/evaluation/__init__.py b/fog/evaluation/__init__.py
index a8ffac9..09a8146 100644
--- a/fog/evaluation/__init__.py
+++ b/fog/evaluation/__init__.py
@@ -1,2 +1,2 @@
-from fog.evaluation.best_matching import best_matching
+from fog.evaluation.best_matching import best_matching_macro_average
 from fog.evaluation.utils import labels_to_clusters, clusters_to_labels
diff --git a/fog/evaluation/best_matching.py b/fog/evaluation/best_matching.py
index d87dda3..5b9905b 100644
--- a/fog/evaluation/best_matching.py
+++ b/fog/evaluation/best_matching.py
@@ -18,15 +18,14 @@ from typing import Hashable, Iterable, Tuple
 from fog.utils import OnlineMean
 
 
-def best_matching(
+def best_matching_macro_average(
     truth: Iterable[Iterable[Hashable]],
     predicted: Iterable[Iterable[Hashable]],
-    allow_additional_items: bool = False,
-    micro: bool = False
+    allow_additional_items: bool = False
 ) -> Tuple[float, float, float]:
     """
-    Efficient implementation of the "best matching F1" evaluation metric for
-    clusters.
+    Efficient implementation of the "macro average best matching F1" evaluation
+    metric for clusters.
 
     Note that this metric is not symmetric and will match truth -> predicted.
 
@@ -37,8 +36,6 @@ def best_matching(
             that don't exist in truth clusters to be found in predicted ones. Those
             additional items will then be ignored when computing the metrics instead
             of raising an error when found. Defaults to False.
-        micro (bool, optional): Whether to compute the micro average instead of the macro
-            average of the evaluation metric. Defaults to False.
 
     Returns:
         tuple of floats: precision, recall and f1 score.
@@ -89,10 +86,6 @@ def best_matching(
     R = OnlineMean()
     F = OnlineMean()
 
-    micro_true_positives = 0
-    micro_false_positives = 0
-    micro_false_negatives = 0
-
     # Matching truth
     for cluster in truth:
         if not cluster:
@@ -111,38 +104,32 @@ def best_matching(
             candidates[candidate_cluster_index] += 1
             cluster_size += 1
 
-        matching_cluster_index, true_positives = candidates.most_common(1)[0]
-        matching_cluster_size = predicted_cluster_sizes[matching_cluster_index]
+        best_f1 = -1.0
+        best = None
 
-        false_positives = matching_cluster_size - true_positives
-        false_negatives = cluster_size - true_positives
+        # Finding a matching cluster that maximizes F1 score
+        for matching_cluster_index, true_positives in candidates.items():
+            matching_cluster_size = predicted_cluster_sizes[matching_cluster_index]
+
+            false_positives = matching_cluster_size - true_positives
+            false_negatives = cluster_size - true_positives
 
-        if not micro:
             precision = true_positives / (true_positives + false_positives)
             recall = true_positives / (true_positives + false_negatives)
             f1 = 2 * precision * recall / (precision + recall)
 
-            P.add(precision)
-            R.add(recall)
-            F.add(f1)
+            if f1 > best_f1:
+                best_f1 = f1
+                best = (precision, recall, f1)
 
-        else:
-            micro_true_positives += true_positives
-            micro_false_positives += false_positives
-            micro_false_negatives += false_negatives
+        assert best is not None
 
-    if not micro:
-        return (
-            float(P),
-            float(R),
-            float(F)
-        )
-
-    micro_precision = micro_true_positives / (micro_true_positives + micro_false_positives)
-    micro_recall = micro_true_positives / (micro_true_positives + micro_false_negatives)
+        P.add(best[0])
+        R.add(best[1])
+        F.add(best[2])
 
     return (
-        micro_precision,
-        micro_recall,
-        2 * micro_precision * micro_recall / (micro_precision + micro_recall)
+        float(P),
+        float(R),
+        float(F)
     )
diff --git a/test/evaluation/best_matching_test.py b/test/evaluation/best_matching_test.py
index 61b6d48..8bb350b 100644
--- a/test/evaluation/best_matching_test.py
+++ b/test/evaluation/best_matching_test.py
@@ -2,8 +2,9 @@
 # Fog Best Matching Cluster Evaluation Unit Tests
 # =============================================================================
 from pytest import approx, raises
+from random import shuffle
 
-from fog.evaluation import best_matching
+from fog.evaluation import best_matching_macro_average
 
 
 TRUTH = [
@@ -30,57 +31,50 @@ CLUSTERS_WITH_ADDITIONAL_ITEMS = [
 class TestBestMatching(object):
     def test_exceptions(self):
         with raises(TypeError, match='cannot be found'):
-            best_matching([['A1']], [['A2']])
+            best_matching_macro_average([['A1']], [['A2']])
 
         with raises(TypeError, match='fuzzy'):
-            best_matching([['A1', 'B1']], [['A1'], ['B1'], ['A1']])
+            best_matching_macro_average([['A1', 'B1']], [['A1'], ['B1'], ['A1']])
 
         with raises(TypeError, match='empty'):
-            best_matching([['A1'], []], [['A1']])
+            best_matching_macro_average([['A1'], []], [['A1']])
 
         with raises(TypeError, match='empty'):
-            best_matching([['A1']], [['A1'], []])
+            best_matching_macro_average([['A1']], [['A1'], []])
 
         with raises(TypeError, match='truth is empty'):
-            best_matching([], [['A1']])
+            best_matching_macro_average([], [['A1']])
 
         with raises(TypeError, match='predicted is empty'):
-            best_matching([['A1']], [])
+            best_matching_macro_average([['A1']], [])
 
         with raises(TypeError, match='cannot be found'):
-            best_matching([['A1']], [['A1', 'B1']])
+            best_matching_macro_average([['A1']], [['A1', 'B1']])
 
     def test_basics(self):
-        result = best_matching(TRUTH, CLUSTERS)
+        result = best_matching_macro_average(TRUTH, CLUSTERS)
 
         assert result == approx((
-            0.625,
+            0.687,
             0.875,
-            0.714
+            0.756
         ), rel=1e-2)
 
-        assert best_matching(TRUTH, CLUSTERS) == best_matching(TRUTH, CLUSTERS_WITH_ADDITIONAL_ITEMS, allow_additional_items=True)
+        assert best_matching_macro_average(TRUTH, CLUSTERS) == best_matching_macro_average(TRUTH, CLUSTERS_WITH_ADDITIONAL_ITEMS, allow_additional_items=True)
 
-    def test_micro(self):
-        result = best_matching(TRUTH, CLUSTERS, micro=True)
+    def test_deterministic(self):
+        shuffled_clusters = CLUSTERS.copy()
+        shuffled_truth = TRUTH.copy()
 
-        assert result == approx((
-            0.642,
-            0.9,
-            0.75
-        ), rel=1e-2)
+        for _ in range(10):
+            shuffle(shuffled_clusters)
+            shuffle(shuffled_truth)
 
-        assert best_matching(TRUTH, CLUSTERS, micro=True) == best_matching(TRUTH, CLUSTERS_WITH_ADDITIONAL_ITEMS, micro=True, allow_additional_items=True)
+            assert best_matching_macro_average(shuffled_truth, shuffled_clusters) == best_matching_macro_average(TRUTH, CLUSTERS)
 
     def test_identity(self):
-        result = best_matching(TRUTH, TRUTH)
+        result = best_matching_macro_average(TRUTH, TRUTH)
 
         assert result == approx((1.0, 1.0, 1.0))
 
-        result = best_matching(CLUSTERS, CLUSTERS)
-        assert result == approx((1.0, 1.0, 1.0))
-
-        result = best_matching(TRUTH, TRUTH, micro=True)
-        assert result == approx((1.0, 1.0, 1.0))
-
-        result = best_matching(CLUSTERS, CLUSTERS, micro=True)
+        result = best_matching_macro_average(CLUSTERS, CLUSTERS)
         assert result == approx((1.0, 1.0, 1.0))
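
For reference, a minimal usage sketch of the renamed function (not part of the patch; the toy clusters below are illustrative only, not the repository's test fixtures):

```python
# Usage sketch: evaluating a predicted clustering against a ground-truth one.
# Item labels are made up for illustration.
from fog.evaluation import best_matching_macro_average

truth = [
    ['A1', 'A2', 'A3'],
    ['B1', 'B2']
]

predicted = [
    ['A1', 'A2'],
    ['A3', 'B1', 'B2']
]

# For each truth cluster, the predicted cluster maximizing F1 is selected,
# then precision, recall and F1 are macro-averaged over the truth clusters.
precision, recall, f1 = best_matching_macro_average(truth, predicted)
print(precision, recall, f1)
```

Since the matching is computed truth -> predicted, the metric is not symmetric and swapping the two arguments will generally yield different scores.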