diff --git a/README.md b/README.md
index ab33079..778b535 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ pip install fog
 ## Usage
 
 * [Evaluation](#evaluation)
-  * [best_matching](#best_matching)
+  * [best_matching_macro_average](#best_matching_macro_average)
 * [Graph](#graph)
   * [floatsam_sparsification](#floatsam_sparsification)
   * [monopartite_projection](#monopartite_projection)
@@ -35,10 +35,10 @@ pip install fog
 
 ### Evaluation
 
-#### best_matching
+#### best_matching_macro_average
 
-Efficient implementation of the "best matching F1" evaluation metric for
-clusters.
+Efficient implementation of the "macro average best matching F1" evaluation
+metric for clusters.
 
 Note that this metric is not symmetric and will match truth -> predicted.
 
@@ -49,8 +49,6 @@ Note that this metric is not symmetric and will match truth -> predicted.
 that don't exist in truth clusters to be found in predicted ones. Those
 additional items will then be ignored when computing the metrics instead
 of raising an error when found.
-* **micro** *?bool* [`False`]: Whether to compute the micro average instead of the macro
-average of the evaluation metric.
 
 ### Graph
 
diff --git a/docs/build.py b/docs/build.py
index 5870c1a..e03a29e 100644
--- a/docs/build.py
+++ b/docs/build.py
@@ -18,7 +18,7 @@ DOCS = [
     {
         'title': 'Evaluation',
         'fns': [
-            evaluation.best_matching
+            evaluation.best_matching_macro_average
         ]
     },
     {
diff --git a/fog/evaluation/__init__.py b/fog/evaluation/__init__.py
index a8ffac9..09a8146 100644
--- a/fog/evaluation/__init__.py
+++ b/fog/evaluation/__init__.py
@@ -1,2 +1,2 @@
-from fog.evaluation.best_matching import best_matching
+from fog.evaluation.best_matching import best_matching_macro_average
 from fog.evaluation.utils import labels_to_clusters, clusters_to_labels
diff --git a/fog/evaluation/best_matching.py b/fog/evaluation/best_matching.py
index d87dda3..5b9905b 100644
--- a/fog/evaluation/best_matching.py
+++ b/fog/evaluation/best_matching.py
@@ -18,15 +18,14 @@ from typing import Hashable, Iterable, Tuple
 
 from fog.utils import OnlineMean
 
 
-def best_matching(
+def best_matching_macro_average(
     truth: Iterable[Iterable[Hashable]],
     predicted: Iterable[Iterable[Hashable]],
-    allow_additional_items: bool = False,
-    micro: bool = False
+    allow_additional_items: bool = False
 ) -> Tuple[float, float, float]:
     """
-    Efficient implementation of the "best matching F1" evaluation metric for
-    clusters.
+    Efficient implementation of the "macro average best matching F1" evaluation
+    metric for clusters.
 
     Note that this metric is not symmetric and will match truth -> predicted.
@@ -37,8 +36,6 @@ def best_matching(
             that don't exist in truth clusters to be found in predicted ones. Those
             additional items will then be ignored when computing the metrics instead
             of raising an error when found. Defaults to False.
-        micro (bool, optional): Whether to compute the micro average instead of the macro
-            average of the evaluation metric. Defaults to False.
 
     Returns:
         tuple of floats: precision, recall and f1 score.
@@ -89,10 +86,6 @@ def best_matching(
     R = OnlineMean()
     F = OnlineMean()
 
-    micro_true_positives = 0
-    micro_false_positives = 0
-    micro_false_negatives = 0
-
     # Matching truth
     for cluster in truth:
         if not cluster:
@@ -111,38 +104,32 @@ def best_matching(
             candidates[candidate_cluster_index] += 1
             cluster_size += 1
 
-        matching_cluster_index, true_positives = candidates.most_common(1)[0]
-        matching_cluster_size = predicted_cluster_sizes[matching_cluster_index]
+        best_f1 = -1.0
+        best = None
 
-        false_positives = matching_cluster_size - true_positives
-        false_negatives = cluster_size - true_positives
+        # Finding a matching cluster that maximizes F1 score
+        for matching_cluster_index, true_positives in candidates.items():
+            matching_cluster_size = predicted_cluster_sizes[matching_cluster_index]
+
+            false_positives = matching_cluster_size - true_positives
+            false_negatives = cluster_size - true_positives
 
-        if not micro:
             precision = true_positives / (true_positives + false_positives)
             recall = true_positives / (true_positives + false_negatives)
             f1 = 2 * precision * recall / (precision + recall)
 
-            P.add(precision)
-            R.add(recall)
-            F.add(f1)
+            if f1 > best_f1:
+                best_f1 = f1
+                best = (precision, recall, f1)
 
-        else:
-            micro_true_positives += true_positives
-            micro_false_positives += false_positives
-            micro_false_negatives += false_negatives
+        assert best is not None
 
-    if not micro:
-        return (
-            float(P),
-            float(R),
-            float(F)
-        )
-
-    micro_precision = micro_true_positives / (micro_true_positives + micro_false_positives)
-    micro_recall = micro_true_positives / (micro_true_positives + micro_false_negatives)
+        P.add(best[0])
+        R.add(best[1])
+        F.add(best[2])
 
     return (
-        micro_precision,
-        micro_recall,
-        2 * micro_precision * micro_recall / (micro_precision + micro_recall)
+        float(P),
+        float(R),
+        float(F)
     )
diff --git a/test/evaluation/best_matching_test.py b/test/evaluation/best_matching_test.py
index 61b6d48..8bb350b 100644
--- a/test/evaluation/best_matching_test.py
+++ b/test/evaluation/best_matching_test.py
@@ -2,8 +2,9 @@
 # Fog Best Matching Cluster Evaluation Unit Tests
 # =============================================================================
 from pytest import approx, raises
+from random import shuffle
 
-from fog.evaluation import best_matching
+from fog.evaluation import best_matching_macro_average
 
 
 TRUTH = [
@@ -30,57 +31,50 @@ CLUSTERS_WITH_ADDITIONAL_ITEMS = [
 class TestBestMatching(object):
     def test_exceptions(self):
         with raises(TypeError, match='cannot be found'):
-            best_matching([['A1']], [['A2']])
+            best_matching_macro_average([['A1']], [['A2']])
 
         with raises(TypeError, match='fuzzy'):
-            best_matching([['A1', 'B1']], [['A1'], ['B1'], ['A1']])
+            best_matching_macro_average([['A1', 'B1']], [['A1'], ['B1'], ['A1']])
 
         with raises(TypeError, match='empty'):
-            best_matching([['A1'], []], [['A1']])
+            best_matching_macro_average([['A1'], []], [['A1']])
 
         with raises(TypeError, match='empty'):
-            best_matching([['A1']], [['A1'], []])
+            best_matching_macro_average([['A1']], [['A1'], []])
 
         with raises(TypeError, match='truth is empty'):
-            best_matching([], [['A1']])
+            best_matching_macro_average([], [['A1']])
 
         with raises(TypeError, match='predicted is empty'):
-            best_matching([['A1']], [])
+            best_matching_macro_average([['A1']], [])
 
         with raises(TypeError, match='cannot be found'):
-            best_matching([['A1']], [['A1', 'B1']])
+            best_matching_macro_average([['A1']], [['A1', 'B1']])
 
     def test_basics(self):
-        result = best_matching(TRUTH, CLUSTERS)
+        result = best_matching_macro_average(TRUTH, CLUSTERS)
 
         assert result == approx((
-            0.625,
+            0.687,
             0.875,
-            0.714
+            0.756
         ), rel=1e-2)
 
-        assert best_matching(TRUTH, CLUSTERS) == best_matching(TRUTH, CLUSTERS_WITH_ADDITIONAL_ITEMS, allow_additional_items=True)
+        assert best_matching_macro_average(TRUTH, CLUSTERS) == best_matching_macro_average(TRUTH, CLUSTERS_WITH_ADDITIONAL_ITEMS, allow_additional_items=True)
 
-    def test_micro(self):
-        result = best_matching(TRUTH, CLUSTERS, micro=True)
+    def test_deterministic(self):
+        shuffled_clusters = CLUSTERS.copy()
+        shuffled_truth = TRUTH.copy()
 
-        assert result == approx((
-            0.642,
-            0.9,
-            0.75
-        ), rel=1e-2)
+        for _ in range(10):
+            shuffle(shuffled_clusters)
+            shuffle(shuffled_truth)
 
-        assert best_matching(TRUTH, CLUSTERS, micro=True) == best_matching(TRUTH, CLUSTERS_WITH_ADDITIONAL_ITEMS, micro=True, allow_additional_items=True)
+            assert best_matching_macro_average(shuffled_truth, shuffled_clusters) == best_matching_macro_average(TRUTH, CLUSTERS)
 
     def test_identity(self):
-        result = best_matching(TRUTH, TRUTH)
+        result = best_matching_macro_average(TRUTH, TRUTH)
         assert result == approx((1.0, 1.0, 1.0))
 
-        result = best_matching(CLUSTERS, CLUSTERS)
-        assert result == approx((1.0, 1.0, 1.0))
-
-        result = best_matching(TRUTH, TRUTH, micro=True)
-        assert result == approx((1.0, 1.0, 1.0))
-
-        result = best_matching(CLUSTERS, CLUSTERS, micro=True)
+        result = best_matching_macro_average(CLUSTERS, CLUSTERS)
         assert result == approx((1.0, 1.0, 1.0))
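
Below is a minimal usage sketch of the renamed helper, assuming only the public import path introduced in this patch (fog.evaluation.best_matching_macro_average); the toy clusters are made up for illustration and are not part of the change:

    from fog.evaluation import best_matching_macro_average

    # Clusters are given as iterables of hashable items.
    truth = [['A1', 'A2'], ['B1', 'B2', 'B3']]
    predicted = [['A1', 'A2', 'B1'], ['B2', 'B3']]

    # Returns a (precision, recall, f1) tuple, macro-averaged over truth clusters
    # by matching each truth cluster to the predicted cluster maximizing its F1.
    precision, recall, f1 = best_matching_macro_average(truth, predicted)
    print(precision, recall, f1)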