GH-127010: Don't lazily track and untrack dicts (GH-127027)

2024-11-20 16:41:20 +00:00 · 2024-11-20 16:41:20 +00:00 · aea0c586d1
parent 7191b7662e
commit aea0c586d1
12 changed files with 43 additions and 282 deletions
--- a/Doc/library/gc.rst
+++ b/Doc/library/gc.rst
@ -204,8 +204,6 @@ The :mod:`gc` module provides the following functions:
      >>> gc.is_tracked({})
      False
      >>> gc.is_tracked({"a": 1})
-      False
-      >>> gc.is_tracked({"a": []})
      True

   .. versionadded:: 3.1
--- a/Include/internal/pycore_dict.h
+++ b/Include/internal/pycore_dict.h
@ -43,8 +43,6 @@ extern int _PyDict_Next(

 extern int _PyDict_HasOnlyStringKeys(PyObject *mp);

-extern void _PyDict_MaybeUntrack(PyObject *mp);
-
 // Export for '_ctypes' shared extension
 PyAPI_FUNC(Py_ssize_t) _PyDict_SizeOf(PyDictObject *);

--- a/InternalDocs/garbage_collector.md
+++ b/InternalDocs/garbage_collector.md
@ -532,8 +532,8 @@
  currently in.  Instead, when that's needed, ad hoc tricks (like the
  `NEXT_MASK_UNREACHABLE` flag) are employed.

-Optimization: delay tracking containers
-=======================================
+Optimization: delayed untracking containers
+===========================================

 Certain types of containers cannot participate in a reference cycle, and so do
 not need to be tracked by the garbage collector. Untracking these objects
@ -546,26 +546,17 @@
 2. When the container is examined by the garbage collector.

 As a general rule, instances of atomic types aren't tracked and instances of
-non-atomic types (containers, user-defined objects...) are.  However, some
-type-specific optimizations can be present in order to suppress the garbage
-collector footprint of simple instances. Some examples of native types that
-benefit from delayed tracking:
+non-atomic types (containers, user-defined objects...) are.

- Tuples containing only immutable objects (integers, strings etc,
-  and recursively, tuples of immutable objects) do not need to be tracked. The
-  interpreter creates a large number of tuples, many of which will not survive
-  until garbage collection. It is therefore not worthwhile to untrack eligible
-  tuples at creation time. Instead, all tuples except the empty tuple are tracked
-  when created. During garbage collection it is determined whether any surviving
-  tuples can be untracked. A tuple can be untracked if all of its contents are
-  already not tracked. Tuples are examined for untracking in all garbage collection
-  cycles. It may take more than one cycle to untrack a tuple.
-
- Dictionaries containing only immutable objects also do not need to be tracked.
-  Dictionaries are untracked when created. If a tracked item is inserted into a
-  dictionary (either as a key or value), the dictionary becomes tracked. During a
-  full garbage collection (all generations), the collector will untrack any dictionaries
-  whose contents are not tracked.
+Tuples containing only immutable objects (integers, strings etc,
+and recursively, tuples of immutable objects) do not need to be tracked. The
+interpreter creates a large number of tuples, many of which will not survive
+until garbage collection. It is therefore not worthwhile to untrack eligible
+tuples at creation time. Instead, all tuples except the empty tuple are tracked
+when created. During garbage collection it is determined whether any surviving
+tuples can be untracked. A tuple can be untracked if all of its contents are
+already not tracked. Tuples are examined for untracking in all garbage collection
+cycles.

 The garbage collector module provides the Python function `is_tracked(obj)`, which returns
 the current tracking status of the object. Subsequent garbage collections may change the
@ -578,11 +569,11 @@
      False
      >>> gc.is_tracked([])
      True
+      >>> gc.is_tracked(())
+      False
      >>> gc.is_tracked({})
-      False
+      True
      >>> gc.is_tracked({"a": 1})
-      False
-      >>> gc.is_tracked({"a": []})
      True
 ```

--- a/Lib/test/test_dict.py
+++ b/Lib/test/test_dict.py
@ -880,115 +880,6 @@ class C(object):
            gc.collect()
            self.assertIs(ref(), None, "Cycle was not collected")

-    def _not_tracked(self, t):
-        # Nested containers can take several collections to untrack
-        gc.collect()
-        gc.collect()
-        self.assertFalse(gc.is_tracked(t), t)
-
-    def _tracked(self, t):
-        self.assertTrue(gc.is_tracked(t), t)
-        gc.collect()
-        gc.collect()
-        self.assertTrue(gc.is_tracked(t), t)
-
-    def test_string_keys_can_track_values(self):
-        # Test that this doesn't leak.
-        for i in range(10):
-            d = {}
-            for j in range(10):
-                d[str(j)] = j
-            d["foo"] = d
-
-    @support.cpython_only
-    def test_track_literals(self):
-        # Test GC-optimization of dict literals
-        x, y, z, w = 1.5, "a", (1, None), []
-
-        self._not_tracked({})
-        self._not_tracked({x:(), y:x, z:1})
-        self._not_tracked({1: "a", "b": 2})
-        self._not_tracked({1: 2, (None, True, False, ()): int})
-        self._not_tracked({1: object()})
-
-        # Dicts with mutable elements are always tracked, even if those
-        # elements are not tracked right now.
-        self._tracked({1: []})
-        self._tracked({1: ([],)})
-        self._tracked({1: {}})
-        self._tracked({1: set()})
-
-    @support.cpython_only
-    def test_track_dynamic(self):
-        # Test GC-optimization of dynamically-created dicts
-        class MyObject(object):
-            pass
-        x, y, z, w, o = 1.5, "a", (1, object()), [], MyObject()
-
-        d = dict()
-        self._not_tracked(d)
-        d[1] = "a"
-        self._not_tracked(d)
-        d[y] = 2
-        self._not_tracked(d)
-        d[z] = 3
-        self._not_tracked(d)
-        self._not_tracked(d.copy())
-        d[4] = w
-        self._tracked(d)
-        self._tracked(d.copy())
-        d[4] = None
-        self._not_tracked(d)
-        self._not_tracked(d.copy())
-
-        # dd isn't tracked right now, but it may mutate and therefore d
-        # which contains it must be tracked.
-        d = dict()
-        dd = dict()
-        d[1] = dd
-        self._not_tracked(dd)
-        self._tracked(d)
-        dd[1] = d
-        self._tracked(dd)
-
-        d = dict.fromkeys([x, y, z])
-        self._not_tracked(d)
-        dd = dict()
-        dd.update(d)
-        self._not_tracked(dd)
-        d = dict.fromkeys([x, y, z, o])
-        self._tracked(d)
-        dd = dict()
-        dd.update(d)
-        self._tracked(dd)
-
-        d = dict(x=x, y=y, z=z)
-        self._not_tracked(d)
-        d = dict(x=x, y=y, z=z, w=w)
-        self._tracked(d)
-        d = dict()
-        d.update(x=x, y=y, z=z)
-        self._not_tracked(d)
-        d.update(w=w)
-        self._tracked(d)
-
-        d = dict([(x, y), (z, 1)])
-        self._not_tracked(d)
-        d = dict([(x, y), (z, w)])
-        self._tracked(d)
-        d = dict()
-        d.update([(x, y), (z, 1)])
-        self._not_tracked(d)
-        d.update([(x, y), (z, w)])
-        self._tracked(d)
-
-    @support.cpython_only
-    def test_track_subtypes(self):
-        # Dict subtypes are always tracked
-        class MyDict(dict):
-            pass
-        self._tracked(MyDict())
-
    def make_shared_key_dict(self, n):
        class C:
            pass
--- a/Misc/NEWS.d/next/Core_and_Builtins/2024-11-19-17-17-32.gh-issue-127010.9Cl4bb.rst
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-11-19-17-17-32.gh-issue-127010.9Cl4bb.rst
@ -0,0 +1,4 @@
+Simplify GC tracking of dictionaries. All dictionaries are tracked when
+created, rather than being lazily tracked when a trackable object was added
+to them. This simplifies the code considerably and results in a slight
+speedup.
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@ -883,6 +883,7 @@ new_dict(PyInterpreterState *interp,
    mp->ma_used = used;
    mp->_ma_watcher_tag = 0;
    ASSERT_CONSISTENT(mp);
+    _PyObject_GC_TRACK(mp);
    return (PyObject *)mp;
 }

@ -1578,64 +1579,6 @@ _PyDict_HasOnlyStringKeys(PyObject *dict)
    return 1;
 }

-#define MAINTAIN_TRACKING(mp, key, value) \
-    do { \
-        if (!_PyObject_GC_IS_TRACKED(mp)) { \
-            if (_PyObject_GC_MAY_BE_TRACKED(key) || \
-                _PyObject_GC_MAY_BE_TRACKED(value)) { \
-                _PyObject_GC_TRACK(mp); \
-            } \
-        } \
-    } while(0)
-
-void
-_PyDict_MaybeUntrack(PyObject *op)
-{
-    PyDictObject *mp;
-    PyObject *value;
-    Py_ssize_t i, numentries;
-
-    ASSERT_WORLD_STOPPED_OR_DICT_LOCKED(op);
-
-    if (!PyDict_CheckExact(op) || !_PyObject_GC_IS_TRACKED(op))
-        return;
-
-    mp = (PyDictObject *) op;
-    ASSERT_CONSISTENT(mp);
-    numentries = mp->ma_keys->dk_nentries;
-    if (_PyDict_HasSplitTable(mp)) {
-        for (i = 0; i < numentries; i++) {
-            if ((value = mp->ma_values->values[i]) == NULL)
-                continue;
-            if (_PyObject_GC_MAY_BE_TRACKED(value)) {
-                return;
-            }
-        }
-    }
-    else {
-        if (DK_IS_UNICODE(mp->ma_keys)) {
-            PyDictUnicodeEntry *ep0 = DK_UNICODE_ENTRIES(mp->ma_keys);
-            for (i = 0; i < numentries; i++) {
-                if ((value = ep0[i].me_value) == NULL)
-                    continue;
-                if (_PyObject_GC_MAY_BE_TRACKED(value))
-                    return;
-            }
-        }
-        else {
-            PyDictKeyEntry *ep0 = DK_ENTRIES(mp->ma_keys);
-            for (i = 0; i < numentries; i++) {
-                if ((value = ep0[i].me_value) == NULL)
-                    continue;
-                if (_PyObject_GC_MAY_BE_TRACKED(value) ||
-                    _PyObject_GC_MAY_BE_TRACKED(ep0[i].me_key))
-                    return;
-            }
-        }
-    }
-    _PyObject_GC_UNTRACK(op);
-}
-
 void
 _PyDict_EnablePerThreadRefcounting(PyObject *op)
 {
@ -1761,7 +1704,6 @@ insert_split_value(PyInterpreterState *interp, PyDictObject *mp, PyObject *key,
 {
    assert(PyUnicode_CheckExact(key));
    ASSERT_DICT_LOCKED(mp);
-    MAINTAIN_TRACKING(mp, key, value);
    PyObject *old_value = mp->ma_values->values[ix];
    if (old_value == NULL) {
        _PyDict_NotifyEvent(interp, PyDict_EVENT_ADDED, mp, key, value);
@ -1818,8 +1760,6 @@ insertdict(PyInterpreterState *interp, PyDictObject *mp,
    if (ix == DKIX_ERROR)
        goto Fail;

-    MAINTAIN_TRACKING(mp, key, value);
-
    if (ix == DKIX_EMPTY) {
        assert(!_PyDict_HasSplitTable(mp));
        /* Insert into new slot. */
@ -1878,8 +1818,6 @@ insert_to_emptydict(PyInterpreterState *interp, PyDictObject *mp,
    /* We don't decref Py_EMPTY_KEYS here because it is immortal. */
    assert(mp->ma_values == NULL);

-    MAINTAIN_TRACKING(mp, key, value);
-
    size_t hashpos = (size_t)hash & (PyDict_MINSIZE-1);
    dictkeys_set_index(newkeys, hashpos, 0);
    if (unicode) {
@ -4024,8 +3962,7 @@ copy_lock_held(PyObject *o)
        split_copy->ma_used = mp->ma_used;
        split_copy->_ma_watcher_tag = 0;
        dictkeys_incref(mp->ma_keys);
-        if (_PyObject_GC_IS_TRACKED(mp))
-            _PyObject_GC_TRACK(split_copy);
+        _PyObject_GC_TRACK(split_copy);
        return (PyObject *)split_copy;
    }

@ -4060,11 +3997,6 @@ copy_lock_held(PyObject *o)

        new->ma_used = mp->ma_used;
        ASSERT_CONSISTENT(new);
-        if (_PyObject_GC_IS_TRACKED(mp)) {
-            /* Maintain tracking. */
-            _PyObject_GC_TRACK(new);
-        }
-
        return (PyObject *)new;
    }

@ -4351,7 +4283,6 @@ dict_setdefault_ref_lock_held(PyObject *d, PyObject *key, PyObject *default_valu
            }
        }

-        MAINTAIN_TRACKING(mp, key, value);
        STORE_USED(mp, mp->ma_used + 1);
        assert(mp->ma_keys->dk_usable >= 0);
        ASSERT_CONSISTENT(mp);
@ -4800,16 +4731,8 @@ dict_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
    d->ma_keys = Py_EMPTY_KEYS;
    d->ma_values = NULL;
    ASSERT_CONSISTENT(d);
-
-    if (type != &PyDict_Type) {
-        // Don't track if a subclass tp_alloc is PyType_GenericAlloc()
-        if (!_PyObject_GC_IS_TRACKED(d)) {
-            _PyObject_GC_TRACK(d);
-        }
-    }
-    else {
-        // _PyType_AllocNoTrack() does not track the created object
-        assert(!_PyObject_GC_IS_TRACKED(d));
+    if (!_PyObject_GC_IS_TRACKED(d)) {
+        _PyObject_GC_TRACK(d);
    }
    return self;
 }
@ -6746,19 +6669,14 @@ make_dict_from_instance_attributes(PyInterpreterState *interp,
 {
    dictkeys_incref(keys);
    Py_ssize_t used = 0;
-    Py_ssize_t track = 0;
    size_t size = shared_keys_usable_size(keys);
    for (size_t i = 0; i < size; i++) {
        PyObject *val = values->values[i];
        if (val != NULL) {
            used += 1;
-            track += _PyObject_GC_MAY_BE_TRACKED(val);
        }
    }
    PyDictObject *res = (PyDictObject *)new_dict(interp, keys, values, used, 0);
-    if (track && res) {
-        _PyObject_GC_TRACK(res);
-    }
    return res;
 }

@ -7204,6 +7122,7 @@ _PyObject_SetManagedDict(PyObject *obj, PyObject *new_dict)
        // since we locked it.
        dict = _PyObject_ManagedDictPointer(obj)->dict;
        err = _PyDict_DetachFromObject(dict, obj);
+        assert(err == 0 || new_dict == NULL);
        if (err == 0) {
            FT_ATOMIC_STORE_PTR(_PyObject_ManagedDictPointer(obj)->dict,
                                (PyDictObject *)Py_XNewRef(new_dict));
@ -7236,7 +7155,21 @@ void
 PyObject_ClearManagedDict(PyObject *obj)
 {
    if (_PyObject_SetManagedDict(obj, NULL) < 0) {
+        /* Must be out of memory */
+        assert(PyErr_Occurred() == PyExc_MemoryError);
        PyErr_WriteUnraisable(NULL);
+        /* Clear the dict */
+        PyDictObject *dict = _PyObject_GetManagedDict(obj);
+        Py_BEGIN_CRITICAL_SECTION2(dict, obj);
+        dict = _PyObject_ManagedDictPointer(obj)->dict;
+        PyInterpreterState *interp = _PyInterpreterState_GET();
+        PyDictKeysObject *oldkeys = dict->ma_keys;
+        set_keys(dict, Py_EMPTY_KEYS);
+        dict->ma_values = NULL;
+        dictkeys_decref(interp, oldkeys, IS_DICT_SHARED(dict));
+        STORE_USED(dict, 0);
+        set_dict_inline_values(obj, NULL);
+        Py_END_CRITICAL_SECTION2();
    }
 }

@ -7261,12 +7194,6 @@ _PyDict_DetachFromObject(PyDictObject *mp, PyObject *obj)
    PyDictValues *values = copy_values(mp->ma_values);

    if (values == NULL) {
-        /* Out of memory. Clear the dict */
-        PyInterpreterState *interp = _PyInterpreterState_GET();
-        PyDictKeysObject *oldkeys = mp->ma_keys;
-        set_keys(mp, Py_EMPTY_KEYS);
-        dictkeys_decref(interp, oldkeys, IS_DICT_SHARED(mp));
-        STORE_USED(mp, 0);
        PyErr_NoMemory();
        return -1;
    }
--- a/Objects/moduleobject.c
+++ b/Objects/moduleobject.c
@ -107,8 +107,6 @@ static void
 track_module(PyModuleObject *m)
 {
    _PyDict_EnablePerThreadRefcounting(m->md_dict);
-    PyObject_GC_Track(m->md_dict);
-
    _PyObject_SetDeferredRefcount((PyObject *)m);
    PyObject_GC_Track(m);
 }
--- a/Python/bytecodes.c
+++ b/Python/bytecodes.c
@ -2340,10 +2340,6 @@ dummy_func(
            DEOPT_IF(ep->me_key != name);
            PyObject *old_value = ep->me_value;
            DEOPT_IF(old_value == NULL);
-            /* Ensure dict is GC tracked if it needs to be */
-            if (!_PyObject_GC_IS_TRACKED(dict) && _PyObject_GC_MAY_BE_TRACKED(PyStackRef_AsPyObjectBorrow(value))) {
-                _PyObject_GC_TRACK(dict);
-            }
            _PyDict_NotifyEvent(tstate->interp, PyDict_EVENT_MODIFIED, dict, name, PyStackRef_AsPyObjectBorrow(value));
            ep->me_value = PyStackRef_AsPyObjectSteal(value);
            // old_value should be DECREFed after GC track checking is done, if not, it could raise a segmentation fault,
--- a/Python/executor_cases.c.h
+++ b/Python/executor_cases.c.h
@ -2914,10 +2914,6 @@
                UOP_STAT_INC(uopcode, miss);
                JUMP_TO_JUMP_TARGET();
            }
-            /* Ensure dict is GC tracked if it needs to be */
-            if (!_PyObject_GC_IS_TRACKED(dict) && _PyObject_GC_MAY_BE_TRACKED(PyStackRef_AsPyObjectBorrow(value))) {
-                _PyObject_GC_TRACK(dict);
-            }
            _PyFrame_SetStackPointer(frame, stack_pointer);
            _PyDict_NotifyEvent(tstate->interp, PyDict_EVENT_MODIFIED, dict, name, PyStackRef_AsPyObjectBorrow(value));
            stack_pointer = _PyFrame_GetStackPointer(frame);
--- a/Python/gc.c
+++ b/Python/gc.c
@ -5,7 +5,7 @@
 #include "Python.h"
 #include "pycore_ceval.h"         // _Py_set_eval_breaker_bit()
 #include "pycore_context.h"
-#include "pycore_dict.h"          // _PyDict_MaybeUntrack()
+#include "pycore_dict.h"          // _PyInlineValuesSize()
 #include "pycore_initconfig.h"
 #include "pycore_interp.h"        // PyInterpreterState.gc
 #include "pycore_object.h"
@ -747,21 +747,6 @@ untrack_tuples(PyGC_Head *head)
    }
 }

-/* Try to untrack all currently tracked dictionaries */
-static void
-untrack_dicts(PyGC_Head *head)
-{
-    PyGC_Head *next, *gc = GC_NEXT(head);
-    while (gc != head) {
-        PyObject *op = FROM_GC(gc);
-        next = GC_NEXT(gc);
-        if (PyDict_CheckExact(op)) {
-            _PyDict_MaybeUntrack(op);
-        }
-        gc = next;
-    }
-}
-
 /* Return true if object has a pre-PEP 442 finalization method. */
 static int
 has_legacy_finalizer(PyObject *op)
@ -1258,15 +1243,10 @@ handle_resurrected_objects(PyGC_Head *unreachable, PyGC_Head* still_unreachable,
    gc_list_merge(resurrected, old_generation);
 }

-
-#define UNTRACK_TUPLES 1
-#define UNTRACK_DICTS 2
-
 static void
 gc_collect_region(PyThreadState *tstate,
                  PyGC_Head *from,
                  PyGC_Head *to,
-                  int untrack,
                  struct gc_collection_stats *stats);

 static inline Py_ssize_t
@ -1328,7 +1308,7 @@ gc_collect_young(PyThreadState *tstate,

    PyGC_Head survivors;
    gc_list_init(&survivors);
-    gc_collect_region(tstate, young, &survivors, UNTRACK_TUPLES, stats);
+    gc_collect_region(tstate, young, &survivors, stats);
    Py_ssize_t survivor_count = 0;
    if (gcstate->visited_space) {
        /* objects in visited space have bit set, so we set it here */
@ -1471,7 +1451,7 @@ gc_collect_increment(PyThreadState *tstate, struct gc_collection_stats *stats)
    gc_list_validate_space(&increment, gcstate->visited_space);
    PyGC_Head survivors;
    gc_list_init(&survivors);
-    gc_collect_region(tstate, &increment, &survivors, UNTRACK_TUPLES, stats);
+    gc_collect_region(tstate, &increment, &survivors, stats);
    gc_list_validate_space(&survivors, gcstate->visited_space);
    gc_list_merge(&survivors, visited);
    assert(gc_list_is_empty(&increment));
@ -1504,7 +1484,6 @@ gc_collect_full(PyThreadState *tstate,
    gc_list_merge(pending, visited);

    gc_collect_region(tstate, visited, visited,
-                      UNTRACK_TUPLES | UNTRACK_DICTS,
                      stats);
    gcstate->young.count = 0;
    gcstate->old[0].count = 0;
@ -1522,7 +1501,6 @@ static void
 gc_collect_region(PyThreadState *tstate,
                  PyGC_Head *from,
                  PyGC_Head *to,
-                  int untrack,
                  struct gc_collection_stats *stats)
 {
    PyGC_Head unreachable; /* non-problematic unreachable trash */
@ -1536,12 +1514,7 @@ gc_collect_region(PyThreadState *tstate,
    gc_list_init(&unreachable);
    deduce_unreachable(from, &unreachable);
    validate_consistent_old_space(from);
-    if (untrack & UNTRACK_TUPLES) {
-        untrack_tuples(from);
-    }
-    if (untrack & UNTRACK_DICTS) {
-        untrack_dicts(from);
-    }
+    untrack_tuples(from);
    validate_consistent_old_space(to);
    if (from != to) {
        gc_list_merge(from, to);
--- a/Python/gc_free_threading.c
+++ b/Python/gc_free_threading.c
@ -3,7 +3,7 @@
 #include "pycore_brc.h"           // struct _brc_thread_state
 #include "pycore_ceval.h"         // _Py_set_eval_breaker_bit()
 #include "pycore_context.h"
-#include "pycore_dict.h"          // _PyDict_MaybeUntrack()
+#include "pycore_dict.h"          // _PyInlineValuesSize()
 #include "pycore_freelist.h"      // _PyObject_ClearFreeLists()
 #include "pycore_initconfig.h"
 #include "pycore_interp.h"        // PyInterpreterState.gc
@ -493,13 +493,6 @@ update_refs(const mi_heap_t *heap, const mi_heap_area_t *area,
                return true;
            }
        }
-        else if (PyDict_CheckExact(op)) {
-            _PyDict_MaybeUntrack(op);
-            if (!_PyObject_GC_IS_TRACKED(op)) {
-                gc_restore_refs(op);
-                return true;
-            }
-        }
    }

    // We repurpose ob_tid to compute "gc_refs", the number of external
--- a/Python/generated_cases.c.h
+++ b/Python/generated_cases.c.h
@ -7435,10 +7435,6 @@
                DEOPT_IF(ep->me_key != name, STORE_ATTR);
                PyObject *old_value = ep->me_value;
                DEOPT_IF(old_value == NULL, STORE_ATTR);
-                /* Ensure dict is GC tracked if it needs to be */
-                if (!_PyObject_GC_IS_TRACKED(dict) && _PyObject_GC_MAY_BE_TRACKED(PyStackRef_AsPyObjectBorrow(value))) {
-                    _PyObject_GC_TRACK(dict);
-                }
                _PyFrame_SetStackPointer(frame, stack_pointer);
                _PyDict_NotifyEvent(tstate->interp, PyDict_EVENT_MODIFIED, dict, name, PyStackRef_AsPyObjectBorrow(value));
                stack_pointer = _PyFrame_GetStackPointer(frame);