Re-enable and update XGBoost (#4716)

Co-authored-by: Hood Chatham <roberthoodchatham@gmail.com>
Gyeongjae Choi 2024-04-27 09:18:10 +09:00 committed by GitHub
parent 4633a958da
commit 55339e00e4
GPG Key ID: B5690EEEBB952194
6 changed files with 124 additions and 126 deletions

packages/xgboost/meta.yaml

@@ -1,21 +1,20 @@
package:
name: xgboost
_disabled: true
version: 1.6.1
version: 2.1.0.dev0
top-level:
- xgboost
source:
url: https://files.pythonhosted.org/packages/0e/8c/19309bcaf9a88b0bab34b88935925153f3f3f646163acaae9aa148cf72bb/xgboost-1.6.1.tar.gz
sha256: 24072028656f3428e7b8aabf77340ece057f273e41f7f85d67ccaefb7454bb18
# temporary URL until xgboost makes a release
url: https://github.com/ryanking13/xgboost/releases/download/2.1.0.dev0/xgboost-2.1.0.dev0.tar.gz
sha256: 0695165010555807a6d3817b0f3ce05efeac74ede8e1d1f74853db944ad0e9f7
patches:
- patches/0001-Add-missing-template-type.patch
- patches/0002-Add-library-loading-path.patch
- patches/0003-Fix-type-mismatch-for-CSR-conversion-in-c_api.patch
- patches/0001-Fix-compilation-on-32-bit-platforms.patch
build:
# DMLC_LOG_STACK_TRACE=0 is to handle https://github.com/dmlc/xgboost/issues/8595
cflags: |
-DDMLC_USE_FOPEN64=0
-DDMLC_ENABLE_STD_THREAD=0
-DDMLC_CXX11_THREAD_LOCAL=0
-DDMLC_LOG_STACK_TRACE=0
-DUSE_OPENMP=0
exports: requested
requirements:
@@ -28,4 +27,3 @@ about:
PyPI: https://pypi.org/project/xgboost
summary: XGBoost Python Package
license: Apache-2.0
# Note: this package cannot be updated until we add support for building with meson
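
Once the recipe above is rebuilt, a quick smoke test confirms that the re-enabled package actually loads and exercises the compiled core. This is a minimal sketch, not part of the commit; it assumes the wheel built from this recipe is available to the interpreter, for example after `pyodide.loadPackage("xgboost")` has been called from the JavaScript side.

```python
# Smoke test for the re-enabled package, run inside a Pyodide REPL after the
# xgboost wheel built from the recipe above has been loaded.
import numpy as np
import xgboost as xgb

print(xgb.__version__)  # expected to match the recipe version, 2.1.0.dev0

# Train a tiny model so the compiled libxgboost core is exercised,
# not just the pure-Python wrapper.
X = np.random.rand(20, 3)
y = (X[:, 0] > 0.5).astype(int)
booster = xgb.train(
    {"objective": "binary:logistic"},
    xgb.DMatrix(X, label=y),
    num_boost_round=2,
)
print(booster.predict(xgb.DMatrix(X))[:3])
```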

packages/xgboost/patches/0001-Add-missing-template-type.patch (deleted)

@@ -1,37 +0,0 @@
From 4ac9a00d9e16b0879b4e734a4b604c7ce672894e Mon Sep 17 00:00:00 2001
From: Gyeongjae Choi <def6488@gmail.com>
Date: Mon, 9 May 2022 06:42:07 +0000
Subject: [PATCH 1/3] Add missing template type
TODO: Remove this patch when XGBoost version is updated.
(Upstream PR: https://github.com/dmlc/xgboost/pull/7954)
---
src/common/host_device_vector.cc | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/xgboost/src/common/host_device_vector.cc b/xgboost/src/common/host_device_vector.cc
index 3a4a59db..fc33317b 100644
--- a/xgboost/src/common/host_device_vector.cc
+++ b/xgboost/src/common/host_device_vector.cc
@@ -180,13 +180,16 @@ template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<uint32_t>; // bst_feature_t
template class HostDeviceVector<RegTree::Segment>;
-#if defined(__APPLE__)
+#if defined(__APPLE__) || defined(__EMSCRIPTEN__)
/*
* On OSX:
*
* typedef unsigned int uint32_t;
* typedef unsigned long long uint64_t;
* typedef unsigned long __darwin_size_t;
+ *
+ * On Emscripten:
+ * typedef unsigned long size_t;
*/
template class HostDeviceVector<std::size_t>;
#endif // defined(__APPLE__)
--
2.35.1
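
The patch removed above worked around the fact that, under Emscripten as under macOS, `std::size_t` is a distinct builtin type (`unsigned long`) rather than an alias of `uint32_t` or `uint64_t`, so `HostDeviceVector<std::size_t>` needed its own explicit instantiation. The underlying ABI facts can be checked from a Pyodide interpreter with a few lines of Python; this is a rough sketch, not part of the commit, and the commented values are what a wasm32 build reports.

```python
# ABI facts behind the (now upstreamed) instantiation fix, as seen from a
# Pyodide/wasm32 interpreter: size_t is "unsigned long", only 4 bytes wide,
# so the explicit instantiations for the fixed-width uint32_t/uint64_t types
# do not cover it.
import ctypes
import struct

print(ctypes.sizeof(ctypes.c_size_t))  # 4 on wasm32, 8 on a typical x86-64 host
print(ctypes.sizeof(ctypes.c_ulong))   # 4 on wasm32 - same width as size_t
print(struct.calcsize("P"))            # 4: pointers are 32-bit as well
```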

packages/xgboost/patches/0001-Fix-compilation-on-32-bit-platforms.patch (new file)

@@ -0,0 +1,108 @@
From ec6451264b6a348f4a6eaa2e067fb1ffa432a6c2 Mon Sep 17 00:00:00 2001
From: Gyeongjae Choi <def6488@gmail.com>
Date: Tue, 23 Apr 2024 10:04:42 +0000
Subject: [PATCH 1/1] Fix compilation on 32-bit platforms.
Partially applies the upstream PR: https://github.com/dmlc/xgboost/pull/8964
---
src/collective/communicator-inl.h | 9 ---------
src/common/quantile.cc | 6 +++---
src/data/iterative_dmatrix.cc | 2 +-
src/metric/auc.cc | 2 +-
src/objective/adaptive.h | 2 +-
5 files changed, 6 insertions(+), 15 deletions(-)
diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h
index 991e19f2c..ea7b415b1 100644
--- a/cpp_src/src/collective/communicator-inl.h
+++ b/cpp_src/src/collective/communicator-inl.h
@@ -288,15 +288,6 @@ inline void Allreduce(uint64_t *send_receive_buffer, size_t count) {
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kUInt64, op);
}
-// Specialization for size_t, which is implementation defined, so it might or might not
-// be one of uint64_t/uint32_t/unsigned long long/unsigned long.
-template <Operation op, typename T,
- typename = std::enable_if_t<std::is_same<size_t, T>{} && !std::is_same<uint64_t, T>{}> >
-inline void Allreduce(T *send_receive_buffer, size_t count) {
- static_assert(sizeof(T) == sizeof(uint64_t));
- Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kUInt64, op);
-}
-
template <Operation op>
inline void Allreduce(float *send_receive_buffer, size_t count) {
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kFloat, op);
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index 8c743d940..0ea819c38 100644
--- a/cpp_src/src/common/quantile.cc
+++ b/cpp_src/src/common/quantile.cc
@@ -154,7 +154,7 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
worker_segments.resize(1, 0);
auto world = collective::GetWorldSize();
auto rank = collective::GetRank();
- auto n_columns = sketches_.size();
+ std::uint64_t n_columns = sketches_.size();
// get the size of each feature.
std::vector<bst_idx_t> sketch_size;
@@ -285,7 +285,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
std::vector<typename WQSketch::SummaryContainer> *p_reduced, std::vector<int32_t> *p_num_cuts) {
monitor_.Start(__func__);
- size_t n_columns = sketches_.size();
+ std::uint64_t n_columns = sketches_.size();
collective::Allreduce<collective::Operation::kMax>(&n_columns, 1);
CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers";
@@ -339,7 +339,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
ParallelFor(n_columns, n_threads_, [&](auto fidx) {
// gcc raises subobject-linkage warning if we put allreduce_result as lambda capture
QuantileAllreduce<typename WQSketch::Entry> allreduce_result{global_sketches, worker_segments,
- sketches_scan, n_columns};
+ sketches_scan, static_cast<size_t>(n_columns)};
int32_t intermediate_num_cuts = num_cuts[fidx];
auto nbytes = WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts);
if (IsCat(feature_types_, fidx)) {
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index 0d75d0651..75f9d1145 100644
--- a/cpp_src/src/data/iterative_dmatrix.cc
+++ b/cpp_src/src/data/iterative_dmatrix.cc
@@ -100,7 +100,7 @@ void SyncFeatureType(Context const*, std::vector<FeatureType>* p_h_ft) {
return;
}
auto& h_ft = *p_h_ft;
- auto n_ft = h_ft.size();
+ std::uint64_t n_ft = h_ft.size();
collective::Allreduce<collective::Operation::kMax>(&n_ft, 1);
if (!h_ft.empty()) {
// Check correct size if this is not an empty DMatrix.
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index 212a3a027..bf2862a7d 100644
--- a/cpp_src/src/metric/auc.cc
+++ b/cpp_src/src/metric/auc.cc
@@ -264,7 +264,7 @@ class EvalAUC : public MetricNoCache {
info.weights_.SetDevice(ctx_->Device());
}
// We use the global size to handle empty dataset.
- std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
+ std::array<bst_idx_t, 2> meta{info.labels.Size(), preds.Size()};
if (!info.IsVerticalFederated()) {
collective::Allreduce<collective::Operation::kMax>(meta.data(), meta.size());
}
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index cbe69e79a..c9e92ae59 100644
--- a/cpp_src/src/objective/adaptive.h
+++ b/cpp_src/src/objective/adaptive.h
@@ -42,7 +42,7 @@ inline void UpdateLeafValues(Context const* ctx, std::vector<float>* p_quantiles
auto& quantiles = *p_quantiles;
auto const& h_node_idx = nidx;
- size_t n_leaf = collective::GlobalMax(ctx, info, h_node_idx.size());
+ std::uint64_t n_leaf = collective::GlobalMax(ctx, info, static_cast<std::uint64_t>(h_node_idx.size()));
CHECK(quantiles.empty() || quantiles.size() == n_leaf);
if (quantiles.empty()) {
quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
--
2.43.2
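
The deleted `Allreduce` overload above was only enabled when `size_t` is not `uint64_t`, and it then asserted `sizeof(T) == sizeof(uint64_t)`; that holds on macOS but fails on any 32-bit target such as wasm32, so the remaining hunks widen the affected counters to an explicit `std::uint64_t` before they are exchanged. Below is a rough Python sketch of that idea only, namely putting counters on a fixed 64-bit wire type instead of the platform-defined `size_t`; the helper names are hypothetical and not XGBoost API.

```python
# Sketch of the "widen before exchanging" pattern: every worker packs its
# counter as an explicit little-endian uint64, so a wasm32 worker (4-byte
# size_t) and an x86-64 worker (8-byte size_t) agree on the wire format.
import struct


def pack_count(n: int) -> bytes:
    return struct.pack("<Q", n)  # "<Q" = little-endian unsigned 64-bit


def unpack_count(buf: bytes) -> int:
    return struct.unpack("<Q", buf)[0]


def allreduce_max(local_counts: list[int]) -> int:
    # Stand-in for collective::Allreduce<kMax>: exchange fixed-width payloads
    # and take the maximum.
    return max(unpack_count(pack_count(n)) for n in local_counts)


assert allreduce_max([3, 7, 5]) == 7
```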

packages/xgboost/patches/0002-Add-library-loading-path.patch (deleted)

@@ -1,29 +0,0 @@
From 54c2a9faeb0b0169172c5ab53367e6092f132c5a Mon Sep 17 00:00:00 2001
From: Gyeongjae Choi <def6488@gmail.com>
Date: Mon, 9 May 2022 12:07:44 +0000
Subject: [PATCH 2/3] Add library loading path
TODO: Remove this patch when XGBoost version is updated.
(Upstream PR: https://github.com/dmlc/xgboost/pull/7954)
---
python-package/xgboost/libpath.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/xgboost/libpath.py b/xgboost/libpath.py
index f7a7d9cd..1ab41cbe 100644
--- a/xgboost/libpath.py
+++ b/xgboost/libpath.py
@@ -43,8 +43,7 @@ def find_lib_path() -> List[str]:
# directory here
dll_path.append(os.path.join(curr_path, './windows/Release/'))
dll_path = [os.path.join(p, 'xgboost.dll') for p in dll_path]
- elif sys.platform.startswith('linux') or sys.platform.startswith(
- 'freebsd'):
+ elif sys.platform.startswith(('linux', 'freebsd', 'emscripten')):
dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path]
elif sys.platform == 'darwin':
dll_path = [os.path.join(p, 'libxgboost.dylib') for p in dll_path]
--
2.35.1
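
This patch is dropped with the version bump because the 2.1.0.dev0 source already carries the equivalent change (the upstream PR referenced in the patch header). The reason it was needed: under Pyodide `sys.platform` reports `"emscripten"`, so the old `startswith('linux')`/`startswith('freebsd')` check fell through and `libxgboost.so` was never added to the candidate list. A compressed sketch of the dispatch, simplified from the patched `find_lib_path` with the directory list left abstract:

```python
import os
import sys


def candidate_libs(dirs):
    """Simplified platform dispatch from xgboost/libpath.py.

    Emscripten builds produce a regular .so, so it joins the Linux/FreeBSD
    branch; under Pyodide sys.platform == "emscripten".
    """
    if sys.platform.startswith(("linux", "freebsd", "emscripten")):
        return [os.path.join(d, "libxgboost.so") for d in dirs]
    if sys.platform == "darwin":
        return [os.path.join(d, "libxgboost.dylib") for d in dirs]
    return [os.path.join(d, "xgboost.dll") for d in dirs]


print(candidate_libs(["/usr/lib", "./lib"]))
```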

packages/xgboost/patches/0003-Fix-type-mismatch-for-CSR-conversion-in-c_api.patch (deleted)

@@ -1,42 +0,0 @@
From 4ec1b506b424dd9e81fd7127f5712522800a5596 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Mon, 17 Oct 2022 15:16:45 -0700
Subject: [PATCH 3/3] Fix type mismatch for CSR conversion in c_api
TODO: Remove this patch when XGBoost version is updated.
(Upstream PR: https://github.com/dmlc/xgboost/pull/8369)
---
xgboost/core.py | 2 +-
xgboost/data.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/xgboost/core.py b/xgboost/core.py
index 36548d8..0246779 100644
--- a/xgboost/core.py
+++ b/xgboost/core.py
@@ -2119,7 +2119,7 @@ class Booster:
_array_interface(csr.indptr),
_array_interface(csr.indices),
_array_interface(csr.data),
- ctypes.c_size_t(csr.shape[1]),
+ c_bst_ulong(csr.shape[1]),
from_pystr_to_cstr(json.dumps(args)),
p_handle,
ctypes.byref(shape),
diff --git a/xgboost/data.py b/xgboost/data.py
index 119b354..b958436 100644
--- a/xgboost/data.py
+++ b/xgboost/data.py
@@ -88,7 +88,7 @@ def _from_scipy_csr(
_array_interface(data.indptr),
_array_interface(data.indices),
_array_interface(data.data),
- ctypes.c_size_t(data.shape[1]),
+ c_bst_ulong(data.shape[1]),
config,
ctypes.byref(handle),
)
--
2.35.1
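
The mismatch this removed patch fixed only bites on 32-bit platforms: XGBoost's C API takes the CSR column count as `bst_ulong`, an unsigned 64-bit integer, while `ctypes.c_size_t` is only 4 bytes wide on wasm32, so the argument was marshalled with the wrong width. The size difference is easy to see from Python; in this small sketch `c_bst_ulong` is reproduced as a local alias rather than imported from `xgboost.core`.

```python
import ctypes

# Local stand-in for the alias used inside the xgboost Python package; the
# C API's bst_ulong is a 64-bit unsigned integer.
c_bst_ulong = ctypes.c_uint64

print(ctypes.sizeof(ctypes.c_size_t))  # 4 on wasm32/Pyodide, 8 on x86-64
print(ctypes.sizeof(c_bst_ulong))      # 8 everywhere

# On a 32-bit ABI the two types are not interchangeable when passed through
# ctypes to a function expecting bst_ulong, which is why the patch replaced
# ctypes.c_size_t(...) with c_bst_ulong(...).
```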

packages/xgboost/test_xgboost.py

@@ -135,11 +135,11 @@ def test_pandas(selenium):
# 1 2 0 1 0
# 2 3 0 0 1
result, _, _ = xgb.data._transform_pandas_df(dummies, enable_categorical=False)
exp = np.array([[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]])
np.testing.assert_array_equal(result, exp)
exp = np.array([[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]]).T
np.testing.assert_array_equal(result.columns, exp)
dm = xgb.DMatrix(dummies)
assert dm.feature_names == ["B", "A_X", "A_Y", "A_Z"]
assert dm.feature_types == ["int", "int", "int", "int"]
assert dm.feature_types == ["int", "i", "i", "i"]
assert dm.num_row() == 3
assert dm.num_col() == 4
@@ -228,7 +228,7 @@ def test_pandas_categorical(selenium):
X, enable_categorical=True
)
assert transformed[:, 0].min() == 0
assert transformed.columns[0].min() == 0
# test missing value
X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
@@ -282,18 +282,18 @@ def test_pandas_label(selenium):
# label must be a single column
df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
with pytest.raises(ValueError):
xgb.data._transform_pandas_df(df, False, None, None, "label", "float")
xgb.data._transform_pandas_df(df, False, None, None, "label")
# label must be supported dtype
df = pd.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)})
with pytest.raises(ValueError):
xgb.data._transform_pandas_df(df, False, None, None, "label", "float")
xgb.data._transform_pandas_df(df, False, None, None, "label")
df = pd.DataFrame({"A": np.array([1, 2, 3], dtype=int)})
result, _, _ = xgb.data._transform_pandas_df(
df, False, None, None, "label", "float"
result, _, _ = xgb.data._transform_pandas_df(df, False, None, None, "label")
np.testing.assert_array_equal(
np.stack(result.columns, axis=1), np.array([[1.0], [2.0], [3.0]], dtype=float)
)
np.testing.assert_array_equal(result, np.array([[1.0], [2.0], [3.0]], dtype=float))
dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
assert dm.num_row() == 3
assert dm.num_col() == 2
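
The core of the pandas behaviour covered by the updated test can also be reproduced interactively. This sketch assumes xgboost 2.x and pandas are importable, for example in a Pyodide console; the commented values are the ones asserted in the test above.

```python
import pandas as pd
import xgboost as xgb

df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
dummies = pd.get_dummies(df)       # columns: B, A_X, A_Y, A_Z

dm = xgb.DMatrix(dummies)
print(dm.feature_names)            # ["B", "A_X", "A_Y", "A_Z"]
print(dm.feature_types)            # ["int", "i", "i", "i"] per the test above
print(dm.num_row(), dm.num_col())  # 3 4
```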