Re-enable and update XGBoost (#4716)

Co-authored-by: Hood Chatham <roberthoodchatham@gmail.com>
Gyeongjae Choi 2024-04-27 09:18:10 +09:00 committed by GitHub
parent 4633a958da
commit 55339e00e4
GPG Key ID: B5690EEEBB952194
6 changed files with 124 additions and 126 deletions

packages/xgboost/meta.yaml

@@ -1,21 +1,20 @@
package:
name: xgboost
_disabled: true
version: 1.6.1
version: 2.1.0.dev0
top-level:
- xgboost
source:
url: https://files.pythonhosted.org/packages/0e/8c/19309bcaf9a88b0bab34b88935925153f3f3f646163acaae9aa148cf72bb/xgboost-1.6.1.tar.gz
sha256: 24072028656f3428e7b8aabf77340ece057f273e41f7f85d67ccaefb7454bb18
# temporary URL until xgboost makes a release
url: https://github.com/ryanking13/xgboost/releases/download/2.1.0.dev0/xgboost-2.1.0.dev0.tar.gz
sha256: 0695165010555807a6d3817b0f3ce05efeac74ede8e1d1f74853db944ad0e9f7
patches:
- patches/0001-Add-missing-template-type.patch
- patches/0002-Add-library-loading-path.patch
- patches/0003-Fix-type-mismatch-for-CSR-conversion-in-c_api.patch
- patches/0001-Fix-compilation-on-32-bit-platforms.patch
build:
# DMLC_LOG_STACK_TRACE=0 is to handle https://github.com/dmlc/xgboost/issues/8595
cflags: |
-DDMLC_USE_FOPEN64=0
-DDMLC_ENABLE_STD_THREAD=0
-DDMLC_CXX11_THREAD_LOCAL=0
-DDMLC_LOG_STACK_TRACE=0
-DUSE_OPENMP=0
exports: requested
requirements:
@@ -28,4 +27,3 @@ about:
PyPI: https://pypi.org/project/xgboost
summary: XGBoost Python Package
license: Apache-2.0
# Note: this package cannot be updated until we add support for building with meson
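
Once the recipe above is rebuilt, a quick smoke test confirms that the re-enabled package actually loads and exercises the compiled core. This is a minimal sketch, not part of the commit; it assumes the wheel built from this recipe is available to the interpreter, for example after `pyodide.loadPackage("xgboost")` has been called from the JavaScript side.

```python
# Smoke test for the re-enabled package, run inside a Pyodide REPL after the
# xgboost wheel built from the recipe above has been loaded.
import numpy as np
import xgboost as xgb

print(xgb.__version__)  # expected to match the recipe version, 2.1.0.dev0

# Train a tiny model so the compiled libxgboost core is exercised,
# not just the pure-Python wrapper.
X = np.random.rand(20, 3)
y = (X[:, 0] > 0.5).astype(int)
booster = xgb.train(
    {"objective": "binary:logistic"},
    xgb.DMatrix(X, label=y),
    num_boost_round=2,
)
print(booster.predict(xgb.DMatrix(X))[:3])
```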

packages/xgboost/patches/0001-Add-missing-template-type.patch (deleted)

@@ -1,37 +0,0 @@
From 4ac9a00d9e16b0879b4e734a4b604c7ce672894e Mon Sep 17 00:00:00 2001
From: Gyeongjae Choi <def6488@gmail.com>
Date: Mon, 9 May 2022 06:42:07 +0000
Subject: [PATCH 1/3] Add missing template type
TODO: Remove this patch when XGBoost version is updated.
(Upstream PR: https://github.com/dmlc/xgboost/pull/7954)
---
src/common/host_device_vector.cc | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/xgboost/src/common/host_device_vector.cc b/xgboost/src/common/host_device_vector.cc
index 3a4a59db..fc33317b 100644
--- a/xgboost/src/common/host_device_vector.cc
+++ b/xgboost/src/common/host_device_vector.cc
@@ -180,13 +180,16 @@ template class HostDeviceVector<uint64_t>; // bst_row_t
template class HostDeviceVector<uint32_t>; // bst_feature_t
template class HostDeviceVector<RegTree::Segment>;
-#if defined(__APPLE__)
+#if defined(__APPLE__) || defined(__EMSCRIPTEN__)
/*
* On OSX:
*
* typedef unsigned int uint32_t;
* typedef unsigned long long uint64_t;
* typedef unsigned long __darwin_size_t;
+ *
+ * On Emscripten:
+ * typedef unsigned long size_t;
*/
template class HostDeviceVector<std::size_t>;
#endif // defined(__APPLE__)
--
2.35.1
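
The patch removed above worked around the fact that, under Emscripten as under macOS, `std::size_t` is a distinct builtin type (`unsigned long`) rather than an alias of `uint32_t` or `uint64_t`, so `HostDeviceVector<std::size_t>` needed its own explicit instantiation. The underlying ABI facts can be checked from a Pyodide interpreter with a few lines of Python; this is a rough sketch, not part of the commit, and the commented values are what a wasm32 build reports.

```python
# ABI facts behind the (now upstreamed) instantiation fix, as seen from a
# Pyodide/wasm32 interpreter: size_t is "unsigned long", only 4 bytes wide,
# so the explicit instantiations for the fixed-width uint32_t/uint64_t types
# do not cover it.
import ctypes
import struct

print(ctypes.sizeof(ctypes.c_size_t))  # 4 on wasm32, 8 on a typical x86-64 host
print(ctypes.sizeof(ctypes.c_ulong))   # 4 on wasm32 - same width as size_t
print(struct.calcsize("P"))            # 4: pointers are 32-bit as well
```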

packages/xgboost/patches/0001-Fix-compilation-on-32-bit-platforms.patch (new file)

@@ -0,0 +1,108 @@
From ec6451264b6a348f4a6eaa2e067fb1ffa432a6c2 Mon Sep 17 00:00:00 2001
From: Gyeongjae Choi <def6488@gmail.com>
Date: Tue, 23 Apr 2024 10:04:42 +0000
Subject: [PATCH 1/1] Fix compilation on 32-bit platforms.
Partially applies the upstream PR: https://github.com/dmlc/xgboost/pull/8964
---
src/collective/communicator-inl.h | 9 ---------
src/common/quantile.cc | 6 +++---
src/data/iterative_dmatrix.cc | 2 +-
src/metric/auc.cc | 2 +-
src/objective/adaptive.h | 2 +-
5 files changed, 6 insertions(+), 15 deletions(-)
diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h
index 991e19f2c..ea7b415b1 100644
--- a/cpp_src/src/collective/communicator-inl.h
+++ b/cpp_src/src/collective/communicator-inl.h
@@ -288,15 +288,6 @@ inline void Allreduce(uint64_t *send_receive_buffer, size_t count) {
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kUInt64, op);
}
-// Specialization for size_t, which is implementation defined, so it might or might not
-// be one of uint64_t/uint32_t/unsigned long long/unsigned long.
-template <Operation op, typename T,
- typename = std::enable_if_t<std::is_same<size_t, T>{} && !std::is_same<uint64_t, T>{}> >
-inline void Allreduce(T *send_receive_buffer, size_t count) {
- static_assert(sizeof(T) == sizeof(uint64_t));
- Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kUInt64, op);
-}
-
template <Operation op>
inline void Allreduce(float *send_receive_buffer, size_t count) {
Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kFloat, op);
diff --git a/src/common/quantile.cc b/src/common/quantile.cc
index 8c743d940..0ea819c38 100644
--- a/cpp_src/src/common/quantile.cc
+++ b/cpp_src/src/common/quantile.cc
@@ -154,7 +154,7 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
worker_segments.resize(1, 0);
auto world = collective::GetWorldSize();
auto rank = collective::GetRank();
- auto n_columns = sketches_.size();
+ std::uint64_t n_columns = sketches_.size();
// get the size of each feature.
std::vector<bst_idx_t> sketch_size;
@@ -285,7 +285,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
std::vector<typename WQSketch::SummaryContainer> *p_reduced, std::vector<int32_t> *p_num_cuts) {
monitor_.Start(__func__);
- size_t n_columns = sketches_.size();
+ std::uint64_t n_columns = sketches_.size();
collective::Allreduce<collective::Operation::kMax>(&n_columns, 1);
CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers";
@@ -339,7 +339,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
ParallelFor(n_columns, n_threads_, [&](auto fidx) {
// gcc raises subobject-linkage warning if we put allreduce_result as lambda capture
QuantileAllreduce<typename WQSketch::Entry> allreduce_result{global_sketches, worker_segments,
- sketches_scan, n_columns};
+ sketches_scan, static_cast<size_t>(n_columns)};
int32_t intermediate_num_cuts = num_cuts[fidx];
auto nbytes = WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts);
if (IsCat(feature_types_, fidx)) {
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc
index 0d75d0651..75f9d1145 100644
--- a/cpp_src/src/data/iterative_dmatrix.cc
+++ b/cpp_src/src/data/iterative_dmatrix.cc
@@ -100,7 +100,7 @@ void SyncFeatureType(Context const*, std::vector<FeatureType>* p_h_ft) {
return;
}
auto& h_ft = *p_h_ft;
- auto n_ft = h_ft.size();
+ std::uint64_t n_ft = h_ft.size();
collective::Allreduce<collective::Operation::kMax>(&n_ft, 1);
if (!h_ft.empty()) {
// Check correct size if this is not an empty DMatrix.
diff --git a/src/metric/auc.cc b/src/metric/auc.cc
index 212a3a027..bf2862a7d 100644
--- a/cpp_src/src/metric/auc.cc
+++ b/cpp_src/src/metric/auc.cc
@@ -264,7 +264,7 @@ class EvalAUC : public MetricNoCache {
info.weights_.SetDevice(ctx_->Device());
}
// We use the global size to handle empty dataset.
- std::array<size_t, 2> meta{info.labels.Size(), preds.Size()};
+ std::array<bst_idx_t, 2> meta{info.labels.Size(), preds.Size()};
if (!info.IsVerticalFederated()) {
collective::Allreduce<collective::Operation::kMax>(meta.data(), meta.size());
}
diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h
index cbe69e79a..c9e92ae59 100644
--- a/cpp_src/src/objective/adaptive.h
+++ b/cpp_src/src/objective/adaptive.h
@@ -42,7 +42,7 @@ inline void UpdateLeafValues(Context const* ctx, std::vector<float>* p_quantiles
auto& quantiles = *p_quantiles;
auto const& h_node_idx = nidx;
- size_t n_leaf = collective::GlobalMax(ctx, info, h_node_idx.size());
+ std::uint64_t n_leaf = collective::GlobalMax(ctx, info, static_cast<std::uint64_t>(h_node_idx.size()));
CHECK(quantiles.empty() || quantiles.size() == n_leaf);
if (quantiles.empty()) {
quantiles.resize(n_leaf, std::numeric_limits<float>::quiet_NaN());
--
2.43.2
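
The deleted `Allreduce` overload above was only enabled when `size_t` is not `uint64_t`, and it then asserted `sizeof(T) == sizeof(uint64_t)`; that holds on macOS but fails on any 32-bit target such as wasm32, so the remaining hunks widen the affected counters to an explicit `std::uint64_t` before they are exchanged. Below is a rough Python sketch of that idea only, namely putting counters on a fixed 64-bit wire type instead of the platform-defined `size_t`; the helper names are hypothetical and not XGBoost API.

```python
# Sketch of the "widen before exchanging" pattern: every worker packs its
# counter as an explicit little-endian uint64, so a wasm32 worker (4-byte
# size_t) and an x86-64 worker (8-byte size_t) agree on the wire format.
import struct


def pack_count(n: int) -> bytes:
    return struct.pack("<Q", n)  # "<Q" = little-endian unsigned 64-bit


def unpack_count(buf: bytes) -> int:
    return struct.unpack("<Q", buf)[0]


def allreduce_max(local_counts: list[int]) -> int:
    # Stand-in for collective::Allreduce<kMax>: exchange fixed-width payloads
    # and take the maximum.
    return max(unpack_count(pack_count(n)) for n in local_counts)


assert allreduce_max([3, 7, 5]) == 7
```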

packages/xgboost/patches/0002-Add-library-loading-path.patch (deleted)

@@ -1,29 +0,0 @@
From 54c2a9faeb0b0169172c5ab53367e6092f132c5a Mon Sep 17 00:00:00 2001
From: Gyeongjae Choi <def6488@gmail.com>
Date: Mon, 9 May 2022 12:07:44 +0000
Subject: [PATCH 2/3] Add library loading path
TODO: Remove this patch when XGBoost version is updated.
(Upstream PR: https://github.com/dmlc/xgboost/pull/7954)
---
python-package/xgboost/libpath.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/xgboost/libpath.py b/xgboost/libpath.py
index f7a7d9cd..1ab41cbe 100644
--- a/xgboost/libpath.py
+++ b/xgboost/libpath.py
@@ -43,8 +43,7 @@ def find_lib_path() -> List[str]:
# directory here
dll_path.append(os.path.join(curr_path, './windows/Release/'))
dll_path = [os.path.join(p, 'xgboost.dll') for p in dll_path]
- elif sys.platform.startswith('linux') or sys.platform.startswith(
- 'freebsd'):
+ elif sys.platform.startswith(('linux', 'freebsd', 'emscripten')):
dll_path = [os.path.join(p, 'libxgboost.so') for p in dll_path]
elif sys.platform == 'darwin':
dll_path = [os.path.join(p, 'libxgboost.dylib') for p in dll_path]
--
2.35.1
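
This patch is dropped with the version bump because the 2.1.0.dev0 source already carries the equivalent change (the upstream PR referenced in the patch header). The reason it was needed: under Pyodide `sys.platform` reports `"emscripten"`, so the old `startswith('linux')`/`startswith('freebsd')` check fell through and `libxgboost.so` was never added to the candidate list. A compressed sketch of the dispatch, simplified from the patched `find_lib_path` with the directory list left abstract:

```python
import os
import sys


def candidate_libs(dirs):
    """Simplified platform dispatch from xgboost/libpath.py.

    Emscripten builds produce a regular .so, so it joins the Linux/FreeBSD
    branch; under Pyodide sys.platform == "emscripten".
    """
    if sys.platform.startswith(("linux", "freebsd", "emscripten")):
        return [os.path.join(d, "libxgboost.so") for d in dirs]
    if sys.platform == "darwin":
        return [os.path.join(d, "libxgboost.dylib") for d in dirs]
    return [os.path.join(d, "xgboost.dll") for d in dirs]


print(candidate_libs(["/usr/lib", "./lib"]))
```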

packages/xgboost/patches/0003-Fix-type-mismatch-for-CSR-conversion-in-c_api.patch (deleted)

@@ -1,42 +0,0 @@
From 4ec1b506b424dd9e81fd7127f5712522800a5596 Mon Sep 17 00:00:00 2001
From: Yizhi Liu <liuyizhi@apache.org>
Date: Mon, 17 Oct 2022 15:16:45 -0700
Subject: [PATCH 3/3] Fix type mismatch for CSR conversion in c_api
TODO: Remove this patch when XGBoost version is updated.
(Upstream PR: https://github.com/dmlc/xgboost/pull/8369)
---
xgboost/core.py | 2 +-
xgboost/data.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/xgboost/core.py b/xgboost/core.py
index 36548d8..0246779 100644
--- a/xgboost/core.py
+++ b/xgboost/core.py
@@ -2119,7 +2119,7 @@ class Booster:
_array_interface(csr.indptr),
_array_interface(csr.indices),
_array_interface(csr.data),
- ctypes.c_size_t(csr.shape[1]),
+ c_bst_ulong(csr.shape[1]),
from_pystr_to_cstr(json.dumps(args)),
p_handle,
ctypes.byref(shape),
diff --git a/xgboost/data.py b/xgboost/data.py
index 119b354..b958436 100644
--- a/xgboost/data.py
+++ b/xgboost/data.py
@@ -88,7 +88,7 @@ def _from_scipy_csr(
_array_interface(data.indptr),
_array_interface(data.indices),
_array_interface(data.data),
- ctypes.c_size_t(data.shape[1]),
+ c_bst_ulong(data.shape[1]),
config,
ctypes.byref(handle),
)
--
2.35.1
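
The mismatch this removed patch fixed only bites on 32-bit platforms: XGBoost's C API takes the CSR column count as `bst_ulong`, an unsigned 64-bit integer, while `ctypes.c_size_t` is only 4 bytes wide on wasm32, so the argument was marshalled with the wrong width. The size difference is easy to see from Python; in this small sketch `c_bst_ulong` is reproduced as a local alias rather than imported from `xgboost.core`.

```python
import ctypes

# Local stand-in for the alias used inside the xgboost Python package; the
# C API's bst_ulong is a 64-bit unsigned integer.
c_bst_ulong = ctypes.c_uint64

print(ctypes.sizeof(ctypes.c_size_t))  # 4 on wasm32/Pyodide, 8 on x86-64
print(ctypes.sizeof(c_bst_ulong))      # 8 everywhere

# On a 32-bit ABI the two types are not interchangeable when passed through
# ctypes to a function expecting bst_ulong, which is why the patch replaced
# ctypes.c_size_t(...) with c_bst_ulong(...).
```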

packages/xgboost/test_xgboost.py

@@ -135,11 +135,11 @@ def test_pandas(selenium):
# 1 2 0 1 0
# 2 3 0 0 1
result, _, _ = xgb.data._transform_pandas_df(dummies, enable_categorical=False)
exp = np.array([[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]])
np.testing.assert_array_equal(result, exp)
exp = np.array([[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]]).T
np.testing.assert_array_equal(result.columns, exp)
dm = xgb.DMatrix(dummies)
assert dm.feature_names == ["B", "A_X", "A_Y", "A_Z"]
assert dm.feature_types == ["int", "int", "int", "int"]
assert dm.feature_types == ["int", "i", "i", "i"]
assert dm.num_row() == 3
assert dm.num_col() == 4
@@ -228,7 +228,7 @@ def test_pandas_categorical(selenium):
X, enable_categorical=True
)
assert transformed[:, 0].min() == 0
assert transformed.columns[0].min() == 0
# test missing value
X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
@@ -282,18 +282,18 @@ def test_pandas_label(selenium):
# label must be a single column
df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
with pytest.raises(ValueError):
xgb.data._transform_pandas_df(df, False, None, None, "label", "float")
xgb.data._transform_pandas_df(df, False, None, None, "label")
# label must be supported dtype
df = pd.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)})
with pytest.raises(ValueError):
xgb.data._transform_pandas_df(df, False, None, None, "label", "float")
xgb.data._transform_pandas_df(df, False, None, None, "label")
df = pd.DataFrame({"A": np.array([1, 2, 3], dtype=int)})
result, _, _ = xgb.data._transform_pandas_df(
df, False, None, None, "label", "float"
result, _, _ = xgb.data._transform_pandas_df(df, False, None, None, "label")
np.testing.assert_array_equal(
np.stack(result.columns, axis=1), np.array([[1.0], [2.0], [3.0]], dtype=float)
)
np.testing.assert_array_equal(result, np.array([[1.0], [2.0], [3.0]], dtype=float))
dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
assert dm.num_row() == 3
assert dm.num_col() == 2
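
The core of the pandas behaviour covered by the updated test can also be reproduced interactively. This sketch assumes xgboost 2.x and pandas are importable, for example in a Pyodide console; the commented values are the ones asserted in the test above.

```python
import pandas as pd
import xgboost as xgb

df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
dummies = pd.get_dummies(df)       # columns: B, A_X, A_Y, A_Z

dm = xgb.DMatrix(dummies)
print(dm.feature_names)            # ["B", "A_X", "A_Y", "A_Z"]
print(dm.feature_types)            # ["int", "i", "i", "i"] per the test above
print(dm.num_row(), dm.num_col())  # 3 4
```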