# xgboost tests are copied from:
# https://github.com/dmlc/xgboost/tree/master/tests/python
import pathlib

import pytest
from pytest_pyodide import run_in_pyodide

DEMO_PATH = pathlib.Path(__file__).parent / "test_data"


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost"])
def test_compat(selenium):
    import numpy as np
    from xgboost.compat import lazy_isinstance

    a = np.array([1, 2, 3])
    assert lazy_isinstance(a, "numpy", "ndarray")
    assert not lazy_isinstance(a, "numpy", "dataframe")


@pytest.mark.driver_timeout(60)
def test_basic_classification(selenium):
    @run_in_pyodide(packages=["xgboost"])
    def run(selenium, data_train):
        with open("dermatology.data", "wb") as f:
            f.write(data_train)

        import numpy as np
        import xgboost as xgb

        # labels need to be in the range 0 to num_class - 1
        data = np.loadtxt(
            "./dermatology.data",
            delimiter=",",
            converters={33: lambda x: int(x == "?"), 34: lambda x: int(x) - 1},
        )
        sz = data.shape

        train = data[: int(sz[0] * 0.7), :]
        test = data[int(sz[0] * 0.7) :, :]

        train_X = train[:, :33]
        train_Y = train[:, 34]

        test_X = test[:, :33]
        test_Y = test[:, 34]

        xg_train = xgb.DMatrix(train_X, label=train_Y)
        xg_test = xgb.DMatrix(test_X, label=test_Y)

        # set up parameters for xgboost
        param = {}
        # use softmax multi-class classification
        param["objective"] = "multi:softmax"
        # scale weight of positive examples
        param["eta"] = 0.1  # type: ignore[assignment]
        param["max_depth"] = 6  # type: ignore[assignment]
        param["nthread"] = 4  # type: ignore[assignment]
        param["num_class"] = 6  # type: ignore[assignment]

        watchlist = [(xg_train, "train"), (xg_test, "test")]
        num_round = 5
        bst = xgb.train(param, xg_train, num_round, watchlist)

        # get prediction
        pred = bst.predict(xg_test)
        error_rate = np.sum(pred != test_Y) / test_Y.shape[0]
        assert error_rate < 0.1

        # do the same thing again, but output probabilities
        param["objective"] = "multi:softprob"
        bst = xgb.train(param, xg_train, num_round, watchlist)
        # Note: this convention has been changed since xgboost-unity
        # get prediction, this is in 1D array, need reshape to (ndata, nclass)
        pred_prob = bst.predict(xg_test).reshape(test_Y.shape[0], 6)
        pred_label = np.argmax(pred_prob, axis=1)
        error_rate = np.sum(pred_label != test_Y) / test_Y.shape[0]
        assert error_rate < 0.1

    DATA_TRAIN = (DEMO_PATH / "dermatology.data").read_bytes()
    run(selenium, DATA_TRAIN)


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas", "pytest"])
def test_pandas(selenium):
    import numpy as np
    import pandas as pd
    import pytest
    import xgboost as xgb

    df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"])
    dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
    assert dm.feature_names == ["a", "b", "c"]
    assert dm.feature_types == ["int", "float", "i"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3
    np.testing.assert_array_equal(dm.get_label(), np.array([1, 2]))

    # overwrite feature_names and feature_types
    dm = xgb.DMatrix(
        df,
        label=pd.Series([1, 2]),
        feature_names=["x", "y", "z"],
        feature_types=["q", "q", "q"],
    )
    assert dm.feature_names == ["x", "y", "z"]
    assert dm.feature_types == ["q", "q", "q"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3

    # incorrect dtypes
    df = pd.DataFrame([[1, 2.0, "x"], [2, 3.0, "y"]], columns=["a", "b", "c"])
    with pytest.raises(ValueError):
        xgb.DMatrix(df)

    # numeric columns
    df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]])
    dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
    assert dm.feature_names == ["0", "1", "2"]
    assert dm.feature_types == ["int", "float", "i"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3
    np.testing.assert_array_equal(dm.get_label(), np.array([1, 2]))

    df = pd.DataFrame([[1, 2.0, 1], [2, 3.0, 1]], columns=[4, 5, 6])
    dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
    assert dm.feature_names == ["4", "5", "6"]
    assert dm.feature_types == ["int", "float", "int"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3

    df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
    dummies = pd.get_dummies(df)
    #    B  A_X  A_Y  A_Z
    # 0  1    1    0    0
    # 1  2    0    1    0
    # 2  3    0    0    1
    result, _, _ = xgb.data._transform_pandas_df(dummies, enable_categorical=False)
    exp = np.array(
        [[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]]
    )
    np.testing.assert_array_equal(result, exp)
    dm = xgb.DMatrix(dummies)
    assert dm.feature_names == ["B", "A_X", "A_Y", "A_Z"]
    assert dm.feature_types == ["int", "int", "int", "int"]
    assert dm.num_row() == 3
    assert dm.num_col() == 4

    df = pd.DataFrame({"A=1": [1, 2, 3], "A=2": [4, 5, 6]})
    dm = xgb.DMatrix(df)
    assert dm.feature_names == ["A=1", "A=2"]
    assert dm.feature_types == ["int", "int"]
    assert dm.num_row() == 3
    assert dm.num_col() == 2

    df_int = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=[9, 10])
    dm_int = xgb.DMatrix(df_int)
    df_range = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=range(9, 11, 1))
    dm_range = xgb.DMatrix(df_range)
    assert dm_int.feature_names == ["9", "10"]  # assert not "9 "
    assert dm_int.feature_names == dm_range.feature_names

    # test MultiIndex as columns
    df = pd.DataFrame(
        [(1, 2, 3, 4, 5, 6), (6, 5, 4, 3, 2, 1)],
        columns=pd.MultiIndex.from_tuples(
            (
                ("a", 1),
                ("a", 2),
                ("a", 3),
                ("b", 1),
                ("b", 2),
                ("b", 3),
            )
        ),
    )
    dm = xgb.DMatrix(df)
    assert dm.feature_names == ["a 1", "a 2", "a 3", "b 1", "b 2", "b 3"]
    assert dm.feature_types == ["int", "int", "int", "int", "int", "int"]
    assert dm.num_row() == 2
    assert dm.num_col() == 6

    # test Index as columns
    df = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=pd.Index([1, 2]))
    Xy = xgb.DMatrix(df)
    np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"]))


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas"])
def test_pandas_slice(selenium):
    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.RandomState(1994)
    rows = 100
    X = rng.randint(3, 7, size=rows)
    X = pd.DataFrame({"f0": X})
    y = rng.randn(rows)
    ridxs = [1, 2, 3, 4, 5, 6]
    m = xgb.DMatrix(X, y)
    sliced = m.slice(ridxs)

    assert m.feature_types == sliced.feature_types


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas", "pytest"])
def test_pandas_categorical(selenium):
    import numpy as np
    import pandas as pd
    import pytest
    import xgboost as xgb

    rng = np.random.RandomState(1994)
    rows = 100
    X = rng.randint(3, 7, size=rows)
    X = pd.Series(X, dtype="category")
    X = pd.DataFrame({"f0": X})
    y = rng.randn(rows)
    m = xgb.DMatrix(X, y, enable_categorical=True)
    assert m.feature_types[0] == "c"

    X_0 = ["f", "o", "o"]
    X_1 = [4, 3, 2]
    X = pd.DataFrame({"feat_0": X_0, "feat_1": X_1})
    X["feat_0"] = X["feat_0"].astype("category")  # type: ignore[call-overload]
    transformed, _, feature_types = xgb.data._transform_pandas_df(
        X, enable_categorical=True
    )
    assert transformed[:, 0].min() == 0

    # test missing value
    X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
    X["f0"] = X["f0"].astype("category")  # type: ignore[call-overload]
    arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
    assert not np.any(arr == -1.0)

    X = X["f0"]  # type: ignore[call-overload]
    y = y[: X.shape[0]]
    with pytest.raises(ValueError, match=r".*enable_categorical.*"):
        xgb.DMatrix(X, y)

    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    assert Xy.num_row() == 3
    assert Xy.num_col() == 1


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas"])
def test_pandas_sparse(selenium):
    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rows = 100
    X = pd.DataFrame(
        {
            "A": pd.arrays.SparseArray(np.random.randint(0, 10, size=rows)),
            "B": pd.arrays.SparseArray(np.random.randn(rows)),
            "C": pd.arrays.SparseArray(
                np.random.permutation([True, False] * (rows // 2))
            ),
        }
    )
    y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
    dtrain = xgb.DMatrix(X, y)
    booster = xgb.train({}, dtrain, num_boost_round=4)
    # predictions from sparse and densified inputs should match
    predt_sparse = booster.predict(xgb.DMatrix(X))
    predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
    np.testing.assert_allclose(predt_sparse, predt_dense)


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas", "pytest"])
def test_pandas_label(selenium):
    import numpy as np
    import pandas as pd
    import pytest
    import xgboost as xgb

    # label must be a single column
    df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
    with pytest.raises(ValueError):
        xgb.data._transform_pandas_df(df, False, None, None, "label", "float")

    # label must be a supported dtype
    df = pd.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)})
    with pytest.raises(ValueError):
        xgb.data._transform_pandas_df(df, False, None, None, "label", "float")

    df = pd.DataFrame({"A": np.array([1, 2, 3], dtype=int)})
    result, _, _ = xgb.data._transform_pandas_df(
        df, False, None, None, "label", "float"
    )
    np.testing.assert_array_equal(
        result, np.array([[1.0], [2.0], [3.0]], dtype=float)
    )

    dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
    assert dm.num_row() == 3
    assert dm.num_col() == 2


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas"])
def test_pandas_weight(selenium):
    import numpy as np
    import pandas as pd
    import xgboost as xgb

    kRows = 32
    kCols = 8

    X = np.random.randn(kRows, kCols)
    y = np.random.randn(kRows)
    w = np.random.uniform(size=kRows).astype(np.float32)
    w_pd = pd.DataFrame(w)
    data = xgb.DMatrix(X, y, w_pd)

    assert data.num_row() == kRows
    assert data.num_col() == kCols

    np.testing.assert_array_equal(data.get_weight(), w)


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "numpy", "scipy"])
def test_scipy_sparse(selenium):
    import numpy as np
    import scipy.sparse  # import the submodule explicitly
    import xgboost as xgb

    n_rows = 100
    n_cols = 10
    X = scipy.sparse.random(n_rows, n_cols, format="csr")
    y = np.random.randn(n_rows)
    dtrain = xgb.DMatrix(X, y)
    booster = xgb.train({}, dtrain, num_boost_round=1)

    # copied prediction and in-place prediction should agree
    copied_predt = booster.predict(xgb.DMatrix(X))
    predt = booster.inplace_predict(X)
    np.testing.assert_allclose(copied_predt, predt)