pyodide/packages/xgboost/test_xgboost.py

# xgboost tests are copied from: https://github.com/dmlc/xgboost/tree/master/tests/python
import pathlib

import pytest
from pytest_pyodide import run_in_pyodide

DEMO_PATH = pathlib.Path(__file__).parent / "test_data"


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost"])
def test_compat(selenium):
    import numpy as np
    from xgboost.compat import lazy_isinstance

    a = np.array([1, 2, 3])
    assert lazy_isinstance(a, "numpy", "ndarray")
    assert not lazy_isinstance(a, "numpy", "dataframe")


@pytest.mark.driver_timeout(60)
def test_basic_classification(selenium):
    @run_in_pyodide(packages=["xgboost"])
    def run(selenium, data_train):

        with open("dermatology.data", "wb") as f:
            f.write(data_train)

        import numpy as np
        import xgboost as xgb

        # label need to be 0 to num_class -1
        data = np.loadtxt(
            "./dermatology.data",
            delimiter=",",
            converters={33: lambda x: int(x == "?"), 34: lambda x: int(x) - 1},
        )
        sz = data.shape

        train = data[: int(sz[0] * 0.7), :]
        test = data[int(sz[0] * 0.7) :, :]

        train_X = train[:, :33]
        train_Y = train[:, 34]

        test_X = test[:, :33]
        test_Y = test[:, 34]

        xg_train = xgb.DMatrix(train_X, label=train_Y)
        xg_test = xgb.DMatrix(test_X, label=test_Y)
        # setup parameters for xgboost
        param = {}
        # use softmax multi-class classification
        param["objective"] = "multi:softmax"
        # scale weight of positive examples
        param["eta"] = 0.1  # type: ignore[assignment]
        param["max_depth"] = 6  # type: ignore[assignment]
        param["nthread"] = 4  # type: ignore[assignment]
        param["num_class"] = 6  # type: ignore[assignment]

        watchlist = [(xg_train, "train"), (xg_test, "test")]
        num_round = 5
        bst = xgb.train(param, xg_train, num_round, watchlist)
        # get prediction
        pred = bst.predict(xg_test)
        error_rate = np.sum(pred != test_Y) / test_Y.shape[0]
        assert error_rate < 0.1

        # do the same thing again, but output probabilities
        param["objective"] = "multi:softprob"
        bst = xgb.train(param, xg_train, num_round, watchlist)
        # Note: this convention has been changed since xgboost-unity
        # get prediction, this is in 1D array, need reshape to (ndata, nclass)
        pred_prob = bst.predict(xg_test).reshape(test_Y.shape[0], 6)
        pred_label = np.argmax(pred_prob, axis=1)
        error_rate = np.sum(pred_label != test_Y) / test_Y.shape[0]
        assert error_rate < 0.1

    DATA_TRAIN = (DEMO_PATH / "dermatology.data").read_bytes()
    run(selenium, DATA_TRAIN)


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas", "pytest"])
def test_pandas(selenium):
    import numpy as np
    import pandas as pd
    import pytest
    import xgboost as xgb

    df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"])
    dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
    assert dm.feature_names == ["a", "b", "c"]
    assert dm.feature_types == ["int", "float", "i"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3
    np.testing.assert_array_equal(dm.get_label(), np.array([1, 2]))

    # overwrite feature_names and feature_types
    dm = xgb.DMatrix(
        df,
        label=pd.Series([1, 2]),
        feature_names=["x", "y", "z"],
        feature_types=["q", "q", "q"],
    )
    assert dm.feature_names == ["x", "y", "z"]
    assert dm.feature_types == ["q", "q", "q"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3

    # incorrect dtypes
    df = pd.DataFrame([[1, 2.0, "x"], [2, 3.0, "y"]], columns=["a", "b", "c"])
    with pytest.raises(ValueError):
        xgb.DMatrix(df)

    # numeric columns
    df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]])
    dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
    assert dm.feature_names == ["0", "1", "2"]
    assert dm.feature_types == ["int", "float", "i"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3
    np.testing.assert_array_equal(dm.get_label(), np.array([1, 2]))

    df = pd.DataFrame([[1, 2.0, 1], [2, 3.0, 1]], columns=[4, 5, 6])
    dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
    assert dm.feature_names == ["4", "5", "6"]
    assert dm.feature_types == ["int", "float", "int"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3

    df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
    dummies = pd.get_dummies(df)
    #    B  A_X  A_Y  A_Z
    # 0  1    1    0    0
    # 1  2    0    1    0
    # 2  3    0    0    1
    result, _, _ = xgb.data._transform_pandas_df(dummies, enable_categorical=False)
    exp = np.array([[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]])
    np.testing.assert_array_equal(result, exp)
    dm = xgb.DMatrix(dummies)
    assert dm.feature_names == ["B", "A_X", "A_Y", "A_Z"]
    assert dm.feature_types == ["int", "int", "int", "int"]
    assert dm.num_row() == 3
    assert dm.num_col() == 4

    df = pd.DataFrame({"A=1": [1, 2, 3], "A=2": [4, 5, 6]})
    dm = xgb.DMatrix(df)
    assert dm.feature_names == ["A=1", "A=2"]
    assert dm.feature_types == ["int", "int"]
    assert dm.num_row() == 3
    assert dm.num_col() == 2

    df_int = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=[9, 10])
    dm_int = xgb.DMatrix(df_int)
    df_range = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=range(9, 11, 1))
    dm_range = xgb.DMatrix(df_range)
    assert dm_int.feature_names == ["9", "10"]  # assert not "9 "
    assert dm_int.feature_names == dm_range.feature_names

    # test MultiIndex as columns
    df = pd.DataFrame(
        [(1, 2, 3, 4, 5, 6), (6, 5, 4, 3, 2, 1)],
        columns=pd.MultiIndex.from_tuples(
            (
                ("a", 1),
                ("a", 2),
                ("a", 3),
                ("b", 1),
                ("b", 2),
                ("b", 3),
            )
        ),
    )
    dm = xgb.DMatrix(df)
    assert dm.feature_names == ["a 1", "a 2", "a 3", "b 1", "b 2", "b 3"]
    assert dm.feature_types == ["int", "int", "int", "int", "int", "int"]
    assert dm.num_row() == 2
    assert dm.num_col() == 6

    # test Index as columns
    df = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=pd.Index([1, 2]))
    # print(df.columns, isinstance(df.columns, pd.Index))
    Xy = xgb.DMatrix(df)
    np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"]))


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas"])
def test_pandas_slice(selenium):
    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.RandomState(1994)
    rows = 100
    X = rng.randint(3, 7, size=rows)
    X = pd.DataFrame({"f0": X})
    y = rng.randn(rows)
    ridxs = [1, 2, 3, 4, 5, 6]
    m = xgb.DMatrix(X, y)
    sliced = m.slice(ridxs)

    assert m.feature_types == sliced.feature_types


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas", "pytest"])
def test_pandas_categorical(selenium):
    import numpy as np
    import pandas as pd
    import pytest
    import xgboost as xgb

    rng = np.random.RandomState(1994)
    rows = 100
    X = rng.randint(3, 7, size=rows)
    X = pd.Series(X, dtype="category")
    X = pd.DataFrame({"f0": X})
    y = rng.randn(rows)
    m = xgb.DMatrix(X, y, enable_categorical=True)
    assert m.feature_types[0] == "c"

    X_0 = ["f", "o", "o"]
    X_1 = [4, 3, 2]
    X = pd.DataFrame({"feat_0": X_0, "feat_1": X_1})
    X["feat_0"] = X["feat_0"].astype("category")  # type: ignore[call-overload]
    transformed, _, feature_types = xgb.data._transform_pandas_df(
        X, enable_categorical=True
    )

    assert transformed[:, 0].min() == 0

    # test missing value
    X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
    X["f0"] = X["f0"].astype("category")  # type: ignore[call-overload]
    arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
    assert not np.any(arr == -1.0)

    X = X["f0"]  # type: ignore[call-overload]
    y = y[: X.shape[0]]
    with pytest.raises(ValueError, match=r".*enable_categorical.*"):
        xgb.DMatrix(X, y)

    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    assert Xy.num_row() == 3
    assert Xy.num_col() == 1


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas"])
def test_pandas_sparse(selenium):
    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rows = 100
    X = pd.DataFrame(
        {
            "A": pd.arrays.SparseArray(np.random.randint(0, 10, size=rows)),
            "B": pd.arrays.SparseArray(np.random.randn(rows)),
            "C": pd.arrays.SparseArray(
                np.random.permutation([True, False] * (rows // 2))
            ),
        }
    )
    y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
    dtrain = xgb.DMatrix(X, y)
    booster = xgb.train({}, dtrain, num_boost_round=4)
    predt_sparse = booster.predict(xgb.DMatrix(X))
    predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
    np.testing.assert_allclose(predt_sparse, predt_dense)


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas", "pytest"])
def test_pandas_label(selenium):
    import numpy as np
    import pandas as pd
    import pytest
    import xgboost as xgb

    # label must be a single column
    df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
    with pytest.raises(ValueError):
        xgb.data._transform_pandas_df(df, False, None, None, "label", "float")

    # label must be supported dtype
    df = pd.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)})
    with pytest.raises(ValueError):
        xgb.data._transform_pandas_df(df, False, None, None, "label", "float")

    df = pd.DataFrame({"A": np.array([1, 2, 3], dtype=int)})
    result, _, _ = xgb.data._transform_pandas_df(
        df, False, None, None, "label", "float"
    )
    np.testing.assert_array_equal(result, np.array([[1.0], [2.0], [3.0]], dtype=float))
    dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
    assert dm.num_row() == 3
    assert dm.num_col() == 2


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas"])
def test_pandas_weight(selenium):
    import numpy as np
    import pandas as pd
    import xgboost as xgb

    kRows = 32
    kCols = 8

    X = np.random.randn(kRows, kCols)
    y = np.random.randn(kRows)
    w = np.random.uniform(size=kRows).astype(np.float32)
    w_pd = pd.DataFrame(w)
    data = xgb.DMatrix(X, y, w_pd)

    assert data.num_row() == kRows
    assert data.num_col() == kCols

    np.testing.assert_array_equal(data.get_weight(), w)


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "numpy", "scipy"])
def test_scipy_sparse(selenium):
    import numpy as np
    import scipy
    import xgboost as xgb

    n_rows = 100
    n_cols = 10
    X = scipy.sparse.random(n_rows, n_cols, format="csr")
    y = np.random.randn(n_rows)
    dtrain = xgb.DMatrix(X, y)
    booster = xgb.train({}, dtrain, num_boost_round=1)
    copied_predt = booster.predict(xgb.DMatrix(X))
    predt = booster.inplace_predict(X)
    np.testing.assert_allclose(copied_predt, predt)