# xgboost tests are copied from: https://github.com/dmlc/xgboost/tree/master/tests/python
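# The tests below execute inside the Pyodide runtime via the run_in_pyodide
# decorator from pytest-pyodide; driver_timeout(60) raises the per-test runner
# timeout, since training models in the browser can be slow.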
import pathlib

import pytest
from pytest_pyodide import run_in_pyodide

DEMO_PATH = pathlib.Path(__file__).parent / "test_data"


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost"])
def test_compat(selenium):
    import numpy as np
    from xgboost.compat import lazy_isinstance

    a = np.array([1, 2, 3])
    assert lazy_isinstance(a, "numpy", "ndarray")
    assert not lazy_isinstance(a, "numpy", "dataframe")
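

# test_basic_classification ports the multi-class demo from upstream xgboost:
# the dermatology.data file bundled under test_data/ is read on the host,
# passed into Pyodide as bytes, and written to the in-browser filesystem
# before training.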
@pytest.mark.driver_timeout(60)
def test_basic_classification(selenium):
    @run_in_pyodide(packages=["xgboost"])
    def run(selenium, data_train):

        with open("dermatology.data", "wb") as f:
            f.write(data_train)

        import numpy as np
        import xgboost as xgb

        # labels need to be in the range 0 to num_class - 1
        data = np.loadtxt(
            "./dermatology.data",
            delimiter=",",
            # column 33 uses "?" for missing values and is reduced to a 0/1 flag;
            # column 34 holds the class label, shifted from 1-based to 0-based
            converters={33: lambda x: int(x == "?"), 34: lambda x: int(x) - 1},
        )
        sz = data.shape

        train = data[: int(sz[0] * 0.7), :]
        test = data[int(sz[0] * 0.7) :, :]

        train_X = train[:, :33]
        train_Y = train[:, 34]

        test_X = test[:, :33]
        test_Y = test[:, 34]

        xg_train = xgb.DMatrix(train_X, label=train_Y)
        xg_test = xgb.DMatrix(test_X, label=test_Y)
        # set up parameters for xgboost
        param = {}
        # use softmax multi-class classification
        param["objective"] = "multi:softmax"
        # training hyperparameters
        param["eta"] = 0.1  # type: ignore[assignment]
        param["max_depth"] = 6  # type: ignore[assignment]
        param["nthread"] = 4  # type: ignore[assignment]
        param["num_class"] = 6  # type: ignore[assignment]

        watchlist = [(xg_train, "train"), (xg_test, "test")]
        num_round = 5
        bst = xgb.train(param, xg_train, num_round, watchlist)
        # get prediction
        pred = bst.predict(xg_test)
        error_rate = np.sum(pred != test_Y) / test_Y.shape[0]
        assert error_rate < 0.1

        # do the same thing again, but output probabilities
        param["objective"] = "multi:softprob"
        bst = xgb.train(param, xg_train, num_round, watchlist)
        # Note: this convention has been changed since xgboost-unity
        # the prediction comes back as a 1D array; reshape it to (ndata, nclass)
        pred_prob = bst.predict(xg_test).reshape(test_Y.shape[0], 6)
        pred_label = np.argmax(pred_prob, axis=1)
        error_rate = np.sum(pred_label != test_Y) / test_Y.shape[0]
        assert error_rate < 0.1

    DATA_TRAIN = (DEMO_PATH / "dermatology.data").read_bytes()
    run(selenium, DATA_TRAIN)
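

# test_pandas checks how DMatrix derives feature_names and feature_types from
# DataFrame columns and dtypes (including get_dummies output, MultiIndex and
# integer column labels).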
@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas", "pytest"])
def test_pandas(selenium):
    import numpy as np
    import pandas as pd
    import pytest
    import xgboost as xgb

    # dtype -> DMatrix feature type: int64 -> "int", float64 -> "float", bool -> "i"
    df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=["a", "b", "c"])
    dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
    assert dm.feature_names == ["a", "b", "c"]
    assert dm.feature_types == ["int", "float", "i"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3
    np.testing.assert_array_equal(dm.get_label(), np.array([1, 2]))

    # overwrite feature_names and feature_types
    dm = xgb.DMatrix(
        df,
        label=pd.Series([1, 2]),
        feature_names=["x", "y", "z"],
        feature_types=["q", "q", "q"],
    )
    assert dm.feature_names == ["x", "y", "z"]
    assert dm.feature_types == ["q", "q", "q"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3

    # incorrect dtypes
    df = pd.DataFrame([[1, 2.0, "x"], [2, 3.0, "y"]], columns=["a", "b", "c"])
    with pytest.raises(ValueError):
        xgb.DMatrix(df)

    # numeric columns
    df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]])
    dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
    assert dm.feature_names == ["0", "1", "2"]
    assert dm.feature_types == ["int", "float", "i"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3
    np.testing.assert_array_equal(dm.get_label(), np.array([1, 2]))

    df = pd.DataFrame([[1, 2.0, 1], [2, 3.0, 1]], columns=[4, 5, 6])
    dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
    assert dm.feature_names == ["4", "5", "6"]
    assert dm.feature_types == ["int", "float", "int"]
    assert dm.num_row() == 2
    assert dm.num_col() == 3

    df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
    dummies = pd.get_dummies(df)
    #    B  A_X  A_Y  A_Z
    # 0  1    1    0    0
    # 1  2    0    1    0
    # 2  3    0    0    1
    result, _, _ = xgb.data._transform_pandas_df(dummies, enable_categorical=False)
    exp = np.array([[1.0, 1.0, 0.0, 0.0], [2.0, 0.0, 1.0, 0.0], [3.0, 0.0, 0.0, 1.0]])
    np.testing.assert_array_equal(result, exp)
    dm = xgb.DMatrix(dummies)
    assert dm.feature_names == ["B", "A_X", "A_Y", "A_Z"]
    assert dm.feature_types == ["int", "int", "int", "int"]
    assert dm.num_row() == 3
    assert dm.num_col() == 4

    df = pd.DataFrame({"A=1": [1, 2, 3], "A=2": [4, 5, 6]})
    dm = xgb.DMatrix(df)
    assert dm.feature_names == ["A=1", "A=2"]
    assert dm.feature_types == ["int", "int"]
    assert dm.num_row() == 3
    assert dm.num_col() == 2

    df_int = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=[9, 10])
    dm_int = xgb.DMatrix(df_int)
    df_range = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=range(9, 11, 1))
    dm_range = xgb.DMatrix(df_range)
    assert dm_int.feature_names == ["9", "10"]  # not "9 " (no stray space)
    assert dm_int.feature_names == dm_range.feature_names

    # test MultiIndex as columns
    df = pd.DataFrame(
        [(1, 2, 3, 4, 5, 6), (6, 5, 4, 3, 2, 1)],
        columns=pd.MultiIndex.from_tuples(
            (
                ("a", 1),
                ("a", 2),
                ("a", 3),
                ("b", 1),
                ("b", 2),
                ("b", 3),
            )
        ),
    )
    dm = xgb.DMatrix(df)
    assert dm.feature_names == ["a 1", "a 2", "a 3", "b 1", "b 2", "b 3"]
    assert dm.feature_types == ["int", "int", "int", "int", "int", "int"]
    assert dm.num_row() == 2
    assert dm.num_col() == 6

    # test Index as columns
    df = pd.DataFrame([[1, 1.1], [2, 2.2]], columns=pd.Index([1, 2]))
    # print(df.columns, isinstance(df.columns, pd.Index))
    Xy = xgb.DMatrix(df)
    np.testing.assert_equal(np.array(Xy.feature_names), np.array(["1", "2"]))


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas"])
def test_pandas_slice(selenium):
    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.RandomState(1994)
    rows = 100
    X = rng.randint(3, 7, size=rows)
    X = pd.DataFrame({"f0": X})
    y = rng.randn(rows)
    ridxs = [1, 2, 3, 4, 5, 6]
    m = xgb.DMatrix(X, y)
    sliced = m.slice(ridxs)

    assert m.feature_types == sliced.feature_types
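

# Categorical columns are only accepted with enable_categorical=True and are
# reported with the feature type code "c".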
@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas", "pytest"])
def test_pandas_categorical(selenium):
    import numpy as np
    import pandas as pd
    import pytest
    import xgboost as xgb

    rng = np.random.RandomState(1994)
    rows = 100
    X = rng.randint(3, 7, size=rows)
    X = pd.Series(X, dtype="category")
    X = pd.DataFrame({"f0": X})
    y = rng.randn(rows)
    m = xgb.DMatrix(X, y, enable_categorical=True)
    assert m.feature_types[0] == "c"

    X_0 = ["f", "o", "o"]
    X_1 = [4, 3, 2]
    X = pd.DataFrame({"feat_0": X_0, "feat_1": X_1})
    X["feat_0"] = X["feat_0"].astype("category")  # type: ignore[call-overload]
    transformed, _, feature_types = xgb.data._transform_pandas_df(
        X, enable_categorical=True
    )

    assert transformed[:, 0].min() == 0

    # test missing value
    X = pd.DataFrame({"f0": ["a", "b", np.NaN]})
    X["f0"] = X["f0"].astype("category")  # type: ignore[call-overload]
    arr, _, _ = xgb.data._transform_pandas_df(X, enable_categorical=True)
    assert not np.any(arr == -1.0)

    X = X["f0"]  # type: ignore[call-overload]
    y = y[: X.shape[0]]
    with pytest.raises(ValueError, match=r".*enable_categorical.*"):
        xgb.DMatrix(X, y)

    Xy = xgb.DMatrix(X, y, enable_categorical=True)
    assert Xy.num_row() == 3
    assert Xy.num_col() == 1


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas"])
def test_pandas_sparse(selenium):
    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rows = 100
    X = pd.DataFrame(
        {
            "A": pd.arrays.SparseArray(np.random.randint(0, 10, size=rows)),
            "B": pd.arrays.SparseArray(np.random.randn(rows)),
            "C": pd.arrays.SparseArray(
                np.random.permutation([True, False] * (rows // 2))
            ),
        }
    )
    y = pd.Series(pd.arrays.SparseArray(np.random.randn(rows)))
    dtrain = xgb.DMatrix(X, y)
    booster = xgb.train({}, dtrain, num_boost_round=4)
    predt_sparse = booster.predict(xgb.DMatrix(X))
    predt_dense = booster.predict(xgb.DMatrix(X.sparse.to_dense()))
    np.testing.assert_allclose(predt_sparse, predt_dense)
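

# DMatrix labels taken from pandas must be a single column of a supported
# numeric dtype; anything else should raise ValueError.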
@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas", "pytest"])
def test_pandas_label(selenium):
    import numpy as np
    import pandas as pd
    import pytest
    import xgboost as xgb

    # label must be a single column
    df = pd.DataFrame({"A": ["X", "Y", "Z"], "B": [1, 2, 3]})
    with pytest.raises(ValueError):
        xgb.data._transform_pandas_df(df, False, None, None, "label", "float")

    # label must be a supported dtype
    df = pd.DataFrame({"A": np.array(["a", "b", "c"], dtype=object)})
    with pytest.raises(ValueError):
        xgb.data._transform_pandas_df(df, False, None, None, "label", "float")

    df = pd.DataFrame({"A": np.array([1, 2, 3], dtype=int)})
    result, _, _ = xgb.data._transform_pandas_df(
        df, False, None, None, "label", "float"
    )
    np.testing.assert_array_equal(result, np.array([[1.0], [2.0], [3.0]], dtype=float))
    dm = xgb.DMatrix(np.random.randn(3, 2), label=df)
    assert dm.num_row() == 3
    assert dm.num_col() == 2


@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "pandas"])
def test_pandas_weight(selenium):
    import numpy as np
    import pandas as pd
    import xgboost as xgb

    kRows = 32
    kCols = 8

    X = np.random.randn(kRows, kCols)
    y = np.random.randn(kRows)
    w = np.random.uniform(size=kRows).astype(np.float32)
    w_pd = pd.DataFrame(w)
    data = xgb.DMatrix(X, y, w_pd)

    assert data.num_row() == kRows
    assert data.num_col() == kCols

    np.testing.assert_array_equal(data.get_weight(), w)
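

# inplace_predict on a scipy CSR matrix should match predict on a DMatrix
# built from the same matrix.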
@pytest.mark.driver_timeout(60)
@run_in_pyodide(packages=["xgboost", "numpy", "scipy"])
def test_scipy_sparse(selenium):
    import numpy as np
    import scipy.sparse
    import xgboost as xgb

    n_rows = 100
    n_cols = 10
    X = scipy.sparse.random(n_rows, n_cols, format="csr")
    y = np.random.randn(n_rows)
    dtrain = xgb.DMatrix(X, y)
    booster = xgb.train({}, dtrain, num_boost_round=1)
    copied_predt = booster.predict(xgb.DMatrix(X))
    predt = booster.inplace_predict(X)
    np.testing.assert_allclose(copied_predt, predt)