From ef993da92fcdd2bfb41a9e1e7fd155e276dc857f Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Mon, 22 Oct 2018 12:51:03 +0200
Subject: [PATCH 1/6] Package scikit-learn

---
 .circleci/config.yml               |   6 +
 packages/scikit-learn/meta.yaml    |  28 +++++
 .../patches/use-site-joblib.patch  | 104 ++++++++++++++++++
 test/packages/test_scikit-learn.py |  78 +++++++++++++
 test/test_common.py                |   2 +-
 5 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100644 packages/scikit-learn/meta.yaml
 create mode 100644 packages/scikit-learn/patches/use-site-joblib.patch
 create mode 100644 test/packages/test_scikit-learn.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 92f657ac4..ba80e860c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -24,6 +24,12 @@ jobs:
         name: build
         no_output_timeout: 1200
         command: |
+
+          # download scipy package from https://github.com/iodide-project/pyodide/pull/211
+          mkdir -p build
+          wget -q -O build/scipy.js https://1463-122663163-gh.circle-artifacts.com/0/home/circleci/repo/build/scipy.js
+          wget -q -O build/scipy.data https://1463-122663163-gh.circle-artifacts.com/0/home/circleci/repo/build/scipy.data
+
           ccache -z
           make
           ccache -s
diff --git a/packages/scikit-learn/meta.yaml b/packages/scikit-learn/meta.yaml
new file mode 100644
index 000000000..3561370a3
--- /dev/null
+++ b/packages/scikit-learn/meta.yaml
@@ -0,0 +1,28 @@
+package:
+  name: scikit-learn
+  version: 0.20.0
+
+source:
+  url: https://pypi.io/packages/source/s/scikit-learn/scikit-learn-0.20.0.tar.gz
+  sha256: 97d1d971f8ec257011e64b7d655df68081dd3097322690afa1a71a1d755f8c18
+
+  patches:
+    #- patches/unvendor-joblib.patch
+    - patches/use-site-joblib.patch
+
+build:
+  cflags: -Wno-implicit-function-declaration
+
+requirements:
+  run:
+    - numpy  # TODO: add scipy, joblib once the corresponding PRs are merged
+
+test:
+  imports:
+    - sklearn
+    - sklearn.cluster
+    - sklearn.compose
+    - sklearn.covariance
+    - sklearn.cross_decomposition
+    - sklearn.datasets
+    - sklearn.decomposition
diff --git a/packages/scikit-learn/patches/use-site-joblib.patch b/packages/scikit-learn/patches/use-site-joblib.patch
new file mode 100644
index 000000000..f1446de44
--- /dev/null
+++ b/packages/scikit-learn/patches/use-site-joblib.patch
@@ -0,0 +1,104 @@
+commit 16cf9dc5f79533a121a421b095b6e7ef9ee76e9c
+Author: Roman Yurchak
+Date: Thu Oct 25 16:56:54 2018 +0200
+
+    Use site joblib
+
+diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
+index 6d8acddcc..8191048d7 100644
+--- a/sklearn/datasets/species_distributions.py
++++ b/sklearn/datasets/species_distributions.py
+@@ -51,7 +51,7 @@ from .base import _fetch_remote
+ from .base import RemoteFileMetadata
+ from ..utils import Bunch
+ from sklearn.datasets.base import _pkl_filepath
+-from sklearn.externals import joblib
++import joblib
+ 
+ PY3_OR_LATER = sys.version_info[0] >= 3
+ 
+diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
+index d7586c286..d1168cdcf 100644
+--- a/sklearn/ensemble/tests/test_forest.py
++++ b/sklearn/ensemble/tests/test_forest.py
+@@ -23,7 +23,11 @@ import pytest
+ 
+ from sklearn.utils import parallel_backend
+ from sklearn.utils import register_parallel_backend
+-from sklearn.externals.joblib.parallel import LokyBackend
++try:
++    from sklearn.externals.joblib.parallel import LokyBackend
++except ImportError:
++    LokyBackend = object
++
+ 
+ from sklearn.utils.testing import assert_almost_equal
+ from sklearn.utils.testing import assert_array_almost_equal
+diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
+index da04b4215..fc3f6a6b1 100644
+--- a/sklearn/metrics/tests/test_score_objects.py
++++ b/sklearn/metrics/tests/test_score_objects.py
+@@ -40,7 +40,7 @@ from sklearn.datasets import load_diabetes
+ from sklearn.model_selection import train_test_split, cross_val_score
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.multiclass import OneVsRestClassifier
+-from sklearn.externals import joblib
++import joblib
+ 
+ 
+ REGRESSION_SCORERS = ['explained_variance', 'r2',
+diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py
+index 990942c9e..e9a6c31bd 100644
+--- a/sklearn/neighbors/tests/test_kde.py
++++ b/sklearn/neighbors/tests/test_kde.py
+@@ -10,7 +10,7 @@ from sklearn.pipeline import make_pipeline
+ from sklearn.datasets import make_blobs
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.preprocessing import StandardScaler
+-from sklearn.externals import joblib
++import joblib
+ 
+ 
+ def compute_kernel_slow(Y, X, kernel, h):
+diff --git a/sklearn/tests/test_site_joblib.py b/sklearn/tests/test_site_joblib.py
+index bffd43cc1..df4c96893 100644
+--- a/sklearn/tests/test_site_joblib.py
++++ b/sklearn/tests/test_site_joblib.py
+@@ -1,7 +1,10 @@
+ import os
+ import pytest
+ from sklearn import externals
+-from sklearn.externals import joblib as joblib_vendored
++try:
++    from sklearn.externals import joblib as joblib_vendored
++except ImportError:
++    joblib_vendored = None
+ from sklearn.utils import Parallel, delayed, Memory, parallel_backend
+ 
+ if os.environ.get('SKLEARN_SITE_JOBLIB', False):
+diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
+index 75b378961..b81b9ab58 100644
+--- a/sklearn/utils/testing.py
++++ b/sklearn/utils/testing.py
+@@ -44,7 +44,7 @@ except NameError:
+ 
+ import sklearn
+ from sklearn.base import BaseEstimator
+-from sklearn.externals import joblib
++import joblib
+ from sklearn.utils.fixes import signature
+ from sklearn.utils import deprecated, IS_PYPY, _IS_32BIT
+ 
+diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
+index bf8412b3e..2eebb36b0 100644
+--- a/sklearn/utils/tests/test_estimator_checks.py
++++ b/sklearn/utils/tests/test_estimator_checks.py
+@@ -5,7 +5,7 @@ import numpy as np
+ import scipy.sparse as sp
+ 
+ from sklearn.externals.six.moves import cStringIO as StringIO
+-from sklearn.externals import joblib
++import joblib
+ 
+ from sklearn.base import BaseEstimator, ClassifierMixin
+ from sklearn.utils import deprecated
diff --git a/test/packages/test_scikit-learn.py b/test/packages/test_scikit-learn.py
new file mode 100644
index 000000000..99d334783
--- /dev/null
+++ b/test/packages/test_scikit-learn.py
@@ -0,0 +1,78 @@
+from textwrap import dedent
+
+def test_scikit_learn(selenium_standalone):
+    selenium = selenium_standalone
+    # no automatic dependency resolution for now
+    selenium.load_package(["numpy", "joblib"])
+    selenium.load_package("scipy")
+    selenium.load_package("scikit-learn")
+    assert selenium.run("""
+        import numpy as np
+        import sklearn
+        from sklearn.linear_model import LogisticRegression
+
+        rng = np.random.RandomState(42)
+        X = rng.rand(100, 20)
+        y = rng.randint(5, size=100)
+
+        estimator = LogisticRegression(solver='liblinear')
+        estimator.fit(X, y)
+        print(estimator.predict(X))
+        estimator.score(X, y)
+        """) > 0
+    print(selenium.logs)
+
+def test_import(selenium_standalone):
+    selenium = selenium_standalone
+    # no automatic dependency resolution for now
+    selenium.load_package(["numpy", "joblib"])
+    selenium.load_package("scipy")
+    selenium.load_package("scikit-learn")
+    cmd = dedent("""
+        import sklearn
+        import sklearn.calibration
+        import sklearn.calibration
+        import sklearn.cluster
+        import sklearn.compose
+        import sklearn.covariance
+        import sklearn.cross_decomposition
+        import sklearn.datasets
+        import sklearn.decomposition
+        import sklearn.discriminant_analysis
+        import sklearn.dummy
+        import sklearn.ensemble
+        import sklearn.exceptions
+        import sklearn.externals
+        import sklearn.feature_extraction
+        import sklearn.feature_selection
+        import sklearn.gaussian_process
+        import sklearn.impute
+        import sklearn.isotonic
+        import sklearn.kernel_approximation
+        import sklearn.kernel_ridge
+        import sklearn.linear_model
+        import sklearn.manifold
+        import sklearn.metrics
+        import sklearn.mixture
+        import sklearn.model_selection
+        import sklearn.multiclass
+        import sklearn.multioutput
+        import sklearn.naive_bayes
+        import sklearn.neighbors
+        import sklearn.neural_network
+        import sklearn.pipeline
+        import sklearn.preprocessing
+        import sklearn.random_projection
+        import sklearn.semi_supervised
+        import sklearn.svm
+        import sklearn.tree
+        import sklearn.utils
+        """).splitlines()
+
+    for line in cmd:
+        try:
+            selenium.run(line)
+            print(f'{line} -- OK')
+        except:
+            print(f'Error: {line} failed')
+    print(selenium.logs)
diff --git a/test/test_common.py b/test/test_common.py
index afef1ac90..c4cb366cc 100644
--- a/test/test_common.py
+++ b/test/test_common.py
@@ -23,7 +23,7 @@ def registered_packages_meta():
             for name in packages}
 
 
-UNSUPPORTED_PACKAGES = {'chrome': ['pandas', 'scipy'],
+UNSUPPORTED_PACKAGES = {'chrome': ['pandas', 'scipy', 'scikit-learn'],
                         'firefox': []}
 
 

From b93fcc47b88d4e1c3487b9a74dfc313c633ac4dc Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Fri, 26 Oct 2018 09:40:00 +0200
Subject: [PATCH 2/6] Increment version numbers

---
 packages/scikit-learn/meta.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/scikit-learn/meta.yaml b/packages/scikit-learn/meta.yaml
index 3561370a3..e9f15e793 100644
--- a/packages/scikit-learn/meta.yaml
+++ b/packages/scikit-learn/meta.yaml
@@ -15,7 +15,8 @@ build:
 
 requirements:
   run:
-    - numpy  # TODO: add scipy, joblib once the corresponding PRs are merged
+    - numpy  # TODO: add scipy once the corresponding PR is merged
+    - joblib
 
 test:
   imports:

From edda6eefcd391dae1091e9ed86a1b836c948f011 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Tue, 6 Nov 2018 00:38:41 +0100
Subject: [PATCH 3/6] More fixes for joblib

---
 .circleci/config.yml                 |   4 +-
 packages/scikit-learn/meta.yaml      |  32 +-
 .../patches/support-joblib-011.patch | 360 ++++++++++++++++++
 .../patches/use-site-joblib.patch    | 118 +++++-
 test/packages/test_scikit-learn.py   |  64 +---
 5 files changed, 513 insertions(+), 65 deletions(-)
 create mode 100644 packages/scikit-learn/patches/support-joblib-011.patch

diff --git a/.circleci/config.yml b/.circleci/config.yml
index ba80e860c..1c0edc9cd 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -27,8 +27,8 @@ jobs:
 
           # download scipy package from https://github.com/iodide-project/pyodide/pull/211
           mkdir -p build
-          wget -q -O build/scipy.js https://1463-122663163-gh.circle-artifacts.com/0/home/circleci/repo/build/scipy.js
-          wget -q -O build/scipy.data https://1463-122663163-gh.circle-artifacts.com/0/home/circleci/repo/build/scipy.data
+          wget -q -O build/scipy.js https://1535-122663163-gh.circle-artifacts.com/0/home/circleci/repo/build/scipy.js
+          wget -q -O build/scipy.data https://1535-122663163-gh.circle-artifacts.com/0/home/circleci/repo/build/scipy.data
 
           ccache -z
           make
diff --git a/packages/scikit-learn/meta.yaml b/packages/scikit-learn/meta.yaml
index e9f15e793..9a38607b8 100644
--- a/packages/scikit-learn/meta.yaml
+++ b/packages/scikit-learn/meta.yaml
@@ -7,8 +7,8 @@ source:
   sha256: 97d1d971f8ec257011e64b7d655df68081dd3097322690afa1a71a1d755f8c18
 
   patches:
-    #- patches/unvendor-joblib.patch
     - patches/use-site-joblib.patch
+    - patches/support-joblib-011.patch
 
 build:
   cflags: -Wno-implicit-function-declaration
@@ -21,9 +21,39 @@ requirements:
 test:
   imports:
     - sklearn
+    - sklearn.calibration
     - sklearn.cluster
     - sklearn.compose
     - sklearn.covariance
     - sklearn.cross_decomposition
     - sklearn.datasets
     - sklearn.decomposition
+    - sklearn.discriminant_analysis
+    - sklearn.dummy
+    - sklearn.ensemble
+    - sklearn.exceptions
+    - sklearn.externals
+    - sklearn.feature_extraction
+    - sklearn.feature_selection
+    - sklearn.gaussian_process
+    - sklearn.impute
+    - sklearn.isotonic
+    - sklearn.kernel_approximation
+    - sklearn.kernel_ridge
+    - sklearn.linear_model
+    - sklearn.manifold
+    - sklearn.metrics
+    - sklearn.mixture
+    - sklearn.model_selection
+    - sklearn.multiclass
+    - sklearn.multioutput
+    - sklearn.naive_bayes
+    - sklearn.neighbors
+    - sklearn.neural_network
+    - sklearn.pipeline
+    - sklearn.preprocessing
+    - sklearn.random_projection
+    - sklearn.semi_supervised
+    - sklearn.svm
+    - sklearn.tree
+    - sklearn.utils
diff --git a/packages/scikit-learn/patches/support-joblib-011.patch b/packages/scikit-learn/patches/support-joblib-011.patch
new file mode 100644
index 000000000..4624bf7a6
--- /dev/null
+++ b/packages/scikit-learn/patches/support-joblib-011.patch
@@ -0,0 +1,360 @@
+commit 09954a12bdcef544e940a9b4f661dd7210e2357d
+Author: Joel Nothman
+Date: Wed Nov 7 08:10:23 2018 +1100
+
+    BLD we should ensure continued support for joblib 0.11 (#12350)
+
+diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
+index 542f7ca80..e54f131a4 100644
+--- a/sklearn/ensemble/forest.py
++++ b/sklearn/ensemble/forest.py
+@@ -50,7 +50,6 @@ import numpy as np
+ from scipy.sparse import issparse
+ from scipy.sparse import hstack as sparse_hstack
+ 
+-
+ from ..base import ClassifierMixin, RegressorMixin
+ from ..utils import Parallel, delayed
+ from ..externals import six
+@@ -62,7 +61,7 @@ from ..tree._tree import DTYPE, DOUBLE
+ from ..utils import check_random_state, check_array, compute_sample_weight
+ from ..exceptions import DataConversionWarning, NotFittedError
+ from .base import BaseEnsemble, _partition_estimators
+-from ..utils.fixes import parallel_helper
++from ..utils.fixes import parallel_helper, _joblib_parallel_args
+ from ..utils.multiclass import check_classification_targets
+ from ..utils.validation import check_is_fitted
+ 
+@@ -175,7 +174,7 @@ class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
+         """
+         X = self._validate_X_predict(X)
+         results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
+-                           prefer="threads")(
++                           **_joblib_parallel_args(prefer="threads"))(
+             delayed(parallel_helper)(tree, 'apply', X, check_input=False)
+             for tree in self.estimators_)
+ 
+@@ -206,7 +205,7 @@ class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
+         """
+         X = self._validate_X_predict(X)
+         indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
+-                              prefer="threads")(
++                              **_joblib_parallel_args(prefer='threads'))(
+             delayed(parallel_helper)(tree, 'decision_path', X,
+                                      check_input=False)
+             for tree in self.estimators_)
+@@ -324,11 +323,11 @@ class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
+         # Parallel loop: we prefer the threading backend as the Cython code
+         # for fitting the trees is internally releasing the Python GIL
+         # making threading more efficient than multiprocessing in
+-        # that case. However, we respect any parallel_backend contexts set
+-        # at a higher level, since correctness does not rely on using
+-        # threads.
++        # that case. However, for joblib 0.12+ we respect any
++        # parallel_backend contexts set at a higher level,
++        # since correctness does not rely on using threads.
+         trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
+-                         prefer="threads")(
++                         **_joblib_parallel_args(prefer='threads'))(
+             delayed(_parallel_build_trees)(
+                 t, self, X, y, sample_weight, i, len(trees),
+                 verbose=self.verbose, class_weight=self.class_weight)
+@@ -375,7 +374,7 @@ class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
+         check_is_fitted(self, 'estimators_')
+ 
+         all_importances = Parallel(n_jobs=self.n_jobs,
+-                                   prefer="threads")(
++                                   **_joblib_parallel_args(prefer='threads'))(
+             delayed(getattr)(tree, 'feature_importances_')
+             for tree in self.estimators_)
+ 
+@@ -591,7 +590,8 @@ class ForestClassifier(six.with_metaclass(ABCMeta, BaseForest,
+         all_proba = [np.zeros((X.shape[0], j), dtype=np.float64)
+                      for j in np.atleast_1d(self.n_classes_)]
+         lock = threading.Lock()
+-        Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
++        Parallel(n_jobs=n_jobs, verbose=self.verbose,
++                 **_joblib_parallel_args(require="sharedmem"))(
+             delayed(_accumulate_prediction)(e.predict_proba, X, all_proba,
+                                             lock)
+             for e in self.estimators_)
+@@ -699,7 +699,8 @@ class ForestRegressor(six.with_metaclass(ABCMeta, BaseForest, RegressorMixin)):
+ 
+         # Parallel loop
+         lock = threading.Lock()
+-        Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
++        Parallel(n_jobs=n_jobs, verbose=self.verbose,
++                 **_joblib_parallel_args(require="sharedmem"))(
+             delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock)
+             for e in self.estimators_)
+ 
+diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
+index f12f6f886..c08e5cc32 100644
+--- a/sklearn/ensemble/tests/test_forest.py
++++ b/sklearn/ensemble/tests/test_forest.py
+@@ -10,6 +10,7 @@ Testing for the forest module (sklearn.ensemble.forest).
+ 
+ import pickle
+ from collections import defaultdict
++from distutils.version import LooseVersion
+ import itertools
+ from itertools import combinations
+ from itertools import product
+@@ -21,6 +22,7 @@ from scipy.sparse import coo_matrix
+ 
+ import pytest
+ 
++from sklearn.utils import _joblib
+ from sklearn.utils import parallel_backend
+ from sklearn.utils import register_parallel_backend
+ try:
+@@ -1287,6 +1289,8 @@ class MyBackend(LokyBackend):
+ register_parallel_backend('testing', MyBackend)
+ 
+ 
++@pytest.mark.skipif(_joblib.__version__ < LooseVersion('0.12'),
++                    reason='tests not yet supported in joblib <0.12')
+ def test_backend_respected():
+     clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
+ 
+diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
+index 2d0723944..4d0ded532 100644
+--- a/sklearn/linear_model/coordinate_descent.py
++++ b/sklearn/linear_model/coordinate_descent.py
+@@ -22,6 +22,7 @@ from ..utils import Parallel, delayed, effective_n_jobs
+ from ..externals import six
+ from ..externals.six.moves import xrange
+ from ..utils.extmath import safe_sparse_dot
++from ..utils.fixes import _joblib_parallel_args
+ from ..utils.validation import check_is_fitted
+ from ..utils.validation import column_or_1d
+ from ..exceptions import ConvergenceWarning
+@@ -1203,7 +1204,7 @@ class LinearModelCV(six.with_metaclass(ABCMeta, LinearModel)):
+                 for this_l1_ratio, this_alphas in zip(l1_ratios, alphas)
+                 for train, test in folds)
+         mse_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
+-                             prefer="threads")(jobs)
++                             **_joblib_parallel_args(prefer="threads"))(jobs)
+         mse_paths = np.reshape(mse_paths, (n_l1_ratio, len(folds), -1))
+         mean_mse = np.mean(mse_paths, axis=1)
+         self.mse_path_ = np.squeeze(np.rollaxis(mse_paths, 2, 1))
+diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py
+index 01a4f78ab..091c13c1f 100644
+--- a/sklearn/linear_model/logistic.py
++++ b/sklearn/linear_model/logistic.py
+@@ -33,6 +33,7 @@ from ..exceptions import (NotFittedError, ConvergenceWarning,
+                           ChangedBehaviorWarning)
+ from ..utils.multiclass import check_classification_targets
+ from ..utils import Parallel, delayed, effective_n_jobs
++from ..utils.fixes import _joblib_parallel_args
+ from ..model_selection import check_cv
+ from ..externals import six
+ from ..metrics import get_scorer
+@@ -1346,7 +1347,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
+         else:
+             prefer = 'processes'
+         fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
+-                               prefer=prefer)(
++                               **_joblib_parallel_args(prefer=prefer))(
+             path_func(X, y, pos_class=class_, Cs=[self.C],
+                       fit_intercept=self.fit_intercept, tol=self.tol,
+                       verbose=self.verbose, solver=solver,
+@@ -1775,7 +1776,7 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
+         else:
+             prefer = 'processes'
+         fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
+-                               prefer=prefer)(
++                               **_joblib_parallel_args(prefer=prefer))(
+             path_func(X, y, train, test, pos_class=label, Cs=self.Cs,
+                       fit_intercept=self.fit_intercept, penalty=self.penalty,
+                       dual=self.dual, solver=solver, tol=self.tol,
+diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py
+index 5e253003a..236bc10a9 100644
+--- a/sklearn/linear_model/stochastic_gradient.py
++++ b/sklearn/linear_model/stochastic_gradient.py
+@@ -33,6 +33,7 @@ from .sgd_fast import SquaredLoss
+ from .sgd_fast import Huber
+ from .sgd_fast import EpsilonInsensitive
+ from .sgd_fast import SquaredEpsilonInsensitive
++from ..utils.fixes import _joblib_parallel_args
+ 
+ LEARNING_RATE_TYPES = {"constant": 1, "optimal": 2, "invscaling": 3,
+                        "adaptive": 4, "pa1": 5, "pa2": 6}
+@@ -613,8 +614,8 @@ class BaseSGDClassifier(six.with_metaclass(ABCMeta, BaseSGD,
+         strategy is called OVA: One Versus All.
+         """
+         # Use joblib to fit OvA in parallel.
+-        result = Parallel(n_jobs=self.n_jobs, prefer="threads",
+-                          verbose=self.verbose)(
++        result = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
++                          **_joblib_parallel_args(require="sharedmem"))(
+             delayed(fit_binary)(self, i, X, y, alpha, C, learning_rate,
+                                 max_iter, self._expanded_class_weight[i],
+                                 1., sample_weight)
+diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py
+index 15e29ce4d..7ad29e2c0 100644
+--- a/sklearn/linear_model/tests/test_sgd.py
++++ b/sklearn/linear_model/tests/test_sgd.py
+@@ -1,4 +1,5 @@
+ 
++from distutils.version import LooseVersion
+ import pickle
+ import unittest
+ import pytest
+@@ -28,6 +29,13 @@ from sklearn.preprocessing import StandardScaler
+ from sklearn.exceptions import ConvergenceWarning
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import sgd_fast
++from sklearn.model_selection import RandomizedSearchCV
++from sklearn.utils import _joblib
++
++
++# 0.23. warning about tol not having its correct default value.
++pytestmark = pytest.mark.filterwarnings(
++    "ignore:max_iter and tol parameters have been")
+ 
+ 
+ class SparseSGDClassifier(SGDClassifier):
+@@ -1471,3 +1479,31 @@ def test_gradient_squared_epsilon_insensitive():
+         (2.0, 2.2, -0.2), (-2.0, 1.0, -5.8)
+     ]
+     _test_gradient_common(loss, cases)
++
++
++def test_multi_thread_multi_class_and_early_stopping():
++    # This is a non-regression test for a bad interaction between
++    # early stopping internal attribute and thread-based parallelism.
++    clf = SGDClassifier(alpha=1e-3, tol=1e-3, max_iter=1000,
++                        early_stopping=True, n_iter_no_change=100,
++                        random_state=0, n_jobs=2)
++    clf.fit(iris.data, iris.target)
++    assert clf.n_iter_ > clf.n_iter_no_change
++    assert clf.n_iter_ < clf.n_iter_no_change + 20
++    assert clf.score(iris.data, iris.target) > 0.8
++
++
++def test_multi_core_gridsearch_and_early_stopping():
++    # This is a non-regression test for a bad interaction between
++    # early stopping internal attribute and process-based multi-core
++    # parallelism.
++    param_grid = {
++        'alpha': np.logspace(-4, 4, 9),
++        'n_iter_no_change': [5, 10, 50],
++    }
++    clf = SGDClassifier(tol=1e-3, max_iter=1000, early_stopping=True,
++                        random_state=0)
++    search = RandomizedSearchCV(clf, param_grid, n_iter=10, cv=5, n_jobs=2,
++                                random_state=0)
++    search.fit(iris.data, iris.target)
++    assert search.best_score_ > 0.8
+diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
+index 070afbdbb..24554fe68 100644
+--- a/sklearn/utils/fixes.py
++++ b/sklearn/utils/fixes.py
+@@ -14,6 +14,8 @@ import os
+ import errno
+ import sys
+ 
++from distutils.version import LooseVersion
++
+ import numpy as np
+ import scipy.sparse as sp
+ import scipy
+@@ -332,3 +334,51 @@ except ImportError:  # python <3.3
+     from collections import Iterable as _Iterable  # noqa
+     from collections import Mapping as _Mapping  # noqa
+     from collections import Sized as _Sized  # noqa
++
++
++def _joblib_parallel_args(**kwargs):
++    """Set joblib.Parallel arguments in a compatible way for 0.11 and 0.12+
++
++    For joblib 0.11 this maps both ``prefer`` and ``require`` parameters to
++    a specific ``backend``.
++
++    Parameters
++    ----------
++
++    prefer : str in {'processes', 'threads'} or None
++        Soft hint to choose the default backend if no specific backend
++        was selected with the parallel_backend context manager.
++
++    require : 'sharedmem' or None
++        Hard constraint to select the backend. If set to 'sharedmem',
++        the selected backend will be single-host and thread-based even
++        if the user asked for a non-thread based backend with
++        parallel_backend.
++
++    See joblib.Parallel documentation for more details
++    """
++    from . import _joblib
++
++    if _joblib.__version__ >= LooseVersion('0.12'):
++        return kwargs
++
++    extra_args = set(kwargs.keys()).difference({'prefer', 'require'})
++    if extra_args:
++        raise NotImplementedError('unhandled arguments %s with joblib %s'
++                                  % (list(extra_args), _joblib.__version__))
++    args = {}
++    if 'prefer' in kwargs:
++        prefer = kwargs['prefer']
++        if prefer not in ['threads', 'processes', None]:
++            raise ValueError('prefer=%s is not supported' % prefer)
++        args['backend'] = {'threads': 'threading',
++                           'processes': 'multiprocessing',
++                           None: None}[prefer]
++
++    if 'require' in kwargs:
++        require = kwargs['require']
++        if require not in [None, 'sharedmem']:
++            raise ValueError('require=%s is not supported' % require)
++        if require == 'sharedmem':
++            args['backend'] = 'threading'
++    return args
+diff --git a/sklearn/utils/tests/test_fixes.py b/sklearn/utils/tests/test_fixes.py
+index 92f954439..5b7b960fa 100644
+--- a/sklearn/utils/tests/test_fixes.py
++++ b/sklearn/utils/tests/test_fixes.py
+@@ -16,6 +16,7 @@ from sklearn.utils.fixes import divide
+ from sklearn.utils.fixes import MaskedArray
+ from sklearn.utils.fixes import nanmedian
+ from sklearn.utils.fixes import nanpercentile
++from sklearn.utils.fixes import _joblib_parallel_args
+ 
+ 
+ def test_divide():
+@@ -57,3 +58,33 @@ def test_nanmedian(axis, expected_median):
+ def test_nanpercentile(a, q, expected_percentile):
+     percentile = nanpercentile(a, q)
+     assert_allclose(percentile, expected_percentile)
++
++
++@pytest.mark.parametrize('joblib_version', ('0.11', '0.12.0'))
++def test_joblib_parallel_args(monkeypatch, joblib_version):
++    import sklearn.utils._joblib
++    monkeypatch.setattr(sklearn.utils._joblib, '__version__', joblib_version)
++
++    if joblib_version == '0.12.0':
++        # arguments are simply passed through
++        assert _joblib_parallel_args(prefer='threads') == {'prefer': 'threads'}
++        assert _joblib_parallel_args(prefer='processes', require=None) == {
++            'prefer': 'processes', 'require': None}
++        assert _joblib_parallel_args(non_existing=1) == {'non_existing': 1}
++    elif joblib_version == '0.11':
++        # arguments are mapped to the corresponding backend
++        assert _joblib_parallel_args(prefer='threads') == {
++            'backend': 'threading'}
++        assert _joblib_parallel_args(prefer='processes') == {
++            'backend': 'multiprocessing'}
++        with pytest.raises(ValueError):
++            _joblib_parallel_args(prefer='invalid')
++        assert _joblib_parallel_args(
++            prefer='processes', require='sharedmem') == {
++                'backend': 'threading'}
++        with pytest.raises(ValueError):
++            _joblib_parallel_args(require='invalid')
++        with pytest.raises(NotImplementedError):
++            _joblib_parallel_args(verbose=True)
++    else:
++        raise ValueError
diff --git a/packages/scikit-learn/patches/use-site-joblib.patch b/packages/scikit-learn/patches/use-site-joblib.patch
index f1446de44..1a73a2426 100644
--- a/packages/scikit-learn/patches/use-site-joblib.patch
+++ b/packages/scikit-learn/patches/use-site-joblib.patch
@@ -1,9 +1,75 @@
-commit 16cf9dc5f79533a121a421b095b6e7ef9ee76e9c
+commit 8778fe57c37a275fc36959e9bade234bd9bbe88f
 Author: Roman Yurchak
 Date: Thu Oct 25 16:56:54 2018 +0200
 
     Use site joblib
 
+diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
+index 76cb27dad..98cb34f6e 100644
+--- a/sklearn/datasets/california_housing.py
++++ b/sklearn/datasets/california_housing.py
+@@ -33,7 +33,7 @@ from .base import _fetch_remote
+ from .base import _pkl_filepath
+ from .base import RemoteFileMetadata
+ from ..utils import Bunch
+-from ..externals import joblib
++from ..utils import _joblib as _joblib
+ 
+ # The original data can be found at:
+ # http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz
+diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
+index a08f61f02..4ac93f93f 100644
+--- a/sklearn/datasets/covtype.py
++++ b/sklearn/datasets/covtype.py
+@@ -27,7 +27,7 @@ from .base import RemoteFileMetadata
+ from ..utils import Bunch
+ from .base import _pkl_filepath
+ from ..utils.fixes import makedirs
+-from ..externals import joblib
++from ..utils import _joblib as joblib
+ from ..utils import check_random_state
+ 
+ # The original data can be found in:
+diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py
+index c8ed0e308..3aa6ebb35 100644
+--- a/sklearn/datasets/kddcup99.py
++++ b/sklearn/datasets/kddcup99.py
+@@ -22,7 +22,8 @@ from .base import _fetch_remote
+ from .base import get_data_home
+ from .base import RemoteFileMetadata
+ from ..utils import Bunch
+-from ..externals import joblib, six
++from ..externals import six
++from ..utils import _joblib as joblib
+ from ..utils import check_random_state
+ from ..utils import shuffle as shuffle_method
+ 
+diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py
+index 74915c6c6..ba5bfecfb 100644
+--- a/sklearn/datasets/olivetti_faces.py
++++ b/sklearn/datasets/olivetti_faces.py
+@@ -24,7 +24,7 @@ from .base import _fetch_remote
+ from .base import RemoteFileMetadata
+ from .base import _pkl_filepath
+ from ..utils import check_random_state, Bunch
+-from ..externals import joblib
++from ..utils import _joblib as joblib
+ 
+ # The original data can be found at:
+ # http://cs.nyu.edu/~roweis/data/olivettifaces.mat
+diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py
+index 7890d7e18..ea22fb076 100644
+--- a/sklearn/datasets/rcv1.py
++++ b/sklearn/datasets/rcv1.py
+@@ -22,7 +22,7 @@ from .base import _pkl_filepath
+ from .base import _fetch_remote
+ from .base import RemoteFileMetadata
+ from ..utils.fixes import makedirs
+-from ..externals import joblib
++from ..utils import _joblib as joblib
+ from .svmlight_format import load_svmlight_files
+ from ..utils import shuffle as shuffle_
+ from ..utils import Bunch
 diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
 index 6d8acddcc..8191048d7 100644
 --- a/sklearn/datasets/species_distributions.py
@@ -17,17 +83,48 @@ index 6d8acddcc..8191048d7 100644
 
  PY3_OR_LATER = sys.version_info[0] >= 3
 
+diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py
+index 8df908a2e..a2440222a 100644
+--- a/sklearn/datasets/twenty_newsgroups.py
++++ b/sklearn/datasets/twenty_newsgroups.py
+@@ -45,7 +45,7 @@ from ..utils import check_random_state, Bunch
+ from ..utils import deprecated
+ from ..feature_extraction.text import CountVectorizer
+ from ..preprocessing import normalize
+-from ..externals import joblib
++from ..utils import _joblib as joblib
+ 
+ logger = logging.getLogger(__name__)
+ 
+diff --git a/sklearn/ensemble/base.py b/sklearn/ensemble/base.py
+index 321031892..2a516619b 100644
+--- a/sklearn/ensemble/base.py
++++ b/sklearn/ensemble/base.py
+@@ -13,9 +13,11 @@ from ..base import BaseEstimator
+ from ..base import MetaEstimatorMixin
+ from ..utils import check_random_state
+ from ..externals import six
+-from ..externals.joblib import effective_n_jobs
++from ..utils import _joblib
+ from abc import ABCMeta, abstractmethod
+ 
++effective_n_jobs = _joblib.effective_n_jobs
++
+ MAX_RAND_SEED = np.iinfo(np.int32).max
+ 
+ 
 diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
-index d7586c286..d1168cdcf 100644
+index d7586c286..f12f6f886 100644
 --- a/sklearn/ensemble/tests/test_forest.py
 +++ b/sklearn/ensemble/tests/test_forest.py
-@@ -23,7 +23,11 @@ import pytest
+@@ -23,7 +23,12 @@ import pytest
  
  from sklearn.utils import parallel_backend
  from sklearn.utils import register_parallel_backend
 -from sklearn.externals.joblib.parallel import LokyBackend
 +try:
-+    from sklearn.externals.joblib.parallel import LokyBackend
++    from sklearn.utils import _joblib
++    LokyBackend = _joblib.parallel.LokyBackend
 +except ImportError:
 +    LokyBackend = object
 +
@@ -76,6 +173,19 @@ index bffd43cc1..df4c96893 100644
  from sklearn.utils import Parallel, delayed, Memory, parallel_backend
  
  if os.environ.get('SKLEARN_SITE_JOBLIB', False):
+diff --git a/sklearn/utils/_joblib.py b/sklearn/utils/_joblib.py
+index e1c39a401..9c4e815f7 100644
+--- a/sklearn/utils/_joblib.py
++++ b/sklearn/utils/_joblib.py
+@@ -5,7 +5,7 @@ import os as _os
+ import warnings as _warnings
+ 
+ # An environment variable to use the site joblib
+-if _os.environ.get('SKLEARN_SITE_JOBLIB', False):
++if True:
+     with _warnings.catch_warnings():
+         _warnings.simplefilter("ignore")
+         # joblib imports may raise DeprecationWarning on certain Python
 diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
 index 75b378961..b81b9ab58 100644
 --- a/sklearn/utils/testing.py
diff --git a/test/packages/test_scikit-learn.py b/test/packages/test_scikit-learn.py
index 99d334783..eb745ba04 100644
--- a/test/packages/test_scikit-learn.py
+++ b/test/packages/test_scikit-learn.py
@@ -1,7 +1,11 @@
-from textwrap import dedent
+import pytest
 
-def test_scikit_learn(selenium_standalone):
+
+def test_scikit_learn(selenium_standalone, request):
     selenium = selenium_standalone
+    if selenium.browser == 'chrome':
+        request.applymarker(pytest.mark.xfail(
+            run=False, reason='chrome not supported'))
     # no automatic dependency resolution for now
     selenium.load_package(["numpy", "joblib"])
    selenium.load_package("scipy")
@@ -20,59 +24,3 @@ def test_scikit_learn(selenium_standalone):
         print(estimator.predict(X))
         estimator.score(X, y)
         """) > 0
-    print(selenium.logs)
-
-def test_import(selenium_standalone):
-    selenium = selenium_standalone
-    # no automatic dependency resolution for now
-    selenium.load_package(["numpy", "joblib"])
-    selenium.load_package("scipy")
-    selenium.load_package("scikit-learn")
-    cmd = dedent("""
-        import sklearn
-        import sklearn.calibration
-        import sklearn.calibration
-        import sklearn.cluster
-        import sklearn.compose
-        import sklearn.covariance
-        import sklearn.cross_decomposition
-        import sklearn.datasets
-        import sklearn.decomposition
-        import sklearn.discriminant_analysis
-        import sklearn.dummy
-        import sklearn.ensemble
-        import sklearn.exceptions
-        import sklearn.externals
-        import sklearn.feature_extraction
-        import sklearn.feature_selection
-        import sklearn.gaussian_process
-        import sklearn.impute
-        import sklearn.isotonic
-        import sklearn.kernel_approximation
-        import sklearn.kernel_ridge
-        import sklearn.linear_model
-        import sklearn.manifold
-        import sklearn.metrics
-        import sklearn.mixture
-        import sklearn.model_selection
-        import sklearn.multiclass
-        import sklearn.multioutput
-        import sklearn.naive_bayes
-        import sklearn.neighbors
-        import sklearn.neural_network
-        import sklearn.pipeline
-        import sklearn.preprocessing
-        import sklearn.random_projection
-        import sklearn.semi_supervised
-        import sklearn.svm
-        import sklearn.tree
-        import sklearn.utils
-        """).splitlines()
-
-    for line in cmd:
-        try:
-            selenium.run(line)
-            print(f'{line} -- OK')
-        except:
-            print(f'Error: {line} failed')
-    print(selenium.logs)

From 15c0228abc77242aa1d433fda5c312df933fccea Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Mon, 3 Dec 2018 22:12:03 +0100
Subject: [PATCH 4/6] Update to scikit-learn 0.20.1

---
 packages/scikit-learn/meta.yaml      |  10 +-
 .../patches/support-joblib-011.patch | 360 ------------------
 .../patches/use-site-joblib.patch    | 201 +---------
 3 files changed, 8 insertions(+), 563 deletions(-)
 delete mode 100644 packages/scikit-learn/patches/support-joblib-011.patch

diff --git a/packages/scikit-learn/meta.yaml b/packages/scikit-learn/meta.yaml
index 9a38607b8..5a1ad2376 100644
--- a/packages/scikit-learn/meta.yaml
+++ b/packages/scikit-learn/meta.yaml
@@ -1,21 +1,21 @@
 package:
   name: scikit-learn
-  version: 0.20.0
+  version: 0.20.1
 
 source:
-  url: https://pypi.io/packages/source/s/scikit-learn/scikit-learn-0.20.0.tar.gz
-  sha256: 97d1d971f8ec257011e64b7d655df68081dd3097322690afa1a71a1d755f8c18
+  url: https://github.com/scikit-learn/scikit-learn/archive/0.20.1.tar.gz
+  sha256: 618feea121c59a52ea459d6af7bc179344ca345775b04bd60e96740e9df75960
 
   patches:
     - patches/use-site-joblib.patch
-    - patches/support-joblib-011.patch
 
 build:
   cflags: -Wno-implicit-function-declaration
 
 requirements:
   run:
-    - numpy  # TODO: add scipy once the corresponding PR is merged
+    - numpy
+    - scipy
     - joblib
 
 test:
diff --git a/packages/scikit-learn/patches/support-joblib-011.patch b/packages/scikit-learn/patches/support-joblib-011.patch
deleted file mode 100644
index 4624bf7a6..000000000
--- a/packages/scikit-learn/patches/support-joblib-011.patch
+++ /dev/null
@@ -1,360 +0,0 @@
-commit 09954a12bdcef544e940a9b4f661dd7210e2357d
-Author: Joel Nothman
-Date: Wed Nov 7 08:10:23 2018 +1100
-
-    BLD we should ensure continued support for joblib 0.11 (#12350)
-
-diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
-index 542f7ca80..e54f131a4 100644
---- a/sklearn/ensemble/forest.py
-+++ b/sklearn/ensemble/forest.py
-@@ -50,7 +50,6 @@ import numpy as np
- from scipy.sparse import issparse
- from scipy.sparse import hstack as sparse_hstack
- 
--
- from ..base import ClassifierMixin, RegressorMixin
- from ..utils import Parallel, delayed
- from ..externals import six
-@@ -62,7 +61,7 @@ from ..tree._tree import DTYPE, DOUBLE
- from ..utils import check_random_state, check_array, compute_sample_weight
- from ..exceptions import DataConversionWarning, NotFittedError
- from .base import BaseEnsemble, _partition_estimators
--from ..utils.fixes import parallel_helper
-+from ..utils.fixes import parallel_helper, _joblib_parallel_args
- from ..utils.multiclass import check_classification_targets
- from ..utils.validation import check_is_fitted
- 
-@@ -175,7 +174,7 @@ class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
-         """
-         X = self._validate_X_predict(X)
-         results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
--                           prefer="threads")(
-+                           **_joblib_parallel_args(prefer="threads"))(
-             delayed(parallel_helper)(tree, 'apply', X, check_input=False)
-             for tree in self.estimators_)
- 
-@@ -206,7 +205,7 @@ class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
-         """
-         X = self._validate_X_predict(X)
-         indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
--                              prefer="threads")(
-+                              **_joblib_parallel_args(prefer='threads'))(
-             delayed(parallel_helper)(tree, 'decision_path', X,
-                                      check_input=False)
-             for tree in self.estimators_)
-@@ -324,11 +323,11 @@ class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
-         # Parallel loop: we prefer the threading backend as the Cython code
-         # for fitting the trees is internally releasing the Python GIL
-         # making threading more efficient than multiprocessing in
--        # that case. However, we respect any parallel_backend contexts set
--        # at a higher level, since correctness does not rely on using
--        # threads.
-+        # that case. However, for joblib 0.12+ we respect any
-+        # parallel_backend contexts set at a higher level,
-+        # since correctness does not rely on using threads.
-         trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
--                         prefer="threads")(
-+                         **_joblib_parallel_args(prefer='threads'))(
-             delayed(_parallel_build_trees)(
-                 t, self, X, y, sample_weight, i, len(trees),
-                 verbose=self.verbose, class_weight=self.class_weight)
-@@ -375,7 +374,7 @@ class BaseForest(six.with_metaclass(ABCMeta, BaseEnsemble)):
-         check_is_fitted(self, 'estimators_')
- 
-         all_importances = Parallel(n_jobs=self.n_jobs,
--                                   prefer="threads")(
-+                                   **_joblib_parallel_args(prefer='threads'))(
-             delayed(getattr)(tree, 'feature_importances_')
-             for tree in self.estimators_)
- 
-@@ -591,7 +590,8 @@ class ForestClassifier(six.with_metaclass(ABCMeta, BaseForest,
-         all_proba = [np.zeros((X.shape[0], j), dtype=np.float64)
-                      for j in np.atleast_1d(self.n_classes_)]
-         lock = threading.Lock()
--        Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
-+        Parallel(n_jobs=n_jobs, verbose=self.verbose,
-+                 **_joblib_parallel_args(require="sharedmem"))(
-             delayed(_accumulate_prediction)(e.predict_proba, X, all_proba,
-                                             lock)
-             for e in self.estimators_)
-@@ -699,7 +699,8 @@ class ForestRegressor(six.with_metaclass(ABCMeta, BaseForest, RegressorMixin)):
- 
-         # Parallel loop
-         lock = threading.Lock()
--        Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
-+        Parallel(n_jobs=n_jobs, verbose=self.verbose,
-+                 **_joblib_parallel_args(require="sharedmem"))(
-             delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock)
-             for e in self.estimators_)
- 
-diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
-index f12f6f886..c08e5cc32 100644
---- a/sklearn/ensemble/tests/test_forest.py
-+++ b/sklearn/ensemble/tests/test_forest.py
-@@ -10,6 +10,7 @@ Testing for the forest module (sklearn.ensemble.forest).
- 
- import pickle
- from collections import defaultdict
-+from distutils.version import LooseVersion
- import itertools
- from itertools import combinations
- from itertools import product
-@@ -21,6 +22,7 @@ from scipy.sparse import coo_matrix
- 
- import pytest
- 
-+from sklearn.utils import _joblib
- from sklearn.utils import parallel_backend
- from sklearn.utils import register_parallel_backend
- try:
-@@ -1287,6 +1289,8 @@ class MyBackend(LokyBackend):
- register_parallel_backend('testing', MyBackend)
- 
- 
-+@pytest.mark.skipif(_joblib.__version__ < LooseVersion('0.12'),
-+                    reason='tests not yet supported in joblib <0.12')
- def test_backend_respected():
-     clf = RandomForestClassifier(n_estimators=10, n_jobs=2)
- 
-diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py
-index 2d0723944..4d0ded532 100644
---- a/sklearn/linear_model/coordinate_descent.py
-+++ b/sklearn/linear_model/coordinate_descent.py
-@@ -22,6 +22,7 @@ from ..utils import Parallel, delayed, effective_n_jobs
- from ..externals import six
- from ..externals.six.moves import xrange
- from ..utils.extmath import safe_sparse_dot
-+from ..utils.fixes import _joblib_parallel_args
- from ..utils.validation import check_is_fitted
- from ..utils.validation import column_or_1d
- from ..exceptions import ConvergenceWarning
-@@ -1203,7 +1204,7 @@ class LinearModelCV(six.with_metaclass(ABCMeta, LinearModel)):
-                 for this_l1_ratio, this_alphas in zip(l1_ratios, alphas)
-                 for train, test in folds)
-         mse_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
--                             prefer="threads")(jobs)
-+                             **_joblib_parallel_args(prefer="threads"))(jobs)
-         mse_paths = np.reshape(mse_paths, (n_l1_ratio, len(folds), -1))
-         mean_mse = np.mean(mse_paths, axis=1)
-         self.mse_path_ = np.squeeze(np.rollaxis(mse_paths, 2, 1))
-diff --git a/sklearn/linear_model/logistic.py b/sklearn/linear_model/logistic.py
-index 01a4f78ab..091c13c1f 100644
---- a/sklearn/linear_model/logistic.py
-+++ b/sklearn/linear_model/logistic.py
-@@ -33,6 +33,7 @@ from ..exceptions import (NotFittedError, ConvergenceWarning,
-                           ChangedBehaviorWarning)
- from ..utils.multiclass import check_classification_targets
- from ..utils import Parallel, delayed, effective_n_jobs
-+from ..utils.fixes import _joblib_parallel_args
- from ..model_selection import check_cv
- from ..externals import six
- from ..metrics import get_scorer
-@@ -1346,7 +1347,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin,
-         else:
-             prefer = 'processes'
-         fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
--                               prefer=prefer)(
-+                               **_joblib_parallel_args(prefer=prefer))(
-             path_func(X, y, pos_class=class_, Cs=[self.C],
-                       fit_intercept=self.fit_intercept, tol=self.tol,
-                       verbose=self.verbose, solver=solver,
-@@ -1775,7 +1776,7 @@ class LogisticRegressionCV(LogisticRegression, BaseEstimator,
-         else:
-             prefer = 'processes'
-         fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
--                               prefer=prefer)(
-+                               **_joblib_parallel_args(prefer=prefer))(
-             path_func(X, y, train, test, pos_class=label, Cs=self.Cs,
-                       fit_intercept=self.fit_intercept, penalty=self.penalty,
-                       dual=self.dual, solver=solver, tol=self.tol,
-diff --git a/sklearn/linear_model/stochastic_gradient.py b/sklearn/linear_model/stochastic_gradient.py
-index 5e253003a..236bc10a9 100644
---- a/sklearn/linear_model/stochastic_gradient.py
-+++ b/sklearn/linear_model/stochastic_gradient.py
-@@ -33,6 +33,7 @@ from .sgd_fast import SquaredLoss
- from .sgd_fast import Huber
- from .sgd_fast import EpsilonInsensitive
- from .sgd_fast import SquaredEpsilonInsensitive
-+from ..utils.fixes import _joblib_parallel_args
- 
- LEARNING_RATE_TYPES = {"constant": 1, "optimal": 2, "invscaling": 3,
-                        "adaptive": 4, "pa1": 5, "pa2": 6}
-@@ -613,8 +614,8 @@ class BaseSGDClassifier(six.with_metaclass(ABCMeta, BaseSGD,
-         strategy is called OVA: One Versus All.
-         """
-         # Use joblib to fit OvA in parallel.
--        result = Parallel(n_jobs=self.n_jobs, prefer="threads",
--                          verbose=self.verbose)(
-+        result = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
-+                          **_joblib_parallel_args(require="sharedmem"))(
-             delayed(fit_binary)(self, i, X, y, alpha, C, learning_rate,
-                                 max_iter, self._expanded_class_weight[i],
-                                 1., sample_weight)
-diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py
-index 15e29ce4d..7ad29e2c0 100644
---- a/sklearn/linear_model/tests/test_sgd.py
-+++ b/sklearn/linear_model/tests/test_sgd.py
-@@ -1,4 +1,5 @@
- 
-+from distutils.version import LooseVersion
- import pickle
- import unittest
- import pytest
-@@ -28,6 +29,13 @@ from sklearn.preprocessing import StandardScaler
- from sklearn.exceptions import ConvergenceWarning
- from sklearn.model_selection import train_test_split
- from sklearn.linear_model import sgd_fast
-+from sklearn.model_selection import RandomizedSearchCV
-+from sklearn.utils import _joblib
-+
-+
-+# 0.23. warning about tol not having its correct default value.
-+pytestmark = pytest.mark.filterwarnings(
-+    "ignore:max_iter and tol parameters have been")
- 
- 
- class SparseSGDClassifier(SGDClassifier):
-@@ -1471,3 +1479,31 @@ def test_gradient_squared_epsilon_insensitive():
-         (2.0, 2.2, -0.2), (-2.0, 1.0, -5.8)
-     ]
-     _test_gradient_common(loss, cases)
-+
-+
-+def test_multi_thread_multi_class_and_early_stopping():
-+    # This is a non-regression test for a bad interaction between
-+    # early stopping internal attribute and thread-based parallelism.
-+    clf = SGDClassifier(alpha=1e-3, tol=1e-3, max_iter=1000,
-+                        early_stopping=True, n_iter_no_change=100,
-+                        random_state=0, n_jobs=2)
-+    clf.fit(iris.data, iris.target)
-+    assert clf.n_iter_ > clf.n_iter_no_change
-+    assert clf.n_iter_ < clf.n_iter_no_change + 20
-+    assert clf.score(iris.data, iris.target) > 0.8
-+
-+
-+def test_multi_core_gridsearch_and_early_stopping():
-+    # This is a non-regression test for a bad interaction between
-+    # early stopping internal attribute and process-based multi-core
-+    # parallelism.
-+    param_grid = {
-+        'alpha': np.logspace(-4, 4, 9),
-+        'n_iter_no_change': [5, 10, 50],
-+    }
-+    clf = SGDClassifier(tol=1e-3, max_iter=1000, early_stopping=True,
-+                        random_state=0)
-+    search = RandomizedSearchCV(clf, param_grid, n_iter=10, cv=5, n_jobs=2,
-+                                random_state=0)
-+    search.fit(iris.data, iris.target)
-+    assert search.best_score_ > 0.8
-diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
-index 070afbdbb..24554fe68 100644
---- a/sklearn/utils/fixes.py
-+++ b/sklearn/utils/fixes.py
-@@ -14,6 +14,8 @@ import os
- import errno
- import sys
- 
-+from distutils.version import LooseVersion
-+
- import numpy as np
- import scipy.sparse as sp
- import scipy
-@@ -332,3 +334,51 @@ except ImportError:  # python <3.3
-     from collections import Iterable as _Iterable  # noqa
-     from collections import Mapping as _Mapping  # noqa
-     from collections import Sized as _Sized  # noqa
-+
-+
-+def _joblib_parallel_args(**kwargs):
-+    """Set joblib.Parallel arguments in a compatible way for 0.11 and 0.12+
-+
-+    For joblib 0.11 this maps both ``prefer`` and ``require`` parameters to
-+    a specific ``backend``.
-+
-+    Parameters
-+    ----------
-+
-+    prefer : str in {'processes', 'threads'} or None
-+        Soft hint to choose the default backend if no specific backend
-+        was selected with the parallel_backend context manager.
-+
-+    require : 'sharedmem' or None
-+        Hard constraint to select the backend. If set to 'sharedmem',
-+        the selected backend will be single-host and thread-based even
-+        if the user asked for a non-thread based backend with
-+        parallel_backend.
-+
-+    See joblib.Parallel documentation for more details
-+    """
-+    from . import _joblib
-+
-+    if _joblib.__version__ >= LooseVersion('0.12'):
-+        return kwargs
-+
-+    extra_args = set(kwargs.keys()).difference({'prefer', 'require'})
-+    if extra_args:
-+        raise NotImplementedError('unhandled arguments %s with joblib %s'
-+                                  % (list(extra_args), _joblib.__version__))
-+    args = {}
-+    if 'prefer' in kwargs:
-+        prefer = kwargs['prefer']
-+        if prefer not in ['threads', 'processes', None]:
-+            raise ValueError('prefer=%s is not supported' % prefer)
-+        args['backend'] = {'threads': 'threading',
-+                           'processes': 'multiprocessing',
-+                           None: None}[prefer]
-+
-+    if 'require' in kwargs:
-+        require = kwargs['require']
-+        if require not in [None, 'sharedmem']:
-+            raise ValueError('require=%s is not supported' % require)
-+        if require == 'sharedmem':
-+            args['backend'] = 'threading'
-+    return args
-diff --git a/sklearn/utils/tests/test_fixes.py b/sklearn/utils/tests/test_fixes.py
-index 92f954439..5b7b960fa 100644
---- a/sklearn/utils/tests/test_fixes.py
-+++ b/sklearn/utils/tests/test_fixes.py
-@@ -16,6 +16,7 @@ from sklearn.utils.fixes import divide
- from sklearn.utils.fixes import MaskedArray
- from sklearn.utils.fixes import nanmedian
- from sklearn.utils.fixes import nanpercentile
-+from sklearn.utils.fixes import _joblib_parallel_args
- 
- 
- def test_divide():
-@@ -57,3 +58,33 @@ def test_nanmedian(axis, expected_median):
- def test_nanpercentile(a, q, expected_percentile):
-     percentile = nanpercentile(a, q)
-     assert_allclose(percentile, expected_percentile)
-+
-+
-+@pytest.mark.parametrize('joblib_version', ('0.11', '0.12.0'))
-+def test_joblib_parallel_args(monkeypatch, joblib_version):
-+    import sklearn.utils._joblib
-+    monkeypatch.setattr(sklearn.utils._joblib, '__version__', joblib_version)
-+
-+    if joblib_version == '0.12.0':
-+        # arguments are simply passed through
-+        assert _joblib_parallel_args(prefer='threads') == {'prefer': 'threads'}
-+        assert _joblib_parallel_args(prefer='processes', require=None) == {
-+            'prefer': 'processes', 'require': None}
-+        assert _joblib_parallel_args(non_existing=1) == {'non_existing': 1}
-+    elif joblib_version == '0.11':
-+        # arguments are mapped to the corresponding backend
-+        assert _joblib_parallel_args(prefer='threads') == {
-+            'backend': 'threading'}
-+        assert _joblib_parallel_args(prefer='processes') == {
-+            'backend': 'multiprocessing'}
-+        with pytest.raises(ValueError):
-+            _joblib_parallel_args(prefer='invalid')
-+        assert _joblib_parallel_args(
-+            prefer='processes', require='sharedmem') == {
-+                'backend': 'threading'}
-+        with pytest.raises(ValueError):
-+            _joblib_parallel_args(require='invalid')
-+        with pytest.raises(NotImplementedError):
-+            _joblib_parallel_args(verbose=True)
-+    else:
-+        raise ValueError
diff --git a/packages/scikit-learn/patches/use-site-joblib.patch b/packages/scikit-learn/patches/use-site-joblib.patch
index 1a73a2426..613d3c8a0 100644
--- a/packages/scikit-learn/patches/use-site-joblib.patch
+++ b/packages/scikit-learn/patches/use-site-joblib.patch
@@ -1,180 +1,11 @@
-commit 8778fe57c37a275fc36959e9bade234bd9bbe88f
+commit 55cfffe8243d7d85dc6f8d14f045cee8a4e8b0ae
 Author: Roman Yurchak
-Date: Thu Oct 25 16:56:54 2018 +0200
+Date: Mon Dec 3 22:09:46 2018 +0100
 
     Use site joblib
 
-diff --git a/sklearn/datasets/california_housing.py b/sklearn/datasets/california_housing.py
-index 76cb27dad..98cb34f6e 100644
---- a/sklearn/datasets/california_housing.py
-+++ b/sklearn/datasets/california_housing.py
-@@ -33,7 +33,7 @@ from .base import _fetch_remote
- from .base import _pkl_filepath
- from .base import RemoteFileMetadata
- from ..utils import Bunch
--from ..externals import joblib
-+from ..utils import _joblib as _joblib
- 
- # The original data can be found at:
- # http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz
-diff --git a/sklearn/datasets/covtype.py b/sklearn/datasets/covtype.py
-index a08f61f02..4ac93f93f 100644
---- a/sklearn/datasets/covtype.py
-+++ b/sklearn/datasets/covtype.py
-@@ -27,7 +27,7 @@ from .base import RemoteFileMetadata
- from ..utils import Bunch
- from .base import _pkl_filepath
- from ..utils.fixes import makedirs
--from ..externals import joblib
-+from ..utils import _joblib as joblib
- from ..utils import check_random_state
- 
- # The original data can be found in:
-diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py
-index c8ed0e308..3aa6ebb35 100644
---- a/sklearn/datasets/kddcup99.py
-+++ b/sklearn/datasets/kddcup99.py
-@@ -22,7 +22,8 @@ from .base import _fetch_remote
- from .base import get_data_home
- from .base import RemoteFileMetadata
- from ..utils import Bunch
--from ..externals import joblib, six
-+from ..externals import six
-+from ..utils import _joblib as joblib
- from ..utils import check_random_state
- from ..utils import shuffle as shuffle_method
- 
-diff --git a/sklearn/datasets/olivetti_faces.py b/sklearn/datasets/olivetti_faces.py
-index 74915c6c6..ba5bfecfb 100644
---- a/sklearn/datasets/olivetti_faces.py
-+++ b/sklearn/datasets/olivetti_faces.py
-@@ -24,7 +24,7 @@ from .base import _fetch_remote
- from .base import RemoteFileMetadata
- from .base import _pkl_filepath
- from ..utils import check_random_state, Bunch
--from ..externals import joblib
-+from ..utils import _joblib as joblib
- 
- # The original data can be found at:
- # http://cs.nyu.edu/~roweis/data/olivettifaces.mat
-diff --git a/sklearn/datasets/rcv1.py b/sklearn/datasets/rcv1.py
-index 7890d7e18..ea22fb076 100644
---- a/sklearn/datasets/rcv1.py
-+++ b/sklearn/datasets/rcv1.py
-@@ -22,7 +22,7 @@ from .base import _pkl_filepath
- from .base import _fetch_remote
- from .base import RemoteFileMetadata
- from ..utils.fixes import makedirs
--from ..externals import joblib
-+from ..utils import _joblib as joblib
- from .svmlight_format import load_svmlight_files
- from ..utils import shuffle as shuffle_
- from ..utils import Bunch
-diff --git a/sklearn/datasets/species_distributions.py b/sklearn/datasets/species_distributions.py
-index 6d8acddcc..8191048d7 100644
---- a/sklearn/datasets/species_distributions.py
-+++ b/sklearn/datasets/species_distributions.py
-@@ -51,7 +51,7 @@ from .base import _fetch_remote
- from .base import RemoteFileMetadata
- from ..utils import Bunch
- from sklearn.datasets.base import _pkl_filepath
--from sklearn.externals import joblib
-+import joblib
- 
- PY3_OR_LATER = sys.version_info[0] >= 3
- 
-diff --git a/sklearn/datasets/twenty_newsgroups.py b/sklearn/datasets/twenty_newsgroups.py
-index 8df908a2e..a2440222a 100644
---- a/sklearn/datasets/twenty_newsgroups.py
-+++ b/sklearn/datasets/twenty_newsgroups.py
-@@ -45,7 +45,7 @@ from ..utils import check_random_state, Bunch
- from ..utils import deprecated
- from ..feature_extraction.text import CountVectorizer
- from ..preprocessing import normalize
--from ..externals import joblib
-+from ..utils import _joblib as joblib
- 
- logger = logging.getLogger(__name__)
- 
-diff --git a/sklearn/ensemble/base.py b/sklearn/ensemble/base.py
-index 321031892..2a516619b 100644
---- a/sklearn/ensemble/base.py
-+++ b/sklearn/ensemble/base.py
-@@ -13,9 +13,11 @@ from ..base import BaseEstimator
- from ..base import MetaEstimatorMixin
- from ..utils import check_random_state
- from ..externals import six
--from ..externals.joblib import effective_n_jobs
-+from ..utils import _joblib
- from abc import ABCMeta, abstractmethod
- 
-+effective_n_jobs = _joblib.effective_n_jobs
-+
- MAX_RAND_SEED = np.iinfo(np.int32).max
- 
- 
-diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
-index d7586c286..f12f6f886 100644
---- a/sklearn/ensemble/tests/test_forest.py
-+++ b/sklearn/ensemble/tests/test_forest.py
-@@ -23,7 +23,12 @@ import pytest
- 
- from sklearn.utils import parallel_backend
- from sklearn.utils import register_parallel_backend
--from sklearn.externals.joblib.parallel import LokyBackend
-+try:
-+    from sklearn.utils import _joblib
-+    LokyBackend = _joblib.parallel.LokyBackend
-+except ImportError:
-+    LokyBackend = object
-+
- 
- from sklearn.utils.testing import assert_almost_equal
- from sklearn.utils.testing import assert_array_almost_equal
-diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
-index da04b4215..fc3f6a6b1 100644
---- a/sklearn/metrics/tests/test_score_objects.py
-+++ b/sklearn/metrics/tests/test_score_objects.py
-@@ -40,7 +40,7 @@ from sklearn.datasets import load_diabetes
- from sklearn.model_selection import train_test_split, cross_val_score
- from sklearn.model_selection import GridSearchCV
- from sklearn.multiclass import OneVsRestClassifier
--from sklearn.externals import joblib
-+import joblib
- 
- 
- REGRESSION_SCORERS = ['explained_variance', 'r2',
-diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py
-index 990942c9e..e9a6c31bd 100644
---- a/sklearn/neighbors/tests/test_kde.py
-+++ b/sklearn/neighbors/tests/test_kde.py
-@@ -10,7 +10,7 @@ from sklearn.pipeline import make_pipeline
- from sklearn.datasets import make_blobs
- from sklearn.model_selection import GridSearchCV
- from sklearn.preprocessing import StandardScaler
--from sklearn.externals import joblib
-+import joblib
- 
- 
- def compute_kernel_slow(Y, X, kernel, h):
-diff --git a/sklearn/tests/test_site_joblib.py b/sklearn/tests/test_site_joblib.py
-index bffd43cc1..df4c96893 100644
---- a/sklearn/tests/test_site_joblib.py
-+++ b/sklearn/tests/test_site_joblib.py
-@@ -1,7 +1,10 @@
- import os
- import pytest
- from sklearn import externals
--from sklearn.externals import joblib as joblib_vendored
-+try:
-+    from sklearn.externals import joblib as joblib_vendored
-+except ImportError:
-+    joblib_vendored = None
- from sklearn.utils import Parallel, delayed, Memory, parallel_backend
- 
- if os.environ.get('SKLEARN_SITE_JOBLIB', False):
 diff --git a/sklearn/utils/_joblib.py b/sklearn/utils/_joblib.py
-index e1c39a401..9c4e815f7 100644
+index c3df15e91..b9f218ef1 100644
 --- a/sklearn/utils/_joblib.py
 +++ b/sklearn/utils/_joblib.py
 @@ -5,7 +5,7 @@ import os as _os
@@ -86,29 +17,3 @@ index e1c39a401..9c4e815f7 100644
      with _warnings.catch_warnings():
          _warnings.simplefilter("ignore")
          # joblib imports may raise DeprecationWarning on certain Python
-diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py
-index 75b378961..b81b9ab58 100644
---- a/sklearn/utils/testing.py
-+++ b/sklearn/utils/testing.py
-@@ -44,7 +44,7 @@ except NameError:
- 
- import sklearn
- from sklearn.base import BaseEstimator
--from sklearn.externals import joblib
-+import joblib
- from sklearn.utils.fixes import signature
- from sklearn.utils import deprecated, IS_PYPY, _IS_32BIT
- 
-diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py
-index bf8412b3e..2eebb36b0 100644
---- a/sklearn/utils/tests/test_estimator_checks.py
-+++ b/sklearn/utils/tests/test_estimator_checks.py
-@@ -5,7 +5,7 @@ import numpy as np
- import scipy.sparse as sp
- 
- from sklearn.externals.six.moves import cStringIO as StringIO
--from sklearn.externals import joblib
-+import joblib
- 
- from sklearn.base import BaseEstimator, ClassifierMixin
- from sklearn.utils import deprecated

From b072a9d34a58ec4fddf051dc29d74d3878d120e8 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Mon, 3 Dec 2018 22:12:40 +0100
Subject: [PATCH 5/6] Remove CircleCI hacks

---
 .circleci/config.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 1c0edc9cd..c36de119c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -25,11 +25,6 @@ jobs:
         no_output_timeout: 1200
         command: |
 
-          # download scipy package from https://github.com/iodide-project/pyodide/pull/211
-          mkdir -p build
-          wget -q -O build/scipy.js https://1535-122663163-gh.circle-artifacts.com/0/home/circleci/repo/build/scipy.js
-          wget -q -O build/scipy.data https://1535-122663163-gh.circle-artifacts.com/0/home/circleci/repo/build/scipy.data
-
           ccache -z
           make
           ccache -s

From fc7fe8ca301d20bccbdbe9e49ee2aa30f208501c Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Mon, 3 Dec 2018 22:14:10 +0100
Subject: [PATCH 6/6] Update tests with packaged scipy

---
 .circleci/config.yml               | 1 -
 test/packages/test_scikit-learn.py | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index c36de119c..92f657ac4 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -24,7 +24,6 @@ jobs:
         name: build
         no_output_timeout: 1200
         command: |
-
           ccache -z
           make
           ccache -s
diff --git a/test/packages/test_scikit-learn.py b/test/packages/test_scikit-learn.py
index eb745ba04..4025006f7 100644
--- a/test/packages/test_scikit-learn.py
+++ b/test/packages/test_scikit-learn.py
@@ -6,9 +6,6 @@ def test_scikit_learn(selenium_standalone, request):
     if selenium.browser == 'chrome':
         request.applymarker(pytest.mark.xfail(
             run=False, reason='chrome not supported'))
-    # no automatic dependency resolution for now
-    selenium.load_package(["numpy", "joblib"])
-    selenium.load_package("scipy")
     selenium.load_package("scikit-learn")
     assert selenium.run("""
         import numpy as np
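
For reference, the workload that test_scikit_learn drives through selenium.run in the final
state of this series is reproduced below as a standalone sketch. The seed, array shapes, and
liblinear solver are taken verbatim from the test; only the surrounding print calls are added
here for illustration. It runs unchanged under plain CPython, and inside Pyodide once the
scikit-learn package (which now pulls in numpy, scipy, and joblib) has been loaded.

    # Standalone version of the snippet executed by test_scikit_learn above.
    import numpy as np
    from sklearn.linear_model import LogisticRegression

    # Same synthetic problem as the packaged test: 100 samples, 20 features,
    # 5 arbitrary integer class labels.
    rng = np.random.RandomState(42)
    X = rng.rand(100, 20)
    y = rng.randint(5, size=100)

    estimator = LogisticRegression(solver='liblinear')
    estimator.fit(X, y)
    print(estimator.predict(X))
    print(estimator.score(X, y))  # the test asserts this accuracy is > 0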