def test_front_page_model_agnostic():
    import sklearn.svm
    import shap
    from sklearn.model_selection import train_test_split

    # print the JS visualization code to the notebook
    shap.initjs()

    # train an SVM classifier
    X_train, X_test, Y_train, Y_test = train_test_split(*shap.datasets.iris(), test_size=0.1, random_state=0)
    svm = sklearn.svm.SVC(kernel='rbf', probability=True)
    svm.fit(X_train, Y_train)

    # use Kernel SHAP to explain test set predictions
    explainer = shap.KernelExplainer(svm.predict_proba, X_train, nsamples=100, link="logit")
    shap_values = explainer.shap_values(X_test)

    # plot the SHAP values for the Setosa output of the first instance
    shap.force_plot(explainer.expected_value[0], shap_values[0][0, :], X_test.iloc[0, :], link="logit")
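A quicker variant of the same explanation, sketched under the assumption of the iris setup above: summarizing the background data with shap.kmeans shrinks the number of model evaluations KernelExplainer needs, since its cost grows with the size of the background set. The names explainer_fast and shap_values_fast are illustrative.

# Hedged sketch: shap.kmeans returns a weighted k-row summary of the background
background = shap.kmeans(X_train, 10)
explainer_fast = shap.KernelExplainer(svm.predict_proba, background, link="logit")
shap_values_fast = explainer_fast.shap_values(X_test, nsamples=100)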
def test_kernel_sparse_vs_dense_multirow_background():
    import numpy as np
    import scipy as sp
    import scipy.sparse  # ensure the sparse submodule is loaded
    import shap
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression

    # train a logistic regression classifier
    X_train, X_test, Y_train, _ = train_test_split(*shap.datasets.iris(), test_size=0.1, random_state=0)
    lr = LogisticRegression(solver='lbfgs')
    lr.fit(X_train, Y_train)

    # use Kernel SHAP to explain test set predictions with dense data
    explainer = shap.KernelExplainer(lr.predict_proba, X_train, nsamples=100, link="logit", l1_reg="rank(3)")
    shap_values = explainer.shap_values(X_test)

    # refit on the same data in sparse form
    X_sparse_train = sp.sparse.csr_matrix(X_train)
    X_sparse_test = sp.sparse.csr_matrix(X_test)
    lr_sparse = LogisticRegression(solver='lbfgs')
    lr_sparse.fit(X_sparse_train, Y_train)

    # use Kernel SHAP again but with sparse data
    sparse_explainer = shap.KernelExplainer(lr_sparse.predict_proba, X_sparse_train, nsamples=100, link="logit", l1_reg="rank(3)")
    sparse_shap_values = sparse_explainer.shap_values(X_sparse_test)
    assert np.allclose(shap_values, sparse_shap_values, rtol=1e-05, atol=1e-05)
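Since Kernel SHAP enforces local accuracy by construction, an additivity check makes a useful companion to the sparse-vs-dense comparison. A minimal sketch, assuming the dense iris setup above and the list-per-class shap_values convention these snippets use:

# Hedged sketch: with link="logit" the attributions live in log-odds space,
# so per class they should sum to logit(predict_proba) minus the base value.
from scipy.special import logit

probs = lr.predict_proba(X_test)
for c in range(probs.shape[1]):
    recon = shap_values[c].sum(axis=1) + explainer.expected_value[c]
    assert np.allclose(recon, logit(probs[:, c]), atol=1e-4)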
def test_kernel_shap_with_a1a_sparse_nonzero_background():
    import numpy as np
    import scipy as sp
    import scipy.sparse  # ensure the sparse submodule is loaded
    import shap
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.utils.sparsefuncs import csc_median_axis_0

    np.set_printoptions(threshold=100000)
    np.random.seed(0)

    X, y = shap.datasets.a1a()  # pylint: disable=unbalanced-tuple-unpacking
    x_train, x_test, y_train, _ = train_test_split(X, y, test_size=0.01, random_state=0)
    linear_model = LinearRegression()
    linear_model.fit(x_train, y_train)

    # use the per-feature median of the background data as a single background row
    median_dense = csc_median_axis_0(x_train.tocsc())
    median = sp.sparse.csr_matrix(median_dense)
    explainer = shap.KernelExplainer(linear_model.predict, median)
    shap_values = explainer.shap_values(x_test)

    def dense_to_sparse_predict(data):
        sparse_data = sp.sparse.csr_matrix(data)
        return linear_model.predict(sparse_data)

    explainer_dense = shap.KernelExplainer(dense_to_sparse_predict, median_dense.reshape((1, len(median_dense))))
    x_test_dense = x_test.toarray()
    shap_values_dense = explainer_dense.shap_values(x_test_dense)

    # validate that the sparse and dense results agree
    assert np.allclose(shap_values, shap_values_dense, rtol=1e-02, atol=1e-01)
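The single-row median background above is not just a convenience: KernelExplainer evaluates the model roughly nsamples times per background row for each explained instance, so a one-row summary is the cheapest faithful choice. A standalone sketch with an assumed toy linear model:

import numpy as np
import shap

f = lambda X: X @ np.arange(20)                      # assumed toy linear model
X_bg = np.random.randn(200, 20)
single_row_bg = np.median(X_bg, axis=0, keepdims=True)

explainer = shap.KernelExplainer(f, single_row_bg)
phi = explainer.shap_values(X_bg[:3], nsamples=200)
# local accuracy holds by construction: attributions plus the base value
# reproduce the model outputs
assert np.allclose(phi.sum(axis=1) + explainer.expected_value, f(X_bg[:3]))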
def test_single_tree_compare_with_kernel_shap():
    # NOTE: the original snippet was missing its function header and data setup;
    # the synthetic regression data below is an assumed stand-in.
    import numpy as np
    import xgboost
    import shap

    np.random.seed(10)
    X = np.random.normal(size=(100, 7))
    y = X @ np.random.normal(size=7)  # assumed linear target
    max_depth = 6                     # assumed; undefined in the original fragment

    # train a model with a single tree
    Xd = xgboost.DMatrix(X, label=y)
    model = xgboost.train({'eta': 1,
                           'max_depth': max_depth,
                           'base_score': 0,
                           "lambda": 0},
                          Xd, 1)
    ypred = model.predict(Xd)

    # compare for five random samples
    for i in range(5):
        x_ind = np.random.choice(X.shape[0])  # pick a random row, not a column index
        x = X[x_ind:x_ind + 1, :]
        expl = shap.TreeExplainer(model, X, feature_perturbation="interventional")
        f = lambda inp: model.predict(xgboost.DMatrix(inp))
        expl_kern = shap.KernelExplainer(f, X)
        itshap = expl.shap_values(x)
        kshap = expl_kern.shap_values(x, nsamples=150)
        assert np.allclose(itshap, kshap), \
            "Kernel SHAP doesn't match Independent Tree SHAP!"
        assert np.allclose(itshap.sum() + expl.expected_value, ypred[x_ind]), \
            "SHAP values don't sum to model output!"
def test_null_model_small():
    import numpy as np
    import shap

    explainer = shap.KernelExplainer(lambda x: np.zeros(x.shape[0]), np.ones((2, 4)), nsamples=100)
    e = explainer.explain(np.ones((1, 4)))
    assert np.sum(np.abs(e)) < 1e-8
def test_null_model():
    import numpy as np
    import shap

    explainer = shap.KernelExplainer(lambda x: np.zeros(x.shape[0]), np.ones((2, 10)), nsamples=100)
    e = explainer.explain(np.ones((1, 10)))
    assert np.sum(np.abs(e)) < 1e-8
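The null-model tests work because KernelExplainer's expected_value is the mean model output over the background data, so a constant-zero model leaves nothing to attribute. A small sketch of that relationship, with assumed toy inputs:

import numpy as np
import shap

f = lambda x: x.sum(axis=1)                  # assumed toy model
background = np.random.randn(50, 10)
explainer = shap.KernelExplainer(f, background)
# the base value equals the mean prediction over the background rows
assert np.isclose(explainer.expected_value, f(background).mean())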
# fragment of a class initializer: self.model, self.x_train, x_test, learner,
# lgb, np, and shap come from the enclosing class and module
self.x_test = x_test
self.y_test = y_test
if learner == "linear":
    self.explainer = shap.LinearExplainer(
        self.model, self.x_train, feature_dependence="independent"
    )
elif learner == "tree":
    self.explainer = shap.TreeExplainer(self.model)
elif learner == "kernel":
    # KernelExplainer is model-agnostic: prefer probabilities when available
    if hasattr(self.model, "predict_proba"):
        func = self.model.predict_proba
    else:
        func = self.model.predict
    self.explainer = shap.KernelExplainer(func, self.x_train)
else:
    raise ValueError(f"Learner: {learner} is not supported yet.")

self.expected_value = self.explainer.expected_value
self.shap_values = np.array(self.explainer.shap_values(self.x_test)).astype(
    float
)
# np.float was removed in NumPy 1.24; the built-in float covers the
# scalar expected_value case (np.float64 is a subclass of float)
if isinstance(self.model, lgb.sklearn.LGBMClassifier) and isinstance(
    self.expected_value, float
):
    self.shap_values = self.shap_values[1]
# calculate misclassified values
self.misclassified_values = self._calculate_misclassified()
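The learner dispatch above boils down to one fact: KernelExplainer only needs a function from a 2-D array to outputs, so classifiers and regressors can share one code path. A hedged standalone sketch of the same pattern (the helper name is illustrative, not from the original class):

import shap

def make_kernel_explainer(model, background):
    # prefer class probabilities when the model exposes them, as above
    func = model.predict_proba if hasattr(model, "predict_proba") else model.predict
    return shap.KernelExplainer(func, background)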
    # fragment: the opening of the signature and its other parameters are not
    # included in the snippet
    progress_callback: Callable,
) -> Tuple[List[np.ndarray], np.ndarray, np.ndarray]:
    """
    Compute SHAP values for any learner with KernelExplainer.
    """
    # subsample to 1000 rows, which keeps runtime reasonable for typical
    # data and models
    data_sample, sample_mask = _subsample_data(transformed_data, 1000)
    try:
        ref = kmeans(transformed_reference_data.X, k=10)
    except ValueError:
        # k-means raises ValueError when it cannot produce enough clusters;
        # in that case fall back to a plain sample of the reference data
        ref = sample(transformed_reference_data.X, nsamples=100)

    explainer = KernelExplainer(
        lambda x: (
            model(x)
            if model.domain.class_var.is_continuous
            else model(x, model.Probs)
        ),
        ref,
    )

    shap_values = []
    for i, row in enumerate(data_sample.X):
        progress_callback(i / len(data_sample))
        shap_values.append(
            explainer.shap_values(row, nsamples=100, silent=True, l1_reg=False)
        )
    return (
        _join_shap_values(shap_values),
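The kmeans-with-sample fallback above is a reusable guard: the underlying k-means raises ValueError when it cannot form k distinct clusters. A minimal sketch of the same pattern as a helper (the function name is illustrative):

import shap

def summarize_background(X, k=10, nsamples=100):
    try:
        # weighted k-row summary; fails if X has fewer than k distinct rows
        return shap.kmeans(X, k)
    except ValueError:
        return shap.sample(X, nsamples)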
def _reset_evaluation_background(self, function, **kwargs):
    """Modify the explainer to use the new evaluation example as background data.

    Note that when the explainer is constructed no evaluation example is
    available, so the initialization data is used instead.

    :param function: Function that accepts a 2d ndarray.
    :type function: function
    """
    function, summary = self._prepare_function_and_summary(function, self.original_data_ref,
                                                           self.current_index_list,
                                                           explain_subset=self.explain_subset, **kwargs)
    self.explainer = shap.KernelExplainer(function, summary)