def test_pandas_sparse(self):
import pandas as pd
X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
"B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
"C": pd.SparseArray(np.random.permutation([True, False] * 150))})
y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
"B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
"C": pd.SparseArray(np.random.permutation([True, False] * 30))})
if pd.__version__ >= '0.24.0':
for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
self.assertTrue(pd.api.types.is_sparse(dtype))
gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
pred_sparse = gbm.predict(X_test, raw_score=True)
if hasattr(X_test, 'sparse'):
pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
else:
pred_dense = gbm.predict(X_test.to_dense(), raw_score=True)
np.testing.assert_allclose(pred_sparse, pred_dense)
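# Version note (an assumption about newer pandas, not part of the original test):
# pd.SparseArray was deprecated in pandas 1.0 and removed in pandas 2.0; on recent
# pandas the same sparse columns can be built from the arrays namespace, e.g.:
#     from pandas.arrays import SparseArray
#     X = pd.DataFrame({"A": SparseArray(np.random.permutation([0, 1, 2] * 100))})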
def test_lightgbm_constant_multiclass():
# note: this test used to fail with lightgbm 2.2.1 with error:
# ValueError: zero-size array to reduction operation maximum which has no identity
# on TreeExplainer when trying to compute max nodes:
# max_nodes = np.max([len(t.values) for t in self.trees])
# The test does not fail with latest lightgbm 2.2.3 however
try:
import lightgbm
except ImportError:
print("Skipping test_lightgbm_constant_multiclass!")
return
import shap
# train lightgbm model
X, Y = shap.datasets.iris()
Y.fill(1)
model = lightgbm.sklearn.LGBMClassifier(num_classes=3, objective="multiclass")
model.fit(X, Y)
# explain the model's predictions using SHAP values
shap_values = shap.TreeExplainer(model).shap_values(X)
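# Optional sanity checks (a sketch only, assuming the list-per-class output of older
# shap releases; newer shap versions may return a single stacked array instead):
#     assert len(shap_values) == 3
#     assert shap_values[0].shape == X.shape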
def test_lightgbm_binary():
try:
import lightgbm
except ImportError:
print("Skipping test_lightgbm_binary!")
return
import shap
from sklearn.model_selection import train_test_split
# train lightgbm model
X_train,X_test,Y_train,Y_test = train_test_split(*shap.datasets.adult(), test_size=0.2, random_state=0)
model = lightgbm.sklearn.LGBMClassifier()
model.fit(X_train, Y_train)
# explain the model's predictions using SHAP values
shap_values = shap.TreeExplainer(model).shap_values(X_test)
# validate structure of shap values, must be a list of ndarray for both classes
assert isinstance(shap_values, list)
assert len(shap_values) == 2
# ensure plot works for first class
shap.dependence_plot(0, shap_values[0], X_test, show=False)
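# A further plot that could be exercised here (a sketch; shap.summary_plot is part of
# the public shap API, while shap_values[1] holding the positive-class attributions is
# an assumption tied to the two-element list output above):
#     shap.summary_plot(shap_values[1], X_test, show=False)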
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
ordered=True)})
np.random.seed() # reset seed
cat_cols_actual = ["A", "B", "C", "D"]
cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype('category')
X_test[cat_cols_actual] = X_test[cat_cols_actual].astype('category')
cat_values = [X[col].cat.categories.tolist() for col in cat_cols_to_store]
gbm0 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
pred0 = gbm0.predict(X_test, raw_score=True)
pred_prob = gbm0.predict_proba(X_test)[:, 1]
gbm1 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, pd.Series(y), categorical_feature=[0])
pred1 = gbm1.predict(X_test, raw_score=True)
gbm2 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A'])
pred2 = gbm2.predict(X_test, raw_score=True)
gbm3 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
pred3 = gbm3.predict(X_test, raw_score=True)
gbm3.booster_.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = gbm4.predict(X_test)
gbm5 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=['A', 'B', 'C', 'D', 'E'])
pred5 = gbm5.predict(X_test, raw_score=True)
gbm6 = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y, categorical_feature=[])
pred6 = gbm6.predict(X_test, raw_score=True)
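# With no categorical_feature argument, LightGBM auto-detects the pandas category
# columns, whereas categorical_feature=[0] restricts categorical handling to column
# "A" alone, so the raw scores compared below are expected to disagree.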
self.assertRaises(AssertionError,
np.testing.assert_allclose,
pred0, pred1)
self.assertRaises(AssertionError,
np.testing.assert_allclose,
pred0, pred2)
if isMORT:
mort0 = LiteMORT(params).fit(X, y)
pred0 = list(mort0.predict(X_test))
mort1 = LiteMORT(params).fit(X, y, categorical_feature=[0])
pred1 = list(mort1.predict(X_test))
mort2 = LiteMORT(params).fit(X, y, categorical_feature=['A'])
pred2 = list(mort2.predict(X_test))
mort3 = LiteMORT(params).fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
pred3 = list(mort3.predict(X_test))
else:
clf = lgb.sklearn.LGBMClassifier()
gbm_ = clf.fit(X, y)
gbm0 = lgb.sklearn.LGBMClassifier().fit(X, y)
pred0 = list(gbm0.predict(X_test))
gbm1 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=[0])
pred1 = list(gbm1.predict(X_test))
gbm2 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A'])
pred2 = list(gbm2.predict(X_test))
gbm3 = lgb.sklearn.LGBMClassifier().fit(X, y, categorical_feature=['A', 'B', 'C', 'D'])
pred3 = list(gbm3.predict(X_test))
gbm3.booster_.save_model('categorical.model')
gbm4 = lgb.Booster(model_file='categorical.model')
pred4 = list(gbm4.predict(X_test))
pred_prob = list(gbm0.predict_proba(X_test)[:, 1])
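# For a binary objective, Booster.predict on the reloaded model returns the
# positive-class probability, which is why it is comparable to predict_proba(...)[:, 1].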
np.testing.assert_almost_equal(pred_prob, pred4)
input("...")
#np.testing.assert_almost_equal(pred0, pred1)
elif learner == "kernel":
if hasattr(self.model, "predict_proba"):
func = self.model.predict_proba
else:
func = self.model.predict
self.explainer = shap.KernelExplainer(func, self.x_train)
else:
raise ValueError(f"Learner: {learner} is not supported yet.")
self.expected_value = self.explainer.expected_value
self.shap_values = np.array(self.explainer.shap_values(self.x_test)).astype(
float
)
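# With older shap releases, a binary LGBMClassifier yields per-class SHAP values while
# expected_value is a single scalar; in that case only the positive-class slice is kept.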
if isinstance(self.model, lgb.sklearn.LGBMClassifier) and isinstance(
self.expected_value, float
):
self.shap_values = self.shap_values[1]
# Calculate misclassified values
self.misclassified_values = self._calculate_misclassified()
# As per SHAP guidelines, test data needs to be dense for plotting functions
self.x_test_array = self.x_test.values
def create_model(self):
# TODO: if learning rates are identical throughout, create a regular classifier
if "is_unbalance" in self.model_params:
is_unbalance = self.model_params.pop("is_unbalance")
self.model_params["class_weight"] = "balanced" if is_unbalance else None
self.model_params['n_estimators'] = self.best_n_iterations
self.model_params["learning_rate"] = self.learning_rates[0] #TODO change
final_model = LGBMClassifier(**self.model_params)
return final_model
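# A minimal usage sketch (illustrative only; it assumes the surrounding object has
# already populated model_params, best_n_iterations and learning_rates, and that
# X_train / y_train exist; all of these names are placeholders):
#     model = trainer.create_model()
#     model.fit(X_train, y_train)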