Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_sparse(self):
import pandas as pd
X = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 1, 2] * 100)),
"B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
"C": pd.SparseArray(np.random.permutation([True, False] * 150))})
y = pd.Series(pd.SparseArray(np.random.permutation([0, 1] * 150)))
X_test = pd.DataFrame({"A": pd.SparseArray(np.random.permutation([0, 2] * 30)),
"B": pd.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
"C": pd.SparseArray(np.random.permutation([True, False] * 30))})
if pd.__version__ >= '0.24.0':
for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
self.assertTrue(pd.api.types.is_sparse(dtype))
gbm = lgb.sklearn.LGBMClassifier(n_estimators=10).fit(X, y)
pred_sparse = gbm.predict(X_test, raw_score=True)
if hasattr(X_test, 'sparse'):
pred_dense = gbm.predict(X_test.sparse.to_dense(), raw_score=True)
@unittest.skipIf(not lgb.compat.PANDAS_INSTALLED, 'pandas is not installed')
def test_pandas_categorical(self):
import pandas as pd
np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat)
X = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'c', 'd'] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(['z', 'y', 'x', 'w', 'v'] * 60),
ordered=True)}) # str and ordered categorical
y = np.random.permutation([0, 1] * 150)
X_test = pd.DataFrame({"A": np.random.permutation(['a', 'b', 'e'] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(pd.np.random.permutation(['z', 'y'] * 30),
ordered=True)})
hist, bins = gbm.get_split_value_histogram(0, bins=7)
self.assertEqual(len(hist), 7)
self.assertEqual(len(bins), 8)
hist_idx, bins_idx = gbm.get_split_value_histogram(0)
hist_name, bins_name = gbm.get_split_value_histogram(gbm.feature_name()[0])
np.testing.assert_array_equal(hist_idx, hist_name)
np.testing.assert_allclose(bins_idx, bins_name)
hist_idx, bins_idx = gbm.get_split_value_histogram(X.shape[-1] - 1)
hist_name, bins_name = gbm.get_split_value_histogram(gbm.feature_name()[X.shape[-1] - 1])
np.testing.assert_array_equal(hist_idx, hist_name)
np.testing.assert_allclose(bins_idx, bins_name)
# test bins string type
if np.__version__ > '1.11.0':
hist_vals, bin_edges = gbm.get_split_value_histogram(0, bins='auto')
hist = gbm.get_split_value_histogram(0, bins='auto', xgboost_style=True)
if lgb.compat.PANDAS_INSTALLED:
mask = hist_vals > 0
np.testing.assert_array_equal(hist_vals[mask], hist['Count'].values)
np.testing.assert_allclose(bin_edges[1:][mask], hist['SplitValue'].values)
else:
mask = hist_vals > 0
np.testing.assert_array_equal(hist_vals[mask], hist[:, 1])
np.testing.assert_allclose(bin_edges[1:][mask], hist[:, 0])
# test histogram is disabled for categorical features
self.assertRaises(lgb.basic.LightGBMError, gbm.get_split_value_histogram, 2)
# coding: utf-8
import lightgbm as lgb
import pandas as pd
if lgb.compat.MATPLOTLIB_INSTALLED:
import matplotlib.pyplot as plt
else:
raise ImportError('You need to install matplotlib for plot_example.py.')
print('Loading data...')
# load or create your dataset
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)