dataset.dropna(inplace=True)
# summarize the number of rows and columns in the dataset after listwise drop
(sample, vnum) = dataset.shape
print(sample, vnum)
# Number of predictor variables (the last column holds the DV)
vnum = vnum - 1
# Slice into IVs and DV
values = dataset.values
X = values[:, 0:vnum]
y = values[:, vnum]
# Oversampling
ros = RandomOverSampler(random_state=0)
X_R, y_R = ros.fit_resample(X, y)  # fit_resample replaced the removed fit_sample API
# create model
model = Sequential()
model.add(Dense(12, input_dim=vnum, kernel_initializer='uniform', activation='relu'))
model.add(Dense(8, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(X_R, y_R, epochs=150, batch_size=10, verbose=2)
# calculate predictions
predictions = model.predict(X)
# round predictions
rounded = [round(x[0]) for x in predictions]
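# NOTE (added sketch, not part of the original snippet): a quick, self-contained
# check of what RandomOverSampler does to the class balance; the toy data below
# is synthetic.
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

X_demo, y_demo = make_classification(weights=[0.9, 0.1], random_state=0)
print(Counter(y_demo))   # imbalanced, roughly {0: 90, 1: 10}
X_bal, y_bal = RandomOverSampler(random_state=0).fit_resample(X_demo, y_demo)
print(Counter(y_bal))    # balanced: minority rows duplicated up to the majority count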
_ = save_dataset(args.data_dir, X=X_test, y=Y_test,
                 train=False, balanced=False)
print('UPSAMPLING ORIGINAL DATASET TO IMPROVE IMBALANCES\n')
class_weighting = cw.compute_class_weight('balanced',
                                          classes=np.unique(y),
                                          y=y)
print(f'ORIGINAL CLASS BALANCE: {class_weighting}')
# save original dimensions (except nr of samples) for reshaping later
X_train_shape = list(X_train.shape[1:])
X_test_shape = list(X_test.shape[1:])
# Do the oversampling
ros = RandomOverSampler(sampling_strategy='auto')  # 'ratio' was renamed to 'sampling_strategy'
X_train_balanced, Y_train_balanced = ros.fit_resample(
    X=np.reshape(X_train, [X_train.shape[0], -1]),
    y=Y_train)
X_test_balanced, Y_test_balanced = ros.fit_resample(
    X=np.reshape(X_test, [X_test.shape[0], -1]),
    y=Y_test)
# Reshape into original dimensions
X_train_balanced = np.reshape(X_train_balanced,
                              [len(X_train_balanced)] + X_train_shape)
X_test_balanced = np.reshape(X_test_balanced,
                             [len(X_test_balanced)] + X_test_shape)
class_weight = cw.compute_class_weight('balanced',
                                       classes=np.unique(Y_train_balanced),
                                       y=Y_train_balanced)
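# NOTE (added assumption, not part of the source): compute_class_weight returns
# an array ordered like np.unique(...); frameworks such as Keras expect a
# {class: weight} dict, so a typical next step is:
class_weight_dict = dict(zip(np.unique(Y_train_balanced), class_weight))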
def over_sample_random(train_inputs, train_targets):
    sampler = RandomOverSampler(random_state=32)
    train_inputs, train_targets = _sampler_helper(sampler, train_inputs, train_targets)
    return train_inputs, train_targets
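# NOTE (added sketch): _sampler_helper is not shown in this excerpt; a minimal
# plausible implementation, assuming it simply delegates to the sampler, is:
def _sampler_helper(sampler, inputs, targets):
    # fit_resample returns re-sampled copies of the inputs and targets
    return sampler.fit_resample(inputs, targets)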
PpSolution('FR-SVD', 'Truncated SVD', TransformerTruncatedSVD, default_truncated_svd, space_truncated_svd,
           'feature', limit_size=50),
PpSolution('FR-ICA', 'Fast ICA', TransformerFastICA, default_fast_ica, space_fast_ica, 'feature', limit_size=50),
PpSolution('FR-PCA', 'PCA', TransformerPCA, default_pca, space_pca, 'feature', limit_size=50),
# feature selection from model
PpSolution('FR-RFR', 'Selection RF', TransformerSelectionRfR, default_sel_rf, space_sel_rf, 'feature',
           problem_type='regression'),
PpSolution('FR-RFC', 'Selection RF', TransformerSelectionRfC, default_sel_rf, space_sel_rf, 'feature',
           problem_type='classification'),
PpSolution('FR-LR', 'Selection LSVR', TransformerSelectionLinearSVR, {}, {}, 'feature', problem_type='regression'),
# sampling solutions
PpSolution('SP-PASS', 'No re-sampling', NoSampling, {}, {}, 'sampling'),
PpSolution('SP-ROS', 'Random Over', RandomOverSampler, {}, {}, 'sampling'),
PpSolution('SP-SMOTE', 'SMOTE', SMOTE, {}, {}, 'sampling'),
]
# mapping table
pp_solutions_map = {s.ref: s for s in pp_solutions}
# default pre-processing lists
pp_def_lgbm = ['MS-FIXED', 'FL-PASS', 'DT-DT', 'CE-LAB', 'TX-W2V', 'SC-PASS', 'FR-PASS']
pp_def_trees = ['MS-FIXED', 'FL-PASS', 'DT-DT', 'CE-LAB', 'TX-W2V', 'SC-PASS', 'FR-PASS']
pp_def_knn = ['MS-FIXED', 'FL-PASS', 'DT-DT', 'CE-HOT', 'TX-W2V', 'SC-STD', 'FR-PASS']
pp_def_linear = ['MS-FIXED', 'FL-LOG', 'DT-DT', 'CE-HOT', 'TX-W2V', 'SC-ROBUST', 'FR-PASS']
pp_def_NN = ['MS-FIXED', 'FL-LOG', 'DT-DT', 'CE-HOT', 'TX-W2V', 'SC-MINMAX', 'FR-PASS']
pp_list_lgbm = ['CE-LAB', 'CE-HOT', 'CE-BASE', 'CE-HASH',
                'FL-PASS', 'FL-LOG', 'FL-SQRT',
                'DT-DT', 'DT-YMD', 'DT-MD',
def __init__(self, operator=None, sampling_strategy='auto', random_state=None):
    if operator is None:
        raise ValueError("Operator is a required argument.")
    self._hyperparams = {
        'sampling_strategy': sampling_strategy,
        'random_state': random_state}
    resampler_instance = OrigModel(**self._hyperparams)
    super(RandomOverSamplerImpl, self).__init__(
        operator=operator,
        resampler=resampler_instance)
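# NOTE (added): OrigModel is not defined in this excerpt; the wrapper above
# assumes it aliases the underlying imbalanced-learn estimator, roughly:
# from imblearn.over_sampling import RandomOverSampler as OrigModel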
print(__doc__)
# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=200, random_state=10)
# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise it inside a 2D feature space
X_vis = pca.fit_transform(X)
# Apply the random over-sampling
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)
X_res_vis = pca.transform(X_resampled)
# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)
c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=0.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=0.5)
def transform(self, X, y=None):
    """Transform the dataframe."""
    # TODO how do we validate this happens before train/test split? Or do we need to? Can we implement it in the
    # TODO simple trainer in the correct order and leave this to advanced users?
    # Extract the predicted column
    y = np.squeeze(X[[self.predicted_column]])
    # Copy the dataframe without the predicted column
    temp_dataframe = X.drop([self.predicted_column], axis=1)
    # Initialize and fit the over-sampler
    over_sampler = RandomOverSampler(random_state=self.random_seed)
    x_over_sampled, y_over_sampled = over_sampler.fit_resample(temp_dataframe, y)
    # Build the resulting over-sampled dataframe
    result = pd.DataFrame(x_over_sampled)
    # Restore the column names
    result.columns = temp_dataframe.columns
    # Restore the y values
    y_over_sampled = pd.Series(y_over_sampled)
    result[self.predicted_column] = y_over_sampled
    return result
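# NOTE (added, self-contained sketch of the same pattern): with a pandas
# DataFrame as input, recent imbalanced-learn versions preserve the column
# names directly; the toy frame below is synthetic.
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

df = pd.DataFrame({'feature': range(6), 'label': [0, 0, 0, 0, 1, 1]})
X_os, y_os = RandomOverSampler(random_state=0).fit_resample(
    df.drop(columns=['label']), df['label'])
balanced_df = X_os.assign(label=y_os)  # 4 rows per class after over-sampling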
group_names = []
group_labels = []
group_values = []
for n, g in groups.items():
    group_names.append(n)
    group_values.append(g['indices'])
    group_labels.append(g['labels'][g['indices']])
group_values = np.transpose(np.vstack(group_values))
group_labels = np.transpose(np.vstack(group_labels))
# get unique ids for each combination of group attributes
_, profile_idx = np.unique(group_values, axis=0, return_inverse=True)
profile_labels = range(0, np.max(profile_idx) + 1)
# oversample labels
ros = RandomOverSampler(**kwargs)
X = np.array(data['X'])
Y = np.array(data['Y'])
X_res = []
Y_res = []
G_res = []
assert np.isin((-1, 1), Y).all()
for i in profile_labels:
    row_idx = np.isin(profile_idx, i)
    profile_values = group_labels[row_idx, :][0]
    Xg = X[row_idx, :]
    Yg = Y[row_idx]
    if np.isin((-1, 1), Yg).all():
        Xs, Ys = ros.fit_resample(Xg, Yg)  # fit_resample replaced the removed fit_sample
        X_res.append(Xs)
        Y_res.append(Ys)
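# NOTE (added): the excerpt stops inside the loop; a typical continuation
# (an assumption, not shown in the source) stacks the per-group results:
X_out = np.vstack(X_res)
Y_out = np.concatenate(Y_res)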
def __init__(self):
    super(UpSampling, self).__init__(RandomOverSampler(random_state=RANDOM_SEED[BALANCE_UP_SAMPLING]),
                                     BALANCE_UP_SAMPLING)