import pickle

from imblearn.pipeline import Pipeline  # imblearn's Pipeline allows a sampler step
from sklearn import feature_selection
from sklearn.model_selection import KFold


def training_imbalance_kf(X_, Y_, TFIDF_, IMB_, FS_, pers_, CLF_, name_, model_path):
    """Fits the TF-IDF -> resampler -> feature-selection -> classifier
    pipeline on each of 10 folds and pickles the fitted model."""
    transform = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([('tfidf', TFIDF_), ('imba', IMB_), ('fs', transform), ('clf', CLF_)])
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(X_):
        X_train, X_test = X_[train_index], X_[test_index]
        y_train, y_test = Y_[train_index], Y_[test_index]
        clf_model.set_params(fs__percentile=pers_).fit(X_train, y_train)
        # Note: the same file name is reused, so only the last fold's model survives.
        pickle.dump(clf_model, open(model_path + name_ + '.sav', 'wb'))
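A minimal usage sketch (not from the original source), wiring standard scikit-learn and imbalanced-learn components into training_imbalance_kf; the toy data and all argument values are illustrative assumptions.

import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.svm import LinearSVC

# 20 toy documents with a mild class imbalance (12 vs. 8).
docs = np.array(['bug report number %d' % i for i in range(12)] +
                ['feature request number %d' % i for i in range(8)])
labels = np.array([0] * 12 + [1] * 8)
training_imbalance_kf(docs, labels, TfidfVectorizer(), SMOTE(k_neighbors=3),
                      chi2, pers_=50, CLF_=LinearSVC(), name_='svc_demo',
                      model_path='./')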
from pickle import dump

# Note: the original def line is truncated in the source; the name and
# signature below are reconstructed from the docstring.
def training_imbalance(descr_series, classes_codes, TFIDF_, IMB_, FS_,
                       req_percentage, CLF_, model_path):
    """Trains a model with the given settings and saves it as a .sav object.

    Parameters:
        descr_series (Series): description series;
        classes_codes (Series): series with the classes' codes;
        TFIDF_: vectorizer;
        IMB_: SMOTE method;
        FS_: term-ranking method;
        req_percentage (int): percentage to be taken from the ranked list;
        CLF_: classifier;
        model_path (str): the path to the model.
    """
    transformer = feature_selection.SelectPercentile(FS_)
    clf_model = Pipeline([('tfidf', TFIDF_), ('imba', IMB_),
                          ('fs', transformer), ('clf', CLF_)])
    clf_model.set_params(fs__percentile=req_percentage).fit(descr_series, classes_codes)
    dump(clf_model, open(model_path + '.sav', 'wb'))
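For completeness, a sketch of the matching load step; the file name and input series are illustrative placeholders.

from pickle import load

# Restore a previously saved pipeline and reuse it end-to-end: the pickled
# object includes the vectorizer, resampler, and feature selector.
clf_model = load(open('my_model' + '.sav', 'rb'))
predictions = clf_model.predict(new_descr_series)  # fresh description texts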
    def stack_predict(self, df, holdout, pipes, amount=2):
        X, y = self.split_x_y(df)
        X_test, y_test = self.split_x_y(holdout)
        # Reuse the preprocessing steps (everything but the final estimator)
        # of the best-ranked pipeline.
        pipe = Pipeline(self.top_pipeline(pipes).steps[:-1])
        X = pipe.fit_transform(X)
        X_test = pipe.transform(X_test)
        # Collect the final estimators of the top `amount` pipelines for stacking.
        estimators = []
        for i in range(amount):
            estimators.append((str(i), self.top_pipeline(pipes, i).steps[-1][1]))
        regression = False
        if self.METRIC in [
            "explained_variance",
            "neg_mean_absolute_error",
            "neg_mean_squared_error",
            "neg_mean_squared_log_error",
            "neg_median_absolute_error",
            # list truncated in the source; remaining regression scorers omitted
        ]:
            regression = True
        # ...rest of the method truncated in the source
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score


def error_estimation(x_matrix, y_vector, param_grid, cv=None, scoring=None):
    # Nested CV: the inner GridSearchCV tunes the resampler/classifier
    # pipeline, the outer cross_val_score estimates generalization error.
    pipeline = Pipeline([('resampler', None), ('classifier', DummyClassifier())])
    grid_search_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring=scoring)
    return cross_val_score(grid_search_cv, x_matrix, y_vector)
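A sketch of a param_grid this helper could consume, on synthetic data; the 'resampler'/'classifier' keys address the pipeline steps defined above, and setting a step name as a parameter swaps in a whole estimator.

from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

param_grid = [{
    'resampler': [None, RandomUnderSampler(random_state=0)],
    'classifier': [LogisticRegression(max_iter=1000)],
    'classifier__C': [0.1, 1.0, 10.0],
}]
X_demo, y_demo = make_classification(n_samples=200, weights=[0.9], random_state=0)
scores = error_estimation(X_demo, y_demo, param_grid, cv=3, scoring='f1')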
"n_estimators must be an integer, "
"got {}.".format(type(self.n_estimators))
)
if self.n_estimators <= 0:
raise ValueError(
"n_estimators must be greater than zero, "
"got {}.".format(self.n_estimators)
)
if self.base_estimator is not None:
base_estimator = clone(self.base_estimator)
else:
base_estimator = clone(default)
self.base_estimator_ = Pipeline(
[
(
"sampler",
RandomUnderSampler(
sampling_strategy=self.sampling_strategy,
replacement=self.replacement,
),
),
("classifier", base_estimator),
]
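The fragment above appears to come from imbalanced-learn's BalancedBaggingClassifier; a sketch of the public API that triggers it, on synthetic data:

from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.datasets import make_classification

X_bb, y_bb = make_classification(n_samples=300, weights=[0.9], random_state=0)
# fit() runs the validation above and builds the sampler+classifier pipeline.
clf = BalancedBaggingClassifier(n_estimators=10, random_state=0).fit(X_bb, y_bb)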
def make_pipeline(*steps, **kwargs):
    # Def line reconstructed: the snippet starts inside the docstring of
    # imblearn's make_pipeline helper.
    """Construct a Pipeline from the given estimators (samplers allowed).

    >>> make_pipeline(StandardScaler(), GaussianNB(priors=None))
    ... # doctest: +NORMALIZE_WHITESPACE
    Pipeline(memory=None,
             steps=[('standardscaler',
                     StandardScaler(copy=True, with_mean=True, with_std=True)),
                    ('gaussiannb',
                     GaussianNB(priors=None, var_smoothing=1e-09))],
             verbose=False)
    """
    memory = kwargs.pop("memory", None)
    verbose = kwargs.pop("verbose", False)
    if kwargs:
        raise TypeError(
            'Unknown keyword arguments: "{}"'.format(list(kwargs.keys())[0])
        )
    return Pipeline(
        pipeline._name_estimators(steps), memory=memory, verbose=verbose
    )
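The point of this imblearn variant over scikit-learn's make_pipeline is that samplers are accepted as steps; a short sketch:

from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

# Steps are auto-named after their classes: 'smote' and 'logisticregression'.
pipe = make_pipeline(SMOTE(random_state=0), LogisticRegression(max_iter=1000))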
def create_classifier(parameters):
    # Function name and dict opening reconstructed; the snippet starts mid-dict.
    classifiers = {
        'kneighborsclassifier': KNeighborsClassifier(),
        'decisiontreeclassifier': DecisionTreeClassifier(),
        'nusvc': NuSVC(),
        'randomforestclassifier': RandomForestClassifier()
    }
    # Look up the requested classifier and apply its hyper-parameters.
    classifier = classifiers[parameters['classifier'].lower()]
    classifier_params = get_classifier_params(parameters)
    classifier.set_params(**classifier_params)
    print(classifier)
    resampler = sampler_factory.create_sampler(
        parameters['resampler'], Constants.DOCUMENT_CLASSIFIER_SEED)
    return Pipeline([('resampler', resampler), ('classifier', classifier)])
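A hypothetical parameters dict this factory could consume; the key names follow the lookups in the body, while the resampler value depends on what sampler_factory.create_sampler accepts.

parameters = {
    'classifier': 'RandomForestClassifier',  # matched case-insensitively
    'resampler': 'smote',                    # interpreted by sampler_factory
    # classifier hyper-parameters would be extracted by get_classifier_params()
}
pipeline = create_classifier(parameters)  # function name as reconstructed above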