Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
parameters = info[1].split(",")
if (model_type != "svd"):
parameters = [item.split("=") for item in parameters]
parameters_dict = {}
for item in parameters:
parameters_dict[item[0]] = item[1]
info = info[0]
for elem in parameters_dict:
if type(parameters_dict[elem]) == str:
parameters_dict[elem] = parameters_dict[elem].replace("'", "")
if (model_type == "rf_regressor"):
from verticapy.learn.ensemble import RandomForestRegressor
model = RandomForestRegressor(name, cursor, int(parameters_dict['ntree']), int(parameters_dict['mtry']), int(parameters_dict['max_breadth']), float(parameters_dict['sampling_size']), int(parameters_dict['max_depth']), int(parameters_dict['min_leaf_size']), float(parameters_dict['min_info_gain']), int(parameters_dict['nbins']))
elif (model_type == "rf_classifier"):
from verticapy.learn.ensemble import RandomForestClassifier
model = RandomForestClassifier(name, cursor, int(parameters_dict['ntree']), int(parameters_dict['mtry']), int(parameters_dict['max_breadth']), float(parameters_dict['sampling_size']), int(parameters_dict['max_depth']), int(parameters_dict['min_leaf_size']), float(parameters_dict['min_info_gain']), int(parameters_dict['nbins']))
elif (model_type == "logistic_reg"):
from verticapy.learn.linear_model import LogisticRegression
model = LogisticRegression(name, cursor, parameters_dict['regularization'], float(parameters_dict['epsilon']), float(parameters_dict['lambda']), int(parameters_dict['max_iterations']), parameters_dict['optimizer'], float(parameters_dict['alpha']))
elif (model_type == "linear_reg"):
from verticapy.learn.linear_model import ElasticNet
model = ElasticNet(name, cursor, parameters_dict['regularization'], float(parameters_dict['epsilon']), float(parameters_dict['lambda']), int(parameters_dict['max_iterations']), parameters_dict['optimizer'], float(parameters_dict['alpha']))
elif (model_type == "naive_bayes"):
from verticapy.learn.naive_bayes import MultinomialNB
model = MultinomialNB(name, cursor, float(parameters_dict['alpha']))
elif (model_type == "svm_regressor"):
from verticapy.learn.svm import LinearSVR
model = LinearSVR(name, cursor, float(parameters_dict['epsilon']), float(parameters_dict['C']), True, float(parameters_dict['intercept_scaling']), parameters_dict['intercept_mode'], float(parameters_dict['error_tolerance']), int(parameters_dict['max_iterations']))
elif (model_type == "svm_classifier"):
from verticapy.learn.svm import LinearSVC
model = LinearSVC(name, cursor, float(parameters_dict['epsilon']), float(parameters_dict['C']), True, float(parameters_dict['intercept_scaling']), parameters_dict['intercept_mode'], [float(item) for item in parameters_dict['class_weights'].split(",")], int(parameters_dict['max_iterations']))
elif (model_type == "kmeans"):
rand_int = random.randint(0, 10000000)
temp_information = ("{}.VERTICAPY_TEMP_VIEW_{}".format(schema, rand_int), "{}.VERTICAPY_TEMP_MODEL_{}".format(schema, rand_int))
if (bins < 2):
raise ValueError("Parameter 'bins' must be greater or equals to 2 in case of discretization using the method 'smart'")
columns_check([response], self.parent)
response = vdf_columns_names([response], self.parent)[0]
try:
self.parent._VERTICAPY_VARIABLES_["cursor"].execute("DROP VIEW IF EXISTS {}".format(temp_information[0]))
except:
try:
self.parent._VERTICAPY_VARIABLES_["cursor"].execute("DROP MODEL IF EXISTS {}".format(temp_information[1]))
except:
pass
self.parent.to_db(temp_information[0])
from verticapy.learn.ensemble import RandomForestClassifier
model = RandomForestClassifier(temp_information[1], self.parent._VERTICAPY_VARIABLES_["cursor"], n_estimators = 20, max_depth = 3, nbins = 100, min_samples_leaf = min_bin_size)
model.fit(temp_information[0], [self.alias], response)
query = ["(SELECT READ_TREE(USING PARAMETERS model_name = '{}', tree_id = {}, format = 'tabular'))".format(temp_information[1], i) for i in range(20)]
query = "SELECT split_value FROM (SELECT split_value, COUNT(*) FROM ({}) x WHERE split_value IS NOT NULL GROUP BY 1 ORDER BY 2 DESC LIMIT {}) y ORDER BY split_value::float".format(" UNION ALL ".join(query), bins - 1)
self.parent.__executeSQL__(query = query, title = "Computes the optimized histogram bins using Random Forest.")
result = self.parent._VERTICAPY_VARIABLES_["cursor"].fetchall()
result = [elem[0] for elem in result]
self.parent._VERTICAPY_VARIABLES_["cursor"].execute("DROP VIEW IF EXISTS {}".format(temp_information[0]))
self.parent._VERTICAPY_VARIABLES_["cursor"].execute("DROP MODEL IF EXISTS {}".format(temp_information[1]))
result = [self.min()] + result + [self.max()]
elif (method == "topk"):
if (k < 2):
raise ValueError("Parameter 'k' must be greater or equals to 2 in case of discretization using the method 'topk'")
distinct = self.topk(k).values["index"]
trans = ("(CASE WHEN {} IN ({}) THEN {} || '' ELSE '{}' END)".format(convert_special_type(self.category(), False), ', '.join(["'{}'".format(str(elem).replace("'", "''")) for elem in distinct]), convert_special_type(self.category(), False), new_category.replace("'", "''")), "varchar", "text")
elif (self.isnum() and method == "same_freq"):
if (bins < 2):
def DummyTreeClassifier(name: str,
                        cursor = None):
    """
    ---------------------------------------------------------------------------
    Dummy Tree Classifier. This classifier is meant to learn the training data
    by heart.
    => a RandomForestClassifier made of a single, very deep tree which uses
       all the data.

    Parameters
    ----------
    name: str
        Name of the model. The model will be stored in the DB.
    cursor: DBcursor, optional
        Vertica DB cursor.

    Returns
    -------
    The object produced by RandomForestClassifier for the fixed settings
    listed in the body.
    """
    # Fixed hyper-parameters of the "dummy" tree: one estimator, full sample,
    # and depth / breadth / bin limits set high so the tree can grow freely.
    fixed_settings = {
        "n_estimators": 1,
        "max_features": "max",
        "max_leaf_nodes": 1e9,
        "sample": 1.0,
        "max_depth": 100,
        "min_samples_leaf": 1,
        "min_info_gain": 0.0,
        "nbins": 1000,
    }
    # Only the model name and the cursor are chosen by the caller.
    return RandomForestClassifier(name = name,
                                  cursor = cursor,
                                  **fixed_settings)
#---#
The maximum number of leaf nodes a tree in the forest can have, an integer
between 1 and 1e9, inclusive.
max_depth: int, optional
The maximum depth for growing each tree, an integer between 1 and 100, inclusive.
min_samples_leaf: int, optional
The minimum number of samples each branch must have after splitting a node, an
integer between 1 and 1e6, inclusive. A split that causes fewer remaining samples
is discarded.
min_info_gain: float, optional
The minimum threshold for including a split, a float between 0.0 and 1.0, inclusive.
A split with information gain less than this threshold is discarded.
nbins: int, optional
The number of bins to use for continuous features, an integer between 2 and 1000,
inclusive.
"""
return RandomForestClassifier(name = name,
cursor = cursor,
n_estimators = 1,
max_features = max_features,
max_leaf_nodes = max_leaf_nodes,
sample = 1.0,
max_depth = max_depth,
min_samples_leaf = min_samples_leaf,
min_info_gain = min_info_gain,
nbins = nbins)
#---#