# Gradient boosting models from scikit-learn back both target types
# (import added so the snippet is self-contained)
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

for target_col_name in self.targets:
    Y = train_ds.get_column_original_data(target_col_name)

    if self.targets[target_col_name]['type'] == COLUMN_DATA_TYPES.CATEGORICAL:
        weight_map = self.targets[target_col_name]['weights']
        if weight_map is None:
            # No class weights configured: weight every row equally
            sample_weight = [1 for _ in Y]
        else:
            # Look up the configured weight for each row's class label
            sample_weight = [weight_map[val] for val in Y]

        self.targets[target_col_name]['model'] = GradientBoostingClassifier(n_estimators=600)
        self.targets[target_col_name]['model'].fit(X, Y, sample_weight=sample_weight)

    elif self.targets[target_col_name]['type'] == COLUMN_DATA_TYPES.NUMERIC:
        self.targets[target_col_name]['model'] = GradientBoostingRegressor(n_estimators=600)
        self.targets[target_col_name]['model'].fit(X, Y)

        if self.quantiles is not None:
            # One extra regressor per requested quantile, trained with the pinball loss
            self.targets[target_col_name]['quantile_models'] = {}
            for i, quantile in enumerate(self.quantiles):
                self.targets[target_col_name]['quantile_models'][i] = GradientBoostingRegressor(
                    n_estimators=600, loss='quantile', alpha=quantile)
                self.targets[target_col_name]['quantile_models'][i].fit(X, Y)
    else:
        self.targets[target_col_name]['model'] = None
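
# A minimal sketch of how a `weights` map like the one consumed above could be
# built: inverse class frequency. The helper name and the weighting scheme are
# assumptions for illustration, not the library's own implementation.
from collections import Counter

def inverse_frequency_weights(labels):
    """Map each class label to len(labels) / count(label)."""
    counts = Counter(labels)
    return {label: len(labels) / count for label, count in counts.items()}

# Example: inverse_frequency_weights(['a', 'a', 'a', 'b']) -> {'a': 1.33..., 'b': 4.0}
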
def fit_data_source(self, ds):
    # Fall back to the data source's own feature lists when none were given
    self.input_column_names = self.input_column_names \
        if self.input_column_names is not None else ds.get_feature_names('input_features')
    self.output_column_names = self.output_column_names \
        if self.output_column_names is not None else ds.get_feature_names('output_features')

    self.out_types = ds.out_types
    for n, out_type in enumerate(self.out_types):
        if out_type == COLUMN_DATA_TYPES.NUMERIC:
            # Reserve one extra encoder output per additional quantile
            ds.encoders[self.output_column_names[n]].extra_outputs = len(self.quantiles) - 1

    # Only build a fresh Transformer if the data source doesn't already carry
    # an initialized one (i.e. its feature_len_map is still empty or missing)
    transformer_already_initialized = False
    try:
        if len(list(ds.transformer.feature_len_map.keys())) > 0:
            transformer_already_initialized = True
    except AttributeError:
        pass

    if not transformer_already_initialized:
        ds.transformer = Transformer(self.input_column_names, self.output_column_names)

    self.encoders = ds.encoders
    self.transformer = ds.transformer
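
# The try/except guard above can also be written without exceptions; a minimal
# equivalent sketch, assuming only that a fitted transformer exposes a
# non-empty `feature_len_map` dict:
def transformer_is_initialized(ds):
    feature_len_map = getattr(getattr(ds, 'transformer', None), 'feature_len_map', None)
    return bool(feature_len_map)
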
ds = from_data if isinstance(from_data, DataSource) else DataSource(from_data, self.config)
predictions = self._mixer.predict(ds, include_extra_data=True)
accuracies = {}

for output_column in self._output_columns:
    real = list(map(str, ds.get_column_original_data(output_column)))
    predicted = list(map(str, predictions[output_column]['predictions']))

    weight_map = None
    if 'weights' in ds.get_column_config(output_column):
        weight_map = ds.get_column_config(output_column)['weights']

    accuracy = self.apply_accuracy_function(ds.get_column_config(output_column)['type'],
                                            real, predicted, weight_map=weight_map)

    if ds.get_column_config(output_column)['type'] == COLUMN_DATA_TYPES.NUMERIC:
        # Also try decoding in log space and keep whichever accuracy is better
        ds.encoders[output_column].decode_log = True
        predicted = ds.get_decoded_column_data(output_column, predictions[output_column]['encoded_predictions'])

        alternative_accuracy = self.apply_accuracy_function(ds.get_column_config(output_column)['type'],
                                                            real, predicted, weight_map=weight_map)

        if alternative_accuracy['value'] > accuracy['value']:
            accuracy = alternative_accuracy
        else:
            ds.encoders[output_column].decode_log = False

    accuracies[output_column] = accuracy

return accuracies
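
# A hedged sketch of an accuracy function with the shape consumed above
# (returning a dict with a 'value' key). The metric choices here are
# assumptions for illustration, not necessarily what apply_accuracy_function
# does internally.
from sklearn.metrics import accuracy_score, r2_score

def example_accuracy_function(col_type, real, predicted, weight_map=None):
    if col_type == COLUMN_DATA_TYPES.NUMERIC:
        value = r2_score([float(x) for x in real], [float(x) for x in predicted])
    else:
        sample_weight = [weight_map[r] for r in real] if weight_map is not None else None
        value = accuracy_score(real, predicted, sample_weight=sample_weight)
    return {'value': value}
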
def type_map(col_name):
    col_pd_type = from_data[col_name].dtype
    col_pd_type = str(col_pd_type)

    if col_pd_type in ['int64', 'float64', 'timedelta']:
        return COLUMN_DATA_TYPES.NUMERIC
    elif col_pd_type in ['bool', 'category']:
        return COLUMN_DATA_TYPES.CATEGORICAL
    else:
        # if the number of uniques is less than 100, or less than
        # 10% of the total number of rows, keep the column as categorical
        unique = from_data[col_name].nunique()
        if unique < 100 or unique < len(from_data[col_name]) / 10:
            return COLUMN_DATA_TYPES.CATEGORICAL
        # otherwise assume it's text
        return COLUMN_DATA_TYPES.TEXT
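
# A quick usage sketch for type_map with a toy DataFrame (the pandas import
# and sample data are assumptions for illustration):
import pandas as pd

from_data = pd.DataFrame({
    'age': [23, 31, 45],                      # int64  -> NUMERIC
    'active': [True, False, True],            # bool   -> CATEGORICAL
    'notes': ['short', 'free-form', 'text'],  # object -> falls through to the uniqueness heuristic
})
for col in from_data.columns:
    print(col, type_map(col))
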
# else:
priming_data.append(str(i) + ''.join(['n'] * i))
# priming_data.append(str(i))
priming_target.append(i)

output_1_encoder = NumericEncoder(is_target=True)
output_1_encoder.prepare_encoder(priming_target)

encoded_data_1 = output_1_encoder.encode(priming_target)
encoded_data_1 = encoded_data_1.tolist()
# (import added so the decoding below is self-contained)
import torch

enc = DistilBertEncoder()
# Condition the text encoder on two numeric targets, both encoded with the
# same NumericEncoder output
enc.prepare_encoder(priming_data,
                    training_data={'targets': [
                        {'output_type': COLUMN_DATA_TYPES.NUMERIC, 'encoded_output': encoded_data_1},
                        {'output_type': COLUMN_DATA_TYPES.NUMERIC, 'encoded_output': encoded_data_1}
                    ]})
encoded_predicted_target = enc.encode(test_data).tolist()
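# Each encoded target vector is 3-dimensional here, so the two targets
# concatenated in the prediction split at index 3: [:3] is the first target,
# [3:] is the second.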
predicted_targets_1 = output_1_encoder.decode(torch.tensor([x[:3] for x in encoded_predicted_target]))
predicted_targets_2 = output_1_encoder.decode(torch.tensor([x[3:] for x in encoded_predicted_target]))
for predicted_targets in [predicted_targets_1, predicted_targets_2]:
    real = list(test_target)
    pred = list(predicted_targets)

    # handle nan: zero out any prediction that can't be parsed as a number
    for i in range(len(pred)):
        try:
            float(pred[i])
        except (ValueError, TypeError):
            pred[i] = 0
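
# The snippet ends here; a hedged sketch of how real and pred might then be
# compared (r2_score and the threshold are assumptions for illustration, not
# necessarily the test's actual assertion):
from sklearn.metrics import r2_score

score = r2_score([float(x) for x in real], [float(x) for x in pred])
assert score > 0.5  # hypothetical threshold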