self.targets[output_feature['name']] = {
    'type': output_feature['type']
}
if 'weights' in output_feature:
    self.targets[output_feature['name']]['weights'] = output_feature['weights']
else:
    self.targets[output_feature['name']]['weights'] = None

X = []
for row in train_ds:
    X.append(np.array(row[0]))
X = np.array(X)

for target_col_name in self.targets:
    Y = train_ds.get_column_original_data(target_col_name)

    if self.targets[target_col_name]['type'] == COLUMN_DATA_TYPES.CATEGORICAL:
        weight_map = self.targets[target_col_name]['weights']
        if weight_map is None:
            # No class weights configured: weight every sample equally
            sample_weight = [1 for _ in Y]
        else:
            # Map each target value to its configured class weight
            sample_weight = []
            for val in Y:
                sample_weight.append(weight_map[val])

        self.targets[target_col_name]['model'] = GradientBoostingClassifier(n_estimators=600)
        self.targets[target_col_name]['model'].fit(X, Y, sample_weight=sample_weight)

    elif self.targets[target_col_name]['type'] == COLUMN_DATA_TYPES.NUMERIC:
        self.targets[target_col_name]['model'] = GradientBoostingRegressor(n_estimators=600)
        self.targets[target_col_name]['model'].fit(X, Y)

        if self.quantiles is not None:
            self.targets[target_col_name]['quantile_models'] = {}
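            # A minimal sketch of how the per-quantile models might be populated
            # (an illustrative assumption, not the verbatim source): scikit-learn's
            # GradientBoostingRegressor supports loss='quantile' with alpha set to
            # the desired quantile.
            for i, quantile in enumerate(self.quantiles):
                self.targets[target_col_name]['quantile_models'][i] = GradientBoostingRegressor(
                    n_estimators=600, loss='quantile', alpha=quantile
                )
                self.targets[target_col_name]['quantile_models'][i].fit(X, Y)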
if self.batch_size < self.net.available_devices:
    self.batch_size = self.net.available_devices

self.awareness_criterion = torch.nn.MSELoss()

if self.criterion_arr is None:
    self.criterion_arr = []
    self.unreduced_criterion_arr = []

    if ds.output_weights is not None and ds.output_weights is not False:
        output_weights = torch.Tensor(ds.output_weights).to(self.net.device)
    else:
        output_weights = None

    for k, output_type in enumerate(self.out_types):
        if output_type == COLUMN_DATA_TYPES.CATEGORICAL:
            if output_weights is None:
                weights_slice = None
            else:
                weights_slice = output_weights[ds.out_indexes[k][0]:ds.out_indexes[k][1]]

            self.criterion_arr.append(TransformCrossEntropyLoss(weight=weights_slice))
            self.unreduced_criterion_arr.append(TransformCrossEntropyLoss(weight=weights_slice, reduce=False))
        elif output_type == COLUMN_DATA_TYPES.MULTIPLE_CATEGORICAL:
            if output_weights is None:
                weights_slice = None
            else:
                weights_slice = output_weights[ds.out_indexes[k][0]:ds.out_indexes[k][1]]

            self.criterion_arr.append(torch.nn.BCEWithLogitsLoss(weight=weights_slice))
            self.unreduced_criterion_arr.append(torch.nn.BCEWithLogitsLoss(weight=weights_slice, reduce=False))
        elif output_type == COLUMN_DATA_TYPES.NUMERIC:
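            # A minimal sketch of the numeric case (an assumption, not the verbatim
            # source), mirroring the reduced/unreduced criterion pattern above with
            # torch's MSE loss:
            self.criterion_arr.append(torch.nn.MSELoss())
            self.unreduced_criterion_arr.append(torch.nn.MSELoss(reduce=False))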
def type_map(col_name):
    col_pd_type = from_data[col_name].dtype
    col_pd_type = str(col_pd_type)

    if col_pd_type in ['int64', 'float64', 'timedelta']:
        return COLUMN_DATA_TYPES.NUMERIC
    elif col_pd_type in ['bool', 'category']:
        return COLUMN_DATA_TYPES.CATEGORICAL
    else:
        # If the number of unique values is less than 100, or less than
        # 10% of the total number of rows, keep the column as categorical
        unique = from_data[col_name].nunique()
        if unique < 100 or unique < len(from_data[col_name]) / 10:
            return COLUMN_DATA_TYPES.CATEGORICAL
        # else assume it's text
        return COLUMN_DATA_TYPES.TEXT
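# A minimal usage sketch for type_map (assumes `from_data` is a pandas DataFrame,
# as the function's body implies; the column names and values below are
# illustrative, not from the source):
import pandas as pd

from_data = pd.DataFrame({
    'age': [23, 31, 45, 52],                 # int64  -> NUMERIC
    'is_active': [True, False, True, True],  # bool   -> CATEGORICAL
    'notes': ['a', 'b', 'c', 'd'],           # object -> falls through to the uniqueness heuristic
})

column_types = {col_name: type_map(col_name) for col_name in from_data.columns}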
def prepare_encoder(self, priming_data, training_data=None):
    if self._prepared:
        raise Exception('You can only call "prepare_encoder" once for a given encoder.')

    priming_data = [x if x is not None else '' for x in priming_data]

    self._max_len = min(max([len(x) for x in priming_data]), self._model_max_len)
    self._tokenizer = self._tokenizer_class.from_pretrained(self._pretrained_model_name)
    self._pad_id = self._tokenizer.convert_tokens_to_ids([self._tokenizer.pad_token])[0]
    # @TODO: Support multiple targets if they are all categorical,
    # or train for the categorical target if it's a mix (maybe?)

    # @TODO: Attach a language modeling head and/or use GPT2
    # and/or provide outputs better suited to an LM head (which will be the mixer) if the output is text
    if (training_data is not None and 'targets' in training_data
            and len(training_data['targets']) == 1
            and training_data['targets'][0]['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL):
        self._model_type = 'classifier'
        self._model = self._classifier_model_class.from_pretrained(
            self._pretrained_model_name,
            num_labels=len(set(training_data['targets'][0]['unencoded_output'])) + 1
        ).to(self.device)

        batch_size = 10

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self._model.named_parameters()
                        if not any(nd in n for nd in no_decay)], 'weight_decay': 0.000001},
            {'params': [p for n, p in self._model.named_parameters()
                        if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

        optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(
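            # The remainder of this call is an illustrative assumption: transformers'
            # get_linear_schedule_with_warmup expects the optimizer plus warmup and
            # total training step counts (the values below are hypothetical).
            optimizer,
            num_warmup_steps=0,
            num_training_steps=(len(priming_data) // batch_size) * 4  # assuming ~4 epochs
        )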