Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
columnless_prediction_distribution[output_column][input_column] = col_missing_output_histogram
# @TODO should we go back to generating this information based on the buckets of the input columns? Or just keep doing the stats generation for the input columns based on the indexes of the buckets for the output column?
for output_column in output_columns:
buckets_stats[output_column] = {}
bucket_indexes = {}
for index,row in full_dataset.iterrows():
value = row[output_column]
if 'percentage_buckets' in stats[output_column]:
percentage_buckets = stats[output_column]['percentage_buckets']
else:
percentage_buckets = None
value_bucket = get_value_bucket(value, percentage_buckets, stats[output_column], self.transaction.hmd)
if value_bucket not in bucket_indexes:
bucket_indexes[value_bucket] = []
bucket_indexes[value_bucket].append(index)
for bucket in bucket_indexes:
buckets_stats[output_column][bucket] = {}
input_data = TransactionData()
input_data.data_frame = full_dataset.loc[bucket_indexes[bucket]]
input_data.columns = input_data.columns
stats_generator = StatsGenerator(session=None, transaction=self.transaction)
try:
with disable_console_output():
col_buckets_stats = stats_generator.run(input_data=input_data, modify_light_metadata=False, print_logs=False)
buckets_stats[output_column][bucket].update(col_buckets_stats)
except Exception as e:
if col_missing_output_stats is None:
pass
elif 'histogram' in col_missing_output_stats[output_column]:
columnless_prediction_distribution[output_column][input_column] = col_missing_output_stats[output_column]['histogram']
# If this column is either very important or not important at all, compute stats for each of the buckets (in the validation data)
if column_importance > 0.8 or column_importance < 0.2:
split_data = {}
for value in full_dataset[input_column]:
if 'percentage_buckets' in stats[input_column]:
bucket = stats[input_column]['percentage_buckets']
else:
bucket = None
vb = get_value_bucket(value, bucket, stats[input_column])
if f'{input_column}_bucket_{vb}' not in split_data:
split_data[f'{input_column}_bucket_{vb}'] = []
split_data[f'{input_column}_bucket_{vb}'].append(value)
row_wise_data = []
max_length = max(list(map(len, split_data.values())))
columns = []
for i in range(max_length):
row_wise_data.append([])
for k in split_data.keys():
# If the sub-bucket has 6 or fewer values, it's not relevant
if len(split_data[k]) > 6:
columns.append(k)
if len(split_data[k]) > i:
:param real_value: The real value/label for this prediction
:param predicted_value: The predicted value/label
:param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
"""
try:
predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
except:
predicted_value = None
try:
real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
except:
real_value = None
if self.buckets is not None:
predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats, hmd)
real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats, hmd)
X = [False] * (len(self.buckets) + 1)
X[predicted_value_b] = True
X = X + features_existence
self._X_buff.append(X)
self._Y_buff.append(real_value_b)
self._real_buckets_buff = self._Y_buff
self._predicted_buckets_buff.append(predicted_value_b)
if is_original_data:
self._original_real_buckets_buff.append(real_value_b)
self._original_predicted_buckets_buff.append(predicted_value_b)
# If no column is ignored, compute the accuracy for this bucket
def evaluate_prediction_accuracy(self, features_existence, predicted_value):
    """
    Estimate the probability that a prediction is accurate, using the fitted probabilistic model.

    :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
    :param predicted_value: The predicted value/label
    :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
    """
    if self.buckets is not None:
        predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
        # One-hot encode the predicted bucket (len(buckets) + 1 slots), then
        # append the feature-existence flags to form a single input row.
        X = [False] * (len(self.buckets) + 1)
        X[predicted_value_b] = True
        X = [X + features_existence]
    else:
        # Without buckets the model only sees the feature-existence flags.
        X = [features_existence]

    # Silence divide warnings only for the duration of the call. The context
    # manager restores the previous NumPy error settings even when
    # predict_proba raises, which the manual seterr save/restore did not.
    with np.errstate(divide='ignore'):
        distribution = self._probabilistic_model.predict_proba(np.array(X))

    if self.buckets is not None:
        return ProbabilityEvaluation(self.buckets, distribution[0].tolist(), predicted_value).most_likely_probability
    else:
        # Second entry of the first (only) row of the predicted distribution.
        return distribution[0][1]
"""
# Register an observation in the validator's internal buffers
:param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
:param real_value: The real value/label for this prediction
:param predicted_value: The predicted value/label
:param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
"""
predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
try:
real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
except:
real_value = None
if self.buckets is not None:
predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)
X = [False] * (len(self.buckets) + 1)
X[predicted_value_b] = True
X = X + features_existence
self.X_buff.append(X)
self.Y_buff.append(real_value_b)
else:
predicted_value_b = predicted_value
real_value_b = real_value
self.X_buff.append(features_existence)
self.Y_buff.append(real_value_b == predicted_value_b)
def evaluate_prediction_accuracy(self, features_existence, predicted_value):
    """
    Use the fitted probabilistic model to estimate how likely a prediction is to be accurate.

    :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
    :param predicted_value: The predicted value/label
    :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
    """
    has_buckets = self.buckets is not None

    if has_buckets:
        predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
        # Build the model input row: a one-hot vector marking the predicted
        # bucket (len(buckets) + 1 slots) followed by the existence flags.
        X = [False] * (len(self.buckets) + 1)
        X[predicted_value_b] = True
        X = [X + features_existence]
    else:
        # No buckets: the input row is just the existence flags.
        X = [features_existence]

    # np.errstate scopes the "ignore divide" setting to this call and restores
    # the prior settings on exit, including when predict_proba raises; the
    # previous manual np.seterr save/restore leaked 'ignore' on exceptions.
    with np.errstate(divide='ignore'):
        distribution = self._probabilistic_model.predict_proba(np.array(X))

    if has_buckets:
        return ProbabilityEvaluation(self.buckets, distribution[0].tolist(), predicted_value).most_likely_probability
    # Second entry of the first (only) row of the predicted distribution.
    return distribution[0][1]
:param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
:param real_value: The real value/label for this prediction
:param predicted_value: The predicted value/label
:param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
"""
nr_missing_features = len([x for x in features_existence if x is False or x is 0])
predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
try:
real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
except:
real_value = None
if self.buckets is not None:
predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)
X = [False] * (len(self.buckets) + 1)
X[predicted_value_b] = True
X = X + features_existence
self.X_buff.append(X)
self.Y_buff.append(real_value_b)
# If no column is ignored, compute the accuracy for this bucket
if nr_missing_features == 0:
if predicted_value_b not in self.bucket_accuracy:
self.bucket_accuracy[predicted_value_b] = []
self.bucket_accuracy[predicted_value_b].append(int(real_value_b == predicted_value_b))
else:
predicted_value_b = predicted_value
real_value_b = real_value
self.X_buff.append(features_existence)
:param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
:param real_value: The real value/label for this prediction
:param predicted_value: The predicted value/label
:param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
"""
nr_missing_features = len([x for x in features_existence if x is False or x is 0])
predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
try:
real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
except:
real_value = None
if self.buckets is not None:
predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)
X = [False] * (len(self.buckets) + 1)
X[predicted_value_b] = True
X = X + features_existence
self.X_buff.append(X)
self.Y_buff.append(real_value_b)
# If no column is ignored, compute the accuracy for this bucket
if nr_missing_features == 0:
if predicted_value_b not in self.bucket_accuracy:
self.bucket_accuracy[predicted_value_b] = []
self.bucket_accuracy[predicted_value_b].append(int(real_value_b == predicted_value_b))
else:
predicted_value_b = predicted_value
real_value_b = real_value
self.X_buff.append(features_existence)
self.Y_buff.append(real_value_b == predicted_value_b)
# Register an observation in the validator's internal buffers
:param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
:param real_value: The real value/label for this prediction
:param predicted_value: The predicted value/label
:param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
"""
predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
try:
real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
except:
real_value = None
if self.buckets is not None:
predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats)
real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats)
X = [False] * (len(self.buckets) + 1)
X[predicted_value_b] = True
X = X + features_existence
self.X_buff.append(X)
self.Y_buff.append(real_value_b)
else:
predicted_value_b = predicted_value
real_value_b = real_value
self.X_buff.append(features_existence)
self.Y_buff.append(real_value_b == predicted_value_b)
:param predicted_value: The predicted value/label
:param histogram: The histogram for the predicted column, which allows us to bucketize the `predicted_value` and `real_value`
"""
try:
predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(predicted_value)
except:
predicted_value = None
try:
real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(str(real_value).replace(',','.'))
except:
real_value = None
if self.buckets is not None:
predicted_value_b = get_value_bucket(predicted_value, self.buckets, self.col_stats, hmd)
real_value_b = get_value_bucket(real_value, self.buckets, self.col_stats, hmd)
X = [False] * (len(self.buckets) + 1)
X[predicted_value_b] = True
X = X + features_existence
self._X_buff.append(X)
self._Y_buff.append(real_value_b)
self._real_buckets_buff = self._Y_buff
self._predicted_buckets_buff.append(predicted_value_b)
if is_original_data:
self._original_real_buckets_buff.append(real_value_b)
self._original_predicted_buckets_buff.append(predicted_value_b)
# If no column is ignored, compute the accuracy for this bucket
nr_missing_features = len([x for x in features_existence if x is False or x is 0])