def fit(self, dataset_true, dataset_pred):
"""Compute parameters for equalizing generalized odds using true and
predicted scores, while preserving calibration.
Args:
dataset_true (BinaryLabelDataset): Dataset containing true `labels`.
dataset_pred (BinaryLabelDataset): Dataset containing predicted
`scores`.
Returns:
CalibratedEqOddsPostprocessing: Returns self.
"""
# Create boolean conditioning vectors for protected groups
cond_vec_priv = utils.compute_boolean_conditioning_vector(
dataset_pred.protected_attributes,
dataset_pred.protected_attribute_names,
self.privileged_groups)
cond_vec_unpriv = utils.compute_boolean_conditioning_vector(
dataset_pred.protected_attributes,
dataset_pred.protected_attribute_names,
self.unprivileged_groups)
cm = ClassificationMetric(dataset_true, dataset_pred,
unprivileged_groups=self.unprivileged_groups,
privileged_groups=self.privileged_groups)
self.base_rate_priv = cm.base_rate(privileged=True)
self.base_rate_unpriv = cm.base_rate(privileged=False)
# Create a dataset with "trivial" predictions
dataset_trivial = dataset_pred.copy(deepcopy=True)
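# ---------------------------------------------------------------------------
# Usage sketch (not part of the library source): a minimal, self-contained
# example of driving the fit/predict pair above. The toy DataFrame, column
# names, and parameter choices below are illustrative assumptions only.
import numpy as np
import pandas as pd
from aif360.datasets import BinaryLabelDataset
from aif360.algorithms.postprocessing import CalibratedEqOddsPostprocessing

df = pd.DataFrame({'sex':   [1, 1, 1, 1, 0, 0, 0, 0],
                   'feat':  [0.2, 0.7, 0.5, 0.9, 0.1, 0.6, 0.4, 0.8],
                   'label': [1, 1, 0, 1, 0, 1, 0, 0]})
dataset_true = BinaryLabelDataset(df=df, label_names=['label'],
                                  protected_attribute_names=['sex'])

# Pretend these scores came from an upstream classifier.
dataset_pred = dataset_true.copy(deepcopy=True)
dataset_pred.scores = np.array([[0.8, 0.6, 0.55, 0.9, 0.3, 0.4, 0.6, 0.2]]).T
dataset_pred.labels = (dataset_pred.scores >= 0.5).astype(np.float64)

cpp = CalibratedEqOddsPostprocessing(privileged_groups=[{'sex': 1}],
                                     unprivileged_groups=[{'sex': 0}],
                                     cost_constraint='fnr', seed=42)
cpp = cpp.fit(dataset_true, dataset_pred)
dataset_transf = cpp.predict(dataset_pred)
# The next fragment (the super().__init__ call and group checks) appears to be
# from DatasetMetric.__init__, whose parameters supply `dataset`,
# `privileged_groups`, and `unprivileged_groups`.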
# sets self.dataset
super(DatasetMetric, self).__init__(dataset)
# TODO: should this deepcopy?
self.privileged_groups = privileged_groups
self.unprivileged_groups = unprivileged_groups
# don't check if nothing was provided
if not self.privileged_groups or not self.unprivileged_groups:
return
priv_mask = utils.compute_boolean_conditioning_vector(
self.dataset.protected_attributes,
self.dataset.protected_attribute_names, self.privileged_groups)
unpriv_mask = utils.compute_boolean_conditioning_vector(
self.dataset.protected_attributes,
self.dataset.protected_attribute_names, self.unprivileged_groups)
if np.any(np.logical_and(priv_mask, unpriv_mask)):
raise ValueError("'privileged_groups' and 'unprivileged_groups'"
" must be disjoint.")
if not np.all(np.logical_or(priv_mask, unpriv_mask)):
warn("There are some instances in the dataset which are not "
"designated as either privileged or unprivileged. Are you sure"
# A_ub - 2-D array which, when matrix-multiplied by x, gives the values
#     of the upper-bound inequality constraints at x.
# b_ub - 1-D array of values representing the upper-bound of each
#     inequality constraint (row) in A_ub.
# Just to keep these between zero and one
A_ub = np.array([[ 1, 0, 0, 0],
[-1, 0, 0, 0],
[ 0, 1, 0, 0],
[ 0, -1, 0, 0],
[ 0, 0, 1, 0],
[ 0, 0, -1, 0],
[ 0, 0, 0, 1],
[ 0, 0, 0, -1]], dtype=np.float64)
b_ub = np.array([1, 0, 1, 0, 1, 0, 1, 0], dtype=np.float64)
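# Side note (illustrative check, not library code): the eight rows above encode
# the box constraints 0 <= x_i <= 1 for the four LP variables, since
# A_ub @ x <= b_ub expands to x_i <= 1 and -x_i <= 0. With an arbitrary
# (assumed) objective vector c_demo, scipy.optimize.linprog respects those
# bounds even when its own per-variable bounds are disabled:
from scipy.optimize import linprog

c_demo = np.array([1.0, -1.0, 0.5, -0.5])     # placeholder objective only
res = linprog(c_demo, A_ub=A_ub, b_ub=b_ub, bounds=(None, None))
assert res.success and np.all(res.x >= -1e-9) and np.all(res.x <= 1 + 1e-9)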
# Create boolean conditioning vectors for protected groups
cond_vec_priv = utils.compute_boolean_conditioning_vector(
dataset_pred.protected_attributes,
dataset_pred.protected_attribute_names,
self.privileged_groups)
cond_vec_unpriv = utils.compute_boolean_conditioning_vector(
dataset_pred.protected_attributes,
dataset_pred.protected_attribute_names,
self.unprivileged_groups)
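# The four boolean vectors below partition each group's current predictions:
# the s* names cover the privileged group and the o* names the unprivileged
# group, with *const marking predictions already at the favorable label and
# *flip marking those at the unfavorable label. They feed the coefficients of
# the equalized-odds linear program set up above.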
sconst = np.ravel(
dataset_pred.labels[cond_vec_priv] == dataset_pred.favorable_label)
sflip = np.ravel(
dataset_pred.labels[cond_vec_priv] == dataset_pred.unfavorable_label)
oconst = np.ravel(
dataset_pred.labels[cond_vec_unpriv] == dataset_pred.favorable_label)
oflip = np.ravel(
dataset_pred.labels[cond_vec_unpriv] == dataset_pred.unfavorable_label)
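# Note on the next fragment (apparently RejectOptionClassification.predict):
# fav_pred_inds / unfav_pred_inds are boolean masks, computed earlier in that
# method from dataset.scores and the classification threshold, marking which
# instances the base classifier predicts as favorable vs. unfavorable, and
# dataset_new is a copy of the input dataset made earlier in the same method.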
y_pred = np.zeros(dataset.scores.shape)
y_pred[fav_pred_inds] = dataset.favorable_label
y_pred[unfav_pred_inds] = dataset.unfavorable_label
# Indices of critical region around the classification boundary
crit_region_inds = np.logical_and(
dataset.scores <= self.classification_threshold+self.ROC_margin,
dataset.scores > self.classification_threshold-self.ROC_margin)
# Indices of privileged and unprivileged groups
cond_priv = utils.compute_boolean_conditioning_vector(
dataset.protected_attributes,
dataset.protected_attribute_names,
self.privileged_groups)
cond_unpriv = utils.compute_boolean_conditioning_vector(
dataset.protected_attributes,
dataset.protected_attribute_names,
self.unprivileged_groups)
# New, fairer labels
dataset_new.labels = y_pred
dataset_new.labels[np.logical_and(crit_region_inds,
cond_priv.reshape(-1,1))] = dataset.unfavorable_label
dataset_new.labels[np.logical_and(crit_region_inds,
cond_unpriv.reshape(-1,1))] = dataset.favorable_label
return dataset_new
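# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): how the reject-option predict step above is
# typically driven. `dataset_true` and `dataset_pred` are assumed to be
# BinaryLabelDataset objects like the toy ones built in the earlier
# CalibratedEqOddsPostprocessing sketch.
from aif360.algorithms.postprocessing import RejectOptionClassification

roc = RejectOptionClassification(unprivileged_groups=[{'sex': 0}],
                                 privileged_groups=[{'sex': 1}],
                                 metric_name="Statistical parity difference")
roc = roc.fit(dataset_true, dataset_pred)    # grid-searches threshold and margin
dataset_transf = roc.predict(dataset_pred)   # flips labels inside the critical region
# The fragment that follows, with its mix-rate logic, appears to be
# CalibratedEqOddsPostprocessing.predict.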
"""Perturb the predicted scores to obtain new labels that satisfy
equalized odds constraints, while preserving calibration.
Args:
dataset (BinaryLabelDataset): Dataset containing `scores` that needs
to be transformed.
threshold (float): Threshold for converting `scores` to `labels`.
Values greater than or equal to this threshold are predicted to
be the `favorable_label`. Default is 0.5.
Returns:
dataset (BinaryLabelDataset): transformed dataset.
"""
if self.seed is not None:
np.random.seed(self.seed)
cond_vec_priv = utils.compute_boolean_conditioning_vector(
dataset.protected_attributes,
dataset.protected_attribute_names,
self.privileged_groups)
cond_vec_unpriv = utils.compute_boolean_conditioning_vector(
dataset.protected_attributes,
dataset.protected_attribute_names,
self.unprivileged_groups)
priv_indices = (np.random.random(sum(cond_vec_priv))
<= self.priv_mix_rate)
priv_new_pred = dataset.scores[cond_vec_priv].copy()
priv_new_pred[priv_indices] = self.base_rate_priv
unpriv_indices = (np.random.random(sum(cond_vec_unpriv))
<= self.unpriv_mix_rate)
unpriv_new_pred = dataset.scores[cond_vec_unpriv].copy()
unpriv_new_pred[unpriv_indices] = self.base_rate_unpriv
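# Illustration of the mixing step above with made-up numbers: with probability
# equal to a group's mix rate, an instance's score is replaced by that group's
# base rate, trading off group error rates while keeping scores calibrated on
# average.
import numpy as np

rng = np.random.RandomState(0)
scores_demo = np.array([0.9, 0.2, 0.7, 0.4])   # toy scores for one group
mix_rate, base_rate = 0.3, 0.55                # assumed values for the sketch
mixed = scores_demo.copy()
mixed[rng.random_sample(len(scores_demo)) <= mix_rate] = base_rate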
def mahalanobis_distance(self, privileged=None, returned=False):
"""Compute the average Mahalanobis distance between the samples from the
two datasets.
"""
condition = self._to_condition(privileged)
X_orig = self.dataset.features
X_distort = self.distorted_dataset.features
dist_fun = partial(scdist.mahalanobis,
VI=np.linalg.inv(np.cov(np.vstack([X_orig, X_distort]).T)).T)
distance, mask = utils.compute_distance(X_orig, X_distort,
self.dataset.protected_attributes,
self.dataset.protected_attribute_names, dist_fun=dist_fun,
condition=condition)
if returned:
return distance, self.dataset.instance_weights[mask]
return distance
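# Standalone sketch (toy data, not library code) of the distance that dist_fun
# above computes: the Mahalanobis distance between an original sample and its
# distorted counterpart, with VI the inverse covariance of the pooled samples.
import numpy as np
import scipy.spatial.distance as scdist

X_orig = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
X_distort = X_orig + 0.1 * np.array([[1.0, -1.0], [0.0, 1.0], [-1.0, 0.0]])
VI = np.linalg.inv(np.cov(np.vstack([X_orig, X_distort]).T))
d0 = scdist.mahalanobis(X_orig[0], X_distort[0], VI)
# The Args/References block that follows documents a between-group generalized
# entropy index metric.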
Args:
groups (list): A list of groups over which to calculate this metric.
Groups should be disjoint. By default, this will use the
`privileged_groups` and `unprivileged_groups` as the only two
groups.
alpha (int): See :meth:`generalized_entropy_index`.
References:
.. [2] T. Speicher, H. Heidari, N. Grgic-Hlaca, K. P. Gummadi, A. Singla, A. Weller, and M. B. Zafar,
"A Unified Approach to Quantifying Algorithmic Unfairness: Measuring Individual and Group Unfairness via Inequality Indices,"
ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 2018.
"""
b = np.zeros(self.dataset.labels.size, dtype=np.float64)
for group in groups:
classified_group = utils.compute_boolean_conditioning_vector(
self.classified_dataset.protected_attributes,
self.classified_dataset.protected_attribute_names,
condition=group)
true_group = utils.compute_boolean_conditioning_vector(
self.dataset.protected_attributes,
self.dataset.protected_attribute_names,
condition=group)
# ignore if there are no members of this group present
if not np.any(true_group):
continue
y_pred = self.classified_dataset.labels[classified_group].ravel()
y_true = self.dataset.labels[true_group].ravel()
y_pred = (y_pred == self.classified_dataset.favorable_label).astype(
np.float64)
y_true = (y_true == self.dataset.favorable_label).astype(np.float64)
b[true_group] = np.mean(1 + y_pred - y_true)
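# Sketch (toy values) of how the benefit vector b built above typically feeds a
# generalized entropy index. For alpha not in {0, 1}, the index in the Speicher
# et al. reference is
#     GE(alpha) = 1 / (n * alpha * (alpha - 1)) * sum_i((b_i / mu)**alpha - 1),
# with mu the mean benefit; in the between-group variant every member of a
# group shares that group's mean benefit, as in the loop above.
import numpy as np

b_demo = np.array([1.25, 1.25, 1.25, 0.8, 0.8])   # toy per-group mean benefits
alpha = 2
mu = np.mean(b_demo)
ge = np.sum((b_demo / mu) ** alpha - 1) / (b_demo.size * alpha * (alpha - 1))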
def euclidean_distance(self, privileged=None, returned=False):
"""Compute the average Euclidean distance between the samples from the
two datasets.
"""
condition = self._to_condition(privileged)
distance, mask = utils.compute_distance(self.dataset.features,
self.distorted_dataset.features, self.dataset.protected_attributes,
self.dataset.protected_attribute_names, dist_fun=scdist.euclidean,
condition=condition)
if returned:
return distance, self.dataset.instance_weights[mask]
return distance
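# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the distance methods above belong to
# SampleDistortionMetric, which compares an original dataset against a
# distorted/transformed copy. `dataset_orig` and `dataset_distorted` are
# assumed BinaryLabelDataset objects, and the positional argument order shown
# here is an assumption.
from aif360.metrics import SampleDistortionMetric

dist_metric = SampleDistortionMetric(dataset_orig, dataset_distorted,
                                     unprivileged_groups=[{'sex': 0}],
                                     privileged_groups=[{'sex': 1}])
avg_dist = dist_metric.euclidean_distance()                 # over all instances
avg_dist_priv = dist_metric.euclidean_distance(privileged=True)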