def test_calculate_roc_points():
    data = pd.DataFrame({
        SENSITIVE_FEATURE_KEY: sensitive_features_ex1,
        SCORE_KEY: scores_ex,
        LABEL_KEY: labels_ex})
    grouped_data = data.groupby(SENSITIVE_FEATURE_KEY).get_group("A") \
        .sort_values(by=SCORE_KEY, ascending=False)

    roc_points = _calculate_roc_points(grouped_data, "A")
    expected_roc_points = pd.DataFrame({
        "x": [0, 0.25, 0.5, 0.5, 1],
        "y": [0, 1/3, 2/3, 1, 1],
        "operation": [ThresholdOperation('>', np.inf),
                      ThresholdOperation('<', 0.5),
                      ThresholdOperation('<', 1.5),
                      ThresholdOperation('<', 2.5),
                      ThresholdOperation('>', -np.inf)]
    })
    _assert_equal_points(expected_roc_points, roc_points)

    # Try filtering to get the convex hull of the ROC points.
    # This should drop the second and third point.
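The comments above refer to a convex-hull filtering step whose call is cut off in this snippet. As a rough, self-contained illustration (not the library's implementation), the following sketch filters a list of ROC points down to its upper convex hull; applied to the expected points for group "A" above, it drops the second and third points, as the comments describe.

def filter_to_upper_convex_hull(points):
    """Keep only the points on the upper convex hull of an ROC curve.

    `points` must be (x, y) pairs sorted by x (ties broken by y).
    A point is dropped when it lies on or below the chord joining its
    neighbours on the hull, i.e. when it is dominated by a convex
    combination of other points.
    """
    hull = []
    for x3, y3 in points:
        # Pop the previous hull point while it is not strictly above the
        # line from hull[-2] to the incoming point (cross product >= 0).
        while len(hull) >= 2:
            (x1, y1), (x2, y2) = hull[-2], hull[-1]
            if (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1) >= 0:
                hull.pop()
            else:
                break
        hull.append((x3, y3))
    return hull

roc_points_a = [(0, 0), (0.25, 1/3), (0.5, 2/3), (0.5, 1), (1, 1)]
print(filter_to_upper_convex_hull(roc_points_a))
# [(0, 0), (0.5, 1), (1, 1)] -- the second and third points are gone.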
def _get_grouped_data_and_base_points(sensitive_feature_value):
    data = pd.DataFrame({
        SENSITIVE_FEATURE_KEY: sensitive_features_ex1,
        SCORE_KEY: scores_ex,
        LABEL_KEY: labels_ex})
    grouped_data = data.groupby(SENSITIVE_FEATURE_KEY).get_group(sensitive_feature_value) \
        .sort_values(by=SCORE_KEY, ascending=False)
    x_grid = np.linspace(0, 1, 100)

    if sensitive_feature_value == "A":
        expected_roc_points = pd.DataFrame({
            "x": [0, 0.25, 0.5, 0.5, 1],
            "y": [0, 1/3, 2/3, 1, 1],
            "operation": [ThresholdOperation('>', np.inf),
                          ThresholdOperation('<', 0.5),
                          ThresholdOperation('<', 1.5),
                          ThresholdOperation('<', 2.5),
                          ThresholdOperation('>', -np.inf)]
        })
        ignore_for_base_points = [1, 2]

    if sensitive_feature_value == "B":
        expected_roc_points = pd.DataFrame({
def _get_scores_labels_and_counts(data):
    """Order samples by scores, counting number of positive, negative, and overall samples.

    The samples are sorted into descending order by score.

    :param data: the DataFrame containing scores and labels
    :type data: pandas.DataFrame
    :return: a tuple containing the sorted scores, labels, the number of samples, the number
        of positive samples, and the number of negative samples
    :rtype: tuple of list, list, int, int, int
    """
    data_sorted = data.sort_values(by=SCORE_KEY, ascending=False)
    scores = list(data_sorted[SCORE_KEY])
    labels = list(data_sorted[LABEL_KEY])
    n, n_positive, n_negative = _get_counts(labels)
    return scores, labels, n, n_positive, n_negative
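For illustration only, here is a tiny, self-contained sketch of what this helper computes, using hypothetical values for the module's SCORE_KEY and LABEL_KEY constants and a minimal stand-in for _get_counts (the real definitions live elsewhere in the module):

import pandas as pd

SCORE_KEY = "score"   # hypothetical stand-in for the module constant
LABEL_KEY = "label"   # hypothetical stand-in for the module constant

def _get_counts(labels):
    # Minimal stand-in: total, positive, and negative sample counts.
    n = len(labels)
    n_positive = sum(1 for label in labels if label == 1)
    return n, n_positive, n - n_positive

data = pd.DataFrame({SCORE_KEY: [0.2, 0.9, 0.5, 0.7],
                     LABEL_KEY: [0, 1, 0, 1]})
data_sorted = data.sort_values(by=SCORE_KEY, ascending=False)
scores = list(data_sorted[SCORE_KEY])   # [0.9, 0.7, 0.5, 0.2]
labels = list(data_sorted[LABEL_KEY])   # [1, 1, 0, 0]
print(_get_counts(labels))              # (4, 2, 2)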
    :return: the training data for the mitigator, grouped by sensitive feature value
    :rtype: pandas.DataFrameGroupBy
    """
    data_dict = {}

    # TODO: extend to multiple columns for additional group data
    # and name columns after original column names if possible
    # or store the original column names
    sensitive_feature_name = SENSITIVE_FEATURE_KEY
    if sensitive_feature_names is not None:
        if sensitive_feature_name in [SCORE_KEY, LABEL_KEY]:
            raise ValueError(SENSITIVE_FEATURE_NAME_CONFLICT_DETECTED_ERROR_MESSAGE)
        sensitive_feature_name = sensitive_feature_names[0]

    _reformat_data_into_dict(sensitive_feature_name, data_dict, sensitive_features)
    _reformat_data_into_dict(SCORE_KEY, data_dict, scores)
    _reformat_data_into_dict(LABEL_KEY, data_dict, labels)

    return pd.DataFrame(data_dict).groupby(sensitive_feature_name)
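The fragment above is the tail of a helper that reformats raw inputs into a DataFrame and groups it by the sensitive feature column (its signature and the start of its docstring are cut off in this snippet). As a rough sketch of the same idea, assuming plain list inputs and a trivial stand-in for _reformat_data_into_dict (the real helper presumably accepts more input types, such as Series or ndarrays):

import pandas as pd

SENSITIVE_FEATURE_KEY = "sensitive_feature"   # hypothetical constant values
SCORE_KEY = "score"
LABEL_KEY = "label"

def _reformat_data_into_dict(key, data_dict, values):
    # Simplified stand-in: store the column under the given key.
    data_dict[key] = list(values)

sensitive_features = ["A", "B", "A", "B"]
scores = [0.2, 0.9, 0.5, 0.7]
labels = [0, 1, 0, 1]

data_dict = {}
_reformat_data_into_dict(SENSITIVE_FEATURE_KEY, data_dict, sensitive_features)
_reformat_data_into_dict(SCORE_KEY, data_dict, scores)
_reformat_data_into_dict(LABEL_KEY, data_dict, labels)

grouped = pd.DataFrame(data_dict).groupby(SENSITIVE_FEATURE_KEY)
print(grouped.get_group("A"))   # rows where the sensitive feature is "A"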