Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_calculate_roc_points():
    """Check the ROC points computed for group "A" and their convex-hull subset."""
    frame = pd.DataFrame({
        SENSITIVE_FEATURE_KEY: sensitive_features_ex1,
        SCORE_KEY: scores_ex,
        LABEL_KEY: labels_ex,
    })

    # Keep only the rows of group "A", ordered by descending score, which is
    # the input handed to _calculate_roc_points below.
    group_a = frame.groupby(SENSITIVE_FEATURE_KEY).get_group("A")
    group_a_sorted = group_a.sort_values(by=SCORE_KEY, ascending=False)
    actual_points = _calculate_roc_points(group_a_sorted, "A")

    # (operator, threshold) pairs for the expected points, in curve order.
    operator_threshold_pairs = [
        ('>', np.inf),
        ('<', 0.5),
        ('<', 1.5),
        ('<', 2.5),
        ('>', -np.inf),
    ]
    expected_points = pd.DataFrame({
        "x": [0, 0.25, 0.5, 0.5, 1],
        "y": [0, 1/3, 2/3, 1, 1],
        "operation": [ThresholdOperation(operator, threshold)
                      for operator, threshold in operator_threshold_pairs],
    })
    _assert_equal_points(expected_points, actual_points)

    # Filtering down to the convex hull of the ROC points should drop the
    # second and third point, so indices 1 and 2 are ignored in the comparison.
    hull = pd.DataFrame(_filter_points_to_get_convex_hull(actual_points))
    hull_points = hull[['x', 'y', 'operation']]
    _assert_equal_points(expected_points, hull_points, ignore_indices=[1, 2])
expected_roc_points = pd.DataFrame({
"x": [0, 0.25, 0.5, 0.5, 1],
"y": [0, 1/3, 2/3, 1, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('<', 1.5),
ThresholdOperation('<', 2.5),
ThresholdOperation('>', -np.inf)]
})
ignore_for_base_points = [1, 2]
if sensitive_feature_value == "B":
expected_roc_points = pd.DataFrame({
"x": [0, 1/3, 1],
"y": [0, 3/4, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('>', -np.inf)]
})
ignore_for_base_points = []
if sensitive_feature_value == "C":
expected_roc_points = pd.DataFrame({
"x": [0, 0, 2/3, 1],
"y": [0, 1/3, 1, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('<', 1.5),
ThresholdOperation('>', -np.inf)]
})
ignore_for_base_points = [0]
"x": [0, 1/3, 1],
"y": [0, 3/4, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('>', -np.inf)]
})
ignore_for_base_points = []
if sensitive_feature_value == "C":
expected_roc_points = pd.DataFrame({
"x": [0, 0, 2/3, 1],
"y": [0, 1/3, 1, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('<', 1.5),
ThresholdOperation('>', -np.inf)]
})
ignore_for_base_points = [0]
return grouped_data, expected_roc_points, ignore_for_base_points, x_grid
data = pd.DataFrame({
SENSITIVE_FEATURE_KEY: sensitive_features_ex1,
SCORE_KEY: scores_ex,
LABEL_KEY: labels_ex})
grouped_data = data.groupby(SENSITIVE_FEATURE_KEY).get_group("A") \
.sort_values(by=SCORE_KEY, ascending=False)
roc_points = _calculate_roc_points(grouped_data, "A")
expected_roc_points = pd.DataFrame({
"x": [0, 0.25, 0.5, 0.5, 1],
"y": [0, 1/3, 2/3, 1, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('<', 1.5),
ThresholdOperation('<', 2.5),
ThresholdOperation('>', -np.inf)]
})
_assert_equal_points(expected_roc_points, roc_points)
# Try filtering to get the convex hull of the ROC points.
# This should drop the second and third point.
selected_points = \
pd.DataFrame(_filter_points_to_get_convex_hull(roc_points))[['x', 'y', 'operation']]
_assert_equal_points(expected_roc_points, selected_points, ignore_indices=[1, 2])
def test_predict_from_operation_less():
    """Check the predictor built from a '<' operation with threshold 0.5.

    The test expects 1 for scores below the threshold and 0 for scores at or
    above it.
    """
    predict = ThresholdOperation('<', 0.5).get_predictor_from_operation()
    # (score, expected prediction) pairs, checked in this order.
    cases = (
        (-10000, 1),
        (0, 1),
        (0.5, 0),
        (1, 0),
        (10000, 0),
    )
    for score, expected in cases:
        assert predict(score) == expected
def test_predict_from_operation_invalid_operator():
    """Constructing a ThresholdOperation with the operator '=' must raise ValueError."""
    expected_message = "Unrecognized operator: ="
    with pytest.raises(ValueError, match=expected_message):
        ThresholdOperation('=', 0.5)
def test_calculate_roc_points():
    """Check the ROC points computed for group "A" and their convex-hull subset."""
    frame = pd.DataFrame({
        SENSITIVE_FEATURE_KEY: sensitive_features_ex1,
        SCORE_KEY: scores_ex,
        LABEL_KEY: labels_ex,
    })

    # Keep only the rows of group "A", ordered by descending score, which is
    # the input handed to _calculate_roc_points below.
    group_a = frame.groupby(SENSITIVE_FEATURE_KEY).get_group("A")
    group_a_sorted = group_a.sort_values(by=SCORE_KEY, ascending=False)
    actual_points = _calculate_roc_points(group_a_sorted, "A")

    # (operator, threshold) pairs for the expected points, in curve order.
    operator_threshold_pairs = [
        ('>', np.inf),
        ('<', 0.5),
        ('<', 1.5),
        ('<', 2.5),
        ('>', -np.inf),
    ]
    expected_points = pd.DataFrame({
        "x": [0, 0.25, 0.5, 0.5, 1],
        "y": [0, 1/3, 2/3, 1, 1],
        "operation": [ThresholdOperation(operator, threshold)
                      for operator, threshold in operator_threshold_pairs],
    })
    _assert_equal_points(expected_points, actual_points)

    # Filtering down to the convex hull of the ROC points should drop the
    # second and third point, so indices 1 and 2 are ignored in the comparison.
    hull = pd.DataFrame(_filter_points_to_get_convex_hull(actual_points))
    hull_points = hull[['x', 'y', 'operation']]
    _assert_equal_points(expected_points, hull_points, ignore_indices=[1, 2])
def _get_grouped_data_and_base_points(sensitive_feature_value):
data = pd.DataFrame({
SENSITIVE_FEATURE_KEY: sensitive_features_ex1,
SCORE_KEY: scores_ex,
LABEL_KEY: labels_ex})
grouped_data = data.groupby(SENSITIVE_FEATURE_KEY).get_group(sensitive_feature_value) \
.sort_values(by=SCORE_KEY, ascending=False)
x_grid = np.linspace(0, 1, 100)
if sensitive_feature_value == "A":
expected_roc_points = pd.DataFrame({
"x": [0, 0.25, 0.5, 0.5, 1],
"y": [0, 1/3, 2/3, 1, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('<', 1.5),
ThresholdOperation('<', 2.5),
ThresholdOperation('>', -np.inf)]
})
ignore_for_base_points = [1, 2]
if sensitive_feature_value == "B":
expected_roc_points = pd.DataFrame({
"x": [0, 1/3, 1],
"y": [0, 3/4, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('>', -np.inf)]
})
ignore_for_base_points = []
def _get_grouped_data_and_base_points(sensitive_feature_value):
data = pd.DataFrame({
SENSITIVE_FEATURE_KEY: sensitive_features_ex1,
SCORE_KEY: scores_ex,
LABEL_KEY: labels_ex})
grouped_data = data.groupby(SENSITIVE_FEATURE_KEY).get_group(sensitive_feature_value) \
.sort_values(by=SCORE_KEY, ascending=False)
x_grid = np.linspace(0, 1, 100)
if sensitive_feature_value == "A":
expected_roc_points = pd.DataFrame({
"x": [0, 0.25, 0.5, 0.5, 1],
"y": [0, 1/3, 2/3, 1, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('<', 1.5),
ThresholdOperation('<', 2.5),
ThresholdOperation('>', -np.inf)]
})
ignore_for_base_points = [1, 2]
if sensitive_feature_value == "B":
expected_roc_points = pd.DataFrame({
"x": [0, 1/3, 1],
"y": [0, 3/4, 1],
"operation": [ThresholdOperation('>', np.inf),
ThresholdOperation('<', 0.5),
ThresholdOperation('>', -np.inf)]
})
ignore_for_base_points = []
if sensitive_feature_value == "C":
:type sensitive_feature_value: str or int
:param flip: if True flip points below the ROC diagonal into points above by applying negative
weights; if False does not allow flipping; default True
:type flip: bool
:return: the ROC curve points with their corresponding threshold operations
:rtype: pandas.DataFrame
"""
scores, labels, n, n_positive, n_negative = _get_scores_labels_and_counts(data)
if n_positive == 0 or n_negative == 0:
raise ValueError(DEGENERATE_LABELS_ERROR_MESSAGE.format(sensitive_feature_value))
scores.append(-np.inf)
labels.append(np.nan)
x_list, y_list, operation_list = [0], [0], [ThresholdOperation('>', np.inf)]
# Iterate through all samples which are sorted by increasing scores.
# Setting the threshold between two scores means that everything smaller
# than the threshold gets a label of 0 while everything larger than the
# threshold gets a label of 1. Flipping labels is an option if flipping
# labels provides better accuracy.
i = 0
count = [0, 0]
while i < n:
threshold = scores[i]
while scores[i] == threshold:
count[labels[i]] += 1
i += 1
# For the ROC curve we calculate points (x, y), where x represents
# the conditional probability P[Y_hat=1 | Y=0] and y represents
# the conditional probability P[Y_hat=1 | Y=1]. The conditional