import unittest

import numpy as np

from snorkel.analysis import metric_score


class MetricsTest(unittest.TestCase):
    def test_fbeta(self):
        golds = np.array([0, 0, 0, 0, 1])
        preds = np.array([1, 1, 0, 0, 1])
        pre = metric_score(golds, preds, probs=None, metric="precision")
        rec = metric_score(golds, preds, probs=None, metric="recall")
        # F-beta approaches precision as beta -> 0 and recall as beta -> inf.
        self.assertAlmostEqual(
            pre,
            metric_score(golds, preds, probs=None, metric="fbeta", beta=1e-6),
            places=2,
        )
        self.assertAlmostEqual(
            rec,
            metric_score(golds, preds, probs=None, metric="fbeta", beta=1e6),
            places=2,
        )

    def test_matthews(self):
        golds = np.array([0, 0, 0, 0, 1])
        preds = np.array([1, 0, 0, 0, 0])
        mcc = metric_score(golds, preds, probs=None, metric="matthews_corrcoef")
        self.assertAlmostEqual(mcc, -0.25)
        golds = np.array([0, 0, 0, 0, 1])
        preds = np.array([0, 0, 0, 0, 1])
        mcc = metric_score(golds, preds, probs=None, metric="matthews_corrcoef")
        self.assertAlmostEqual(mcc, 1.0)

    def test_f1_multiclass(self):
        golds = np.array([0, 0, 1, 1, 2])
        preds = np.array([1, 1, 0, 1, 2])
        score = metric_score(golds, preds, probs=None, metric="f1_micro")
        self.assertAlmostEqual(score, 0.4)
        score = metric_score(golds, preds, probs=None, metric="f1_macro")
        self.assertAlmostEqual(score, 0.47, 2)

    def test_ignores(self):
        golds = np.array([0, 0, 0, 1, 1])
        preds = np.array([0, -1, 0, 1, 0])
        score = metric_score(golds, preds, probs=None, metric="accuracy")
        self.assertAlmostEqual(score, 0.6)
        # Drop examples whose prediction is the abstain value (-1) before scoring.
        score = metric_score(
            golds, preds, probs=None, metric="accuracy", filter_dict={"preds": [-1]}
        )
        self.assertAlmostEqual(score, 0.75)
        # Drop examples whose gold label is 0 before scoring.
        score = metric_score(
            golds, preds, probs=None, metric="accuracy", filter_dict={"golds": [0]}
        )
        self.assertAlmostEqual(score, 0.5)
        score = metric_score(
            golds,
            preds,
            probs=None,
            metric="accuracy",
            filter_dict={"golds": [1], "preds": [-1]},
        )
        self.assertAlmostEqual(score, 1.0)

    def test_accuracy_basic(self):
        golds = np.array([0, 0, 0, 1, 1])
        preds = np.array([0, 0, 0, 1, 0])
        score = metric_score(golds, preds, probs=None, metric="accuracy")
        self.assertAlmostEqual(score, 0.8)

    def test_coverage(self):
        golds = np.array([0, 0, 0, 0, 1])
        preds = np.array([-1, -1, 0, 0, 0])
        # Coverage: fraction of points with a non-abstain (-1) prediction.
        score = metric_score(golds, preds, probs=None, metric="coverage")
        self.assertAlmostEqual(score, 0.6)
        score = metric_score(
            golds, preds, probs=None, filter_dict={"golds": [1]}, metric="coverage"
        )
        self.assertAlmostEqual(score, 0.5)
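
# A minimal standalone sketch (not part of the Snorkel test suite): the same
# metric_score API exercised by the tests above, reusing the inputs from
# test_ignores. Assumes numpy and snorkel are installed.
import numpy as np

from snorkel.analysis import metric_score

golds = np.array([0, 0, 0, 1, 1])
preds = np.array([0, -1, 0, 1, 0])

# Plain accuracy counts the abstained prediction (-1) as an error: 3/5 = 0.6.
print(metric_score(golds, preds, probs=None, metric="accuracy"))

# filter_dict drops rows whose prediction is -1 before scoring: 3/4 = 0.75.
print(
    metric_score(
        golds, preds, probs=None, metric="accuracy", filter_dict={"preds": [-1]}
    )
)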
from utils import get_n_epochs

X_train = get_feature_arrays(df_train_filtered)
model = get_model()
batch_size = 64
model.fit(X_train, probs_train_filtered, batch_size=batch_size, epochs=get_n_epochs())

# %% [markdown]
# Finally, we evaluate the trained model by measuring its F1 score and ROC-AUC.

# %%
X_test = get_feature_arrays(df_test)
probs_test = model.predict(X_test)
preds_test = probs_to_preds(probs_test)
print(
    f"Test F1 when trained with soft labels: {metric_score(Y_test, preds=preds_test, metric='f1')}"
)
print(
    f"Test ROC-AUC when trained with soft labels: {metric_score(Y_test, probs=probs_test, metric='roc_auc')}"
)
# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling.model import LabelModel
# Train LabelModel.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=100, seed=123, log_freq=20, l2=0.1, lr=0.01)
# %% [markdown]
# As a spot-check for the quality of our LabelModel, we'll score it on the dev set.
# %%
from snorkel.analysis import metric_score
preds_dev = label_model.predict(L_dev)
acc = metric_score(Y_dev, preds_dev, probs=None, metric="accuracy")
print(f"LabelModel Accuracy: {acc:.3f}")
# %% [markdown]
# We see that we get very high accuracy on the development set.
# This is due to the abundance of high-quality crowdworker labels.
# **Since we don't have these high-quality crowdworker labels for the
# test set or new incoming data points, we can't use the LabelModel reliably
# at inference time.**
# In order to run inference on new incoming data points, we need to train a
# discriminative model over the tweets themselves.
# Let's generate a set of labels for that training set.

# %%
preds_train = label_model.predict(L_train)
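
# %% [markdown]
# A rough sketch of that next step (not the tutorial's exact pipeline): the
# generated `preds_train` labels can supervise any off-the-shelf text classifier.
# The `df_train["tweet_text"]` field and the scikit-learn model below are
# illustrative assumptions, not part of the original notebook.

# %%
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Drop data points the LabelModel abstained on (predicted label -1).
mask = preds_train != -1

# Hypothetical text column; substitute whatever field holds the raw tweet text.
vectorizer = CountVectorizer(max_features=5000)
X_train_text = vectorizer.fit_transform(df_train["tweet_text"])

# Train a simple discriminative model on the LabelModel's predicted labels.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_text[mask], preds_train[mask])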
# %% [markdown]