Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Fragment of a scheduler test: checks how MixedScheduler batches two tasks.
# caplog is presumably pytest's log-capture fixture; its level is set to INFO.
caplog.set_level(logging.INFO)
# Initialize emmental's Meta (done here before any dataset/dataloader exists).
emmental.Meta.init()
# Two synthetic tasks with differently sized inputs: task1 uses a (20, 2)
# random array and task2 a (30, 3) one; each gets a 1-D tensor of random
# labels of matching length.
task1 = "task1"
x1 = np.random.rand(20, 2)
y1 = torch.from_numpy(np.random.rand(20))
task2 = "task2"
x2 = np.random.rand(30, 3)
y2 = torch.from_numpy(np.random.rand(30))
# One shuffled "train" dataloader per task. Each wraps its own EmmentalDataset
# (named after the task) and maps the task name to the "label" key of Y_dict.
# With batch_size=10 the underlying datasets hold 20 and 30 examples.
dataloaders = [
EmmentalDataLoader(
task_to_label_dict={task_name: "label"},
dataset=EmmentalDataset(
name=task_name, X_dict={"feature": x}, Y_dict={"label": y}
),
split="train",
batch_size=10,
shuffle=True,
)
for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]
]
scheduler = MixedScheduler()
# The expected count of 2 equals 20 / 10, the batch count of the smaller
# loader -- presumably the mixed scheduler stops once the shortest loader is
# exhausted; TODO confirm against the MixedScheduler implementation.
assert scheduler.get_num_batches(dataloaders) == 2
# For every scheduled item take element [0][-2]. Judging by the variable name
# this is the task-name field of the first batch in the item -- TODO confirm
# the batch layout.
batch_task_names_1 = [
batch_data[0][-2] for batch_data in scheduler.get_batches(dataloaders)
]
# Fragment of a scheduler test: checks how SequentialScheduler batches two
# tasks.
# caplog is presumably pytest's log-capture fixture; its level is set to INFO.
caplog.set_level(logging.INFO)
# Initialize emmental's Meta (done here before any dataset/dataloader exists).
emmental.Meta.init()
# Two synthetic tasks with differently sized inputs: task1 uses a (20, 2)
# random array and task2 a (30, 3) one; each gets a 1-D tensor of random
# labels of matching length.
task1 = "task1"
x1 = np.random.rand(20, 2)
y1 = torch.from_numpy(np.random.rand(20))
task2 = "task2"
x2 = np.random.rand(30, 3)
y2 = torch.from_numpy(np.random.rand(30))
# One shuffled "train" dataloader per task. Each wraps its own EmmentalDataset
# (named after the task) and maps the task name to the "label" key of Y_dict.
# With batch_size=10 the underlying datasets hold 20 and 30 examples.
dataloaders = [
EmmentalDataLoader(
task_to_label_dict={task_name: "label"},
dataset=EmmentalDataset(
name=task_name, X_dict={"feature": x}, Y_dict={"label": y}
),
split="train",
batch_size=10,
shuffle=True,
)
for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]
]
scheduler = SequentialScheduler()
# The expected count of 5 equals 20 / 10 + 30 / 10, which is consistent with
# every batch of both loaders being scheduled.
assert scheduler.get_num_batches(dataloaders) == 5
# Take element [-2] of every scheduled batch. Judging by the variable name
# this is the batch's task-name field -- TODO confirm the batch layout.
batch_task_names = [
batch_data[-2] for batch_data in scheduler.get_batches(dataloaders)
]
torch.tensor(Y2[int(0.9 * N) :]),
)
# Wrap each split in its own EmmentalDataset. The "*1" datasets carry only
# "label1" and the "*2" datasets only "label2"; both variants of a split share
# the same inputs (X_train / X_dev / X_test, defined earlier) under the "data"
# key, and all six datasets use the same name, "synthetic".
train_dataset1 = EmmentalDataset(
name="synthetic", X_dict={"data": X_train}, Y_dict={"label1": Y1_train}
)
train_dataset2 = EmmentalDataset(
name="synthetic", X_dict={"data": X_train}, Y_dict={"label2": Y2_train}
)
dev_dataset1 = EmmentalDataset(
name="synthetic", X_dict={"data": X_dev}, Y_dict={"label1": Y1_dev}
)
dev_dataset2 = EmmentalDataset(
name="synthetic", X_dict={"data": X_dev}, Y_dict={"label2": Y2_dev}
)
test_dataset1 = EmmentalDataset(
name="synthetic", X_dict={"data": X_test}, Y_dict={"label1": Y1_test}
)
test_dataset2 = EmmentalDataset(
name="synthetic", X_dict={"data": X_test}, Y_dict={"label2": Y2_test}
)
# Task-name -> label-name mapping: "task1" is paired with "label1".
task_to_label_dict = {"task1": "label1"}
train_dataloader1 = EmmentalDataLoader(
task_to_label_dict=task_to_label_dict,
dataset=train_dataset1,
dataset.add_labels(Y_dict={"label2": x2})
# Check adding one more label to the dataset
assert torch.equal(dataset[0][1]["label2"], y2[0])
dataset.remove_label(label_name="label1")
# Check removing a label from the dataset
assert "label1" not in dataset.Y_dict
with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": y1}, name="new_data", uid="ids"
)
dataset = EmmentalDataset(
X_dict={"_uids_": x1}, Y_dict={"label1": y1}, name="new_data"
)
with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
)
with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
)
name="synthetic", X_dict={"data": X_train}, Y_dict={"label1": Y1_train}
)
# Wrap the splits in EmmentalDataset objects. The "*1" datasets carry only
# "label1" and the "*2" datasets only "label2"; both variants of a split share
# the same inputs (X_train / X_dev / X_test, defined earlier) under the "data"
# key, and every dataset uses the same name, "synthetic".
train_dataset2 = EmmentalDataset(
name="synthetic", X_dict={"data": X_train}, Y_dict={"label2": Y2_train}
)
dev_dataset1 = EmmentalDataset(
name="synthetic", X_dict={"data": X_dev}, Y_dict={"label1": Y1_dev}
)
dev_dataset2 = EmmentalDataset(
name="synthetic", X_dict={"data": X_dev}, Y_dict={"label2": Y2_dev}
)
test_dataset1 = EmmentalDataset(
name="synthetic", X_dict={"data": X_test}, Y_dict={"label1": Y1_test}
)
test_dataset2 = EmmentalDataset(
name="synthetic", X_dict={"data": X_test}, Y_dict={"label2": Y2_test}
)
# Task-name -> label-name mapping: "task1" is paired with "label1".
task_to_label_dict = {"task1": "label1"}
# Training dataloader for task1 over the label1 training dataset, 10 examples
# per batch.
train_dataloader1 = EmmentalDataLoader(
task_to_label_dict=task_to_label_dict,
dataset=train_dataset1,
split="train",
batch_size=10,
)
dev_dataloader1 = EmmentalDataLoader(
dataset.add_labels(Y_dict={"label2": y2})
with pytest.raises(ValueError):
dataset.add_labels(Y_dict={"label2": x2})
# Check adding one more label to the dataset
assert torch.equal(dataset[0][1]["label2"], y2[0])
dataset.remove_label(label_name="label1")
# Check removing a label from the dataset
assert "label1" not in dataset.Y_dict
with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": y1}, name="new_data", uid="ids"
)
dataset = EmmentalDataset(
X_dict={"_uids_": x1}, Y_dict={"label1": y1}, name="new_data"
)
with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
)
with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
)
# Split both label arrays by position using N (defined earlier; presumably the
# number of examples -- confirm): indices below int(0.8 * N) go to train,
# indices from int(0.8 * N) up to int(0.9 * N) go to dev, and the rest to test.
# Each slice is converted to a torch tensor.
Y1_train, Y1_dev, Y1_test = (
torch.tensor(Y1[: int(0.8 * N)]),
torch.tensor(Y1[int(0.8 * N) : int(0.9 * N)]),
torch.tensor(Y1[int(0.9 * N) :]),
)
Y2_train, Y2_dev, Y2_test = (
torch.tensor(Y2[: int(0.8 * N)]),
torch.tensor(Y2[int(0.8 * N) : int(0.9 * N)]),
torch.tensor(Y2[int(0.9 * N) :]),
)
# Wrap the splits in EmmentalDataset objects. The "*1" datasets carry only
# "label1" and the "*2" datasets only "label2"; both variants of a split share
# the same inputs (X_train / X_dev / X_test, defined earlier) under the "data"
# key, and every dataset uses the same name, "synthetic".
train_dataset1 = EmmentalDataset(
name="synthetic", X_dict={"data": X_train}, Y_dict={"label1": Y1_train}
)
train_dataset2 = EmmentalDataset(
name="synthetic", X_dict={"data": X_train}, Y_dict={"label2": Y2_train}
)
dev_dataset1 = EmmentalDataset(
name="synthetic", X_dict={"data": X_dev}, Y_dict={"label1": Y1_dev}
)
dev_dataset2 = EmmentalDataset(
name="synthetic", X_dict={"data": X_dev}, Y_dict={"label2": Y2_dev}
)
test_dataset1 = EmmentalDataset(
name="synthetic", X_dict={"data": X_test}, Y_dict={"label1": Y1_test}
)
test_dataset2 = EmmentalDataset(
torch.Tensor([1, 2, 3, 4, 5]),
]
# Labels paired with the first feature list (x1, defined just above): five
# zeros.
y1 = torch.Tensor([0, 0, 0, 0, 0])
# Second feature: five 1-D tensors of decreasing length (5 down to 1), so the
# examples are not all the same size.
x2 = [
torch.Tensor([1, 2, 3, 4, 5]),
torch.Tensor([1, 2, 3, 4]),
torch.Tensor([1, 2, 3]),
torch.Tensor([1, 2]),
torch.Tensor([1]),
]
# Labels paired with the second feature list: five ones.
y2 = torch.Tensor([1, 1, 1, 1, 1])
# A single dataset holding both features ("data1", "data2") and both labels
# ("label1", "label2").
dataset = EmmentalDataset(
X_dict={"data1": x1, "data2": x2},
Y_dict={"label1": y1, "label2": y2},
name="new_data",
)
# Training dataloader over that dataset that maps only task1 -> label1, with
# two examples per batch.
dataloader1 = EmmentalDataLoader(
task_to_label_dict={"task1": "label1"},
dataset=dataset,
split="train",
batch_size=2,
)
# Pull the first batch; it unpacks into an input part and a label part.
x_batch, y_batch = next(iter(dataloader1))
# Check that the dataloader kept the task-to-label mapping it was given
assert dataloader1.task_to_label_dict == {"task1": "label1"}
# Fragment of a scheduler test: checks how RoundRobinScheduler batches two
# tasks.
# Set the random seed
set_random_seed(2)
# Two synthetic tasks with differently sized inputs: task1 uses a (20, 2)
# random array and task2 a (30, 3) one; each gets a 1-D tensor of random
# labels of matching length.
task1 = "task1"
x1 = np.random.rand(20, 2)
y1 = torch.from_numpy(np.random.rand(20))
task2 = "task2"
x2 = np.random.rand(30, 3)
y2 = torch.from_numpy(np.random.rand(30))
# One shuffled "train" dataloader per task. Each wraps its own EmmentalDataset
# (named after the task) and maps the task name to the "label" key of Y_dict.
# With batch_size=10 the underlying datasets hold 20 and 30 examples.
dataloaders = [
EmmentalDataLoader(
task_to_label_dict={task_name: "label"},
dataset=EmmentalDataset(
name=task_name, X_dict={"feature": x}, Y_dict={"label": y}
),
split="train",
batch_size=10,
shuffle=True,
)
for task_name, x, y in [(task1, x1, y1), (task2, x2, y2)]
]
scheduler = RoundRobinScheduler()
# The expected count of 5 equals 20 / 10 + 30 / 10, which is consistent with
# every batch of both loaders being scheduled.
assert scheduler.get_num_batches(dataloaders) == 5
# Take element [-2] of every scheduled batch. Judging by the variable name
# this is the batch's task-name field -- TODO confirm the batch layout.
batch_task_names = [
batch_data[-2] for batch_data in scheduler.get_batches(dataloaders)
]
import logging
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
import torch
from emmental.data import EmmentalDataset
from scipy.sparse import csr_matrix
from torch import Tensor
from fonduer.candidates.models import Candidate
from fonduer.learning.utils import mark_sentence, mention_to_tokens
logger = logging.getLogger(__name__)
class FonduerDataset(EmmentalDataset):
"""A FonduerDataset class which is inherited from EmmentalDataset, which takes
list of candidates and corresponding feature matrix as input and wraps them.
:param name: The name of the dataset.
:type name: str
:param candidates: The list of candidates.
:type candidates: List[Candidate]
:param features: The corresponding feature matrix.
:type features: csr_matrix
:param word2id: The mapping from word to id.
:type word2id: dict
:param labels: If np.array, it's the label for all candidates; If int, it's
the number of classes of label and we will create placeholder labels
(mainly used for inference).
:type labels: np.array or int
:param labels: Which candidates to use. If None, use all candidates.