# Shared imports for the snippets below. These fragments appear to come from
# deepset's FARM library (tests and example scripts); the module paths follow
# FARM's layout and may shift slightly between versions.
import logging

from sklearn.metrics import f1_score, matthews_corrcoef
from transformers import RobertaTokenizer

from farm.data_handler.data_silo import DataSilo, DataSiloForCrossVal
from farm.data_handler.processor import TextClassificationProcessor
from farm.evaluation.metrics import register_metrics, simple_accuracy
from farm.modeling.language_model import LanguageModel, Roberta
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.tokenization import Tokenizer
from farm.utils import set_all_seeds, initialize_device_settings


def test_doc_classification(caplog=None):
    if caplog:
        caplog.set_level(logging.CRITICAL)

    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "bert-base-german-cased"

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model,
        do_lower_case=False)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = LanguageModel.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
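    # The fragment ends here; the flow usually continues by wrapping the pieces
    # in an AdaptiveModel and a Trainer. A minimal sketch, assuming a FARM
    # 0.4.x-style API (these signatures vary across FARM versions):
    from farm.modeling.adaptive_model import AdaptiveModel
    from farm.modeling.optimization import initialize_optimizer
    from farm.train import Trainer

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm_output_types=["per_sequence"],
        device=device)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=2e-5,
        device=device,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device)

    trainer.train()
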
def test_processor_saving_loading(caplog):
    caplog.set_level(logging.CRITICAL)
    set_all_seeds(seed=42)
    lang_model = "bert-base-cased"
    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=lang_model, do_lower_case=False
    )
    # Note: this snippet uses a processor signature with `columns` and `metrics`
    # (plural), unlike the `label_column_name`/`metric` arguments used elsewhere
    # in this file.
    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=128,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            dev_filename=None,
                                            test_filename=None,
                                            dev_split=0.1,
                                            columns=["text", "label", "unused"],
                                            label_list=["OTHER", "OFFENSE"],
                                            metrics=["f1_macro"]
                                            )

    dicts = processor.file_to_dicts(file="samples/doc_class/train-sample.tsv")
    data, tensor_names = processor.dataset_from_dicts(dicts)

    save_dir = "testsave/processor"
    processor.save(save_dir)
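    # Sketch of the round trip that typically follows: reload the processor and
    # check that it produces identical tensors. `load_from_dir` is the usual
    # FARM Processor classmethod; treat the exact call as an assumption.
    import torch

    processor_loaded = TextClassificationProcessor.load_from_dir(save_dir)
    dicts = processor_loaded.file_to_dicts(file="samples/doc_class/train-sample.tsv")
    data_loaded, tensor_names_loaded = processor_loaded.dataset_from_dicts(dicts)

    assert tensor_names == tensor_names_loaded
    for original, loaded in zip(data.tensors, data_loaded.tensors):
        assert torch.all(torch.eq(original, loaded))
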
def test_doc_classification_roberta():
    # RoBERTa variant of the German BERT test above (renamed so the two test
    # functions do not shadow each other).
    set_all_seeds(seed=42)
    device, n_gpu = initialize_device_settings(use_cuda=False)
    n_epochs = 1
    batch_size = 1
    evaluate_every = 2
    lang_model = "roberta-base"

    tokenizer = RobertaTokenizer.from_pretrained(
        pretrained_model_name_or_path=lang_model)

    processor = TextClassificationProcessor(tokenizer=tokenizer,
                                            max_seq_len=8,
                                            data_dir="samples/doc_class",
                                            train_filename="train-sample.tsv",
                                            label_list=["OTHER", "OFFENSE"],
                                            metric="f1_macro",
                                            dev_filename="test-sample.tsv",
                                            test_filename=None,
                                            dev_split=0.0,
                                            label_column_name="coarse_label")

    data_silo = DataSilo(
        processor=processor,
        batch_size=batch_size)

    language_model = Roberta.load(lang_model)
    prediction_head = TextClassificationHead(
        layer_dims=[768, len(processor.tasks["text_classification"]["label_list"])])
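    # These test fragments stop after building the prediction head. Once a model
    # has been trained and saved (see the Trainer sketch above), FARM queries it
    # through an Inferencer. A sketch; `save_dir` is a hypothetical path standing
    # in for wherever the trained model was saved:
    from farm.infer import Inferencer

    save_dir = "testsave/doc_class_roberta"  # hypothetical path
    inferencer = Inferencer.load(save_dir)
    basic_texts = [{"text": "Some text you want to classify"}]
    result = inferencer.inference_from_dicts(dicts=basic_texts)
    print(result)
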

# --- Example: multilabel document classification (Kaggle Toxic Comments) ---
batch_size = 32
evaluate_every = 500
lang_model = "bert-base-uncased"
do_lower_case = True

# 1. Create a tokenizer
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path=lang_model,
    do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset.
#    Here we load the Kaggle Toxic Comments data.
label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
metric = "acc"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="../data/toxic-comments",
                                        label_list=label_list,
                                        label_column_name="label",
                                        metric=metric,
                                        quote_char='"',
                                        multilabel=True,
                                        train_filename="train.tsv",
                                        dev_filename="val.tsv",
                                        test_filename=None,
                                        dev_split=0)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides
#    DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)
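# For multilabel tasks the prediction head differs from the single-label
# snippets above: FARM ships a MultiLabelTextClassificationHead that scores
# each label independently (a sigmoid per label rather than a softmax across
# labels). A sketch of how this example typically continues:
from farm.modeling.prediction_head import MultiLabelTextClassificationHead

language_model = LanguageModel.load(lang_model)
prediction_head = MultiLabelTextClassificationHead(
    layer_dims=[768, len(label_list)])
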

# --- Example: document classification on CoLA (evaluated with Matthews correlation, "mcc") ---
n_epochs = 5
batch_size = 100
evaluate_every = 20
lang_model = "bert-base-cased"
do_lower_case = False

# 1. Create a tokenizer
tokenizer = Tokenizer.load(pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case)

# 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset.
#    Here we load the CoLA data.
label_list = ["0", "1"]
metric = "mcc"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/cola",
                                        dev_filename="dev.tsv",
                                        dev_split=None,
                                        test_filename=None,
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides
#    DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)

# --- Example: registering a custom metric by name (GermEval 2018 coarse task) ---
# The evaluation on the dev set can be done with one of the predefined metrics or
# with a metric defined as a function from (preds, labels) to a dict that contains
# all the actual metric values. The function must be registered under a string
# name, and that string name must then be used as the `metric` argument.
def mymetrics(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
    f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
    f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
    f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
    return {"acc": acc, "f1_other": f1other, "f1_offense": f1offense,
            "f1_macro": f1macro, "f1_micro": f1micro}

register_metrics('mymetrics', mymetrics)
metric = 'mymetrics'
label_list = ["OTHER", "OFFENSE"]
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/germeval18",
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="coarse_label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides
#    DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)
# 4. Create an AdaptiveModel
# a) which consists of a pretrained language model as a basis
language_model = LanguageModel.load(lang_model)
# b) and a prediction head on top that is suited for our task => text classification
prediction_head = TextClassificationHead(layer_dims=[768, len(label_list)])


# --- Example: a custom metric for cross-validation that also reports MCC ---
# (the head of this fragment was cut off; the definition mirrors mymetrics
# above, extended with sklearn's matthews_corrcoef)
def mymetrics(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1other = f1_score(y_true=labels, y_pred=preds, pos_label="OTHER")
    f1offense = f1_score(y_true=labels, y_pred=preds, pos_label="OFFENSE")
    f1macro = f1_score(y_true=labels, y_pred=preds, average="macro")
    f1micro = f1_score(y_true=labels, y_pred=preds, average="micro")
    mcc = matthews_corrcoef(labels, preds)
    return {"acc": acc, "f1_other": f1other, "f1_offense": f1offense,
            "f1_macro": f1macro, "f1_micro": f1micro, "mcc": mcc}
register_metrics('mymetrics', mymetrics)
metric = 'mymetrics'
# 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset.
#    Here we load the GermEval 2018 data.
# The processor wants to know the possible labels ...
label_list = ["OTHER", "OFFENSE"]
processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=64,
                                        data_dir="../data/germeval18",
                                        label_list=label_list,
                                        metric=metric,
                                        label_column_name="coarse_label")

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides
#    DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)
# Load one silo for each fold of our cross-validation
xval_folds = 5  # number of folds; this fragment did not define it
silos = DataSiloForCrossVal.make(data_silo, n_splits=xval_folds)

# The following steps should be run for each fold of the cross-validation,
# so we put them into a function (see the sketch below).
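# A minimal sketch of the per-fold loop. `train_on_split` is a stand-in: in
# FARM's cross-validation example it rebuilds the AdaptiveModel and Trainer
# against the given silo (as in the training sketch further up, with
# silo_to_use in place of data_silo), trains, evaluates the fold, and returns
# the trained model.
def train_on_split(silo_to_use, n_fold):
    # Build AdaptiveModel + Trainer for this fold's silo, then train.
    ...

for num_fold, silo in enumerate(silos):
    model = train_on_split(silo, num_fold)
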

# --- Example: multilabel classification with RoBERTa (Kaggle Toxic Comments) ---
batch_size = 32
evaluate_every = 500
lang_model = "roberta-base"

# 1. Create a tokenizer
tokenizer = RobertaTokenizer.from_pretrained(
    pretrained_model_name_or_path=lang_model)

# 2. Create a DataProcessor that handles all the conversion from raw text into a PyTorch Dataset.
#    Here we load the Kaggle Toxic Comments data.
label_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
metric = "acc"

processor = TextClassificationProcessor(tokenizer=tokenizer,
                                        max_seq_len=128,
                                        data_dir="../data/toxic-comments",
                                        label_list=label_list,
                                        label_column_name="label",
                                        metric=metric,
                                        quote_char='"',
                                        multilabel=True,
                                        train_filename="train.tsv",
                                        dev_filename="val.tsv",
                                        test_filename=None,
                                        dev_split=0)

# 3. Create a DataSilo that loads several datasets (train/dev/test), provides
#    DataLoaders for them and calculates a few descriptive statistics of our datasets
data_silo = DataSilo(
    processor=processor,
    batch_size=batch_size)
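# After training (as sketched earlier), model and processor are usually saved
# side by side so an Inferencer can reload both. A sketch; `save_dir` and the
# trained `model` are assumptions, not part of this fragment:
save_dir = "saved_models/roberta-toxic-comments"  # hypothetical path
model.save(save_dir)
processor.save(save_dir)
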

# --- Fragment: tail of TextClassificationProcessor.__init__ (docstring and setup) ---
        :type header: int
        :param proxies: proxy configuration to allow downloads of remote datasets.
                        Format as in "requests" library: https://2.python-requests.org//en/latest/user/advanced/#proxies
        :type proxies: dict
        :param kwargs: placeholder for passing generic parameters
        :type kwargs: object
        """
        # TODO: if an arg is misspelt, e.g. `metrics`, it will be swallowed silently by kwargs

        # Custom processor attributes
        self.delimiter = delimiter
        self.quote_char = quote_char
        self.skiprows = skiprows
        self.header = header

        super(TextClassificationProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies,
        )

        if metric and label_list:
            if multilabel:
                task_type = "multilabel_classification"
            else:
                task_type = "classification"
            # The fragment ended here; the registration below follows FARM's
            # implementation of this method.
            self.add_task(name="text_classification",
                          metric=metric,
                          label_list=label_list,
                          label_column_name=label_column_name,
                          task_type=task_type)