# Copyright (c) 2019 NVIDIA Corporation
__all__ = ['eval_iter_callback', 'eval_epochs_done_callback']
from nemo.utils.exp_logging import get_logger
logger = get_logger('')
def eval_iter_callback(tensors, global_vars, eval_data_layer, tag_ids):
if "correct_tags" not in global_vars.keys():
global_vars["correct_tags"] = 0
if "token_count" not in global_vars.keys():
global_vars["token_count"] = 0
if "correct_chunks" not in global_vars.keys():
global_vars["correct_chunks"] = 0
if "predicted_chunks" not in global_vars.keys():
global_vars["predicted_chunks"] = 0
if "total_chunks" not in global_vars.keys():
global_vars["total_chunks"] = 0
if "lines" not in global_vars.keys():
global_vars["lines"] = []
__all__ = ['eval_iter_callback', 'eval_epochs_done_callback']
import os
import random
import time
import matplotlib
# Select the backend before pyplot is imported; calling matplotlib.use()
# after "from matplotlib import pyplot" may not take effect.
matplotlib.use("TkAgg")
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from nemo.utils.exp_logging import get_logger
logger = get_logger('')
def tensor2list(tensor):
    return tensor.detach().cpu().tolist()
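# Usage sketch (hypothetical values, not from the original source):
# tensor2list moves a tensor to host memory and converts it to nested
# Python lists.
def _tensor2list_example():
    import torch
    preds = torch.tensor([[0, 2], [1, 1]])
    return tensor2list(preds)  # -> [[0, 2], [1, 1]]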
def eval_iter_callback(tensors, global_vars, eval_data_layer):
    if "all_intent_preds" not in global_vars.keys():
        global_vars["all_intent_preds"] = []
    if "all_intent_labels" not in global_vars.keys():
        global_vars["all_intent_labels"] = []
    if "all_slot_preds" not in global_vars.keys():
        global_vars["all_slot_preds"] = []
    if "all_slot_labels" not in global_vars.keys():
        global_vars["all_slot_labels"] = []
"""
import itertools
import os
import pickle
import random
import numpy as np
from torch.utils.data import Dataset
from nemo.utils.exp_logging import get_logger
from . import utils
logger = get_logger('')
def get_features(queries,
                 max_seq_length,
                 tokenizer,
                 punct_label_ids=None,
                 capit_label_ids=None,
                 pad_label='O',
                 punct_labels_lines=None,
                 capit_labels_lines=None,
                 ignore_extra_tokens=False,
                 ignore_start_end=False):
"""
Args:
queries (list of str): text sequences
max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP]
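# A hedged usage sketch for get_features above; the queries, label maps,
# and tokenizer are hypothetical placeholders, not values from the
# original source:
#     features = get_features(queries=["hello world how are you"],
#                             max_seq_length=128,
#                             tokenizer=tokenizer,  # e.g. a NemoBertTokenizer
#                             punct_label_ids={'O': 0, ',': 1, '.': 2, '?': 3},
#                             capit_label_ids={'O': 0, 'U': 1},
#                             pad_label='O')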
"""
Utility functions for GLUE tasks
Some parts of this code were adapted from the HuggingFace library at
https://github.com/huggingface/transformers
"""
__all__ = ['eval_iter_callback', 'eval_epochs_done_callback']
import os
import random
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from nemo.utils.exp_logging import get_logger
logger = get_logger('')
def eval_iter_callback(tensors, global_vars):
if "all_preds" not in global_vars.keys():
global_vars["all_preds"] = []
if "all_labels" not in global_vars.keys():
global_vars["all_labels"] = []
logits_lists = []
preds_lists = []
labels_lists = []
for kv, v in tensors.items():
# for GLUE classification tasks
if 'logits' in kv:
for v_tensor in v:
"""
import itertools
import os
import pickle
import random
import numpy as np
from torch.utils.data import Dataset
from nemo.utils.exp_logging import get_logger
from . import utils
logger = get_logger('')
def get_features(queries,
                 max_seq_length,
                 tokenizer,
                 pad_label='O',
                 raw_labels=None,
                 unique_labels=None,
                 ignore_extra_tokens=False,
                 ignore_start_end=False):
"""
Args:
queries (list of str):
max_seq_length (int): max sequence length minus 2 for [CLS] and [SEP]
tokenizer (Tokenizer): such as NemoBertTokenizer
pad_label (str): pad value use for labels.
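# A hedged usage sketch (hypothetical placeholders, not from the original
# source): building token-classification features from word-level labels.
#     features = get_features(queries=["she lives in kyiv"],
#                             max_seq_length=64,
#                             tokenizer=tokenizer,  # e.g. a NemoBertTokenizer
#                             pad_label='O',
#                             raw_labels=["O O O B-LOC"],
#                             unique_labels={'O', 'B-LOC'})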
import subprocess
import sys
from collections import Counter

import numpy as np
from sentencepiece import SentencePieceTrainer as SPT
from tqdm import tqdm

from nemo.utils.exp_logging import get_logger
from ...utils.nlp_utils import (get_vocab,
                                write_vocab,
                                write_vocab_in_order,
                                label2idx)
logger = get_logger('')
LOGGING_TMP = '{} dataset has already been processed and stored at {}'
def get_stats(lengths):
    lengths = np.asarray(lengths)
    logger.info(f'Min: {np.min(lengths)} | '
                f'Max: {np.max(lengths)} | '
                f'Mean: {np.mean(lengths)} | '
                f'Median: {np.median(lengths)}')
    logger.info(f'75 percentile: {np.percentile(lengths, 75)}')
    logger.info(f'99 percentile: {np.percentile(lengths, 99)}')
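# Usage sketch (hypothetical data, not from the original source): log
# length statistics for whitespace-tokenized queries, e.g. before
# choosing max_seq_length.
def _get_stats_example():
    queries = ["hello there", "how are you doing today"]
    get_stats([len(q.split()) for q in queries])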
def get_label_stats(labels, outfile='stats.tsv'):
    labels = Counter(labels)
    total = sum(labels.values())
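# get_label_stats is truncated in the source. A hedged sketch of a
# plausible continuation, assuming it writes per-label frequencies to
# outfile (the original output columns are unknown):
#     with open(outfile, 'w') as out:
#         for label, count in labels.most_common():
#             out.write(f'{label}\t{count / total}\n')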
# Copyright (c) 2019 NVIDIA Corporation
__all__ = ['eval_iter_callback', 'eval_epochs_done_callback']
import random
import numpy as np
from sklearn.metrics import classification_report
from nemo_nlp.data.datasets.utils import list2str, tensor2list
from nemo_nlp.utils.nlp_utils import plot_confusion_matrix
from nemo.utils.exp_logging import get_logger
logger = get_logger('')
def eval_iter_callback(tensors, global_vars):
if "punct_all_preds" not in global_vars.keys():
global_vars["punct_all_preds"] = []
if "punct_all_labels" not in global_vars.keys():
global_vars["punct_all_labels"] = []
if "capit_all_preds" not in global_vars.keys():
global_vars["capit_all_preds"] = []
if "capit_all_labels" not in global_vars.keys():
global_vars["capit_all_labels"] = []
if "all_subtokens_mask" not in global_vars.keys():
global_vars["all_subtokens_mask"] = []
all_subtokens_mask = []
punct_all_logits, punct_all_labels = [], []
# Some parts of this code were adapted from the HuggingFace library at
# https://github.com/huggingface/pytorch-pretrained-BERT
import itertools
import random
import numpy as np
from torch.utils.data import Dataset
from nemo.utils.exp_logging import get_logger
from . import utils
logger = get_logger('')
def get_features(queries,
                 max_seq_length,
                 tokenizer,
                 pad_label=128,
                 raw_slots=None,
                 ignore_extra_tokens=False,
                 ignore_start_end=False):
    all_subtokens = []
    all_loss_mask = []
    all_subtokens_mask = []
    all_segment_ids = []
    all_input_ids = []
    all_input_mask = []
    sent_lengths = []
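# The loop that fills the lists above is truncated in the source. A
# self-contained sketch of the usual subtoken/label alignment step; the
# tokenizer's text_to_tokens method is an assumption here, not necessarily
# NeMo's original call:
def _align_slots_to_subtokens(words, slots, tokenizer, pad_label=128):
    # The first subtoken of each word keeps the word's slot label and a
    # loss_mask of 1; continuation subtokens get pad_label and 0 so they
    # are ignored by the loss.
    subtokens, aligned_slots, loss_mask = [], [], []
    for word, slot in zip(words, slots):
        pieces = tokenizer.text_to_tokens(word)
        subtokens.extend(pieces)
        aligned_slots.extend([slot] + [pad_label] * (len(pieces) - 1))
        loss_mask.extend([1] + [0] * (len(pieces) - 1))
    return subtokens, aligned_slots, loss_mask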
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import collections
import numpy as np
import pickle
from torch.utils.data import Dataset
from nemo.utils.exp_logging import get_logger
logger = get_logger('')
class BertPunctuationDataset(Dataset):
    def __init__(self, input_file, max_seq_length, tokenizer):
        # Cache features and tag_ids
        data_dir = os.path.dirname(input_file)
        # Strip the 4-character extension (e.g. ".txt") from the filename.
        filename = os.path.basename(input_file)[:-4]
        features_pkl = os.path.join(data_dir, filename + "_features.pkl")
        tag_ids_pkl = os.path.join(data_dir, filename + "_tag_ids.pkl")
        if os.path.exists(features_pkl) and os.path.exists(tag_ids_pkl):
            # If input_file was already processed, load from pickle files
            with open(features_pkl, 'rb') as f:
                self.features = pickle.load(f)
            with open(tag_ids_pkl, 'rb') as f:
                self.tag_ids = pickle.load(f)
            logger.info(f'features restored from {features_pkl}')
            logger.info(f'tag_ids restored from {tag_ids_pkl}')
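        # The else branch is truncated in the source. A hedged sketch of
        # the write side of this caching pattern (assumed, not NeMo's
        # original code):
        #     else:
        #         features, tag_ids = ...  # build from input_file
        #         with open(features_pkl, 'wb') as f:
        #             pickle.dump(features, f)
        #         with open(tag_ids_pkl, 'wb') as f:
        #             pickle.dump(tag_ids, f)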
import os
import pickle
import numpy as np
from nemo.utils.exp_logging import get_logger
logger = get_logger('')
def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True):
"""
Reads dataset from file line by line, tokenizes each line with tokenizer,
and returns list of lists which corresponds to ids of tokenized strings.
Args:
dataset: path to dataset
tokenizer: tokenizer to convert text into ids
cache_ids: if True, ids are saved to disk as pickle file
with similar name (e.g., data.txt --> data.txt.pkl)
add_bos_eos: bool, whether to add <s> and </s> symbols (e.g., for NMT)
Returns:
ids: list of ids which correspond to tokenized strings of the dataset
"""