import sys
from transformers import AutoTokenizer
dataset = sys.argv[1]
model_name_or_path = sys.argv[2]
max_len = int(sys.argv[3])
subword_len_counter = 0
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
max_len -= tokenizer.num_special_tokens_to_add()
with open(dataset, "rt") as f_p:
    for line in f_p:
        line = line.rstrip()

        if not line:
            print(line)
            subword_len_counter = 0
            continue

        token = line.split()[0]
        current_subwords_len = len(tokenizer.tokenize(token))
        # Token contains strange control characters like \x96 or \x95
        # Just filter out the complete line
        if current_subwords_len == 0:
            continue

        # Start a new example when the running subword count would exceed the budget
        if (subword_len_counter + current_subwords_len) > max_len:
            print("")
            print(line)
            subword_len_counter = current_subwords_len
            continue

        subword_len_counter += current_subwords_len
        print(line)
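For reference, num_special_tokens_to_add() is what shrinks the length budget in the script above; a quick, self-contained check (the checkpoint name here is only an example, not taken from the snippet) shows where the slack goes:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-cased")  # example checkpoint
print(tok.num_special_tokens_to_add())           # 2 -> [CLS] and [SEP] for a single sequence
print(tok.num_special_tokens_to_add(pair=True))  # 3 -> [CLS], [SEP], [SEP] for a sentence pair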
raise ValueError(
    "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite.".format(
        args.output_dir
    )
)
# Set up training device
if args.to_cpu or not torch.cuda.is_available():
    args.device = torch.device("cpu")
    args.n_gpu = 0
else:
    args.device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
# Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
config = BertConfig.from_pretrained(args.model_name_or_path)
decoder_model = BertForMaskedLM(config)
model = Model2Model.from_pretrained(
    args.model_name_or_path, decoder_model=decoder_model
)
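Model2Model and its decoder_model argument come from older transformers releases (circa 2.x) and are no longer available in current versions; the same idea, a pretrained BERT encoder paired with a randomly initialized BERT decoder, is usually expressed with EncoderDecoderModel today. A minimal sketch assuming a recent transformers install (the checkpoint name is illustrative):

from transformers import BertConfig, BertLMHeadModel, BertModel, EncoderDecoderModel

encoder = BertModel.from_pretrained("bert-base-uncased")        # pretrained encoder weights
decoder_config = BertConfig(is_decoder=True, add_cross_attention=True)
decoder = BertLMHeadModel(decoder_config)                       # randomly initialized decoder
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)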
# Setup logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.warning(
    "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
    0,
    args.device,
    args.n_gpu,
    False,  # single-process run, no distributed training in this example
    False,  # no 16-bit (fp16) training in this example
)

self.texts = texts
self.labels = labels
self.label_dict = label_dict
self.max_seq_length = max_seq_length
if self.label_dict is None and labels is not None:
    # {'class1': 0, 'class2': 1, 'class3': 2, ...}
    # using this instead of `sklearn.preprocessing.LabelEncoder`
    # to easily handle unknown target values
    self.label_dict = dict(
        zip(sorted(set(labels)), range(len(set(labels))))
    )
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
# suppresses tokenizer warnings
logging.getLogger("transformers.tokenization_utils").setLevel(logging.FATAL)
# special tokens for transformers
# in the simplest case a [CLS] token is added at the beginning
# and a [SEP] token is added at the end of a piece of text
# [CLS] <text token ids> [SEP] .. <[PAD]>
self.sep_vid = self.tokenizer.vocab["[SEP]"]
self.cls_vid = self.tokenizer.vocab["[CLS]"]
self.pad_vid = self.tokenizer.vocab["[PAD]"]
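As a quick, self-contained illustration of the special-token layout described in the comments above (the checkpoint name is only an example):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example checkpoint
ids = tokenizer.encode("a piece of text", padding="max_length", max_length=8, truncation=True)
# -> [CLS] a piece of text [SEP] [PAD] [PAD]
assert ids[0] == tokenizer.vocab["[CLS]"]
assert tokenizer.vocab["[SEP]"] in ids
assert ids[-1] == tokenizer.vocab["[PAD]"]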
@classmethod
def load(cls, language: str, device: str = "cpu") -> "TransformerLMScorer":
    if language == "en":
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        model = AutoModelWithLMHead.from_pretrained("distilgpt2")
        return cls(tokenizer, model, device)
    else:
        raise RuntimeError(f"Language {language} is not supported.")
# NOTE: ConstantLRSchedule only exists in older transformers / pytorch-transformers releases;
# recent versions expose get_constant_schedule instead.
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AdamW,
    ConstantLRSchedule,
)
super().__init__(*args, **kwargs)
model_config = AutoConfig.from_pretrained(self.model_name)
model_config.num_labels = 1 # set up for regression
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if self.device.type == "cpu":  # compare the device type, not the device object, to a string
    self.logger.info("RUNNING ON CPU")

self.rerank_model = AutoModelForSequenceClassification.from_pretrained(
    self.model_name, config=model_config
)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.rerank_model.to(self.device, non_blocking=True)
self.optimizer = AdamW(self.rerank_model.parameters(), lr=self.lr, correct_bias=False)
self.scheduler = ConstantLRSchedule(self.optimizer)
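On a current transformers install the constant-learning-rate setup above is usually built with get_constant_schedule rather than ConstantLRSchedule. A minimal sketch of the equivalent wiring (checkpoint name and learning rate are placeholders, not taken from the code above):

import torch
from transformers import AutoModelForSequenceClassification, get_constant_schedule

# Any sequence-classification checkpoint works here; num_labels=1 mirrors the regression setup above.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)

# torch.optim.AdamW plus transformers' constant schedule replaces the older
# AdamW / ConstantLRSchedule pair used in the snippet above.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
scheduler = get_constant_schedule(optimizer)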
def main(config):
    saved_data = torch.load(
        config.model_fn,
        map_location='cpu' if config.gpu_id < 0 else 'cuda:%d' % config.gpu_id
    )

    train_config = saved_data['config']
    bert_best = saved_data['bert']
    index_to_label = saved_data['classes']

    lines = read_text()

    with torch.no_grad():
        # Declare model and load pre-trained weights.
        tokenizer = AutoTokenizer.from_pretrained(train_config.pretrained_model_name)
        model = BertForSequenceClassification.from_pretrained(
            train_config.pretrained_model_name,
            num_labels=len(index_to_label)
        )
        model.load_state_dict(bert_best)

        if config.gpu_id >= 0:
            model.cuda(config.gpu_id)
        device = next(model.parameters()).device

        # Don't forget to turn on evaluation mode.
        model.eval()

        y_hats = []
        for idx in range(0, len(lines), config.batch_size):
            mini_batch = tokenizer(