if max_len is not None:
    flag = 0
    for line in lines:
        if len(line.split()) > max_len:
            flag = 1
            break
    if flag == 1:
        continue
examples.append(lines)
out_step += 1
if (out_step % buffer == 0) and (out_step > 0):  # pre-reading the dataset, and cached...
    # examples = sorted(examples, key=lambda x: sum([len(xi.split()) for xi in x]))
    for it, example in enumerate(examples):
        yield data.Example.fromlist(example, fields)
    examples = []
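The fragment above buffers examples so that an optional per-chunk sort (the commented-out line) could be applied before yielding. A minimal self-contained sketch of the same buffered pattern; the function name, buffer size, and single-text-column fields are assumptions for illustration, not taken from the snippet:

from torchtext import data  # torchtext.legacy.data on torchtext >= 0.9

def buffered_examples(path, fields, buffer=10000, max_len=None):
    """Lazily yield one data.Example per line, flushing a buffer at a time."""
    examples = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            tokens = line.split()
            if max_len is not None and len(tokens) > max_len:
                continue  # drop over-long lines instead of truncating them
            examples.append(data.Example.fromlist([tokens], fields))
            if len(examples) >= buffer:
                yield from examples  # flush the full buffer
                examples = []
    yield from examples  # flush whatever is left at the end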
def __init__(self, path, fields, separator="\t", **kwargs):
    examples = []
    with codecs.open(path, 'r', encoding='utf-8') as input_file:
        for idx, line in enumerate(input_file):
            line = line.strip()
            if idx != 0 and len(line) != 0:  # skip the header row and blank lines
                label, _, _, sentence1, sentence2 = line.split(separator)
                columns = []
                columns.append(tokenize_line_en(sentence1))
                columns.append(tokenize_line_en(sentence2))
                columns.append([int(label)])
                examples.append(data.Example.fromlist(columns, fields))
    super(MSRPDataset, self).__init__(examples, fields, **kwargs)
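With data.Example.fromlist, the order of the columns list must match the order of the (name, Field) pairs in fields. A minimal sketch of a matching fields definition for this paraphrase-pair dataset; the field names, options, and the torchtext import path are assumptions for illustration:

from torchtext import data  # torchtext.legacy.data on torchtext >= 0.9

# Columns built above, in order: tokens of sentence1, tokens of sentence2, [label]
SENT = data.Field(sequential=True, use_vocab=True, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)  # label arrives wrapped in a list
fields = [('sentence1', SENT), ('sentence2', SENT), ('label', LABEL)]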
if examples is None:
    path = os.path.join(path, file)
    examples = []
    with open(path) as f:
        a, b = 0, 0
        for line in f.readlines():
            sentence, flag = line.strip().split(' ||| ')
            # clean the string in every sentence
            sentence = clean_str(sentence)
            # the label is the last character before the newline: 0/1 -> negative, 3/4 -> positive
            if line[-2] == '0':
                a += 1
                examples += [data.Example.fromlist([sentence, 'negative'], fields=fields)]
            elif line[-2] == '1':
                a += 1
                examples += [data.Example.fromlist([sentence, 'negative'], fields=fields)]
            elif line[-2] == '3':
                b += 1
                examples += [data.Example.fromlist([sentence, 'positive'], fields=fields)]
            elif line[-2] == '4':
                b += 1
                examples += [data.Example.fromlist([sentence, 'positive'], fields=fields)]
    print("a {} b {}".format(a, b))
super(Twitter, self).__init__(examples, fields, **kwargs)
if not args.no_augment:
    # Build lists of word indexes by POS tag
    pos_dict = build_pos_dict(sentences)
    # Generate augmented samples
    sentences = augmentation(sentences, pos_dict)
else:
    sentences = [text for text, _ in input_tsv]

# Load teacher model
model = BertForSequenceClassification.from_pretrained(args.model).to(device)
tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=True)

# Assign labels with the teacher
teacher_field = data.Field(sequential=True, tokenize=tokenizer.tokenize, lower=True,
                           include_lengths=True, batch_first=True)
fields = [("text", teacher_field)]
if not args.no_augment:
    examples = [data.Example.fromlist([" ".join(words)], fields) for words in sentences]
else:
    examples = [data.Example.fromlist([text], fields) for text in sentences]
augmented_dataset = data.Dataset(examples, fields)
teacher_field.vocab = BertVocab(tokenizer.vocab)
new_labels = BertTrainer(model, device, batch_size=args.batch_size).infer(augmented_dataset)

# Write to file
with open(args.output, "w") as f:
    f.write("sentence\tscores\n")
    for sentence, rating in zip(sentences, new_labels):
        if not args.no_augment:
            text = " ".join(sentence)
        else:
            text = sentence
        f.write("%s\t%.6f %.6f\n" % (text, *rating))
def _get_examples(self, items: list, fields: list):
    return [data.Example.fromlist(item, fields) for item in items]
    :param kwargs: Passed to the constructor of data.Dataset.
    """
    fields = [('src', field)]
    if hasattr(path, "readline"):  # special usage: stdin
        src_file = path
    else:
        src_path = os.path.expanduser(path + ext)
        src_file = open(src_path)

    examples = []
    for src_line in src_file:
        src_line = src_line.strip()
        if src_line != '':
            examples.append(data.Example.fromlist([src_line], fields))

    src_file.close()
    super(MonoDataset, self).__init__(examples, fields, **kwargs)
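A rough usage sketch for a dataset built this way. The constructor signature (path, ext, field) is inferred from the variable names used in the body above rather than stated in the snippet, and the file paths are placeholders:

src_field = data.Field(tokenize=str.split, batch_first=True, include_lengths=True)
test_data = MonoDataset(path="data/test", ext=".src", field=src_field)
src_field.build_vocab(test_data)

# plain inference over a monolingual file: no shuffling or sorting
test_iter = data.Iterator(test_data, batch_size=32, train=False, sort=False)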
def get_examples_from_file(self, path: str, fields: List[NamedField], format: str, encoding: str = 'utf-8',
                           skip_header: bool = True) -> Tuple[List[Example], List[NamedField]]:
    if format.lower() in ["csv", "tsv"]:
        sep = "," if format.lower() == "csv" else "\t"
        data = pd.read_csv(os.path.expanduser(path), encoding=encoding,
                           header=0 if skip_header else None, sep=sep)
    elif format.lower() == "json":
        data = pd.read_json(os.path.expanduser(path), encoding=encoding)
    examples = []
    for _, row in data.iterrows():
        examples.append(Example.fromlist(row.values.tolist(), fields))
    return examples, fields
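The same pandas-to-Example pattern works without the class wrapper. A self-contained sketch; the file name, column layout, and field setup are illustrative assumptions:

import pandas as pd
from torchtext import data  # torchtext.legacy.data on torchtext >= 0.9

TEXT = data.Field(lower=True, batch_first=True)
LABEL = data.LabelField()
fields = [('text', TEXT), ('label', LABEL)]  # order must match the DataFrame columns

df = pd.read_csv('reviews.tsv', sep='\t')    # assumed columns: text, label
examples = [data.Example.fromlist(row.tolist(), fields) for _, row in df.iterrows()]
dataset = data.Dataset(examples, fields)
train_set, valid_set = dataset.split(split_ratio=0.9)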
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", r" \( ", string)
string = re.sub(r"\)", r" \) ", string)
string = re.sub(r"\?", r" \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip()

text_field.preprocessing = data.Pipeline(clean_str)
fields = [('text', text_field), ('label', label_field)]
if examples is None:
    path = self.dirname if path is None else path
    examples = []
    with open(os.path.join(path, 'rt-polarity.neg')) as f:
        examples += [
            data.Example.fromlist([line, 'negative'], fields) for line in f]
    with open(os.path.join(path, 'rt-polarity.pos')) as f:
        examples += [
            data.Example.fromlist([line, 'positive'], fields) for line in f]
super(MR, self).__init__(examples, fields, **kwargs)
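Downstream, both fields still need vocabularies before batching. A short sketch of the usual follow-up, assuming train is a dataset built with these text and label fields; the batch size and min_freq value are illustrative:

text_field.build_vocab(train, min_freq=2)
label_field.build_vocab(train)

train_iter = data.BucketIterator(train, batch_size=64,
                                 sort_key=lambda ex: len(ex.text), shuffle=True)
for batch in train_iter:
    tokens, labels = batch.text, batch.label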
"""
cache_file = os.path.join(path, 'examples_cache.pk')
fields = [('text', text_field), ('label', label_field)]
if os.path.exists(cache_file):
with open(cache_file, 'rb') as fp:
examples = pickle.load(fp)
else:
examples = []
for label in ['pos', 'neg']:
for fname in glob.iglob(os.path.join(path, label, '*.txt')):
with io.open(fname, 'r', encoding="utf-8") as f:
text = f.readline()
examples.append(data.Example.fromlist([text, label], fields))
with open(cache_file, 'wb') as fp:
pickle.dump(examples, file=fp)
data.Dataset.__init__(self, examples, fields, **kwargs)
punkt_param.abbrev_types = set(['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e', 'ltd', 'jr', 'sr',
                                'co', 'st', 'ms', 'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug',
                                'sept', 'nov', 'dec'])
sentence_splitter = PunktSentenceTokenizer(punkt_param)
fields = [('text', text_field)]
if len(examples) == 0:
    examples = []
    sentences = []
    fp = open(path)
    txt = fp.read()
    # split closing quotes off sentence-final punctuation before tokenising
    txt = (txt.replace('?"', '? "').replace('!"', '! "').replace('."', '. "')
              .replace("?'", "? '").replace("!'", "! '").replace(".'", ". '")
              .replace('\n', ' '))
    sentences += sentence_splitter.tokenize(txt.lower())
    for sent in sentences[2:]:
        text = []
        text += text_field.preprocess(sent)
        text += ['']
        if 3 <= len(text) <= 19:
            # pad every kept sentence to a fixed length of 19 tokens
            examples.append(data.Example.fromlist([text + [''] * (19 - len(text))], fields))
super(EncoderDataset, self).__init__(
    examples, fields, **kwargs)
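All of the snippets above follow the same basic recipe: define a list of (name, Field) pairs, call data.Example.fromlist once per row, and hand the resulting list to data.Dataset. A self-contained minimal sketch of that recipe; the field names, in-memory rows, and import path are illustrative assumptions:

from torchtext import data  # torchtext.legacy.data on torchtext >= 0.9

TEXT = data.Field(tokenize=str.split, lower=True, batch_first=True)
LABEL = data.LabelField()
fields = [('text', TEXT), ('label', LABEL)]

rows = [["a great movie", "positive"],
        ["rather dull and slow", "negative"]]
examples = [data.Example.fromlist(row, fields) for row in rows]
dataset = data.Dataset(examples, fields)

TEXT.build_vocab(dataset)
LABEL.build_vocab(dataset)
iterator = data.BucketIterator(dataset, batch_size=2,
                               sort_key=lambda ex: len(ex.text), shuffle=True)
for batch in iterator:
    print(batch.text.shape, batch.label)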