def test_composition(self):
    id_pipeline = data.Pipeline()
    pipeline = data.Pipeline(TestPipeline.repeat_n)
    pipeline.add_before(id_pipeline)
    pipeline.add_after(id_pipeline)
    pipeline.add_before(six.text_type.lower)
    pipeline.add_after(six.text_type.capitalize)
    other_pipeline = data.Pipeline(six.text_type.swapcase)
    other_pipeline.add_before(pipeline)

    # Assert the pipeline gives proper results after composition
    # (test that we aren't modifying the pipes member)
    assert pipeline("teST") == "Testtesttest"
    assert pipeline(["ElE1", "eLe2"]) == ["Ele1ele1ele1", "Ele2ele2ele2"]

    # Assert the pipeline that we added to also gives proper results
    assert other_pipeline("teST") == "tESTTESTTEST"
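# A minimal standalone sketch of the same composition semantics (assumes the legacy
# torchtext data.Pipeline API, where add_before prepends a step and add_after appends one;
# the "repeat" step is a hypothetical stand-in for TestPipeline.repeat_n):
from torchtext import data  # torchtext.legacy.data on newer releases

repeat = data.Pipeline(lambda s: s * 3)   # hypothetical conversion step
repeat.add_before(str.lower)              # runs before the repeat
repeat.add_after(str.capitalize)          # runs after the repeat
print(repeat("teST"))                     # -> "Testtesttest"
print(repeat(["ElE1", "eLe2"]))           # a list input is mapped element-wise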
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip()
text_field.preprocessing = data.Pipeline(clean_str)
fields = [('text', text_field), ('label', label_field)]
if examples is None:
path = None if os.path.join(path, file) is None else os.path.join(path, file)
examples = []
with open(path) as f:
a, b, c, d, e = 0, 0, 0, 0, 0
for line in f:
sentence, flag = line.strip().split(' ||| ')
if char_data is True:
sentence = sentence.split(" ")
sentence = MR.char_data(self, sentence)
# print(sentence)
# clear string in every sentence
sentence = clean_str(sentence)
if line[-2] == '0':
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip()
text_field.preprocessing = data.Pipeline(clean_str)
fields = [('text', text_field), ('label', label_field)]
if examples is None:
path = None if os.path.join(path, file) is None else os.path.join(path, file)
examples = []
with open(path) as f:
a, b = 0, 0
for line in f.readlines():
sentence, flag = line.strip().split(' ||| ')
if char_data is True:
sentence = sentence.split(" ")
sentence = MR.char_data(self, sentence)
# print(sentence)
# clear string in every sentence
sentence = clean_str(sentence)
if line[-2] == '0':
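# The loader above follows a common pattern: attach clean_str as the text Field's
# preprocessing Pipeline, then build one data.Example per input line. A hedged sketch of
# that pattern end to end (legacy torchtext data API; the file name is hypothetical, while
# the ' ||| ' separator and the 0/1 flag convention are taken from the snippet):
import os
from torchtext import data  # torchtext.legacy.data on newer releases

text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
text_field.preprocessing = data.Pipeline(clean_str)   # clean_str as defined above
fields = [('text', text_field), ('label', label_field)]

examples = []
with open(os.path.join('data', 'train.txt')) as f:    # hypothetical path
    for line in f:
        sentence, flag = line.strip().split(' ||| ')
        label = 'negative' if flag == '0' else 'positive'
        # fromlist runs each field's preprocess, so clean_str is applied here token by token
        examples.append(data.Example.fromlist([sentence, label], fields))
dataset = data.Dataset(examples, fields)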
    train_file: Train filename
    validation_file: Validation filename
    test_file: Test filename
    convert_digits: If True, convert number tokens to a single '0'
Returns:
    A dict containing:
        task: 'nyt_ingredients.ner'
        iters: (train iter, validation iter, test iter)
        vocabs: (inputs word vocabulary, inputs character vocabulary,
                 tag vocabulary)
"""
# Set up fields with the batch dimension first
inputs_word = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
                         preprocessing=data.Pipeline(
                             lambda w: '0' if convert_digits and w.isdigit() else w))

inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>",
                                 batch_first=True)

inputs_char = data.NestedField(inputs_char_nesting,
                               init_token="<bos>", eos_token="<eos>")

labels = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)

fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)),
          ('labels', labels)]

# Load the data
if use_local:
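# A toy check of the tuple field entry above: one raw column feeds two fields, so every
# token sequence is numericalized both at word level and, via NestedField, at character
# level (legacy torchtext data API assumed; the ingredient tokens are illustrative):
from torchtext import data  # torchtext.legacy.data on newer releases

word_field = data.Field(batch_first=True, lower=True)
char_field = data.NestedField(data.Field(tokenize=list, batch_first=True))
tag_field = data.Field(batch_first=True)
toy_fields = [(('word', 'char'), (word_field, char_field)), ('tag', tag_field)]

ex = data.Example.fromlist([["1", "cup", "Sugar"], ["QTY", "UNIT", "NAME"]], toy_fields)
print(ex.word)   # ['1', 'cup', 'sugar']  (the word field lowercases)
print(ex.char)   # [['1'], ['c', 'u', 'p'], ['S', 'u', 'g', 'a', 'r']]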
Arguments:
    path: Path to the data file.
    text_field: The field that will be used for text data.
    label_field: The field that will be used for label data.
    fine_grained: Whether to use the fine-grained (50-class) version of TREC
        or the coarse-grained (6-class) version.
    Remaining keyword arguments: Passed to the constructor of
        data.Dataset.
"""
fields = [('text', text_field), ('label', label_field)]
examples = []

def get_label_str(label):
    return label.split(':')[0] if not fine_grained else label
label_field.preprocessing = data.Pipeline(get_label_str)

for line in open(os.path.expanduser(path), 'rb'):
    # there is one non-ASCII byte: sisterBADBYTEcity; replaced with space
    label, _, text = line.replace(b'\xf0', b' ').decode().partition(' ')
    examples.append(data.Example.fromlist([text, label], fields))

super(TREC, self).__init__(examples, fields, **kwargs)
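# Quick check of the coarse-label Pipeline above on illustrative TREC-style labels:
coarse = data.Pipeline(lambda label: label.split(':')[0])
print(coarse("DESC:manner"))   # -> 'DESC'
print(coarse("NUM:count"))     # -> 'NUM'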
args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using the GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("Warning: CUDA is available, but you are training on the CPU")
np.random.seed(args.seed)
random.seed(args.seed)

QID = data.Field(sequential=False)
QUESTION = data.Field(batch_first=True)
ANSWER = data.Field(batch_first=True)
LABEL = data.Field(sequential=False)
EXTERNAL = data.Field(sequential=True, tensor_type=torch.FloatTensor, batch_first=True, use_vocab=False,
                      postprocessing=data.Pipeline(lambda arr, _, train: [float(y) for y in arr]))

if config.dataset == 'TREC':
    train, dev, test = TrecDataset.splits(QID, QUESTION, ANSWER, EXTERNAL, LABEL)
elif config.dataset == 'wiki':
    train, dev, test = WikiDataset.splits(QID, QUESTION, ANSWER, EXTERNAL, LABEL)
else:
    print("Unsupported dataset")
    exit()

QID.build_vocab(train, dev, test)
QUESTION.build_vocab(train, dev, test)
ANSWER.build_vocab(train, dev, test)
LABEL.build_vocab(train, dev, test)
QUESTION = set_vectors(QUESTION, args.vector_cache)
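# build_vocab above counts tokens across all three splits; the legacy Field API also
# accepts a frequency cut-off and pretrained vectors, e.g. (illustrative values):
QUESTION.build_vocab(train, dev, test, min_freq=2)                     # drop rare tokens
# QUESTION.build_vocab(train, dev, test, vectors="glove.6B.300d")      # or attach pretrained embeddings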
parser.add_argument('--fix_embeddings',
                    action='store_true',
                    help='fix word embeddings')

# Output parameters
parser.add_argument('--valid_every',
                    type=int,
                    default=128,
                    help='batch interval for running validation')
parser.add_argument('-p',
                    action='store_true',
                    help='use this flag to print samples of the data')
args = parser.parse_args()

TEXT = data.Field(sequential=True, lower=True, include_lengths=True)
LABEL = data.Field(sequential=False, use_vocab=False, tensor_type=torch.FloatTensor,
                   postprocessing=data.Pipeline(lambda x, y: float(x)))

if args.valid_only:
    train_name = 'valid.tsv'
else:
    train_name = 'disc_train.tsv'

print('Reading the data')
train, valid = data.TabularDataset.splits(
    path=args.data_dir,
    train=train_name, validation='valid.tsv',
    format='tsv',
    fields=[
        ('context', TEXT),
        ('generated', TEXT),
        ('gold', TEXT),
    ])
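# A hedged sketch of consuming the TabularDataset built above: build the vocabulary, then
# batch with a BucketIterator (batch size is illustrative; legacy torchtext API assumed):
TEXT.build_vocab(train)
train_iter, valid_iter = data.BucketIterator.splits(
    (train, valid), batch_size=32, sort_key=lambda ex: len(ex.context))
for batch in train_iter:
    context, context_lengths = batch.context   # include_lengths=True yields (padded tensor, lengths)
    break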
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip()
text_field.preprocessing = data.Pipeline(clean_str)
fields = [('text', text_field), ('label', label_field)]
if examples is None:
path = self.dirname if path is None else path
examples = []
with open(os.path.join(path, 'rt-polarity.neg')) as f:
examples += [
data.Example.fromlist([line, 'negative'], fields) for line in f]
with open(os.path.join(path, 'rt-polarity.pos')) as f:
examples += [
data.Example.fromlist([line, 'positive'], fields) for line in f]
super(MR, self).__init__(examples, fields, **kwargs)
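# Note: a Field applies its preprocessing Pipeline after tokenization, so in these loaders
# clean_str runs on each whitespace token rather than on the whole line. A quick check
# (legacy torchtext API assumed, reusing the clean_str defined above):
field = data.Field(lower=True)
field.preprocessing = data.Pipeline(clean_str)
print(field.preprocess("It's a good movie, isn't it"))
# -> ["it 's", 'a', 'good', 'movie ,', "is n't", 'it']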
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip()
text_field.preprocessing = data.Pipeline(clean_str)
fields = [('text', text_field), ('label', label_field)]
if examples is None:
path = None if os.path.join(path, file) is None else os.path.join(path, file)
print("loading {}... ".format(path))
examples = []
with open(path) as f:
for line in f.readlines():
if line[-2] == '0':
examples += [data.Example.fromlist([line[:line.find('|')], 'negative'], fields=fields)]
elif line[-2] == '1':
examples += [data.Example.fromlist([line[:line.find('|')], 'positive'], fields=fields)]
super(MR, self).__init__(examples, fields, **kwargs)
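# Once an MR-style dataset is built this way, the usual next step is a random train/dev
# split followed by vocabulary construction (the dataset variable, split ratio and use of
# the global random state are illustrative; legacy torchtext API assumed):
import random
train_data, dev_data = dataset.split(split_ratio=0.9, random_state=random.getstate())
text_field.build_vocab(train_data, dev_data)
label_field.build_vocab(train_data, dev_data)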