Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
'is_duplicate'].values.tolist()
assert len(sim_questions) == len(sim_tags)
assert len(set(sim_tags)) == 1
assert next(iter(set(sim_tags))) == 1
# Save each pair of similar questions within the data class
for question, tag in zip(sim_questions, sim_tags):
if preprocess:
q1 = preprocess_sentence(question[1])
q2 = preprocess_sentence(question[2])
else:
q1 = question[1]
q2 = question[2]
if q1 and q2:
self._sim_data.append(Data(question[0], q1, q2, tag, [0, 1]))
def load_kaggle(self, preprocess):
    """Load the Kaggle question-pair CSV located at ``self._corpora_path``.

    The CSV is read into ``self._data_frame`` and its column index is
    printed. What happens next depends on ``self._partition``:

    * ``'test'``: every row of the ``test_id`` / ``question1`` /
      ``question2`` columns becomes a ``Data`` record appended to
      ``self._test_data``. A question that is not a ``str`` (presumably
      the NaN pandas yields for an empty cell) is stored as ``""``. Rows
      in which neither question is a string are not stored; only their
      id is printed.
    * anything else: the work is delegated to ``load_sim_quora`` and
      ``load_non_sim_quora``.

    Args:
        preprocess: when truthy, every question string that is present is
            passed through ``preprocess_sentence`` before being stored.
            This includes rows where the other question is missing, so
            the stored data is preprocessed uniformly.

    Raises:
        AssertionError: if the number of stored test records is not
            2345796.
    """
    self._data_frame = pd.read_csv(self._corpora_path, header=0)
    print(self._data_frame.keys())

    if self._partition != 'test':
        self.load_sim_quora(preprocess=preprocess)
        self.load_non_sim_quora(preprocess=preprocess)
        return

    def _clean(question):
        # A missing question is represented by an empty string.
        if not isinstance(question, str):
            return ""
        return preprocess_sentence(question) if preprocess else question

    self._test_data = []
    columns = ['test_id', 'question1', 'question2']
    for q_id, q1, q2 in self._data_frame[columns].values:
        if not isinstance(q1, str) and not isinstance(q2, str):
            # Nothing usable in this row: report its id and move on.
            print(q_id)
            continue
        self._test_data.append(Data(q_id, _clean(q1), _clean(q2)))

    # Sanity check on the number of loaded rows; 2345796 is presumably the
    # size of the official Kaggle test file -- confirm before using this
    # loader with any other CSV.
    assert len(self._test_data) == 2345796, (
        "unexpected number of test pairs: %d" % len(self._test_data)
    )
assert len(set(non_sim_tags)) == 1
assert next(iter(set(non_sim_tags))) == 0
# Save each pair of non similar questions within the data class
for question, tag in zip(non_sim_questions, non_sim_tags):
# There are two errors in the dataset:
# ids 105796 and 201871 do not have a complete pair of questions.
# The `if q1 and q2` check below prevents storing those rows.
if preprocess:
q1 = preprocess_sentence(question[1])
q2 = preprocess_sentence(question[2])
else:
q1 = question[1] if isinstance(question[1], str) else None
q2 = question[2] if isinstance(question[2], str) else None
if q1 and q2:
self._non_sim_data.append(Data(question[0], q1, q2, tag, [1, 0]))
def load_kaggle(self, preprocess):
    """Load the Kaggle question-pair CSV located at ``self._corpora_path``.

    The CSV is read into ``self._data_frame`` and its column index is
    printed. What happens next depends on ``self._partition``:

    * ``'test'``: every row of the ``test_id`` / ``question1`` /
      ``question2`` columns becomes a ``Data`` record appended to
      ``self._test_data``. A question that is not a ``str`` (presumably
      the NaN pandas yields for an empty cell) is stored as ``""``. Rows
      in which neither question is a string are not stored; only their
      id is printed.
    * anything else: the work is delegated to ``load_sim_quora`` and
      ``load_non_sim_quora``.

    Args:
        preprocess: when truthy, every question string that is present is
            passed through ``preprocess_sentence`` before being stored.
            This includes rows where the other question is missing, so
            the stored data is preprocessed uniformly.

    Raises:
        AssertionError: if the number of stored test records is not
            2345796.
    """
    self._data_frame = pd.read_csv(self._corpora_path, header=0)
    print(self._data_frame.keys())

    if self._partition != 'test':
        self.load_sim_quora(preprocess=preprocess)
        self.load_non_sim_quora(preprocess=preprocess)
        return

    def _clean(question):
        # A missing question is represented by an empty string.
        if not isinstance(question, str):
            return ""
        return preprocess_sentence(question) if preprocess else question

    self._test_data = []
    columns = ['test_id', 'question1', 'question2']
    for q_id, q1, q2 in self._data_frame[columns].values:
        if not isinstance(q1, str) and not isinstance(q2, str):
            # Nothing usable in this row: report its id and move on.
            print(q_id)
            continue
        self._test_data.append(Data(q_id, _clean(q1), _clean(q2)))

    # Sanity check on the number of loaded rows; 2345796 is presumably the
    # size of the official Kaggle test file -- confirm before using this
    # loader with any other CSV.
    assert len(self._test_data) == 2345796, (
        "unexpected number of test pairs: %d" % len(self._test_data)
    )
def load_kaggle(self, preprocess):
    """Load the Kaggle question-pair CSV located at ``self._corpora_path``.

    The CSV is read into ``self._data_frame`` and its column index is
    printed. What happens next depends on ``self._partition``:

    * ``'test'``: every row of the ``test_id`` / ``question1`` /
      ``question2`` columns becomes a ``Data`` record appended to
      ``self._test_data``. A question that is not a ``str`` (presumably
      the NaN pandas yields for an empty cell) is stored as ``""``. Rows
      in which neither question is a string are not stored; only their
      id is printed.
    * anything else: the work is delegated to ``load_sim_quora`` and
      ``load_non_sim_quora``.

    Args:
        preprocess: when truthy, every question string that is present is
            passed through ``preprocess_sentence`` before being stored.
            This includes rows where the other question is missing, so
            the stored data is preprocessed uniformly.

    Raises:
        AssertionError: if the number of stored test records is not
            2345796.
    """
    self._data_frame = pd.read_csv(self._corpora_path, header=0)
    print(self._data_frame.keys())

    if self._partition != 'test':
        self.load_sim_quora(preprocess=preprocess)
        self.load_non_sim_quora(preprocess=preprocess)
        return

    def _clean(question):
        # A missing question is represented by an empty string.
        if not isinstance(question, str):
            return ""
        return preprocess_sentence(question) if preprocess else question

    self._test_data = []
    columns = ['test_id', 'question1', 'question2']
    for q_id, q1, q2 in self._data_frame[columns].values:
        if not isinstance(q1, str) and not isinstance(q2, str):
            # Nothing usable in this row: report its id and move on.
            print(q_id)
            continue
        self._test_data.append(Data(q_id, _clean(q1), _clean(q2)))

    # Sanity check on the number of loaded rows; 2345796 is presumably the
    # size of the official Kaggle test file -- confirm before using this
    # loader with any other CSV.
    assert len(self._test_data) == 2345796, (
        "unexpected number of test pairs: %d" % len(self._test_data)
    )