    self.splitter = dc.splits.ScaffoldSplitter()
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)
else:
    # TODO: Add special handling for AVE splitter
    # Pass the seed value directly so the train/test split is reproducible.
    train_cv, test = self.splitter.train_test_split(dataset, seed=123, frac_train=train_frac)
    train_cv_pairs = self.splitter.k_fold_split(train_cv, self.num_folds)

train_valid_dsets = []
train_valid_attr = []

if self.needs_smiles():
    # Now that the DeepChem splitters have done their work, replace the SMILES strings
    # in the split dataset objects with the actual compound IDs.
    for train, valid in train_cv_pairs:
        train_attr = select_attrs_by_dset_smiles(train, attr_df, smiles_col)
        train = DiskDataset.from_numpy(train.X, train.y, ids=train_attr.index.values, verbose=False)
        valid_attr = select_attrs_by_dset_smiles(valid, attr_df, smiles_col)
        valid = DiskDataset.from_numpy(valid.X, valid.y, ids=valid_attr.index.values, verbose=False)
        train_valid_dsets.append((train, valid))
        train_valid_attr.append((train_attr, valid_attr))
    test_attr = select_attrs_by_dset_smiles(test, attr_df, smiles_col)
    test = DiskDataset.from_numpy(test.X, test.y, ids=test_attr.index.values, verbose=False)
else:
    # Otherwise just subset the ID-to-SMILES maps.
    for train, valid in train_cv_pairs:
        train_attr = select_attrs_by_dset_ids(train, attr_df)
        valid_attr = select_attrs_by_dset_ids(valid, attr_df)
        train_valid_attr.append((train_attr, valid_attr))
    train_valid_dsets = train_cv_pairs
    test_attr = select_attrs_by_dset_ids(test, attr_df)

return train_valid_dsets, test, train_valid_attr, test_attr
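# --- Illustrative sketch, not part of the original source ---
# select_attrs_by_dset_smiles() is defined elsewhere in the pipeline. A plausible
# minimal reading of what it does, assuming attr_df is indexed by compound ID and
# the split datasets currently carry SMILES strings as their ids, might look like
# the following (the _sketch suffix marks it as hypothetical):

def select_attrs_by_dset_smiles_sketch(dset, attr_df, smiles_col):
    # Keep the attribute rows whose SMILES appear among the dataset's current ids;
    # the surviving DataFrame index then supplies the compound IDs used above.
    return attr_df[attr_df[smiles_col].isin(set(dset.ids))]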
def split(self, dataset, frac_split, split_dirs=None):
    """Does the bulk of the work of splitting the dataset into two halves."""
    if split_dirs is not None:
        assert len(split_dirs) == 2
    else:
        split_dirs = [tempfile.mkdtemp(), tempfile.mkdtemp()]

    # Handle the edge case where frac_split is 1
    if frac_split == 1:
        dataset_1 = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w, dataset.ids)
        dataset_2 = None
        return dataset_1, dataset_2

    X, y, w, ids = randomize_arrays((dataset.X, dataset.y, dataset.w, dataset.ids))
    if len(y.shape) == 1:
        y = np.expand_dims(y, 1)
    if len(w.shape) == 1:
        w = np.expand_dims(w, 1)
    split_indices = self.get_task_split_indices(y, w, frac_split)

    # Create weight matrices for the two halves.
    w_1, w_2 = np.zeros_like(w), np.zeros_like(w)
    for task, split_index in enumerate(split_indices):
        # Copy weights up to the required index into the first split,
        # and the remainder into the second split.
        w_1[:split_index, task] = w[:split_index, task]
        w_2[split_index:, task] = w[split_index:, task]

    # Drop rows whose weights are all zeros in each half.
    rows_1 = w_1.any(axis=1)
    X_1, y_1, w_1, ids_1 = X[rows_1], y[rows_1], w_1[rows_1], ids[rows_1]
    dataset_1 = DiskDataset.from_numpy(X_1, y_1, w_1, ids_1)

    rows_2 = w_2.any(axis=1)
    X_2, y_2, w_2, ids_2 = X[rows_2], y[rows_2], w_2[rows_2], ids[rows_2]
    dataset_2 = DiskDataset.from_numpy(X_2, y_2, w_2, ids_2)

    return dataset_1, dataset_2
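# --- Illustrative sketch, not part of the original source ---
# A small, self-contained toy version of the per-task weight masking performed in
# split() above. get_task_split_indices() is approximated here by a hypothetical
# helper that, for each task, picks the row index below which roughly frac_split
# of that task's labelled (nonzero-weight) examples fall.

import numpy as np

def toy_task_split_indices(w, frac_split):
    # Cumulative count of labelled examples per task, compared against the target fraction.
    targets = frac_split * w.astype(bool).sum(axis=0)
    cumulative = np.cumsum(w.astype(bool), axis=0)
    return [int(np.searchsorted(cumulative[:, t], targets[t])) for t in range(w.shape[1])]

w_toy = np.array([[1., 0.], [1., 1.], [0., 1.], [1., 1.]])
toy_indices = toy_task_split_indices(w_toy, frac_split=0.5)
w_first, w_second = np.zeros_like(w_toy), np.zeros_like(w_toy)
for task, split_index in enumerate(toy_indices):
    w_first[:split_index, task] = w_toy[:split_index, task]
    w_second[split_index:, task] = w_toy[split_index:, task]
print(toy_indices, w_first.any(axis=1), w_second.any(axis=1))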
            pocket_featurizer, ligand_featurizer, pdb_subdir, pdb_code)
        if features is None:
            print("Featurization failed!")
            continue
        all_features.append(features)
        all_labels.append(labels)
        ids = np.array(["%s%d" % (pdb_code, i) for i in range(len(labels))])
        all_ids.append(ids)

    time2 = time.time()
    print("TIMING: PDBBind Pocket Featurization took %0.3f s" % (time2 - time1))
    X = np.vstack(all_features)
    y = np.concatenate(all_labels)
    w = np.ones_like(y)
    ids = np.concatenate(all_ids)
    dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids, data_dir=data_dir)
    return dataset, tasks
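# --- Illustrative sketch, not part of the original source ---
# A minimal, self-contained demonstration of the dc.data.DiskDataset.from_numpy
# call used above, with random arrays standing in for the PDBBind pocket features.

import numpy as np
import deepchem as dc

X_demo = np.random.rand(10, 16)
y_demo = np.random.rand(10, 1)
w_demo = np.ones_like(y_demo)
ids_demo = np.array(["sample%d" % i for i in range(10)])
demo_dataset = dc.data.DiskDataset.from_numpy(X_demo, y_demo, w_demo, ids_demo)
print(demo_dataset.X.shape, demo_dataset.y.shape, len(demo_dataset.ids))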
complex_num_atoms = 701
max_num_neighbors = 12
neighbor_cutoff = 12.0

train_dataset = dc.data.DiskDataset(train_dir)
test_dataset = dc.data.DiskDataset(test_dir)
pdbbind_tasks = ["-logKd/Ki"]

transformers = []
# transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
# for transformer in transformers:
#     train_dataset = transformer.transform(train_dataset)
#     test_dataset = transformer.transform(test_dataset)

# Rescale -logKd/Ki to an energy-like label: multiply by -RT (2.479 kJ/mol at
# 298 K) and divide by 4.184 kJ/kcal to express it in kcal/mol.
y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)

y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)

# Atomic numbers (atom types) used by the atomic convolution featurizer
at = [6., 7., 8., 9., 11., 12., 15., 16., 17., 20., 25., 30., 35., 53.]
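# --- Illustrative check, not part of the original source ---
# Worked example of the label rescaling above: a pKd of 7.0 becomes
# 7.0 * (-2.479 / 4.184) ≈ -4.15 kcal/mol on this scale.
print(7.0 * -1 * 2.479 / 4.184)  # ≈ -4.147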
def load_qm7_from_mat(featurizer=None, split='stratified'):
    # Download qm7.mat (Coulomb matrices X and atomization energies T) if needed.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    dataset_file = os.path.join(current_dir, "qm7.mat")
    if not os.path.exists(dataset_file):
        os.system('wget -P ' + current_dir +
                  ' http://www.quantum-machine.org/data/qm7.mat')
    dataset = scipy.io.loadmat(dataset_file)

    X = dataset['X']
    y = dataset['T']
    w = np.ones_like(y)
    dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids=None)
    print(len(dataset))

    # Load the precomputed fold indices used by the 'indice' splitter below.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    split_file = os.path.join(current_dir, "./qm7_splits.csv")
    split_indices = []
    with open(split_file, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            row_int = (np.asarray(list(map(int, row)))).tolist()
            split_indices.append(row_int)

    splitters = {
        'index': dc.splits.IndexSplitter(),
        'random': dc.splits.RandomSplitter(),
        'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1]),
test_dataset = dc.data.DiskDataset(test_dir)
pdbbind_tasks = ["-logKd/Ki"]
transformers = []

y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)

y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)

batch_size = 24
radial1 = [
    [1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5],
    [
        1.5, 2.5, 3.5, 4.5, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5, 9.0, 9.5, 10.0,
        10.5
    ],
]
radial2 = [
    [0.0, 2.0, 4.0],
train_dataset = dc.data.DiskDataset(train_dir)
test_dataset = dc.data.DiskDataset(test_dir)
transformers = []

# Convert -logKi to dG = +RT*logKi: RT = 2.479 kJ/mol at 298 K, and dividing by
# 4.184 kJ/kcal expresses the result in kcal/mol.
y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)

y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)
# Atomic convolution variables
# at = atomic numbers (atom types)
# radial basis function parameters [cutoff, mean, width]
at = [
    1., 6., 7., 8., 9., 11., 12., 15., 16., 17., 19., 20., 25., 26., 27., 28.,
    29., 30., 34., 35., 38., 48., 53., 55., 80.
]
radial = [[1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
          [0.0], [0.4]]
rp = create_symmetry_parameters(radial)
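# --- Illustrative sketch, not part of the original source ---
# create_symmetry_parameters() is defined elsewhere; a plausible stand-in, assuming
# it enumerates every [cutoff, mean, width] combination from the three lists in
# `radial` (12 cutoffs x 1 mean x 1 width = 12 parameter triples here):

from itertools import product

def create_symmetry_parameters_sketch(radial):
    # Cartesian product of the cutoff, mean, and width candidate lists.
    return [list(combo) for combo in product(*radial)]

print(len(create_symmetry_parameters_sketch(radial)))  # 12 with the radial above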