  metric = [dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)]
elif dataset in [
    'bace_r', 'chembl', 'clearance', 'delaney', 'hopv', 'kaggle', 'lipo',
    'nci', 'pdbbind', 'ppb', 'qm7', 'qm7b', 'qm8', 'qm9', 'sampl'
]:
  mode = 'regression'
  metric = [dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)]

# Look up the dataset/model-specific featurizer and feature count.
pair = (dataset, model)
if pair in CheckFeaturizer:
  featurizer = CheckFeaturizer[pair][0]
  n_features = CheckFeaturizer[pair][1]

tasks, all_dataset, transformers = load_dataset(
    dataset, featurizer, split='index')
all_dataset = dc.data.DiskDataset.merge(all_dataset)

# Re-split the merged dataset at each training fraction under test.
for frac_train in frac_trains:
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
      'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
  }
  splitter = splitters[split]
  np.random.seed(seed)
  train, valid, test = splitter.train_valid_test_split(
      all_dataset,
      frac_train=frac_train,
      frac_valid=1 - frac_train,
      frac_test=0.)
  # No held-out test fold is produced, so the valid fold doubles as test.
  test = valid
  if mode == 'classification':
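# A minimal sketch (not from the snippet above) of the same
# train/valid split pattern on a toy in-memory dataset; it assumes
# only that deepchem and numpy are importable. Using frac_test=0. and
# then aliasing test = valid mirrors the snippet's evaluation setup.
import numpy as np
import deepchem as dc

toy = dc.data.NumpyDataset(np.random.rand(20, 4), np.random.rand(20, 1))
splitter = dc.splits.RandomSplitter()
for frac_train in [0.6, 0.8]:
  train, valid, test = splitter.train_valid_test_split(
      toy, frac_train=frac_train, frac_valid=1 - frac_train, frac_test=0.)
  print(frac_train, len(train), len(valid), len(test))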
data_dir = deepchem.utils.get_data_dir()
data_dir = os.path.join(data_dir, "UV")
if not os.path.exists(data_dir):
  os.mkdir(data_dir)
train_dir = os.path.join(data_dir, "train_dir")
valid_dir = os.path.join(data_dir, "valid_dir")
test_dir = os.path.join(data_dir, "test_dir")
if (os.path.exists(train_dir) and os.path.exists(valid_dir) and
    os.path.exists(test_dir)):
  logger.info("Reloading existing datasets")
  train_dataset = deepchem.data.DiskDataset(train_dir)
  valid_dataset = deepchem.data.DiskDataset(valid_dir)
  test_dataset = deepchem.data.DiskDataset(test_dir)
else:
  logger.info("Featurizing datasets")
  train_dataset, valid_dataset, test_dataset = \
      gen_uv(UV_tasks=UV_tasks, data_dir=data_dir, train_dir=train_dir,
             valid_dir=valid_dir, test_dir=test_dir, shard_size=shard_size)
transformers = get_transformers(train_dataset)
return UV_tasks, (train_dataset, valid_dataset, test_dataset), transformers
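# Hypothetical helper distilling the reload-or-featurize caching
# pattern used above. The name load_or_build and the build_fn callback
# are illustrative assumptions, not DeepChem API, and it simplifies to
# one directory per call where gen_uv builds all three splits at once.
import os
import deepchem as dc

def load_or_build(split_dir, build_fn):
  """Reload a cached DiskDataset if its directory exists, else build it."""
  if os.path.exists(split_dir):
    return dc.data.DiskDataset(split_dir)
  return build_fn(split_dir)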
deepchem.utils.download_url(
    'http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets/full_grid.tar.gz'
)
deepchem.utils.download_url(
    'http://deepchem.io.s3-website-us-west-1.amazonaws.com/featurized_datasets/refined_grid.tar.gz'
)
if not os.path.exists(pdbbind_dir):
  os.mkdir(pdbbind_dir)
deepchem.utils.untargz_file(
    os.path.join(data_dir, 'core_grid.tar.gz'), pdbbind_dir)
deepchem.utils.untargz_file(
    os.path.join(data_dir, 'full_grid.tar.gz'), pdbbind_dir)
deepchem.utils.untargz_file(
    os.path.join(data_dir, 'refined_grid.tar.gz'), pdbbind_dir)
return deepchem.data.DiskDataset(dataset_dir), tasks
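# Hedged note in code form: download_url saves into DeepChem's data
# directory (deepchem.utils.get_data_dir()) when no destination is
# given, which is why the untargz_file calls above look for the
# archives under data_dir. A small existence check before unpacking:
import os
import deepchem

data_dir = deepchem.utils.get_data_dir()
for name in ('core_grid.tar.gz', 'full_grid.tar.gz', 'refined_grid.tar.gz'):
  archive = os.path.join(data_dir, name)
  if not os.path.exists(archive):
    raise FileNotFoundError('expected %s; rerun the downloads above' % archive)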
UV_tasks = (['logTIC'] + ['w__%d' % i for i in range(210, 401)])
current_dir = os.path.dirname(os.path.realpath(__file__))
raw_train_dir = os.path.join(current_dir, "raw_train_dir")
train_dir = os.path.join(current_dir, "train_dir")
valid_dir = os.path.join(current_dir, "valid_dir")
test_dir = os.path.join(current_dir, "test_dir")
if (os.path.exists(raw_train_dir) and os.path.exists(train_dir) and
    os.path.exists(valid_dir) and os.path.exists(test_dir)):
  print("Reloading existing datasets")
  raw_train_dataset = dc.data.DiskDataset(raw_train_dir)
  train_dataset = dc.data.DiskDataset(train_dir)
  valid_dataset = dc.data.DiskDataset(valid_dir)
  test_dataset = dc.data.DiskDataset(test_dir)
else:
  print("Featurizing datasets")
  (raw_train_dataset, train_dataset, valid_dataset, test_dataset) = \
      gen_uv(UV_tasks, raw_train_dir, train_dir, valid_dir, test_dir,
             shard_size=shard_size)
# Transformers are fit against the raw (untransformed) training split.
transformers = get_transformers(raw_train_dataset)
return UV_tasks, (train_dataset, valid_dataset, test_dataset), transformers
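# get_transformers is defined elsewhere in this script; a plausible
# minimal version (an assumption, not the original) would fit this
# regression setup by normalizing targets against the raw training split:
def get_transformers(train_dataset):
  return [
      dc.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]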
import os
import itertools
import time

import numpy as np
import tensorflow as tf
import deepchem as dc

# Fix random seeds for reproducibility.
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)
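# Version note: tf.set_random_seed is the TensorFlow 1.x API this
# script was written against; under TensorFlow 2.x the equivalent is
#   tf.random.set_seed(seed)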
base_dir = os.getcwd()
data_dir = os.path.join(base_dir, "datasets")
train_dir = os.path.join(data_dir, "scaffold_train")
test_dir = os.path.join(data_dir, "scaffold_test")
train_dataset = dc.data.DiskDataset(train_dir)
test_dataset = dc.data.DiskDataset(test_dir)
pdbbind_tasks = ["-logKd/Ki"]
transformers = []
# Convert -logKd/Ki values to binding free energies (kcal/mol).
#for transformer in transformers:
#  train_dataset = transformer.transform(train_dataset)
#  test_dataset = transformer.transform(test_dataset)
y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
    train_dataset.X,
    y_train,
    train_dataset.w,
    train_dataset.ids,
    tasks=pdbbind_tasks)
y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)
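# Worked note on the conversion factor above: 2.479 kJ/mol is RT at
# 298 K (R = 8.314 J/(mol*K) x 298.15 K), and 4.184 kJ = 1 kcal, so
# y *= -1 * 2.479 / 4.184 rescales the -logKd/Ki labels by RT and
# expresses the result in kcal/mol.
R = 8.314e-3   # kJ/(mol*K)
T = 298.15     # K
RT_kJ = R * T               # ~2.479 kJ/mol
KCAL_PER_KJ = 1 / 4.184
print(RT_kJ * KCAL_PER_KJ)  # ~0.593, the magnitude of the factor applied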
# Atomic numbers of the supported elements
# (C, N, O, F, Na, Mg, P, S, Cl, Ca, Mn, Zn, Br, I).
at = [6., 7., 8., 9., 11., 12., 15., 16., 17., 20., 25., 30., 35., 53.]
radial = [[
1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0, 7.5, 8.0, 8.5,
9.0, 9.5, 10.0, 10.5, 11.0, 11.5, 12.0
], [0.0, 4.0, 8.0], [0.4]]
#radial = [[12.0], [0.0, 4.0, 8.0], [0.4]]
rp = create_symmetry_parameters(radial)
layer_sizes = [32, 32, 16]
weight_init_stddevs = [
    1 / np.sqrt(layer_sizes[0]), 1 / np.sqrt(layer_sizes[1]),
    1 / np.sqrt(layer_sizes[2])
]
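# Hypothetical generalization of the hand-written init above: one
# 1/sqrt(fan_in)-style stddev per hidden layer, derived directly from
# layer_sizes so the two lists cannot drift apart.
weight_init_stddevs = [1 / np.sqrt(n) for n in layer_sizes]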
data_dir = deepchem.utils.get_data_dir()
data_dir = os.path.join(data_dir, "kaggle")
if not os.path.exists(data_dir):
  os.mkdir(data_dir)
train_dir = os.path.join(data_dir, "train_dir")
valid_dir = os.path.join(data_dir, "valid_dir")
test_dir = os.path.join(data_dir, "test_dir")
if (os.path.exists(train_dir) and os.path.exists(valid_dir) and
    os.path.exists(test_dir)):
  logger.info("Reloading existing datasets")
  train_dataset = deepchem.data.DiskDataset(train_dir)
  valid_dataset = deepchem.data.DiskDataset(valid_dir)
  test_dataset = deepchem.data.DiskDataset(test_dir)
else:
  logger.info("Featurizing datasets")
  train_dataset, valid_dataset, test_dataset = \
      gen_kaggle(KAGGLE_tasks, train_dir, valid_dir, test_dir, data_dir,
                 shard_size=shard_size)
transformers = get_transformers(train_dataset)
return KAGGLE_tasks, (train_dataset, valid_dataset,
                      test_dataset), transformers
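# Hypothetical call site for the loader above; the name load_kaggle
# and its keyword follow deepchem.molnet conventions but are
# assumptions here, not confirmed by the snippet.
KAGGLE_tasks, datasets, transformers = load_kaggle(shard_size=2048)
train_dataset, valid_dataset, test_dataset = datasets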
data_dir = os.path.join(base_dir, "datasets")
train_dir = os.path.join(data_dir, "random_train")
test_dir = os.path.join(data_dir, "random_test")
model_dir = os.path.join(base_dir, "random_model")
# Model constants
frag1_num_atoms = 153
frag2_num_atoms = 1119
complex_num_atoms = 1254
max_num_neighbors = 12
neighbor_cutoff = 12.0
# Load and transform datasets
pdbbind_tasks = ["-logKd/Ki"]
train_dataset = dc.data.DiskDataset(train_dir)
test_dataset = dc.data.DiskDataset(test_dir)
transformers = []
# convert -logKd/Ki to dG = +RT*logKd/Ki, rescaled from kJ/mol to kcal/mol
y_train = train_dataset.y
y_train *= -1 * 2.479 / 4.184
train_dataset = dc.data.DiskDataset.from_numpy(
train_dataset.X,
y_train,
train_dataset.w,
train_dataset.ids,
tasks=pdbbind_tasks)
y_test = test_dataset.y
y_test *= -1 * 2.479 / 4.184
test_dataset = dc.data.DiskDataset.from_numpy(
    test_dataset.X,
    y_test,
    test_dataset.w,
    test_dataset.ids,
    tasks=pdbbind_tasks)
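# Minimal self-contained sketch of the DiskDataset.from_numpy pattern
# used repeatedly above (toy shapes; the task label is the one from
# this script):
import numpy as np
import deepchem as dc

toy = dc.data.DiskDataset.from_numpy(
    np.zeros((4, 3)), np.ones((4, 1)), tasks=["-logKd/Ki"])
print(toy.y.shape)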
def featurize_map_function(args):
  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  ((loader, shard_size, input_type, data_dir), (shard_num, raw_df_shard)) = args
  log("Loading shard %d of size %s from file." % (shard_num + 1, str(shard_size)),
      loader.verbosity)
  log("About to featurize shard.", loader.verbosity)
  write_fn = partial(
      DiskDataset.write_dataframe, data_dir=data_dir,
      featurizer=loader.featurizer, tasks=loader.tasks,
      mol_id_field=loader.id_field, verbosity=loader.verbosity)
  ############################################################## TIMING
  shard_time1 = time.time()
  ############################################################## TIMING
  metadata_row = loader._featurize_shard(
      raw_df_shard, write_fn, shard_num, input_type)
  ############################################################## TIMING
  shard_time2 = time.time()
  log("TIMING: shard featurization took %0.3f s" % (shard_time2 - shard_time1),
      loader.verbosity)
  ############################################################## TIMING
  ############################################################## TIMING
  time2 = time.time()
  log("TIMING: featurization map function took %0.3f s" % (time2 - time1),
      loader.verbosity)
  # Hand the shard's metadata row back to the caller.
  return metadata_row
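# Hedged sketch (not from the source) of how a shard-level map
# function like this is typically driven: shard argument tuples are
# dispatched to a worker pool and the per-shard metadata rows are
# collected for the dataset's metadata file. Pool size and the
# shard_args iterable are assumptions.
from multiprocessing import Pool

def run_featurization(shard_args, processes=4):
  with Pool(processes=processes) as pool:
    return pool.map(featurize_map_function, shard_args)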