Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): fragment — the opening of this dict literal (and any
# enclosing function) is above the visible region; the visible entries map
# dataset names to deepchem.molnet loading functions.
'pdbbind': dc.molnet.load_pdbbind_grid,
'qm7': dc.molnet.load_qm7_from_mat,
'sampl': dc.molnet.load_sampl,
'tox21': dc.molnet.load_tox21
}
# Load the requested dataset with an index split, then merge the three
# returned splits back into a single DiskDataset for re-splitting below.
tasks, all_dataset, transformers = loading_functions[dataset](
featurizer=featurizer, reload=reload, split='index')
all_dataset = dc.data.DiskDataset.merge(all_dataset)
# Sweep 5 seeds x 9 training fractions.  NOTE(review): loop-body
# indentation appears stripped in this paste; the statements below
# presumably belong inside both loops — confirm against the original file.
for seed in [1,2,3,4,5]:
for frac_train in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
# Fresh splitter instances for each iteration.
splitters = {
'index': dc.splits.IndexSplitter(),
'random': dc.splits.RandomSplitter(),
'scaffold': dc.splits.ScaffoldSplitter(),
'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
}
splitter = splitters[split]
np.random.seed(seed)
# Everything that is not training data goes to validation; frac_test=0.
train, valid, test = splitter.train_valid_test_split(all_dataset,
frac_train=frac_train,
frac_valid=1-frac_train,
frac_test=0.)
# Reuse the validation split as the "test" split so the benchmark
# helpers below still receive three datasets.
test = valid
if mode == 'classification':
train_score, valid_score, test_score = benchmark_classification(
train, valid, test, tasks, transformers, n_features, metric,
model, test=False, hyper_parameters=hyper_parameters, seed=seed)
elif mode == 'regression':
train_score, valid_score, test_score = benchmark_regression(
train, valid, test, tasks, transformers, n_features, metric,
model, test=False, hyper_parameters=hyper_parameters, seed=seed)
def load_pdbbind_grid(split="index", featurizer="grid", subset="core"):
  """Load a PDBBind subset and split it into train/valid/test datasets.

  Parameters
  ----------
  split: str
    One of 'index', 'random', 'scaffold' or 'stratified'; selects the
    deepchem splitter used for the train/valid/test partition.
  featurizer: str
    Featurization scheme forwarded to featurize_pdbbind (e.g. 'grid').
  subset: str
    PDBBind subset name forwarded to featurize_pdbbind (e.g. 'core').

  Returns
  -------
  tasks, (train, valid, test), transformers
    Task names, the three split datasets, and the (currently empty) list
    of transformers applied to them.

  Raises
  ------
  KeyError
    If `split` is not one of the supported splitter names.
  """
  dataset, tasks = featurize_pdbbind(feat=featurizer, subset=subset)
  # Map the split name onto a concrete deepchem splitter instance.
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
      'stratified': dc.splits.RandomStratifiedSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  # No transformers are applied at present; the single loop below replaces
  # three identical per-split loops in the original and is the one place to
  # add transformers for all splits.  (A bare no-op `dataset.w` expression
  # statement was also removed.)
  transformers = []
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)
  return tasks, (train, valid, test), transformers
def load_pdbbind_pockets(split="index", subset="core"):
"""Load PDBBind datasets. Does not do train/test split"""
# NOTE(review): body indentation appears stripped in this paste, and from
# the first `for transformer` loop onward the code is spliced with an
# unrelated BBBC002-style image-count loader fragment — do not treat this
# as one coherent function.
dataset, tasks = featurize_pdbbind_pockets(subset=subset)
# Pocket data supports only index/random splits here.
splitters = {'index': dc.splits.IndexSplitter(),
'random': dc.splits.RandomSplitter()}
splitter = splitters[split]
########################################################### DEBUG
print("dataset.X.shape")
print(dataset.X.shape)
print("dataset.y.shape")
print(dataset.y.shape)
print("dataset.w.shape")
print(dataset.w.shape)
print("dataset.ids.shape")
print(dataset.ids.shape)
########################################################### DEBUG
train, valid, test = splitter.train_valid_test_split(dataset)
transformers = []
# NOTE(review): everything below belongs to a different (image-count)
# loader; `lines`, `deepchem`, `logger`, `bbbc002_tasks`, `reload` and
# `save_dir` are not defined in this function.
for transformer in transformers:
# Format is: Image_name count1 count2
lines = [x.split("\t") for x in lines]
# Target label is the mean of the two per-image counts.
counts = [(float(x[1]) + float(x[2])) / 2.0 for x in lines]
y = np.reshape(np.array(counts), (len(counts), 1))
ids = [x[0] for x in lines]
# This is kludgy way to add y to dataset. Can be done better?
dataset = deepchem.data.DiskDataset.from_numpy(dataset.X, y, ids=ids)
if split == None:
transformers = []
logger.info("Split is None, no transformers used for the dataset.")
return bbbc002_tasks, (dataset, None, None), transformers
splitters = {
'index': deepchem.splits.IndexSplitter(),
'random': deepchem.splits.RandomSplitter(),
}
if split not in splitters:
raise ValueError("Only index and random splits supported.")
splitter = splitters[split]
logger.info("About to split dataset with {} splitter.".format(split))
train, valid, test = splitter.train_valid_test_split(dataset)
all_dataset = (train, valid, test)
transformers = []
# Optionally cache the splits to disk for reuse on reload.
if reload:
deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
transformers)
return bbbc002_tasks, all_dataset, transformers
# NOTE(review): script fragment (no enclosing def visible) that configures
# training on a Roitberg ANI dataset; `train_dir`/`valid_dir` and the
# `broadcast` helper are defined outside the visible region.
max_atoms = 23
batch_size = 64 # CHANGED FROM 16
layer_structures = [128, 128, 64]
# Atomic numbers the model handles: H, C, N, O.
atom_number_cases = [1, 6, 7, 8]
# Regression metrics: mean absolute error and Pearson R^2.
metric = [
dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]
print("Fitting new model...")
train_valid_dataset, test_dataset, all_groups = load_roiterberg_ANI(
mode="atomization")
# Group-aware random split; NOTE(review): semantics of broadcast(...) are
# not visible here — confirm it expands group labels to per-sample form.
splitter = dc.splits.RandomGroupSplitter(
broadcast(train_valid_dataset, all_groups))
print("Performing 1-fold split...")
train_dataset, valid_dataset = splitter.train_test_split(
train_valid_dataset, train_dir=train_dir, test_dir=valid_dir)
# Normalize targets using statistics computed on the training split only.
transformers = [
dc.trans.NormalizationTransformer(
transform_y=True, dataset=train_dataset)
]
print("Total training set shape: ", train_dataset.get_shape())
for transformer in transformers:
train_dataset = transformer.transform(train_dataset)
valid_dataset = transformer.transform(valid_dataset)
# NOTE(review): few-shot fragment on MUV with ConvMol features; `K` and the
# enclosing scope are not visible, and loop-body indentation appears
# stripped (the assignments after the print presumably sit outside the
# trial loop — confirm against the original file).
# num positive/negative ligands
n_pos = 10
n_neg = 10
# 20 trials on test-set (comment previously said 10; value is 20)
n_trials = 20
muv_tasks, dataset, transformers = load_muv_convmol()
# Define metric
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")
# Split by task; the last fold is held out as the test tasks.
task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)
train_folds = fold_datasets[:-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)
test_dataset = fold_datasets[-1]
# Get supports on test-set
support_generator = dc.data.SupportGenerator(test_dataset, n_pos, n_neg,
n_trials)
# Compute accuracies
task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}
for trial_num, (task, support) in enumerate(support_generator):
print("Starting trial %d" % trial_num)
# Number of features on conv-mols
n_feat = 75
# Batch size of models
batch_size = 50
# NOTE(review): few-shot baseline fragment (MUV, ECFP features) with a
# random-forest model; enclosing scope is not visible and loop-body
# indentation appears stripped.
from sklearn.ensemble import RandomForestClassifier
# 4-fold splits
K = 4
# num positive/negative ligands
n_pos = 10
n_neg = 10
# 20 trials on test-set (comment previously said 10; value is 20)
n_trials = 20
# NOTE(review): named tox21_tasks but loaded via load_muv_ecfp — likely a
# copy/paste slip carried over from a Tox21 variant of this script.
tox21_tasks, dataset, transformers = load_muv_ecfp()
# Define metric
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")
# Split by task; the last fold is held out as the test tasks.
task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)
train_folds = fold_datasets[:-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)
test_dataset = fold_datasets[-1]
# Get supports on test-set
support_generator = dc.data.SupportGenerator(
test_dataset, n_pos, n_neg, n_trials)
# Compute accuracies
task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}
for (task, support) in support_generator:
# Train model on support
sklearn_model = RandomForestClassifier(
class_weight="balanced", n_estimators=100)
# NOTE(review): tail of a QM9 loader function — the def line, plus
# `current_dir`, `dataset_file`, `featurizer` and `split`, are defined
# above the visible region.
# SECURITY NOTE: shell command assembled by string concatenation; fine for
# a trusted local path, but never pass untrusted input into current_dir.
os.system('sh ' + current_dir + '/get_qm9.sh')
# Regression targets exposed by this QM9 snippet.
qm9_tasks = [
"A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "cv",
"u0_atom", "u298_atom", "h298_atom", "g298_atom"
]
if featurizer is None:
# Coulomb matrices padded to a maximum of 29 atoms per molecule.
featurizer = dc.feat.CoulombMatrix(29)
loader = dc.data.SDFLoader(
tasks=qm9_tasks,
smiles_field="smiles",
mol_field="mol",
featurizer=featurizer)
dataset = loader.featurize(dataset_file)
# Stratified split uses task index 11 ("u0_atom" in the list above).
splitters = {
'index': dc.splits.IndexSplitter(),
'random': dc.splits.RandomSplitter(),
'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=11)
}
splitter = splitters[split]
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
dataset)
# Normalize targets with statistics from the training split only.
transformers = [
dc.trans.NormalizationTransformer(
transform_y=True, dataset=train_dataset)
]
for transformer in transformers:
train_dataset = transformer.transform(train_dataset)
valid_dataset = transformer.transform(valid_dataset)
test_dataset = transformer.transform(test_dataset)
return qm9_tasks, (train_dataset, valid_dataset, test_dataset), transformers
# NOTE(review): fragment of a drug–target loader built on a custom deepchem
# fork (dcCustom); the enclosing def plus `tasks`, `prot_seq_dict`,
# `dataset_file`, `split`, `reload`, `save_dir` and `transformers` are
# defined outside the visible region.
featurizer = dcCustom.feat.ConvMolFeaturizer()
loader = dcCustom.data.CSVLoader(
tasks = tasks, smiles_field="smiles", protein_field = "proteinName",
source_field = 'protein_dataset', featurizer=featurizer, prot_seq_dict=prot_seq_dict)
dataset = loader.featurize(dataset_file, shard_size=8192)
# print("About to transform data")
# for transformer in transformers:
# dataset = transformer.transform(dataset)
splitters = {
'index': deepchem.splits.IndexSplitter(),
'random': dcCustom.splits.RandomSplitter(),
'scaffold': deepchem.splits.ScaffoldSplitter(),
'butina': deepchem.splits.ButinaSplitter(),
'task': deepchem.splits.TaskSplitter()
}
splitter = splitters[split]
# HACK: We set frac_train to 1.0 because assume NCI60 dataset is for prediction only: there
# is no underlying truth. To predict all drug-target pairs, we need to let all samples be in
# the "training" set, though it is a misnomer.
train, valid, test = splitter.train_valid_test_split(dataset, frac_train=1.0,
frac_valid=0.0, frac_test=0)
all_dataset = (train, valid, test)
# Optionally cache the (trivial) splits to disk for reuse on reload.
if reload:
deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
transformers)
return tasks, all_dataset, transformers
# NOTE(review): few-shot baseline fragment on SIDER with ECFP features and
# a random-forest model; the enclosing scope is not visible and loop-body
# indentation appears stripped.
from datasets import load_sider_ecfp
from sklearn.ensemble import RandomForestClassifier
# 4-fold splits
K = 4
# num positive/negative ligands
n_pos = 10
n_neg = 10
# Number of support-sampling trials on the held-out tasks.
n_trials = 20
sider_tasks, dataset, transformers = load_sider_ecfp()
# Define metric
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")
# Split by task; the last fold is held out as the test tasks.
task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)
train_folds = fold_datasets[:-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)
test_dataset = fold_datasets[-1]
# Get supports on test-set
support_generator = dc.data.SupportGenerator(
test_dataset, n_pos, n_neg, n_trials)
# Compute accuracies
task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}
for (task, support) in support_generator:
# Train model on support
sklearn_model = RandomForestClassifier(
class_weight="balanced", n_estimators=100)