# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Split into train/test, normalize regression targets using statistics
# fit on the training split, and define the MAE evaluation metric.
train_dataset, test_dataset = random_splitter.train_test_split(
    dataset, train_dir, test_dir, frac_train=0.8)
transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)
]
for y_normalizer in transformers:
    train_dataset = y_normalizer.transform(train_dataset)
for y_normalizer in transformers:
    test_dataset = y_normalizer.transform(test_dataset)
regression_metric = dc.metrics.Metric(
    dc.metrics.mean_absolute_error, mode="regression")
def model_builder(model_dir):
    """Wrap an RBF kernel-ridge regressor as a DeepChem SklearnModel."""
    ridge = KernelRidge(kernel="rbf", alpha=5e-4, gamma=0.008)
    return dc.models.SklearnModel(ridge, model_dir)
# Lift the per-task builder into a multitask model, train it, and report
# MAE (in kcal/mol) on both splits.
model = dc.models.SingletaskToMultitask(tasks, model_builder, model_dir)
model.fit(train_dataset)
model.save()

evaluator_on_train = dc.utils.evaluate.Evaluator(
    model, train_dataset, transformers)
train_scores = evaluator_on_train.compute_model_performance(
    [regression_metric])
print("Train scores [kcal/mol]")
print(train_scores)

evaluator_on_test = dc.utils.evaluate.Evaluator(
    model, test_dataset, transformers)
test_scores = evaluator_on_test.compute_model_performance(
    [regression_metric])
print("Validation scores [kcal/mol]")
print(test_scores)
import numpy as np
import deepchem as dc
from deepchem.molnet import load_hiv

# Fix the RNG so debugging runs are reproducible.
np.random.seed(123)

# Load the HIV dataset (1024-bit fingerprint featurization) and unpack
# its pre-made train/valid/test splits.
n_features = 1024
hiv_tasks, hiv_datasets, transformers = load_hiv()
train_dataset, valid_dataset, test_dataset = hiv_datasets

# Score with mean ROC-AUC across tasks.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

num_tasks = len(hiv_tasks)
model = dc.models.MultitaskClassifier(
    num_tasks,
    n_features,
    layer_sizes=[1000],
    dropouts=[.25],
    learning_rate=0.001,
    batch_size=50)
model.fit(train_dataset)

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Train scores")
print(train_scores)
# Hold out the last fold for testing; merge the remaining folds for training.
train_folds = fold_datasets[:-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)
test_dataset = fold_datasets[-1]

# Draw (task, support-set) pairs from the test set.
support_generator = dc.data.SupportGenerator(
    test_dataset, n_pos, n_neg, n_trials)

# Accumulate per-task scores across support-set trials.
n_tasks = len(test_dataset.get_task_names())
task_scores = {task: [] for task in range(n_tasks)}
for (task, support) in support_generator:
    # Train a fresh class-balanced random forest on the support set.
    sklearn_model = RandomForestClassifier(
        class_weight="balanced", n_estimators=100)
    model = dc.models.SklearnModel(sklearn_model)
    model.fit(support)
    # Evaluate on the task's remaining (non-support) examples.
    task_dataset = dc.data.get_task_dataset_minus_support(
        test_dataset, support, task)
    y_pred = model.predict_proba(task_dataset)
    score = metric.compute_metric(
        task_dataset.y, y_pred, task_dataset.w)
    print("Score on task %s is %s" % (str(task), str(score)))
    task_scores[task].append(score)

# Join information for all tasks.
# BUGFIX: std_task_scores was declared but never populated; fill it
# alongside the per-task means.
mean_task_scores = {}
std_task_scores = {}
for task in range(n_tasks):
    scores_arr = np.array(task_scores[task])
    mean_task_scores[task] = np.mean(scores_arr)
    std_task_scores[task] = np.std(scores_arr)
# Evaluate with both MAE and Pearson R^2.
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]

batch_size = 50       # minibatch size for the graph model
n_embedding = 20      # atom-embedding width
n_distance_bins = 100

# DTNN graph: embedding -> two interaction steps -> gather.
graph_model = dc.nn.SequentialDTNNGraph(n_distance=n_distance_bins)
graph_model.add(dc.nn.DTNNEmbedding(n_embedding=n_embedding))
for _ in range(2):
    graph_model.add(
        dc.nn.DTNNStep(n_embedding=n_embedding, n_distance=n_distance_bins))
graph_model.add(dc.nn.DTNNGather(n_embedding=n_embedding))

n_feat = n_embedding
model = dc.models.MultitaskGraphRegressor(
    graph_model,
    len(tasks),
    n_feat,
    batch_size=batch_size,
    learning_rate=0.001,
    learning_rate_decay_time=1000,
    optimizer_type="adam",
    beta1=.9,
    beta2=.999)

model.fit(train_dataset, nb_epoch=50)
print("Evaluating model")
train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)
# Report split sizes, then the feature-vector width of the training data.
split_banners = (
    ("Number of compounds in train set", train_dataset),
    ("Number of compounds in validation set", valid_dataset),
    ("Number of compounds in test set", test_dataset),
)
for banner, split in split_banners:
    print(banner)
    print(len(split))
num_features = train_dataset.get_data_shape()[0]
print("Num features: %d" % num_features)
def task_model_builder(model_dir):
    """Build one random-forest regressor, wrapped for DeepChem."""
    # max_features ~ num_features/3 is the usual heuristic for RF regression.
    forest = RandomForestRegressor(
        n_estimators=100,
        max_features=int(num_features / 3),
        min_samples_split=5,
        n_jobs=-1)
    return dc.models.SklearnModel(forest, model_dir)
# One random forest per kinase task, presented as a single multitask model.
model = dc.models.SingletaskToMultitask(KINASE_tasks, task_model_builder)

###Evaluate models###
metric = dc.metrics.Metric(
    dc.metrics.pearson_r2_score, task_averager=np.mean, mode="regression")

print("Training model")
model.fit(train_dataset)

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
#Only use for final evaluation
test_scores = model.evaluate(test_dataset, [metric], transformers)
print("Train scores")
print(train_scores)
# Symmetry-function featurization over H, C, N, O, F.
atom_number_cases = [1, 6, 7, 8, 9]
ANItransformer = dc.trans.ANITransformer(
    max_atoms=max_atoms, atom_cases=atom_number_cases)
train_dataset, valid_dataset, test_dataset = (
    ANItransformer.transform(split)
    for split in (train_dataset, valid_dataset, test_dataset))
# NOTE(review): the feature count is reduced by one here — presumably to
# drop a single column of the ANI featurization; confirm against the API.
n_feat = ANItransformer.get_num_feats() - 1

# Evaluate with both MAE and Pearson R^2.
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]
model = dc.models.ANIRegression(
    len(tasks),
    max_atoms,
    n_feat,
    layer_structures=layer_structures,
    atom_number_cases=atom_number_cases,
    batch_size=batch_size,
    learning_rate=0.001,
    use_queue=False,
    mode="regression")
model.fit(train_dataset, nb_epoch=300, checkpoint_interval=100)
print("Evaluating model")
train_scores = model.evaluate(train_dataset, metric, transformers)
valid_scores = model.evaluate(valid_dataset, metric, transformers)
n_atom_feat = 75
batch_size = 64

# Pad to the largest molecule found in any split.
max_atoms_train = max(mol.get_num_atoms() for mol in train_dataset.X)
max_atoms_valid = max(mol.get_num_atoms() for mol in valid_dataset.X)
max_atoms_test = max(mol.get_num_atoms() for mol in test_dataset.X)
max_atoms = max(max_atoms_train, max_atoms_valid, max_atoms_test)

# Reshard before the DAG transform, one split at a time.
reshard_size = 512
transformer = dc.trans.DAGTransformer(max_atoms=max_atoms)
train_dataset.reshard(reshard_size)
train_dataset = transformer.transform(train_dataset)
valid_dataset.reshard(reshard_size)
valid_dataset = transformer.transform(valid_dataset)

model = dc.models.DAGModel(
    len(permeability_tasks),
    max_atoms=max_atoms,
    n_atom_feat=n_atom_feat,
    batch_size=batch_size,
    learning_rate=1e-3,
    use_queue=False,
    mode='regression')
model.fit(train_dataset, nb_epoch=50)
print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Train scores")
print(train_scores)
# Load the QM8 dataset (the original comment said Tox21; the loader is qm8).
tasks, datasets, transformers = dc.molnet.load_qm8()
train_dataset, valid_dataset, test_dataset = datasets

metric = [dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")]

# DTNN hyperparameters.
batch_size = 50
n_embedding = 20
n_distance = 51
distance_min = -1.
distance_max = 9.2
n_hidden = 15

model = dc.models.DTNNModel(
    len(tasks),
    n_embedding=n_embedding,
    n_hidden=n_hidden,
    n_distance=n_distance,
    distance_min=distance_min,
    distance_max=distance_max,
    output_activation=False,
    batch_size=batch_size,
    learning_rate=0.0001,
    use_queue=False,
    mode="regression")
model.fit(train_dataset, nb_epoch=50)
print("Evaluating model")
# NOTE(review): fragment — these keyword arguments continue a model
# constructor call whose opening line is outside this chunk; confirm the
# receiving constructor accepts exactly this signature.
n_features=n_features,
logdir=m_dir,
layer_sizes=[1000] * n_layers,  # same width for every hidden layer
dropouts=[.25] * n_layers,
weight_init_stddevs=[.02] * n_layers,
bias_init_consts=[1.] * n_layers,
learning_rate=.0003,
penalty=.0001,
penalty_type="l2",
optimizer="adam",
batch_size=100)
# Repeat the fit/evaluate cycle num_trials times, collecting every score.
all_results = []
for _trial in range(num_trials):
    # Fresh model each trial; all scores accumulate in all_results.
    model = dc.models.SingletaskToMultitask(
        UV_tasks, task_model_builder, model_dir="UV_tf_singletask")
    print("Fitting Model")
    model.fit(train_dataset, nb_epoch=nb_epoch)
    print("Evaluating models")
    train_score, train_task_scores = model.evaluate(
        train_dataset, [metric], transformers, per_task_metrics=True)
    valid_score, valid_task_scores = model.evaluate(
        valid_dataset, [metric], transformers, per_task_metrics=True)
    test_score, test_task_scores = model.evaluate(
        test_dataset, [metric], transformers, per_task_metrics=True)
    trial_result = (train_score, train_task_scores, valid_score,
                    valid_task_scores, test_score, test_task_scores)
    all_results.append(trial_result)
# Load the Delaney (ESOL) solubility dataset and unpack its splits.
n_features = 1024
delaney_tasks, delaney_datasets, transformers = dc.molnet.load_delaney()
train_dataset, valid_dataset, test_dataset = delaney_datasets

# Score with mean Pearson R^2 across tasks.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)


def model_builder(model_dir):
    """Wrap an RBF kernel-ridge regressor as a DeepChem SklearnModel."""
    krr = KernelRidge(kernel="rbf", alpha=1e-3, gamma=0.05)
    return dc.models.SklearnModel(krr, model_dir)


model_dir = tempfile.mkdtemp()
model = dc.models.SingletaskToMultitask(delaney_tasks, model_builder, model_dir)
model.fit(train_dataset)
model.save()

print("Evaluating model")
train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Train scores")
print(train_scores)
print("Validation scores")
print(valid_scores)