  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset, move_mean=move_mean)
  ]
  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(
        save_folder, train_dataset, valid_dataset, test_dataset, transformers)
  return qm8_tasks, (train_dataset, valid_dataset, test_dataset), transformers
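
# Usage sketch (not from the original source): a MoleculeNet-style loader like
# the tail above is consumed as a (tasks, datasets, transformers) triple;
# dc.molnet.load_qm8 is the public entry point this fragment resembles.
import deepchem as dc

tasks, (train, valid, test), transformers = dc.molnet.load_qm8(
    featurizer='CoulombMatrix', split='random')
print(len(tasks), train.X.shape)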
    penalty_type=penalty_type,
    dropouts=dropouts,
    learning_rate=0.002,
    momentum=0.8,
    optimizer="adam",
    batch_size=24,
    conv_layers=1,
    boxsize=None,
    verbose=True,
    seed=seed)
model.fit(train_dataset, nb_epoch=10)
metric = [
    dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
    dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
]
train_evaluator = dc.utils.evaluate.Evaluator(model, train_dataset,
                                              transformers)
train_scores = train_evaluator.compute_model_performance(
    metric,
    csv_out="train_predict_ac_random.csv",
    stats_out="train_stats_ac_random.csv")
print("Train scores")
print(train_scores)
test_evaluator = dc.utils.evaluate.Evaluator(model, test_dataset, transformers)
test_scores = test_evaluator.compute_model_performance(
    metric,
    csv_out="test_predict_ac_random.csv",
    stats_out="test_stats_ac_random.csv")
print("Test scores")
print(test_scores)
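
# Sketch (assumption, reusing model/test_dataset/transformers from above):
# because y was normalized by the NormalizationTransformer, raw predictions
# come out in transformed units; dc.trans.undo_transforms maps them back.
y_pred = model.predict(test_dataset)
y_pred_orig = dc.trans.undo_transforms(y_pred, transformers)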
def split(self,
          dataset,
          seed=None,
          frac_train=.8,
          frac_valid=.1,
          frac_test=.1,
          log_every_n=None):
  """
  Splits protein-ligand pairs in PDBbind into train/validation/test in time order.
  """
  if self.year_file is None:
    try:
      data_dir = os.environ['DEEPCHEM_DATA_DIR']
      self.year_file = os.path.join(data_dir, 'pdbbind_year.csv')
      if not os.path.exists(self.year_file):
        dc.utils.download_url(
            'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/pdbbind_year.csv',
            dest_dir=data_dir)
    except KeyError:
      raise ValueError("Time description file should be specified")
  df = pd.read_csv(self.year_file, header=None)
  self.years = {}
  for i in range(df.shape[0]):
    self.years[df[0][i]] = int(df[1][i])
  np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
  num_datapoints = len(dataset)
  assert len(self.ids) == num_datapoints
  train_cutoff = int(frac_train * num_datapoints)
  valid_cutoff = int((frac_train + frac_valid) * num_datapoints)
  indices = range(num_datapoints)
  data_year = [self.years[self.ids[i]] for i in indices]
  # Sort indices by deposition year so the oldest complexes form the train
  # split, then cut at the train/valid boundaries computed above.
  new_indices = [
      pair[0] for pair in sorted(zip(indices, data_year), key=lambda x: x[1])
  ]
  return (new_indices[:train_cutoff], new_indices[train_cutoff:valid_cutoff],
          new_indices[valid_cutoff:])
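
# Usage sketch (assumption: this split() lives on deepchem's
# TimeSplitterPDBbind, which this fragment matches). As with any Splitter
# subclass, it is normally driven through train_valid_test_split:
splitter = dc.splits.TimeSplitterPDBbind(ids=dataset.ids, year_file=None)
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)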
if split == 'task':
  fold_datasets = splitter.k_fold_split(dataset, K)
  all_dataset = fold_datasets
else:
  frac_train = kwargs.get('frac_train', 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)
  train, valid, test = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  all_dataset = (train, valid, test)
if reload and split != 'task':
  # train/valid/test are only bound on the non-task path; saving the task
  # folds would need a CV-aware helper instead.
  deepchem.utils.save.save_dataset_to_disk(save_folder, train, valid, test,
                                           transformers)
return tox21_tasks, all_dataset, transformers
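
# Sketch of the reload branch's counterpart (same helper the Delaney loader
# below uses): restore previously saved splits instead of re-featurizing.
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
    save_folder)
if loaded:
  train, valid, test = all_dataset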
splitters = {
    'index': deepchem.splits.IndexSplitter(),
    'random': dcCustom.splits.RandomSplitter(
        split_cold=predict_cold,
        cold_drug=cold_drug,
        cold_target=cold_target,
        split_warm=split_warm,
        prot_seq_dict=prot_seq_dict,
        threshold=filter_threshold),
    'scaffold': deepchem.splits.ScaffoldSplitter(),
    'butina': deepchem.splits.ButinaSplitter(),
    'task': deepchem.splits.TaskSplitter()
}
splitter = splitters[split]
if test:
  train, valid, test = splitter.train_valid_test_split(dataset)
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
elif cross_validation:
  fold_datasets = splitter.k_fold_split(dataset, K)
  all_dataset = fold_datasets
  if reload:
    dcCustom.utils.save.save_cv_dataset_to_disk(save_dir, all_dataset, K,
                                                transformers)
else:
  # Not cross-validating and not testing: hold out 20% for validation.
  train, valid, test = splitter.train_valid_test_split(
      dataset, frac_valid=0.2, frac_test=0)
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
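
# Sketch (assumption): in the cross_validation branch, k_fold_split returns K
# (train, valid) dataset pairs; build_model is a hypothetical placeholder for
# whatever model the caller constructs downstream.
for fold_train, fold_valid in all_dataset:
  model = build_model()  # hypothetical factory, not from the source
  model.fit(fold_train)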
for transformer in transformers:
  test_dataset = transformer.transform(test_dataset)
regression_metric = dc.metrics.Metric(
    dc.metrics.mean_absolute_error, mode="regression")


def model_builder(model_dir):
  sklearn_model = KernelRidge(kernel="rbf", alpha=5e-4, gamma=0.008)
  return dc.models.SklearnModel(sklearn_model, model_dir)


model = dc.models.SingletaskToMultitask(tasks, model_builder, model_dir)
# Fit model
model.fit(train_dataset)
model.save()
train_evaluator = dc.utils.evaluate.Evaluator(model, train_dataset, transformers)
train_scores = train_evaluator.compute_model_performance([regression_metric])
print("Train scores [kcal/mol]")
print(train_scores)
test_evaluator = dc.utils.evaluate.Evaluator(model, test_dataset, transformers)
test_scores = test_evaluator.compute_model_performance([regression_metric])
print("Test scores [kcal/mol]")
print(test_scores)
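
# Hypothetical sketch (grid values and names are illustrative, not from the
# source): alpha and gamma are the only tunables above, and the same builder
# pattern supports a small manual sweep.
for alpha, gamma in [(1e-4, 0.008), (5e-4, 0.008), (5e-4, 0.016)]:
  def builder(model_dir, alpha=alpha, gamma=gamma):
    return dc.models.SklearnModel(
        KernelRidge(kernel="rbf", alpha=alpha, gamma=gamma), model_dir)
  candidate = dc.models.SingletaskToMultitask(tasks, builder, model_dir)
  candidate.fit(train_dataset)
  print(alpha, gamma,
        dc.utils.evaluate.Evaluator(candidate, test_dataset, transformers)
        .compute_model_performance([regression_metric]))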
               test_dir,
               data_dir,
               shard_size=2000):
  """Load KAGGLE datasets. Does not do train/test split"""
  ############################################################## TIMING
  time1 = time.time()
  ############################################################## TIMING
  # Set some global variables up top
  train_files = os.path.join(data_dir,
                             "KAGGLE_training_disguised_combined_full.csv.gz")
  valid_files = os.path.join(data_dir,
                             "KAGGLE_test1_disguised_combined_full.csv.gz")
  test_files = os.path.join(data_dir,
                            "KAGGLE_test2_disguised_combined_full.csv.gz")
  if not os.path.exists(train_files):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_training_disguised_combined_full.csv.gz',
        dest_dir=data_dir)
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test1_disguised_combined_full.csv.gz',
        dest_dir=data_dir)
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test2_disguised_combined_full.csv.gz',
        dest_dir=data_dir)
  # Featurize KAGGLE dataset
  logger.info("About to featurize KAGGLE dataset.")
  featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)
  loader = deepchem.data.UserCSVLoader(
      tasks=KAGGLE_tasks, id_field="Molecule", featurizer=featurizer)
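
  # Sketch (assumption about this DeepChem vintage's API): UserCSVLoader reads
  # the precomputed Merck descriptor columns straight from the CSVs via
  # loader.featurize, which returns a dataset per file.
  train_dataset = loader.featurize(train_files, shard_size=shard_size)
  valid_dataset = loader.featurize(valid_files, shard_size=shard_size)
  test_dataset = loader.featurize(test_files, shard_size=shard_size)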
def load_delaney(featurizer='ECFP', split='index', reload=True, move_mean=True):
  """Load Delaney datasets."""
  # Featurize Delaney dataset
  logger.info("About to featurize Delaney dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    if move_mean:
      dir_name = "delaney/" + featurizer + "/" + str(split)
    else:
      dir_name = "delaney/" + featurizer + "_mean_unmoved/" + str(split)
    save_dir = os.path.join(data_dir, dir_name)

  dataset_file = os.path.join(data_dir, "delaney-processed.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/delaney-processed.csv'
    )

  delaney_tasks = ['measured log solubility in mols per litre']
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return delaney_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
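
  # Sketch of how the function would plausibly continue (mirrors the sibling
  # MoleculeNet loaders; not shown in the fragment above): wire the chosen
  # featurizer into a CSVLoader keyed on the SMILES column.
  loader = deepchem.data.CSVLoader(
      tasks=delaney_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)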