def load_qm9(featurizer=None, split='random'):
"""Load qm9 datasets."""
# Featurize qm9 dataset
print("About to featurize qm9 dataset.")
current_dir = os.path.dirname(os.path.realpath(__file__))
dataset_file = os.path.join(current_dir, "gdb9.sdf")
if not os.path.exists(dataset_file):
os.system('sh ' + current_dir + '/get_qm9.sh')
qm9_tasks = [
"A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "cv",
"u0_atom", "u298_atom", "h298_atom", "g298_atom"
]
if featurizer is None:
featurizer = dc.feat.CoulombMatrix(29)
loader = dc.data.SDFLoader(
tasks=qm9_tasks,
smiles_field="smiles",
mol_field="mol",
featurizer=featurizer)
dataset = loader.featurize(dataset_file)
splitters = {
'index': dc.splits.IndexSplitter(),
'random': dc.splits.RandomSplitter(),
'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=11)
}
splitter = splitters[split]
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
dataset)
transformers = [
dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)
]
for transformer in transformers:
  train_dataset = transformer.transform(train_dataset)
  valid_dataset = transformer.transform(valid_dataset)
  test_dataset = transformer.transform(test_dataset)
return qm9_tasks, (train_dataset, valid_dataset, test_dataset), transformers
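# A small usage sketch (not part of the original loaders, shown for
# illustration): call load_qm9 and inspect the featurized splits. It assumes
# this module imports deepchem as `dc` and that get_qm9.sh can fetch gdb9.sdf.
def _example_load_qm9():
  tasks, (train, valid, test), transformers = load_qm9(split='random')
  # CoulombMatrix(29) pads every molecule to 29 atoms, so each sample is a
  # padded 29 x 29 Coulomb matrix.
  print(len(tasks), train.X.shape, train.y.shape)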
def load_kinase(shard_size=10000, num_shards_per_batch=4):
"""Load Kinase datasets."""
verbosity = "high"
train_files = "KINASE_training_disguised_combined_full.csv.gz"
valid_files = "KINASE_test1_disguised_combined_full.csv.gz"
test_files = "KINASE_test2_disguised_combined_full.csv.gz"
# Featurize Kinase dataset
print("About to featurize KINASE dataset.")
featurizer = dc.feat.UserDefinedFeaturizer(merck_descriptors)
KINASE_tasks = (['T_000%d' % i for i in range(13, 100)]
+ ['T_00%d' % i for i in range(100, 112)])
loader = dc.data.UserCSVLoader(
tasks=KINASE_tasks, id_field="Molecule",
featurizer=featurizer, verbosity=verbosity)
train_datasets, valid_datasets, test_datasets = [], [], []
print("Featurizing train datasets")
train_dataset = loader.featurize(
train_files,
shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)
print("Featurizing valid datasets")
valid_dataset = loader.featurize(
valid_files, shard_size=shard_size)
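# Sanity-check sketch (added for illustration, not from the original file):
# the task names built above run from T_00013 through T_00111, i.e. 99 kinase
# targets, all zero-padded to five digits.
def _example_kinase_task_names():
  kinase_tasks = (['T_000%d' % i for i in range(13, 100)] +
                  ['T_00%d' % i for i in range(100, 112)])
  assert len(kinase_tasks) == 99
  assert kinase_tasks[0] == 'T_00013' and kinase_tasks[-1] == 'T_00111'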
test_files = os.path.join(data_dir,
"KAGGLE_test2_disguised_combined_full.csv.gz")
if not os.path.exists(train_files):
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_training_disguised_combined_full.csv.gz',
dest_dir=data_dir)
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test1_disguised_combined_full.csv.gz',
dest_dir=data_dir)
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test2_disguised_combined_full.csv.gz',
dest_dir=data_dir)
# Featurize KAGGLE dataset
logger.info("About to featurize KAGGLE dataset.")
featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)
loader = deepchem.data.UserCSVLoader(
tasks=KAGGLE_tasks, id_field="Molecule", featurizer=featurizer)
logger.info("Featurizing train datasets")
train_dataset = loader.featurize(train_files, shard_size=shard_size)
logger.info("Featurizing valid datasets")
valid_dataset = loader.featurize(valid_files, shard_size=shard_size)
logger.info("Featurizing test datasets")
test_dataset = loader.featurize(test_files, shard_size=shard_size)
logger.info("Remove missing entries from datasets.")
remove_missing_entries(train_dataset)
remove_missing_entries(valid_dataset)
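# Illustrative continuation (an assumption mirroring the other loaders in this
# file, not the original code): after dropping rows with no measurements,
# normalize the regression targets of each KAGGLE split.
def _example_normalize_kaggle(train_dataset, valid_dataset, test_dataset):
  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]
  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
  return train_dataset, valid_dataset, test_dataset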
def load_dataset(dataset_file, featurizer='ECFP', split='index'):
"""Load and featurize a CSV with an 'exp' measurement column."""
tasks = ['exp']
if featurizer == 'ECFP':
featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
tasks=tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
transformers = [
dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
]
for transformer in transformers:
dataset = transformer.transform(dataset)
splitters = {
'index': dc.splits.IndexSplitter(),
'random': dc.splits.RandomSplitter()
}
splitter = splitters[split]
train, valid, test = splitter.train_valid_test_split(dataset)
return tasks, (train, valid, test), transformers
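# Usage sketch (hypothetical file name, added for illustration): featurize a
# CSV of measured values with ECFP fingerprints and take the default index
# split.
def _example_load_dataset():
  tasks, (train, valid, test), transformers = load_dataset(
      'exp_measurements.csv', featurizer='ECFP', split='index')
  return train, valid, test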
if not os.path.exists(dataset_file):
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' +
subset + "_smiles_labels.csv")
tasks = ["-logKd/Ki"]
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return tasks, all_dataset, transformers
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
loader = deepchem.data.CSVLoader(
tasks=tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
df = pd.read_csv(dataset_file)
if split is None:
transformers = [
deepchem.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
]
for transformer in transformers:
  dataset = transformer.transform(dataset)
return tasks, (dataset, None, None), transformers
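# Companion sketch for the reload pattern above (an assumption about the older
# deepchem.utils.save API, shown for illustration): once the splits and
# transformers exist, cache them so the `if reload:` branch succeeds on the
# next call.
def _example_cache_dataset(save_dir, train, valid, test, transformers):
  deepchem.utils.save.save_dataset_to_disk(
      save_dir, train, valid, test, transformers)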
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return all_nci_tasks, all_dataset, transformers
# Featurize nci dataset
logger.info("About to featurize nci dataset.")
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
loader = deepchem.data.CSVLoader(
tasks=all_nci_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=shard_size)
if split is None:
logger.info("Split is None, about to transform data")
transformers = [
deepchem.trans.NormalizationTransformer(
transform_y=True, dataset=dataset)
]
for transformer in transformers:
dataset = transformer.transform(dataset)
return all_nci_tasks, (dataset, None, None), transformers
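# Likely continuation when `split` is not None (a sketch mirroring load_qm9
# above, not taken from the original file): pick a splitter, split the NCI
# dataset, then normalize the targets of the training split.
def _example_split_nci(dataset, all_nci_tasks, split='random'):
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  transformers = [
      deepchem.trans.NormalizationTransformer(transform_y=True, dataset=train)
  ]
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)
  return all_nci_tasks, (train, valid, test), transformers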
current_dir = os.path.dirname(os.path.realpath(__file__))
# Load nci dataset
print("About to load NCI dataset.")
dataset_file1_path = os.path.join(
current_dir, "../../datasets/nci_1.csv.gz")
dataset_file2_path = os.path.join(
current_dir, "../../datasets/nci_2.csv.gz")
dataset_paths = [dataset_file1_path, dataset_file2_path]
# Featurize nci dataset
print("About to featurize nci dataset.")
if featurizer == 'ECFP':
featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = dc.feat.ConvMolFeaturizer()
all_nci_tasks = (['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',
'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',
'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205',
'HCC-2998', 'HCT-116', 'HCT-15', 'HT29', 'KM12', 'SW-620',
'SF-268', 'SF-295', 'SF-539', 'SNB-19', 'SNB-75', 'U251',
'LOX IMVI', 'MALME-3M', 'M14', 'MDA-MB-435', 'SK-MEL-2',
'SK-MEL-28', 'SK-MEL-5', 'UACC-257', 'UACC-62', 'IGR-OV1',
'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES',
'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393',
'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7',
'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549',
'T-47D'])
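# Illustrative sketch (an assumption about the intended continuation): the
# sixty cell-line tasks above make up the NCI-60 panel; both csv.gz shards are
# featurized into one dataset with the CSVLoader used elsewhere in this file.
def _example_featurize_nci(dataset_paths, featurizer, all_nci_tasks):
  assert len(all_nci_tasks) == 60
  loader = dc.data.CSVLoader(
      tasks=all_nci_tasks, smiles_field="smiles", featurizer=featurizer)
  # Older DeepChem loaders accept a list of input files in featurize().
  return loader.featurize(dataset_paths, shard_size=8192)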
labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
elif subset == "refined":
labels_file = os.path.join(pdbbind_dir, "INDEX_refined_data.2015")
elif subset == "full":
labels_file = os.path.join(pdbbind_dir, "INDEX_general_PL_data.2015")
else:
raise ValueError("Only core, refined, and full subsets supported.")
print("About to load contents.")
if not os.path.exists(labels_file):
raise ValueError("Run ../pdbbind/get_pdbbind.sh to download dataset.")
contents_df = load_pdbbind_labels(labels_file)
ids = contents_df["PDB code"].values
y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])
# Define featurizers
pocket_featurizer = dc.feat.BindingPocketFeaturizer()
ligand_featurizer = dc.feat.CircularFingerprint(size=1024)
# Featurize Dataset
all_features = []
all_labels = []
missing_pdbs = []
all_ids = []
time1 = time.time()
for ind, pdb_code in enumerate(ids):
print("Processing complex %d, %s" % (ind, str(pdb_code)))
pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
if not os.path.exists(pdb_subdir):
print("%s is missing!" % pdb_subdir)
missing_pdbs.append(pdb_subdir)
continue
features, labels = compute_binding_pocket_features(