# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# NOTE(review): fragment of a PCBA-style dataset loader. The enclosing
# function header (defining `reload`, `data_dir`, `assay_file_name`,
# `featurizer`, `split`, `save_dir`) is outside this view, indentation has
# been stripped, and the final line below is cut off mid-call — this
# snippet is not runnable as-is.
if reload:
# Cache directory is derived from the assay file name, featurizer and split.
save_dir = os.path.join(
data_dir,
assay_file_name.split(".")[0] + featurizer + "/" + str(split))
dataset_file = os.path.join(data_dir, assay_file_name)
if not os.path.exists(dataset_file):
# Download the raw assay file from the DeepChem S3 bucket when missing.
deepchem.utils.download_url(
"http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/{0}".
format(assay_file_name))
# Featurize PCBA dataset
logger.info("About to featurize PCBA dataset.")
# Map the featurizer *name* onto a concrete featurizer object; note the
# string parameter is rebound to the object, and unrecognized names fall
# through leaving `featurizer` as the original string.
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
# Task names are every header column except the molecule-id and SMILES
# columns; the dataset file is gzip-compressed, so only the header line is
# read and decoded here.
with gzip.GzipFile(dataset_file, "r") as fin:
header = fin.readline().rstrip().decode("utf-8")
columns = header.split(",")
columns.remove("mol_id")
columns.remove("smiles")
PCBA_tasks = columns
if reload:
# NOTE(review): truncated — the call below is missing its argument list
# and closing parenthesis in this fragment.
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
def load_delaney(featurizer='ECFP', split='index'):
"""Load the Delaney (ESOL) aqueous-solubility dataset and featurize it.

NOTE(review): this snippet is truncated — the visible body featurizes and
y-normalizes the dataset but contains no return statement, and the
`split` parameter is unused in the visible code; confirm against the
full source.
"""
# Featurize Delaney dataset
print("About to featurize Delaney dataset.")
current_dir = os.path.dirname(os.path.realpath(__file__))
dataset_file = os.path.join(
current_dir, "../../datasets/delaney-processed.csv")
# Single regression task: measured log-solubility.
delaney_tasks = ['measured log solubility in mols per litre']
# NOTE(review): only 'ECFP' and 'GraphConv' are handled; any other value
# leaves `featurizer_func` unbound and the loader call below would raise
# NameError.
if featurizer == 'ECFP':
featurizer_func = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer_func = dc.feat.ConvMolFeaturizer()
# NOTE(review): other snippets in this file use `dc.data.CSVLoader`;
# `dc.load.DataLoader` here looks like an older API — verify.
loader = dc.load.DataLoader(
tasks=delaney_tasks, smiles_field="smiles",
featurizer=featurizer_func, verbosity = 'high')
dataset = loader.featurize(
dataset_file, shard_size=8192)
# Initialize transformers
transformers = [
dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
print("About to transform data")
# Apply y-normalization in place over the featurized dataset.
for transformer in transformers:
dataset = transformer.transform(dataset)
if split == "year":
train_datasets, valid_datasets, test_datasets = [], [], []
train_files = os.path.join(current_dir,
"year_sets/chembl_%s_ts_train.csv.gz" % set)
valid_files = os.path.join(current_dir,
"year_sets/chembl_%s_ts_valid.csv.gz" % set)
test_files = os.path.join(current_dir,
"year_sets/chembl_%s_ts_test.csv.gz" % set)
else:
dataset_path = os.path.join(
current_dir, "../datasets/chembl_%s.csv.gz" % set)
# Featurize ChEMBL dataset
print("About to featurize ChEMBL dataset.")
if featurizer == 'ECFP':
featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
tasks=chembl_tasks, smiles_field="smiles", featurizer=featurizer)
if split == "year":
print("Featurizing train datasets")
train_dataset = loader.featurize(
train_files, shard_size=shard_size)
print("Featurizing valid datasets")
valid_dataset = loader.featurize(
valid_files, shard_size=shard_size)
print("Featurizing test datasets")
# NOTE(review): fragment of a thermosol loader. `save_dir`, `data_dir`,
# `thermosol_tasks`, `THERMOSOL_URL` and the `featurizer` parameter are
# defined outside this view, and the fragment ends after the
# "Removing missing entries..." log line.
if not os.path.exists(save_dir):
logger.warning("{} does not exist. Creating one.".format(save_dir))
else:
# Reuse a previously featurized copy from disk when one exists.
logger.info("{} exists. Loading featurized datasets.".format(save_dir))
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return thermosol_tasks, all_dataset, transformers
dataset_file = os.path.join(data_dir, "thermosol.csv")
if not os.path.exists(dataset_file):
logger.info("{} does not exist. Downloading it.".format(dataset_file))
deepchem.utils.download_url(url=THERMOSOL_URL, dest_dir=data_dir)
# Map the featurizer name onto a concrete featurizer object; unrecognized
# names fall through leaving `featurizer` as the original string.
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
elif featurizer == 'AdjacencyConv':
featurizer = deepchem.feat.AdjacencyFingerprint(
max_n_atoms=150, max_valence=6)
logger.info("Featurizing datasets.")
# NOTE(review): the SMILES column here is 'smile' (singular), unlike the
# other loaders in this file — confirm that matches thermosol.csv's header.
loader = deepchem.data.CSVLoader(
tasks=thermosol_tasks, smiles_field='smile', featurizer=featurizer)
dataset = loader.featurize(input_files=[dataset_file], shard_size=2000)
logger.info("Removing missing entries...")
delim = "_CV" + delim
save_dir = os.path.join(data_dir, featurizer + delim + mode + "/" + split)
loaded, all_dataset, transformers = dcCustom.utils.save.load_cv_dataset_from_disk(
save_dir, K)
else:
save_dir = os.path.join(data_dir, featurizer + delim + mode + "/" + split)
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return tasks, all_dataset, transformers
dataset_file = os.path.join(data_dir, file_name)
if featurizer == 'Weave':
featurizer = dcCustom.feat.WeaveFeaturizer()
elif featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = dcCustom.feat.ConvMolFeaturizer()
loader = dcCustom.data.CSVLoader(
tasks = tasks, smiles_field="smiles", protein_field = "proteinName",
featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
if mode == 'regression':
transformers = [
deepchem.trans.NormalizationTransformer(
transform_y=True, dataset=dataset)
]
elif mode == 'classification':
transformers = [
deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
if split == "year":
train_datasets, valid_datasets, test_datasets = [], [], []
train_files = os.path.join(current_dir,
"year_sets/chembl_%s_ts_train.csv.gz" % set)
valid_files = os.path.join(current_dir,
"year_sets/chembl_%s_ts_valid.csv.gz" % set)
test_files = os.path.join(current_dir,
"year_sets/chembl_%s_ts_test.csv.gz" % set)
else:
dataset_path = os.path.join(
current_dir, "../../datasets/chembl_%s.csv.gz" % set)
# Featurize ChEMBL dataset
print("About to featurize ChEMBL dataset.")
if featurizer == 'ECFP':
featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
tasks=chembl_tasks, smiles_field="smiles", featurizer=featurizer)
if split == "year":
print("Featurizing train datasets")
train_dataset = loader.featurize(
train_files, shard_size=shard_size)
print("Featurizing valid datasets")
valid_dataset = loader.featurize(
valid_files, shard_size=shard_size)
print("Featurizing test datasets")
# NOTE(review): fragment of a QM9 loader. It begins inside an if/elif chain
# whose opening branch (and the outer if/else the trailing `else:` pairs
# with) is outside this view; `qm9_tasks`, `dataset_file` and `kwargs` are
# defined elsewhere.
featurizer = deepchem.feat.CoulombMatrix(29)
elif featurizer == 'BPSymmetryFunctionInput':
featurizer = deepchem.feat.BPSymmetryFunctionInput(29)
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
elif featurizer == 'MP':
featurizer = deepchem.feat.WeaveFeaturizer(
graph_distance=False, explicit_H=True)
# SDF-based path: reads molecules from the "mol" field of an SDF file.
loader = deepchem.data.SDFLoader(
tasks=qm9_tasks,
smiles_field="smiles",
mol_field="mol",
featurizer=featurizer)
else:
# CSV-based path for the SMILES-derived featurizers.
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == "smiles2img":
# Optional image-spec overrides come from **kwargs with defaults.
img_spec = kwargs.get("img_spec", "std")
img_size = kwargs.get("img_size", 80)
featurizer = deepchem.feat.SmilesToImage(
img_size=img_size, img_spec=img_spec)
loader = deepchem.data.CSVLoader(
tasks=qm9_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file)
# NOTE(review): `split == None` should idiomatically be `split is None`,
# and the ValueError carries no message — worth fixing upstream.
if split == None:
raise ValueError()
# NOTE(review): fragment of a BACE (pIC50 regression) loader. `data_dir`,
# `save_dir`, `reload`, `featurizer`, `split` and
# `bace_user_specified_features` are defined outside this view, and the
# fragment ends immediately after the `if split is None:` line.
dataset_file = os.path.join(data_dir, "bace.csv")
if not os.path.exists(dataset_file):
# Download the raw CSV from the DeepChem S3 bucket when missing.
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv'
)
# Single regression task: pIC50.
bace_tasks = ["pIC50"]
if reload:
# Return a cached featurized copy when one exists on disk.
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return bace_tasks, all_dataset, transformers
# Map the featurizer name onto a concrete featurizer object.
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
elif featurizer == 'UserDefined':
featurizer = deepchem.feat.UserDefinedFeaturizer(
bace_user_specified_features)
# BACE stores SMILES under the "mol" column rather than "smiles".
loader = deepchem.data.CSVLoader(
tasks=bace_tasks, smiles_field="mol", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
if split is None:
# Initialize transformers
# NOTE(review): fragment of an HIV (classification) loader. `data_dir`,
# `save_dir`, `reload`, `featurizer`, `split` and `kwargs` are defined
# outside this view, and the fragment ends on an unclosed
# `transformers = [` list literal.
dataset_file = os.path.join(data_dir, "HIV.csv")
if not os.path.exists(dataset_file):
# Download the raw CSV from the DeepChem S3 bucket when missing.
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/HIV.csv'
)
# Single classification task: HIV_active.
hiv_tasks = ["HIV_active"]
if reload:
# Return a cached featurized copy when one exists on disk.
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return hiv_tasks, all_dataset, transformers
# Map the featurizer name onto a concrete featurizer object.
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
elif featurizer == "smiles2img":
# Optional image-spec override comes from **kwargs with a default.
img_spec = kwargs.get("img_spec", "std")
featurizer = deepchem.feat.SmilesToImage(img_spec=img_spec)
loader = deepchem.data.CSVLoader(
tasks=hiv_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
if split is None:
# NOTE(review): truncated — the list below is missing its contents and
# closing bracket in this fragment.
transformers = [
def load_muv_ecfp():
"""Load the MUV dataset with ECFP featurization. Does not do train/test split.

NOTE(review): this snippet is truncated — the visible body featurizes and
balance-transforms the dataset but contains no return statement; confirm
against the full source.
"""
# Load MUV dataset
print("About to load MUV dataset.")
current_dir = os.path.dirname(os.path.realpath(__file__))
dataset_file = os.path.join(
current_dir, "../../datasets/muv.csv.gz")
# Featurize MUV dataset
print("About to featurize MUV dataset.")
# Fixed 1024-bit circular fingerprints (ECFP).
featurizer = dc.feat.CircularFingerprint(size=1024)
# 17 MUV sub-challenge tasks, sorted for a stable task/column order.
MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
'MUV-466', 'MUV-832'])
loader = dc.data.CSVLoader(
tasks=MUV_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file)
# Initialize transformers
transformers = [
dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
print("About to transform data")
# Re-weight examples so positive/negative classes balance per task.
for transformer in transformers:
dataset = transformer.transform(dataset)