import logging
import os

import deepchem
import deepchem as dc

logger = logging.getLogger(__name__)


# Generic single-task loader: featurize a CSV of SMILES, normalize the labels,
# and split into train/valid/test.
def load_dataset(dataset_file, featurizer='ECFP', split='index'):
  tasks = ['exp']
  if featurizer == 'ECFP':
    featurizer = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = dc.feat.ConvMolFeaturizer()
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)
  transformers = [
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
  ]
  for transformer in transformers:
    dataset = transformer.transform(dataset)
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  return tasks, (train, valid, test), transformers
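
# Usage sketch: assumes a local "delaney.csv" with a "smiles" column and an
# "exp" label column (the file name is illustrative, not from the original
# snippet).
tasks, (train, valid, test), transformers = load_dataset(
    "delaney.csv", featurizer='GraphConv', split='scaffold')
print(train.X.shape, valid.X.shape, test.X.shape)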

# NCI-60 growth-inhibition loader (fragment). The enclosing function's
# signature is truncated in this snippet; `dataset_paths`, `shard_size`, and
# `split` are assumed to be supplied by it.
featurizer = dc.feat.ConvMolFeaturizer()
all_nci_tasks = ['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',
                 'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',
                 'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205',
                 'HCC-2998', 'HCT-116', 'HCT-15', 'HT29', 'KM12', 'SW-620',
                 'SF-268', 'SF-295', 'SF-539', 'SNB-19', 'SNB-75', 'U251',
                 'LOX IMVI', 'MALME-3M', 'M14', 'MDA-MB-435', 'SK-MEL-2',
                 'SK-MEL-28', 'SK-MEL-5', 'UACC-257', 'UACC-62', 'IGR-OV1',
                 'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES',
                 'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393',
                 'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7',
                 'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549',
                 'T-47D']
loader = dc.data.CSVLoader(
    tasks=all_nci_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_paths, shard_size=shard_size)
# Initialize transformers
print("About to transform data")
transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
for transformer in transformers:
  dataset = transformer.transform(dataset)
splitters = {'index': dc.splits.IndexSplitter(),
             'random': dc.splits.RandomSplitter(),
             'scaffold': dc.splits.ScaffoldSplitter()}
splitter = splitters[split]
print("Performing new split.")
train, valid, test = splitter.train_valid_test_split(dataset)

# QM8 loader (fragment). The head of this branch is truncated upstream; it
# builds an SDF-based loader for the Coulomb-matrix featurization and assumes
# `qm8_tasks` from the enclosing function.
if featurizer == 'CoulombMatrix':
  loader = deepchem.data.SDFLoader(
      tasks=qm8_tasks,
      smiles_field="smiles",
      mol_field="mol",
      featurizer=featurizer)
else:
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)
  loader = deepchem.data.CSVLoader(
      tasks=qm8_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file)
if split is None:
  raise ValueError("A split method must be specified for QM8.")
splitters = {
    'index': deepchem.splits.IndexSplitter(),
    'random': deepchem.splits.RandomSplitter(),
    'stratified': deepchem.splits.SingletaskStratifiedSplitter(task_number=0)
}
splitter = splitters[split]
frac_train = kwargs.get("frac_train", 0.8)
frac_valid = kwargs.get("frac_valid", 0.1)
frac_test = kwargs.get("frac_test", 0.1)
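
# The fractions above are forwarded to the splitter; this is the standard
# deepchem call that would typically follow in the full loader:
train, valid, test = splitter.train_valid_test_split(
    dataset,
    frac_train=frac_train,
    frac_valid=frac_valid,
    frac_test=frac_test)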

# PCBA loader (fragment). The front of the assay-ID list is truncated
# upstream; `featurizer`, `dataset_file`, and `split` are assumed from the
# enclosing function.
PCBA_tasks = [
    # ... (earlier assay IDs truncated upstream) ...
    'PCBA-504845', 'PCBA-504847', 'PCBA-504891', 'PCBA-540276', 'PCBA-540317',
    'PCBA-588342', 'PCBA-588453', 'PCBA-588456', 'PCBA-588579', 'PCBA-588590',
    'PCBA-588591', 'PCBA-588795', 'PCBA-588855', 'PCBA-602179', 'PCBA-602233',
    'PCBA-602310', 'PCBA-602313', 'PCBA-602332', 'PCBA-624170', 'PCBA-624171',
    'PCBA-624173', 'PCBA-624202', 'PCBA-624246', 'PCBA-624287', 'PCBA-624288',
    'PCBA-624291', 'PCBA-624296', 'PCBA-624297', 'PCBA-624417', 'PCBA-651635',
    'PCBA-651644', 'PCBA-651768', 'PCBA-651965', 'PCBA-652025', 'PCBA-652104',
    'PCBA-652105', 'PCBA-652106', 'PCBA-686970', 'PCBA-686978', 'PCBA-686979',
    'PCBA-720504', 'PCBA-720532', 'PCBA-720542', 'PCBA-720551', 'PCBA-720553',
    'PCBA-720579', 'PCBA-720580', 'PCBA-720707', 'PCBA-720708', 'PCBA-720709',
    'PCBA-720711', 'PCBA-743255', 'PCBA-743266', 'PCBA-875', 'PCBA-881',
    'PCBA-883', 'PCBA-884', 'PCBA-885', 'PCBA-887', 'PCBA-891', 'PCBA-899',
    'PCBA-902', 'PCBA-903', 'PCBA-904', 'PCBA-912', 'PCBA-914', 'PCBA-915',
    'PCBA-924', 'PCBA-925', 'PCBA-926', 'PCBA-927', 'PCBA-938', 'PCBA-995']
loader = dc.data.CSVLoader(
    tasks=PCBA_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file)
# Initialize transformers
transformers = [
    dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
print("About to transform data")
for transformer in transformers:
  dataset = transformer.transform(dataset)
splitters = {'index': dc.splits.IndexSplitter(),
             'random': dc.splits.RandomSplitter(),
             'scaffold': dc.splits.ScaffoldSplitter()}
splitter = splitters[split]
print("Performing new split.")
train, valid, test = splitter.train_valid_test_split(dataset)

# Sweetlead loader (fragment); assumes `data_dir`, `SWEET_tasks`,
# `SWEETLEAD_URL`, `kwargs`, and `split` from the enclosing function.
# Featurize SWEET dataset
logger.info("About to featurize SWEET dataset.")
if featurizer == 'ECFP':
  featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == "smiles2img":
  img_spec = kwargs.get("img_spec", "std")
  img_size = kwargs.get("img_size", 80)
  featurizer = deepchem.feat.SmilesToImage(
      img_size=img_size, img_spec=img_spec)
else:
  raise ValueError("Other featurizations not supported")
dataset_file = os.path.join(data_dir, "sweet.csv.gz")
if not os.path.exists(dataset_file):
  dc.utils.download_url(SWEETLEAD_URL)
loader = dc.data.CSVLoader(
    tasks=SWEET_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file)
# Initialize transformers
transformers = [
    dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
]
logger.info("About to transform data")
for transformer in transformers:
  dataset = transformer.transform(dataset)
if split is None:
  return SWEET_tasks, (dataset, None, None), transformers
splitters = {
    'index': dc.splits.IndexSplitter(),
    # standard splitters, as in the loaders above (snippet truncated here)
    'random': dc.splits.RandomSplitter(),
    'scaffold': dc.splits.ScaffoldSplitter()
}
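
# When split is None these loaders return the full featurized dataset in the
# first slot of the tuple. Usage sketch (the enclosing function's name is not
# shown in the fragment; `load_sweetlead` is an assumed name):
tasks, (full_dataset, _, _), transformers = load_sweetlead(split=None)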

# PPB (plasma protein binding) loader (fragment); assumes `reload`,
# `save_dir`, `PPB_tasks`, `dataset_file`, and `split` from the enclosing
# function.
if reload:
  loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
      save_dir)
  if loaded:
    return PPB_tasks, all_dataset, transformers
if featurizer == 'ECFP':
  featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
  featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
  featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
  featurizer = deepchem.feat.RawFeaturizer()
loader = deepchem.data.CSVLoader(
    tasks=PPB_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
if split is None:
  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=dataset)
  ]
  logger.info("Split is None, about to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)
  return PPB_tasks, (dataset, None, None), transformers
splitters = {
    # standard splitters, as in the loaders above (snippet truncated here)
    'index': deepchem.splits.IndexSplitter(),
    'random': deepchem.splits.RandomSplitter(),
    'scaffold': deepchem.splits.ScaffoldSplitter()
}

def load_tox21(featurizer='ECFP', split='index'):
  """Load Tox21 dataset, balance per-task class weights, and split."""
  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(current_dir, "../datasets/tox21.csv.gz")
  tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
                 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
                 'SR-HSE', 'SR-MMP', 'SR-p53']
  if featurizer == 'ECFP':
    featurizer_func = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer_func = dc.feat.ConvMolFeaturizer()
  loader = dc.data.CSVLoader(
      tasks=tox21_tasks, smiles_field="smiles", featurizer=featurizer_func)
  dataset = loader.featurize(dataset_file, shard_size=8192)
  # Initialize transformers
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)
  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter(),
               'butina': dc.splits.ButinaSplitter()}
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  return tox21_tasks, (train, valid, test), transformers
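
# Usage sketch: scaffold splitting keeps molecules that share a Bemis-Murcko
# scaffold in the same fold, which gives a more realistic generalization test
# than a random split.
tasks, (train, valid, test), transformers = load_tox21(
    featurizer='GraphConv', split='scaffold')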

# ChEMBL loader (fragment); assumes `set` (the year-set identifier, which
# shadows the builtin in the original), `current_dir`, `chembl_tasks`,
# `shard_size`, and `split` from the enclosing function. The head of the
# "year" branch is truncated upstream.
if split == "year":
  # (train_files assignment truncated upstream)
  valid_files = os.path.join(current_dir,
                             "year_sets/chembl_%s_ts_valid.csv.gz" % set)
  test_files = os.path.join(current_dir,
                            "year_sets/chembl_%s_ts_test.csv.gz" % set)
else:
  dataset_path = os.path.join(
      current_dir, "../../datasets/chembl_%s.csv.gz" % set)
# Featurize ChEMBL dataset
print("About to featurize ChEMBL dataset.")
if featurizer == 'ECFP':
  featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
  featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
    tasks=chembl_tasks, smiles_field="smiles", featurizer=featurizer)
if split == "year":
  print("Featurizing train datasets")
  train_dataset = loader.featurize(train_files, shard_size=shard_size)
  print("Featurizing valid datasets")
  valid_dataset = loader.featurize(valid_files, shard_size=shard_size)
  print("Featurizing test datasets")
  test_dataset = loader.featurize(test_files, shard_size=shard_size)
else:
  dataset = loader.featurize(dataset_path, shard_size=shard_size)
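
# With the "year" split the three sets are featurized separately, so no
# splitter call follows; transformers are typically fit on the training set
# and applied to all three. Sketch (the transformer choice is an assumption):
transformers = [
    dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)
]
for transformer in transformers:
  train_dataset = transformer.transform(train_dataset)
  valid_dataset = transformer.transform(valid_dataset)
  test_dataset = transformer.transform(test_dataset)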

# hPPB (human plasma protein binding) loader (fragment); assumes `hppb_URL`,
# `data_dir`, `hppb_tasks`, `dataset_file`, `remove_missing_entries`, and
# `split` from the enclosing function.
deepchem.utils.download_url(url=hppb_URL, dest_dir=data_dir)
if featurizer == 'ECFP':
  featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
  featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
  featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
  featurizer = deepchem.feat.RawFeaturizer()
elif featurizer == 'AdjacencyConv':
  featurizer = deepchem.feat.AdjacencyFingerprint(
      max_n_atoms=150, max_valence=6)
logger.info("Featurizing datasets.")
loader = deepchem.data.CSVLoader(
    tasks=hppb_tasks, smiles_field='smile', featurizer=featurizer)
dataset = loader.featurize(input_files=[dataset_file], shard_size=2000)
logger.info("Removing missing entries...")
remove_missing_entries(dataset)
if split is None:
  logger.info("About to transform the data...")
  transformers = []  # no transformers are applied to this dataset
  for transformer in transformers:
    logger.info("Transforming the dataset with transformer %s",
                transformer.__class__.__name__)
    dataset = transformer.transform(dataset)
  return hppb_tasks, (dataset, None, None), transformers
splitters = {
    # standard splitters, as in the loaders above (snippet truncated here)
    'index': deepchem.splits.IndexSplitter(),
    'random': deepchem.splits.RandomSplitter(),
    'scaffold': deepchem.splits.ScaffoldSplitter()
}
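
# Sketch of how this loader would typically finish, mirroring the complete
# loaders above:
splitter = splitters[split]
train, valid, test = splitter.train_valid_test_split(dataset)
return hppb_tasks, (train, valid, test), transformers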