Secure your code as it's written: use Snyk Code to scan source code in minutes — no build needed — and fix issues immediately.
# NOTE(review): fragment of a clearance-dataset loader — the enclosing `def`
# is outside this chunk and the original indentation has been stripped, so
# the nesting of the statements below is approximate.
# Download the raw clearance CSV (plain HTTP URL — TODO confirm an HTTPS
# mirror is available before relying on this in production).
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/clearance.csv'
)
# Single regression task: the experimental clearance column.
clearance_tasks = ['exp']
# Fast path: reuse a previously featurized dataset cached on disk.
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return clearance_tasks, all_dataset, transformers
# Map the featurizer name to a concrete featurizer instance.
# NOTE(review): an unrecognized name falls through silently and `featurizer`
# remains a string — presumably validated by the caller; verify.
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
# Featurize the CSV in shards of 8192 rows to bound memory use.
loader = deepchem.data.CSVLoader(
tasks=clearance_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
if split is None:
# Initialize transformers
# No split requested: normalize y over the entire dataset.
transformers = [
deepchem.trans.NormalizationTransformer(
transform_y=True, dataset=dataset, move_mean=move_mean)
]
def load_sampl(featurizer='ECFP', split='index'):
"""Load the SAMPL (FreeSolv) solvation dataset.

Featurizes SAMPL.csv with the named featurizer ('ECFP' or 'GraphConv'),
normalizes the single 'expt' regression target, and builds a splitter
lookup table.

NOTE(review): this chunk appears truncated — the splitter selected from
`splitters` is never applied and nothing is returned in the visible
lines; the original indentation has also been stripped.
"""
# Featurize SAMPL dataset
print("About to featurize SAMPL dataset.")
current_dir = os.path.dirname(os.path.realpath(__file__))
# The CSV is expected to sit next to this module.
dataset_file = os.path.join(
current_dir, "./SAMPL.csv")
# Single regression task: experimental solvation free energy.
SAMPL_tasks = ['expt']
# NOTE(review): unrecognized featurizer names fall through silently.
if featurizer == 'ECFP':
featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
tasks=SAMPL_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(
dataset_file, shard_size=8192)
# Initialize transformers
# Normalize y over the full dataset before splitting.
transformers = [
dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
print("About to transform data")
for transformer in transformers:
dataset = transformer.transform(dataset)
# Splitter lookup keyed by the `split` argument.
splitters = {'index': dc.splits.IndexSplitter(),
'random': dc.splits.RandomSplitter(),
'scaffold': dc.splits.ScaffoldSplitter()}
# NOTE(review): fragment of a clintox-dataset loader — the enclosing `def`
# is outside this chunk and indentation has been stripped.
# First load the CSV as a DataFrame just to discover the task columns;
# `dataset` is rebound to a featurized dataset further down.
dataset = deepchem.utils.save.load_from_disk(dataset_file)
# All columns after the first (SMILES) are task columns.
clintox_tasks = dataset.columns.values[1:].tolist()
logger.info("Tasks in dataset: %s" % (clintox_tasks))
logger.info("Number of tasks in dataset: %s" % str(len(clintox_tasks)))
logger.info("Number of examples in dataset: %s" % str(dataset.shape[0]))
# Fast path: reuse a previously featurized dataset cached on disk.
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return clintox_tasks, all_dataset, transformers
# Featurize clintox dataset
logger.info("About to featurize clintox dataset.")
# NOTE(review): unrecognized featurizer names fall through silently and
# `featurizer` remains a string — verify upstream validation.
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
loader = deepchem.data.CSVLoader(
tasks=clintox_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
# Transform clintox dataset
if split is None:
# Classification task: rebalance example weights rather than normalize y.
transformers = [
deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
]
logger.info("Split is None, about to transform data.")
# NOTE(review): fragment of a tox21-dataset loader. The lone `]` below
# closes a list literal whose opening bracket is above this chunk, and the
# trailing `splitters = {` is truncated mid-dict; indentation is stripped.
]
# Cache layout: data_dir/tox21/<featurizer>/<split>/{train,valid,test}.
dataset_dir = os.path.join(data_dir, "tox21", featurizer, split)
train, valid, test = os.path.join(dataset_dir, 'train'), os.path.join(
dataset_dir, 'valid'), os.path.join(dataset_dir, 'test')
# Fast path: if the cache directory exists, rebind the three path strings
# to DiskDataset objects and return immediately.
if os.path.isdir(dataset_dir):
train, valid, test = DiskDataset(data_dir=train), DiskDataset(
data_dir=valid), DiskDataset(data_dir=test)
# NOTE(review): balancing is fit on `train` only here — presumably to
# avoid leaking validation/test statistics; confirm against the non-cached
# path below, which fits on the whole dataset.
transformers = [
dc.trans.BalancingTransformer(transform_w=True, dataset=train)
]
return tox21_tasks, (train, valid, test), transformers
# Map featurizer name to instance; unrecognized names fall through silently.
if featurizer == 'ECFP':
featurizer_func = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer_func = dc.feat.ConvMolFeaturizer()
elif featurizer == 'AdjMatrix':
featurizer_func = dc.feat.AdjacencyFingerprint(num_atoms_feature=True)
loader = dc.data.CSVLoader(
tasks=tox21_tasks, smiles_field="smiles", featurizer=featurizer_func)
dataset = loader.featurize(dataset_file, shard_size=8192)
# Initialize transformers
transformers = [
dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
]
print("About to transform data")
for transformer in transformers:
dataset = transformer.transform(dataset)
# NOTE(review): truncated here — the dict body is missing from this chunk.
splitters = {
def load_muv_convmol():
"""Load the MUV dataset featurized with ConvMolFeaturizer.

Does not do a train/test split.

NOTE(review): this chunk appears truncated — no return statement is
visible after the transform loop; the original indentation has also been
stripped.
"""
# Load MUV dataset
print("About to load MUV dataset.")
current_dir = os.path.dirname(os.path.realpath(__file__))
dataset_file = os.path.join(
current_dir, "../../datasets/muv.csv.gz")
# Featurize MUV dataset
print("About to featurize MUV dataset.")
featurizer = dc.feat.ConvMolFeaturizer()
# The 17 MUV sub-tasks, sorted for a deterministic column order.
MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
'MUV-466', 'MUV-832'])
loader = dc.data.CSVLoader(
tasks=MUV_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file)
# Initialize transformers
# Classification tasks: rebalance example weights.
transformers = [
dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
print("About to transform data")
for transformer in transformers:
dataset = transformer.transform(dataset)
hopv_tasks = [
'HOMO', 'LUMO', 'electrochemical_gap', 'optical_gap', 'PCE', 'V_OC',
'J_SC', 'fill_factor'
]
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return hopv_tasks, all_dataset, transformers
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
elif featurizer == 'Weave':
featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == 'Raw':
featurizer = deepchem.feat.RawFeaturizer()
loader = deepchem.data.CSVLoader(
tasks=hopv_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
if split == None:
transformers = [
deepchem.trans.NormalizationTransformer(
transform_y=True, dataset=dataset)
]
logger.info("Split is None, about to transform data")
def load_dataset(dataset_file, featurizer='ECFP', split='index'):
"""Featurize, normalize, and prepare to split a generic CSV dataset.

Expects `dataset_file` to be a CSV with a "smiles" column and an 'exp'
target column.

NOTE(review): this chunk appears truncated — the `splitters` table is
built but never indexed and nothing is returned in the visible lines;
the original indentation has also been stripped.
"""
# Single regression task: the 'exp' column.
tasks = ['exp']
# NOTE(review): unrecognized featurizer names fall through silently.
if featurizer == 'ECFP':
featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
tasks=tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
# Normalize y over the full dataset before splitting.
transformers = [
dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
]
for transformer in transformers:
dataset = transformer.transform(dataset)
# Splitter lookup keyed by the `split` argument.
splitters = {
'index': dc.splits.IndexSplitter(),
'random': dc.splits.RandomSplitter(),
'scaffold': dc.splits.ScaffoldSplitter()
}
# NOTE(review): fragment of a TOXCAST-dataset loader — the enclosing `def`
# is outside this chunk and indentation has been stripped.
# Load TOXCAST dataset
print("About to load TOXCAST dataset.")
dataset_file = os.path.join(
current_dir, "./processing/toxcast_data.csv.gz")
# Loaded first as a DataFrame to discover task columns; `dataset` is
# rebound to the featurized dataset below.
dataset = dc.utils.save.load_from_disk(dataset_file)
print("Columns of dataset: %s" % str(dataset.columns.values))
print("Number of examples in dataset: %s" % str(dataset.shape[0]))
# Featurize TOXCAST dataset
print("About to featurize TOXCAST dataset.")
# NOTE(review): unrecognized featurizer names fall through silently.
if featurizer == 'ECFP':
featurizer_func = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer_func = dc.feat.ConvMolFeaturizer()
# All columns after the first (SMILES) are task columns.
TOXCAST_tasks = dataset.columns.values[1:].tolist()
# NOTE(review): `dc.load.DataLoader` looks like a legacy API — other
# fragments in this file use `dc.data.CSVLoader`; confirm which this
# deepchem version actually provides.
loader = dc.load.DataLoader(tasks=TOXCAST_tasks,
smiles_field="smiles",
featurizer=featurizer_func,
verbosity="high")
dataset = loader.featurize(dataset_file)
# Initialize transformers
# Classification tasks: rebalance example weights.
transformers = [
dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
print("About to transform data")
for transformer in transformers:
dataset = transformer.transform(dataset)
def load_delaney(featurizer='ECFP', split='index'):
"""Load the Delaney (ESOL) aqueous-solubility dataset.

NOTE(review): this chunk is truncated — the `splitters` dict literal at
the end is cut off mid-body; the original indentation has also been
stripped.
"""
# Featurize Delaney dataset
print("About to featurize Delaney dataset.")
current_dir = os.path.dirname(os.path.realpath(__file__))
dataset_file = os.path.join(current_dir,
"../../datasets/delaney-processed.csv")
# Single regression task: measured log solubility.
delaney_tasks = ['measured log solubility in mols per litre']
# NOTE(review): unrecognized featurizer names fall through silently.
if featurizer == 'ECFP':
featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
tasks=delaney_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)
# Initialize transformers
# Normalize y over the full dataset before splitting.
transformers = [
dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
]
print("About to transform data")
for transformer in transformers:
dataset = transformer.transform(dataset)
# Splitter lookup (truncated in this chunk).
splitters = {
'index': dc.splits.IndexSplitter(),
'random': dc.splits.RandomSplitter(),