# Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def load_hopv(featurizer='ECFP', split='index', reload=True):
  """Load HOPV datasets. Does not do train/test split.

  Parameters
  ----------
  featurizer: str
    Featurizer name; used only as part of the on-disk cache key here.
  split: str
    Splitter name; used only as part of the on-disk cache key here.
  reload: bool
    If True, attempt to reuse a previously featurized dataset from disk.

  Returns
  -------
  Tuple of (tasks, datasets, transformers) when a cached copy is found.
  """
  # Featurize HOPV dataset
  logger.info("About to featurize HOPV dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "hopv/" + featurizer + "/" + str(split))

  # Download and unpack the raw CSV on first use.
  dataset_file = os.path.join(data_dir, "hopv.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/hopv.tar.gz'
    )
    deepchem.utils.untargz_file(os.path.join(data_dir, 'hopv.tar.gz'), data_dir)

  # Regression targets provided by the HOPV dataset.
  hopv_tasks = [
      'HOMO', 'LUMO', 'electrochemical_gap', 'optical_gap', 'PCE', 'V_OC',
      'J_SC', 'fill_factor'
  ]
  if reload:
    # Fast path: reuse a previously featurized dataset, mirroring the other
    # loaders in this file (see load_bace_classification / load_hiv).
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return hopv_tasks, all_dataset, transformers
  # NOTE(review): the featurization/split path for a cache miss is missing —
  # the source was truncated here. Restore it from the upstream loader before
  # relying on the non-reload path; as written this falls through to None.
"""
qm9 dataset loader.
"""
# NOTE(review): this header belongs to a separate qm9 loader module that was
# spliced into this file; `from __future__` imports are only legal at the very
# top of a file, so this block cannot stand mid-file as-is.
from __future__ import division
from __future__ import unicode_literals
import os
import logging
import deepchem

# Module-level logger following the standard logging convention.
logger = logging.getLogger(__name__)
# Default cache directory for downloaded datasets.
DEFAULT_DIR = deepchem.utils.get_data_dir()
# Source URLs: GDB9 structure archive and precomputed qm9 property CSV.
GDB9_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/gdb9.tar.gz'
QM9_CSV_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm9.csv'
def load_qm9(featurizer='CoulombMatrix',
             split='random',
             reload=True,
             move_mean=True,
             data_dir=None,
             save_dir=None,
             **kwargs):
  """Load qm9 datasets."""
  # Featurize qm9 dataset
  logger.info("About to featurize qm9 dataset.")
  # Regression targets from the QM9 quantum-chemistry benchmark.
  # NOTE(review): this block is truncated in this source — the task list is
  # never closed and the rest of the function body is missing entirely.
  qm9_tasks = [
      "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "cv", "u0", "u298",
def load_toxcast(featurizer='ECFP', split='index', reload=True):
  """Load ToxCast datasets. Does not do train/test split.

  Parameters
  ----------
  featurizer: str
    Featurizer name; used only as part of the on-disk cache key here.
  split: str
    Splitter name; used only as part of the on-disk cache key here.
  reload: bool
    If True, attempt to reuse a previously featurized dataset from disk.

  Returns
  -------
  Tuple of (tasks, datasets, transformers) when a cached copy is found.
  """
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir,
                            "toxcast/" + featurizer + "/" + str(split))

  # Download the raw compressed CSV on first use.
  dataset_file = os.path.join(data_dir, "toxcast_data.csv.gz")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/toxcast_data.csv.gz'
    )

  dataset = deepchem.utils.save.load_from_disk(dataset_file)
  logger.info("Columns of dataset: %s" % str(dataset.columns.values))
  logger.info("Number of examples in dataset: %s" % str(dataset.shape[0]))
  # Every column after the first is a ToxCast assay, i.e. a prediction task.
  TOXCAST_tasks = dataset.columns.values[1:].tolist()
  if reload:
    # Fast path: reuse a previously featurized dataset, mirroring the other
    # loaders in this file (see load_bace_classification / load_hiv).
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return TOXCAST_tasks, all_dataset, transformers
  # NOTE(review): the featurization/split path for a cache miss is missing —
  # the source was truncated here. As written this falls through to None.
def load_images_DR(split='random', seed=None):
  """ Loader for DR images """
  data_dir = deepchem.utils.get_data_dir()
  # Images and labels must be fetched manually (Kaggle license terms).
  images_path = os.path.join(data_dir, 'DR', 'train')
  label_path = os.path.join(data_dir, 'DR', 'trainLabels.csv')
  if not os.path.exists(images_path) or not os.path.exists(label_path):
    # Warn (rather than raise) with download instructions when data is absent.
    logger.warn("Cannot locate data, \n\
all images(.png) should be stored in the folder: $DEEPCHEM_DATA_DIR/DR/train/,\n\
corresponding label file should be stored as $DEEPCHEM_DATA_DIR/DR/trainLabels.csv.\n\
Please refer to https://www.kaggle.com/c/diabetic-retinopathy-detection for data access"
                )
  image_names = os.listdir(images_path)
  raw_images = []
  for im in image_names:
    # Keep .jpeg files that are not themselves preprocessed outputs
    # ('cut_' prefix) and whose preprocessed counterpart does not exist yet.
    # NOTE(review): the warning text above says .png but this filter expects
    # .jpeg — confirm which extension the pipeline actually uses.
    if im.endswith('.jpeg') and not im.startswith(
        'cut_') and not 'cut_' + im in image_names:
      raw_images.append(im)
  if len(raw_images) > 0:
    # NOTE(review): loop/branch body truncated in this source — presumably
    # the raw images get preprocessed here; restore from the upstream loader.
def load_lipo(featurizer='ECFP', split='index', reload=True, move_mean=True):
  """Load Lipophilicity datasets.

  Parameters
  ----------
  featurizer: str
    Featurizer name; used only as part of the on-disk cache key here.
  split: str
    Splitter name; used only as part of the on-disk cache key here.
  reload: bool
    If True, attempt to reuse a previously featurized dataset from disk.
  move_mean: bool
    Selects a distinct cache directory for mean-unmoved normalization.

  Returns
  -------
  Tuple of (tasks, datasets, transformers) when a cached copy is found.
  """
  # Featurize Lipophilicity dataset
  logger.info("About to featurize Lipophilicity dataset.")
  logger.info("About to load Lipophilicity dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    # Cache key depends on move_mean so differently-normalized featurizations
    # never collide on disk.
    if move_mean:
      dir_name = "lipo/" + featurizer + "/" + str(split)
    else:
      dir_name = "lipo/" + featurizer + "_mean_unmoved/" + str(split)
    save_dir = os.path.join(data_dir, dir_name)

  # Download the raw CSV on first use.
  dataset_file = os.path.join(data_dir, "Lipophilicity.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/Lipophilicity.csv'
    )

  # Single regression target: experimental logD.
  Lipo_tasks = ['exp']
  if reload:
    # Fast path: reuse a previously featurized dataset, mirroring the other
    # loaders in this file (see load_bace_classification / load_hiv).
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return Lipo_tasks, all_dataset, transformers
  # NOTE(review): the featurization/split path for a cache miss is missing —
  # the source was truncated here. As written this falls through to None.
"""
qm8 dataset loader.
"""
# NOTE(review): this header belongs to a separate qm8 loader module that was
# spliced into this file; `from __future__` imports are only legal at the very
# top of a file, so this block cannot stand mid-file as-is.
from __future__ import division
from __future__ import unicode_literals
import os
import deepchem
import logging

# Module-level logger following the standard logging convention.
logger = logging.getLogger(__name__)
# Default cache directory for downloaded datasets.
DEFAULT_DIR = deepchem.utils.get_data_dir()
# Source URLs: GDB8 structure archive and precomputed qm8 property CSV.
GDB8_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/gdb8.tar.gz'
QM8_CSV_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/qm8.csv'
def load_qm8(featurizer='CoulombMatrix',
             split='random',
             reload=True,
             move_mean=True,
             data_dir=None,
             save_dir=None,
             **kwargs):
  """Load qm8 electronic-spectra regression tasks.

  NOTE(review): this block appears truncated — it only defines the task list
  and never loads, featurizes, or returns a dataset.
  """
  # Excitation energies (E1/E2) and oscillator strengths (f1/f2) at several
  # levels of theory.
  # NOTE(review): the second PBE0 quartet exactly duplicates the first;
  # upstream qm8 task lists use "E1-PBE"/"E2-PBE"/"f1-PBE"/"f2-PBE" in that
  # position — confirm against the qm8.csv column names before fixing.
  qm8_tasks = [
      "E1-CC2", "E2-CC2", "f1-CC2", "f2-CC2", "E1-PBE0", "E2-PBE0", "f1-PBE0",
      "f2-PBE0", "E1-PBE0", "E2-PBE0", "f1-PBE0", "f2-PBE0", "E1-CAM", "E2-CAM",
      "f1-CAM", "f2-CAM"
  ]
def load_bace_classification(featurizer='ECFP', split='random', reload=True):
  """Load bace datasets."""
  # Featurize bace dataset
  logger.info("About to featurize bace dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    # Cache directory keyed by featurizer and split choice.
    save_dir = os.path.join(data_dir, "bace_c/" + featurizer + "/" + str(split))

  # Download the raw CSV on first use.
  dataset_file = os.path.join(data_dir, "bace.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/bace.csv'
    )

  # Single binary classification target.
  bace_tasks = ["Class"]
  if reload:
    # Fast path: reuse a previously featurized dataset if one was cached.
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return bace_tasks, all_dataset, transformers
  # NOTE(review): the featurization/split path for a cache miss appears
  # truncated from this source — as written the function falls through to
  # None when nothing is cached.
import pandas as pd
import os
import pickle
import array
from bisect import bisect_left
import gzip
import shutil
import deepchem
import requests

# Root cache directory for downloaded datasets.
data_dir = deepchem.utils.get_data_dir()
# Directory expected to contain per-assay SDF result subdirectories
# (walked by create_cid_list below) — presumably PubChem assay exports;
# TODO confirm the layout against whatever populates $DATA_DIR/Data.
sdf_dir = os.path.join(data_dir, "Data")
def create_cid_list(assays_to_parse):
  """Find the union of all compounds tested across one or more assays
  """
  # NOTE(review): min_assay_size is unused in the visible portion — confirm
  # it is applied in the truncated remainder of the function.
  min_assay_size = 10000
  assay_paths = list()
  cid_set = set()
  for path, dirs, filenames in os.walk(sdf_dir):
    # NOTE(review): `dir` shadows the builtin of the same name.
    for dir in dirs:
      # Each directory holds a range of assay results
      joined_path = os.path.join(sdf_dir,dir)
      # NOTE(review): this inner walk rebinds path/dirs/filenames from the
      # outer loop — likely intentional (only top-level dirs matter) but
      # worth confirming.
      for path, dirs, filenames in os.walk(joined_path):
        for filename in filenames:
        # NOTE(review): loop body truncated in this source — restore the
        # per-file CID extraction from the upstream script.
def load_hiv(featurizer='ECFP', split='index', reload=True, **kwargs):
  """Load hiv datasets. Does not do train/test split"""
  # Featurize hiv dataset
  logger.info("About to featurize hiv dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    # Cache directory keyed by featurizer and split choice.
    save_dir = os.path.join(data_dir, "hiv/" + featurizer + "/" + str(split))

  # Download the raw CSV on first use.
  dataset_file = os.path.join(data_dir, "HIV.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/HIV.csv'
    )

  # Single binary classification target.
  hiv_tasks = ["HIV_active"]
  if reload:
    # Fast path: reuse a previously featurized dataset if one was cached.
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return hiv_tasks, all_dataset, transformers
  # NOTE(review): the function likely continues past the end of this chunk
  # (featurization on cache miss); nothing after this point is visible here.