def split(self,
dataset,
seed=None,
frac_train=.8,
frac_valid=.1,
frac_test=.1,
log_every_n=None):
"""
Splits protein-ligand pairs in PDBbind into train/validation/test in time order.
"""
if self.year_file is None:
try:
data_dir = os.environ['DEEPCHEM_DATA_DIR']
self.year_file = os.path.join(data_dir, 'pdbbind_year.csv')
if not os.path.exists(self.year_file):
dc.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/pdbbind_year.csv',
dest_dir=data_dir)
      except KeyError:
        raise ValueError(
            "Time description file should be specified; set the "
            "DEEPCHEM_DATA_DIR environment variable or pass year_file.")
df = pd.read_csv(self.year_file, header=None)
    self.years = {df[0][i]: int(df[1][i]) for i in range(df.shape[0])}
np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
num_datapoints = len(dataset)
assert len(self.ids) == num_datapoints
train_cutoff = int(frac_train * num_datapoints)
valid_cutoff = int((frac_train + frac_valid) * num_datapoints)
indices = range(num_datapoints)
data_year = [self.years[self.ids[i]] for i in indices]
    # Order indices by deposition year so earlier complexes fall in train;
    # the snippet truncates here, so this ending is a plausible reconstruction.
    new_indices = [
        pair[0] for pair in sorted(enumerate(data_year), key=lambda x: x[1])
    ]
    return (new_indices[:train_cutoff], new_indices[train_cutoff:valid_cutoff],
            new_indices[valid_cutoff:])
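

# A minimal usage sketch for the splitter above. The class name
# TimeSplitterPDBbind and its constructor arguments are assumptions inferred
# from the method body (it reads self.ids and self.year_file):
def _example_time_split(dataset):
  splitter = TimeSplitterPDBbind(ids=dataset.ids)
  train_inds, valid_inds, test_inds = splitter.split(dataset)
  return (dataset.select(train_inds), dataset.select(valid_inds),
          dataset.select(test_inds))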


# Header reconstructed from the loader body below:
def load_bbbc002(split=None, reload=True):
  """Load BBBC002 cell-count dataset.

  Images and labels are hosted at
  https://data.broadinstitute.org/bbbc/BBBC002/.
  """
# Featurize BBBC002 dataset
bbbc002_tasks = ["cell-count"]
data_dir = deepchem.utils.get_data_dir()
if reload:
save_dir = os.path.join(data_dir, "bbbc002/" + str(split))
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return bbbc002_tasks, all_dataset, transformers
dataset_file = os.path.join(data_dir, "BBBC002_v1_images.zip")
labels_file = os.path.join(data_dir, "BBBC002_v1_counts.txt")
if not os.path.exists(dataset_file):
deepchem.utils.download_url(
'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_images.zip')
if not os.path.exists(labels_file):
deepchem.utils.download_url(
'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_counts.txt')
  # Featurize images into NumPy arrays
loader = deepchem.data.ImageLoader()
dataset = loader.featurize(dataset_file, in_memory=False)
# Load text file with labels
with open(labels_file) as f:
content = f.readlines()
# Strip the first line which holds field labels
lines = [x.strip() for x in content][1:]
# Format is: Image_name count1 count2
lines = [x.split("\t") for x in lines]
  # Average the two annotators' counts into a single regression label
  counts = [(float(x[1]) + float(x[2])) / 2.0 for x in lines]
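  # The fragment ends once the counts are parsed. A plausible continuation
  # under the molnet convention, not the verbatim code (assumes numpy is
  # imported as np, as in the splitter above): attach the averaged counts as
  # regression labels and split at random.
  y = np.array(counts).reshape(-1, 1)
  dataset = deepchem.data.NumpyDataset(dataset.X, y)
  train, valid, test = deepchem.splits.RandomSplitter().train_valid_test_split(
      dataset)
  transformers = []
  return bbbc002_tasks, (train, valid, test), transformers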
def zinc_decoder():
"""
Returns
-------
  TensorflowMoleculeDecoder
    A decoder with weights pretrained on the ZINC dataset.
"""
current_dir = os.path.dirname(os.path.realpath(__file__))
weights_filename = "zinc_model.h5"
weights_file = os.path.join(current_dir, weights_filename)
  if not os.path.exists(weights_file):
    download_url("http://karlleswing.com/misc/keras-molecule/model.h5",
                 current_dir)
    # Rename within current_dir instead of shelling out to `mv`, which acted
    # on the process working directory rather than the download directory.
    os.rename(os.path.join(current_dir, "model.h5"), weights_file)
return TensorflowMoleculeDecoder(
model_dir=current_dir, weights_file=weights_filename)
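

# The download-then-rename pattern above can be written with the standard
# library alone, avoiding a shell call. A self-contained sketch; fetch_weights
# is a hypothetical helper, not part of DeepChem:
import os
import urllib.request


def fetch_weights(url, dest_path):
  """Download url to dest_path unless it is already present."""
  if not os.path.exists(dest_path):
    tmp_path = dest_path + ".part"
    urllib.request.urlretrieve(url, tmp_path)
    os.rename(tmp_path, dest_path)  # atomic when on the same filesystem
  return dest_path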
# Featurize SWEET dataset
logger.info("About to featurize SWEET dataset.")
if featurizer == 'ECFP':
featurizer = dc.feat.CircularFingerprint(size=1024)
elif featurizer == "smiles2img":
img_spec = kwargs.get("img_spec", "std")
img_size = kwargs.get("img_size", 80)
    featurizer = dc.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)
else:
raise ValueError("Other featurizations not supported")
dataset_file = os.path.join(data_dir, "sweet.csv.gz")
if not os.path.exists(dataset_file):
dc.utils.download_url(SWEETLEAD_URL)
loader = dc.data.CSVLoader(
tasks=SWEET_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file)
# Initialize transformers
transformers = [
dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
]
logger.info("About to transform data")
for transformer in transformers:
dataset = transformer.transform(dataset)
  if split is None:
return SWEET_tasks, (dataset, None, None), transformers
  # The snippet truncates inside this table; these are the conventional
  # molnet splitters.
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter()
  }
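  # Under the convention the other loaders in this file follow, the remainder
  # picks a splitter and returns the three subsets; a sketch, not verbatim:
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  return SWEET_tasks, (train, valid, test), transformers
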
def load_bbbp(featurizer='ECFP', split='random', reload=True):
"""Load blood-brain barrier penetration datasets """
# Featurize bbb dataset
logger.info("About to featurize bbbp dataset.")
data_dir = deepchem.utils.get_data_dir()
if reload:
save_dir = os.path.join(data_dir, "bbbp/" + featurizer + "/" + str(split))
dataset_file = os.path.join(data_dir, "BBBP.csv")
if not os.path.exists(dataset_file):
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/BBBP.csv'
)
bbbp_tasks = ["p_np"]
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return bbbp_tasks, all_dataset, transformers
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()


def load_clearance(featurizer='ECFP', split='index', reload=True,
                   move_mean=True):
"""Load clearance datasets."""
  # Featurize clearance dataset
  logger.info("About to load clearance dataset.")
data_dir = deepchem.utils.get_data_dir()
if reload:
if move_mean:
dir_name = "clearance/" + featurizer + "/" + str(split)
else:
dir_name = "clearance/" + featurizer + "_mean_unmoved/" + str(split)
save_dir = os.path.join(data_dir, dir_name)
dataset_file = os.path.join(data_dir, "clearance.csv")
if not os.path.exists(dataset_file):
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/clearance.csv'
)
clearance_tasks = ['exp']
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return clearance_tasks, all_dataset, transformers
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
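  # The clearance fragment truncates before featurization. A sketch of the
  # conventional continuation, not the verbatim code: featurize the CSV, then
  # normalize y, honoring move_mean.
  loader = deepchem.data.CSVLoader(
      tasks=clearance_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)
  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=dataset, move_mean=move_mean)
  ]
  for transformer in transformers:
    dataset = transformer.transform(dataset)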
logger.info("About to load SIDER dataset.")
if data_dir is None:
data_dir = DEFAULT_DIR
if save_dir is None:
save_dir = DEFAULT_DIR
if reload:
save_folder = os.path.join(save_dir, "sider-featurized", str(featurizer))
if featurizer == "smiles2img":
img_spec = kwargs.get("img_spec", "std")
save_folder = os.path.join(save_folder, img_spec)
save_folder = os.path.join(save_folder, str(split))
dataset_file = os.path.join(data_dir, "sider.csv.gz")
if not os.path.exists(dataset_file):
deepchem.utils.download_url(url=SIDER_URL, dest_dir=data_dir)
dataset = deepchem.utils.save.load_from_disk(dataset_file)
logger.info("Columns of dataset: %s" % str(dataset.columns.values))
logger.info("Number of examples in dataset: %s" % str(dataset.shape[0]))
SIDER_tasks = dataset.columns.values[1:].tolist()
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_folder)
if loaded:
return SIDER_tasks, all_dataset, transformers
# Featurize SIDER dataset
logger.info("About to featurize SIDER dataset.")
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
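  # The featurization branch truncates after ECFP. The conventional
  # continuation mirrors the SWEET loader above (a sketch, not verbatim):
  loader = deepchem.data.CSVLoader(
      tasks=SIDER_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)
  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]
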
def load_lipo(featurizer='ECFP', split='index', reload=True, move_mean=True):
"""Load Lipophilicity datasets."""
  # Featurize Lipophilicity dataset
  logger.info("About to load Lipophilicity dataset.")
data_dir = deepchem.utils.get_data_dir()
if reload:
if move_mean:
dir_name = "lipo/" + featurizer + "/" + str(split)
else:
dir_name = "lipo/" + featurizer + "_mean_unmoved/" + str(split)
save_dir = os.path.join(data_dir, dir_name)
dataset_file = os.path.join(data_dir, "Lipophilicity.csv")
if not os.path.exists(dataset_file):
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/Lipophilicity.csv'
)
Lipo_tasks = ['exp']
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return Lipo_tasks, all_dataset, transformers
if featurizer == 'ECFP':
featurizer = deepchem.feat.CircularFingerprint(size=1024)
elif featurizer == 'GraphConv':
featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
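

# Usage sketch for load_lipo; the return convention is taken from the reload
# branch above (task list, (train, valid, test) triple, transformers):
tasks, (train, valid, test), transformers = load_lipo(
    featurizer='GraphConv', split='scaffold')
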
def load_hopv(featurizer='ECFP', split='index', reload=True):
"""Load HOPV datasets. Does not do train/test split"""
# Featurize HOPV dataset
logger.info("About to featurize HOPV dataset.")
data_dir = deepchem.utils.get_data_dir()
if reload:
save_dir = os.path.join(data_dir, "hopv/" + featurizer + "/" + str(split))
dataset_file = os.path.join(data_dir, "hopv.csv")
if not os.path.exists(dataset_file):
deepchem.utils.download_url(
'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/hopv.tar.gz'
)
deepchem.utils.untargz_file(os.path.join(data_dir, 'hopv.tar.gz'), data_dir)
hopv_tasks = [
'HOMO', 'LUMO', 'electrochemical_gap', 'optical_gap', 'PCE', 'V_OC',
'J_SC', 'fill_factor'
]
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return hopv_tasks, all_dataset, transformers
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
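

# The reload branches above read cached splits with load_dataset_from_disk;
# the matching write step in this DeepChem version is the counterpart call
# below. A sketch of the caching step a loader performs once splits exist
# (variable names as in the loaders above):
deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                         transformers)
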
  # Download the ChEMBL sets and their precomputed year-based splits.
  base_url = "http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/"
  for fname in ["chembl_5thresh.csv.gz", "chembl_sparse.csv.gz"]:
    deepchem.utils.download_url(base_url + fname)
  for subset in ["5thresh", "sparse"]:
    for fold in ["train", "valid", "test"]:
      deepchem.utils.download_url(
          base_url + "chembl_year_sets/chembl_%s_ts_%s.csv.gz" %
          (subset, fold))
logger.info("About to load ChEMBL dataset.")
if reload:
loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
save_dir)
if loaded:
return chembl_tasks, all_dataset, transformers
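

# End-to-end usage sketch for the loaders in this file; the model choice is
# illustrative and not part of the original code, and load_chembl's defaults
# are assumed:
tasks, (train, valid, test), transformers = load_chembl()
model = deepchem.models.MultitaskRegressor(len(tasks), n_features=1024)
model.fit(train)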