How to use the deepchem.utils.download_url function in deepchem

To help you get started, we’ve selected a few deepchem.utils.download_url examples, drawn from popular ways the function is used in public projects.

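Every example that follows uses the same idiom: resolve DeepChem's local data directory, check whether the target file already exists, and call download_url only when it does not. Here is a minimal, self-contained sketch of that pattern; the Delaney solubility CSV is purely an illustrative target, so substitute whichever file your project needs.

import os

import deepchem

# Resolve the local data directory; this honors the DEEPCHEM_DATA_DIR
# environment variable when it is set.
data_dir = deepchem.utils.get_data_dir()
dataset_file = os.path.join(data_dir, "delaney-processed.csv")

# Download only when the file is not already cached. Without an explicit
# name, download_url saves the file under its URL basename.
if not os.path.exists(dataset_file):
  deepchem.utils.download_url(
      'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/delaney-processed.csv',
      dest_dir=data_dir)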

github simonfqy / PADME / dcCustom / splits / splitters.py
  def split(self,
            dataset,
            seed=None,
            frac_train=.8,
            frac_valid=.1,
            frac_test=.1,
            log_every_n=None):
    """
    Splits protein-ligand pairs in PDBbind into train/validation/test in time order.
    """
    if self.year_file is None:
      try:
        data_dir = os.environ['DEEPCHEM_DATA_DIR']
        self.year_file = os.path.join(data_dir, 'pdbbind_year.csv')
        if not os.path.exists(self.year_file):
          dc.utils.download_url(
              'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/pdbbind_year.csv',
              dest_dir=data_dir)
      except KeyError:
        # DEEPCHEM_DATA_DIR is unset, so the year file cannot be located.
        raise ValueError("Time description file should be specified")
    df = pd.read_csv(self.year_file, header=None)
    self.years = {}
    for i in range(df.shape[0]):
      self.years[df[0][i]] = int(df[1][i])
    np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
    num_datapoints = len(dataset)
    assert len(self.ids) == num_datapoints
    train_cutoff = int(frac_train * num_datapoints)
    valid_cutoff = int((frac_train + frac_valid) * num_datapoints)
    indices = range(num_datapoints)
    data_year = [self.years[self.ids[i]] for i in indices]
    # Sort datapoint indices by year so the split preserves time order, then
    # slice the ordered list into train/validation/test segments.
    new_indices = [
        pair[0] for pair in sorted(zip(indices, data_year), key=lambda x: x[1])
    ]
    return (new_indices[:train_cutoff], new_indices[train_cutoff:valid_cutoff],
            new_indices[valid_cutoff:])
github deepchem / deepchem / deepchem / molnet / load_function / bbbc_datasets.py
  https://data.broadinstitute.org/bbbc/BBBC002/.
  """
  # Featurize BBBC002 dataset
  bbbc002_tasks = ["cell-count"]
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "bbbc002/" + str(split))
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return bbbc002_tasks, all_dataset, transformers
  dataset_file = os.path.join(data_dir, "BBBC002_v1_images.zip")
  labels_file = os.path.join(data_dir, "BBBC002_v1_counts.txt")

  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_images.zip')
  if not os.path.exists(labels_file):
    deepchem.utils.download_url(
        'https://data.broadinstitute.org/bbbc/BBBC002/BBBC002_v1_counts.txt')
  # Featurize Images into NumpyArrays
  loader = deepchem.data.ImageLoader()
  dataset = loader.featurize(dataset_file, in_memory=False)

  # Load text file with labels
  with open(labels_file) as f:
    content = f.readlines()
  # Strip the first line which holds field labels
  lines = [x.strip() for x in content][1:]
  # Format is: Image_name count1 count2
  lines = [x.split("\t") for x in lines]
  counts = [(float(x[1]) + float(x[2])) / 2.0 for x in lines]
github deepchem / deepchem / contrib / autoencoder_models / autoencoder.py
  def zinc_decoder():
    """
    Returns
    -------
    obj
      A Decoder with weights that were trained on the zinc dataset
    """
    current_dir = os.path.dirname(os.path.realpath(__file__))
    weights_filename = "zinc_model.h5"
    weights_file = os.path.join(current_dir, weights_filename)

    if not os.path.exists(weights_file):
      download_url("http://karlleswing.com/misc/keras-molecule/model.h5",
                   current_dir)
      # download_url saves the file as model.h5 inside current_dir; rename it
      # to the expected weights filename.
      os.rename(os.path.join(current_dir, "model.h5"), weights_file)
    return TensorflowMoleculeDecoder(
        model_dir=current_dir, weights_file=weights_filename)
github deepchem / deepchem / deepchem / molnet / load_function / sweetlead_datasets.py
  # Featurize SWEET dataset
  logger.info("About to featurize SWEET dataset.")
  if featurizer == 'ECFP':
    featurizer = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = dc.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)
  else:
    raise ValueError("Other featurizations not supported")

  dataset_file = os.path.join(data_dir, "sweet.csv.gz")
  if not os.path.exists(dataset_file):
    dc.utils.download_url(SWEETLEAD_URL)
  loader = dc.data.CSVLoader(
      tasks=SWEET_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)

  # Initialize transformers
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]
  logger.info("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  if split is None:
    return SWEET_tasks, (dataset, None, None), transformers

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter()
  }
github deepchem / deepchem / deepchem / molnet / load_function / bbbp_datasets.py
def load_bbbp(featurizer='ECFP', split='random', reload=True):
  """Load blood-brain barrier penetration datasets """
  # Featurize bbb dataset
  logger.info("About to featurize bbbp dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "bbbp/" + featurizer + "/" + str(split))

  dataset_file = os.path.join(data_dir, "BBBP.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/BBBP.csv'
    )

  bbbp_tasks = ["p_np"]

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return bbbp_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
github deepchem / deepchem / deepchem / molnet / load_function / clearance_datasets.py
def load_clearance(featurizer='ECFP', split='index', reload=True,
                   move_mean=True):
  """Load clearance datasets."""
  # Featurize clearance dataset
  logger.info("About to featurize clearance dataset.")
  logger.info("About to load clearance dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    if move_mean:
      dir_name = "clearance/" + featurizer + "/" + str(split)
    else:
      dir_name = "clearance/" + featurizer + "_mean_unmoved/" + str(split)
    save_dir = os.path.join(data_dir, dir_name)

  dataset_file = os.path.join(data_dir, "clearance.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/clearance.csv'
    )

  clearance_tasks = ['exp']

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return clearance_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
github deepchem / deepchem / deepchem / molnet / load_function / sider_datasets.py
  logger.info("About to load SIDER dataset.")
  if data_dir is None:
    data_dir = DEFAULT_DIR
  if save_dir is None:
    save_dir = DEFAULT_DIR

  if reload:
    save_folder = os.path.join(save_dir, "sider-featurized", str(featurizer))
    if featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      save_folder = os.path.join(save_folder, img_spec)
    save_folder = os.path.join(save_folder, str(split))

  dataset_file = os.path.join(data_dir, "sider.csv.gz")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(url=SIDER_URL, dest_dir=data_dir)

  dataset = deepchem.utils.save.load_from_disk(dataset_file)
  logger.info("Columns of dataset: %s" % str(dataset.columns.values))
  logger.info("Number of examples in dataset: %s" % str(dataset.shape[0]))
  SIDER_tasks = dataset.columns.values[1:].tolist()

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_folder)
    if loaded:
      return SIDER_tasks, all_dataset, transformers

  # Featurize SIDER dataset
  logger.info("About to featurize SIDER dataset.")
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
github deepchem / deepchem / deepchem / molnet / load_function / lipo_datasets.py
def load_lipo(featurizer='ECFP', split='index', reload=True, move_mean=True):
  """Load Lipophilicity datasets."""
  # Featurize Lipophilicity dataset
  logger.info("About to featurize Lipophilicity dataset.")
  logger.info("About to load Lipophilicity dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    if move_mean:
      dir_name = "lipo/" + featurizer + "/" + str(split)
    else:
      dir_name = "lipo/" + featurizer + "_mean_unmoved/" + str(split)
    save_dir = os.path.join(data_dir, dir_name)

  dataset_file = os.path.join(data_dir, "Lipophilicity.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/Lipophilicity.csv'
    )

  Lipo_tasks = ['exp']

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return Lipo_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
github deepchem / deepchem / deepchem / molnet / load_function / hopv_datasets.py
def load_hopv(featurizer='ECFP', split='index', reload=True):
  """Load HOPV datasets. Does not do train/test split"""
  # Featurize HOPV dataset
  logger.info("About to featurize HOPV dataset.")
  data_dir = deepchem.utils.get_data_dir()
  if reload:
    save_dir = os.path.join(data_dir, "hopv/" + featurizer + "/" + str(split))

  dataset_file = os.path.join(data_dir, "hopv.csv")
  if not os.path.exists(dataset_file):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/hopv.tar.gz'
    )
    deepchem.utils.untargz_file(os.path.join(data_dir, 'hopv.tar.gz'), data_dir)

  hopv_tasks = [
      'HOMO', 'LUMO', 'electrochemical_gap', 'optical_gap', 'PCE', 'V_OC',
      'J_SC', 'fill_factor'
  ]

  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return hopv_tasks, all_dataset, transformers

  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
github deepchem / deepchem / deepchem / molnet / load_function / chembl_datasets.py
    # Fetch the ChEMBL datasets and their year-based train/valid/test splits.
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_5thresh.csv.gz'
    )
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_sparse.csv.gz'
    )
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_5thresh_ts_test.csv.gz'
    )
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_5thresh_ts_train.csv.gz'
    )
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_5thresh_ts_valid.csv.gz'
    )
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_sparse_ts_test.csv.gz'
    )
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_sparse_ts_train.csv.gz'
    )
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/chembl_year_sets/chembl_sparse_ts_valid.csv.gz'
    )

  logger.info("About to load ChEMBL dataset.")
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return chembl_tasks, all_dataset, transformers
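A closing note on dest_dir: some of these loaders pass it explicitly (the SIDER loader, the PDBbind splitter) while others omit it (BBBC002, BBBP, clearance, Lipophilicity). Omitting it is safe because download_url defaults to deepchem.utils.get_data_dir(), the same directory the loaders use when building their os.path.exists guards. The sketch below makes the equivalence explicit; the SIDER_URL value is inferred from the sider.csv.gz filename used in the SIDER loader above.

import deepchem

SIDER_URL = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/sider.csv.gz'

# Passing dest_dir explicitly is equivalent to omitting it, since
# get_data_dir() is the default destination directory.
deepchem.utils.download_url(url=SIDER_URL, dest_dir=deepchem.utils.get_data_dir())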