How to use the deepchem.data.CSVLoader class in deepchem

To help you get started, we’ve selected a few deepchem examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github deepchem / deepchem / examples / adme / run_benchmarks.py View on Github external
def load_dataset(dataset_file, featurizer='ECFP', split='index'):
  """Featurize, normalize, and split an ADME regression dataset.

  Parameters
  ----------
  dataset_file: str
    Path to a CSV file containing a "smiles" column and an "exp" task column.
  featurizer: str, optional (default 'ECFP')
    'ECFP' for 1024-bit circular fingerprints, or 'GraphConv' for
    graph-convolution featurization.
  split: str, optional (default 'index')
    Splitting strategy: one of 'index', 'random', or 'scaffold'.

  Returns
  -------
  tuple
    (tasks, (train, valid, test), transformers)

  Raises
  ------
  ValueError
    If `featurizer` is not a recognized name.
  KeyError
    If `split` is not a recognized splitter name.
  """
  tasks = ['exp']

  if featurizer == 'ECFP':
    featurizer = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = dc.feat.ConvMolFeaturizer()
  else:
    # Fail fast instead of silently passing an unrecognized string through
    # to CSVLoader, which would error later with a confusing message.
    raise ValueError("Unsupported featurizer: %s" % featurizer)

  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  # shard_size bounds memory use when featurizing large CSV inputs.
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Normalize the regression targets (y) to zero mean / unit variance.
  transformers = [
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
  ]
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)
  # BUG FIX: the computed splits were previously discarded and the function
  # implicitly returned None; return them so callers can use the data.
  return tasks, (train, valid, test), transformers
github deepchem / deepchem / examples / nci / nci_datasets.py View on Github external
# NOTE(review): truncated excerpt from deepchem examples/nci/nci_datasets.py.
# The enclosing function's `def` line (and its return) are outside this view,
# and the first line below has lost its original indentation in the paste.
featurizer = dc.feat.ConvMolFeaturizer()

  # Each entry names an NCI-60 tumor cell line; every cell line is treated
  # as a separate regression task for multitask learning.
  all_nci_tasks = (['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',
                    'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',
                    'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205',
                    'HCC-2998', 'HCT-116', 'HCT-15', 'HT29', 'KM12', 'SW-620',
                    'SF-268', 'SF-295', 'SF-539', 'SNB-19', 'SNB-75', 'U251',
                    'LOX IMVI', 'MALME-3M', 'M14', 'MDA-MB-435', 'SK-MEL-2',
                    'SK-MEL-28', 'SK-MEL-5', 'UACC-257', 'UACC-62', 'IGR-OV1',
                    'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES',
                    'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393',
                    'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7',
                    'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549',
                    'T-47D'])

  # Read SMILES from the "smiles" column and featurize each row for every task.
  loader = dc.data.CSVLoader(
      tasks=all_nci_tasks, smiles_field="smiles", featurizer=featurizer)

  # `dataset_paths` and `shard_size` are presumably parameters of the
  # enclosing function — not visible here; confirm against the full source.
  dataset = loader.featurize(dataset_paths, shard_size=shard_size)

  # Initialize transformers
  print("About to transform data")
  # Normalize regression targets (y) across all tasks.
  transformers = [
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
  for transformer in transformers:
    dataset = transformer.transform(dataset)
  
  # Map split-strategy names to splitter objects; `split` comes from the
  # enclosing function's arguments (outside this view).
  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter()}
  splitter = splitters[split]
  print("Performing new split.")
github deepchem / deepchem / deepchem / molnet / load_function / qm8_datasets.py View on Github external
# NOTE(review): truncated excerpt from deepchem molnet qm8_datasets.py.
# The first three lines are the tail of a loader-constructor call whose
# opening (and the surrounding function's `def`) are outside this view.
smiles_field="smiles",
        mol_field="mol",
        featurizer=featurizer)
  else:
    # Map the featurizer name to a concrete featurizer object.
    if featurizer == 'ECFP':
      featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
      featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
      featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == "smiles2img":
      # Image featurization: spec and size are overridable via **kwargs.
      img_spec = kwargs.get("img_spec", "std")
      img_size = kwargs.get("img_size", 80)
      featurizer = deepchem.feat.SmilesToImage(
          img_size=img_size, img_spec=img_spec)
    loader = deepchem.data.CSVLoader(
        tasks=qm8_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)

  # A split strategy is mandatory here (unlike some sibling loaders that
  # return the unsplit dataset when split is None).
  if split == None:
    raise ValueError()

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'stratified': deepchem.splits.SingletaskStratifiedSplitter(task_number=0)
  }
  splitter = splitters[split]
  # Split fractions default to 80/10/10 but can be overridden via **kwargs.
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)
github taneishi / dlvs / pcba / datasets.py View on Github external
# NOTE(review): truncated excerpt from taneishi/dlvs pcba/datasets.py.
# These first lines are the tail of a `PCBA_tasks` list literal whose opening
# (and the enclosing function's `def`) are outside this view. Each entry is a
# PubChem BioAssay ID used as a binary classification task.
'PCBA-504845','PCBA-504847','PCBA-504891','PCBA-540276','PCBA-540317',
      'PCBA-588342','PCBA-588453','PCBA-588456','PCBA-588579','PCBA-588590',
      'PCBA-588591','PCBA-588795','PCBA-588855','PCBA-602179','PCBA-602233',
      'PCBA-602310','PCBA-602313','PCBA-602332','PCBA-624170','PCBA-624171',
      'PCBA-624173','PCBA-624202','PCBA-624246','PCBA-624287','PCBA-624288',
      'PCBA-624291','PCBA-624296','PCBA-624297','PCBA-624417','PCBA-651635',
      'PCBA-651644','PCBA-651768','PCBA-651965','PCBA-652025','PCBA-652104',
      'PCBA-652105','PCBA-652106','PCBA-686970','PCBA-686978','PCBA-686979',
      'PCBA-720504','PCBA-720532','PCBA-720542','PCBA-720551','PCBA-720553',
      'PCBA-720579','PCBA-720580','PCBA-720707','PCBA-720708','PCBA-720709',
      'PCBA-720711','PCBA-743255','PCBA-743266','PCBA-875','PCBA-881',
      'PCBA-883','PCBA-884','PCBA-885','PCBA-887','PCBA-891','PCBA-899',
      'PCBA-902','PCBA-903','PCBA-904','PCBA-912','PCBA-914','PCBA-915',
      'PCBA-924','PCBA-925','PCBA-926','PCBA-927','PCBA-938','PCBA-995']

  # Featurize each SMILES string into one row per assay task.
  loader = dc.data.CSVLoader(
      tasks=PCBA_tasks, smiles_field="smiles", featurizer=featurizer)
  
  dataset = loader.featurize(dataset_file)
  # Initialize transformers 
  # BalancingTransformer re-weights examples (w) to balance the heavily
  # skewed positive/negative classes typical of PCBA assays.
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)
  
  # `split` is presumably an argument of the enclosing (unseen) function.
  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter()}
  splitter = splitters[split]
  print("Performing new split.")
github deepchem / deepchem / deepchem / molnet / load_function / sweetlead_datasets.py View on Github external
# NOTE(review): truncated excerpt from deepchem molnet sweetlead_datasets.py.
# The enclosing function's `def` line is outside this view, and the final
# `splitters` dict is cut off mid-literal.
# Featurize SWEET dataset
  logger.info("About to featurize SWEET dataset.")
  if featurizer == 'ECFP':
    featurizer = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)
  else:
    # Only ECFP and smiles2img are supported for this dataset.
    raise ValueError("Other featurizations not supported")

  # Download the dataset on first use; subsequent calls hit the cached file.
  dataset_file = os.path.join(data_dir, "sweet.csv.gz")
  if not os.path.exists(dataset_file):
    dc.utils.download_url(SWEETLEAD_URL)
  loader = dc.data.CSVLoader(
      tasks=SWEET_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)

  # Initialize transformers
  # Re-weight examples (w) to balance class frequencies.
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
  ]
  logger.info("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  # With no split requested, return the whole transformed dataset.
  if split == None:
    return SWEET_tasks, (dataset, None, None), transformers

  splitters = {
      'index': dc.splits.IndexSplitter(),
github deepchem / deepchem / deepchem / molnet / load_function / ppb_datasets.py View on Github external
# NOTE(review): truncated excerpt from deepchem molnet ppb_datasets.py.
# The enclosing function's `def` line is outside this view, and the final
# `splitters` dict is cut off mid-literal.
if reload:
    # Short-circuit: reuse a previously featurized dataset from disk.
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return PPB_tasks, all_dataset, transformers

  # Map the featurizer name to a concrete featurizer object.
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()

  loader = deepchem.data.CSVLoader(
      tasks=PPB_tasks, smiles_field="smiles", featurizer=featurizer)
  # shard_size bounds memory use while featurizing.
  dataset = loader.featurize(dataset_file, shard_size=8192)

  if split == None:
    # No split requested: normalize targets and return the whole dataset.
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset)
    ]

    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return PPB_tasks, (dataset, None, None), transformers

  splitters = {
github taneishi / dlvs / datasets.py View on Github external
def load_tox21(featurizer='ECFP', split='index'):
  """Load, featurize, balance, and split the Tox21 toxicity dataset.

  Parameters
  ----------
  featurizer: str, optional (default 'ECFP')
    'ECFP' for 1024-bit circular fingerprints, or 'GraphConv' for
    graph-convolution featurization.
  split: str, optional (default 'index')
    Splitting strategy: 'index', 'random', 'scaffold', or 'butina'.

  Returns
  -------
  tuple
    (tox21_tasks, (train, valid, test), transformers)

  Raises
  ------
  ValueError
    If `featurizer` is not a recognized name.
  KeyError
    If `split` is not a recognized splitter name.
  """
  # Featurize Tox21 dataset
  print("About to featurize Tox21 dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(
      current_dir, "../datasets/tox21.csv.gz")
  # Twelve binary toxicity-assay tasks (nuclear receptor + stress response).
  tox21_tasks = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER',
                 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5',
                 'SR-HSE', 'SR-MMP', 'SR-p53']
  if featurizer == 'ECFP':
    featurizer_func = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer_func = dc.feat.ConvMolFeaturizer()
  else:
    # BUG FIX: previously an unknown name left `featurizer_func` unbound,
    # producing a confusing NameError below.
    raise ValueError("Unsupported featurizer: %s" % featurizer)
  loader = dc.data.CSVLoader(
      tasks=tox21_tasks, smiles_field="smiles", featurizer=featurizer_func)
  # shard_size bounds memory use while featurizing the gzipped CSV.
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # Initialize transformers
  # Re-weight examples (w) to balance the skewed active/inactive classes.
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]

  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter(),
               'butina': dc.splits.ButinaSplitter()}
  splitter = splitters[split]
  # BUG FIX: the selected splitter was previously never applied and the
  # function returned None, discarding all of the work above.
  train, valid, test = splitter.train_valid_test_split(dataset)
  return tox21_tasks, (train, valid, test), transformers
github deepchem / deepchem / examples / chembl / chembl_datasets.py View on Github external
# NOTE(review): truncated excerpt from deepchem examples/chembl/chembl_datasets.py.
# The enclosing function's `def` and the opening of the `if split == "year"`
# branch are outside this view; the first line has lost its indentation.
valid_files = os.path.join(current_dir,
                                   "year_sets/chembl_%s_ts_valid.csv.gz" % set)
        test_files = os.path.join(current_dir,
                                  "year_sets/chembl_%s_ts_test.csv.gz" % set)
    else:
        # Non-year splits use a single CSV covering the whole dataset.
        dataset_path = os.path.join(
            current_dir, "../../datasets/chembl_%s.csv.gz" % set)

    # Featurize ChEMBL dataset
    print("About to featurize ChEMBL dataset.")
    if featurizer == 'ECFP':
        featurizer = dc.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer = dc.feat.ConvMolFeaturizer()

    loader = dc.data.CSVLoader(
        tasks=chembl_tasks, smiles_field="smiles", featurizer=featurizer)

    if split == "year":
        # Temporal split: train/valid/test CSVs are pre-partitioned by year,
        # so each file is featurized separately.
        print("Featurizing train datasets")
        train_dataset = loader.featurize(
            train_files, shard_size=shard_size)

        print("Featurizing valid datasets")
        valid_dataset = loader.featurize(
            valid_files, shard_size=shard_size)

        print("Featurizing test datasets")
        test_dataset = loader.featurize(
            test_files, shard_size=shard_size)
    else:
        # Featurize once; splitting happens later (outside this excerpt).
        dataset = loader.featurize(dataset_path, shard_size=shard_size)
github deepchem / deepchem / deepchem / molnet / load_function / hppb_datasets.py View on Github external
# NOTE(review): truncated excerpt from deepchem molnet hppb_datasets.py.
# The enclosing function's `def` line is outside this view, and the final
# `splitters` dict is cut off mid-literal.
deepchem.utils.download_url(url=hppb_URL, dest_dir=data_dir)

  # Map the featurizer name to a concrete featurizer object.
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()
  elif featurizer == 'AdjacencyConv':
    featurizer = deepchem.feat.AdjacencyFingerprint(
        max_n_atoms=150, max_valence=6)

  logger.info("Featurizing datasets.")
  # NOTE: this dataset's SMILES column is named 'smile' (singular), unlike
  # the 'smiles' column used by the other loaders in this family.
  loader = deepchem.data.CSVLoader(
      tasks=hppb_tasks, smiles_field='smile', featurizer=featurizer)
  dataset = loader.featurize(input_files=[dataset_file], shard_size=2000)

  logger.info("Removing missing entries...")
  remove_missing_entries(dataset)

  if split == None:
    logger.info("About to transform the data...")
    # Empty transformer list: the loop below is a no-op, kept for symmetry
    # with sibling loaders that do apply transformers here.
    transformers = []
    for transformer in transformers:
      logger.info("Transforming the dataset with transformer ",
                  transformer.__class__.__name__)
      dataset = transformer.transform(dataset)
    return hppb_tasks, (dataset, None, None), transformers

  splitters = {