How to use the deepchem.splits module in deepchem

To help you get started, we've selected a few deepchem.splits examples, based on popular ways the module is used in public projects.

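All of the examples below follow the same basic pattern: pick a splitter class from deepchem.splits and call train_valid_test_split on a dataset. As a quick orientation, here is a minimal, self-contained sketch of that pattern on a toy in-memory dataset (the shapes and fractions are arbitrary):

import numpy as np
import deepchem as dc

# Toy dataset: 100 samples, 10 features, 1 regression task
X = np.random.rand(100, 10)
y = np.random.rand(100, 1)
dataset = dc.data.NumpyDataset(X, y)

splitter = dc.splits.RandomSplitter()
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)
print(len(train), len(valid), len(test))  # 80 / 10 / 10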

github deepchem / deepchem / examples / benchmark_variable.py
      loading_functions = {  # excerpt begins mid-dictionary
        'pdbbind': dc.molnet.load_pdbbind_grid,
        'qm7': dc.molnet.load_qm7_from_mat,
        'sampl': dc.molnet.load_sampl,
        'tox21': dc.molnet.load_tox21
      }

      tasks, all_dataset, transformers = loading_functions[dataset](
          featurizer=featurizer, reload=reload, split='index')
      all_dataset = dc.data.DiskDataset.merge(all_dataset)
      for seed in [1, 2, 3, 4, 5]:
        for frac_train in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
          splitters = {
            'index': dc.splits.IndexSplitter(),
            'random': dc.splits.RandomSplitter(),
            'scaffold': dc.splits.ScaffoldSplitter(),
            'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
          }
          splitter = splitters[split]
          np.random.seed(seed)
          train, valid, test = splitter.train_valid_test_split(all_dataset,
                                                               frac_train=frac_train,
                                                               frac_valid=1-frac_train,
                                                               frac_test=0.)
          test = valid  # frac_test is 0, so the validation set doubles as the test set
          if mode == 'classification':
            train_score, valid_score, test_score = benchmark_classification(
                train, valid, test, tasks, transformers, n_features, metric,
                model, test=False, hyper_parameters=hyper_parameters, seed=seed)
          elif mode == 'regression':
            train_score, valid_score, test_score = benchmark_regression(
                train, valid, test, tasks, transformers, n_features, metric,
                model, test=False, hyper_parameters=hyper_parameters, seed=seed)
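The loop above rebuilds the splitters dictionary on every iteration and calls np.random.seed(seed) before each split so that RandomSplitter is reproducible across seeds. In recent DeepChem versions, train_valid_test_split also accepts a seed argument directly, so a hedged equivalent (toy dataset, arbitrary fractions) is:

import numpy as np
import deepchem as dc

dataset = dc.data.NumpyDataset(np.random.rand(50, 4), np.zeros((50, 1)))

# Passing seed directly reproduces the split without touching NumPy's
# global RNG state.
train, valid, test = dc.splits.RandomSplitter().train_valid_test_split(
    dataset, frac_train=0.7, frac_valid=0.3, frac_test=0.0, seed=42)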
github deepchem / deepchem / examples / pdbbind / pdbbind_datasets.py
def load_pdbbind_grid(split="index", featurizer="grid", subset="core"):
  """Load PDBBind datasets. Does not do train/test split"""
  dataset, tasks = featurize_pdbbind(feat=featurizer, subset=subset)
  dataset.w

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'scaffold': dc.splits.ScaffoldSplitter(),
      'stratified': dc.splits.RandomStratifiedSplitter()
  }
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = []
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)

  return tasks, (train, valid, test), transformers
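A note on the 'scaffold' entry above: ScaffoldSplitter groups compounds by their Bemis-Murcko scaffold so that structurally related molecules never straddle the train/test boundary. It requires RDKit and expects dataset.ids to hold SMILES strings. A minimal sketch with illustrative SMILES:

import numpy as np
import deepchem as dc

# Illustrative SMILES only; ScaffoldSplitter reads them from dataset.ids.
smiles = ["CCO", "CCCO", "c1ccccc1", "c1ccccc1O", "C1CCCCC1", "C1CCCC1"]
dataset = dc.data.NumpyDataset(
    X=np.zeros((len(smiles), 1)), ids=np.array(smiles))

train, valid, test = dc.splits.ScaffoldSplitter().train_valid_test_split(
    dataset, frac_train=0.5, frac_valid=0.25, frac_test=0.25)
print(train.ids, valid.ids, test.ids)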
github deepchem / deepchem / examples / binding_pockets / binding_pocket_datasets.py
def load_pdbbind_pockets(split="index", subset="core"):
  """Load PDBBind datasets. Does not do train/test split"""
  dataset, tasks = featurize_pdbbind_pockets(subset=subset)

  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter()}
  splitter = splitters[split]
  # DEBUG: report the shape of each dataset array
  for attr in ("X", "y", "w", "ids"):
    print("dataset.%s.shape" % attr)
    print(getattr(dataset, attr).shape)
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = []
  for transformer in transformers:
    train = transformer.transform(train)
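Both PDBBind loaders call train_valid_test_split with no fraction arguments. DeepChem's defaults are frac_train=0.8, frac_valid=0.1, frac_test=0.1, which is easy to verify on a toy dataset:

import numpy as np
import deepchem as dc

dataset = dc.data.NumpyDataset(np.random.rand(100, 3), np.zeros((100, 1)))

# No fractions passed: the default is an 80/10/10 split.
train, valid, test = dc.splits.IndexSplitter().train_valid_test_split(dataset)
print(len(train), len(valid), len(test))  # 80 10 10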
github deepchem / deepchem / deepchem / molnet / load_function / bbbc_datasets.py
  # Format is: Image_name count1 count2
  lines = [x.split("\t") for x in lines]
  counts = [(float(x[1]) + float(x[2])) / 2.0 for x in lines]
  y = np.reshape(np.array(counts), (len(counts), 1))
  ids = [x[0] for x in lines]

  # This is a kludgy way to add y to the dataset. Could it be done better?
  dataset = deepchem.data.DiskDataset.from_numpy(dataset.X, y, ids=ids)

  if split is None:
    transformers = []
    logger.info("Split is None, no transformers used for the dataset.")
    return bbbc002_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
  }
  if split not in splitters:
    raise ValueError("Only index and random splits supported.")
  splitter = splitters[split]

  logger.info("About to split dataset with {} splitter.".format(split))
  train, valid, test = splitter.train_valid_test_split(dataset)
  all_dataset = (train, valid, test)
  transformers = []
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  return bbbc002_tasks, all_dataset, transformers
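The BBBC loader restricts itself to index and random splits because image data has no chemical structure to scaffold on. IndexSplitter is the simplest splitter of all: it slices the dataset in its stored order, as this sketch shows:

import numpy as np
import deepchem as dc

# Feature values equal each sample's position, making the
# order-preserving behavior of IndexSplitter visible.
X = np.arange(10, dtype=float).reshape(10, 1)
dataset = dc.data.NumpyDataset(X, np.zeros((10, 1)))

train, valid, test = dc.splits.IndexSplitter().train_valid_test_split(
    dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)
print(train.X.ravel())  # [0. 1. 2. 3. 4. 5. 6. 7.]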
github deepchem / deepchem / examples / roitberg / roitberg.py
  max_atoms = 23
  batch_size = 64  # CHANGED FROM 16
  layer_structures = [128, 128, 64]
  atom_number_cases = [1, 6, 7, 8]

  metric = [
      dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression"),
      dc.metrics.Metric(dc.metrics.pearson_r2_score, mode="regression")
  ]

  print("Fitting new model...")

  train_valid_dataset, test_dataset, all_groups = load_roiterberg_ANI(
      mode="atomization")

  splitter = dc.splits.RandomGroupSplitter(
      broadcast(train_valid_dataset, all_groups))

  print("Performing 1-fold split...")
  train_dataset, valid_dataset = splitter.train_test_split(
      train_valid_dataset, train_dir=train_dir, test_dir=valid_dir)

  transformers = [
      dc.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]

  print("Total training set shape: ", train_dataset.get_shape())

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
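RandomGroupSplitter takes a per-sample list of group labels at construction time and guarantees that all members of a group land in the same partition (the broadcast helper above expands per-molecule groups to per-sample labels). A hedged sketch with toy group labels:

import numpy as np
import deepchem as dc

# Toy labels: samples sharing a label always stay in the same partition.
groups = ["a", "a", "a", "b", "b", "b", "c", "c", "d", "d"]
dataset = dc.data.NumpyDataset(np.random.rand(10, 3), np.zeros((10, 1)))

splitter = dc.splits.RandomGroupSplitter(groups)
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.5, frac_valid=0.25, frac_test=0.25)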
github deepchem / deepchem / examples / low_data / muv_graph_conv_one_fold.py
import deepchem as dc
from datasets import load_muv_convmol

# 4-fold task splits
K = 4
# num positive/negative ligands per support set
n_pos = 10
n_neg = 10
# 20 trials on the held-out test tasks
n_trials = 20

muv_tasks, dataset, transformers = load_muv_convmol()

# Define metric
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")

task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)

train_folds = fold_datasets[:-1]
train_dataset = dc.splits.merge_fold_datasets(train_folds)
test_dataset = fold_datasets[-1]

# Get supports on test-set
support_generator = dc.data.SupportGenerator(test_dataset, n_pos, n_neg,
                                             n_trials)

# Compute accuracies
task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}

for trial_num, (task, support) in enumerate(support_generator):
  print("Starting trial %d" % trial_num)

  # Number of features on conv-mols
  n_feat = 75
  # Batch size of models
  batch_size = 50
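Unlike the row-wise splitters above, TaskSplitter partitions the columns of y (the tasks). That is what makes this low-data protocol possible: the model trains on some tasks and is evaluated on entirely held-out tasks. A minimal sketch with a toy 12-task dataset:

import numpy as np
import deepchem as dc

# Toy multitask dataset: 20 samples, 12 binary tasks (4 folds of 3 tasks).
X = np.random.rand(20, 5)
y = np.random.randint(2, size=(20, 12)).astype(float)
dataset = dc.data.NumpyDataset(X, y)

fold_datasets = dc.splits.TaskSplitter().k_fold_split(dataset, 4)
for fold in fold_datasets:
  print(fold.get_task_names())  # 3 task columns per fold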
github deepchem / deepchem / examples / low_data / muv_rf_one_fold.py
import deepchem as dc
from datasets import load_muv_ecfp
from sklearn.ensemble import RandomForestClassifier

# 4-fold splits
K = 4
# num positive/negative ligands
n_pos = 10
n_neg = 10
# 20 trials on the held-out test tasks
n_trials = 20

muv_tasks, dataset, transformers = load_muv_ecfp()

# Define metric
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")

task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)

train_folds = fold_datasets[:-1] 
train_dataset = dc.splits.merge_fold_datasets(train_folds)
test_dataset = fold_datasets[-1]

# Get supports on test-set
support_generator = dc.data.SupportGenerator(
    test_dataset, n_pos, n_neg, n_trials)

# Compute accuracies
task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}
for (task, support) in support_generator:
  # Train model on support
  sklearn_model = RandomForestClassifier(
      class_weight="balanced", n_estimators=100)
github deepchem / deepchem / examples / qm9 / qm9_datasets.py
  os.system('sh ' + current_dir + '/get_qm9.sh')

  qm9_tasks = [
      "A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "cv",
      "u0_atom", "u298_atom", "h298_atom", "g298_atom"
  ]
  if featurizer is None:
    featurizer = dc.feat.CoulombMatrix(29)
  loader = dc.data.SDFLoader(
      tasks=qm9_tasks,
      smiles_field="smiles",
      mol_field="mol",
      featurizer=featurizer)
  dataset = loader.featurize(dataset_file)
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=11)
  }
  splitter = splitters[split]
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset)
  transformers = [
      dc.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]
  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
  return qm9_tasks, (train_dataset, valid_dataset, test_dataset), transformers
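The 'stratified' option here is SingletaskStratifiedSplitter with task_number=11, i.e. it stratifies on the u0_atom column so that each partition spans the full range of that regression target. A minimal sketch on toy data:

import numpy as np
import deepchem as dc

# Toy dataset: 40 samples, 3 regression tasks; stratify on task 1.
X = np.random.rand(40, 2)
y = np.random.rand(40, 3)
dataset = dc.data.NumpyDataset(X, y)

splitter = dc.splits.SingletaskStratifiedSplitter(task_number=1)
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)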
github simonfqy / PADME / dcCustom / molnet / load_function / nci60_dataset.py
featurizer = dcCustom.feat.ConvMolFeaturizer()

  loader = dcCustom.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", protein_field="proteinName",
      source_field="protein_dataset", featurizer=featurizer,
      prot_seq_dict=prot_seq_dict)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  # print("About to transform data")
  # for transformer in transformers:
  #   dataset = transformer.transform(dataset)
    
  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': dcCustom.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter(),
      'butina': deepchem.splits.ButinaSplitter(),
      'task': deepchem.splits.TaskSplitter()
  }
  splitter = splitters[split]  
  
  # HACK: We set frac_train to 1.0 because we assume the NCI60 dataset is for
  # prediction only: there is no ground truth. To predict all drug-target
  # pairs, every sample must go into the "training" set, though the name is a
  # misnomer here.
  train, valid, test = splitter.train_valid_test_split(
      dataset, frac_train=1.0, frac_valid=0.0, frac_test=0.0)
  all_dataset = (train, valid, test)
  if reload:
    deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
                                             transformers)
  
  return tasks, all_dataset, transformers
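This loader also registers ButinaSplitter, which clusters compounds by fingerprint Tanimoto similarity (Butina clustering, via RDKit) and, like ScaffoldSplitter, reads SMILES from dataset.ids. A hedged sketch; note that in older DeepChem versions the cluster structure, not the frac arguments, effectively determines the partition sizes:

import numpy as np
import deepchem as dc

# Illustrative SMILES; whole similarity clusters are assigned to partitions.
smiles = ["CCO", "CCCO", "c1ccccc1", "c1ccccc1C", "CC(=O)O", "CCC(=O)O"]
dataset = dc.data.NumpyDataset(
    X=np.zeros((len(smiles), 1)), ids=np.array(smiles))

train, valid, test = dc.splits.ButinaSplitter().train_valid_test_split(dataset)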
github deepchem / deepchem / examples / low_data / sider_rf_one_fold.py
import deepchem as dc
from datasets import load_sider_ecfp
from sklearn.ensemble import RandomForestClassifier

# 4-fold splits
K = 4
# num positive/negative ligands
n_pos = 10
n_neg = 10
# 20 trials on the held-out test tasks
n_trials = 20

sider_tasks, dataset, transformers = load_sider_ecfp()

# Define metric
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, mode="classification")

task_splitter = dc.splits.TaskSplitter()
fold_datasets = task_splitter.k_fold_split(dataset, K)

train_folds = fold_datasets[:-1] 
train_dataset = dc.splits.merge_fold_datasets(train_folds)
test_dataset = fold_datasets[-1]

# Get supports on test-set
support_generator = dc.data.SupportGenerator(
    test_dataset, n_pos, n_neg, n_trials)

# Compute accuracies
task_scores = {task: [] for task in range(len(test_dataset.get_task_names()))}
for (task, support) in support_generator:
  # Train model on support
  sklearn_model = RandomForestClassifier(
      class_weight="balanced", n_estimators=100)