How to use the deepchem.splits.RandomSplitter class in deepchem

To help you get started, we've selected a few deepchem examples based on popular ways RandomSplitter is used in public projects.


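Before digging into the project snippets below, here is a minimal, self-contained sketch of the class (the toy X/y arrays are ours, not taken from any of these projects):

import numpy as np
import deepchem as dc

# Toy data: 100 samples, 5 features, one regression task.
X = np.random.rand(100, 5)
y = np.random.rand(100, 1)
dataset = dc.data.NumpyDataset(X, y)

# RandomSplitter shuffles rows before splitting; pass a seed for a
# reproducible split.
splitter = dc.splits.RandomSplitter()
train, valid, test = splitter.train_valid_test_split(
    dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1, seed=42)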
From deepchem/deepchem: examples/qm7/qm7_datasets.py
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids=None)
  print(len(dataset))

  current_dir = os.path.dirname(os.path.realpath(__file__))
  split_file = os.path.join(current_dir, "qm7_splits.csv")

  split_indices = []
  with open(split_file, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
      row_int = [int(x) for x in row]
      split_indices.append(row_int)

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1]),
      'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
  }
  splitter = splitters[split]
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset)
  print(len(train_dataset))
  print(len(valid_dataset))
  print(len(test_dataset))

  transformers = [
      dc.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
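The examples on this page only call train_valid_test_split and train_test_split, but RandomSplitter also inherits k_fold_split from the base Splitter class, which is useful for cross-validating a small dataset like qm7. A minimal sketch (the loop variables are ours):

splitter = dc.splits.RandomSplitter()
# k_fold_split returns a list of (train, cv) dataset pairs, one per fold.
folds = splitter.k_fold_split(dataset, 5)
for fold_train, fold_cv in folds:
  print(len(fold_train), len(fold_cv))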
From deepchem/deepchem: deepchem/molnet/load_function/pcba_datasets.py
  dataset = loader.featurize(dataset_file)

  if split is None:
    transformers = [
        deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]

    logger.info("Split is None, about to transform data")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return PCBA_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split dataset using {} splitter.".format(split))
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=train)
  ]

  logger.info("About to transform dataset.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)
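The function above is the body of a MolNet load function: callers never construct RandomSplitter directly, they select it by name. A sketch of the calling side (assuming a deepchem version whose load_pcba accepts a split keyword):

import deepchem

# 'random' picks deepchem.splits.RandomSplitter() from the dict above.
tasks, datasets, transformers = deepchem.molnet.load_pcba(split='random')
train, valid, test = datasets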
From deepchem/deepchem: examples/qm7/qm7_datasets.py
  w = np.ones_like(y)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids=None)

  current_dir = os.path.dirname(os.path.realpath(__file__))
  split_file = os.path.join(current_dir, "qm7_splits.csv")

  split_indices = []
  with open(split_file, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
      row_int = [int(x) for x in row]
      split_indices.append(row_int)

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1]),
      'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
  }
  splitter = splitters[split]
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset)

  transformers = [
      dc.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset)
  ]

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
From deepchem/deepchem: deepchem/molnet/load_function/clintox_datasets.py
  # Transform clintox dataset
  if split is None:
    transformers = [
        deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]

    logger.info("Split is None, about to transform data.")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return clintox_tasks, (dataset, None, None), transformers

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'scaffold': deepchem.splits.ScaffoldSplitter()
  }
  splitter = splitters[split]
  logger.info("About to split data with {} splitter.".format(split))
  train, valid, test = splitter.train_valid_test_split(dataset)

  transformers = [
      deepchem.trans.BalancingTransformer(transform_w=True, dataset=train)
  ]

  logger.info("About to transform data.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)
From deepchem/deepchem: deepchem/molnet/load_function/chembl_datasets.py
  if split is None:
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset)
    ]

    logger.info("Split is None, about to transform data.")
    for transformer in transformers:
      dataset = transformer.transform(dataset)

    return chembl_tasks, (dataset, None, None), transformers

  if split != "year":
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
        'scaffold': deepchem.splits.ScaffoldSplitter()
    }

    splitter = splitters[split]
    logger.info("Performing new split.")
    train, valid, test = splitter.train_valid_test_split(dataset)

  # When split == "year", train/valid/test come from the pre-made
  # time-based split handled earlier in the full loader.
  transformers = [
      deepchem.trans.NormalizationTransformer(transform_y=True, dataset=train)
  ]

  logger.info("About to transform data.")
  for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)
From deepchem/deepchem: deepchem/molnet/load_function/qm8_datasets.py
    elif featurizer == "Weave":
      featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == "smiles2img":
      img_spec = kwargs.get("img_spec", "std")
      img_size = kwargs.get("img_size", 80)
      featurizer = deepchem.feat.SmilesToImage(
          img_size=img_size, img_spec=img_spec)
    loader = deepchem.data.CSVLoader(
        tasks=qm8_tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file)

  if split is None:
    raise ValueError("A split must be specified for qm8.")

  splitters = {
      'index': deepchem.splits.IndexSplitter(),
      'random': deepchem.splits.RandomSplitter(),
      'stratified': deepchem.splits.SingletaskStratifiedSplitter(task_number=0)
  }
  splitter = splitters[split]
  frac_train = kwargs.get("frac_train", 0.8)
  frac_valid = kwargs.get('frac_valid', 0.1)
  frac_test = kwargs.get('frac_test', 0.1)

  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset,
      frac_train=frac_train,
      frac_valid=frac_valid,
      frac_test=frac_test)
  # move_mean is read from kwargs earlier in the full loader.
  transformers = [
      deepchem.trans.NormalizationTransformer(
          transform_y=True, dataset=train_dataset, move_mean=move_mean)
  ]
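The snippet cuts off before the transformers are applied. In the full loader the remainder follows the same pattern as the other examples on this page; a sketch of what comes next (not a verbatim quote):

  for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

  return qm8_tasks, (train_dataset, valid_dataset, test_dataset), transformers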
From deepchem/deepchem: examples/qm7/qm7_sklearn.py
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")
if os.path.exists(base_dir):
  shutil.rmtree(base_dir)
os.makedirs(base_dir)

max_num_atoms = 23
featurizer = dc.feat.CoulombMatrixEig(max_num_atoms)
input_file = "gdb7.sdf"
tasks = ["u0_atom"]
smiles_field = "smiles"
mol_field = "mol"

loader = dc.data.SDFLoader(
    tasks, smiles_field=smiles_field, mol_field=mol_field, featurizer=featurizer)
# data_dir is defined earlier in the original script.
dataset = loader.featurize(input_file, data_dir)
random_splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = random_splitter.train_test_split(dataset, train_dir, test_dir, frac_train=0.8)
#transformers = [dc.trans.NormalizationTransformer(transform_X=True, dataset=train_dataset), dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]

for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    test_dataset = transformer.transform(test_dataset)

regression_metric = dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression")

def model_builder(model_dir):
  sklearn_model = KernelRidge(kernel="rbf", alpha=5e-4, gamma=0.008)
  return dc.models.SklearnModel(sklearn_model, model_dir)

# model_dir is defined earlier in the original script.
model = dc.models.SingletaskToMultitask(tasks, model_builder, model_dir)
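The original script goes on to fit and score this model; a hedged continuation showing how the metric and transformers defined above are typically used (not part of the snippet itself):

model.fit(train_dataset)
# evaluate() undoes the y-normalization via the transformers list.
train_scores = model.evaluate(train_dataset, [regression_metric], transformers)
test_scores = model.evaluate(test_dataset, [regression_metric], transformers)
print("Train MAE:", train_scores)
print("Test MAE:", test_scores)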
From deepchem/deepchem: examples/muv/muv_datasets.py
  MUV_tasks = sorted(['MUV-692', 'MUV-689', 'MUV-846', 'MUV-859', 'MUV-644',
                      'MUV-548', 'MUV-852', 'MUV-600', 'MUV-810', 'MUV-712',
                      'MUV-737', 'MUV-858', 'MUV-713', 'MUV-733', 'MUV-652',
                      'MUV-466', 'MUV-832'])

  loader = dc.load.DataLoader(
      tasks=MUV_tasks, smiles_field="smiles",
      featurizer=featurizer_func, verbosity="high")
  dataset = loader.featurize(dataset_file)

  # Initialize transformers 
  transformers = [
      dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)]
  print("About to transform data")
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {'index': dc.splits.IndexSplitter(),
               'random': dc.splits.RandomSplitter(),
               'scaffold': dc.splits.ScaffoldSplitter()}
  splitter = splitters[split]
  train, valid, test = splitter.train_valid_test_split(
      dataset, compute_feature_statistics=False)
  return MUV_tasks, (train, valid, test), transformers
From deepchem/deepchem: examples/chembl/chembl_datasets.py
print("About to transform data")
    if split == "year":
        transformers = [
            dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
        for transformer in transformers:
            train = transformer.transform(train_dataset)
            valid = transformer.transform(valid_dataset)
            test = transformer.transform(test_dataset)
    else:
        # Note that here the transformer is fit on the full dataset before
        # splitting, unlike the MolNet loaders above, which fit on train only.
        transformers = [
            dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
        for transformer in transformers:
            dataset = transformer.transform(dataset)

    splitters = {'index': dc.splits.IndexSplitter(),
                 'random': dc.splits.RandomSplitter(),
                 'scaffold': dc.splits.ScaffoldSplitter()}
    if split in splitters:
        splitter = splitters[split]
        print("Performing new split.")
        train, valid, test = splitter.train_valid_test_split(dataset)


    return chembl_tasks, (train, valid, test), transformers