How to use the deepchem.feat module in deepchem

To help you get started, we’ve selected a few deepchem.feat examples based on popular ways it is used in public projects.
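Every example below builds a featurizer from dc.feat and hands it to a data loader. The featurizer API itself is easy to try in isolation; here is a minimal sketch (the molecules and the choice of CircularFingerprint are illustrative, and RDKit is assumed to be installed):

from rdkit import Chem
import deepchem as dc

# Illustrative molecules; any list of RDKit Mol objects (or SMILES strings in
# recent DeepChem versions) can be featurized the same way.
mols = [Chem.MolFromSmiles(s) for s in ["CCO", "c1ccccc1"]]

# The common pattern: construct a featurizer with its hyperparameters,
# then call featurize() on a list of molecules.
featurizer = dc.feat.CircularFingerprint(size=1024)
features = featurizer.featurize(mols)
print(features.shape)  # (2, 1024): one fingerprint vector per molecule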


github deepchem / deepchem / examples / qm9 / qm9_datasets.py
def load_qm9(featurizer=None, split='random'):
  """Load qm9 datasets."""
  # Featurize qm9 dataset
  print("About to featurize qm9 dataset.")
  current_dir = os.path.dirname(os.path.realpath(__file__))
  dataset_file = os.path.join(current_dir, "./gdb9.sdf")
  if not os.path.exists(dataset_file):
    os.system('sh ' + current_dir + '/get_qm9.sh')

  qm9_tasks = [
      "A", "B", "C", "mu", "alpha", "homo", "lumo", "gap", "r2", "zpve", "cv",
      "u0_atom", "u298_atom", "h298_atom", "g298_atom"
  ]
  if featurizer is None:
    featurizer = dc.feat.CoulombMatrix(29)
  loader = dc.data.SDFLoader(
      tasks=qm9_tasks,
      smiles_field="smiles",
      mol_field="mol",
      featurizer=featurizer)
  dataset = loader.featurize(dataset_file)
  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
      'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=11)
  }
  splitter = splitters[split]
  train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
      dataset)
  transformers = [
      dc.trans.NormalizationTransformer(
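dc.feat.CoulombMatrix needs 3D coordinates, which the SDF file provides in this example. As a rough standalone sketch (the molecule and conformer generation are illustrative, and RDKit is assumed):

from rdkit import Chem
from rdkit.Chem import AllChem
import deepchem as dc

mol = Chem.AddHs(Chem.MolFromSmiles("CCO"))
AllChem.EmbedMolecule(mol, randomSeed=0)  # generate a 3D conformer

featurizer = dc.feat.CoulombMatrix(max_atoms=29)
features = featurizer.featurize([mol])
# One zero-padded Coulomb matrix per molecule; the exact array shape depends
# on the DeepChem version and on options such as randomize and upper_tri.
print(features.shape)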

github deepchem / deepchem / examples / kinase / KINASE_datasets.py
def load_kinase(shard_size=10000, num_shards_per_batch=4):
  """Load Kinase datasets."""
  verbosity = "high"
  train_files = ("KINASE_training_disguised_combined_full.csv.gz")
  valid_files = ("KINASE_test1_disguised_combined_full.csv.gz")
  test_files = ("KINASE_test2_disguised_combined_full.csv.gz")

  # Featurize Kinase dataset
  print("About to featurize KINASE dataset.")
  featurizer = dc.feat.UserDefinedFeaturizer(merck_descriptors)
  KINASE_tasks = (['T_000%d' % i for i in range(13, 100)]
                  + ['T_00%d' % i for i in range(100, 112)])

  loader = dc.data.UserCSVLoader(
      tasks=KINASE_tasks, id_field="Molecule",
      featurizer=featurizer, verbosity=verbosity)

  train_datasets, valid_datasets, test_datasets = [], [], []
  print("Featurizing train datasets")
  train_dataset = loader.featurize(
      train_files, 
      shard_size=shard_size, num_shards_per_batch=num_shards_per_batch)

  print("Featurizing valid datasets")
  valid_dataset = loader.featurize(
      valid_files, shard_size=shard_size)

github deepchem / deepchem / deepchem / molnet / load_function / kaggle_datasets.py
  test_files = os.path.join(data_dir,
                            "KAGGLE_test2_disguised_combined_full.csv.gz")
  if not os.path.exists(train_files):
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_training_disguised_combined_full.csv.gz',
        dest_dir=data_dir)
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test1_disguised_combined_full.csv.gz',
        dest_dir=data_dir)
    deepchem.utils.download_url(
        'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/KAGGLE_test2_disguised_combined_full.csv.gz',
        dest_dir=data_dir)

  # Featurize KAGGLE dataset
  logger.info("About to featurize KAGGLE dataset.")
  featurizer = deepchem.feat.UserDefinedFeaturizer(merck_descriptors)

  loader = deepchem.data.UserCSVLoader(
      tasks=KAGGLE_tasks, id_field="Molecule", featurizer=featurizer)

  logger.info("Featurizing train datasets")
  train_dataset = loader.featurize(train_files, shard_size=shard_size)

  logger.info("Featurizing valid datasets")
  valid_dataset = loader.featurize(valid_files, shard_size=shard_size)

  logger.info("Featurizing test datasets")
  test_dataset = loader.featurize(test_files, shard_size=shard_size)

  logger.info("Remove missing entries from datasets.")
  remove_missing_entries(train_dataset)
  remove_missing_entries(valid_dataset)
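UserDefinedFeaturizer does not compute anything from the molecular structure; it reads precomputed descriptor columns (here the Merck descriptors) straight from the CSV via UserCSVLoader. A toy sketch with made-up column names:

import pandas as pd
import deepchem as dc

# Hypothetical CSV with an id column, one task column, and two descriptor
# columns; the real Merck descriptor set has thousands of columns.
pd.DataFrame({
    "Molecule": ["mol_1", "mol_2"],
    "task_1": [0.5, 1.3],
    "desc_a": [1.0, 2.0],
    "desc_b": [0.1, 0.4],
}).to_csv("toy_descriptors.csv", index=False)

featurizer = dc.feat.UserDefinedFeaturizer(["desc_a", "desc_b"])
loader = dc.data.UserCSVLoader(
    tasks=["task_1"], id_field="Molecule", featurizer=featurizer)
dataset = loader.featurize("toy_descriptors.csv")
print(dataset.X)  # the descriptor columns become the feature matrix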

github deepchem / deepchem / examples / adme / run_benchmarks.py
def load_dataset(dataset_file, featurizer='ECFP', split='index'):
  tasks = ['exp']

  if featurizer == 'ECFP':
    featurizer = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = dc.feat.ConvMolFeaturizer()

  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  dataset = loader.featurize(dataset_file, shard_size=8192)

  transformers = [
      dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
  ]
  for transformer in transformers:
    dataset = transformer.transform(dataset)

  splitters = {
      'index': dc.splits.IndexSplitter(),
      'random': dc.splits.RandomSplitter(),
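The 'ECFP'/'GraphConv' switch is the usual way to choose between fixed-length fingerprints for classical models and graph objects for graph-convolution models. A small comparison (molecules are illustrative):

from rdkit import Chem
import deepchem as dc

mols = [Chem.MolFromSmiles(s) for s in ["CCO", "c1ccccc1"]]

# ECFP: fixed-length bit vectors.
ecfp = dc.feat.CircularFingerprint(size=1024).featurize(mols)
print(ecfp.shape)  # (2, 1024)

# GraphConv: ConvMol graph objects consumed by graph-convolution models.
graphs = dc.feat.ConvMolFeaturizer().featurize(mols)
print(type(graphs[0]))  # ConvMol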

github deepchem / deepchem / deepchem / molnet / load_function / pdbbind_datasets.py
    if not os.path.exists(dataset_file):
      deepchem.utils.download_url(
          'http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/' +
          subset + "_smiles_labels.csv")

    tasks = ["-logKd/Ki"]
    if reload:
      loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
          save_dir)
      if loaded:
        return tasks, all_dataset, transformers

    if featurizer == 'ECFP':
      featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
      featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
      featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
      featurizer = deepchem.feat.RawFeaturizer()

    loader = deepchem.data.CSVLoader(
        tasks=tasks, smiles_field="smiles", featurizer=featurizer)
    dataset = loader.featurize(dataset_file, shard_size=8192)
    df = pd.read_csv(dataset_file)

    if split == None:
      transformers = [
          deepchem.trans.NormalizationTransformer(
              transform_y=True, dataset=dataset)
      ]
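The same four-way featurizer dispatch recurs across the MoleculeNet loaders. WeaveFeaturizer builds atom-pair features for weave models, while RawFeaturizer skips featurization and just passes the molecules (or their SMILES) through. A brief sketch (input is illustrative):

from rdkit import Chem
import deepchem as dc

mols = [Chem.MolFromSmiles("CCO")]

weave = dc.feat.WeaveFeaturizer().featurize(mols)       # WeaveMol objects
raw = dc.feat.RawFeaturizer().featurize(mols)           # the RDKit Mol objects themselves
raw_smiles = dc.feat.RawFeaturizer(smiles=True).featurize(mols)  # SMILES strings instead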

github deepchem / deepchem / deepchem / molnet / load_function / nci_datasets.py
  if reload:
    loaded, all_dataset, transformers = deepchem.utils.save.load_dataset_from_disk(
        save_dir)
    if loaded:
      return all_nci_tasks, all_dataset, transformers

  # Featurize nci dataset
  logger.info("About to featurize nci dataset.")
  if featurizer == 'ECFP':
    featurizer = deepchem.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
    featurizer = deepchem.feat.ConvMolFeaturizer()
  elif featurizer == 'Weave':
    featurizer = deepchem.feat.WeaveFeaturizer()
  elif featurizer == 'Raw':
    featurizer = deepchem.feat.RawFeaturizer()

  loader = deepchem.data.CSVLoader(
      tasks=all_nci_tasks, smiles_field="smiles", featurizer=featurizer)

  dataset = loader.featurize(dataset_file, shard_size=shard_size)

  if split == None:
    logger.info("Split is None, about to transform data")
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset)
    ]
    for transformer in transformers:
      dataset = transformer.transform(dataset)
    return all_nci_tasks, (dataset, None, None), transformers
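When split is None the featurized dataset is normalized directly, as above. The same transform_y normalization can be reproduced on any dc.feat output; a toy sketch with in-memory data (the targets are made up):

import numpy as np
from rdkit import Chem
import deepchem as dc

mols = [Chem.MolFromSmiles(s) for s in ["CCO", "c1ccccc1", "CCN"]]
X = dc.feat.CircularFingerprint(size=1024).featurize(mols)
y = np.array([[0.5], [1.2], [0.8]])  # made-up regression targets

dataset = dc.data.NumpyDataset(X=X, y=y)
transformer = dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)
dataset = transformer.transform(dataset)
print(dataset.y.mean(), dataset.y.std())  # approximately 0 and 1 after normalization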

github deepchem / deepchem / examples / nci / nci_datasets.py
  current_dir = os.path.dirname(os.path.realpath(__file__))

  # Load nci dataset
  print("About to load NCI dataset.")
  dataset_file1_path = os.path.join(
      current_dir, "../../datasets/nci_1.csv.gz")
  dataset_file2_path = os.path.join(
      current_dir, "../../datasets/nci_2.csv.gz")

  dataset_paths = [dataset_file1_path, dataset_file2_path]


  # Featurize nci dataset
  print("About to featurize nci dataset.")
  if featurizer == 'ECFP':
      featurizer = dc.feat.CircularFingerprint(size=1024)
  elif featurizer == 'GraphConv':
      featurizer = dc.feat.ConvMolFeaturizer()

  all_nci_tasks = (['CCRF-CEM', 'HL-60(TB)', 'K-562', 'MOLT-4', 'RPMI-8226',
                    'SR', 'A549/ATCC', 'EKVX', 'HOP-62', 'HOP-92', 'NCI-H226',
                    'NCI-H23', 'NCI-H322M', 'NCI-H460', 'NCI-H522', 'COLO 205',
                    'HCC-2998', 'HCT-116', 'HCT-15', 'HT29', 'KM12', 'SW-620',
                    'SF-268', 'SF-295', 'SF-539', 'SNB-19', 'SNB-75', 'U251',
                    'LOX IMVI', 'MALME-3M', 'M14', 'MDA-MB-435', 'SK-MEL-2',
                    'SK-MEL-28', 'SK-MEL-5', 'UACC-257', 'UACC-62', 'IGR-OV1',
                    'OVCAR-3', 'OVCAR-4', 'OVCAR-5', 'OVCAR-8', 'NCI/ADR-RES',
                    'SK-OV-3', '786-0', 'A498', 'ACHN', 'CAKI-1', 'RXF 393',
                    'SN12C', 'TK-10', 'UO-31', 'PC-3', 'DU-145', 'MCF7',
                    'MDA-MB-231/ATCC', 'MDA-MB-468', 'HS 578T', 'BT-549',
                    'T-47D'])

github deepchem / deepchem / examples / binding_pockets / binding_pocket_datasets.py
    labels_file = os.path.join(pdbbind_dir, "INDEX_core_data.2013")
  elif subset == "refined":
    labels_file = os.path.join(pdbbind_dir, "INDEX_refined_data.2015")
  elif subset == "full":
    labels_file = os.path.join(pdbbind_dir, "INDEX_general_PL_data.2015")
  else:
    raise ValueError("Only core, refined, and full subsets supported.")
  print("About to load contents.")
  if not os.path.exists(labels_file):
    raise ValueError("Run ../pdbbind/get_pdbbind.sh to download dataset.")
  contents_df = load_pdbbind_labels(labels_file)
  ids = contents_df["PDB code"].values
  y = np.array([float(val) for val in contents_df["-logKd/Ki"].values])

  # Define featurizers
  pocket_featurizer = dc.feat.BindingPocketFeaturizer()
  ligand_featurizer = dc.feat.CircularFingerprint(size=1024)

  # Featurize Dataset
  all_features = []
  all_labels = []
  missing_pdbs = []
  all_ids = []
  time1 = time.time()
  for ind, pdb_code in enumerate(ids):
    print("Processing complex %d, %s" % (ind, str(pdb_code)))
    pdb_subdir = os.path.join(pdbbind_dir, pdb_code)
    if not os.path.exists(pdb_subdir):
      print("%s is missing!" % pdb_subdir)
      missing_pdbs.append(pdb_subdir)
      continue
    features, labels = compute_binding_pocket_features(