How to use chemprop - 8 common examples

To help you get started, we’ve selected a few chemprop examples based on popular ways it is used in public projects.

github wengong-jin / chemprop / chemprop / train / train.py
    # train on all molecules when keeping the last (partial) batch; otherwise truncate to a multiple of batch_size
    num_iters = len(data) if args.last_batch else len(data) // args.batch_size * args.batch_size

    if args.parallel_featurization:
        batch_queue = Queue(args.batch_queue_max_size)
        exit_queue = Queue(1)
        batch_process = Process(target=async_mol2graph, args=(batch_queue, data, args, num_iters, args.batch_size, exit_queue, args.last_batch))
        batch_process.start()
        currently_loaded_batches = []

    iter_size = 1 if args.maml else args.batch_size

    for i in trange(0, num_iters, iter_size):
        if args.moe:
            if not args.batch_domain_encs:
                model.compute_domain_encs(train_smiles)  # want to recompute every batch
            mol_batch = [MoleculeDataset(d[i:i + args.batch_size]) for d in data]
            train_batch, train_targets = [], []
            for b in mol_batch:
                tb, tt = b.smiles(), b.targets()
                train_batch.append(tb)
                train_targets.append(tt)
            test_batch = test_smiles[i:i + args.batch_size]
            loss = model.compute_loss(train_batch, train_targets, test_batch)
            model.zero_grad()

            loss_sum += loss.item()
            iter_count += len(mol_batch)
        elif args.maml:
            task_train_data, task_test_data, task_idx = data.sample_maml_task(args)
            mol_batch = task_test_data
            smiles_batch, features_batch, target_batch = task_train_data.smiles(), task_train_data.features(), task_train_data.targets(task_idx)
            # no mask since we only picked data points that have the desired target
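The num_iters expression at the top of this snippet controls whether the trailing partial batch is trained on. A standalone sketch of the same arithmetic with made-up sizes (data_size and batch_size below are hypothetical, not chemprop values):

# Flooring to a multiple of batch_size drops the final partial batch.
data_size, batch_size = 103, 10
num_iters_keep_last = data_size                              # all 103 molecules
num_iters_drop_last = data_size // batch_size * batch_size   # first 100 molecules only

for i in range(0, num_iters_drop_last, batch_size):
    batch = list(range(i, i + batch_size))  # stand-in for slicing a MoleculeDataset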
github wengong-jin / chemprop / chemprop / utils.py
def build_lr_scheduler(optimizer: Optimizer, args: Namespace, total_epochs: List[int] = None) -> _LRScheduler:
    """
    Builds a learning rate scheduler.

    :param optimizer: The Optimizer whose learning rate will be scheduled.
    :param args: Arguments.
    :param total_epochs: Total number of epochs per learning rate phase; defaults to [args.epochs] * args.num_lrs.
    :return: An initialized learning rate scheduler.
    """
    # Learning rate scheduler
    if args.scheduler == 'noam':
        return NoamLR(
            optimizer=optimizer,
            warmup_epochs=args.warmup_epochs,
            total_epochs=total_epochs or [args.epochs] * args.num_lrs,
            steps_per_epoch=args.train_data_size // args.batch_size,
            init_lr=args.init_lr,
            max_lr=args.max_lr,
            final_lr=args.final_lr
        )

    if args.scheduler == 'none':
        return MockLR(optimizer=optimizer, lr=args.init_lr)

    if args.scheduler == 'decay':
        return ExponentialLR(optimizer, args.lr_decay_rate)

    raise ValueError(f'Learning rate scheduler "{args.scheduler}" not supported.')
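For the 'decay' branch, the returned scheduler is an ExponentialLR, which multiplies the learning rate by a fixed factor on every step. A self-contained sketch of that behavior (the model, learning rate, and gamma below are placeholders; gamma plays the role of args.lr_decay_rate):

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import ExponentialLR

model = torch.nn.Linear(10, 1)
optimizer = Adam(model.parameters(), lr=1e-3)
scheduler = ExponentialLR(optimizer, gamma=0.9)

for epoch in range(3):
    optimizer.step()   # normally preceded by a backward pass
    scheduler.step()   # multiplies the learning rate by gamma
    print(scheduler.get_last_lr())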
github wengong-jin / chemprop / chemprop / models / gan.py
            self.act_func,
            nn.Linear(self.hidden_size, self.hidden_size),
            self.act_func,
            nn.Linear(self.hidden_size, self.hidden_size),
            self.act_func,
            nn.Linear(self.hidden_size, 1)
        )
        self.beta = args.wgan_beta

        # the optimizers don't really belong here, but we put them here so that we don't clutter the code for other options
        self.optimizerG = Adam(self.encoder.parameters(), lr=args.init_lr[0] * args.gan_lr_mult, betas=(0, 0.9))
        self.optimizerD = Adam(self.netD.parameters(), lr=args.init_lr[0] * args.gan_lr_mult, betas=(0, 0.9))

        self.use_scheduler = args.gan_use_scheduler
        if self.use_scheduler:
            self.schedulerG = NoamLR(
                self.optimizerG,
                warmup_epochs=args.warmup_epochs,
                total_epochs=args.epochs,
                steps_per_epoch=args.train_data_length // args.batch_size,
                init_lr=args.init_lr[0] * args.gan_lr_mult,
                max_lr=args.max_lr[0] * args.gan_lr_mult,
                final_lr=args.final_lr[0] * args.gan_lr_mult
            )
            self.schedulerD = NoamLR(
                self.optimizerD,
                warmup_epochs=args.warmup_epochs,
                total_epochs=args.epochs,
                steps_per_epoch=(args.train_data_length // args.batch_size) * args.gan_d_per_g,
                init_lr=args.init_lr[0] * args.gan_lr_mult,
                max_lr=args.max_lr[0] * args.gan_lr_mult,
                final_lr=args.final_lr[0] * args.gan_lr_mult
            )
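Note that schedulerD's steps_per_epoch is scaled by args.gan_d_per_g: in WGAN training the critic typically takes several optimizer steps per generator step. A toy, self-contained sketch of that update ratio (the linear layers, sizes, and losses below are stand-ins, not chemprop's real encoder or netD):

import torch
from torch.optim import Adam

netG = torch.nn.Linear(8, 8)  # stand-in generator/encoder
netD = torch.nn.Linear(8, 1)  # stand-in critic
optimizerG = Adam(netG.parameters(), lr=1e-4, betas=(0, 0.9))
optimizerD = Adam(netD.parameters(), lr=1e-4, betas=(0, 0.9))
d_per_g = 5  # plays the role of args.gan_d_per_g

for _ in range(10):
    for _ in range(d_per_g):  # several critic updates per generator update
        real = torch.randn(32, 8)
        fake = netG(torch.randn(32, 8)).detach()  # detach: don't update the generator here
        d_loss = netD(fake).mean() - netD(real).mean()  # WGAN critic loss
        optimizerD.zero_grad()
        d_loss.backward()
        optimizerD.step()
    g_loss = -netD(netG(torch.randn(32, 8))).mean()  # generator loss
    optimizerG.zero_grad()
    g_loss.backward()
    optimizerG.step()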
github wengong-jin / chemprop / chemprop / train / run_training.py
    debug(f'Number of tasks = {args.num_tasks}')

    if args.dataset_type == 'bert_pretraining':
        data.bert_init(args, logger)

    # Split data
    if args.dataset_type == 'regression_with_binning':  # Note: for now, binning based on whole dataset, not just training set
        data, bin_predictions, regression_data = data
        args.bin_predictions = bin_predictions
        debug(f'Splitting data with seed {args.seed}')
        train_data, _, _ = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
        _, val_data, test_data = split_data(regression_data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)
    else:
        debug(f'Splitting data with seed {args.seed}')
        if args.separate_test_set:
            test_data = get_data(path=args.separate_test_set, args=args, features_path=args.separate_test_set_features, logger=logger)
            if args.separate_val_set:
                val_data = get_data(path=args.separate_val_set, args=args, features_path=args.separate_val_set_features, logger=logger)
                train_data = data  # nothing to split; we already got our test and val sets
            else:
                train_data, val_data, _ = split_data(data=data, split_type=args.split_type, sizes=(0.8, 0.2, 0.0), seed=args.seed, args=args, logger=logger)
        else:
            train_data, val_data, test_data = split_data(data=data, split_type=args.split_type, sizes=args.split_sizes, seed=args.seed, args=args, logger=logger)

    # Optionally replace test data with train or val data
    if args.test_split == 'train':
        test_data = train_data
    elif args.test_split == 'val':
        test_data = val_data

    if args.dataset_type == 'classification':
        class_sizes = get_class_sizes(data)
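split_data above partitions a dataset into train/validation/test according to args.split_sizes, seeded for reproducibility. A minimal stand-in showing the seeded random split (plain Python; chemprop's split_data also supports other split_type values such as scaffold splits):

import random

def simple_split(data, sizes=(0.8, 0.1, 0.1), seed=0):
    data = list(data)
    random.Random(seed).shuffle(data)  # deterministic for a fixed seed
    n_train = int(sizes[0] * len(data))
    n_val = int(sizes[1] * len(data))
    return data[:n_train], data[n_train:n_train + n_val], data[n_train + n_val:]

train, val, test = simple_split(range(100), seed=42)  # 80 / 10 / 10 items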
github wengong-jin / chemprop / chemprop / train / run_training.py
    if logger is not None:
        debug, info = logger.debug, logger.info
    else:
        debug = info = print

    # Set GPU
    if args.gpu is not None:
        torch.cuda.set_device(args.gpu)

    # Print args
    debug(pformat(vars(args)))

    # Get data
    debug('Loading data')
    args.task_names = get_task_names(args.data_path)
    desired_labels = get_desired_labels(args, args.task_names)
    data = get_data(path=args.data_path, args=args, logger=logger)
    args.num_tasks = data.num_tasks()
    args.features_size = data.features_size()
    args.real_num_tasks = args.num_tasks - args.features_size if args.predict_features else args.num_tasks
    debug(f'Number of tasks = {args.num_tasks}')
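get_task_names reads the header of the data CSV to decide how many prediction targets there are, which is where args.num_tasks ultimately comes from. A hypothetical sketch of that behavior (task_names_from_csv is illustrative, not chemprop's actual helper):

import csv

def task_names_from_csv(path):
    with open(path) as f:
        header = next(csv.reader(f))
    return header[1:]  # column 0 is assumed to hold the SMILES strings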

github wengong-jin / chemprop / chemprop / random_forest.py
    if args.dataset_type == 'regression':
        model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
    elif args.dataset_type == 'classification':
        model = RandomForestClassifier(n_estimators=args.num_trees, n_jobs=-1)
    else:
        raise ValueError(f'dataset_type "{args.dataset_type}" not supported.')

    train_targets = train_data.targets()
    if train_data.num_tasks() == 1:
        train_targets = [targets[0] for targets in train_targets]

    model.fit(train_data.features(), train_targets)

    test_preds = model.predict(test_data.features())
    if train_data.num_tasks() == 1:
        test_preds = [[pred] for pred in test_preds]

    scores = evaluate_predictions(
        preds=test_preds,
        targets=test_data.targets(),
        metric_func=metric_func,
        dataset_type=args.dataset_type
    )

    return scores
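The [[pred] for pred in test_preds] wrapping above exists because the evaluation code expects one list of values per molecule (one entry per task), while scikit-learn returns a flat array for single-task data. A toy illustration (random data, hypothetical sizes):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

X, y = np.random.rand(20, 5), np.random.rand(20)
model = RandomForestRegressor(n_estimators=10, n_jobs=-1).fit(X, y)
flat_preds = model.predict(X)       # shape (20,)
preds = [[p] for p in flat_preds]   # shape (20, 1): one task per molecule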
github wengong-jin / chemprop / chemprop / random_forest.py
        if args.dataset_type == 'regression':
            model = RandomForestRegressor(n_estimators=args.num_trees, n_jobs=-1)
        elif args.dataset_type == 'classification':
            model = RandomForestClassifier(class_weight=args.class_weight, n_estimators=args.num_trees, n_jobs=-1)
        else:
            raise ValueError(f'dataset_type "{args.dataset_type}" not supported.')

        model.fit(train_features, train_targets)

        test_preds = model.predict(test_features)

        test_preds = [[pred] for pred in test_preds]
        test_targets = [[target] for target in test_targets]

        score = evaluate_predictions(
            preds=test_preds,
            targets=test_targets,
            metric_func=metric_func,
            dataset_type=args.dataset_type
        )
        scores.append(score[0])

    return scores
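evaluate_predictions applies metric_func to each task's predictions and targets. A minimal stand-in for the single-task regression case (the rmse function below is illustrative, not chemprop's metric_func):

from math import sqrt

def rmse(targets, preds):
    return sqrt(sum((t - p) ** 2 for t, p in zip(targets, preds)) / len(targets))

test_targets = [[0.0], [0.5], [0.3]]  # one inner list per molecule
test_preds = [[0.1], [0.4], [0.35]]
score = rmse([t[0] for t in test_targets], [p[0] for p in test_preds])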
github wengong-jin / chemprop / end_to_end.py
    parser.add_argument('--train_path', type=str, required=True,
                        help='Path to CSV file containing training data in chronological order')
    parser.add_argument('--val_path', type=str, required=True,
                        help='Path to CSV file containing val data in chronological order')
    parser.add_argument('--train_save', type=str, required=True,
                        help='Path to CSV file for new train data')
    parser.add_argument('--val_save', type=str, required=True,
                        help='Path to CSV file for new val data')
    parser.add_argument('--val_frac', type=float, default=0.2,
                        help='Fraction of data to use for validation')
    parser.add_argument('--train_val_save', type=str, required=True,
                        help='Path to CSV file for combined train and val data')
    args = parser.parse_args()

    set_logger(logger, args.save_dir, args.quiet)
    modify_train_args(args)
    modify_hyper_opt_args(args)

    # Preprocess train and validation data
    resplit(args)
    merge_train_val(args)
    for path in [args.train_save, args.val_save, args.train_val_save]:
        args.data_path = path
        args.save_path = path
        average_duplicates(args)

    # Optimize hyperparameters
    args.data_path = args.train_save
    args.separate_test_set = args.val_save
    optimize_hyperparameters(args)

    # Determine best hyperparameters, update args, and train
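The flags above are consumed from the command line, but argparse also accepts an explicit argument list, which is handy for exercising the script's interface in-process. A small sketch reproducing two of the flags (train.csv and val.csv are placeholder paths):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--train_path', type=str, required=True)
parser.add_argument('--val_path', type=str, required=True)

# parse_args takes an explicit list instead of reading sys.argv
args = parser.parse_args(['--train_path', 'train.csv', '--val_path', 'val.csv'])
print(args.train_path)  # train.csv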