# Shared imports for the DeepChem dataset-loading snippets below. The
# snippets come from different loader scripts, so both the `dc` alias and
# the full `deepchem` name appear; `logger` is module-level logging.
import csv
import logging
import os
import shutil

import numpy as np
import deepchem
import deepchem as dc
from sklearn.kernel_ridge import KernelRidge

logger = logging.getLogger(__name__)
dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids=None)
print(len(dataset))
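# The snippet above assumes featurized arrays X, y, and w already exist. A
# minimal self-contained sketch with synthetic data (the shapes below are
# illustrative assumptions, not the real QM7 featurization):
X_demo = np.random.rand(100, 23)   # hypothetical feature matrix
y_demo = np.random.rand(100, 1)    # hypothetical regression targets
w_demo = np.ones_like(y_demo)      # uniform example weights
demo = dc.data.DiskDataset.from_numpy(X_demo, y_demo, w_demo)
print(len(demo))                   # -> 100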
current_dir = os.path.dirname(os.path.realpath(__file__))
split_file = os.path.join(current_dir, "qm7_splits.csv")
split_indices = []
with open(split_file, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        row_int = [int(x) for x in row]
        split_indices.append(row_int)
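# Each line of qm7_splits.csv is expected to be a comma-separated list of
# integer row indices, one fold per line. A quick self-contained check of
# the parsing logic on hypothetical data:
import io

sample = io.StringIO("0,3,5\n1,2,4\n")
parsed = [[int(x) for x in row] for row in csv.reader(sample)]
assert parsed == [[0, 3, 5], [1, 2, 4]]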
splitters = {
    'index': dc.splits.IndexSplitter(),
    'random': dc.splits.RandomSplitter(),
    'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1]),
    'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
}
splitter = splitters[split]
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset)
print(len(train_dataset))
print(len(valid_dataset))
print(len(test_dataset))
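# train_valid_test_split defaults to an 80/10/10 partition, which is what
# the three print statements above should reflect. A self-contained check
# on synthetic data:
demo = dc.data.NumpyDataset(np.random.rand(100, 3), np.random.rand(100, 1))
tr, va, te = dc.splits.RandomSplitter().train_valid_test_split(demo)
print(len(tr), len(va), len(te))   # -> 80 10 10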
transformers = [
    dc.trans.NormalizationTransformer(
        transform_y=True, dataset=train_dataset)
]
for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)

dataset = loader.featurize(dataset_file)
if split is None:
    transformers = [
        deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]
    logger.info("Split is None, about to transform data.")
    for transformer in transformers:
        dataset = transformer.transform(dataset)
    return PCBA_tasks, (dataset, None, None), transformers
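# A hedged usage sketch for the loader pattern above; the function name
# load_pcba is an assumption based on the PCBA_tasks return value:
# tasks, (train, valid, test), transformers = load_pcba(split=None)
# With split=None, only the first element of the dataset triple is populated.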
splitters = {
    'index': deepchem.splits.IndexSplitter(),
    'random': deepchem.splits.RandomSplitter(),
    'scaffold': deepchem.splits.ScaffoldSplitter()
}
splitter = splitters[split]
logger.info("About to split dataset using {} splitter.".format(split))
train, valid, test = splitter.train_valid_test_split(dataset)
transformers = [
    deepchem.trans.BalancingTransformer(transform_w=True, dataset=train)
]
logger.info("About to transform dataset.")
for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)
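# BalancingTransformer reweights the w array so each class contributes equal
# total weight, which matters for heavily imbalanced assays like PCBA. A
# self-contained sketch (synthetic labels; transform_w=True matches the older
# DeepChem API used in these snippets):
y_imb = np.concatenate([np.zeros((90, 1)), np.ones((10, 1))])
ds_imb = dc.data.NumpyDataset(np.random.rand(100, 8), y_imb)
balancer = dc.trans.BalancingTransformer(transform_w=True, dataset=ds_imb)
ds_bal = balancer.transform(ds_imb)
# Per-class weight totals are now (approximately) equal:
print(ds_bal.w[y_imb == 0].sum(), ds_bal.w[y_imb == 1].sum())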
w = np.ones_like(y)
dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids=None)
current_dir = os.path.dirname(os.path.realpath(__file__))
split_file = os.path.join(current_dir, "qm7_splits.csv")
split_indices = []
with open(split_file, 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        row_int = [int(x) for x in row]
        split_indices.append(row_int)
splitters = {
    'index': dc.splits.IndexSplitter(),
    'random': dc.splits.RandomSplitter(),
    'indice': dc.splits.IndiceSplitter(valid_indices=split_indices[1]),
    'stratified': dc.splits.SingletaskStratifiedSplitter(task_number=0)
}
splitter = splitters[split]
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset)
transformers = [
    dc.trans.NormalizationTransformer(
        transform_y=True, dataset=train_dataset)
]
for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
    valid_dataset = transformer.transform(valid_dataset)
    test_dataset = transformer.transform(test_dataset)
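# Because the normalization statistics are fit on train_dataset, model
# predictions come back in normalized units; untransform recovers the
# original scale. A self-contained round-trip check:
ds_nrm = dc.data.NumpyDataset(np.random.rand(50, 4), np.random.rand(50, 1))
norm = dc.trans.NormalizationTransformer(transform_y=True, dataset=ds_nrm)
ds_out = norm.transform(ds_nrm)
assert np.allclose(norm.untransform(ds_out.y), ds_nrm.y)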
# Transform clintox dataset
if split is None:
    transformers = [
        deepchem.trans.BalancingTransformer(transform_w=True, dataset=dataset)
    ]
    logger.info("Split is None, about to transform data.")
    for transformer in transformers:
        dataset = transformer.transform(dataset)
    return clintox_tasks, (dataset, None, None), transformers
splitters = {
    'index': deepchem.splits.IndexSplitter(),
    'random': deepchem.splits.RandomSplitter(),
    'scaffold': deepchem.splits.ScaffoldSplitter()
}
splitter = splitters[split]
logger.info("About to split data with {} splitter.".format(split))
train, valid, test = splitter.train_valid_test_split(dataset)
transformers = [
    deepchem.trans.BalancingTransformer(transform_w=True, dataset=train)
]
logger.info("About to transform data.")
for transformer in transformers:
    train = transformer.transform(train)
    valid = transformer.transform(valid)
    test = transformer.transform(test)
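# ScaffoldSplitter (used in the splitters dicts above) groups molecules by
# Bemis-Murcko scaffold, so structurally similar compounds land in the same
# fold. It reads SMILES from the dataset's ids field and requires RDKit; a
# minimal sketch with hypothetical molecules:
smiles = ["CCO", "CCCO", "c1ccccc1", "c1ccccc1O", "Cc1ccccc1", "CCN"]
ds_scaf = dc.data.NumpyDataset(np.zeros((len(smiles), 1)), ids=smiles)
tr, va, te = dc.splits.ScaffoldSplitter().train_valid_test_split(ds_scaf)
print(list(tr.ids), list(va.ids), list(te.ids))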
if split is None:
    transformers = [
        deepchem.trans.NormalizationTransformer(
            transform_y=True, dataset=dataset)
    ]
    logger.info("Split is None, about to transform data.")
    for transformer in transformers:
        dataset = transformer.transform(dataset)
    return chembl_tasks, (dataset, None, None), transformers
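# Note: with no split, the normalization statistics above are fit on the
# entire dataset; the split branches below fit transformers on the training
# subset only, so statistics never leak from validation or test data.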
if split != "year":
splitters = {
'index': deepchem.splits.IndexSplitter(),
'random': deepchem.splits.RandomSplitter(),
'scaffold': deepchem.splits.ScaffoldSplitter()
}
splitter = splitters[split]
logger.info("Performing new split.")
train, valid, test = splitter.train_valid_test_split(dataset)
transformers = [
deepchem.trans.NormalizationTransformer(transform_y=True, dataset=train)
]
logger.info("About to transform data.")
for transformer in transformers:
train = transformer.transform(train)
valid = transformer.transform(valid)
test = transformer.transform(test)
if featurizer == 'weave':  # guard reconstructed; the original snippet began mid-chain
    featurizer = deepchem.feat.WeaveFeaturizer()
elif featurizer == "smiles2img":
    img_spec = kwargs.get("img_spec", "std")
    img_size = kwargs.get("img_size", 80)
    featurizer = deepchem.feat.SmilesToImage(
        img_size=img_size, img_spec=img_spec)
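# Hedged sketch of the image featurizer configured above (output shape is an
# assumption based on img_size=80 and a single image channel):
# feat = deepchem.feat.SmilesToImage(img_size=80, img_spec="std")
# imgs = feat.featurize(["CCO", "c1ccccc1"])  # e.g. shape (2, 80, 80, 1)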
loader = deepchem.data.CSVLoader(
    tasks=qm8_tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file)
if split is None:
    raise ValueError("A splitter is required for this loader.")
splitters = {
    'index': deepchem.splits.IndexSplitter(),
    'random': deepchem.splits.RandomSplitter(),
    'stratified': deepchem.splits.SingletaskStratifiedSplitter(task_number=0)
}
splitter = splitters[split]
frac_train = kwargs.get("frac_train", 0.8)
frac_valid = kwargs.get("frac_valid", 0.1)
frac_test = kwargs.get("frac_test", 0.1)
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    dataset,
    frac_train=frac_train,
    frac_valid=frac_valid,
    frac_test=frac_test)
transformers = [
    deepchem.trans.NormalizationTransformer(
        transform_y=True, dataset=train_dataset, move_mean=move_mean)
]
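# Note: move_mean toggles whether NormalizationTransformer also subtracts the
# label mean; with move_mean=False only the standard-deviation scaling is
# applied. Statistics are again fit on train_dataset alone.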
train_dir = os.path.join(base_dir, "train")
test_dir = os.path.join(base_dir, "test")
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
os.makedirs(base_dir)
max_num_atoms = 23
featurizer = dc.feat.CoulombMatrixEig(max_num_atoms)
input_file = "gdb7.sdf"
tasks = ["u0_atom"]
smiles_field = "smiles"
mol_field = "mol"
loader = dc.data.SDFLoader(
    tasks, smiles_field=smiles_field, mol_field=mol_field,
    featurizer=featurizer)
dataset = loader.featurize(input_file, data_dir)
random_splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = random_splitter.train_test_split(
    dataset, train_dir, test_dir, frac_train=0.8)
#transformers = [dc.trans.NormalizationTransformer(transform_X=True, dataset=train_dataset), dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
transformers = [dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
for transformer in transformers:
    train_dataset = transformer.transform(train_dataset)
for transformer in transformers:
    test_dataset = transformer.transform(test_dataset)
regression_metric = dc.metrics.Metric(dc.metrics.mean_absolute_error, mode="regression")
def model_builder(model_dir):
    sklearn_model = KernelRidge(kernel="rbf", alpha=5e-4, gamma=0.008)
    return dc.models.SklearnModel(sklearn_model, model_dir)

model = dc.models.SingletaskToMultitask(tasks, model_builder, model_dir)
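# A hedged usage sketch for the pipeline above: fit the per-task kernel ridge
# models, then evaluate MAE, passing `transformers` so predictions are
# un-normalized before scoring (names reuse the snippet's own variables):
model.fit(train_dataset)
train_scores = model.evaluate(train_dataset, [regression_metric], transformers)
test_scores = model.evaluate(test_dataset, [regression_metric], transformers)
print("Train MAE:", train_scores)
print("Test MAE:", test_scores)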
MUV_tasks = sorted([
    # ... leading task names truncated in the original snippet ...
    'MUV-466', 'MUV-832'])
loader = dc.load.DataLoader(
    tasks=MUV_tasks, smiles_field="smiles",
    featurizer=featurizer_func, verbosity="high")
dataset = loader.featurize(dataset_file)
# Initialize transformers
transformers = [
    dc.trans.BalancingTransformer(transform_w=True, dataset=dataset)
]
print("About to transform data")
for transformer in transformers:
    dataset = transformer.transform(dataset)
splitters = {
    'index': dc.splits.IndexSplitter(),
    'random': dc.splits.RandomSplitter(),
    'scaffold': dc.splits.ScaffoldSplitter()
}
splitter = splitters[split]
train, valid, test = splitter.train_valid_test_split(
    dataset, compute_feature_statistics=False)
return MUV_tasks, (train, valid, test), transformers
print("About to transform data")
if split == "year":
transformers = [
dc.trans.NormalizationTransformer(transform_y=True, dataset=train_dataset)]
for transformer in transformers:
train = transformer.transform(train_dataset)
valid = transformer.transform(valid_dataset)
test = transformer.transform(test_dataset)
else:
transformers = [
dc.trans.NormalizationTransformer(transform_y=True, dataset=dataset)]
for transformer in transformers:
dataset = transformer.transform(dataset)
splitters = {
    'index': dc.splits.IndexSplitter(),
    'random': dc.splits.RandomSplitter(),
    'scaffold': dc.splits.ScaffoldSplitter()
}
if split in splitters:
    splitter = splitters[split]
    print("Performing new split.")
    train, valid, test = splitter.train_valid_test_split(dataset)
return chembl_tasks, (train, valid, test), transformers