Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
not_in_model))
else:
X = df[self._features].values # rectify feature order
y_pred = self._backend.predict(X)
df[target + " predicted"] = y_pred
self._logger.debug("Prediction finished successfully.")
return df
if __name__ == "__main__":
    # Demo script: featurize, clean and reduce a slice of a matminer dataset,
    # then fit a TPOTAdaptor learner on the resulting frame.
    from matminer.datasets.dataset_retrieval import load_dataset
    from automatminer.featurization import AutoFeaturizer
    from automatminer.preprocessing import DataCleaner, FeatureReducer

    # Load a dataset and keep only the composition and target columns
    df = load_dataset("elastic_tensor_2015")
    df = df.rename(columns={"formula": "composition"})
    df = df[["composition", "K_VRH"]]

    # Rows 501-549 are set aside as testdf; the first 100 rows are used
    # for fitting.
    testdf = df.iloc[501:550]
    traindf = df.iloc[:100]
    target = "K_VRH"

    # Get top-level transformers
    autofeater = AutoFeaturizer()
    cleaner = DataCleaner()
    reducer = FeatureReducer()
    learner = TPOTAdaptor("regression", max_time_mins=5)

    # Fit transformers on training data; each stage consumes the frame
    # produced by the previous one.
    for stage in (autofeater, cleaner, reducer):
        traindf = stage.fit_transform(traindf, target)
    learner.fit(traindf, target)
data.columns.tolist())
# Get only the minimum energy structure at each composition
data['composition'] = data['structure'].apply(lambda structure: structure.composition)
data['integer_formula'] = data['composition'].apply(
    lambda comp: comp.get_integer_formula_and_factor()[0]
)
# Sort ascending by e_above_hull so that keep='first' retains the
# lowest-e_above_hull row for every integer formula.
data.sort_values(by='e_above_hull', inplace=True, ascending=True)
data.drop_duplicates(subset='integer_formula', inplace=True, keep='first')
print('Reduced dataset to {} unique compositions.'.format(len(data)))
data.reset_index(drop=True, inplace=True)

# Create the featurizer, which will take the composition as input
featurizer = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
    cf.ValenceOrbital(props=['frac']),
    cf.IonProperty(fast=True),
])

# Compute the features
featurizer.set_n_jobs(1)
X = featurizer.featurize_many(data['composition'])

# Make the model: an imputation step followed by a random forest regressor.
# NOTE(review): if `Imputer` is sklearn.preprocessing.Imputer, that class was
# removed in scikit-learn 0.22 (replacement: sklearn.impute.SimpleImputer) --
# confirm against this file's imports and the pinned scikit-learn version.
model = Pipeline(steps=[
    ('imputer', Imputer()),
    ('model', RandomForestRegressor()),
])
model.fit(X, data['formation_energy_per_atom'])
print('Trained a RandomForest model')
raise ValueError("{} not supported yet!"
"".format(learner_name))
pipe_config = {
"learner": learner,
"reducer": FeatureReducer(**reducer_kwargs),
"cleaner": DataCleaner(**cleaner_kwargs),
"autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs)}
logger = initialize_logger(AMM_LOGGER_BASENAME, log_dir=base_save_dir)
pipe = MatPipe(**pipe_config, logger=logger)
# Set up dataset
# Dataset should already be set up correctly as JSON beforehand.
# This includes targets being converted to classification, removing
# extra columns, having the names of featurization cols set to the
# same as the matpipe config, etc.
df = load_dataframe_from_json(data_file)
pipe.fit(df, target)
pipe.save(os.path.join(base_save_dir, "pipe.p"))
else:
raise ValueError("{} not supported yet!" "".format(learner_name))
pipe_config = {
"learner": learner,
"reducer": FeatureReducer(**reducer_kwargs),
"cleaner": DataCleaner(**cleaner_kwargs),
"autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
}
pipe = MatPipe(**pipe_config)
# Set up dataset
# Dataset should already be set up correctly as JSON beforehand.
# This includes targets being converted to classification, removing
# extra columns, having the names of featurization cols set to the
# same as the matpipe config, etc.
df = load_dataframe_from_json(data_file)
pipe.fit(df, target)
pipe.save(os.path.join(base_save_dir, "pipe.p"))
def need_fit(self):
    """Return this group's structure featurizers via ``_get_featurizers``.

    Instantiates ``PartialRadialDistributionFunction``, ``BondFractions``
    and two ``BagofBonds`` featurizers (one per Coulomb-matrix variant),
    then hands the list to ``self._get_featurizers`` and returns its result.
    """
    candidates = [
        sf.PartialRadialDistributionFunction(),
        sf.BondFractions(),
    ]
    # One BagofBonds per matrix flavour; each matrix is instantiated right
    # before the BagofBonds that wraps it.
    for matrix_cls in (sf.CoulombMatrix, sf.SineCoulombMatrix):
        candidates.append(sf.BagofBonds(coulomb_matrix=matrix_cls()))
    return self._get_featurizers(candidates)
# Get only the minimum energy structure at each composition
data['composition'] = data['structure'].apply(lambda structure: structure.composition)
data['integer_formula'] = data['composition'].apply(
    lambda comp: comp.get_integer_formula_and_factor()[0]
)
# Sort ascending by e_above_hull so that keep='first' retains the
# lowest-e_above_hull row for every integer formula.
data.sort_values(by='e_above_hull', inplace=True, ascending=True)
data.drop_duplicates(subset='integer_formula', inplace=True, keep='first')
print('Reduced dataset to {} unique compositions.'.format(len(data)))
data.reset_index(drop=True, inplace=True)

# Create the featurizer, which will take the composition as input
featurizer = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
    cf.ValenceOrbital(props=['frac']),
    cf.IonProperty(fast=True),
])

# Compute the features
featurizer.set_n_jobs(1)
X = featurizer.featurize_many(data['composition'])

# Make the model: an imputation step followed by a random forest regressor.
# NOTE(review): if `Imputer` is sklearn.preprocessing.Imputer, that class was
# removed in scikit-learn 0.22 (replacement: sklearn.impute.SimpleImputer) --
# confirm against this file's imports and the pinned scikit-learn version.
model = Pipeline(steps=[
    ('imputer', Imputer()),
    ('model', RandomForestRegressor()),
])
model.fit(X, data['formation_energy_per_atom'])
print('Trained a RandomForest model')
# Save the model, featurizer, and data using pickle
with open('model.pkl', 'wb') as fp:
def need_fit(self):
    """Return this group's structure featurizers via ``_get_featurizers``.

    Instantiates ``PartialRadialDistributionFunction``, ``BondFractions``
    and two ``BagofBonds`` featurizers (one per Coulomb-matrix variant),
    then hands the list to ``self._get_featurizers`` and returns its result.
    """
    candidates = [
        sf.PartialRadialDistributionFunction(),
        sf.BondFractions(),
    ]
    # One BagofBonds per matrix flavour; each matrix is instantiated right
    # before the BagofBonds that wraps it.
    for matrix_cls in (sf.CoulombMatrix, sf.SineCoulombMatrix):
        candidates.append(sf.BagofBonds(coulomb_matrix=matrix_cls()))
    return self._get_featurizers(candidates)
df = pd.DataFrame(pickle.load(f))[["structure", prop_col]].dropna()
idx_list = list(range(len(df)))
kf = KFold(n_splits=5, random_state=18012019, shuffle=True)
for kf_idx, (remain_index, test_index) in enumerate(kf.split(idx_list)):
if kf_idx in kf_indices:
kf_tmp_output_path = os.path.join(
tmp_output_path, "kfold_{}".format(kf_idx)
)
if not os.path.exists(kf_tmp_output_path):
os.makedirs(kf_tmp_output_path, exist_ok=True)
train_index, val_index = train_test_split(
remain_index, test_size=0.25, random_state=18012019, shuffle=True
)
cgcnnfz = CGCNNFeaturizer(
task=args.task,
distributed=distributed,
n_works=args.n_works,
disable_cuda=disable_cuda,
save_idx=kf_tmp_output_path,
output_path=kf_tmp_output_path,
atom_init_fea=atom_features,
use_batch=False,
test=args.test,
dropout_percent=0.5,
batch_size=args.batch_size,
warm_start_file=args.warm_start,
warm_start_latest=True,
use_pretrained=False,
save_model_to_dir=os.path.join(kf_tmp_output_path, "model"),
save_checkpoint_to_dir=os.path.join(kf_tmp_output_path, "checkpoint"),
# Report the size and column names of the loaded frame
print(
    'Loaded {} rows with {} columns:'.format(len(data), len(data.columns)),
    data.columns.tolist(),
)

# Get only the minimum energy structure at each composition
data['composition'] = data['structure'].apply(lambda structure: structure.composition)
data['integer_formula'] = data['composition'].apply(
    lambda comp: comp.get_integer_formula_and_factor()[0]
)
# Sort ascending by e_above_hull so that keep='first' retains the
# lowest-e_above_hull row for every integer formula.
data.sort_values(by='e_above_hull', inplace=True, ascending=True)
data.drop_duplicates(subset='integer_formula', inplace=True, keep='first')
print('Reduced dataset to {} unique compositions.'.format(len(data)))
data.reset_index(drop=True, inplace=True)

# Create the featurizer, which will take the composition as input
featurizer = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
    cf.ValenceOrbital(props=['frac']),
    cf.IonProperty(fast=True),
])

# Compute the features
featurizer.set_n_jobs(1)
X = featurizer.featurize_many(data['composition'])

# Make the model: an imputation step followed by a random forest regressor.
# NOTE(review): if `Imputer` is sklearn.preprocessing.Imputer, that class was
# removed in scikit-learn 0.22 (replacement: sklearn.impute.SimpleImputer) --
# confirm against this file's imports and the pinned scikit-learn version.
model = Pipeline(steps=[
    ('imputer', Imputer()),
    ('model', RandomForestRegressor()),
])
model.fit(X, data['formation_energy_per_atom'])
print('Trained a RandomForest model')
# Get only the minimum energy structure at each composition
data['composition'] = data['structure'].apply(lambda structure: structure.composition)
data['integer_formula'] = data['composition'].apply(
    lambda comp: comp.get_integer_formula_and_factor()[0]
)
# Sort ascending by e_above_hull so that keep='first' retains the
# lowest-e_above_hull row for every integer formula.
data.sort_values(by='e_above_hull', inplace=True, ascending=True)
data.drop_duplicates(subset='integer_formula', inplace=True, keep='first')
print('Reduced dataset to {} unique compositions.'.format(len(data)))
data.reset_index(drop=True, inplace=True)

# Create the featurizer, which will take the composition as input
featurizer = MultipleFeaturizer([
    cf.Stoichiometry(),
    cf.ElementProperty.from_preset('magpie'),
    cf.ValenceOrbital(props=['frac']),
    cf.IonProperty(fast=True),
])

# Compute the features
featurizer.set_n_jobs(1)
X = featurizer.featurize_many(data['composition'])

# Make the model: an imputation step followed by a random forest regressor.
# NOTE(review): if `Imputer` is sklearn.preprocessing.Imputer, that class was
# removed in scikit-learn 0.22 (replacement: sklearn.impute.SimpleImputer) --
# confirm against this file's imports and the pinned scikit-learn version.
model = Pipeline(steps=[
    ('imputer', Imputer()),
    ('model', RandomForestRegressor()),
])
model.fit(X, data['formation_energy_per_atom'])
print('Trained a RandomForest model')
# Save the model, featurizer, and data using pickle