Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
raise ValueError("{} not supported yet!"
"".format(learner_name))
pipe_config = {
"learner": learner,
"reducer": FeatureReducer(**reducer_kwargs),
"cleaner": DataCleaner(**cleaner_kwargs),
"autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs)}
logger = initialize_logger(AMM_LOGGER_BASENAME, log_dir=base_save_dir)
pipe = MatPipe(**pipe_config, logger=logger)
# Set up dataset
# Dataset should already be set up correctly as json beforehand.
# this includes targets being converted to classification, removing
# extra columns, having the names of featurization cols set to the
# same as the matpipe config, etc.
df = load_dataframe_from_json(data_file)
pipe.fit(df, target)
pipe.save(os.path.join(base_save_dir, "pipe.p"))
else:
raise ValueError("{} not supported yet!" "".format(learner_name))
pipe_config = {
"learner": learner,
"reducer": FeatureReducer(**reducer_kwargs),
"cleaner": DataCleaner(**cleaner_kwargs),
"autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
}
pipe = MatPipe(**pipe_config)
# Set up dataset
# Dataset should already be set up correctly as json beforehand.
# this includes targets being converted to classification, removing
# extra columns, having the names of featurization cols set to the
# same as the matpipe config, etc.
df = load_dataframe_from_json(data_file)
pipe.fit(df, target)
pipe.save(os.path.join(base_save_dir, "pipe.p"))
autofeaturizer_kwargs["cache_src"] = os.path.join(base_save_dir, "features.json")
pipe_config = {
"learner": learner,
"reducer": FeatureReducer(**reducer_kwargs),
"cleaner": DataCleaner(**cleaner_kwargs),
"autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs)}
logger = initialize_logger(AMM_LOGGER_BASENAME, log_dir=save_dir)
pipe = MatPipe(**pipe_config, logger=logger)
# Set up dataset
# Dataset should already be set up correctly as json beforehand.
# this includes targets being converted to classification, removing
# extra columns, having the names of featurization cols set to the
# same as the matpipe config, etc.
df = load_dataframe_from_json(data_file)
# Validate the scoring-related parameters up front. These would otherwise
# only be checked after benchmarking, so failing early here avoids losing a
# completed run to an error during scoring.
if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
    raise ValueError("Problem must be either classification or "
                     "regression.")
elif problem_type == AMM_CLF_NAME:
    if not isinstance(clf_pos_label, (str, bool)):
        raise TypeError("The classification positive label should be a "
                        "string, or bool not {}."
                        "".format(type(clf_pos_label)))
    # NOTE: `label in series` tests membership in the Series *index*, not in
    # its values, so the target column's values are checked explicitly.
    elif not df[target].isin([clf_pos_label]).any():
        raise ValueError("The classification positive label should be "
                         "present in the target column.")
    elif len(df[target].unique()) > 2:
        raise ValueError("Only binary classification scoring available"
                         )
pipe_config = {
"learner": learner,
"reducer": FeatureReducer(**reducer_kwargs),
"cleaner": DataCleaner(**cleaner_kwargs),
"autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
}
pipe = MatPipe(**pipe_config)
# Set up dataset
# Dataset should already be set up correctly as json beforehand.
# this includes targets being converted to classification, removing
# extra columns, having the names of featurization cols set to the
# same as the matpipe config, etc.
df = load_dataframe_from_json(data_file)
# Check other parameters that would otherwise not be checked until after
# benchmarking, hopefully saves some errors at the end during scoring.
if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
raise ValueError("Problem must be either classification or " "regression.")
elif problem_type == AMM_CLF_NAME:
if not isinstance(clf_pos_label, (str, bool)):
raise TypeError(
"The classification positive label should be a "
"string, or bool not {}."
"".format(type(clf_pos_label))
)
elif clf_pos_label not in df[target]:
raise ValueError(
"The classification positive label should be"
"present in the target column."
"""
Decorate a dataframe containing composition, structure, bandstructure,
and/or DOS objects with descriptors.
Args:
df (pandas.DataFrame): The dataframe not containing features.
target (str): The ML-target property contained in the df.
Returns:
df (pandas.DataFrame): Transformed dataframe containing features.
"""
if self.cache_src and os.path.exists(self.cache_src):
logger.debug(
self._log_prefix + "Reading cache_src {}".format(self.cache_src)
)
cached_df = load_dataframe_from_json(self.cache_src)
if not all([loc in cached_df.index for loc in df.index]):
raise AutomatminerError(
"Feature cache does not contain all "
"entries (by DataFrame index) needed "
"to transform the input df."
)
else:
cached_subdf = cached_df.loc[df.index]
if target in cached_subdf.columns:
if target not in df.columns:
logger.warn(
self._log_prefix
+ "Target not present in both cached df and input df."
" Cannot perform comparison to ensure index match."
)
else:
hole mass_z (target): Effective hole mass in z direction (BoltzTraP)
epsilon_x opt (target): Static dielectric function in x direction
calculated with OptB88vDW functional.
epsilon_y opt (target): Static dielectric function in y direction
calculated with OptB88vDW functional.
epsilon_z opt (target): Static dielectric function in z direction
calculated with OptB88vDW functional.
epsilon_x tbmbj (target): Static dielectric function in x direction
calculated with TBMBJ functional.
epsilon_y tbmbj (target): Static dielectric function in y direction
calculated with TBMBJ functional.
epsilon_z tbmbj (target): Static dielectric function in z direction
calculated with TBMBJ functional.
"""
df = load_dataframe_from_json(os.path.join(data_dir, 'jdft_3d.json'))
colmap = {"el_mass_x": "e mass_x",
"el_mass_y": "e mass_y",
"el_mass_z": "e mass_z",
"epsx": "epsilon_x opt",
"epsy": "epsilon_y opt",
"epsz": "epsilon_z opt",
"exfoliation_en": "e_exfol",
"form_enp": "e_form",
"gv": "shear modulus",
"hl_mass_x": "hole mass_x",
"hl_mass_y": "hole mass_y",
"hl_mass_z": "hole mass_z",
"kv": "bulk modulus",
"magmom": "mu_b",
"mbj_gap": "gap tbmbj",
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from automatminer.utils.ml import regression_or_classification
from automatminer.utils.ml import AMM_CLF_NAME, AMM_REG_NAME
from automatminer_dev.config import BENCHMARK_FULL_SET, GLASS, EXPT_IS_METAL, EXPT_GAP
from matminer.utils.io import load_dataframe_from_json
benchmark_dir = os.environ["AMM_DATASET_DIR"]
bmarks = BENCHMARK_FULL_SET
bmarks = [GLASS, EXPT_GAP, EXPT_IS_METAL]
for p in bmarks:
pname = p["name"]
print("Loading {}".format(pname))
df = load_dataframe_from_json(os.path.join(benchmark_dir, p["data_file"]))
target = p["target"]
ltype = p["problem_type"]
if ltype == AMM_REG_NAME:
kf = KFold(n_splits=5, random_state=18012019, shuffle=True)
estimator = DummyRegressor(strategy="mean")
scoring = "neg_mean_absolute_error"
multiplier = -1
elif ltype == AMM_CLF_NAME:
kf = StratifiedKFold(n_splits=5, random_state=18012019, shuffle=True)
estimator = DummyClassifier(strategy="stratified")
multiplier = 1
scoring = "roc_auc"
else:
raise ValueError("problem type {} is not known.".format(ltype))
cvs = cross_val_score(
def __init__(
    self,
    initial_ltol: float = 0.2,
    initial_stol: float = 0.3,
    initial_angle_tol: float = 5.0,
    use_fingerprint_matching: bool = True,
    fingerprint_distance_cutoff: float = 0.4,
):
    """Load the packaged mineral database and record the matching settings.

    Every argument is stored unchanged on the instance under an attribute
    of the same name; nothing is validated here.

    Args:
        initial_ltol: Stored as ``self.initial_ltol``. Presumably the
            starting fractional length tolerance for structure matching
            (TODO confirm against the matching code).
        initial_stol: Stored as ``self.initial_stol``. Presumably the
            starting site tolerance (TODO confirm).
        initial_angle_tol: Stored as ``self.initial_angle_tol``.
            Presumably the starting angle tolerance in degrees
            (TODO confirm).
        use_fingerprint_matching: Stored as
            ``self.use_fingerprint_matching``.
        fingerprint_distance_cutoff: Stored as
            ``self.fingerprint_distance_cutoff``.
    """
    # The database is read from the 'mineral_db.json.gz' resource of the
    # 'robocrys.condense' package each time an instance is created.
    self.mineral_db = load_dataframe_from_json(
        resource_filename('robocrys.condense', 'mineral_db.json.gz')
    )

    # Tolerances.
    self.initial_ltol = initial_ltol
    self.initial_stol = initial_stol
    self.initial_angle_tol = initial_angle_tol

    # Fingerprint matching options.
    self.fingerprint_distance_cutoff = fingerprint_distance_cutoff
    self.use_fingerprint_matching = use_fingerprint_matching

    # Private state starts empty.
    # NOTE(review): `_mineral_db` is distinct from the public `mineral_db`
    # loaded above; its use is not visible from here.
    self._structure = None
    self._mineral_db = None