Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Returns:
df (pandas.DataFrame): DataFrame with featurizer_type column
ready for featurization.
"""
# todo: Make the following conversions more robust (no [0] type checking)
type_tester = df[featurizer_type].iloc[0]
if featurizer_type == self.composition_col:
# Convert formulas to composition objects
if isinstance(type_tester, str):
logger.info(
self._log_prefix
+ "Compositions detected as strings. Attempting "
"conversion to Composition objects..."
)
stc = StrToComposition(
overwrite_data=True, target_col_id=featurizer_type
)
stc.set_n_jobs(self.n_jobs)
df = stc.featurize_dataframe(
df,
featurizer_type,
multiindex=self.multiindex,
ignore_errors=True,
inplace=False,
)
elif isinstance(type_tester, dict):
logger.info(
self._log_prefix + "Compositions detected as dicts. Attempting "
"conversion to Composition objects..."
)
# Script fragment: load the "expt_gap" dataset, normalise its formulas, and
# start scanning each unique composition for disagreeing "gap expt" values.
# NOTE(review): this excerpt has lost its indentation and is cut off inside
# the loop below; the remainder of the script is not visible here.
df = load_dataset("expt_gap")
# Expose the dataset's "formula" column under the name "composition".
df = df.rename(columns={"formula": "composition"})
# print("Ground Truth")
# print(df[df["composition"] == "ZrW2"]) # should be 0.00
# print(df[df["composition"] == "ZrSe2"]) # should be 2.00
# raise ValueError
# Starts empty; nothing in the visible part of the script appends to it.
excluded_compositions = []
# Prevent differences in order of formula symbols from corrupting the actual number of unique compositions
# (each formula is parsed into a temporary "composition_obj" column, and
# "composition" is then overwritten with that object's reduced_formula).
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(
df, "composition"
)
df["composition"] = [c.reduced_formula for c in df["composition_obj"]]
# The temporary column is no longer needed once the formulas are rewritten.
df = df.drop(columns=["composition_obj"])
unique = df["composition"].unique()
print("Number of unique compositions:", len(unique))
# raise ValueError
# Accumulator with one list per output column ("composition", "gap expt").
new_df_dict = {"composition": [], "gap expt": []}
# Visit every distinct reduced formula (tqdm wraps the iterable).
for c in tqdm(unique):
# All rows of the dataset that share this reduced formula.
df_per_comp_gaps = df[df["composition"] == c]
per_comp_gaps = df_per_comp_gaps["gap expt"]
# Spread between the largest and smallest "gap expt" value for this formula.
measurement_range = max(per_comp_gaps) - min(per_comp_gaps)
# NOTE(review): the body of this branch is not visible in this excerpt;
# presumably compositions whose values disagree by more than 0.1 are handled
# specially (cf. excluded_compositions above) -- confirm against the full script.
if measurement_range > 0.1:
# print(df_per_comp_gaps)
# Script fragment: derive a boolean "is_metal" label from the "expt_gap"
# dataset and begin checking, per unique composition, whether the duplicate
# entries agree on that label.
# NOTE(review): this excerpt is cut off inside the loop below; the code that
# consumes any_metals / all_metals / is_metal is not visible here.

# Widen pandas' console output so printed frames are not truncated.
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

df = load_dataset("expt_gap")
# Expose the dataset's "formula" column under the name "composition".
df = df.rename(columns={"formula": "composition"})
print(df)

# A row is labelled metallic exactly when its "gap expt" value equals 0.
df["is_metal"] = df["gap expt"] == 0
df = df.drop(columns=["gap expt"])
# print("Ground truth")
# print(df[df["composition"]=="ZrSe3"]) # should be False in final dataframe also
# print(df[df["composition"]=="ZrW2"]) # should be True in final dataframe also
# print(df["is_metal"].value_counts()) # proportion is about 2500 metals to 4k nonmetals
# raise ValueError

# Rewrite every formula as its reduced formula so that differently ordered
# spellings of the same composition collapse to one unique string.
df = StrToComposition(target_col_id="composition_obj").featurize_dataframe(
    df, "composition"
)
df["composition"] = [c.reduced_formula for c in df["composition_obj"]]
df = df.drop(columns=["composition_obj"])

unique = df["composition"].unique()
print("Number of unique compositions:", len(unique))

# Starts empty; nothing in the visible part of the script appends to it.
problem_compositions = []
# Accumulator with one list per output column ("composition", "is_metal").
new_df_dict = {"composition": [], "is_metal": []}

for c in tqdm(unique):
    # All rows of the dataset that share this reduced formula.
    df_per_comp_is_metal = df[df["composition"] == c]
    per_comp_is_metal = df_per_comp_is_metal["is_metal"]
    any_metals = any(per_comp_is_metal)
    # Fix: this was a second any(...) call, which made all_metals always
    # equal to any_metals, so a composition with mixed metal / non-metal
    # entries could never be told apart from a unanimous one.
    all_metals = all(per_comp_is_metal)
    is_metal = None
from tqdm import tqdm
import pandas as pd
# pd.set_option('display.height', 1000)
# Script fragment: load the "glass_ternary_landolt" dataset, normalise its
# formulas to reduced formulas, and report row / unique-composition counts.
# NOTE(review): the script continues past this excerpt; df, unique and
# problem_compositions are kept under their original names for that reason.

# Let pandas print long and wide frames in full while inspecting the data.
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(option_name, option_value)

# Keep only the formula (renamed to "composition") and the "gfa" column.
df = load_dataset("glass_ternary_landolt").rename(
    columns={"formula": "composition"}
)[["composition", "gfa"]]

# Parse each formula into a temporary "composition_obj" column, overwrite
# "composition" with that object's reduced_formula, then drop the helper.
to_composition = StrToComposition(target_col_id="composition_obj")
df = to_composition.featurize_dataframe(df, "composition")
df["composition"] = [comp.reduced_formula for comp in df["composition_obj"]]
df = df.drop(columns=["composition_obj"])

# Ground truth noted by the original author for sanity-checking the result:
#   df[df["composition"]=="ZrTi9"]  -> gfa should be False in the final dataframe
#   df[df["composition"]=="ZrVCo8"] -> gfa should be True in the final dataframe
#   df["gfa"].value_counts()        -> about 5000 GFA vs 2054 no GFA

unique = df["composition"].unique()
# Row count first, then the number of distinct reduced formulas, one per line.
print(len(df), len(unique), sep="\n")

# Starts empty; nothing in the visible part of the script appends to it.
problem_compositions = []