Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
Low-Rank SVD via Fast Alternating Least Squares.
"""
_fancyimpute_options = {'KNN', 'BiScaler', 'NuclearNormMinimization', 'SoftImpute', 'IterativeSVD'}
if (not has_fancyimpute) and (method in _fancyimpute_options):
raise ValueError('You must install `fancyimpute` (pip install fancyimpute) to use this method')
_base_options = {'mean', 'median', 'constant'}
if (method not in _base_options) and (method not in _fancyimpute_options) and (not isinstance(method, (int,float))):
raise ValueError('method not understood.. Use `mean`, `median`, a scalar, or an option from `fancyimpute`')
X_incomplete = data.copy()
if method == 'KNN':
if value is None:
value = 3
X_filled = KNN(k=value, verbose=False).complete(X_incomplete)
elif method == 'BiScaler':
X_filled = BiScaler(verbose=False).fit_transform(X_incomplete)
elif method == 'SoftImpute':
X_filled = SoftImpute(verbose=False).complete(X_incomplete)
elif method == 'IterativeSVD':
if value is None:
rank = min(10, X_incomplete.shape[0]-2)
else:
rank = value
X_filled = IterativeSVD(rank=rank, verbose=False).complete(X_incomplete)
elif method == 'mean':
col_means = np.nanmean(X_incomplete, axis=0)
if (value == "zero"):
inputed_value = 0
elif (value == "mean"):
inputed_value = np.mean(data_drop)
elif (value == "max"):
inputed_value = np.max(data_drop)
elif (value == "min"):
inputed_value = np.min(data_drop)
elif (value == "new"):
inputed_value = 0 # 0 is the value that never happens in our categorical map
elif (value == "popular"):
inputed_value = popular_value(data_drop)
# special case: KNN imputes the data directly, so return right after imputation
elif (value == "knn"):
from fancyimpute import KNN
data_clean = KNN(k=5).complete(data)
return data_clean
else:
raise ValueError("no such impute strategy: {}".format(value))
if np.isnan(inputed_value):
inputed_value = 0
data_imputed[index] = inputed_value
if verbose: print("imputed missing value: {}".format(inputed_value))
return data_imputed
regularization_weight = 10.0 ** -negative_log_regularization_weight
table.add_entry(
solver=IterativeImputer(
n_nearest_features=80,
max_iter=50
),
name="IterativeImputer_%d" % negative_log_regularization_weight)
for fill_method in ["mean", "median"]:
table.add_entry(
solver=SimpleFill(fill_method=fill_method),
name="SimpleFill_%s" % fill_method)
for k in [1, 3, 7]:
table.add_entry(
solver=KNN(
k=k,
orientation="rows"),
name="KNN_k%d" % (k,))
for shrinkage_value in [25, 50, 100]:
# SoftImpute without rank constraints
table.add_entry(
solver=SoftImpute(
shrinkage_value=shrinkage_value),
name="SoftImpute_lambda%d" % (shrinkage_value,))
for rank in [10, 20, 40]:
table.add_entry(
solver=IterativeSVD(
rank=rank,
init_fill_method="zero"),
# only for numerical values
# Nearest-neighbor imputation, which weights samples using the
# mean squared difference on the features for which two rows
# both have observed data.
from fancyimpute import KNN
df = dataset
if dataset.select_dtypes(['number']).isnull().sum().sum() > 0:
X = dataset.select_dtypes(['number'])
for i in X.columns:
X[i] = KNN(k=k, verbose=False).fit_transform(X)
Z = dataset.select_dtypes(include=['object'])
df = pd.DataFrame.from_records(
X, columns=dataset.select_dtypes(['number']).columns)
df = df.join(Z)
else:
pass
return df
if (label_col_name==None or len(label_col_name)==0):
is_eval = False
else:
is_eval = True
missing_col_id = []
data, label = self.__df2np(data, label_col_name, missing_col_id)
# mask = np.isnan(data)
# imputation_list = ["mean"] * len(missing_col_id)
# data_mean = mvp.imputeData(data, missing_col_id, imputation_list, self.verbose)
# data_mean = scale(data_mean)
# data_mean[mask] = np.nan
# data_clean = KNN(k=5, normalizer=BiScaler).complete(data)
data_clean = KNN(k=5).complete(data)
#data_clean = MICE().complete(data)
if (is_eval): self.__evaluation(data_clean, label)
return data_clean
def __knn(self, test_data):
    """
    Wrap fancyimpute-knn.

    `test_data` is first converted with `mvp.df2np`, which is also handed
    an initially empty list. If that list is still empty after the
    conversion, the converted data is returned as is; otherwise the
    converted data is passed through `knn(...).complete(...)` and that
    result is returned.
    """
    # presumably filled by mvp.df2np with the columns that contain
    # missing values -- TODO confirm against mvp.df2np
    missing_columns = []
    converted = mvp.df2np(test_data, missing_columns, self._verbose)

    if not missing_columns:
        # list left empty by the conversion: skip the KNN step entirely
        return converted

    # the solver takes an integer verbosity flag rather than a bool
    solver = knn(k=self._k, verbose=1 if self._verbose else 0)
    return solver.complete(converted)
def impute_knn(X, mask, hyperparams=None):
    """
    Run `fancyimpute_hpo` with the `KNN` solver.

    Parameters
    ----------
    X, mask
        Forwarded unchanged to `fancyimpute_hpo`.
    hyperparams : dict, optional
        Forwarded to `fancyimpute_hpo`. When omitted (or ``None``), the
        default ``{'k': [2, 4, 6]}`` is used.

    Returns
    -------
    Whatever `fancyimpute_hpo` returns.
    """
    if hyperparams is None:
        # Build the default per call: a dict literal in the signature is
        # created once at definition time and shared by every call, so any
        # in-place change made downstream would leak into later calls.
        hyperparams = {'k': [2, 4, 6]}
    return fancyimpute_hpo(KNN, hyperparams, X, mask)