Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_iterative_svd_with_low_rank_random_matrix():
    """Check that rank-3 IterativeSVD keeps the MAE on missing entries below 0.1.

    Reads ``XY``, ``XY_incomplete`` and ``missing_mask`` from the enclosing
    scope; they are not defined inside this test.
    """
    completed = IterativeSVD(rank=3).fit_transform(XY_incomplete)
    # Only the second value returned by reconstruction_error is asserted on.
    _, mae_on_missing = reconstruction_error(
        XY,
        completed,
        missing_mask,
        name="IterativeSVD",
    )
    assert mae_on_missing < 0.1, "Error too high!"
# COLUMNS = ["worker_id", "score", "accuracy", "attempted", "correct", "boomerang"]
# Work on a deep copy so that adding the 'score' column and sorting below
# never touches `pivoted`.
data = pivoted.copy(deep=True)
# Drop the first column (worker_id) by position.  `.iloc` replaces the `.ix`
# indexer, which was deprecated in pandas 0.20 and removed in pandas 1.0.
matrix = data.iloc[:, 1:]  # without worker_id
# data['accuracy'] = matrix.mean(axis=ROW_WISE) * 100
# data['attempted'] = matrix.count(axis=ROW_WISE)
# data['correct'] = matrix.sum(axis=ROW_WISE)
# data = data[data["attempted"]>=MIN_TASKS]
# turn incorrect to -1 as imputations will fill with 0
# matrix[matrix <= 0] = -1

# Impute the missing cells with IterativeSVD; if that raises for any reason,
# fall back to SoftImpute with the same settings (deliberate best-effort).
try:
    mat = IterativeSVD(verbose=False, init_fill_method="mean").complete(matrix)
except Exception:
    mat = SoftImpute(verbose=False, init_fill_method="mean").complete(matrix)

# A worker's score is the mean of the completed matrix along ROW_WISE.
data['score'] = mat.mean(axis=ROW_WISE)
data = data.sort_values(by=['score'], ascending=[False])
percentile = data['score'].quantile(settings.WORKER_SPLIT_PERCENTILE)

# Top 25% = 3-2 and Bottom 75% = 2-1
num_workers = len(data)
num_workers_top_x = len(data[data['score'] >= percentile])
# `data` is sorted by descending score, so head() yields the top scorers.
top_x = data.head(num_workers_top_x)
# add extra worker at inflexion point from top set as it will have 2.0 duplicated
bottom_y = data.tail(num_workers - num_workers_top_x + 1)
del transposed_matrix_with_zeros

# Store the absolute/percentages of imputed values
null_flags = transposed_matrix.isnull()
null_counts = null_flags.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_flags.count()).sort_values(ascending=False)
# The helpers above are only needed for the two statistics; release them.
del null_flags, null_counts

total_percent_imputed = sum(percent) / len(transposed_matrix.count())
job_context['total_percent_imputed'] = total_percent_imputed
logger.info("Total percentage of data to impute!", total_percent_imputed=total_percent_imputed)

# Perform imputation of missing values with IterativeSVD (rank=10) on the
# transposed_matrix; the result is imputed_matrix.  The dataset's
# svd_algorithm value 'NONE' skips the imputation and passes the matrix through.
svd_algorithm = job_context['dataset'].svd_algorithm
if svd_algorithm == 'NONE':
    imputed_matrix = transposed_matrix
    logger.info("Skipping IterativeSVD")
else:
    svd_start = time.time()
    logger.info("IterativeSVD algorithm: %s" % svd_algorithm)
    # The solver is handed the lower-cased algorithm name.
    svd_algorithm = str.lower(svd_algorithm)
    imputed_matrix = IterativeSVD(
        rank=10,
        svd_algorithm=svd_algorithm,
    ).fit_transform(transposed_matrix)
del transposed_matrix

# Untranspose imputed_matrix (genes are now rows, samples are now columns)
untransposed_imputed_matrix = imputed_matrix.transpose()
del imputed_matrix

# Convert back to Pandas and restore the saved index / column labels.
untransposed_imputed_matrix_df = pd.DataFrame.from_records(untransposed_imputed_matrix)
untransposed_imputed_matrix_df.index = row_col_filtered_combined_matrix_samples_index
untransposed_imputed_matrix_df.columns = row_col_filtered_combined_matrix_samples_columns
del (
    untransposed_imputed_matrix,
    row_col_filtered_combined_matrix_samples_index,
    row_col_filtered_combined_matrix_samples_columns,
)
if value is None:
value = 3
X_filled = KNN(k=value, verbose=False).complete(X_incomplete)
elif method == 'BiScaler':
X_filled = BiScaler(verbose=False).fit_transform(X_incomplete)
elif method == 'SoftImpute':
X_filled = SoftImpute(verbose=False).complete(X_incomplete)
elif method == 'IterativeSVD':
if value is None:
rank = min(10, X_incomplete.shape[0]-2)
else:
rank = value
X_filled = IterativeSVD(rank=rank, verbose=False).complete(X_incomplete)
elif method == 'mean':
col_means = np.nanmean(X_incomplete, axis=0)
for i in range(X_incomplete.shape[1]):
X_incomplete[:,i][np.isnan(X_incomplete[:,i])] = col_means[i]
X_filled = X_incomplete
elif method == 'median':
col_means = np.nanmean(X_incomplete, axis=0)
for i in range(X_incomplete.shape[1]):
X_incomplete[:,i][np.isnan(X_incomplete[:,i])] = col_means[i]
X_filled = X_incomplete
elif method == 'constant':
if value is None:
raise ValueError('Must give `value` argument if method == constant')
# NOTE(review): `k` is not bound anywhere in this excerpt — presumably this
# first registration sits inside a loop over neighbour counts; confirm
# against the full file.
knn_solver = KNN(k=k, orientation="rows")
knn_label = "KNN_k%d" % (k,)
table.add_entry(solver=knn_solver, name=knn_label)

for shrinkage_value in [25, 50, 100]:
    # SoftImpute without rank constraints
    soft_impute_solver = SoftImpute(shrinkage_value=shrinkage_value)
    soft_impute_label = "SoftImpute_lambda%d" % (shrinkage_value,)
    table.add_entry(solver=soft_impute_solver, name=soft_impute_label)

for rank in [10, 20, 40]:
    # IterativeSVD at several ranks, with missing entries initially zero-filled.
    svd_solver = IterativeSVD(rank=rank, init_fill_method="zero")
    svd_label = "IterativeSVD_rank%d" % (rank,)
    table.add_entry(solver=svd_solver, name=svd_label)

# Once every solver is registered, write the HTML table and print the errors.
table.save_html_table()
table.print_sorted_errors()