Returns
-------
labels : array, shape [n_samples,]
Index of the cluster each sample belongs to.
"""
assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."
if self.verbose and self.cat_dissim == ng_dissim:
print("Ng's dissimilarity measure was used to train this model, "
"but now that it is predicting the model will fall back to "
"using simple matching dissimilarity.")
X = pandas_to_numpy(X)
X = check_array(X, dtype=None)
X, _ = encode_features(X, enc_map=self._enc_map)
return _labels_cost(X, self._enc_cluster_centroids, self.cat_dissim)[0]
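
# Example (illustration only, not from the library source): a minimal, hedged
# sketch of fitting KModes and then calling predict() on unseen categorical
# rows, assuming the usual `from kmodes.kmodes import KModes` import. The toy
# data and parameter values below are arbitrary.
import numpy as np
from kmodes.kmodes import KModes

toy = np.array([['a', 'x'], ['a', 'y'], ['b', 'x'], ['b', 'y']])
km = KModes(n_clusters=2, init='Huang', n_init=5, verbose=0)
km.fit(toy)
# predict() encodes the new rows with the mapping learned during fit() and
# returns the index of the closest cluster mode for each row.
print(km.predict(np.array([['a', 'x'], ['b', 'y']])))
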
def k_modes(X, n_clusters, max_iter, dissim, init, n_init, verbose, random_state, n_jobs):
"""k-modes algorithm"""
random_state = check_random_state(random_state)
if sparse.issparse(X):
raise TypeError("k-modes does not support sparse data.")
X = check_array(X, dtype=None)
# Convert the categorical values in X to integers for speed.
# Based on the unique values in X, we can make a mapping to achieve this.
X, enc_map = encode_features(X)
n_points, n_attrs = X.shape
assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
"than data points ({}).".format(n_clusters, n_points)
# Are there more n_clusters than unique rows? Then set the unique
# rows as initial values and skip iteration.
unique = get_unique_rows(X)
n_unique = unique.shape[0]
if n_unique <= n_clusters:
max_iter = 0
n_init = 1
n_clusters = n_unique
init = unique
results = []
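
# Illustration only: a rough sketch of the integer-encoding idea that
# encode_features() applies in k_modes above (map each unique category in a
# column to a small integer, and keep the mapping so new data can be encoded
# the same way). This is not the library's implementation, just the concept.
import numpy as np

def toy_encode(X):
    X = np.asarray(X, dtype=object)
    encoded = np.empty(X.shape, dtype=int)
    enc_map = []
    for col in range(X.shape[1]):
        # Build a {category: integer} mapping from the unique values in this column.
        mapping = {val: ii for ii, val in enumerate(np.unique(X[:, col]))}
        enc_map.append(mapping)
        encoded[:, col] = [mapping[val] for val in X[:, col]]
    return encoded, enc_map
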
# k_prototypes (fragment): argument checks, splitting and encoding of the data,
# gamma estimation, and one clustering run per initialization.
assert len(categorical) != X.shape[1], \
    "All columns are categorical, use k-modes instead of k-prototypes."
assert max(categorical) < X.shape[1], \
    "Categorical index larger than number of columns."
ncatattrs = len(categorical)
nnumattrs = X.shape[1] - ncatattrs
n_points = X.shape[0]
assert n_clusters <= n_points, "Cannot have more clusters ({}) " \
    "than data points ({}).".format(n_clusters, n_points)
Xnum, Xcat = _split_num_cat(X, categorical)
Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
# Convert the categorical values in Xcat to integers for speed.
# Based on the unique values in Xcat, we can make a mapping to achieve this.
Xcat, enc_map = encode_features(Xcat)
# Are there more n_clusters than unique rows? Then set the unique
# rows as initial values and skip iteration.
unique = get_unique_rows(X)
n_unique = unique.shape[0]
if n_unique <= n_clusters:
    max_iter = 0
    n_init = 1
    n_clusters = n_unique
    init = list(_split_num_cat(unique, categorical))
    init[1], _ = encode_features(init[1], enc_map)
# Estimate a good value for gamma, which determines the weighing of
# categorical values in clusters (see Huang [1997]).
if gamma is None:
    gamma = 0.5 * Xnum.std()
results = []
seeds = random_state.randint(np.iinfo(np.int32).max, size=n_init)
if n_jobs == 1:
    for init_no in range(n_init):
        results.append(k_prototypes_single(Xnum, Xcat, nnumattrs, ncatattrs,
                                           n_clusters, n_points, max_iter,
                                           num_dissim, cat_dissim, gamma,
                                           init, init_no, verbose, seeds[init_no]))
else:
    results = Parallel(n_jobs=n_jobs, verbose=0)(
        delayed(k_prototypes_single)(Xnum, Xcat, nnumattrs, ncatattrs,
                                     n_clusters, n_points, max_iter,
                                     num_dissim, cat_dissim, gamma,
                                     init, init_no, verbose, seed)
        for init_no, seed in enumerate(seeds))
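
# Illustration only: a sketch of the mixed dissimilarity that gamma weighs in
# k-prototypes (Huang [1997]): squared Euclidean distance on the numerical
# attributes plus gamma times simple matching dissimilarity on the categorical
# attributes. This shows the concept behind num_dissim/cat_dissim above, not
# the library's implementations.
import numpy as np

def toy_mixed_dissim(xnum, xcat, proto_num, proto_cat, gamma):
    num_cost = np.sum((np.asarray(xnum, dtype=float) - np.asarray(proto_num, dtype=float)) ** 2)
    cat_cost = np.sum(np.asarray(xcat, dtype=object) != np.asarray(proto_cat, dtype=object))
    return num_cost + gamma * cat_cost
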
Returns
-------
labels : array, shape [n_samples,]
Index of the cluster each sample belongs to.
"""
assert hasattr(self, '_enc_cluster_centroids'), "Model not yet fitted."
if categorical is not None:
assert isinstance(categorical, (int, list, tuple)), "The 'categorical' \
argument needs to be an integer with the index of the categorical \
column in your data, or a list or tuple of several of them, \
but it is a {}.".format(type(categorical))
X = pandas_to_numpy(X)
Xnum, Xcat = _split_num_cat(X, categorical)
Xnum, Xcat = check_array(Xnum), check_array(Xcat, dtype=None)
Xcat, _ = encode_features(Xcat, enc_map=self._enc_map)
return _labels_cost(Xnum, Xcat, self._enc_cluster_centroids,
self.num_dissim, self.cat_dissim, self.gamma)[0]
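
# Example (illustration only, not from the library source): a minimal, hedged
# sketch of fitting KPrototypes on mixed numerical/categorical data and then
# predicting labels for new rows, assuming the usual
# `from kmodes.kprototypes import KPrototypes` import. The toy data and
# parameter values are arbitrary; note that the categorical column indices are
# passed to both fit_predict() and predict().
import numpy as np
from kmodes.kprototypes import KPrototypes

toy = np.array([[1.0, 'a'], [1.2, 'a'], [8.5, 'b'], [9.0, 'b']], dtype=object)
kp = KPrototypes(n_clusters=2, init='Cao', n_init=5, verbose=0)
labels = kp.fit_predict(toy, categorical=[1])
new_labels = kp.predict(np.array([[1.1, 'a'], [8.8, 'b']], dtype=object), categorical=[1])
print(labels, new_labels)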