Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
gamma * cat_dissim(centroids[1], Xcat[ipoint], X=Xcat, membship=membship)
)
if membship[clust, ipoint]:
# Point is already in its right place.
continue
# Move point, and update old/new cluster frequencies and centroids.
moves += 1
old_clust = np.argwhere(membship[:, ipoint])[0][0]
# Note that membship gets updated by kmodes.move_point_cat.
# move_point_num only updates things specific to the k-means part.
cl_attr_sum, cl_memb_sum = move_point_num(
Xnum[ipoint], clust, old_clust, cl_attr_sum, cl_memb_sum
)
cl_attr_freq, membship, centroids[1] = kmodes.move_point_cat(
Xcat[ipoint], ipoint, clust, old_clust,
cl_attr_freq, membship, centroids[1]
)
# Update old and new centroids for numerical attributes using
# the means and sums of all values
for iattr in range(len(Xnum[ipoint])):
for curc in (clust, old_clust):
if cl_memb_sum[curc]:
centroids[0][curc, iattr] = cl_attr_sum[curc, iattr] / cl_memb_sum[curc]
else:
centroids[0][curc, iattr] = 0.
# In case of an empty cluster, reinitialize with a random point
# from largest cluster.
if not cl_memb_sum[old_clust]:
if cl_memb_sum[curc]:
centroids[0][curc, iattr] = cl_attr_sum[curc, iattr] / cl_memb_sum[curc]
else:
centroids[0][curc, iattr] = 0.
# In case of an empty cluster, reinitialize with a random point
# from largest cluster.
if not cl_memb_sum[old_clust]:
from_clust = membship.sum(axis=1).argmax()
choices = [ii for ii, ch in enumerate(membship[from_clust, :]) if ch]
rindx = random_state.choice(choices)
cl_attr_sum, cl_memb_sum = move_point_num(
Xnum[rindx], old_clust, from_clust, cl_attr_sum, cl_memb_sum
)
cl_attr_freq, membship, centroids[1] = kmodes.move_point_cat(
Xcat[rindx], rindx, old_clust, from_clust,
cl_attr_freq, membship, centroids[1]
)
return centroids, moves
n_clusters, n_points, max_iter,
num_dissim, cat_dissim, gamma,
init, init_no, verbose, seed)
for init_no, seed in enumerate(seeds))
all_centroids, all_labels, all_costs, all_n_iters, all_epoch_costs = zip(*results)
best = np.argmin(all_costs)
if n_init > 1 and verbose:
print("Best run was number {}".format(best + 1))
# Note: return gamma in case it was automatically determined.
return all_centroids[best], enc_map, all_labels[best], all_costs[best], \
all_n_iters[best], all_epoch_costs[best], gamma
class KPrototypes(kmodes.KModes):
"""k-protoypes clustering algorithm for mixed numerical/categorical data.
Parameters
-----------
n_clusters : int, optional, default: 8
The number of clusters to form as well as the number of
centroids to generate.
max_iter : int, default: 300
Maximum number of iterations of the k-modes algorithm for a
single run.
num_dissim : func, default: euclidian_dissim
Dissimilarity function used by the algorithm for numerical variables.
Defaults to the Euclidian dissimilarity function.