import sys

import numpy


# Signature reconstructed from the body: the original function name and any extra
# parameters are not shown in the snippet, so the name below is a placeholder.
def cluster_route(route, dists) -> numpy.ndarray:
    try:
        from hdbscan import HDBSCAN
    except ImportError as e:
        print("Cannot import hdbscan: %s" % e)
        sys.exit(1)
    # Cumulative distance along the route: each stop becomes a 1-D position on the chain
    opt_dist_chain = numpy.cumsum(
        numpy.array(
            [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]
        )
    )
    if len(route) < 2:
        clusters = numpy.zeros(len(route), dtype=int)
    else:
        clusters = HDBSCAN(min_cluster_size=2).fit_predict(
            opt_dist_chain[:, numpy.newaxis]
        )
    return clusters
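A minimal standalone sketch of the same idea, with invented numbers: each stop is embedded by its cumulative distance along the route, and HDBSCAN groups stops that sit close together on that one-dimensional chain.

import numpy as np
from hdbscan import HDBSCAN

# Hypothetical leg lengths: two tight runs of stops separated by one long hop
leg_lengths = np.array([0.0, 1.0, 1.2, 0.9, 50.0, 1.1, 0.8, 1.3])
chain = np.cumsum(leg_lengths)  # cumulative position of each stop on the route

labels = HDBSCAN(min_cluster_size=2).fit_predict(chain[:, np.newaxis])
print(labels)  # expect stops before and after the long hop to land in different clusters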
def sample_scores(self, X_test):
    # Assumes `import hdbscan` and `import numpy as np` at module level,
    # and that self.X_train was stored when the detector was fitted.
    # initialize the outputs
    pred_score = np.zeros([X_test.shape[0], 1])
    for i in range(X_test.shape[0]):
        x_i = X_test[i, :]
        x_i = np.asarray(x_i).reshape(1, x_i.shape[0])
        # refit HDBSCAN on the training data plus the single query point
        x_comb = np.concatenate((self.X_train, x_i), axis=0)
        clusterer = hdbscan.HDBSCAN()
        clusterer.fit(x_comb)
        # record the outlier score of the appended query point
        pred_score[i, :] = clusterer.outlier_scores_[-1]
    return pred_score
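The scoring loop above refits HDBSCAN once per test point. A self-contained sketch of a single iteration, on synthetic data (all names below are illustrative):

import hdbscan
import numpy as np

rng = np.random.default_rng(0)
X_train = rng.normal(size=(200, 2))   # reference data forming one dense blob
query = np.array([[6.0, 6.0]])        # a point far away from the blob

clusterer = hdbscan.HDBSCAN()
clusterer.fit(np.concatenate((X_train, query), axis=0))

# GLOSH outlier score of the appended query point; higher means more outlying
print(clusterer.outlier_scores_[-1])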
# Hypothetical refit flag; the original `if` condition is not shown in the snippet.
if refit_model:
    logging.info("Fitting a new HDBSCAN model.")
    # Connect to SQL DB
    s = self._create_db_session(self.db_config)
    # Delete all clusters to refill the table with the new predictions.
    s.query(ArticleCluster).delete()
    s.commit()
    # Fetch all document embeddings
    papers = s.query(ArticleVector.article_id, ArticleVector.vector)
    # Unroll paper IDs and embedding vectors
    self.ids, self.embeddings = zip(*papers)
    # Fit HDBSCAN
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=self.min_cluster_size,
        min_samples=self.min_samples,
        prediction_data=True,
    ).fit(self.embeddings)
    # Assign soft clusters to embeddings
    self.soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
    # Store clusterer in S3
    store_on_s3(clusterer, self.s3_bucket, self.clusterer_name)
else:
    logging.info("Loading fitted HDBSCAN from S3.")
    # Load clusterer from S3
    clusterer = load_from_s3(self.s3_bucket, self.clusterer_name)
    # Predict soft labels
    self.soft_clusters = hdbscan.prediction.membership_vector(
        clusterer, np.array(self.embeddings)
    )
# Group arXiv paper IDs with clusters
id_clusters_mapping = self._create_mappings(
    self.ids, self.soft_clusters, "clusters"
)
# Store mapping in DB
s.bulk_insert_mappings(ArticleCluster, id_clusters_mapping)
s.commit()
self.next(self.end)
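A hedged, self-contained sketch of the soft-clustering calls used above, with synthetic blobs standing in for the document embeddings (the SQLAlchemy and S3 helpers are specific to that pipeline and omitted here):

import hdbscan
import numpy as np

rng = np.random.default_rng(0)
embeddings = np.vstack([
    rng.normal(loc=0.0, scale=0.3, size=(100, 5)),
    rng.normal(loc=3.0, scale=0.3, size=(100, 5)),
])

clusterer = hdbscan.HDBSCAN(min_cluster_size=10, prediction_data=True).fit(embeddings)

# One row per point, one column per cluster; argmax gives a hard assignment
soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
hard_labels = np.argmax(soft_clusters, axis=1)

# Scoring new points against an already-fitted model, as in the load-from-S3 branch
new_points = rng.normal(loc=0.0, scale=0.3, size=(5, 5))
new_membership = hdbscan.prediction.membership_vector(clusterer, new_points)
print(soft_clusters.shape, new_membership.shape)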
def cluster(self):
    """
    Clusters all given facts using HDBSCAN, and writes the resulting
    model and cluster labels to disk.
    """
    Log.write("Starting HDBSCAN clustering")
    Log.write("Min Cluster Size: " + str(self.min_cluster_size))
    Log.write("Min Sample: " + str(self.min_sample))
    X = self.data_tuple[0]
    hdb = hdbscan.HDBSCAN(
        min_cluster_size=self.min_cluster_size, min_samples=self.min_sample
    )
    hdb.fit(X)
    # Persist the fitted model and the per-item labels
    s = Save(self.data_type)
    s.save_binary(self.data_type + "s_cluster_model.bin", hdb)
    # Noise points are labelled -1, so they are excluded from the cluster count
    n_clusters = len(set(hdb.labels_)) - (1 if -1 in hdb.labels_ else 0)
    s.save_text(self.data_tuple, hdb.labels_, "w")
    Log.write("Number of estimated clusters : %d" % n_clusters)
    return hdb
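Since Log and Save are project-specific helpers, here is a minimal sketch of the same fit, count, and persist flow on synthetic data, using joblib as one possible way to store the fitted model:

import hdbscan
import joblib
import numpy as np

rng = np.random.default_rng(1)
X = np.vstack([rng.normal(0, 0.2, (50, 2)), rng.normal(4, 0.2, (50, 2))])

hdb = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=5).fit(X)

# Noise points are labelled -1, so exclude them from the cluster count
n_clusters = len(set(hdb.labels_)) - (1 if -1 in hdb.labels_ else 0)
print("Number of estimated clusters : %d" % n_clusters)

# Stand-in for the Save helper: persist and reload the fitted clusterer
joblib.dump(hdb, "facts_cluster_model.bin")
hdb_loaded = joblib.load("facts_cluster_model.bin")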
def hdbscan(feat, min_samples=10):
    # Note: the min_samples argument is passed to HDBSCAN as min_cluster_size.
    import hdbscan
    db = hdbscan.HDBSCAN(min_cluster_size=min_samples)
    labels_ = db.fit_predict(feat)
    return labels_
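A brief usage sketch for the wrapper above, assuming it is in scope (the feature matrix is synthetic):

import numpy as np

rng = np.random.default_rng(2)
feat = np.vstack([rng.normal(0, 0.1, (30, 8)), rng.normal(1, 0.1, (30, 8))])

labels = hdbscan(feat, min_samples=5)  # wrapper defined above
print(np.unique(labels))               # cluster ids, with -1 marking noise points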
from sklearn.cluster import (
    KMeans,
    MiniBatchKMeans,
    AgglomerativeClustering,
    FeatureAgglomeration,
    Birch,
    SpectralClustering,
)

from .format_data import format_data as formatter

# dictionary of models
models = {
    'KMeans': KMeans,
    'MiniBatchKMeans': MiniBatchKMeans,
    'AgglomerativeClustering': AgglomerativeClustering,
    'FeatureAgglomeration': FeatureAgglomeration,
    'Birch': Birch,
    'SpectralClustering': SpectralClustering,
}

# HDBSCAN is an optional dependency; register it only if it is installed
try:
    from hdbscan import HDBSCAN
    _has_hdbscan = True
    models.update({'HDBSCAN': HDBSCAN})
except ImportError:
    _has_hdbscan = False


@memoize  # project-internal caching decorator; its import is not shown in the snippet
def cluster(x, cluster='KMeans', n_clusters=3, ndims=None, format_data=True):
    """
    Performs clustering analysis and returns a list of cluster labels

    Parameters
    ----------
    x : A Numpy array, Pandas Dataframe or list of arrays/dfs
        The data to be clustered. You can pass a single array/df or a list.
        If a list is passed, the arrays will be stacked and the clustering
        will be performed across all lists (i.e. not within each list).
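A hedged sketch of how a registry like `models` is typically used, assuming the dictionary and `_has_hdbscan` flag above are in scope (this is illustrative dispatch, not hypertools' actual implementation):

import numpy as np

rng = np.random.default_rng(3)
data = rng.normal(size=(60, 4))

# Pick an algorithm by name; fall back to KMeans if hdbscan is not installed
name = 'HDBSCAN' if _has_hdbscan else 'KMeans'
model = models[name]() if name == 'HDBSCAN' else models[name](n_clusters=3)
labels = model.fit_predict(data)
print(name, np.unique(labels))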
def _get_clustering(self, clustering_alg, kwargs):
    logger.info(f'Using {clustering_alg} for clustering')
    if clustering_alg == 'hdbscan':
        min_cluster_size = kwargs.get('min_cluster_size', 50)
        # no trailing comma here, otherwise min_samples becomes a one-element tuple
        min_samples = kwargs.get('min_samples', None)
        metric = kwargs.get('metric', 'euclidean')
        cluster_selection_method = kwargs.get('cluster_selection_method', 'eom')
        logger.info(
            f'HDBSCAN params: min_cluster_size: {min_cluster_size}, min_samples: {min_samples}'
        )
        return hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric=metric,
            cluster_selection_method=cluster_selection_method,
        )
    else:
        bandwidth = kwargs['bandwidth']
        logger.info(f'MeanShift params: bandwidth: {bandwidth}, bin_seeding: True')
        # use fast MeanShift with bin seeding
        return MeanShift(bandwidth=bandwidth, bin_seeding=True)
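For reference, a hedged standalone version of the two branches above, showing which constructor arguments the kwargs feed (assumes `import hdbscan` and `from sklearn.cluster import MeanShift`; the values are illustrative):

import hdbscan
from sklearn.cluster import MeanShift

kwargs = {'min_cluster_size': 50, 'metric': 'euclidean', 'cluster_selection_method': 'eom'}
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=kwargs.get('min_cluster_size', 50),
    min_samples=kwargs.get('min_samples', None),
    metric=kwargs.get('metric', 'euclidean'),
    cluster_selection_method=kwargs.get('cluster_selection_method', 'eom'),
)

segmenter = MeanShift(bandwidth=1.5, bin_seeding=True)  # the MeanShift branch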