How to use hdbscan - 10 common examples

To help you get started, we’ve selected a few hdbscan examples, based on popular ways it is used in public projects.
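If you are new to the library, here is a minimal, self-contained sketch of the core API before looking at the project snippets below. The toy blob data and parameter values are illustrative assumptions, not taken from any of the projects.

import hdbscan
from sklearn.datasets import make_blobs

# toy data with an obvious cluster structure
data, _ = make_blobs(n_samples=1000, centers=5, n_features=2, random_state=42)

clusterer = hdbscan.HDBSCAN(min_cluster_size=15)
labels = clusterer.fit_predict(data)  # one label per point; -1 marks noise

print("clusters found:", labels.max() + 1, "noise points:", (labels == -1).sum())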


github src-d / hercules / python / labours / modes / devs.py
) -> numpy.ndarray:
    try:
        from hdbscan import HDBSCAN
    except ImportError as e:
        print("Cannot import hdbscan: %s" % e)
        sys.exit(1)

    # cumulative distance along the route: each point becomes a 1-D coordinate
    opt_dist_chain = numpy.cumsum(
        numpy.array(
            [0] + [dists[route[i], route[i + 1]] for i in range(len(route) - 1)]
        )
    )
    if len(route) < 2:
        # nothing to cluster
        clusters = numpy.zeros(len(route), dtype=int)
    else:
        # HDBSCAN expects a 2-D array, so lift the 1-D chain to shape (n, 1)
        clusters = HDBSCAN(min_cluster_size=2).fit_predict(
            opt_dist_chain[:, numpy.newaxis]
        )
    return clusters
github yzhao062 / pyod / models / glosh.py
def sample_scores(self, X_test):
        # initialize the outputs
        pred_score = np.zeros([X_test.shape[0], 1])

        for i in range(X_test.shape[0]):
            x_i = X_test[i, :]

            x_i = np.asarray(x_i).reshape(1, x_i.shape[0])
            x_comb = np.concatenate((self.X_train, x_i), axis=0)

            # refit on the training data plus this single test point so that
            # its GLOSH outlier score can be read off the fitted clusterer
            clusterer = hdbscan.HDBSCAN()
            clusterer.fit(x_comb)

            # the test point is the last row, so its outlier score is the last entry
            pred_score[i, :] = clusterer.outlier_scores_[-1]
        return pred_score
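The pyod wrapper above refits a clusterer for every test point so that the GLOSH outlier score of the appended row can be read from outlier_scores_. When scores are only needed for the data the model was fit on, a single fit is enough. The sketch below is a hedged illustration; the toy data and the 95th-percentile threshold are assumptions, not part of pyod.

import numpy as np
import hdbscan
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=500, centers=3, n_features=2, random_state=0)

clusterer = hdbscan.HDBSCAN(min_cluster_size=15).fit(data)
scores = clusterer.outlier_scores_  # one GLOSH score per training point

# flag the highest-scoring points as outliers; ignore any non-finite scores
finite = np.isfinite(scores)
threshold = np.quantile(scores[finite], 0.95)
outliers = np.where(finite & (scores > threshold))[0]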
github nestauk / nesta / nesta / core / tasks / projects / ai_diversity / doc2cluster.py
logging.info("Fitting a new HDBSCAN model.")
            # Connect to SQL DB
            s = self._create_db_session(self.db_config)

            # Delete all clusters to refill the table with the new predictions.
            s.query(ArticleCluster).delete()
            s.commit()

            # Fetch all document embeddings
            papers = s.query(ArticleVector.article_id, ArticleVector.vector)

            # Unroll abstracts and paper IDs
            self.ids, self.embeddings = zip(*papers)

            # Fit HDBSCAN
            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=self.min_cluster_size,
                min_samples=self.min_samples,
                prediction_data=True,
            ).fit(self.embeddings)

            # Assign soft clusters to embeddings
            self.soft_clusters = hdbscan.all_points_membership_vectors(clusterer)

            # Store clusterer in S3
            store_on_s3(clusterer, self.s3_bucket, self.clusterer_name)
        else:
            logging.info("Loading fitted HDBSCAN from S3.")
            # Load clusterer from S3
            clusterer = load_from_s3(self.s3_bucket, self.clusterer_name)

            # Predict soft labels
github Cyberjusticelab / JusticeAI / src / ml_service / feature_extraction / clustering / hdbscan / hdbscan_wrapper.py
def cluster(self):
        """
        Clusters all given facts using HDBSCAN, and writes the resulting
        """
        Log.write("Starting HDBSCAN clustering")
        Log.write("Min Cluster Size: " + str(self.min_cluster_size))
        Log.write("Min Sample: " + str(self.min_sample))
        X = self.data_tuple[0]
        hdb = hdbscan.HDBSCAN(min_cluster_size=self.min_cluster_size, min_samples=self.min_sample)
        hdb.fit(X)
        s = Save(self.data_type)
        s.save_binary(self.data_type + "s_cluster_model.bin", hdb)
        n_clusters = len(set(hdb.labels_)) - (1 if -1 in hdb.labels_ else 0)
        s.save_text(self.data_tuple, hdb.labels_, "w")
        Log.write("Number of estimated clusters : %d" % n_clusters)
        return hdb
min_samples=self.min_samples,
                prediction_data=True,
            ).fit(self.embeddings)

            # Assign soft clusters to embeddings
            self.soft_clusters = hdbscan.all_points_membership_vectors(clusterer)

            # Store clusterer in S3
            store_on_s3(clusterer, self.s3_bucket, self.clusterer_name)
        else:
            logging.info("Loading fitted HDBSCAN from S3.")
            # Load clusterer from S3
            clusterer = load_from_s3(self.s3_bucket, self.clusterer_name)

            # Predict soft labels
            self.soft_clusters = hdbscan.prediction.membership_vector(
                clusterer, np.array(self.embeddings)
            )

        # Group arXiv paper IDs with clusters
        id_clusters_mapping = self._create_mappings(
            self.ids, self.soft_clusters, "clusters"
        )
        # Store mapping in DB
        s.bulk_insert_mappings(ArticleCluster, id_clusters_mapping)
        s.commit()
        self.next(self.end)
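The doc2cluster excerpts rely on hdbscan's soft-clustering helpers, which require the clusterer to be fitted with prediction_data=True. A self-contained sketch of that pattern, with stand-in blob "embeddings" rather than arXiv document vectors and with illustrative parameter values, might look like this:

import hdbscan
from sklearn.datasets import make_blobs

# stand-in embeddings with real cluster structure
train_embeddings, _ = make_blobs(n_samples=500, centers=4, n_features=16, random_state=0)
new_embeddings, _ = make_blobs(n_samples=10, centers=4, n_features=16, random_state=1)

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=25,
    min_samples=5,
    prediction_data=True,  # required by the prediction helpers below
).fit(train_embeddings)

# soft membership of every training point over the discovered clusters
train_memberships = hdbscan.all_points_membership_vectors(clusterer)

# soft membership for points the model has never seen
new_memberships = hdbscan.prediction.membership_vector(clusterer, new_embeddings)

# hard labels plus membership strength, if one cluster per document is enough
new_labels, strengths = hdbscan.approximate_predict(clusterer, new_embeddings)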
github XiaohangZhan / cdp / tools / baseline_clustering.py
def hdbscan(feat, min_samples=10):
    import hdbscan
    # note: the wrapper's `min_samples` argument is used as HDBSCAN's
    # min_cluster_size, not as its min_samples parameter
    db = hdbscan.HDBSCAN(min_cluster_size=min_samples)
    labels_ = db.fit_predict(feat)
    return labels_
github ContextLab / hypertools / hypertools / tools / cluster.py
from .format_data import format_data as formatter

# dictionary of models
models = {
    'KMeans': KMeans,
    'MiniBatchKMeans': MiniBatchKMeans,
    'AgglomerativeClustering': AgglomerativeClustering,
    'FeatureAgglomeration': FeatureAgglomeration,
    'Birch': Birch,
    'SpectralClustering': SpectralClustering,
}

try:
    from hdbscan import HDBSCAN
    _has_hdbscan = True
    models.update({'HDBSCAN': HDBSCAN})
except ImportError:
    _has_hdbscan = False


@memoize
def cluster(x, cluster='KMeans', n_clusters=3, ndims=None, format_data=True):
    """
    Performs clustering analysis and returns a list of cluster labels

    Parameters
    ----------
    x : A Numpy array, Pandas Dataframe or list of arrays/dfs
        The data to be clustered.  You can pass a single array/df or a list.
        If a list is passed, the arrays will be stacked and the clustering
        will be performed across all lists (i.e. not within each list).
github wolny / pytorch-3dunet / unet3d / predictor.py View on Github external
def _get_clustering(self, clustering_alg, kwargs):
        logger.info(f'Using {clustering_alg} for clustering')

        if clustering_alg == 'hdbscan':
            min_cluster_size = kwargs.get('min_cluster_size', 50)
            min_samples = kwargs.get('min_samples', None)
            metric = kwargs.get('metric', 'euclidean')
            cluster_selection_method = kwargs.get('cluster_selection_method', 'eom')

            logger.info(f'HDBSCAN params: min_cluster_size: {min_cluster_size}, min_samples: {min_samples}')
            return hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, metric=metric,
                                   cluster_selection_method=cluster_selection_method)
        else:
            bandwidth = kwargs['bandwidth']
            logger.info(f'MeanShift params: bandwidth: {bandwidth}, bin_seeding: True')
            # use fast MeanShift with bin seeding
            return MeanShift(bandwidth=bandwidth, bin_seeding=True)
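As a rough, self-contained illustration of the kwargs-driven construction above (the flattened embeddings array and parameter values are assumptions, not pytorch-3dunet's pipeline):

import numpy as np
import hdbscan

# stand-in for flattened per-voxel embeddings: (num_voxels, embedding_dim)
embeddings = np.random.RandomState(0).rand(2000, 16)

kwargs = {'min_cluster_size': 40, 'min_samples': 10, 'metric': 'euclidean'}
clustering = hdbscan.HDBSCAN(
    min_cluster_size=kwargs.get('min_cluster_size', 50),
    min_samples=kwargs.get('min_samples', None),
    metric=kwargs.get('metric', 'euclidean'),
    cluster_selection_method=kwargs.get('cluster_selection_method', 'eom'),
)
labels = clustering.fit_predict(embeddings)  # -1 marks voxels treated as noise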

hdbscan

Clustering based on density with variable density clusters

License: BSD-3-Clause