# (Scraping artifact, not part of the program: "Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.")
# Fetch all (article_id, vector) pairs for every stored article embedding.
# NOTE(review): `s` is presumably an open SQLAlchemy Session — confirm with the
# enclosing method. A Query is itself iterable, which is why zip(*papers)
# below works without an explicit .all().
papers = s.query(ArticleVector.article_id, ArticleVector.vector)
# Unzip the row tuples into two parallel tuples: article IDs and embeddings.
self.ids, self.embeddings = zip(*papers)
# Fit HDBSCAN on the embeddings. prediction_data=True is required so that
# soft-membership vectors can be computed (both here and later by
# hdbscan.prediction.membership_vector on a reloaded clusterer).
clusterer = hdbscan.HDBSCAN(
min_cluster_size=self.min_cluster_size,
min_samples=self.min_samples,
prediction_data=True,
).fit(self.embeddings)
# Soft cluster memberships for the training points themselves:
# one probability vector per embedding, one entry per cluster.
self.soft_clusters = hdbscan.all_points_membership_vectors(clusterer)
# Persist the fitted clusterer (presumably pickled — see store_on_s3)
# so future runs can take the else-branch below instead of refitting.
store_on_s3(clusterer, self.s3_bucket, self.clusterer_name)
else:
logging.info("Loading fitted HDBSCAN from S3.")
# Reuse a previously fitted clusterer instead of refitting.
clusterer = load_from_s3(self.s3_bucket, self.clusterer_name)
# Predict soft labels for self.embeddings against the loaded model.
# NOTE(review): this branch uses self.ids and self.embeddings without
# setting them — they must be populated before this point; TODO confirm
# against the (unseen) start of this method.
self.soft_clusters = hdbscan.prediction.membership_vector(
clusterer, np.array(self.embeddings)
)
# Pair each article ID with its soft-cluster membership vector.
# (Call continues beyond this chunk of the file.)
id_clusters_mapping = self._create_mappings(
self.ids, self.soft_clusters, "clusters"