def _calculate_linkage_fastcluster(self):
    import fastcluster
    # Fastcluster has a memory-saving vectorized version, but only
    # with certain linkage methods, and mostly with the euclidean metric.
    vector_methods = ('single', 'centroid', 'median', 'ward')
    euclidean_methods = ('centroid', 'median', 'ward')
    euclidean = (self.metric == 'euclidean'
                 and self.method in euclidean_methods)
    if euclidean or self.method == 'single':
        return fastcluster.linkage_vector(self.array,
                                          method=self.method,
                                          metric=self.metric)
    else:
        # `distance` is scipy.spatial.distance, imported at module level.
        pairwise_dists = distance.pdist(self.array, metric=self.metric)
        linkage = fastcluster.linkage(pairwise_dists, method=self.method)
        del pairwise_dists  # free the condensed distances before returning
        return linkage
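For reference outside the class, the two fastcluster entry points can be exercised standalone; a minimal sketch with toy data (array shapes and metric choices are illustrative only):

import numpy as np
import fastcluster
from scipy.spatial import distance

X = np.random.rand(100, 4)  # toy data: 100 observations, 4 features

# Memory-saving path: works directly on the observations, but only for
# 'single' (any metric) and 'centroid'/'median'/'ward' (euclidean only).
Z_ward = fastcluster.linkage_vector(X, method='ward')

# General path: precompute a condensed distance vector, then cluster.
d = distance.pdist(X, metric='cityblock')
Z_avg = fastcluster.linkage(d, method='average')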
def cluster_entities(self, embeddings_path, link_method='average'):
    embeddings_dict = self.load_embeddings(embeddings_path,
                                           self.unique_entities)
    # Build the observation matrix, then a condensed cosine-distance
    # vector between all entity embeddings.
    embeddings_array = np.array(list(embeddings_dict.values()))
    dist_vec = pdist(embeddings_array, 'cosine')
    # Cluster the distance vector to find co-referring entities.
    Z = fastcluster.linkage(dist_vec, method=link_method)
    cluster_labels = fcluster(Z, t=self.clust_dist_thres,
                              criterion='distance')
    cluster_members_all = []
    entity_list = np.array(list(embeddings_dict.keys()))
    for clus_label in np.unique(cluster_labels):
        clus_indx = cluster_labels == clus_label
        cluster_members = list(entity_list[clus_indx])
        cluster_members_all.append(cluster_members)
    output = {'cluster_members': cluster_members_all,
              'cluster_labels': cluster_labels,
              'cluster_rep': self.get_cluster_representatives(cluster_members_all)}
    self.entity_cluster_results = output
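The same pattern in isolation: cutting the tree with criterion='distance' keeps entities in one cluster while their merge distance stays below the threshold. A minimal sketch (toy embeddings; the 0.5 threshold is a stand-in for self.clust_dist_thres):

import numpy as np
import fastcluster
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster

embeddings = np.random.rand(6, 50)   # toy entity embeddings
Z = fastcluster.linkage(pdist(embeddings, 'cosine'), method='average')
labels = fcluster(Z, t=0.5, criterion='distance')  # stand-in threshold
# labels[i] is the flat-cluster id assigned to embeddings[i]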
def __cluster_columns__(self, column_distance, column_linkage):
    # Transpose so that columns become rows for clustering.
    self.data = [list(col) for col in zip(*self.data)]
    if self.missing_value is not False:
        self.data, missing_values_indexes = self.__impute_missing_values__(self.data)
    self.column_clustering = fastcluster.linkage(self.data, method=column_linkage,
                                                 metric=column_distance)
    self.data_order = hcluster.leaves_list(self.column_clustering)
    if self.missing_value is not False:
        self.data = self.__return_missing_values__(self.data, missing_values_indexes)
    # Transpose back to the original orientation (list() is needed on
    # Python 3, where zip() returns an iterator).
    self.data = [list(row) for row in zip(*self.data)]
    self.data = self.__reorder_data__(self.data, self.data_order)
    self.original_data = self.__reorder_data__(self.original_data, self.data_order)
    if self.header:
        self.header = self.__reorder_data__([self.header], self.data_order)[0]
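The transpose / cluster / reorder idiom above, shown standalone on a plain list-of-lists table (a sketch with toy values):

import fastcluster
from scipy.cluster import hierarchy as hcluster

rows = [[1.0, 9.0, 1.1],
        [0.9, 8.8, 1.0],
        [5.0, 5.2, 4.9]]
cols = [list(c) for c in zip(*rows)]        # transpose: cluster the columns
Z = fastcluster.linkage(cols, method='average', metric='euclidean')
order = hcluster.leaves_list(Z)             # dendrogram leaf order
rows = [[row[i] for i in order] for row in rows]  # reorder every row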
hie_mat = None
method = kwargs.get("method", "complete")
if hie_mat is not None:
    self.hie_mat = hie_mat
else:
    if self.hie_mat is None:
        # Calculate the linkage matrix from the condensed distance matrix.
        self.hie_mat = hcluster_fast.linkage(self.condensed_matrix.get_data(),
                                             method=method)
algorithm_details = ("Hierarchical with " + method
                     + " method (cutoff = " + str(cutoff) + ")")
if cutoff is not None:
    # Apply the cutoff to obtain flat cluster labels, then group them
    # into clusters (see the sketch after this snippet).
    group_list = hcluster.fcluster(self.hie_mat, cutoff)
    clusters = gen_clusters_from_class_list(group_list)
    return Clustering(clusters, details=algorithm_details)
else:
    return None
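gen_clusters_from_class_list is not defined in this excerpt; a plausible equivalent groups element indices by the flat-cluster label fcluster assigned them (the name and exact return shape are assumptions):

from collections import defaultdict

def gen_clusters_from_class_list(group_list):
    # Map each flat-cluster label to the indices of its members.
    members = defaultdict(list)
    for index, label in enumerate(group_list):
        members[label].append(index)
    return list(members.values())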
# ... (the branch preceding this `else` is elided in this excerpt)
else:
# Fetch the Expr Matrix
dm = self.expr(
genes=genes,
accessions=accessions,
raw=raw,
gene_normalize=gene_normalize,
)
# Clamp outliers to the expression boundaries for the heatmap
dm[dm > expr_boundaries] = expr_boundaries
dm[dm < -1 * expr_boundaries] = -1 * expr_boundaries
# Get the Gene clustering order
if cluster_method in hier_cluster_methods:
self.log("Ordering rows by leaf")
expr_linkage = fastcluster.linkage(dm.fillna(0), method=cluster_method)
order = leaves_list(expr_linkage)
dm = dm.iloc[order, :]
elif cluster_method == "mcl":
self.log("Ordering rows by MCL cluster")
order = (
self.clusters.loc[dm.index]
.fillna(np.inf)
.sort_values(by="cluster")
.index.values
)
dm = dm.loc[order, :]
else:
# No cluster order.
self.log("Unknown gene ordering: {}, no ordering performed", cluster_method)
# Get leaves of accessions
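As an aside, the two boolean assignments that clamp outliers above can be collapsed into a single pandas call; an equivalent sketch:

# Equivalent to the two boolean assignments above:
dm = dm.clip(lower=-expr_boundaries, upper=expr_boundaries)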
help="File containing the vocabulary")
parser.add_option("-v", "--vectors-file", dest="vectors_file",
help="File containing the word_vectors")
parser.add_option("-m", "--method", dest="method", help="Clustering method")
parser.add_option("-s", "--metric", dest="metric", help="Clusterig metric")
parser.add_option("-o", "--output-file", dest="output_file",
help="File containing the word hierarchy")
options, _ = parser.parse_args()
words = read_words(options.vocab_file)
print "Read", len(words), "words..."
vectors = read_vectors(options.vectors_file)
print "Read", len(vectors), "vectors..."
cluster_data = fastcluster.linkage(
vectors, method=options.method, metric=options.metric)
hierarchy = convert(cluster_data)
print "Tree depth:", depth(words, hierarchy)
write_tree(words, hierarchy, options.output_file)
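Assuming the script is saved as cluster_words.py (a placeholder name, as are the file names), a typical invocation might look like:

python cluster_words.py --vocab-file vocab.txt -v vectors.txt \
    -m average -s cosine -o hierarchy.txt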
# ... (the matching `if` branch is elided in this excerpt)
else:
    model_comb[w] = model_wiki_vec[str(w)]
    model_comb_vocab.append(w)

sentences = sentences_bigrammed

## Create a frequency count of the words in the emails
words = [w for text in sentences_nouns for w in text]
Vocab = set(words)

# Run agglomerative clustering
logger.info('Clustering for depth...')
data_d2v, word_d2v = create_word_list(model_comb, model_comb_vocab,
                                      25 * local_vec + 300, sentences_nouns,
                                      repeat=False, normalized=True,
                                      min_count=0, l2_threshold=0)
spcluster = fastcluster.linkage(data_d2v, method='average', metric='cosine')

## Calculate the depth of each word in the hierarchy
num_points = len(data_d2v)
depth = calculate_depth(spcluster, word_d2v, num_points)

logger.info('Computing co-occurrence graph')
T = [' '.join(w) for w in sentences_nouns]

## Co-occurrence matrix
cv = CountVectorizer(token_pattern=u'(?u)\\b([^\\s]+)')
bow_matrix = cv.fit_transform(T)
id2word = {}
for key, value in cv.vocabulary_.items():
    id2word[value] = key
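The excerpt stops right before the co-occurrence product itself; assuming document-level co-occurrence counts are wanted, the usual next step is a single sparse matrix multiply:

# Term-by-term co-occurrence: entry (i, j) counts how often terms i and j
# appear in the same document (weighted by per-document counts).
cooc_matrix = bow_matrix.T @ bow_matrix
cooc_matrix.setdiag(0)   # drop self co-occurrence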
# len(ids) * 2 matrix, where only intersected hits are retained and each row
# is a hit with two features (the genome coordinates of the hit).
data = []
for id in idsList:
    data.append(hits[id]['bd'])
Y = numpy.array(data, int)
print('data: {}\n{}'.format(Y.shape, Y))

distMatrix = scipy.spatial.distance.pdist(Y, tools.distFunction)
#distMatrix = scipy.spatial.distance.pdist(Y, metric='euclidean')

# Note: fastcluster requires a dissimilarity (distance) matrix,
# not a similarity matrix!
hclusters = fastcluster.linkage(distMatrix, method='single', preserve_input=False)
del distMatrix

for i, id in enumerate(idsList):
    print('intersected hits', i, hits[id]['bd'], hits[id]['orf'],
          hits[id]['occurence'], hits[id]['hmmhit'], hits[id]['tirs'])

# Dendrogram of the hierarchical clustering (optional):
#scipy.cluster.hierarchy.dendrogram(hclusters)

# Form flat clusters from the hierarchical clustering.
# Note: t=1.1 instead of 1.0 ensures that intersected hits with only a 1 bp
# intersect are included in the same cluster.
t = 1.1
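The comment explains the cutoff, but the excerpt ends before the call that uses it; the corresponding scipy call would be (flatClusters is an assumed variable name):

import scipy.cluster.hierarchy

# Hits whose merge distance stays below t land in the same flat cluster.
flatClusters = scipy.cluster.hierarchy.fcluster(hclusters, t=t, criterion='distance')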
def cluster_data(self, dataFrame, metric, method):
    '''
    Clusters the data.
    '''
    try:
        if metric == 'euclidean':
            # fastcluster can work on the raw observations for euclidean data.
            linkage = fastcluster.linkage(dataFrame, method=method, metric=metric)
        else:
            # Otherwise precompute a condensed distance matrix first.
            distanceMatrix = scd.pdist(dataFrame, metric=metric)
            linkage = sch.linkage(distanceMatrix, method=method)
            del distanceMatrix
    except Exception:
        tk.messagebox.showinfo('Error ..',
                               'Data could not be clustered. This might be due to '
                               'rows that contain exactly the same values.')
        return None, None
    # 0.7 * the maximum merge distance matches scipy's default dendrogram
    # color threshold.
    maxD = 0.7 * max(linkage[:, 2])
    return linkage, maxD
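Because maxD mirrors scipy's default dendrogram color threshold, it plugs straight into plotting; a sketch of how the return values would typically be consumed:

import scipy.cluster.hierarchy as sch

# Color the dendrogram consistently with the cutoff returned above.
dendro = sch.dendrogram(linkage, color_threshold=maxD)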
if fused_id is not None:
    tracks[fused_id].vision_cnt += 1
    tracks[fused_id].update_vision_fusion()
if DEBUG:
    print("NEW CYCLE")
    if VISION_POINT in ar_pts:
        print("vision", ar_pts[VISION_POINT])

idens = list(tracks.keys())
track_pts = np.array([tracks[iden].get_key_for_cluster() for iden in idens])

# If we have multiple points, cluster them
if len(track_pts) > 1:
    link = linkage_vector(track_pts, method='centroid')
    cluster_idxs = fcluster(link, 2.5, criterion='distance')
    clusters = [None] * max(cluster_idxs)
    for idx in range(len(track_pts)):
        cluster_i = cluster_idxs[idx] - 1
        if clusters[cluster_i] is None:
            clusters[cluster_i] = Cluster()
        clusters[cluster_i].add(tracks[idens[idx]])
elif len(track_pts) == 1:
    # A single track cannot be clustered; wrap it in its own cluster.
    clusters = [Cluster()]
    clusters[0].add(tracks[idens[0]])
else:
    clusters = []
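The same centroid-linkage grouping on toy 2D points (the 2.5 threshold is taken from the snippet; the points are illustrative):

import numpy as np
from fastcluster import linkage_vector
from scipy.cluster.hierarchy import fcluster

track_pts = np.array([[0.0, 0.0], [0.5, 0.2], [10.0, 9.8], [10.2, 10.0]])
link = linkage_vector(track_pts, method='centroid')
cluster_idxs = fcluster(link, 2.5, criterion='distance')
# -> e.g. [1, 1, 2, 2]: the two nearby pairs form two clusters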