How to use the datasketch.MinHashLSHForest class in datasketch

To help you get started, we’ve selected a few datasketch examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: erikbern/ann-benchmarks, ann_benchmarks/algorithms/datasketch.py (view on GitHub)
def fit(self, X):
        """Build a MinHash LSH Forest index over the rows of ``X``.

        Each row is iterated element by element; every element is converted
        to its ``str`` form, UTF-8 encoded and fed into a fresh ``MinHash``.
        The resulting sketch is stored in the forest under the row's position
        (as a string), and the forest is indexed once all rows are added.

        Params:
          :X: Iterable of rows, each row itself an iterable of elements.
        """
        forest = MinHashLSHForest(num_perm=self._n_perm, l=self._n_rep)
        self._index = forest

        for row_id, row in enumerate(X):
            sketch = MinHash(num_perm=self._n_perm)
            for element in row:
                sketch.update(str(element).encode('utf8'))
            forest.add(str(row_id), sketch)

        # The forest has to be indexed after all keys have been added.
        forest.index()
Source: ekzhu/datasketch, benchmark/lshforest_benchmark.py (view on GitHub)
def benchmark_lshforest(num_perm, l, k, index_data, query_data):
    """Build a MinHash LSH Forest from ``index_data`` and time queries on it.

    Args:
        num_perm: Passed to ``MinHashLSHForest`` as ``num_perm`` and used to
            select the entry of ``minhashes`` on both data objects.
        l: Passed to ``MinHashLSHForest`` as ``l``.
        k: Passed to ``forest.query`` as its second argument.
        index_data: Object exposing ``keys``, ``sets`` (indexable by key) and
            ``minhashes`` (indexable by ``num_perm``).
        query_data: Object exposing ``sets`` and ``minhashes`` (indexable by
            ``num_perm``).

    Returns:
        A ``(times, results)`` tuple of equal-length lists. ``times[i]`` is
        the duration of the i-th query in fractional seconds. ``results[i]``
        is a list of ``[key, score]`` pairs for the keys returned by that
        query, where ``score`` is ``_compute_jaccard(query_set, indexed_set)``,
        sorted by descending score.
    """
    print("Building LSH Forest index")
    forest = MinHashLSHForest(num_perm=num_perm, l=l)
    for key, minhash in zip(index_data.keys, index_data.minhashes[num_perm]):
        forest.add(key, minhash)
    forest.index()

    print("Querying")
    times = []
    results = []
    for qs, minhash in zip(query_data.sets, query_data.minhashes[num_perm]):
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring short durations.
        start = time.perf_counter()
        result = forest.query(minhash, k)
        duration = time.perf_counter() - start
        times.append(duration)

        # Scoring happens outside the timed section so only the query itself
        # is measured.
        scored = [[key, _compute_jaccard(qs, index_data.sets[key])]
                  for key in result]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        results.append(scored)
    return times, results
Source: ricsinaruto/Seq2seqChatbots, t2t_csaky/data_filtering/hash_jaccard.py (view on GitHub)
def clustering(self, data_tag):
    """
    Pick random medoids, index them in a MinHash LSH Forest and then
    assign the data points to clusters via ``self.cluster_points``.

    Params:
      :data_tag: Whether it's source or target data.
    """
    points = self.data_points[data_tag]
    cluster_count = self.num_clusters[data_tag]

    # The forest is used to quickly look up nearest neighbours.
    self.forest = MinHashLSHForest(num_perm=self.num_perm)

    # Draw the positions of the initial medoids without replacement.
    medoids = random.sample(range(len(points)), cluster_count)

    for cluster_id, point_index in enumerate(medoids):
        new_cluster = self.ClusterClass(points[point_index])
        self.clusters[data_tag].append(new_cluster)

        # Register the medoid of the cluster just added in the forest.
        self.forest.add(cluster_id,
                        self.clusters[data_tag][-1].medoid.min_hash)
    self.forest.index()

    # Find a cluster for every data point.
    self.cluster_points(data_tag)