How to use the datasketch.experimental.aio.lsh.AsyncMinHashLSH function in datasketch

To help you get started, we’ve selected a few datasketch examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test__H_mongo(self):
        """
        Check _H output consistent bytes length given
        the same concatenated hash value size
        """
        mg = WeightedMinHashGenerator(100, sample_size=128)
        for l in range(2, mg.sample_size + 1, 16):
            m = mg.minhash(np.random.randint(1, 99999999, 100))
            async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                       num_perm=128) as lsh:
                await lsh.insert("m", m)
                fs = (ht.keys() for ht in lsh.hashtables)
                hashtables = await asyncio.gather(*fs)
                sizes = [len(H) for H in hashtables]
                self.assertTrue(all(sizes[0] == s for s in sizes))
github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test_insertion_session_mongo(self):
        def chunk(it, size):
            it = iter(it)
            return iter(lambda: tuple(islice(it, size)), ())

        _chunked_str = chunk((random.choice(string.ascii_lowercase) for _ in range(10000)), 4)
        seq = frozenset(chain((''.join(s) for s in _chunked_str),
                              ('aahhb', 'aahh', 'aahhc', 'aac', 'kld', 'bhg', 'kkd', 'yow', 'ppi', 'eer')))
        objs = [MinHash(16) for _ in range(len(seq))]
        for e, obj in zip(seq, objs):
            for i in e:
                obj.update(i.encode('utf-8'))

        data = [(e, m) for e, m in zip(seq, objs)]

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.5, num_perm=16) as lsh:
            async with lsh.insertion_session(batch_size=1000) as session:
                fs = (session.insert(key, minhash, check_duplication=False) for key, minhash in data)
                await asyncio.gather(*fs)

            for t in lsh.hashtables:
                self.assertTrue(await t.size() >= 1)
                items = []
                for H in await t.keys():
                    items.extend(await t.get(H))
                self.assertTrue('aahhb' in items)
                self.assertTrue('kld' in items)
            self.assertTrue(await lsh.has_key('aahhb'))
            self.assertTrue(await lsh.has_key('kld'))
            for i, H in enumerate(await lsh.keys.get('aahh')):
                self.assertTrue('aahh' in await lsh.hashtables[i].get(H))
github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test_init_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.8) as lsh:
            self.assertTrue(await lsh.is_empty())
            b1, r1 = lsh.b, lsh.r

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.8,
                                   weights=(0.2, 0.8)) as lsh:
            b2, r2 = lsh.b, lsh.r
        self.assertTrue(b1 < b2)
        self.assertTrue(r1 > r2)
github ekzhu / datasketch / test / aio / test_lsh_mongo.py View on Github external
async def test_init_mongo(self):
        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.8) as lsh:
            self.assertTrue(await lsh.is_empty())
            b1, r1 = lsh.b, lsh.r

        async with AsyncMinHashLSH(storage_config=self._storage_config_mongo,
                                   threshold=0.8,
                                   weights=(0.2, 0.8)) as lsh:
            b2, r2 = lsh.b, lsh.r
        self.assertTrue(b1 < b2)
        self.assertTrue(r1 > r2)