How to use the perception.hashers.tools.string_to_vector function in Perception

To help you get started, we’ve selected a few Perception examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github thorn-oss / perception / perception / hashers / hasher.py View on Github external
def string_to_vector(self, hash_string: str, hash_format: str = 'base64'):
    """Decode a serialized hash string into its vector form.

    Thin wrapper around ``tools.string_to_vector`` that fills in the
    ``dtype`` and ``hash_length`` arguments from this instance, so the
    caller only has to provide the string and its encoding.

    Args:
        hash_string: The input hash string
        hash_format: One of 'base64' or 'hex'

    Returns:
        Whatever ``tools.string_to_vector`` returns for the given input.
    """
    convert = tools.string_to_vector
    # Decoding parameters come from the hasher instance itself.
    options = {
        'dtype': self.dtype,
        'hash_length': self.hash_length,
        'hash_format': hash_format,
    }
    return convert(hash_string, **options)
github thorn-oss / perception / perception / tools.py View on Github external
'If hasher is provided, hash_length, hash_dtype, and distance_metric must all be None.'
        hash_length = hasher.hash_length
        hash_dtype = hasher.dtype
        distance_metric = hasher.distance_metric
    assert hash_length is not None
    assert isinstance(hash_dtype, str)
    assert isinstance(distance_metric, str)
    # If there is more than one hash for an id, we want them
    # to be sequential in case we are able to use the more
    # efficient distance calculation (compute_euclidean_pairwise_duplicates)
    # that skips computation of distance between two hashes for the same file.
    multiple_hashes_per_id = _multiple_hashes_for_ids(hashes)
    if multiple_hashes_per_id:
        hashes = sorted(hashes)
    vectors = np.array([
        perception_hashers.tools.string_to_vector(
            hash_string=hash_string_or_vector,
            hash_format=hash_format,
            hash_length=hash_length,
            dtype=hash_dtype)
        if isinstance(hash_string_or_vector, str) else hash_string_or_vector
        for _, hash_string_or_vector in hashes
    ])
    files = np.array([identifier for identifier, _ in hashes])
    pairs: typing.List[typing.Tuple[str, str]] = []
    n_hashes = len(vectors)
    iterator = range(n_hashes)
    if progress is not None:
        iterator = progress(iterator, total=n_hashes, desc='Deduplicating.')
    start_idx = 0
    end_idx = None
    if distance_metric != 'euclidean' or 'int' not in hash_dtype or extensions is None:
github thorn-oss / perception / perception / benchmarking / common.py View on Github external
# may have been dropped due to being invalid.
            noops = hashsets[(hashsets['transform_name'] == 'noop')
                             & (hashsets['hasher_name'] == hasher_name)
                             & (hashsets['guid'].isin(hashset['guid']))]
            hashset = hashset[hashset['guid'].isin(noops['guid'])]
            dtype, distance_metric, hash_length = hashset.iloc[0][[
                'hasher_dtype', 'hasher_distance_metric', 'hasher_hash_length'
            ]]
            n_noops = len(noops.guid)
            n_hashset = len(hashset.guid)
            noop_guids = noops.guid.values
            mask = create_mask(hashset.guid.values, noops.guid.values)
            if distance_metric != 'custom':
                X_trans = np.array(
                    hashset.hash.apply(
                        string_to_vector,
                        hash_length=int(hash_length),
                        dtype=dtype,
                        hash_format='base64').tolist())
                X_noop = np.array(
                    noops.hash.apply(
                        string_to_vector,
                        dtype=dtype,
                        hash_format='base64',
                        hash_length=int(hash_length)).tolist())
                if distance_metric != 'euclidean' or 'int' not in dtype or extensions is None:
                    distance_matrix = spatial.distance.cdist(
                        XA=X_trans, XB=X_noop, metric=distance_metric)
                    distance_to_closest_image = distance_matrix.min(axis=1)
                    distance_to_correct_image = np.ma.masked_array(
                        distance_matrix, np.logical_not(mask)).min(axis=1)
                    distance_matrix_incorrect_image = np.ma.masked_array(
github thorn-oss / perception / perception / benchmarking / common.py View on Github external
'hasher_dtype', 'hasher_distance_metric', 'hasher_hash_length'
            ]]
            n_noops = len(noops.guid)
            n_hashset = len(hashset.guid)
            noop_guids = noops.guid.values
            mask = create_mask(hashset.guid.values, noops.guid.values)
            if distance_metric != 'custom':
                X_trans = np.array(
                    hashset.hash.apply(
                        string_to_vector,
                        hash_length=int(hash_length),
                        dtype=dtype,
                        hash_format='base64').tolist())
                X_noop = np.array(
                    noops.hash.apply(
                        string_to_vector,
                        dtype=dtype,
                        hash_format='base64',
                        hash_length=int(hash_length)).tolist())
                if distance_metric != 'euclidean' or 'int' not in dtype or extensions is None:
                    distance_matrix = spatial.distance.cdist(
                        XA=X_trans, XB=X_noop, metric=distance_metric)
                    distance_to_closest_image = distance_matrix.min(axis=1)
                    distance_to_correct_image = np.ma.masked_array(
                        distance_matrix, np.logical_not(mask)).min(axis=1)
                    distance_matrix_incorrect_image = np.ma.masked_array(
                        distance_matrix, mask)
                    distance_to_incorrect_image = distance_matrix_incorrect_image.min(
                        axis=1)
                    closest_incorrect_guid = noop_guids[
                        distance_matrix_incorrect_image.argmin(axis=1)]
                else: