Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def string_to_vector(self, hash_string: str, hash_format: str = 'base64'):
    """Decode a hash string into its vector representation.

    Args:
        hash_string: The encoded hash to decode.
        hash_format: Encoding of ``hash_string``; one of 'base64' or 'hex'.

    Returns:
        Whatever ``tools.string_to_vector`` produces for this hasher's
        configured ``dtype`` and ``hash_length``.
    """
    # Forward this hasher's decoding configuration alongside the caller's
    # chosen string format.
    decode_kwargs = dict(
        dtype=self.dtype,
        hash_length=self.hash_length,
        hash_format=hash_format,
    )
    return tools.string_to_vector(hash_string, **decode_kwargs)
# NOTE(review): The lines below are a fragment of a larger deduplication
# routine whose `def` is outside this view; original indentation was lost
# in extraction. Code is preserved verbatim with review comments added.
# Orphaned string literal — presumably the message of an assertion that
# enforces mutually exclusive arguments (hasher vs. explicit hash
# parameters); the assert itself is not visible here. TODO confirm.
'If hasher is provided, hash_length, hash_dtype, and distance_metric must all be None.'
# Derive hash parameters from the supplied hasher object.
hash_length = hasher.hash_length
hash_dtype = hasher.dtype
distance_metric = hasher.distance_metric
# Sanity-check the derived parameters before use. NOTE(review): asserts
# are stripped under `python -O`; consider explicit raises upstream.
assert hash_length is not None
assert isinstance(hash_dtype, str)
assert isinstance(distance_metric, str)
# If there is more than one hash for an id, we want them
# to be sequential in case we are able to use the more
# efficient distance calculation (compute_euclidean_pairwise_duplicates)
# that skips computation of distance between two hashes for the same file.
multiple_hashes_per_id = _multiple_hashes_for_ids(hashes)
if multiple_hashes_per_id:
hashes = sorted(hashes)
# Decode any string-encoded hashes into vectors; entries that are already
# vectors pass through unchanged. `hashes` is assumed to be an iterable of
# (identifier, hash_string_or_vector) pairs — TODO confirm against caller.
vectors = np.array([
perception_hashers.tools.string_to_vector(
hash_string=hash_string_or_vector,
hash_format=hash_format,
hash_length=hash_length,
dtype=hash_dtype)
if isinstance(hash_string_or_vector, str) else hash_string_or_vector
for _, hash_string_or_vector in hashes
])
# Parallel array of file identifiers, aligned index-for-index with vectors.
files = np.array([identifier for identifier, _ in hashes])
# Accumulator for duplicate (file, file) pairs found below (not visible here).
pairs: typing.List[typing.Tuple[str, str]] = []
n_hashes = len(vectors)
iterator = range(n_hashes)
# Optionally wrap the index range in a progress reporter (e.g. tqdm-like).
if progress is not None:
iterator = progress(iterator, total=n_hashes, desc='Deduplicating.')
start_idx = 0
end_idx = None
# Branch selecting the generic (non-extension) distance path; its body was
# lost in extraction — the next lines belong to a different routine.
if distance_metric != 'euclidean' or 'int' not in hash_dtype or extensions is None:
# NOTE(review): From here on the content switches to a different routine —
# apparently a hashset benchmarking function operating on pandas-like
# frames (`hashsets`, `hashset`) — spliced mid-line onto the previous
# fragment. Code preserved verbatim; indentation was lost in extraction.
# may have been dropped due to being invalid.
# Select the untransformed ('noop') rows for this hasher whose guids also
# appear in the current hashset.
noops = hashsets[(hashsets['transform_name'] == 'noop')
& (hashsets['hasher_name'] == hasher_name)
& (hashsets['guid'].isin(hashset['guid']))]
# Keep only hashset rows that have a matching noop baseline.
hashset = hashset[hashset['guid'].isin(noops['guid'])]
# Hash parameters are assumed uniform across the hashset; read them from
# the first row. TODO confirm uniformity is guaranteed upstream.
dtype, distance_metric, hash_length = hashset.iloc[0][[
'hasher_dtype', 'hasher_distance_metric', 'hasher_hash_length'
]]
n_noops = len(noops.guid)
n_hashset = len(hashset.guid)
noop_guids = noops.guid.values
# Boolean mask pairing each hashset row with its same-guid noop row;
# semantics depend on `create_mask`, defined elsewhere.
mask = create_mask(hashset.guid.values, noops.guid.values)
if distance_metric != 'custom':
# Decode transformed and noop hash strings into vector matrices for
# pairwise distance computation.
X_trans = np.array(
hashset.hash.apply(
string_to_vector,
hash_length=int(hash_length),
dtype=dtype,
hash_format='base64').tolist())
X_noop = np.array(
noops.hash.apply(
string_to_vector,
dtype=dtype,
hash_format='base64',
hash_length=int(hash_length)).tolist())
# Generic path: let scipy compute the full cross-distance matrix.
if distance_metric != 'euclidean' or 'int' not in dtype or extensions is None:
distance_matrix = spatial.distance.cdist(
XA=X_trans, XB=X_noop, metric=distance_metric)
# Per transformed hash: distance to the nearest noop of any guid.
distance_to_closest_image = distance_matrix.min(axis=1)
# Per transformed hash: distance to its own (same-guid) noop —
# masking out non-matching entries via the inverted mask.
distance_to_correct_image = np.ma.masked_array(
distance_matrix, np.logical_not(mask)).min(axis=1)
# Statement truncated mid-call: its argument list continues as an
# orphaned fragment on the next lines, which duplicate earlier content.
distance_matrix_incorrect_image = np.ma.masked_array(
# NOTE(review): This span is a near-verbatim DUPLICATE of the preceding
# fragment's tail — likely an extraction artifact. The first two lines are
# the orphaned tail of a subscript expression whose head was lost. Code is
# preserved verbatim; deduplication should happen in the canonical source.
'hasher_dtype', 'hasher_distance_metric', 'hasher_hash_length'
]]
n_noops = len(noops.guid)
n_hashset = len(hashset.guid)
noop_guids = noops.guid.values
# Boolean mask pairing each hashset row with its same-guid noop row.
mask = create_mask(hashset.guid.values, noops.guid.values)
if distance_metric != 'custom':
# Decode transformed and noop hash strings into vector matrices.
X_trans = np.array(
hashset.hash.apply(
string_to_vector,
hash_length=int(hash_length),
dtype=dtype,
hash_format='base64').tolist())
X_noop = np.array(
noops.hash.apply(
string_to_vector,
dtype=dtype,
hash_format='base64',
hash_length=int(hash_length)).tolist())
# Generic path: full cross-distance matrix via scipy.
if distance_metric != 'euclidean' or 'int' not in dtype or extensions is None:
distance_matrix = spatial.distance.cdist(
XA=X_trans, XB=X_noop, metric=distance_metric)
# Nearest noop of any guid, per transformed hash.
distance_to_closest_image = distance_matrix.min(axis=1)
# Distance to the same-guid noop: mask out non-matching entries.
distance_to_correct_image = np.ma.masked_array(
distance_matrix, np.logical_not(mask)).min(axis=1)
# Distance to the nearest WRONG-guid noop: mask out matching entries.
distance_matrix_incorrect_image = np.ma.masked_array(
distance_matrix, mask)
distance_to_incorrect_image = distance_matrix_incorrect_image.min(
axis=1)
# Guid of the closest incorrect noop, per transformed hash.
closest_incorrect_guid = noop_guids[
distance_matrix_incorrect_image.argmin(axis=1)]
# Dangling `else:` — its body (presumably the extension-accelerated
# euclidean path) lies outside this view.
else: