Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
raise ValueError('Predicate is nor derived from PredicateBase')
self._predicate_list = predicate_list
self._reduce_func = reduce_func
def get_fields(self):
    """ Collect the field names required by every aggregated predicate.

    :return: set containing the union of ``get_fields()`` of all predicates in the list
    """
    per_predicate = [predicate.get_fields() for predicate in self._predicate_list]
    combined = set()
    for field_names in per_predicate:
        combined = combined | field_names
    return combined
def do_include(self, values):
    """ Evaluate every aggregated predicate on ``values`` and reduce the verdicts.

    :param values: mapping passed unchanged to each predicate's ``do_include``
    :return: whatever the reduce function returns for the list of individual verdicts
    """
    verdicts = []
    for predicate in self._predicate_list:
        verdicts.append(predicate.do_include(values))
    return self._reduce_func(verdicts)
class in_pseudorandom_split(PredicateBase):
""" Split dataset according to a split list based on volume_guid.
The split is pseudorandom (can not supply the seed yet), i.e. the split outcome is always the same.
Split is performed by hashing volume_guid uniformly to 0:1 range and returning part of full dataset
which was hashed in given sub-range
Example:
'split_list = [0.5, 0.2, 0.3]' - dataset will be split on three subsets in proportion
subset 1: 0.5 of log data
subset 2: 0.2 of log data
subset 3: 0.3 of log data
Note, split is not exact, so avoid small fraction (e.g. 0.001) to avoid empty sets
"""
def __init__(self, fraction_list, subset_index, predicate_field):
""" split_list: a list of log fractions (real numbers in range [0:1])
subset_index: define which subset will be used by the Reader
""" Test if predicate_field list contain at least one value from inclusion_values set """
def __init__(self, inclusion_values, _predicate_field):
    """ Remember which field to test and which values qualify a row.

    :param inclusion_values: iterable of accepted values; copied into a new list
    :param _predicate_field: name of the field whose value is tested
    """
    accepted = [value for value in inclusion_values]
    self._inclusion_values = accepted
    self._predicate_field = _predicate_field
def get_fields(self):
    """ Return a one-element set holding the name of the field this predicate reads. """
    field_name = self._predicate_field
    return set([field_name])
def do_include(self, values):
    """ Tell whether the field value shares at least one element with the inclusion values.

    :param values: mapping from field name to the value of that field
    :return: True if any element of ``values[self._predicate_field]`` is one of the inclusion values
    :raises ValueError: if the field value is not iterable
    """
    # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in ``collections.abc``.
    # The fallback keeps interpreters without ``collections.abc`` working.
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    field_value = values[self._predicate_field]
    if not isinstance(field_value, Iterable):
        raise ValueError('Predicate field should have iterable type')
    # ``np.isin`` supersedes the deprecated ``np.in1d``; reducing with ``.any()`` looks at every
    # element regardless of the input's dimensionality, matching the flattening ``in1d`` performed.
    return bool(np.isin(field_value, self._inclusion_values).any())
class in_lambda(PredicateBase):
""" Wrap up custom function to be used as a predicate
example: in_lambda(['labels_object_roles'], lambda labels_object_roles : len(labels_object_roles) > 3)
"""
def __init__(self, predicate_fields, predicate_func, state_arg=None):
    """
    :param predicate_fields: list of fields to be used in predicate
    :param predicate_func: predicate function
                           example: lambda labels_object_roles : len(labels_object_roles) > 3
    :param state_arg: additional object to keep function state. It is passed to
                      predicate_func after the field arguments ONLY if it is not None
    :raises ValueError: if predicate_fields is not a list
    """
    if not isinstance(predicate_fields, list):
        raise ValueError('Predicate fields should be a list')
    self._predicate_fields = predicate_fields
    self._predicate_func = predicate_func
    # Keep the optional state object; otherwise the documented state_arg would be silently dropped
    self._state_arg = state_arg
def _apply_predicate_to_row_groups(self, dataset, row_groups, predicate):
"""Select the row groups whose partition value is accepted by ``predicate``.

When the predicate reads exactly the dataset's partition names, the partition value of every row group is
converted to the numpy dtype declared in ``self.schema`` and passed to ``predicate.do_include``; the indexes
of the accepted row groups are collected.

NOTE(review): the previous summary promised a returned list of row group indexes plus a worker_predicate
(predicates that could not be applied at the parquet partitioning level). The statements producing them are
not visible in this excerpt - confirm against the complete function.

:param dataset: object exposing ``partitions`` (with ``partition_names`` and indexable partition levels)
:param row_groups: sequence of row group pieces, each exposing ``partition_keys``
:param predicate: a ``PredicateBase`` instance; a falsy value skips the predicate handling shown here
:raises ValueError: if ``predicate`` is truthy but not derived from ``PredicateBase``
"""
if predicate:
if not isinstance(predicate, PredicateBase):
raise ValueError('predicate parameter is expected to be derived from PredicateBase')
predicate_fields = predicate.get_fields()
# Partition-level filtering applies only when the predicate reads exactly the partition names
if set(predicate_fields) == dataset.partitions.partition_names:
# Only the first partition level is consulted below, hence the single-level restriction
assert len(dataset.partitions.partition_names) == 1, \
'Datasets with only a single partition level supported at the moment'
filtered_row_group_indexes = []
for piece_index, piece in enumerate(row_groups):
# First partition key of the piece: (partition name, index into that partition level's keys)
partition_name, partition_index = piece.partition_keys[0]
partition_value = dataset.partitions[0].keys[partition_index]
# Convert partition value to correct type per the schema
partition_value = self.schema.fields[partition_name].numpy_dtype(partition_value)
if predicate.do_include({partition_name: partition_value}):
filtered_row_group_indexes.append(piece_index)
""" A predicate used to negate another predicate. """
def __init__(self, predicate):
    """ Store the predicate whose verdict is to be inverted.

    :param predicate: predicate to wrap
    :raises ValueError: if ``predicate`` is not an instance of ``PredicateBase``
    """
    is_predicate = isinstance(predicate, PredicateBase)
    if not is_predicate:
        raise ValueError('Predicate is nor derived from PredicateBase')
    self._predicate = predicate
def get_fields(self):
    """ Return the fields of the wrapped predicate, unchanged. """
    wrapped = self._predicate
    return wrapped.get_fields()
def do_include(self, values):
    """ Return the logical negation of the wrapped predicate's verdict for ``values``. """
    if self._predicate.do_include(values):
        return False
    return True
class in_reduce(PredicateBase):
""" A predicate used to aggregate other predicates using any reduce logical operation."""
def __init__(self, predicate_list, reduce_func):
    """ predicate_list: iterable of predicates, each derived from PredicateBase
    reduce_func: function to aggregate result of all predicates in the list
    e.g. all() will implements logical 'And', any() implements logical 'Or'

    Raises ValueError if any element of predicate_list is not derived from PredicateBase.
    """
    # Materialize once: a generator would be consumed by the validation below and then be stored
    # exhausted, leaving the predicate with nothing to iterate over later.
    predicates = list(predicate_list)
    if not all(isinstance(p, PredicateBase) for p in predicates):
        raise ValueError('Predicate is nor derived from PredicateBase')
    self._predicate_list = predicates
    self._reduce_func = reduce_func
def get_fields(self):
fields = set()
for p in self._predicate_list:
def _apply_predicate_to_row_groups(self, dataset, row_groups, predicate):
"""Select the row groups whose partition value is accepted by ``predicate``.

When the predicate reads exactly the dataset's partition names, the partition value of every row group is
passed to ``predicate.do_include`` and the indexes of the accepted row groups are collected.

NOTE(review): the previous summary promised a returned list of row group indexes plus a worker_predicate
(predicates that could not be applied at the parquet partitioning level). The excerpt ends before the
``else`` branch and any return statement - confirm against the complete function.

:param dataset: object exposing ``partitions`` (with ``partition_names`` and indexable partition levels)
:param row_groups: sequence of row group pieces, each exposing ``partition_keys``
:param predicate: a ``PredicateBase`` instance; a falsy value skips the predicate handling shown here
:raises ValueError: if ``predicate`` is truthy but not derived from ``PredicateBase``
"""
if predicate:
if not isinstance(predicate, PredicateBase):
raise ValueError('predicate parameter is expected to be derived from PredicateBase')
predicate_fields = predicate.get_fields()
# Partition-level filtering applies only when the predicate reads exactly the partition names
if set(predicate_fields) == dataset.partitions.partition_names:
# Only the first partition level is consulted below, hence the single-level restriction
assert len(dataset.partitions.partition_names) == 1, \
'Datasets with only a single partition level supported at the moment'
filtered_row_group_indexes = []
for piece_index, piece in enumerate(row_groups):
# First partition key of the piece: (partition name, index into that partition level's keys)
partition_name, partition_index = piece.partition_keys[0]
partition_value = dataset.partitions[0].keys[partition_index]
if predicate.do_include({partition_name: partition_value}):
filtered_row_group_indexes.append(piece_index)
# NOTE(review): presumably nothing is left for the workers once the predicate was applied here - confirm
worker_predicate = None
else:
@abc.abstractmethod
def get_fields(self):
    """ Abstract hook: concrete predicates override this to report the fields they read.

    This placeholder does nothing and returns None.
    """
    return None
@abc.abstractmethod
def do_include(self, values):
    """ Abstract hook: concrete predicates override this to give their verdict for ``values``.

    This placeholder does nothing and returns None.
    """
    return None
def _string_to_bucket(string, bucket_num):
    """ Map ``string`` to a reproducible bucket index.

    The UTF-8 encoded string is hashed with MD5, the hex digest is read as a base-16 integer and
    reduced modulo ``bucket_num``.
    """
    encoded = string.encode('utf-8')
    hex_digest = hashlib.md5(encoded).hexdigest()
    _, bucket = divmod(int(hex_digest, 16), bucket_num)
    return bucket
class in_set(PredicateBase):
    """ Accept a row when the value of predicate_field is one of inclusion_values """

    def __init__(self, inclusion_values, predicate_field):
        """ inclusion_values: iterable of accepted values (stored as a set)
        predicate_field: name of the field whose value is tested
        """
        self._inclusion_values = {value for value in inclusion_values}
        self._predicate_field = predicate_field

    def get_fields(self):
        """ Return a one-element set holding the tested field name. """
        return set([self._predicate_field])

    def do_include(self, values):
        """ Return True when the field's value is a member of the inclusion set. """
        candidate = values[self._predicate_field]
        return candidate in self._inclusion_values
class in_intersection(PredicateBase):
""" Test if predicate_field list contain at least one value from inclusion_values set """
raise ValueError('Predicate fields should be a list')
self._predicate_fields = predicate_fields
self._predicate_func = predicate_func
self._state_arg = state_arg
def get_fields(self):
    """ Return the configured predicate fields as a set. """
    return {field_name for field_name in self._predicate_fields}
def do_include(self, values):
    """ Call the wrapped function with the configured field values.

    Arguments are taken from ``values`` in the order of the predicate fields; the state object is
    appended as the last argument only when it is not None.
    """
    field_args = [values[name] for name in self._predicate_fields]
    state_args = [] if self._state_arg is None else [self._state_arg]
    return self._predicate_func(*(field_args + state_args))
class in_negate(PredicateBase):
    """ Predicate that inverts the verdict of another predicate. """

    def __init__(self, predicate):
        """ predicate: the predicate to invert; must be derived from PredicateBase,
        otherwise ValueError is raised
        """
        is_predicate = isinstance(predicate, PredicateBase)
        if not is_predicate:
            raise ValueError('Predicate is nor derived from PredicateBase')
        self._predicate = predicate

    def get_fields(self):
        """ Return the fields of the wrapped predicate, unchanged. """
        wrapped = self._predicate
        return wrapped.get_fields()

    def do_include(self, values):
        """ Return the logical negation of the wrapped predicate's verdict. """
        if self._predicate.do_include(values):
            return False
        return True
class in_reduce(PredicateBase):
def __init__(self, predicate_list, reduce_func):
    """ predicate_list: iterable of predicates, each derived from PredicateBase
    reduce_func: function to aggregate result of all predicates in the list
    e.g. all() will implements logical 'And', any() implements logical 'Or'

    Raises ValueError if any element of predicate_list is not derived from PredicateBase.
    """
    # Materialize once: a generator would be consumed by the validation below and then be stored
    # exhausted, leaving the predicate with nothing to iterate over later.
    predicates = list(predicate_list)
    if not all(isinstance(p, PredicateBase) for p in predicates):
        raise ValueError('Predicate is nor derived from PredicateBase')
    self._predicate_list = predicates
    self._reduce_func = reduce_func
class in_set(PredicateBase):
    """ Accept a row when the value of predicate_field is one of inclusion_values """

    def __init__(self, inclusion_values, predicate_field):
        """ inclusion_values: iterable of accepted values (stored as a set)
        predicate_field: name of the field whose value is tested
        """
        self._inclusion_values = {value for value in inclusion_values}
        self._predicate_field = predicate_field

    def get_fields(self):
        """ Return a one-element set holding the tested field name. """
        return set([self._predicate_field])

    def do_include(self, values):
        """ Return True when the field's value is a member of the inclusion set. """
        candidate = values[self._predicate_field]
        return candidate in self._inclusion_values
class in_intersection(PredicateBase):
    """ Test if predicate_field list contains at least one value from inclusion_values """

    def __init__(self, inclusion_values, _predicate_field):
        """ inclusion_values: iterable of accepted values (stored as a list)
        _predicate_field: name of the field whose iterable value is tested
        """
        self._inclusion_values = list(inclusion_values)
        self._predicate_field = _predicate_field

    def get_fields(self):
        """ Return a one-element set holding the tested field name. """
        return {self._predicate_field}

    def do_include(self, values):
        """ Return True if any element of the field value is one of the inclusion values.

        Raises ValueError if the field value is not iterable.
        """
        # ``collections.Iterable`` was removed in Python 3.10; the ABC lives in ``collections.abc``.
        # The fallback keeps interpreters without ``collections.abc`` working.
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        field_value = values[self._predicate_field]
        if not isinstance(field_value, Iterable):
            raise ValueError('Predicate field should have iterable type')
        # ``np.isin`` supersedes the deprecated ``np.in1d``; reducing with ``.any()`` looks at every
        # element regardless of the input's dimensionality, matching the flattening ``in1d`` performed.
        return bool(np.isin(field_value, self._inclusion_values).any())
class in_lambda(PredicateBase):