from collections import OrderedDict
import warnings


def _paths_to_cats(paths, scheme):
    """Extract out fields and labels from directory names"""
    # can be factored out in fastparquet
    from fastparquet.util import ex_from_sep, val_to_num, groupby_types

    cats = OrderedDict()
    raw_cats = OrderedDict()
    for path in paths:
        s = ex_from_sep("/")
        if scheme == "hive":
            partitions = s.findall(path)
            for key, val in partitions:
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)
        else:
            for i, val in enumerate(path.split("/")[:-1]):
                key = "dir%i" % i
                cats.setdefault(key, set()).add(val_to_num(val))
                raw_cats.setdefault(key, set()).add(val)

    for key, v in cats.items():
        # Check that no partition names map to the same value after
        # transformation by val_to_num
        raw = raw_cats[key]
        if len(v) != len(raw):
            conflicts_by_value = OrderedDict()
            for raw_val in raw_cats[key]:
                conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
            conflicts = [
                c for k in conflicts_by_value.values() if len(k) > 1 for c in k
            ]
            raise ValueError("Partition names map to the same value: %s" % conflicts)

        vals_by_type = groupby_types(v)
        # Check that all partition names map to the same type after
        # transformation by val_to_num
        if len(vals_by_type) > 1:
            examples = [x[0] for x in vals_by_type.values()]
            warnings.warn(
                "Partition names coerce to values of different"
                " types, e.g. %s" % examples
            )

    return {k: list(v) for k, v in cats.items()}
"""Extract out fields and labels from directory names"""
# can be factored out in fastparquet
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for (key, val) in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for (i, val) in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for (key, v) in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
c for k in conflicts_by_value.values() if len(k) > 1 for c in k
]
raise ValueError("Partition names map to the same value: %s" % conflicts)
vals_by_type = groupby_types(v)
from fastparquet.util import ex_from_sep, val_to_num, groupby_types
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for key, val in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for i, val in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for key, v in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
c for k in conflicts_by_value.values() if len(k) > 1 for c in k
]
raise ValueError("Partition names map to the same value: %s" % conflicts)
vals_by_type = groupby_types(v)
def _paths_to_cats(paths, scheme):
"""Extract out fields and labels from directory names"""
# can be factored out in fastparquet
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for (key, val) in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for (i, val) in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for (key, v) in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
def _paths_to_cats(paths, scheme):
"""Extract out fields and labels from directory names"""
# can be factored out in fastparquet
from fastparquet.util import ex_from_sep, val_to_num, groupby_types
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for key, val in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for i, val in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for key, v in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for i, val in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for key, v in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
c for k in conflicts_by_value.values() if len(k) > 1 for c in k
]
raise ValueError("Partition names map to the same value: %s" % conflicts)
vals_by_type = groupby_types(v)
# Check that all partition names map to the same type after
# transformation by val_to_num
if len(vals_by_type) > 1:
import warnings
examples = [x[0] for x in vals_by_type.values()]
warnings.warn(
"Partition names coerce to values of different"
" types, e.g. %s" % examples
)
def _paths_to_cats(paths, scheme):
"""Extract out fields and labels from directory names"""
# can be factored out in fastparquet
from fastparquet.util import ex_from_sep, val_to_num, groupby_types
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for key, val in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for i, val in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for key, v in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
def _paths_to_cats(paths, scheme):
"""Extract out fields and labels from directory names"""
# can be factored out in fastparquet
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for (key, val) in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for (i, val) in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for (key, v) in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
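# A minimal usage sketch of _paths_to_cats (illustrative paths, not from the
# source): the "hive" scheme parses key=value directory levels; any other
# scheme numbers the levels dir0, dir1, ...
paths = [
    "year=2019/month=1/part.0.parquet",
    "year=2019/month=2/part.0.parquet",
    "year=2020/month=1/part.0.parquet",
]
print(_paths_to_cats(paths, "hive"))
# -> {'year': [2019, 2020], 'month': [1, 2]} (order within lists may vary)
# Conflicting names such as "month=1" and "month=01" both coerce to 1 via
# val_to_num and would raise the ValueError above.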
# OPTIONAL: load pyarrow. The opening lines of this snippet were truncated in
# the source; the assignment and codec parsing below are a reconstruction that
# scrapes the supported codecs out of pyarrow.compress's docstring.
try:
    import pyarrow
    arrow_exceptions = (pyarrow.lib.ArrowException,)  # assumed; original truncated
    _pyarrow = [
        [
            {"engine": "pyarrow", "compression": y.strip(" '")}
            for y in x.split(":")[-1].split(",")
            if y
        ]
        for x in pyarrow.compress.__doc__.split("\n")
        if "upported types" in x
    ][0]
except ImportError:
    arrow_exceptions = ()
    _pyarrow = []
# OPTIONAL: load fastparquet
try:
    from fastparquet.compression import compressions
    # BROTLI IS BUGGED!
    _fastparquet_opts = [
        {"engine": "fastparquet", "compression": x}
        for x in compressions.keys()
        if x != "BROTLI"
    ]
except ImportError:
    _fastparquet_opts = []
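# Hypothetical sketch of how the option lists above might be consumed: write
# the same frame once per available engine/compression pair. The frame and
# file-name pattern are placeholders, not from the source.
import pandas as pd

df = pd.DataFrame({"a": range(1000)})
for opts in _pyarrow + _fastparquet_opts:
    df.to_parquet("bench-%(engine)s-%(compression)s.parquet" % opts, **opts)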
import math
from collections import Counter

import numpy as np


def estimate_uniqueness_proportion(df, col, r=10000):
    """Estimate the fraction of distinct values in df[col] from a random
    sample of r rows: values seen exactly once in the sample are scaled up
    by sqrt(n / r); values seen more than once are counted once each."""
    n = df.shape[0]
    sample = df[col][np.random.randint(0, n, r)]
    counts = sample.value_counts()
    fis = Counter(counts)  # frequency-of-frequencies of the sampled values
    estimate = math.sqrt(n / r) * fis[1] + sum(fis[x] for x in fis if x > 1)
    return estimate / n
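# Illustrative check of the estimator against the exact answer (synthetic
# frame; the column name is a placeholder):
frame = pd.DataFrame({"city": np.random.choice(["NY", "LA", "SF"], size=100_000)})
print(estimate_uniqueness_proportion(frame, "city", r=5_000))  # estimated
print(frame["city"].nunique() / len(frame))                    # exact: 3e-05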
# Build one alternation regex from the string entries of keep_vars
# (assumed imports: re, pyarrow as pa, pyarrow.parquet as pq,
# fastparquet as fp, and Dict/List from typing)
toload_regex = []
for keep_var in keep_vars:
    if isinstance(keep_var, str):
        toload_regex.append(r'^({})$'.format(keep_var))
toload_regex = re.compile('|'.join(toload_regex)).search

toload_vars: Dict[int, List[str]] = {}
for year in self.years:
    if self.parquet_engine == 'pyarrow':
        try:
            pf = pq.ParquetFile(self._fpath(self.percent, year, 'bsfab'))
        except pa.ArrowIOError:
            pf = pq.ParquetDataset(self._fpath(self.percent, year, 'bsfab'))
        cols = pf.schema.names
    elif self.parquet_engine == 'fastparquet':
        pf = fp.ParquetFile(self._fpath(self.percent, year, 'bsfab'))
        cols = pf.columns

    toload_vars[year] = [x for x in cols if toload_regex(x)]
    for keep_var in keep_vars:
        if isinstance(keep_var, re.Pattern):  # re._pattern_type was removed in Python 3.7
            toload_vars[year].extend([
                x for x in cols if keep_var.search(x)])

    # Deduplicate while preserving order
    toload_vars[year] = list(dict.fromkeys(toload_vars[year]))

    # Check cols against keep_vars:
    # is there an item in keep_vars that wasn't matched?
    # NOTE need to check this against regex values of keep_vars
    for var in keep_vars:
        if not [x for x in toload_vars[year] if re.search(var, x)]:
            # The source snippet is truncated here; raising is an assumption
            raise ValueError(f'{var} not found in columns for year {year}')
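# Illustration of the alternation regex built above, on made-up column names:
import re

search = re.compile('|'.join([r'^(bene_id)$', r'^(age)$'])).search
cols = ['bene_id', 'age', 'sex', 'state_cd']
print([x for x in cols if search(x)])  # -> ['bene_id', 'age']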