Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for (i, val) in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for (key, v) in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
c for k in conflicts_by_value.values() if len(k) > 1 for c in k
]
raise ValueError("Partition names map to the same value: %s" % conflicts)
vals_by_type = groupby_types(v)
# Check that all partition names map to the same type after
# transformation by val_to_num
if len(vals_by_type) > 1:
examples = [x[0] for x in vals_by_type.values()]
warnings.warn(
"Partition names coerce to values of different"
" types, e.g. %s" % examples
)
return {k: list(v) for k, v in cats.items()}
"""Extract out fields and labels from directory names"""
# can be factored out in fastparquet
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for (key, val) in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for (i, val) in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for (key, v) in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
c for k in conflicts_by_value.values() if len(k) > 1 for c in k
]
raise ValueError("Partition names map to the same value: %s" % conflicts)
vals_by_type = groupby_types(v)
from fastparquet.util import ex_from_sep, val_to_num, groupby_types
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for key, val in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for i, val in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for key, v in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
c for k in conflicts_by_value.values() if len(k) > 1 for c in k
]
raise ValueError("Partition names map to the same value: %s" % conflicts)
vals_by_type = groupby_types(v)
def _paths_to_cats(paths, scheme):
"""Extract out fields and labels from directory names"""
# can be factored out in fastparquet
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for (key, val) in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for (i, val) in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for (key, v) in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
def _paths_to_cats(paths, scheme):
"""Extract out fields and labels from directory names"""
# can be factored out in fastparquet
from fastparquet.util import ex_from_sep, val_to_num, groupby_types
cats = OrderedDict()
raw_cats = OrderedDict()
for path in paths:
s = ex_from_sep("/")
if scheme == "hive":
partitions = s.findall(path)
for key, val in partitions:
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for i, val in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for key, v in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
else:
for i, val in enumerate(path.split("/")[:-1]):
key = "dir%i" % i
cats.setdefault(key, set()).add(val_to_num(val))
raw_cats.setdefault(key, set()).add(val)
for key, v in cats.items():
# Check that no partition names map to the same value after
# transformation by val_to_num
raw = raw_cats[key]
if len(v) != len(raw):
conflicts_by_value = OrderedDict()
for raw_val in raw_cats[key]:
conflicts_by_value.setdefault(val_to_num(raw_val), set()).add(raw_val)
conflicts = [
c for k in conflicts_by_value.values() if len(k) > 1 for c in k
]
raise ValueError("Partition names map to the same value: %s" % conflicts)
vals_by_type = groupby_types(v)
# Check that all partition names map to the same type after
# transformation by val_to_num
if len(vals_by_type) > 1:
import warnings
examples = [x[0] for x in vals_by_type.values()]
warnings.warn(
"Partition names coerce to values of different"
" types, e.g. %s" % examples
)