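# Per-row-group statistics gathering (fastparquet engine). For each column we
# record min/max/null_count; a column whose min/max is None in the very first
# row group has no usable statistics, so it is skipped for all row groups.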
            if None in [cs_min, cs_max] and i == 0:
                skip_cols.add(col)
                continue
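            # fastparquet reports datetime statistics as raw numpy.datetime64
            # values; normalize them to pandas Timestamps so they match the
            # pandas dtypes used downstream.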
            if isinstance(cs_min, np.datetime64):
                cs_min = pd.Timestamp(cs_min)
                cs_max = pd.Timestamp(cs_max)
            d.update(
                {
                    "min": cs_min,
                    "max": cs_max,
                    "null_count": pf.statistics["null_count"][col][i],
                }
            )
            s["columns"].append(d)
        # Need this to filter out partitioned-on categorical columns
        s["filter"] = fastparquet.api.filter_out_cats(row_group, filters)
        s["total_byte_size"] = row_group.total_byte_size
        s["file_path_0"] = row_group.columns[0].file_path  # 0th column only
        stats.append(s)
else:
    stats = None
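# With gather_statistics disabled there is nothing to prune on, so dask can
# neither filter row groups nor infer divisions from the metadata.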
pf._dtypes = lambda *args: pf.dtypes # ugly patch, could be fixed
pf.fmd.row_groups = None
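# _dtypes is normally a method taking `categories`; patching it to return the
# precomputed pf.dtypes sidesteps that lookup. Clearing fmd.row_groups
# (presumably) keeps the metadata object small, since each task below only
# needs its own row group.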
# Create `parts`
# This is a list of row-group-descriptor dicts, or file-paths
# if we have a list of files and gather_statistics=False
if not parts:
    partsin = pf.row_groups
    if fast_metadata:
        (
            meta,
            filters,
            index_name,
            out_type,
            all_columns,
            index_names,
            storage_name_mapping,
        ) = _pf_validation(pf, columns, index, categories, filters)
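        # _pf_validation reconciles the requested columns/index/categories
        # against the file schema and hands back what is needed to build the
        # task graph (dask's empty `meta` frame, normalized filters, and
        # column-name mappings).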
        rgs = [
            rg
            for rg in pf.row_groups
            if not fastparquet.api.filter_out_stats(rg, filters, pf.schema)
            and not fastparquet.api.filter_out_cats(rg, filters)
        ]
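        # Row-group pruning: filter_out_stats drops row groups whose column
        # min/max statistics cannot satisfy the filters; filter_out_cats does
        # the same using partitioned-on (categorical) values.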
name = "read-parquet-" + tokenize(fs_token, paths, all_columns, filters, categories)
        dsk = {
            (name, i): (
                _read_parquet_row_group,
                fs,
                pf.row_group_filename(rg),
                index_names,
                all_columns,
                rg,
                out_type == Series,
                categories,
                pf.schema,
                pf.cats,
                pf.dtypes,