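# Shared imports for the snippets below. Helpers such as grid_formatter,
# grid_columns, dict_merge, make_list, get_dtypes, find_dtype_formatter,
# sort_df_for_grid, filter_df_for_grid, retrieve_grid_params, get_str_arg,
# format_data and build_dtypes_state come from the surrounding module's
# utilities and are not re-implemented here; DATA, DTYPES and SETTINGS are the
# module's per-dataframe caches keyed by data_id.
import json
import traceback

import numpy as np
import pandas as pd
from flask import jsonify, request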
def load_describe(column_series, additional_aggs=None):
    """
    Load descriptive statistics for a column of data.

    :param column_series: column data to describe
    :type column_series: :class:`pandas:pandas.Series`
    :param additional_aggs: optional list of additional aggregation names to compute (e.g. 'sum', 'mode')
    :return: JSON serializable dictionary of the output from calling :meth:`pandas:pandas.Series.describe`
    """
    desc = column_series.describe().to_frame().T
    if additional_aggs:
        for agg in additional_aggs:
            if agg == 'mode':
                # only report mode when it is unambiguous (a single modal value)
                mode = column_series.mode().values
                desc['mode'] = np.nan if len(mode) > 1 else mode[0]
                continue
            desc[agg] = getattr(column_series, agg)()
    desc_f_overrides = {
        'I': lambda f, i, c: f.add_int(i, c, as_string=True),
        'F': lambda f, i, c: f.add_float(i, c, precision=4, as_string=True),
    }
    desc_f = grid_formatter(grid_columns(desc), nan_display='N/A', overrides=desc_f_overrides)
    desc = desc_f.format_dict(next(desc.itertuples(), None))
    if 'count' in desc:
        # pandas always returns 'count' as a float, which adds useless decimal points
        desc['count'] = desc['count'].split('.')[0]
    return desc
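
# A minimal, standalone illustration of the pattern above: extending the output
# of pandas.Series.describe() with extra aggregations (toy data; the grid
# formatting helpers are left out).
s = pd.Series([1, 1, 2, 3])
desc = s.describe().to_frame().T
desc['sum'] = s.sum()
mode = s.mode().values
desc['mode'] = np.nan if len(mode) > 1 else mode[0]  # only keep an unambiguous mode
print(desc[['count', 'mean', 'sum', 'mode']])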

# snippet from the correlations scatter endpoint: s0/s1 are the two columns
# being correlated and data is the frame containing both (the dangling except
# below implies the enclosing try, restored here)
try:
    pearson = s0.corr(s1, method='pearson')
    spearman = s0.corr(s1, method='spearman')
    stats = dict(
        pearson='N/A' if pd.isnull(pearson) else pearson,
        spearman='N/A' if pd.isnull(spearman) else spearman,
        correlated=len(data),
        only_in_s0=len(data[data[cols[0]].isnull()]),
        only_in_s1=len(data[data[cols[1]].isnull()])
    )
    if len(data) > 15000:
        return jsonify(
            stats=stats,
            error='Dataset exceeds 15,000 records, cannot render scatter. Please apply filter...'
        )
    f = grid_formatter(grid_columns(data))
    data = f.format_dicts(data.itertuples())
    return jsonify(data=data, x=cols[0], y=cols[1], stats=stats)
except BaseException as e:
    return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
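
# Quick, self-contained illustration of the two correlation measures computed
# above (made-up series): pearson captures linear association, spearman
# captures rank (monotonic) association.
a = pd.Series([1.0, 2.0, 3.0, 4.0])
b = pd.Series([1.0, 4.0, 9.0, 16.0])
a.corr(b, method='pearson')   # ~0.98: strong, but the relationship is not linear
a.corr(b, method='spearman')  # 1.0: perfectly monotonic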

# snippet from the main grid-data route: data is the cached dataframe for
# data_id. rebuild the dtype state when columns have been added by mutating the
# state of the dataframe (EX: d.data['new_col'] = 'foo')
curr_dtypes = [c['name'] for c in DTYPES[data_id]]
if any(c not in curr_dtypes for c in data.columns):
    data, _ = format_data(data)
    DATA[data_id] = data
    DTYPES[data_id] = build_dtypes_state(data)
params = retrieve_grid_params(request)
ids = get_str_arg(request, 'ids')
if ids:
    ids = json.loads(ids)
else:
    return jsonify({})
col_types = DTYPES[data_id]
f = grid_formatter(col_types)
curr_settings = SETTINGS.get(data_id, {})
if curr_settings.get('sort') != params.get('sort'):
    data = sort_df_for_grid(data, params)
    DATA[data_id] = data
if params.get('sort') is not None:
    curr_settings = dict_merge(curr_settings, dict(sort=params['sort']))
else:
    curr_settings = {k: v for k, v in curr_settings.items() if k != 'sort'}
data = filter_df_for_grid(data, params)
if params.get('query') is not None:
    curr_settings = dict_merge(curr_settings, dict(query=params['query']))
else:
    curr_settings = {k: v for k, v in curr_settings.items() if k != 'query'}
SETTINGS[data_id] = curr_settings
total = len(data)
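
# The sort/query bookkeeping above relies on dict_merge; a hypothetical
# stand-in with the assumed semantics (shallow merge, later values win):
def _dict_merge(d1, d2):
    merged = dict(d1 or {})
    merged.update(d2 or {})
    return merged

# _dict_merge({'sort': [['a', 'ASC']]}, {'query': 'b > 1'})
# -> {'sort': [['a', 'ASC']], 'query': 'b > 1'}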

# snippet from the correlations endpoint. first, while looping over the date
# columns, keep only those usable as a timeseries axis (at least two distinct
# dates must each appear more than once):
date_counts = data[name].dropna().value_counts()
if len(date_counts[date_counts > 1]) > 1:
    valid_date_cols.append(name)

# then build the correlation matrix itself
try:
    if data[valid_corr_cols].isnull().values.any():
        data = data.corr(method='pearson')
    else:
        # using pandas.corr proved to be quite slow on large datasets so I moved to numpy:
        # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow
        data = np.corrcoef(data[valid_corr_cols].values, rowvar=False)
        data = pd.DataFrame(data, columns=valid_corr_cols, index=valid_corr_cols)
    data.index.name = str('column')
    data = data.reset_index()
    col_types = grid_columns(data)
    f = grid_formatter(col_types, nan_display=None)
    return jsonify(data=f.format_dicts(data.itertuples()), dates=valid_date_cols)
except BaseException as e:
    return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
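
# Sanity check for the numpy shortcut above: on NaN-free input np.corrcoef
# matches DataFrame.corr(method='pearson') (made-up frame):
df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0], 'b': [2.0, 4.0, 7.0, 8.0]})
fast = pd.DataFrame(np.corrcoef(df.values, rowvar=False), columns=df.columns, index=df.columns)
assert np.allclose(fast.values, df.corr(method='pearson').values)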

# chart-building helper: x/y are source column names, group_col an optional
# list of grouping columns, agg an optional pandas aggregation name
def build_chart(data, x, y, group_col=None, agg=None):
    x_col, y_col = str('x'), str('y')
    if group_col is not None:
        data = data[group_col + [x, y]].sort_values(group_col + [x])
        data.columns = group_col + [x_col, y_col]
        if agg is not None:
            data = data.groupby(group_col + [x_col])
            data = getattr(data, agg)().reset_index()
        max_groups = 15
        if len(data[group_col].drop_duplicates()) > max_groups:
            msg = (
                'Group ({}) contains more than {} unique values, please add an additional filter'
                ' or else the chart will be unreadable'
            ).format(', '.join(group_col), max_groups)
            raise Exception(msg)
        f = grid_formatter(
            grid_columns(data[[x_col, y_col]]), overrides={'D': lambda f, i, c: f.add_timestamp(i, c)}, nan_display=None
        )
        y_fmt = next((fmt for _, name, fmt in f.fmts if name == y_col), None)
        ret_data = dict(data={}, min=y_fmt(data[y_col].min(), None), max=y_fmt(data[y_col].max(), None))
        dtypes = get_dtypes(data)
        group_fmts = {c: find_dtype_formatter(dtypes[c]) for c in group_col}
        for group_val, grp in data.groupby(group_col):
            # group labels are the formatted group values joined with '/'
            group_val = '/'.join([
                group_fmts[gc](gv) for gv, gc in zip(make_list(group_val), group_col)
            ])
            ret_data['data'][group_val] = f.format_lists(grp)
        return ret_data
    data = data[[x, y]].sort_values(x)
    data.columns = [x_col, y_col]
    if agg is not None:
        data = data.groupby(x_col)
        data = getattr(data, agg)().reset_index()
    if any(data[x_col].duplicated()):
        raise Exception('{} contains duplicates, please specify a group or additional filtering'.format(x))
    f = grid_formatter(
        grid_columns(data), overrides={'D': lambda f, i, c: f.add_timestamp(i, c)}, nan_display=None
    )
    y_fmt = next((fmt for _, name, fmt in f.fmts if name == y_col), None)
    ret_data = dict(
        data={str('all'): f.format_lists(data)},
        min=y_fmt(data[y_col].min(), None),
        max=y_fmt(data[y_col].max(), None)
    )
    return ret_data
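
# Toy walk-through of the grouped branch above: one series is emitted per
# group, keyed by the group values joined with '/'.
df = pd.DataFrame({'grp': ['a', 'a', 'b', 'b'], 'x': [1, 2, 1, 2], 'y': [2.0, 4.0, 3.0, 5.0]})
for group_val, grp in df.groupby(['grp']):
    # np.atleast_1d normalizes scalar vs tuple group keys across pandas versions
    label = '/'.join(str(gv) for gv in np.atleast_1d(group_val))
    print(label, grp[['x', 'y']].values.tolist())
# a [[1.0, 2.0], [2.0, 4.0]]
# b [[1.0, 3.0], [2.0, 5.0]]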