# Excerpt: correlation time-series handler (tail of its docstring shown).
    :return: JSON {
        data: {'col1:col2': {data: [{corr: 0.99, date: 'YYYY-MM-DD'}, ...], max: 0.99, min: 0.99}}
    } or {error: 'Exception message', traceback: 'Exception stacktrace'}
    """
    try:
        query = get_str_arg(request, 'query')
        data = DATA[data_id]
        data = data.query(query) if query is not None else data
        cols = get_str_arg(request, 'cols')
        cols = cols.split(',')
        date_col = get_str_arg(request, 'dateCol')
        # pairwise Pearson correlations of the selected columns within each date
        # group; the result carries a (date, column) MultiIndex
        data = data.groupby(date_col)[list(set(cols))].corr(method='pearson')
        data.index.names = ['date', 'column']
        data = data.reset_index()
        # keep the rows for cols[0] and the column for cols[1]: one corr value per date
        data = data[data.column == cols[0]][['date', cols[1]]]
        data.columns = ['date', 'corr']
        return jsonify(build_chart(data, 'date', 'corr'))
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
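
# Standalone sketch (toy data; column names 'a'/'b' are assumptions): the
# groupby/corr reshape above is easy to misread. groupby(date).corr() yields a
# (date, column) MultiIndex frame; keeping the rows for cols[0] and the column
# for cols[1] leaves exactly one correlation value per date.
import pandas as pd

_df = pd.DataFrame({
    'date': ['2020-01-01'] * 3 + ['2020-01-02'] * 3,
    'a': [1.0, 2.0, 3.0, 1.0, 2.0, 4.0],
    'b': [1.1, 2.2, 2.9, 4.0, 2.0, 1.0],
})
_corrs = _df.groupby('date')[['a', 'b']].corr(method='pearson')
_corrs.index.names = ['date', 'column']
_corrs = _corrs.reset_index()
_ts = _corrs[_corrs.column == 'a'][['date', 'b']]
_ts.columns = ['date', 'corr']  # one Pearson corr(a, b) per date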

# Excerpt: chart-data handler (function signature not shown; the leading try:
# is restored here to match the dangling except below).
    try:
        query = get_str_arg(request, 'query')
        data = DATA[data_id]
        if query:
            try:
                data = data.query(query)
            except BaseException as e:
                return jsonify(dict(error='Invalid query: {}'.format(str(e))))
            if not len(data):
                return jsonify(dict(error='query "{}" found no data, please alter'.format(query)))
        x = get_str_arg(request, 'x')
        y = get_str_arg(request, 'y')
        group_col = get_str_arg(request, 'group')
        if group_col is not None:
            group_col = group_col.split(',')  # 'group' arrives as a comma-separated list
        agg = get_str_arg(request, 'agg')
        return jsonify(build_chart(data, x, y, group_col, agg))
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
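
# Standalone sketch of why the inner try/except around data.query is needed:
# pandas raises on malformed expressions, and the error string is surfaced to
# the caller as 'Invalid query: ...'. The toy frame below is an assumption.
import pandas as pd

_df = pd.DataFrame({'x': [1, 2, 3]})
try:
    _df.query('x >')  # malformed expression, raises
except BaseException as e:
    print('Invalid query: {}'.format(str(e)))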

# Excerpt: data-loading/pagination handler. `data`, `ids` (a list of row-range
# strings) and `f` (a grid formatter) are defined earlier in the function; the
# leading try: is restored to match the dangling except below.
    try:
        total = len(data)
        results = {}
        for sub_range in ids:
            # each entry is either a single index ('5') or an inclusive range ('0-20')
            sub_range = list(map(int, sub_range.split('-')))
            if len(sub_range) == 1:
                sub_df = data.iloc[sub_range[0]:sub_range[0] + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                results[sub_range[0]] = dict_merge({IDX_COL: sub_range[0]}, sub_df[0])
            else:
                [start, end] = sub_range
                sub_df = data.iloc[start:] if end >= len(data) - 1 else data.iloc[start:end + 1]
                sub_df = f.format_dicts(sub_df.itertuples())
                for i, d in zip(range(start, end + 1), sub_df):
                    results[i] = dict_merge({IDX_COL: i}, d)
        return_data = dict(
            results=results,
            columns=[dict(name=IDX_COL, dtype='int64')] + DTYPES[data_id],
            total=total
        )
        return jsonify(return_data)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
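
# Standalone sketch (assumed inputs) of the `ids` format parsed above: each
# entry is either a single row index ('5') or an inclusive range ('0-2'), and
# .iloc's exclusive end is bumped by one to honor the inclusive bound.
import pandas as pd

_data = pd.DataFrame({'a': range(10)})
for _sub_range in ['0-2', '5']:
    _bounds = list(map(int, _sub_range.split('-')))
    if len(_bounds) == 1:
        print(_data.iloc[_bounds[0]:_bounds[0] + 1])  # single row
    else:
        _start, _end = _bounds
        print(_data.iloc[_start:_end + 1])  # rows start..end inclusive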

# Excerpt: scatter handler. `pearson`, `spearman`, `cols` and the merged
# `data` frame are computed earlier in the function; the leading try: is
# restored to match the dangling except below.
    try:
        stats = dict(
            pearson='N/A' if pd.isnull(pearson) else pearson,
            spearman='N/A' if pd.isnull(spearman) else spearman,
            correlated=len(data),
            only_in_s0=len(data[data[cols[0]].isnull()]),
            only_in_s1=len(data[data[cols[1]].isnull()])
        )
        # guard against rendering overly large scatter plots
        if len(data) > 15000:
            return jsonify(
                stats=stats,
                error='Dataset exceeds 15,000 records, cannot render scatter. Please apply filter...'
            )
        f = grid_formatter(grid_columns(data))
        data = f.format_dicts(data.itertuples())
        return jsonify(data=data, x=cols[0], y=cols[1], stats=stats)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
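
# The `pearson`/`spearman` values come from earlier in the enclosing function
# (not shown in this excerpt). A minimal sketch of equivalent calculations on
# two series (names _s0/_s1 are assumptions):
import pandas as pd

_s0 = pd.Series([1.0, 2.0, 3.0, 4.0])
_s1 = pd.Series([1.2, 1.9, 3.1, 3.8])
print(_s0.corr(_s1, method='pearson'))
print(_s0.corr(_s1, method='spearman'))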

# Excerpt: histogram handler.
    col = get_str_arg(request, 'col', 'values')
    query = get_str_arg(request, 'query')
    bins = get_int_arg(request, 'bins', 20)
    try:
        data = DATA[data_id]
        if query:
            data = data.query(query)
        selected_col = find_selected_column(data, col)
        # drop nulls before binning
        data = data[~pd.isnull(data[selected_col])][[selected_col]]
        hist = np.histogram(data, bins=bins)
        desc = load_describe(data[selected_col])
        return jsonify(
            data=[json_float(h) for h in hist[0]],
            labels=['{0:.1f}'.format(l) for l in hist[1]],
            desc=desc
        )
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
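
# Note on shapes: np.histogram returns (counts, bin_edges) and the edges array
# has bins + 1 entries, so the 'labels' list above is one element longer than
# 'data'. Quick standalone check:
import numpy as np

_counts, _edges = np.histogram([1, 2, 2, 3, 9], bins=4)
print(len(_counts), len(_edges))  # 4 5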

# Excerpt: describe handler. `data`, `column`, `uniq_vals` and `return_data`
# are built earlier in the function; the leading try: is restored to match
# the dangling except below.
    try:
        return_data['describe']['unique'] = json_int(len(uniq_vals), as_string=True)
        if len(uniq_vals) <= 100:
            uniq_f = find_dtype_formatter(get_dtypes(data)[column])
            return_data['uniques'] = dict(
                data=[uniq_f(u, nan_display='N/A') for u in uniq_vals],
                top=False
            )
        else:  # get top 100 most common values
            uniq_vals = data[column].value_counts().sort_values(ascending=False).head(100).index.values
            uniq_f = find_dtype_formatter(get_dtypes(data)[column])
            return_data['uniques'] = dict(
                data=[uniq_f(u, nan_display='N/A') for u in uniq_vals],
                top=True
            )
        return jsonify(return_data)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
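
# Standalone sketch of the top-100 branch above: value_counts() already sorts
# descending, so the extra sort_values(ascending=False) is effectively a no-op
# kept for explicitness.
import pandas as pd

_s = pd.Series(list('aabbbc'))
print(_s.value_counts().head(100).index.values)  # ['b' 'a' 'c'], most common first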

# Excerpt: per-dataset summary helper; the def line is restored from the call
# below (sorted([_load_process(data_id) for data_id in DATA], ...)).
def _load_process(data_id):
    data = DATA[data_id]
    dtypes = DTYPES[data_id]
    mdata = METADATA[data_id]
    return dict(
        data_id=data_id,
        rows=len(data),
        columns=len(dtypes),
        names=','.join([c['name'] for c in dtypes]),
        start=json_date(mdata['start']),
        ts=json_timestamp(mdata['start']),
        name=mdata['name']
    )


# Excerpt: process-listing handler (function signature not shown).
    try:
        # summarize every loaded dataset, oldest first
        processes = sorted([_load_process(data_id) for data_id in DATA], key=lambda p: p['ts'])
        return jsonify(dict(data=processes, success=True))
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))

# Excerpt: correlations-matrix handler. The first two lines below sit inside a
# loop (header not shown) over candidate date columns, where `date_counts`
# appears to hold per-value counts for each column; the leading try: is
# restored to match the dangling except below.
    try:
        if len(date_counts[date_counts > 1]) > 1:
            valid_date_cols.append(name)
        # np.corrcoef has no NaN handling, so fall back to pandas (pairwise-
        # complete observations) whenever nulls are present
        if data[valid_corr_cols].isnull().values.any():
            data = data.corr(method='pearson')
        else:
            # using pandas.corr proved to be quite slow on large datasets so I moved to numpy:
            # https://stackoverflow.com/questions/48270953/pandas-corr-and-corrwith-very-slow
            data = np.corrcoef(data[valid_corr_cols].values, rowvar=False)
            data = pd.DataFrame(data, columns=valid_corr_cols, index=valid_corr_cols)
        data.index.name = str('column')
        data = data.reset_index()
        col_types = grid_columns(data)
        f = grid_formatter(col_types, nan_display=None)
        return jsonify(data=f.format_dicts(data.itertuples()), dates=valid_date_cols)
    except BaseException as e:
        return jsonify(dict(error=str(e), traceback=str(traceback.format_exc())))
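
# Standalone sketch of the numpy fast path above: on NaN-free data,
# np.corrcoef(..., rowvar=False) matches DataFrame.corr(method='pearson');
# since np.corrcoef cannot skip NaNs, the excerpt falls back to pandas when
# any nulls are present. Random toy data below is an assumption.
import numpy as np
import pandas as pd

_df = pd.DataFrame(np.random.rand(500, 3), columns=['a', 'b', 'c'])
_fast = pd.DataFrame(np.corrcoef(_df.values, rowvar=False),
                     columns=_df.columns, index=_df.columns)
assert np.allclose(_fast, _df.corr(method='pearson'))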