Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
import os
import modin.pandas as pd
from utils import time_logger
import numpy as np
# Command-line interface: the benchmark takes the CSV file to load and the
# file that logging output is written to.
parser = argparse.ArgumentParser(description="arithmetic benchmark")
parser.add_argument("--path", dest="path", help="path to the csv data file")
parser.add_argument("--logfile", dest="logfile", help="path to the log file")
args = parser.parse_args()

file = args.path
# Size of the input on disk, in bytes.
file_size = os.path.getsize(file)

logging.basicConfig(filename=args.logfile, level=logging.INFO)

df = pd.read_csv(file)

# Wait on every partition object of the frame: num_returns equals the number
# of blocks. Presumably this keeps the CSV load out of later timings --
# TODO confirm. num_returns is passed by keyword because newer Ray releases
# declare it keyword-only; the keyword form is accepted by older ones too.
blocks = df._block_partitions.flatten().tolist()
ray.wait(blocks, num_returns=len(blocks))

num_rows, num_cols = df.shape

# Random integer data in [0, 100): one value per column for a row, and one
# value per row for a column.
new_row = np.random.randint(0, 100, size=num_cols)
new_col = np.random.randint(0, 100, size=num_rows)
def rand_row_loc():
    """Return a random integer row position drawn from ``[0, num_rows)``.

    Reads the module-level ``num_rows`` at call time.
    """
    return np.random.randint(0, num_rows)
def rand_col_loc():
    """Return a random integer column position drawn from ``[0, num_cols)``.

    Reads the module-level ``num_cols`` at call time.
    """
    return np.random.randint(0, num_cols)
from __future__ import print_function
import modin.pandas as pd
import numpy as np
import os
# Shapes of the generated test files: every row count is combined with every
# column count.
num_rows = [100, 10000, 100000, 150000, 200000, 350000, 500000]
num_cols = [1000]

path_to_data = "benchmarks/data/"
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without a separate (race-prone) existence check.
os.makedirs(path_to_data, exist_ok=True)

for r in num_rows:
    for c in num_cols:
        # r x c frame of random integers in [0, 100), written as
        # test-data-<rows>-<cols>.csv.
        df = pd.DataFrame(np.random.randint(0, 100, size=(r, c)))
        df.to_csv(path_to_data + "test-data-{}-{}.csv".format(r, c))
# Files for multi df tests
num_rows = [100, 1000, 100000, 1000000]
num_cols = [1000]

path_to_data = "benchmarks/data/multi/"
# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without a separate (race-prone) existence check.
os.makedirs(path_to_data, exist_ok=True)

for r in num_rows:
    for c in num_cols:
        # r x c frame of random integers in [0, 100), written as
        # test-data-<rows>-<cols>.csv.
        df = pd.DataFrame(np.random.randint(0, 100, size=(r, c)))
        df.to_csv(path_to_data + "test-data-{}-{}.csv".format(r, c))
def __init__(self, modin_engine=None):
    """Set up the model on top of the module-wide modin.pandas import.

    The first construction (while ``MODIN_PANDAS`` is ``None``) records
    ``modin_engine`` in the ``MODIN_ENGINE`` global, exports it through the
    ``MODIN_ENGINE`` environment variable, imports ``modin.pandas`` and
    stores the module in ``MODIN_PANDAS``. Later constructions reuse it.

    :param modin_engine: engine name; required on the first construction,
        afterwards it may be omitted or must equal the recorded engine.
    :raises ValueError: if no engine is given on the first construction, or
        if a different engine is requested once one has been recorded.
    """
    # can't change engine, so track it as a global
    # https://github.com/modin-project/modin
    global MODIN_ENGINE
    global MODIN_PANDAS
    if MODIN_PANDAS is None:
        if modin_engine is None:
            raise ValueError("modin_engine not set")
        MODIN_ENGINE = modin_engine
        # https://github.com/modin-project/modin
        # The variable is exported before the import below.
        os.environ["MODIN_ENGINE"] = MODIN_ENGINE
        import modin.pandas

        MODIN_PANDAS = modin.pandas
    elif (modin_engine is not None) and (modin_engine != MODIN_ENGINE):
        # Formatted rather than concatenated so that a non-string argument
        # still produces this ValueError instead of a TypeError.
        raise ValueError(
            f"MODIN_ENGINE already set to {MODIN_ENGINE}"
            f", and called with modin_engine=={modin_engine}"
        )
    data_algebra.eval_model.EvalModel.__init__(self)
    self.impl = PandasModelBase(pd=MODIN_PANDAS,
                                presentation_model_name='modin')
df.iloc[:, rand_col_loc()] = new_col
# Each step below runs inside time_logger, labelled with the input file and
# its size in bytes.
with time_logger("write a row: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[rand_row_loc(), :] = new_row

# element r/w
with time_logger("read an element: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[rand_row_loc(), rand_col_loc()]

with time_logger("write an element: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[rand_row_loc(), rand_col_loc()] = np.random.randint(0, 100)

# appending
with time_logger("append a row: {}; Size: {} bytes".format(file, file_size)):
    # The appended frame is discarded; only the call itself is of interest.
    # NOTE(review): DataFrame.append is gone from recent pandas releases --
    # confirm the pinned modin version still provides it.
    df.append(pd.Series(new_row), ignore_index=True)

with time_logger("append a column: {}; Size: {} bytes".format(file, file_size)):
    df["new"] = new_col
def _format_input(
self, data: Dict
) -> Union[pandas.DataFrame, Dict[str, pandas.DataFrame]]:
try:
if data.pop("multi_dataframe", None) is True:
logger.debug("Formatting pandas multi_dataframe input")
return {key: pandas.DataFrame(value) for key, value in data.items()}
else:
return pandas.DataFrame(data)
except (TypeError, KeyError) as exc:
err_msg = f"Error reading in json: {exc}"
logger.warning(err_msg)
raise HTTPException(status_code=400, detail=err_msg)
# Per (id2, id4) group: the squared v1/v2 entry of the group's correlation
# matrix, stored in a column named 'r2'; the group keys are then moved back
# from the index into regular columns.
group_keys = ['id2', 'id4']
grouped = x[group_keys + ['v1', 'v2']].groupby(group_keys, observed=True)
ans = grouped.apply(
    lambda grp: pd.Series({'r2': grp.corr()['v1']['v2'] ** 2})
)
ans.reset_index(inplace=True)