def _index_grouped(self):
    if self._index_grouped_cache is None:
        if self._is_multi_by:
            # Because we are doing a collect (to_pandas) here and then groupby, we
            # end up using the pandas implementation. Add the warning so the user
            # is aware.
            ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
            ErrorMessage.default_to_pandas("Groupby with multiple columns")
            self._index_grouped_cache = {
                k: v.index
                for k, v in self._df._query_compiler.getitem_column_array(self._by)
                .to_pandas()
                .groupby(by=self._by)
            }
        else:
            if isinstance(self._by, type(self._query_compiler)):
                by = self._by.to_pandas().squeeze()
            else:
                by = self._by
            if self._axis == 0:
                self._index_grouped_cache = self._index.groupby(by)
            else:
                self._index_grouped_cache = self._columns.groupby(by)
    return self._index_grouped_cache
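# Usage sketch (not part of the Modin source; data is illustrative): the dict
# comprehension above relies on iterating a pandas GroupBy, which yields
# (key, group) pairs, so each group key maps straight to that group's labels.
import pandas

frame = pandas.DataFrame({"a": [1, 1, 2], "b": [3, 3, 3], "c": [10, 20, 30]})
index_grouped = {k: v.index for k, v in frame.groupby(by=["a", "b"])}
# {(1, 3): Index([0, 1]), (2, 3): Index([2])}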
def merge_ordered(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_by=None,
    right_by=None,
    fill_method=None,
    suffixes=("_x", "_y"),
    how="outer",
):
    if not isinstance(left, DataFrame):
        raise ValueError(
            "can not merge DataFrame with instance of type {}".format(type(left))
        )
    ErrorMessage.default_to_pandas("`merge_ordered`")
    if isinstance(right, DataFrame):
        right = to_pandas(right)
    return DataFrame(
        pandas.merge_ordered(
            to_pandas(left),
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_by=left_by,
            right_by=right_by,
            fill_method=fill_method,
            suffixes=suffixes,
            how=how,
        )
    )
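# Usage sketch (illustrative data, not from the source): since merge_ordered
# round-trips through pandas, its semantics are exactly pandas.merge_ordered's.
import pandas

left = pandas.DataFrame(
    {"key": ["a", "c", "e"], "lvalue": [1, 2, 3], "group": ["a", "a", "a"]}
)
right = pandas.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
# Ordered merge on "key", forward-filling gaps within each "group".
merged = pandas.merge_ordered(
    left, right, on="key", left_by="group", fill_method="ffill"
)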
def _default_to_pandas(self, op, *args, **kwargs):
    """Helper method to use the default pandas function."""
    empty_self_str = "" if not self.empty else " for empty DataFrame"
    ErrorMessage.default_to_pandas(
        "`{}.{}`{}".format(
            type(self).__name__,
            op if isinstance(op, str) else op.__name__,
            empty_self_str,
        )
    )
    args = (a._to_pandas() if hasattr(a, "_to_pandas") else a for a in args)
    kwargs = {
        k: v._to_pandas() if hasattr(v, "_to_pandas") else v
        for k, v in kwargs.items()
    }
    if callable(op):
        result = op(self._to_pandas(), *args, **kwargs)
    elif isinstance(op, str):
        # The inner `getattr` is ensuring that we are treating this object (whether
        # it is a DataFrame, Series, etc.) as a pandas object. The outer `getattr`
        # then looks up the named operation on that pandas class and calls it.
        result = getattr(getattr(pandas, type(self).__name__), op)(
            self._to_pandas(), *args, **kwargs
        )
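# Standalone sketch of the string-op dispatch above (not from the source):
# getattr(pandas, "DataFrame") fetches the pandas class, the second getattr
# fetches the unbound method, and the converted frame is passed as `self`.
import pandas

frame = pandas.DataFrame({"x": [1.0, 2.0, 3.0, 4.0]})
result = getattr(getattr(pandas, type(frame).__name__), "kurt")(frame)
# equivalent to frame.kurt()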
def get_dummies(
    data, prefix=None, prefix_sep="_", dummy_na=False, columns=None,
    sparse=False, drop_first=False, dtype=None
):
    """Convert categorical variables into indicator (one-hot encoded) variables.

    Args:
        data: The data to encode.
        prefix: String to prepend to encoded column names.
        prefix_sep: Separator between the prefix and the column value.
        dummy_na (bool): Whether to add a column indicating NaNs.
        columns: Which columns to encode.
        sparse (bool): Not Implemented: If True, returns SparseDataFrame.
        drop_first (bool): Whether to remove the first level of encoded data.
        dtype: The dtype for the get_dummies call.

    Returns:
        DataFrame of one-hot encoded data.
    """
    if sparse:
        raise NotImplementedError(
            "SparseDataFrame is not implemented. "
            "To contribute to Modin, please visit "
            "github.com/modin-project/modin."
        )
    if not isinstance(data, DataFrame):
        ErrorMessage.default_to_pandas("`get_dummies` on non-DataFrame")
        return DataFrame(
            pandas.get_dummies(
                data,
                prefix=prefix,
                prefix_sep=prefix_sep,
                dummy_na=dummy_na,
                columns=columns,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    else:
        new_manager = data._query_compiler.get_dummies(
            columns,
            prefix=prefix,
            prefix_sep=prefix_sep,
            dummy_na=dummy_na,
            drop_first=drop_first,
            dtype=dtype,
        )
        return DataFrame(query_compiler=new_manager)
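# Usage sketch (assumes a working Modin install; data is illustrative): on a
# Modin DataFrame the encoding stays in the query compiler, so no pandas
# round trip is needed.
import modin.pandas as pd

frame = pd.DataFrame({"color": ["red", "blue", "red"], "size": [1, 2, 3]})
encoded = pd.get_dummies(frame, columns=["color"], prefix="is")
# encoded columns: size, is_blue, is_red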
def apply(
    self, func, axis=0, broadcast=None, raw=False, reduce=None,
    result_type=None, args=(), **kwds
):
"""Apply a function along input axis of DataFrame.
Args:
func: The function to apply
axis: The axis over which to apply the func.
broadcast: Whether or not to broadcast.
raw: Whether or not to convert to a Series.
reduce: Whether or not to try to apply reduction procedures.
Returns:
Series or DataFrame, depending on func.
"""
axis = self._get_axis_number(axis)
ErrorMessage.non_verified_udf()
if isinstance(func, str):
if axis == 1:
kwds["axis"] = axis
result = self._string_function(func, *args, **kwds)
# Sometimes we can return a scalar here
if isinstance(result, BasePandasDataset):
return result._query_compiler
return result
elif isinstance(func, dict):
if axis == 1:
raise TypeError(
"(\"'dict' object is not callable\", "
"'occurred at index {0}'".format(self.index[0])
)
if len(self.columns) != len(set(self.columns)):
warnings.warn(
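# Usage sketch (assumes a working Modin install): string ops are dispatched
# through _string_function, while callables take the UDF path.
import modin.pandas as pd

frame = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
col_sums = frame.apply("sum")                           # string function dispatch
row_sums = frame.apply(lambda row: row.sum(), axis=1)   # user-defined function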
def read_parquet(cls, path, engine, columns, **kwargs):
    """Load a parquet object from the file path, returning a DataFrame.

    Ray DataFrame only supports the pyarrow engine for now.

    Args:
        path: The filepath of the parquet file.
            We only support local files for now.
        engine: Ray only supports the pyarrow reader.
            This argument doesn't do anything for now.
        kwargs: Passed into parquet's read_pandas function.

    Notes:
        The ParquetFile API is used. Please refer to the documentation here:
        https://arrow.apache.org/docs/python/parquet.html
    """
    ErrorMessage.default_to_pandas("`read_parquet`")
    return cls.from_pandas(pandas.read_parquet(path, engine, columns, **kwargs))
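# Usage sketch ("data.parquet" is a placeholder path; assumes a Modin install
# and a local parquet file): the call defaults to pandas under the hood.
import modin.pandas as pd

frame = pd.read_parquet("data.parquet", engine="pyarrow", columns=["a", "b"])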
def lreshape(data, groups, dropna=True, label=None):
    if not isinstance(data, DataFrame):
        raise ValueError(
            "can not lreshape with instance of type {}".format(type(data))
        )
    ErrorMessage.default_to_pandas("`lreshape`")
    return DataFrame(
        pandas.lreshape(to_pandas(data), groups, dropna=dropna, label=label)
    )
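# Usage sketch (illustrative data): each key in `groups` becomes one
# long-format column assembled from the listed wide-format columns, and
# non-group columns such as "team" are repeated for each unstacked row.
import pandas

wide = pandas.DataFrame({
    "team": ["Red Sox", "Yankees"],
    "hr1": [514, 573], "hr2": [545, 526],
    "year1": [2007, 2007], "year2": [2008, 2008],
})
long_form = pandas.lreshape(
    wide, {"year": ["year1", "year2"], "hr": ["hr1", "hr2"]}
)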