Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def test_mapcolvals():
    """MapColVals replaces medal codes with their string names in place."""
    df = _test_df()
    medal_names = {1: 'Gold', 2: 'Silver', 3: 'Bronze'}
    stage = MapColVals('Medal', medal_names)
    result = stage.apply(df)
    # Every country's numeric medal code should now be its mapped name.
    expected = {'UK': 'Gold', 'USSR': 'Bronze', 'US': 'Silver'}
    for country, medal in expected.items():
        assert result['Medal'][country] == medal
def test_mapcolvals_with_res_name_no_drop():
    """With drop=False and result_columns set, the source column survives."""
    df = _test_df()
    medal_names = {1: 'Gold', 2: 'Silver', 3: 'Bronze'}
    stage = MapColVals(
        'Medal', medal_names, result_columns='Metal', drop=False)
    result = stage(df)
    # The original numeric 'Medal' column must be untouched...
    for country, code in (('UK', 1), ('USSR', 3), ('US', 2)):
        assert result['Medal'][country] == code
    # ...while mapped values land in the new 'Metal' column.
    for country, medal in (
            ('UK', 'Gold'), ('USSR', 'Bronze'), ('US', 'Silver')):
        assert result['Metal'][country] == medal
def test_mapcolvals_bad_res_name_len():
    """MapColVals raises ValueError on columns/result_columns length mismatch.

    One source column is given with two result column names, so the
    constructor itself must raise.
    """
    value_map = {1: 'Gold', 2: 'Silver', 3: 'Bronze'}
    with pytest.raises(ValueError):
        # NOTE: the original test asserted isinstance(map_stage, MapColVals)
        # on the line after this one, *inside* the raises-block — that
        # assertion was unreachable dead code (the constructor raises first)
        # and has been removed.
        MapColVals('Medal', value_map, result_columns=['A', 'B'])
'value_map': nltk.word_tokenize,
'drop': drop,
'suffix': '_tok',
'exmsg': TokenizeWords._DEF_TOKENIZE_EXC_MSG.format(col_str),
'appmsg': TokenizeWords._DEF_TOKENIZE_APP_MSG.format(col_str),
'desc': "Tokenize {}".format(col_str),
}
super_kwargs.update(**kwargs)
super().__init__(**super_kwargs)
def _prec(self, df):
    """Check this stage's precondition against *df*.

    Passes only if the parent class's precondition holds AND every one
    of this stage's target columns has the ``object`` dtype (i.e. can
    hold strings / token lists, as required for text processing).
    """
    return super()._prec(df) and all(
        col_type == object for col_type in df.dtypes[self._columns])
class UntokenizeWords(MapColVals):
"""A pipeline stage that joins token lists to whitespace-seperated strings.
Parameters
----------
columns : str or list-like
Column names in the DataFrame to be untokenized.
drop : bool, default True
If set to True, the source columns are dropped after being untokenized,
and the resulting columns retain the names of the source columns.
Otherwise, untokenized columns gain the suffix '_untok'.
Example
-------
>>> import pandas as pd; import pdpipe as pdp;
>>> data = [[3.2, ['Shake', 'and', 'bake!']]]
>>> df = pd.DataFrame(data, [1], ['freq', 'content'])
self._result_columns = [
col + self.suffix for col in self._columns
]
else:
self._result_columns = _interpret_columns_param(result_columns)
if len(self._result_columns) != len(self._columns):
raise ValueError(
"columns and result_columns parameters must"
" be string lists of the same length!"
)
col_str = _list_str(self._columns)
sfx = "s" if len(self._columns) > 1 else ""
self._drop = drop
super_kwargs = {
"exmsg": MapColVals._DEF_MAP_COLVAL_EXC_MSG.format(sfx, col_str),
"appmsg": MapColVals._DEF_MAP_COLVAL_APP_MSG.format(
sfx, col_str, self._value_map
),
"desc": "Map values of column{} {} with {}.".format(
sfx, col_str, self._value_map
),
}
super_kwargs.update(**kwargs)
super().__init__(**super_kwargs)
'value_map': self._stopwords_remover,
'drop': drop,
'suffix': '_nostop',
'exmsg': RemoveStopwords._DEF_STOPWORDS_EXC_MSG.format(col_str),
'appmsg': RemoveStopwords._DEF_STOPWORDS_APP_MSG.format(col_str),
'desc': "Removing stopwords from {}".format(col_str),
}
super_kwargs.update(**kwargs)
super().__init__(**super_kwargs)
def _prec(self, df):
    """Precondition: parent check passes and all target columns hold objects.

    The parent check runs first so that a missing column fails the
    precondition before any dtype lookup is attempted.
    """
    if not super()._prec(df):
        return False
    return all(dtype == object for dtype in df.dtypes[self._columns])
class SnowballStem(MapColVals):
"""A pipeline stage that stems words in a list using the Snowball stemmer.
Parameters
----------
stemmer_name : str
The name of the Snowball stemmer to use. Should be one of the Snowball
stemmers implemented by nltk. E.g. 'EnglishStemmer'.
columns : str or list-like
Column names in the DataFrame to stem tokens in.
drop : bool, default True
If set to True, the source columns are dropped after stemming, and the
resulting columns retain the names of the source columns. Otherwise,
resulting columns gain the suffix '_stem'.
Example
-------
else:
self._result_columns = [
col + self.suffix for col in self._columns
]
else:
self._result_columns = _interpret_columns_param(result_columns)
if len(self._result_columns) != len(self._columns):
raise ValueError(
"columns and result_columns parameters must"
" be string lists of the same length!"
)
col_str = _list_str(self._columns)
sfx = "s" if len(self._columns) > 1 else ""
self._drop = drop
super_kwargs = {
"exmsg": MapColVals._DEF_MAP_COLVAL_EXC_MSG.format(sfx, col_str),
"appmsg": MapColVals._DEF_MAP_COLVAL_APP_MSG.format(
sfx, col_str, self._value_map
),
"desc": "Map values of column{} {} with {}.".format(
sfx, col_str, self._value_map
),
}
super_kwargs.update(**kwargs)
super().__init__(**super_kwargs)
'value_map': UntokenizeWords._untokenize_list,
'drop': drop,
'suffix': '_untok',
'exmsg': UntokenizeWords._DEF_UNTOKENIZE_EXC_MSG.format(col_str),
'appmsg': "Untokenizing {}".format(col_str),
'desc': "Untokenize {}".format(col_str),
}
super_kwargs.update(**kwargs)
super().__init__(**super_kwargs)
def _prec(self, df):
    """Precondition: base check passes and every target column is object-typed.

    Base check is evaluated first (short-circuit) so dtype access only
    happens once the columns are known to exist.
    """
    if not super()._prec(df):
        return False
    target_dtypes = df.dtypes[self._columns]
    return all(dt == object for dt in target_dtypes)
class RemoveStopwords(MapColVals):
"""A pipeline stage that removes stopwords from a tokenized list.
Parameters
----------
language : str or array-like
If a string is given, interpreted as the language of the stopwords, and
should then be one of the languages supported by the NLTK Stopwords
Corpus. If a list is given, it is assumed to be the list of stopwords
to remove.
columns : str or list-like
Column names in the DataFrame from which to remove stopwords.
drop : bool, default True
If set to True, the source columns are dropped after stopword removal,
and the resulting columns retain the names of the source columns.
Otherwise, resulting columns gain the suffix '_nostop'.
import importlib
import collections
import nltk
import pandas as pd
from pdpipe.core import PdPipelineStage
from pdpipe.util import out_of_place_col_insert
from pdpipe.col_generation import MapColVals
from pdpipe.shared import (
_interpret_columns_param,
_list_str
)
class TokenizeWords(MapColVals):
"""A pipeline stage that tokenize a sentence into words by whitespaces.
Parameters
----------
columns : str or list-like
Column names in the DataFrame to be tokenized.
drop : bool, default True
If set to True, the source columns are dropped after being tokenized,
and the resulting tokenized columns retain the names of the source
columns. Otherwise, tokenized columns gain the suffix '_tok'.
Example
-------
>>> import pandas as pd; import pdpipe as pdp;
>>> df = pd.DataFrame([[3.2, "Kick the baby!"]], [1], ['freq', 'content'])
>>> tokenize_stage = pdp.TokenizeWords('content')