# Assumed imports for these snippets: os, pytest, numpy as np, and the
# Yellowbrick datasets helpers (DatasetsError, find_dataset_path,
# get_data_home, cleanup_dataset, urlopen, CHUNK).

def assert_valid_numpy(data):
    __tracebackhide__ = True
    X, y = data.to_numpy()
    assert isinstance(X, np.ndarray), "X is not a numpy array"
    assert isinstance(y, np.ndarray), "y is not a numpy array"
    assert X.ndim == 2 and y.ndim == 1, "X and y dimensions are incorrect"

    # With pandas patched out, to_data should fall back to numpy defaults
    X, y = data.to_data()
    assert isinstance(X, np.ndarray), "to_data does not return numpy"
    assert isinstance(y, np.ndarray), "to_data does not return numpy"

    # to_pandas should raise when pandas is unavailable
    with pytest.raises(DatasetsError):
        data.to_pandas()
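# Usage sketch in a test (the fixture, loader, and patch target below are
# hypothetical, not from this snippet):
#
#     def test_dataset_numpy(monkeypatch):
#         monkeypatch.setattr("yellowbrick.datasets.base.pd", None)
#         data = load_concrete(return_dataset=True)
#         assert_valid_numpy(data)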
def test_missing_find_dataset_path(tmpdir):
    """
    Test find_dataset_path when the dataset does not exist
    """
    data_home = tmpdir.mkdir("fixtures")

    # When the data directory doesn't exist
    with pytest.raises(DatasetsError):
        find_dataset_path("foo", data_home=str(data_home))

    # When the data directory exists but no file is in the directory
    foo = data_home.mkdir("foo")
    with pytest.raises(DatasetsError):
        find_dataset_path("foo", data_home=str(data_home))

    # When the specified file doesn't exist
    fpath = foo.join("foo.csv")
    fpath.write("1,2,3")
    with pytest.raises(DatasetsError):
        find_dataset_path("foo", data_home=str(data_home), ext=".npz")
def to_pandas(self):
    """
    Returns the dataset as two pandas objects: X and y.

    Returns
    -------
    X : DataFrame with shape (n_instances, n_features)
        A pandas DataFrame containing feature data and named columns.

    y : Series with shape (n_instances,)
        A pandas Series containing target data and an index that matches
        the feature DataFrame index.
    """
    # Ensure the metadata is valid before continuing
    if self.meta is None:
        raise DatasetsError(
            (
                "the downloaded dataset was improperly packaged without meta.json "
                "- please report this bug to the Yellowbrick maintainers!"
            )
        )

    if "features" not in self.meta or "target" not in self.meta:
        raise DatasetsError(
            (
                "the downloaded dataset was improperly packaged without features "
                "or target - please report this bug to the Yellowbrick maintainers!"
            )
        )

    # Load the data frame and return the features and target
    # TODO: Return y as None if there is no self.meta["target"]
    df = self.to_dataframe()
    return df[self.meta["features"]], df[self.meta["target"]]
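# Usage sketch (names are illustrative, not from this snippet): the bundled
# Yellowbrick loaders return a Dataset when return_dataset=True, whose
# to_pandas method is defined above.
#
#     from yellowbrick.datasets import load_concrete
#     dataset = load_concrete(return_dataset=True)
#     X, y = dataset.to_pandas()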
def to_dataframe(self):
    """
    Returns the entire dataset as a single pandas DataFrame.

    Returns
    -------
    df : DataFrame with shape (n_instances, n_columns)
        A pandas DataFrame containing the complete original data table
        including all targets (specified by the meta data) and all
        features (including those that might have been filtered out).
    """
    if pd is None:
        raise DatasetsError(
            "pandas is required to load DataFrame, it can be installed with pip"
        )

    path = find_dataset_path(self.name, ext=".csv.gz", data_home=self.data_home)
    return pd.read_csv(path, compression="gzip")
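# The `pd is None` guard above assumes pandas is imported optionally at
# module level; a minimal sketch of that idiom:
#
#     try:
#         import pandas as pd
#     except ImportError:
#         pd = None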
# Reconstructed opening for context (the original snippet begins
# mid-function); the signature is inferred from the call sites in the
# tests and methods above.
def find_dataset_path(dataset, data_home=None, fname=None, ext=None, raises=True):
    # Resolve the data directory that contains the datasets
    data_home = get_data_home(data_home)

    if fname is None:
        if ext is None:
            path = os.path.join(data_home, dataset)
        else:
            path = os.path.join(data_home, dataset, "{}{}".format(dataset, ext))
    else:
        path = os.path.join(data_home, dataset, fname)

    # Determine if the path exists
    if not os.path.exists(path):
        # Suppress exceptions if required
        if not raises:
            return None

        raise DatasetsError(
            ("could not find dataset at {} - does it need to be downloaded?").format(
                path
            )
        )

    return path
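# Path resolution sketch (hypothetical values): with no fname, the lookup
# targets <data_home>/<dataset>/<dataset><ext>.
#
#     find_dataset_path("concrete", data_home="/tmp/data", ext=".csv.gz")
#     # -> "/tmp/data/concrete/concrete.csv.gz" (raises DatasetsError if missing)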
def to_numpy(self):
    """
    Returns the dataset as two numpy arrays: X and y.

    Returns
    -------
    X : array-like with shape (n_instances, n_features)
        A numpy array describing the instance features.

    y : array-like with shape (n_instances,)
        A numpy array describing the target vector.
    """
    path = find_dataset_path(self.name, ext=".npz", data_home=self.data_home)
    with np.load(path, allow_pickle=False) as npf:
        if "X" not in npf or "y" not in npf:
            raise DatasetsError(
                (
                    "the downloaded dataset was improperly packaged without numpy "
                    "arrays - please report this bug to the Yellowbrick maintainers!"
                )
            )

        # TODO: How to handle the case where y is None?
        return npf["X"], npf["y"]
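# A compatible .npz archive can be produced with numpy itself (a sketch,
# not the official packaging script):
#
#     np.savez_compressed("foo/foo.npz", X=X, y=y)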
# Reconstructed opening for context (the original snippet begins inside the
# docstring); the name and parameters are inferred from the body below.
def download_data(url, data_home=None, replace=False, extract=True):
    """
    Extract the archive file after downloading it
    """
    data_home = get_data_home(data_home)

    # Get the name of the file from the URL
    basename = os.path.basename(url)
    name, _ = os.path.splitext(basename)

    # Get the archive and data directory paths
    archive = os.path.join(data_home, basename)
    datadir = os.path.join(data_home, name)

    # If the archive exists, clean up or raise an overwrite exception
    if os.path.exists(archive):
        if not replace:
            raise DatasetsError((
                "dataset already exists at {}, set replace=True to overwrite"
            ).format(archive))
        cleanup_dataset(name, data_home=data_home)

    # Create the output directory if it does not exist
    if not os.path.exists(datadir):
        os.mkdir(datadir)

    # Fetch the response in a streaming fashion and write it to disk.
    response = urlopen(url)

    with open(archive, 'wb') as f:
        while True:
            chunk = response.read(CHUNK)
            if not chunk:
                break
            f.write(chunk)
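# The snippet is truncated here; after writing the archive to disk, the
# function would extract it into data_home when extract=True. A minimal
# sketch of that step, assuming a zip archive (not the exact original code):
#
#     import zipfile
#     if extract:
#         with zipfile.ZipFile(archive, "r") as zf:
#             zf.extractall(data_home)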