def assert_valid_dataset(data, name):
    __tracebackhide__ = True
    assert isinstance(data, Dataset), "not a Dataset object"
    assert name in DATASETS, "dataset not in manifest"

    assert dataset_exists(name), "dataset directory does not exist"
    assert dataset_archive(
        name, DATASETS[name]["signature"]
    ), "dataset archive does not match signature"

    assert (
        find_dataset_path(name, ext=".csv.gz", raises=False) is not None
    ), "no .csv.gz in dataset"
    assert (
        find_dataset_path(name, ext=".npz", raises=False) is not None
    ), "no .npz in dataset"

    n_files = len(data.contents())
    assert n_files in (4, 5), "unexpected number of files in dataset"
    assert len(data.README) > 0, "README contains no data"
    assert len(data.meta) > 0, "metadata is empty"

    if n_files == 5:
        assert len(data.citation) > 0, "citation.bib is empty"

    assert "features" in data.meta, "no features in metadata"
    assert "target" in data.meta, "no target in metadata"
"""
Returns the entire dataset as a single pandas DataFrame.
Returns
-------
df : DataFrame with shape (n_instances, n_columns)
A pandas DataFrame containing the complete original data table
including all targets (specified by the meta data) and all
features (including those that might have been filtered out).
"""
if pd is None:
raise DatasetsError(
"pandas is required to load DataFrame, it can be installed with pip"
)
path = find_dataset_path(self.name, ext=".csv.gz", data_home=self.data_home)
return pd.read_csv(path, compression="gzip")
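# Usage sketch for to_dataframe (assumes a loader that returns a Dataset
# object; "concrete" is an illustrative dataset name):
#
#     dataset = load_concrete(return_dataset=True)
#     df = dataset.to_dataframe()
#     df.shape  # (n_instances, n_columns): features and targets together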
def contents(self):
    """
    Returns a list of the files in the data directory.
    """
    data = find_dataset_path(self.name, data_home=self.data_home, ext=None)
    return os.listdir(data)
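# For a valid dataset, contents() should list the files that
# assert_valid_dataset checks for (file names here are illustrative and
# listing order is not guaranteed):
#
#     dataset.contents()
#     # ['concrete.csv.gz', 'concrete.npz', 'meta.json', 'README.md',
#     #  'citation.bib']   # citation.bib present only in 5-file datasets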
def root(self):
    """
    Discovers and caches the root directory of the corpus.
    """
    return find_dataset_path(self.name, data_home=self.data_home, ext=None)
def to_numpy(self):
    """
    Returns the dataset as two numpy arrays: X and y.

    Returns
    -------
    X : array-like with shape (n_instances, n_features)
        A numpy array describing the instance features.

    y : array-like with shape (n_instances,)
        A numpy array describing the target vector.
    """
    path = find_dataset_path(self.name, ext=".npz", data_home=self.data_home)
    with np.load(path, allow_pickle=False) as npf:
        if "X" not in npf or "y" not in npf:
            raise DatasetsError(
                "the downloaded dataset was improperly packaged without numpy "
                "arrays - please report this bug to the Yellowbrick maintainers!"
            )

        # TODO: How to handle the case where y is None?
        return npf["X"], npf["y"]
# Exposed as a property: assert_valid_dataset reads data.meta as an attribute.
@property
def meta(self):
    """
    Returns the contents of the meta.json file that describes important
    attributes about the dataset and modifies the behavior of the loader.
    """
    path = find_dataset_path(
        self.name, data_home=self.data_home, fname="meta.json", raises=False
    )

    if path is None:
        return None

    with open(path, "r") as f:
        return json.load(f)
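# A minimal meta.json consistent with assert_valid_dataset, which requires
# "features" and "target" keys (the column names here are invented):
#
#     {
#         "features": ["cement", "slag", "ash", "water"],
#         "target": "strength"
#     }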
@property
def README(self):
    """
    Returns the contents of the README.md file that describes the dataset
    in detail and contains attribution information.
    """
    path = find_dataset_path(self.name, data_home=self.data_home, fname="README.md")
    with open(path, "r") as f:
        return f.read()
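# Because README is exposed as a property, the attribution text can be
# read directly from a loaded dataset object:
#
#     print(dataset.README)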