"""This module contains functions to load datasets."""
from __future__ import annotations
from pathlib import Path
from typing import cast
import numpy as np
import pandas as pd
# Base URL of the raw dataset files on GitHub; a csv file name is appended to it to download a
# dataset that is not available in the local datasets folder.
GITHUB_DATA_URL = "https://raw.githubusercontent.com/mmschlk/shapiq/main/data/"
# csv files are located next to this file in a folder called "data"
SHAPIQ_DATASETS_FOLDER = Path(__file__).parent / "data"
def _create_folder() -> None:
    """Make sure the local datasets folder exists.

    Missing parent directories are created as well; an already existing folder is not an error.
    """
    datasets_dir = Path(SHAPIQ_DATASETS_FOLDER)
    datasets_dir.mkdir(exist_ok=True, parents=True)
def _try_load(csv_file_name: str) -> pd.DataFrame:
    """Load a dataset from the local folder, downloading it from GitHub on a cache miss.

    The csv file is first looked up in ``SHAPIQ_DATASETS_FOLDER``. If it is not there, it is
    read from ``GITHUB_DATA_URL`` and a copy is stored in the local folder for later calls.

    Storing the copy is best-effort: if the folder cannot be created or written to (e.g. a
    read-only installation), a ``RuntimeWarning`` is issued and the downloaded data is still
    returned.

    Args:
        csv_file_name: The name of the csv file to load.

    Returns:
        The dataset as a pandas DataFrame.
    """
    import os
    import warnings
    from contextlib import suppress

    path = Path(SHAPIQ_DATASETS_FOLDER) / csv_file_name
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        pass  # not cached yet; fall through to the download below

    data = pd.read_csv(GITHUB_DATA_URL + csv_file_name)

    # Write to a process-specific temporary file and rename it into place, so that an
    # interrupted or concurrent write never leaves a truncated csv under the final name
    # (which a later call would silently load as if it were the complete dataset).
    tmp_path = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    try:
        _create_folder()
        data.to_csv(tmp_path, index=False)
        tmp_path.replace(path)
    except OSError as error:
        # The download succeeded, so failing to cache must not fail the load itself.
        with suppress(OSError):
            tmp_path.unlink()
        warnings.warn(
            f"Could not cache dataset '{csv_file_name}' in '{SHAPIQ_DATASETS_FOLDER}': {error}",
            RuntimeWarning,
            stacklevel=2,
        )
    return data
[docs]
def load_california_housing(
    *,
    to_numpy: bool = False,
) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame] | tuple[np.ndarray, np.ndarray]:
    """Load the California housing dataset.

    The ``"MedHouseVal"`` column is split off as the target; all remaining columns form the
    feature data.

    Args:
        to_numpy: Return numpy objects instead of pandas. Default is ``False``.

    Returns:
        A tuple ``(x_data, y_data)`` with the features and the target of the California housing
        dataset, as pandas objects or, if ``to_numpy`` is ``True``, as numpy arrays.

    Example:
        >>> from shapiq.datasets import load_california_housing
        >>> x_data, y_data = load_california_housing()
        >>> print(x_data.shape, y_data.shape)
        (20640, 8) (20640,)
    """
    target_column = "MedHouseVal"
    frame = _try_load("california_housing.csv")

    features = frame.drop(columns=[target_column])
    target = frame[target_column]

    if not to_numpy:
        return features, target
    return features.to_numpy(), target.to_numpy()
[docs]
def load_bike_sharing(
    *, to_numpy: bool = False
) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame] | tuple[np.ndarray, np.ndarray]:
    """Load the bike-sharing dataset and preprocess it.

    The numerical features are scaled with a ``RobustScaler`` and the categorical features are
    encoded with an ``OrdinalEncoder``. Rows that contain missing values after the
    transformation are dropped. The ``"count"`` column is split off as the target.

    Note:
        The function requires the `sklearn` package to be installed.

    Args:
        to_numpy: Return numpy objects instead of pandas. Default is ``False``.

    Returns:
        A tuple ``(x_data, y_data)`` with the features and the target of the bike-sharing
        dataset, as pandas objects or, if ``to_numpy`` is ``True``, as numpy arrays.

    Example:
        >>> from shapiq.datasets import load_bike_sharing
        >>> x_data, y_data = load_bike_sharing()
        >>> print(x_data.shape, y_data.shape)
        (17379, 12) (17379,)
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OrdinalEncoder, RobustScaler

    target_column = "count"
    numerical_columns = [
        "hour",
        "temp",
        "feel_temp",
        "humidity",
        "windspeed",
        "year",
        "month",
        "holiday",
        "weekday",
        "workingday",
    ]
    categorical_columns = ["season", "weather"]

    raw: pd.DataFrame = _try_load("bike.csv")
    raw[numerical_columns] = raw[numerical_columns].apply(pd.to_numeric)

    preprocessor = ColumnTransformer(
        [
            ("numerical", Pipeline([("scaler", RobustScaler())]), numerical_columns),
            (
                "categorical",
                Pipeline([("ordinal_encoder", OrdinalEncoder())]),
                categorical_columns,
            ),
        ],
        remainder="passthrough",
    )

    # Column names in the order the transformer is expected to emit them: numerical block,
    # categorical block, then the passed-through remainder in its original order.
    handled_columns = numerical_columns + categorical_columns
    passthrough_columns = [column for column in raw.columns if column not in handled_columns]
    ordered_names = handled_columns + passthrough_columns

    # The cast assumes the transformers used here produce a dense array.
    values: np.ndarray = cast("np.ndarray", preprocessor.fit_transform(raw))
    processed = pd.DataFrame(values, columns=np.asarray(ordered_names)).dropna()

    # ``pop`` removes the target column in place, leaving only the features in ``processed``.
    targets = processed.pop(target_column)

    if not to_numpy:
        return processed, targets
    return processed.to_numpy(), targets.to_numpy()
[docs]
def load_adult_census(
    *, to_numpy: bool = False
) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame] | tuple[np.ndarray, np.ndarray]:
    """Load the adult census dataset from the UCI Machine Learning Repository.

    Original source: https://archive.ics.uci.edu/ml/datasets/adult

    The numerical features are median-imputed and standardized, and the categorical features
    are encoded with an ``OrdinalEncoder``. Rows that contain missing values after the
    transformation are dropped. The ``"class"`` column is split off as the target and encoded
    as ``1`` for ``">50K"`` and ``0`` for every other value.

    Note:
        The function requires the `sklearn` package to be installed.

    Args:
        to_numpy: Return numpy objects instead of pandas. Default is ``False``.

    Returns:
        A tuple ``(x_data, y_data)`` with the features and the binary target of the adult census
        dataset, as pandas objects or, if ``to_numpy`` is ``True``, as numpy arrays.

    Example:
        >>> from shapiq.datasets import load_adult_census
        >>> x_data, y_data = load_adult_census()
        >>> print(x_data.shape, y_data.shape)
        (45222, 14) (45222,)
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OrdinalEncoder, StandardScaler

    target_column = "class"
    numerical_columns = ["age", "capital-gain", "capital-loss", "hours-per-week", "fnlwgt"]
    categorical_columns = [
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native-country",
        "education-num",
    ]

    raw = _try_load("adult_census.csv")
    raw[numerical_columns] = raw[numerical_columns].apply(pd.to_numeric)

    numerical_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
    categorical_steps = [("ordinal_encoder", OrdinalEncoder())]
    preprocessor = ColumnTransformer(
        [
            ("numerical", Pipeline(numerical_steps), numerical_columns),
            ("categorical", Pipeline(categorical_steps), categorical_columns),
        ],
        remainder="passthrough",
    )

    # Column names in the order the transformer is expected to emit them: numerical block,
    # categorical block, then the passed-through remainder in its original order.
    handled_columns = numerical_columns + categorical_columns
    passthrough_columns = [column for column in raw.columns if column not in handled_columns]
    ordered_names = handled_columns + passthrough_columns

    # The cast assumes the transformers used here produce a dense array.
    values = cast("np.ndarray", preprocessor.fit_transform(raw))
    processed = pd.DataFrame(values, columns=np.asarray(ordered_names)).dropna()

    # ``pop`` removes the label column in place, leaving only the features in ``processed``.
    labels = processed.pop(target_column)
    features = processed.astype(float)

    # transform '>50K' to 1 and '<=50K' to 0
    targets = labels.apply(lambda label: 1 if label == ">50K" else 0)

    if not to_numpy:
        return features, targets
    return features.to_numpy(), targets.to_numpy()