"""
Copyright (C) 2024 Instituto Andaluz Interuniversitario en Ciencia de Datos e Inteligencia Computacional (DaSCI).
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from enum import Enum, EnumMeta
[docs]
class PluggableDataset(EnumMeta):
def __contains__(self, item):
return item in list(self.members())
[docs]
class PluggableDatasetString(EnumMeta):
def __contains__(self, item):
return item in [k.value[0] for k in self.__members__.values()]
[docs]
class PluggableTorchtext(Enum, metaclass=PluggableDataset):
"""Class containing all the pluggable datasets to a Dataset without any preprocessing needed.
Any other dataset from the TorchText library will need further preprocessing.
Args:
-----
Enum (enum): torchtext class for each dataset than can be accepted on our platform.
"""
[docs]
def members():
from torchtext import datasets
yield datasets.AG_NEWS.__name__
yield datasets.AmazonReviewFull.__name__
yield datasets.AmazonReviewPolarity.__name__
yield datasets.DBpedia.__name__
yield datasets.YahooAnswers.__name__
yield datasets.YelpReviewFull.__name__
yield datasets.YelpReviewPolarity.__name__
[docs]
class PluggableTorchvision(Enum, metaclass=PluggableDataset):
"""Class containing all the pluggable datasets to a Dataset without any preprocessing needed.
Any other dataset from the Torchvision library will need further preprocessing.
Args:
-----
Enum (enum): torchvision class for each dataset than can be accepted on our platform.
"""
[docs]
def members():
from torchvision import datasets
yield datasets.WIDERFace.__name__
yield datasets.Food101.__name__
yield datasets.CelebA.__name__
yield datasets.CLEVRClassification.__name__
yield datasets.Country211.__name__
yield datasets.FGVCAircraft.__name__
yield datasets.GTSRB.__name__
yield datasets.Kitti.__name__
yield datasets.Flowers102.__name__
yield datasets.StanfordCars.__name__
yield datasets.LFWPeople.__name__
yield datasets.Caltech256.__name__
yield datasets.EuroSAT.__name__
yield datasets.CIFAR10.__name__
yield datasets.CIFAR100.__name__
yield datasets.MNIST.__name__
yield datasets.SUN397.__name__
yield datasets.SEMEION.__name__
yield datasets.Omniglot.__name__
yield datasets.KMNIST.__name__
yield datasets.FashionMNIST.__name__
yield datasets.OxfordIIITPet.__name__
yield datasets.STL10.__name__
yield datasets.PCAM.__name__
yield datasets.Caltech101.__name__
yield datasets.QMNIST.__name__
yield datasets.SVHN.__name__
yield datasets.DTD.__name__
yield datasets.USPS.__name__
yield datasets.RenderedSST2.__name__
yield datasets.INaturalist.__name__
yield datasets.EMNIST.__name__
[docs]
class PluggableHuggingFace(Enum, metaclass=PluggableDatasetString):
"""Class containing some datasets that can be loaded to FLEXible. Other datasets
can be plugged in, but it requires a special configuration, i.e., glue-cola. This
is more about the user using correctly the arguments on the load_dataset function
from huggingface datasets than a problem of our platform, so the user can easy-use
other datasets.
We show some example datasets that can be loaded using the function
FedDataDistribution.from_config_with_huggingface_dataset just giving a config
and the string associated to each dataset from the Enum defined.
We selected this dataset as we can automatice the process of loading this datasets,
but our framework support almost all the datasets, as they can be loaded as numpy
arrays. We only show supports to this datasets as we can load the dataset
as follows: dataset = load_dataset(name, split='train').
There are some datasets that need extra parameters like the version of the dataset,
or that don't have any split. This must be used by the user previously to load
the dataset into FLEXible, but it will be easy and fast, as the user just
need to select the X_train-y_train as np.arrays.
Args:
-----
Enum (enum): Tuple containing name, X_columns and y_columns to use in the
load_dataset function.
"""
IMDB_HF = ("imdb", "text", "label")
AG_NEWS_HF = ("ag_news", "text", "label")
TWEET_EVAL_EMOJI_HF = ("tweet_eval", "text", "label") # We support all subsets
ROTTEN_TOMATOES_HF = ("rotten_tomatoes", "text", "label")
GLUE_COLA_HF = ("glue", "sentence", "label")
FINANCIAL_PHRASEBANK_HF = (
"financial_phrasebank",
"sentence",
"label",
) # We support all subsets"
SQUAD_HF = ("squad", ["context", "question"], "answers")
APPREVIEWS_HF = ("app_reviews", "review", "star")
AMAZON_POLARITY_HF = ("amazon_polarity", ["title", "content"], "label")
# class PluggableDatasetsTensorFlowText(Enum, metaclass=PluggableDataset):
# """Class containing some datasets that can be loaded to FLEXible. Other datasets
# can be plugged in, but it requires a special configuration, i.e., glue-cola. This
# is more about the user using correctly the arguments on the load_dataset function
# from huggingface datasets than a problem of our platform, so the user can easy-use
# other datasets.
# Args:
# Enum (enum): Tuple containing name, X_columns and y_columns to use in the
# load_dataset function.
# """
# AG_NEWS_TF = ("ag_news_subset", ["title", "description"], ["label"])
# GLUE_TF = ("glue", ["sentence"], ["label"])
# ASSET_TF = ("asset", ["original"], ["simplifications"])
# SQUAD_TF = ("squad", ["title", "question", "context"], ["answers"]) # Wonk work
# COQA_TF = ("coqa", ["questions", "source", "story"], ["answers"]) # Won't work