# Source code for flex.data.preprocessing_utils

"""
Copyright (C) 2024  Instituto Andaluz Interuniversitario en Ciencia de Datos e Inteligencia Computacional (DaSCI).

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
from copy import deepcopy

import numpy as np

from flex.data.dataset import Dataset
from flex.data.lazy_indexable import LazyIndexable


def normalize(node_dataset, *args, **kwargs):
    """Function that normalizes federated data.

    ``X_data`` is divided by its vector norms computed along axis 0 with
    ``np.linalg.norm`` (default order, i.e. the 2-norm). Norms that are
    exactly zero are replaced by the machine epsilon so the division never
    hits a zero denominator.

    Args:
    -----
        node_dataset (Dataset): node_dataset to normalize the data.
        *args: Unused; accepted so the function can be called with extra
            positional arguments.
        **kwargs: Unused; accepted so the function can be called with extra
            keyword arguments.

    Returns:
    --------
        Dataset: A Dataset built with ``Dataset.from_array`` from the
        normalized X_data and the y_data of node_dataset.
    """
    X_data = node_dataset.X_data.to_numpy()
    norms = np.linalg.norm(X_data, axis=0)
    # Take eps from the norms' dtype rather than from X_data's dtype:
    # np.finfo raises ValueError for integer dtypes (e.g. uint8 images),
    # whereas the computed norms are always floating point. For float and
    # complex inputs both dtypes yield the same eps.
    eps = np.finfo(norms.dtype).eps
    norms = np.where(norms == 0, eps, norms)
    new_X_data = X_data / norms
    return Dataset.from_array(new_X_data, node_dataset.y_data.to_numpy())
def one_hot_encoding(node_dataset, *args, **kwargs):
    """Function that applies one hot encoding to the labels of a node_dataset.

    Args:
    -----
        node_dataset (Dataset): node_dataset whose labels are one hot encoded.
        *args: Unused; accepted so the function can be called with extra
            positional arguments.
        **kwargs: Must contain ``n_labels``, the number of distinct labels
            (converted with ``int``); it fixes the width of the encoding.

    Raises:
    -------
        ValueError: Raises value error if n_labels is not given in the kwargs argument.

    Returns:
    --------
        Dataset: A new Dataset holding a deep copy of the X_data of
        node_dataset and the one hot encoded labels as y_data.
    """
    if "n_labels" not in kwargs:
        raise ValueError(
            "No number of labels given. The parameter n_labels must be given through kwargs."
        )
    n_labels = int(kwargs["n_labels"])
    # Flatten the labels before indexing: with a column vector of shape
    # (n, 1) the fancy index below would broadcast against np.arange(n) to
    # an (n, n) index and mark every occurring label in every row.
    labels = node_dataset.y_data.to_numpy().reshape(-1)
    one_hot_labels = np.zeros((labels.size, n_labels))
    # Row i gets a 1 in the column given by its label.
    one_hot_labels[np.arange(labels.size), labels] = 1
    return Dataset(
        X_data=deepcopy(node_dataset.X_data),
        y_data=LazyIndexable(one_hot_labels, len(one_hot_labels)),
    )