# Source code for flex.data.preprocessing_utils

"""
Copyright (C) 2024  Instituto Andaluz Interuniversitario en Ciencia de Datos e Inteligencia Computacional (DaSCI).

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
from copy import deepcopy

import numpy as np

from flex.data.dataset import Dataset
from flex.data.lazy_indexable import LazyIndexable



[docs]
def normalize(node_dataset, *args, **kwargs):
    """Function that normalizes federated data.

    Every slice of ``X_data`` taken along axis 0 is divided by its L2 norm
    (``np.linalg.norm(X_data, axis=0)``). Slices whose norm is zero are
    divided by the machine epsilon of the data dtype instead, so they stay
    at zero without triggering a division by zero.

    Args:
    -----
        node_dataset (Dataset): node_dataset to normalize the data. Its
            ``X_data`` and ``y_data`` must expose ``to_numpy()``.
        *args: Ignored by this function.
        **kwargs: Ignored by this function.

    Returns:
    --------
        Dataset: A new Dataset built with ``Dataset.from_array`` holding the
            normalized X_data and the original labels.
    """
    X_data = node_dataset.X_data.to_numpy()
    # np.finfo only accepts inexact (float/complex) dtypes and raises
    # ValueError otherwise, so integer features (e.g. uint8 pixels) are
    # promoted to float64 before the epsilon lookup. Float inputs keep
    # their original dtype.
    if not np.issubdtype(X_data.dtype, np.inexact):
        X_data = X_data.astype(np.float64)
    norms = np.linalg.norm(X_data, axis=0)
    # Replace zero norms by eps to avoid 0/0 -> NaN for all-zero slices.
    norms = np.where(norms == 0, np.finfo(X_data.dtype).eps, norms)
    new_X_data = X_data / norms
    return Dataset.from_array(new_X_data, node_dataset.y_data.to_numpy())




[docs]
def one_hot_encoding(node_dataset, *args, **kwargs):
    """Function that applies one hot encoding to the labels of a node_dataset.

    The labels are flattened to one dimension and used as column indices
    into a zero matrix of shape ``(number of labels, n_labels)``, so each
    label must be an integer smaller than ``n_labels``.

    Args:
    -----
        node_dataset (Dataset): node_dataset whose labels are one hot encoded.
            Its ``y_data`` must expose ``to_numpy()``.
        *args: Ignored by this function.
        **kwargs: Must contain ``n_labels``, the number of classes (converted
            with ``int``).

    Raises:
    -------
        ValueError: Raises value error if n_labels is not given in the kwargs argument.
        IndexError: Raised by NumPy if a label is not a valid column index
            for a row of length ``n_labels``.

    Returns:
    --------
        Dataset: A new Dataset with a deep copy of the original X_data and
            the one hot encoded labels (float array) as y_data.
    """
    if "n_labels" not in kwargs:
        raise ValueError(
            "No number of labels given. The parameter n_labels must be given through kwargs."
        )
    # Flatten the labels: with a column vector of shape (N, 1) the index pair
    # (arange(N), labels) would broadcast to an (N, N) index and mark every
    # class present in the data on every row instead of one class per row.
    labels = np.asarray(node_dataset.y_data.to_numpy()).reshape(-1)
    n_labels = int(kwargs["n_labels"])
    one_hot_labels = np.zeros((labels.size, n_labels))
    # One (row, column) pair per sample: row i gets a 1 at column labels[i].
    one_hot_labels[np.arange(labels.size), labels] = 1
    return Dataset(
        X_data=deepcopy(node_dataset.X_data),
        y_data=LazyIndexable(one_hot_labels, len(one_hot_labels)),
    )