Source code for ada.datasets.toys

import numpy as np
import scipy.stats as ss
import os
import logging

from sklearn.utils import check_random_state

import torch
from torch.utils.data import Dataset
import ada.utils.experimentation as xp
from ada.datasets.dataset_access import DatasetAccess


def shift_data(x_in, ti=None, ri=None, si=None):
    """
    Apply scaling, translation and/or rotation to 2D data points,
    in that order only.

    Args:
        x_in (np.ndarray): input feature array of shape (n, d)
        ti (float or np.ndarray, optional): translation (scalar or vector of
            compatible dimension). Defaults to None.
        ri (float, optional): rotation angle in radians (scalar, for 2D points
            only). Defaults to None.
        si (float, optional): scaling factor (scalar). Defaults to None.

    Returns:
        np.ndarray: transformed feature array of shape (n, d), same as x_in.
    """
    x_out = x_in
    if si is not None and si > 0:
        s_mat = si * np.eye(x_in.shape[1])
        x_out = x_out @ s_mat
    if ti is not None:
        x_out = x_out + ti
    if ri is not None:
        if x_in.shape[1] != 2:
            raise ValueError("Rotation may be applied to 2D data only")
        rot_mat = np.array([[np.cos(ri), np.sin(ri)], [-np.sin(ri), np.cos(ri)]])
        x_out = x_out @ rot_mat
    return x_out
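
# A minimal usage sketch for shift_data (illustrative, not part of the
# original module; the numeric values are arbitrary): scale, translate,
# then rotate a small 2D point cloud.
#
#   >>> pts = np.array([[1.0, 0.0], [0.0, 1.0]])
#   >>> shifted = shift_data(pts, ti=0.5, ri=np.pi / 2, si=2.0)
#   >>> shifted.shape  # same shape as the input
#   (2, 2)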
def gen_cluster_distributions(
    dim, n_clusters, radius, random_state=None, centers="normal"
):
    random_state = check_random_state(random_state)
    if isinstance(centers, list):
        centers = np.array(centers)
    if isinstance(centers, str):
        if centers == "normal":
            cluster_means = random_state.normal(size=(n_clusters, dim))
        elif centers == "fixed" and n_clusters < 3 and dim == 2:
            fixed_means = np.array([[-0.5, 0.0], [0.5, 0.0]])
            cluster_means = fixed_means[:n_clusters, :]
        else:
            raise ValueError(
                f"Unsupported centers setting {centers!r} "
                f"with n_clusters={n_clusters}, dim={dim}"
            )
    elif isinstance(centers, np.ndarray):
        cluster_means = centers
        n_clusters, dim = cluster_means.shape
    else:
        cluster_means = random_state.uniform(size=(n_clusters, dim))
    # cluster_std = random_state.uniform(size=(n_clusters, dim)) * radius
    if isinstance(radius, (np.ndarray, list)):
        radius = np.array(radius)
        if radius.shape != (n_clusters, dim):
            logging.debug(
                "radius shape: %s, centers shape: %s",
                radius.shape,
                cluster_means.shape,
            )
            if radius.ndim == 2:
                n_radii, dim_radius = radius.shape
            else:
                n_radii, dim_radius = radius.shape[0], 1
            if dim_radius != dim and radius.ndim > 1 and n_radii == n_clusters:
                cluster_var = np.repeat(radius[:, 0], dim).reshape((n_clusters, dim))
            elif dim_radius != dim and radius.ndim == 1 and n_radii == n_clusters:
                cluster_var = np.repeat(radius, dim).reshape((n_clusters, dim))
            elif dim_radius == dim and n_radii == 1:
                cluster_var = (
                    np.repeat(radius[:], n_clusters).reshape((dim, n_clusters)).T
                )
            else:
                cluster_var = np.full((n_clusters, dim), radius.flat[0])
                logging.warning(
                    f"Input radius {radius} shape doesn't match cluster centers "
                    f"shape. Attempting to adapt, will use {cluster_var} instead"
                )
        else:
            cluster_var = radius
    else:
        cluster_var = np.ones((n_clusters, dim)) * radius
    if n_clusters <= 1:
        cluster_dist = ss.multivariate_normal(
            mean=cluster_means.flatten(), cov=cluster_var.flatten()
        )
        return cluster_dist, cluster_means, cluster_var
    cluster_dists = np.array(
        [
            (ss.multivariate_normal, {"mean": mean, "cov": cov})
            for mean, cov in zip(cluster_means, cluster_var)
        ],
        dtype=object,
    )
    return cluster_dists, cluster_means, cluster_var
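
# Illustrative sketch (not part of the original module; the radius value is
# arbitrary): build two 2D cluster distributions and draw from the first one.
#
#   >>> dists, means, var = gen_cluster_distributions(
#   ...     dim=2, n_clusters=2, radius=0.05, random_state=0, centers="fixed"
#   ... )
#   >>> pdist, law_args = dists[0]
#   >>> pdist.rvs(size=10, random_state=0, **law_args).shape
#   (10, 2)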
class CausalClusterGenerator:
    """
    Generate blobs from a Gaussian distribution following given causal
    parameters relating environment/domain, X and Y:

    - Y --> X: select class Y, then distribution X|Y
    """

    def __init__(
        self,
        dim=2,
        n_clusters=2,
        radius=0.05,
        proba_classes=0.5,
        centers="fixed",
        shape="blobs",
        data_seed=None,
    ):
        self._random_state = check_random_state(data_seed)
        self._n_clusters = n_clusters
        self._proba_classes = proba_classes
        self.shape = shape
        self._cluster_dists, self._means, self._stds = gen_cluster_distributions(
            dim=dim,
            n_clusters=n_clusters,
            radius=radius,
            centers=centers,
            random_state=self._random_state,
        )
    def generate_sample(
        self,
        nb_samples,
        shift_y=False,
        shift_x=False,
        shift_conditional_x=False,
        shift_conditional_y=False,
        y_cause_x=True,
        ye=0.5,
        te=0.3,
        se=None,
        re=None,
    ):
        """
        Generate a sample and apply a given shift:

        - shift_x: change p(x), i.e. x_e = f(x, env)
        - shift_y: change p(y), i.e. y_e = f(y, env)
        - shift_conditional_x: change p(x|y), i.e. x_e = f(y, x, env)
        - shift_conditional_y: change p(y|x), i.e. y_e = f(x, y, env)

        Environment parameters control the change in the data:

        - ye: proportion of class 0 labels
        - te: translation value (uniform on all dimensions!)
        - se: scaling factor
        - re: rotation in radians
        """
        if shift_y and y_cause_x:
            logging.debug("E --> Z=Y")
            zy = ss.bernoulli(ye * self._proba_classes).rvs(
                size=nb_samples, random_state=self._random_state
            )
            zx = None
        elif (
            isinstance(self._proba_classes, (np.ndarray, list))
            or len(self._cluster_dists) > 2
        ):
            n_clusters, dim = self._means.shape
            if not isinstance(self._proba_classes, (np.ndarray, list)):
                n_samples = (np.ones(n_clusters, dtype=float) / n_clusters) * nb_samples
            else:
                probas = np.array(self._proba_classes, dtype=float)
                probas /= probas.sum()
                n_samples = probas * nb_samples
            n_samples = n_samples.astype(int)
            n_samples[-1] = nb_samples - np.sum(n_samples[:-1])
            zy = np.empty(nb_samples, dtype=int)
            zx = np.empty((nb_samples, dim), dtype=float)
            sid = 0
            for class_id, n_class_samples in enumerate(n_samples):
                pdist, law_args = self._cluster_dists[class_id]
                zy[sid : sid + n_class_samples] = np.ones(n_class_samples) * class_id
                zx[sid : sid + n_class_samples, :] = pdist.rvs(
                    size=n_class_samples, random_state=self._random_state, **law_args
                )
                sid += n_class_samples
        else:
            logging.debug("ZY = cte")
            zy = ss.bernoulli(self._proba_classes).rvs(
                size=nb_samples, random_state=self._random_state
            )
            zx = None
        logging.debug("ZY --> ZX(ZY)")
        if zx is None:
            zx = np.array(
                [
                    pdist.rvs(size=1, random_state=self._random_state, **law_args)
                    for pdist, law_args in self._cluster_dists[zy]
                ]
            ).astype(np.float32)
        if self.shape.lower() == "moons":
            r = 1 - zy * 2  # assumes 2 classes, maps 0 to 1 and 1 to -1
            indices = np.linspace(0, np.pi, nb_samples)
            self._random_state.shuffle(indices)
            zx[:, 0] = zx[:, 0] + r * np.cos(indices)
            zx[:, 1] = zx[:, 1] + r * np.sin(indices)
        if shift_x:
            logging.debug("E, ZX --> X = g_E(ZX)")
            x = shift_data(zx, ti=te, si=se, ri=re)
        else:
            logging.debug("X = ZX")
            x = zx
        if shift_conditional_x:
            logging.debug("ZY, ZX, E --> g_E(X, Y)")
            # x = f(y, env)
            if te is None:
                ti0 = ti1 = None
            elif isinstance(te, (int, float)):
                ti0, ti1 = te * 2, te / 2
            else:
                ti0, ti1 = te
            if se is None:
                si0 = si1 = se
            elif isinstance(se, (int, float)):
                si0, si1 = se * 2, se / 2
            else:
                si0, si1 = se
            if se is not None and (si0 < 0 or si1 < 0):
                raise ValueError("Scaling factor cannot be negative")
            if re is None:
                ri0 = ri1 = re
            elif isinstance(re, (int, float)):
                ri0, ri1 = re * 2, re / 2
            else:
                ri0, ri1 = re
            x[zy == 0, :] = shift_data(zx[zy == 0], ti=ti0, si=si0, ri=ri0)
            x[zy == 1, :] = shift_data(zx[zy == 1], ti=ti1, si=si1, ri=ri1)
        if y_cause_x:
            logging.debug("Y = ZY")
            y = zy
            return x, y
        fx = np.sum(x, axis=1)
        xm = self._means.sum(axis=1)
        if shift_conditional_y:
            logging.debug("X, E --> Y")
            # y = f(env, x)
            thresh = np.percentile(xm, q=ye * 100)
        else:
            # y = f(x) independent of env
            logging.debug("E --> X --> Y")
            thresh = np.median(xm)
        logging.debug("threshold: %s", thresh)
        y = (fx > thresh).astype(int)
        if shift_y:
            logging.debug("flip random labels")
            idx = self._random_state.choice(len(y), int(ye * len(y)), replace=False)
            y[idx] = 1
        return x, y
    @property
    def means(self):
        return self._means
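
# Illustrative sketch (not part of the original module; parameter values are
# arbitrary): generate a covariate-shifted sample from a two-cluster
# generator.
#
#   >>> gen = CausalClusterGenerator(dim=2, n_clusters=2, data_seed=0)
#   >>> x, y = gen.generate_sample(100, shift_x=True, te=0.3, re=np.pi / 6)
#   >>> x.shape, y.shape
#   ((100, 2), (100,))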
def get_datashift_params(data_shift=None, ye=0.5, te=None, se=None, re=None):
    """
    This factory simplifies the parameter generation process for a number of
    use cases. The parameters generated can be used with
    CausalClusterGenerator.generate_sample.
    """
    data_shift_types = dict(
        no_shift=dict(
            shift_y=False,
            shift_x=False,
            shift_conditional_x=False,
            shift_conditional_y=False,
            y_cause_x=True,
            ye=ye,
            te=te,
            se=se,
            re=re,
        ),
        covariate_shift_y=dict(
            y_cause_x=True, shift_y=False, shift_x=True, re=re, te=te, se=se
        ),
        cond_covariate_shift_y=dict(
            y_cause_x=True,
            shift_y=False,
            shift_conditional_x=True,
            shift_x=False,
            re=re,
            te=te,
            se=se,
        ),
        covariate_shift_x=dict(
            y_cause_x=True, shift_y=False, shift_x=True, re=re, te=te, se=se
        ),
        label_shift=dict(y_cause_x=True, shift_y=True, shift_x=False, ye=ye),
        label_and_covariate_shift=dict(
            y_cause_x=True, shift_y=True, shift_x=True, ye=ye, re=re, te=te, se=se
        ),
        label_and_cond_covariate_shift=dict(
            y_cause_x=True,
            shift_y=True,
            shift_conditional_x=True,
            ye=ye,
            re=re,
            te=te,
            se=se,
        ),
        covariate_and_cond_label_shift=dict(
            y_cause_x=False,
            shift_x=True,
            shift_conditional_y=True,
            ye=ye,
            re=re,
            te=te,
            se=se,
        ),
    )
    if data_shift is not None:
        return data_shift_types[data_shift]
    return list(data_shift_types.keys())
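
# Illustrative sketch (not part of the original module): list the available
# shift names, then feed one parameter set to generate_sample; the ye value
# is arbitrary.
#
#   >>> get_datashift_params()  # no argument: list the supported shift names
#   ['no_shift', 'covariate_shift_y', ...]
#   >>> params = get_datashift_params("label_shift", ye=0.2)
#   >>> x, y = CausalClusterGenerator(data_seed=0).generate_sample(100, **params)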
class CausalBlobs(torch.utils.data.Dataset):
    """
    CausalGaussianBlobs dataset.

    MNIST-like dataset that generates blobs in a given environment setting:

    - original cluster params set by the `cluster_params` dictionary
    - environment and cluster generation params given by the `transform`
      dictionary
    """

    raw_folder = "BlobsData"

    def __init__(
        self,
        data_path,  # for compatibility with other datasets API
        train=True,
        transform=None,
        download=True,
        cluster_params=None,
        n_samples=300,
    ):
        """Init Blobs dataset."""
        super(CausalBlobs, self).__init__()
        self.root = data_path
        self.transform = transform if transform is not None else {}
        self.train = train  # training set or test set
        self.n_samples = n_samples
        if cluster_params is None:
            self.cluster_params = dict(
                n_clusters=2, data_seed=0, radius=0.02, centers=None, proba_classes=0.5
            )
        else:
            self.cluster_params = cluster_params
        tmp_cluster_params = self.cluster_params.copy()
        if isinstance(tmp_cluster_params.get("centers"), np.ndarray):
            tmp_cluster_params["centers"] = tmp_cluster_params["centers"].tolist()
        cluster_hash = xp.param_to_hash(tmp_cluster_params)
        transform_hash = xp.param_to_hash(transform)
        self.data_dir = os.path.join(cluster_hash, transform_hash)
        root_dir = os.path.join(self.root, self.raw_folder)
        os.makedirs(root_dir, exist_ok=True)
        xp.record_hashes(
            os.path.join(root_dir, "parameters.json"),
            f"{cluster_hash}/{transform_hash}",
            {"cluster_params": tmp_cluster_params, "transform": transform},
        )
        self.training_file = "causal_blobs_train.pt"
        self.test_file = "causal_blobs_test.pt"
        self._cluster_gen = None
        if not self._check_exists() or download:
            self.create_on_disk()
        if not self._check_exists():
            raise RuntimeError("Dataset not found.")
        if self.train:
            self.data, self.targets = torch.load(
                os.path.join(
                    self.root, self.raw_folder, self.data_dir, self.training_file
                )
            )
        else:
            self.data, self.targets = torch.load(
                os.path.join(self.root, self.raw_folder, self.data_dir, self.test_file)
            )

    def __getitem__(self, index):
        """Get sample and target for data loader.

        Args:
            index (int): Index

        Returns:
            tuple: (data, target) where target is index of the target class.
        """
        data, target = self.data[index], self.targets[index]
        return data, target

    def __len__(self):
        """Return size of dataset."""
        return len(self.data)

    def _check_exists(self):
        return os.path.exists(
            os.path.join(self.root, self.raw_folder, self.data_dir, self.training_file)
        ) and os.path.exists(
            os.path.join(self.root, self.raw_folder, self.data_dir, self.test_file)
        )
    def create_on_disk(self):
        file_path = os.path.join(self.root, self.raw_folder, self.data_dir)
        # make data dirs
        os.makedirs(file_path, exist_ok=True)
        self._cluster_gen = CausalClusterGenerator(**self.cluster_params)
        X_tr, y_tr = self._cluster_gen.generate_sample(self.n_samples, **self.transform)
        Xtr = torch.from_numpy(X_tr).float()
        ytr = torch.from_numpy(y_tr).long()
        training_set = (Xtr, ytr)
        X_te, y_te = self._cluster_gen.generate_sample(self.n_samples, **self.transform)
        Xte = torch.from_numpy(X_te).float()
        yte = torch.from_numpy(y_te).long()
        test_set = (Xte, yte)
        with open(os.path.join(file_path, self.training_file), "wb") as f:
            torch.save(training_set, f)
        with open(os.path.join(file_path, self.test_file), "wb") as f:
            torch.save(test_set, f)
    def delete_from_disk(self):
        file_path = os.path.join(self.root, self.raw_folder, self.data_dir)
        os.remove(os.path.join(file_path, self.training_file))
        os.remove(os.path.join(file_path, self.test_file))
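
# Illustrative sketch (not part of the original module; the path
# "/tmp/blobs" and the parameter values are arbitrary): materialise a blobs
# dataset on disk and iterate over it with a PyTorch DataLoader.
#
#   >>> from torch.utils.data import DataLoader
#   >>> ds = CausalBlobs(
#   ...     data_path="/tmp/blobs",
#   ...     train=True,
#   ...     transform=get_datashift_params("no_shift"),
#   ...     cluster_params=dict(n_clusters=2, data_seed=0, radius=0.02,
#   ...                         centers="fixed", proba_classes=0.5),
#   ... )
#   >>> xb, yb = next(iter(DataLoader(ds, batch_size=32)))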
class CausalBlobsDataAccess(DatasetAccess):
    def __init__(self, data_path, transform, download, cluster_params, n_samples):
        super().__init__(n_classes=cluster_params.get("n_clusters", 2))
        self._data_path = data_path
        self._transform = transform
        self._download = download
        self._cluster_params = cluster_params
        self._n_samples = n_samples

    def get_train(self):
        return CausalBlobs(
            data_path=self._data_path,
            train=True,
            transform=self._transform,
            download=self._download,
            cluster_params=self._cluster_params,
            n_samples=self._n_samples,
        )

    def get_test(self):
        return CausalBlobs(
            data_path=self._data_path,
            train=False,
            transform=self._transform,
            download=self._download,
            cluster_params=self._cluster_params,
            n_samples=self._n_samples,
        )
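
# Illustrative sketch (not part of the original module; the arguments mirror
# the CausalBlobs example above and are arbitrary): wrap the dataset behind
# the DatasetAccess API used by the rest of the library.
#
#   >>> access = CausalBlobsDataAccess(
#   ...     data_path="/tmp/blobs",
#   ...     transform=get_datashift_params("covariate_shift_x", te=0.5),
#   ...     download=True,
#   ...     cluster_params=dict(n_clusters=2, data_seed=0, radius=0.02,
#   ...                         centers="fixed", proba_classes=0.5),
#   ...     n_samples=300,
#   ... )
#   >>> train_ds, test_ds = access.get_train(), access.get_test()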