Skip to content

Synthetic Data with Privacy Levels

This section demonstrates how to use the Privacy Levels module in ydata-sdk.

Don't forget to set your license key before running the example:

    import os

    # Set the license key through the YDATA_LICENSE_KEY environment variable.
    # '{add-your-key}' is a placeholder: replace it with your own key.
    os.environ['YDATA_LICENSE_KEY'] = '{add-your-key}'

Example Code

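"""
Compare the privacy levels accepted by the ydata synthetic data generators.

For each privacy level, a synthesizer is fitted on the real data and the
privacy, fidelity and utility scores of the generated data are printed.
"""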
import pandas as pd

from ydata.datascience.common import PrivacyLevel
from ydata.dataset import Dataset
from ydata.metadata import Metadata
from ydata.report import SyntheticDataProfile
from ydata.report.reports.report_type import ReportType
from ydata.synthesizers.regular.model import RegularSynthesizer

# Privacy levels the example iterates over; one synthesizer is fitted per level.
_PRIVACY_LEVELS = [
    PrivacyLevel.HIGH_FIDELITY,
    PrivacyLevel.BALANCED_PRIVACY_FIDELITY,
    PrivacyLevel.HIGH_PRIVACY,
]

if __name__ == "__main__":
    # Load the real data once; the same Dataset/Metadata pair is reused for
    # every privacy level in the loop below.
    real_dataset = Dataset(pd.read_csv("../data.csv"))
    metadata = Metadata(real_dataset)

    for privacy_level in _PRIVACY_LEVELS:
        # Fit a fresh synthesizer for the privacy level under evaluation.
        synthesizer = RegularSynthesizer()
        synthesizer.fit(
            real_dataset,
            metadata=metadata,
            privacy_level=privacy_level,
        )

        # NOTE: the holdout/train split is read from private synthesizer
        # attributes. The synthetic sample is drawn with as many rows as the
        # holdout so the two sets compared in the report have equal size.
        holdout_frame = synthesizer._holdout._data.compute()
        n_holdout_rows = len(holdout_frame)
        synthetic_dataset = synthesizer.sample(n_samples=n_holdout_rows)
        holdout_dataset = Dataset(holdout_frame)
        train_frame = synthesizer._holdout._train_data.compute()
        train_dataset = Dataset(train_frame)

        # Map each column name to the datatype recorded in the metadata.
        data_types = {
            name: column.datatype
            for name, column in metadata.columns.items()
        }

        # Build the tabular quality report: synthetic data vs. the holdout,
        # with the training data passed alongside. No PDF is requested.
        profile = SyntheticDataProfile(report_type=ReportType.TABULAR)
        profile.generate_report(
            real=holdout_dataset,
            synth=synthetic_dataset,
            target="diagnosis",
            data_types=data_types,
            training_data=train_dataset,
            metadata=metadata,
            pdf=False,
        )

        # Print the headline scores for this privacy level.
        summary_metrics = profile.get_summary()
        print(f"\n{privacy_level.name}")
        print(f"Privacy: {summary_metrics['privacy']:.2f}")
        print(f"Fidelity: {summary_metrics['fidelity']:.2f}")
        print(f"Utility: {summary_metrics['utility']:.2f}\n")