Synthetic Data with Privacy Levels
This section demonstrates how to use the Privacy Levels module in ydata-sdk.
Don't forget to set up your license key before running the example.
Example Code
"""
Usage of the privacy input for ydata synthetic data generators
"""
import pandas as pd
from ydata.datascience.common import PrivacyLevel
from ydata.dataset import Dataset
from ydata.metadata import Metadata
from ydata.report import SyntheticDataProfile
from ydata.report.reports.report_type import ReportType
from ydata.synthesizers.regular.model import RegularSynthesizer
# Privacy levels to run the example with; one synthesizer is trained per entry.
_PRIVACY_LEVELS = [
    PrivacyLevel.HIGH_FIDELITY,
    PrivacyLevel.BALANCED_PRIVACY_FIDELITY,
    PrivacyLevel.HIGH_PRIVACY,
]
if __name__ == "__main__":
    # Load the source table and wrap it in the SDK's Dataset / Metadata objects.
    source_frame = pd.read_csv("../data.csv")
    real_dataset = Dataset(source_frame)
    metadata = Metadata(real_dataset)

    # Train a fresh synthesizer for every privacy level and print its scores.
    for privacy_level in _PRIVACY_LEVELS:
        synthesizer = RegularSynthesizer()
        synthesizer.fit(
            real_dataset,
            metadata=metadata,
            privacy_level=privacy_level,
        )

        # NOTE(review): the holdout/train splits are read from the private
        # `_holdout` attribute of the synthesizer -- confirm whether a public
        # accessor exists before relying on this outside of an example.
        holdout_frame = synthesizer._holdout._data.compute()

        # Sample as many synthetic rows as there are rows in the holdout split.
        synthetic_dataset = synthesizer.sample(n_samples=len(holdout_frame))

        holdout_dataset = Dataset(holdout_frame)
        train_dataset = Dataset(synthesizer._holdout._train_data.compute())

        profile = SyntheticDataProfile(report_type=ReportType.TABULAR)

        # Map every column name to the datatype recorded in the metadata.
        data_types = {
            column_name: column.datatype
            for column_name, column in metadata.columns.items()
        }

        # The holdout split is passed as the "real" side of the comparison;
        # no PDF is requested.
        profile.generate_report(
            real=holdout_dataset,
            synth=synthetic_dataset,
            target="diagnosis",
            data_types=data_types,
            training_data=train_dataset,
            metadata=metadata,
            pdf=False,
        )

        summary_metrics = profile.get_summary()

        # Report the three summary scores for this privacy level.
        print(f"\n{privacy_level.name}")
        print(f"Privacy: {summary_metrics['privacy']:.2f}")
        print(f"Fidelity: {summary_metrics['fidelity']:.2f}")
        print(f"Utility: {summary_metrics['utility']:.2f}\n")