Skip to content

Tabular Synthetic Data with Conditions

This section demonstrates how to use the Tabular Synthetic Data Generator module together with the advanced Conditions feature in ydata-sdk.

Read more about Conditional Synthetic Data in this blog post.

Example Code

"""Example using YData's SDK regular & Conditions data synthesizer."""
import os
from ydata.connectors import GCSConnector
from ydata.connectors.filetype import FileType
from ydata.profiling import ProfileReport
from ydata.metadata import Metadata
from ydata.synthesizers.regular.model import RegularSynthesizer
from ydata.report import SyntheticDataProfile
from ydata.utils.formats import read_json


def get_token(token_path: str):
    """Load a token from the JSON file at ``token_path``.

    The file is read with ``read_json`` and its result is returned unchanged.
    """
    token = read_json(token_path)
    return token


if __name__ == "__main__":

    # Register the YData SDK license key in the environment.
    # NOTE(review): the value below looks like a placeholder — presumably it
    # has to be replaced with a real key before running.
    os.environ["YDATA_LICENSE_KEY"] = "YDATA_LICENSE_KEY"

    # Load the Google Cloud Storage access token and open a connector with it
    token = get_token("gcs_credentials.json")
    connector = GCSConnector("bucket_name", keyfile_dict=token)

    # Read the original CSV file from the GCS storage
    source_path = "gs://path-to-file/data.csv"
    data = connector.read_file(source_path, file_type=FileType.CSV)

    # Instantiate the synthesizer, then calculate the metadata of the data
    synth = RegularSynthesizer()
    metadata = Metadata(data)

    # Profile the original data to understand what changes need to be made;
    # the report is saved as a shareable HTML file
    report = ProfileReport(data, title="My first Profile Report using YData")
    report.to_file("data_profiling.html")

    # Fit the model to the data, naming the columns that the sampling step
    # below conditions on
    condition_columns = ["sex", "native-country", "age"]
    synth.fit(data, metadata, condition_on=condition_columns)

    # Conditions for the generated sample: "sex" and "native-country" list
    # categories with a percentage each, "age" gives a minimum and a maximum
    sampling_conditions = {
        "sex": {
            "categories": [
                {"category": "Female", "percentage": 0.7},
            ]
        },
        "native-country": {
            "categories": [
                {"category": "United-States", "percentage": 0.6},
                {"category": "Mexico", "percentage": 0.4},
            ]
        },
        "age": {"minimum": 55, "maximum": 60},
    }

    # Generate 500 synthetic rows under those conditions
    synth_sample = synth.sample(n_samples=500, condition_on=sampling_conditions)

    # Write the sample through the same connector used for the original data
    connector.write_file(
        data=synth_sample,
        path="gs://file-to-path/synth_sample.csv",
        file_type=FileType.CSV,
    )

    # Profile the synthetic data; saved as a shareable HTML file
    report_synth = ProfileReport(
        synth_sample,
        title="My first Profile Report for Synthetic Data using YData",
    )
    report_synth.to_file("synthetic_profiling.html")

    # Compare the synthetic data with the original data; saved as a
    # shareable HTML file
    comparison_report = report.compare(report_synth)
    comparison_report.to_file("comparison_profiling_report.html")

    # Generate the Synthetic Data Metrics Report.
    # If your dataset has a TARGET variable, you should pass it as an argument
    metrics_report = SyntheticDataProfile(
        data, synth_sample, metadata=metadata, data_types=synth.data_types
    )

    # Save the metrics report as a shareable PDF file
    metrics_report.generate_report(
        output_path="./synthetic_data_metrics_report.pdf"
    )