Skip to content

Tabular Synthetic Data

This section demonstrates how to use the Tabular Synthetic Data Generator module in ydata-sdk.

Don't forget to set up your license key before running the example:

    import os

    # Replace the '{add-your-key}' placeholder with your actual license key.
    os.environ['YDATA_LICENSE_KEY'] = '{add-your-key}'

Example Code

"""Example using YData's regular data synthesizer."""
from pathlib import Path

from examples.local import setting_dask_env
from ydata.connectors import GCSConnector
from ydata.connectors.filetype import FileType
from ydata.metadata import Metadata
from ydata.synthesizers.regular.model import RegularSynthesizer
from ydata.utils.formats import read_json

# When running this example locally, set the RUNNING_ENV environment variable
# to 'LOCAL' before executing the script.
# NOTE(review): the original note was written as 'RUNNING_ENV'=='LOCAL' --
# confirm the exact variable name and value against examples.local.
setting_dask_env()


def get_token(token_name: str):
    """Load a token file from the ``.secrets`` directory.

    The ``.secrets`` directory is resolved relative to this file (two
    directories above the one containing it) rather than the current working
    directory, so the example can be launched from anywhere.

    Args:
        token_name: File name of the token inside ``.secrets``.

    Returns:
        Whatever ``read_json`` returns for the resolved token path.
    """
    this_file = Path(__file__).absolute()
    secrets_parent = this_file.parent.parent.parent
    return read_json(secrets_parent / ".secrets" / token_name)


if __name__ == "__main__":

    # Load the access token used for Google Cloud Storage
    gcs_token = get_token("gcs_credentials.json")

    # Create the connector for the bucket
    gcs = GCSConnector("bucketname", keyfile_dict=gcs_token)
    # Read the source CSV file through the connector
    dataset = gcs.read_file("gs://path-to-file/data.csv", file_type=FileType.CSV)

    # Create the synthesizer
    regular_synth = RegularSynthesizer()

    # Compute the metadata for the loaded data
    dataset_metadata = Metadata(dataset)

    # Fit the synthesizer on the data and its metadata
    regular_synth.fit(dataset, dataset_metadata)

    # Draw a sample from the fitted synthesizer (requested size: 1000)
    generated = regular_synth.sample(1000)

    # Write the sample back through the same connector the data came from
    gcs.write_file(
        data=generated, path="gs://path-to-file/synth_sample.csv", file_type=FileType.CSV
    )

    # Persist the fitted synthesizer to disk
    regular_synth.save("./test.pkl")

    # Reload the stored synthesizer and sample from it (requested size: 100)
    restored_synth = RegularSynthesizer.load("./test.pkl")
    restored_sample = restored_synth.sample(100)

    # Second synthesizer, fitted with segmentation turned off (segment_by=False)
    unsegmented_synth = RegularSynthesizer()

    unsegmented_synth = unsegmented_synth.fit(
        X=dataset, metadata=dataset_metadata, segment_by=False
    )