Skip to content

Tabular Synthetic Data

This section demonstrates how to use the Tabular Synthetic Data Generator module in ydata-sdk.

Don't forget to set up your license key before running the example:

    import os

    # Expose the license key through the YDATA_LICENSE_KEY environment variable.
    # '{add-your-key}' is a placeholder: replace it with your own key.
    os.environ['YDATA_LICENSE_KEY'] = '{add-your-key}'

Example Code

"""Example using YData's regular data synthesizer."""
from pathlib import Path

from ydata.connectors import GCSConnector
from ydata.connectors.filetype import FileType
from ydata.metadata import Metadata
from ydata.synthesizers.regular.model import RegularSynthesizer
from ydata.utils.formats import read_json


def get_token(token_name: str):
    """Load the token file named ``token_name`` from the ``.secrets`` folder.

    The ``.secrets`` folder is resolved relative to this file (three
    ``parent`` steps up from it) instead of the current working directory,
    so the lookup works regardless of where the script is launched from.
    The resolved path is handed to ``read_json`` and its result is returned.
    """
    secrets_dir = Path(__file__).absolute().parent.parent.parent / ".secrets"
    return read_json(secrets_dir / token_name)


if __name__ == "__main__":

    # Load the access token for Google Cloud Storage from the .secrets folder
    gcs_token = get_token("gcs_credentials.json")

    # Create the connector for the bucket
    gcs = GCSConnector("bucketname", keyfile_dict=gcs_token)

    # Read the original CSV file from GCS
    original = gcs.read_file(
        "gs://path-to-file/data.csv",
        file_type=FileType.CSV,
    )

    # Create the synthesizer
    synthesizer = RegularSynthesizer()

    # Compute the metadata of the original data
    original_meta = Metadata(original)

    # Fit the synthesizer on the original data and its metadata
    synthesizer.fit(original, original_meta)

    # Once fitting is done, request a sample of 1000 from the synthesizer
    synthetic = synthesizer.sample(1000)

    # Write the sample through the same connector used to read the original data
    gcs.write_file(
        data=synthetic,
        path="gs://path-to-file/synth_sample.csv",
        file_type=FileType.CSV,
    )

    # Store the fitted synthesizer
    model_path = "./test.pkl"
    synthesizer.save(model_path)

    # Load the stored synthesizer back and sample from it
    restored = RegularSynthesizer.load(model_path)
    restored_sample = restored.sample(100)

    # Regular synthesizer with no segment strategy
    plain_synth = RegularSynthesizer()
    plain_synth = plain_synth.fit(
        X=original,
        metadata=original_meta,
        segment_by=False,
    )