Tabular Synthetic Data
This section demonstrates how to use the Tabular Synthetic Data Generator module in ydata-sdk.
Don't forget to set up your license key before running the example.
Example Code
"""Example using YData's regular data synthesizer."""
from pathlib import Path
from examples.local import setting_dask_env
from ydata.connectors import GCSConnector
from ydata.connectors.filetype import FileType
from ydata.metadata import Metadata
from ydata.synthesizers.regular.model import RegularSynthesizer
from ydata.utils.formats import read_json
# When running this example locally, set the environment variable
# RUNNING_ENV to LOCAL before executing it.
# NOTE(review): setting_dask_env presumably configures Dask for that
# environment — confirm in examples.local.
setting_dask_env()
def get_token(token_name: str):
    """Load a token file from the ``.secrets`` directory.

    The ``.secrets`` directory is resolved relative to this file (three
    ``parent`` steps up from the file's absolute path), so the lookup does
    not depend on the current working directory.

    Args:
        token_name: File name of the token inside ``.secrets``.

    Returns:
        Whatever ``read_json`` returns for the resolved token path.
    """
    # Anchor on this file's own location instead of the cwd.
    base_dir = Path(__file__).absolute().parent.parent.parent
    secrets_file = base_dir.joinpath(".secrets", token_name)
    return read_json(secrets_file)
if __name__ == "__main__":
    # Load the Google Cloud Storage access token from the .secrets folder.
    gcs_credentials = get_token("gcs_credentials.json")

    # Create the connector used for both reading and writing.
    gcs = GCSConnector("bucketname", keyfile_dict=gcs_credentials)

    # Fetch the original CSV file from GCS.
    source_path = "gs://path-to-file/data.csv"
    original = gcs.read_file(source_path, file_type=FileType.CSV)

    # Create the synthesizer.
    synthesizer = RegularSynthesizer()

    # Compute the metadata of the original data.
    original_metadata = Metadata(original)

    # Train the synthesizer on the data and its metadata.
    synthesizer.fit(original, original_metadata)

    # Once training is done, request 1000 synthetic samples.
    synthetic = synthesizer.sample(1000)

    # Write the synthetic sample through the same connector that
    # provided the original data.
    target_path = "gs://path-to-file/synth_sample.csv"
    gcs.write_file(
        data=synthetic,
        path=target_path,
        file_type=FileType.CSV,
    )

    # Persist the trained synthesizer to disk.
    model_path = "./test.pkl"
    synthesizer.save(model_path)

    # Reload the stored synthesizer and sample from it again.
    restored = RegularSynthesizer.load(model_path)
    restored_sample = restored.sample(100)

    # Regular synthesizer trained without a segmentation strategy.
    unsegmented = RegularSynthesizer()
    unsegmented = unsegmented.fit(
        X=original, metadata=original_metadata, segment_by=False
    )