Tabular Synthetic Data
This section demonstrates how to use the Tabular Synthetic Data Generator module in ydata-sdk.
Don't forget to set up your license key before running the example.
Example Code
"""Example using YData's regular data synthesizer."""
from pathlib import Path
from ydata.connectors import GCSConnector
from ydata.connectors.filetype import FileType
from ydata.metadata import Metadata
from ydata.synthesizers.regular.model import RegularSynthesizer
from ydata.utils.formats import read_json
def get_token(token_name: str):
    """Load a token file from the ``.secrets`` directory.

    The ``.secrets`` directory is looked up relative to this source file
    (two levels above the directory that contains it), so the lookup does
    not depend on the current working directory.

    Args:
        token_name: File name of the token inside ``.secrets``.

    Returns:
        Whatever ``read_json`` returns for the resolved token path.
    """
    # Anchor on this file's absolute location instead of the cwd.
    this_file = Path(__file__).absolute()
    base_dir = this_file.parent.parent.parent
    secrets_file = base_dir / ".secrets" / token_name
    return read_json(secrets_file)
if __name__ == "__main__":
    # Load the Google Cloud Storage access credentials from .secrets.
    gcs_credentials = get_token("gcs_credentials.json")

    # Create the connector for the bucket using those credentials.
    gcs = GCSConnector("bucketname", keyfile_dict=gcs_credentials)

    # Read the source CSV file from GCS.
    source_uri = "gs://path-to-file/data.csv"
    training_data = gcs.read_file(source_uri, file_type=FileType.CSV)

    # Create the synthesizer, then compute the metadata of the dataset.
    synthesizer = RegularSynthesizer()
    dataset_metadata = Metadata(training_data)

    # Fit the synthesizer on the data and its metadata.
    synthesizer.fit(training_data, dataset_metadata)

    # Once fitted, request a synthetic sample of size 1000.
    generated = synthesizer.sample(1000)

    # Write the sample through the same connector the data was read from.
    gcs.write_file(
        data=generated,
        path="gs://path-to-file/synth_sample.csv",
        file_type=FileType.CSV,
    )

    # Persist the fitted synthesizer to disk.
    model_path = "./test.pkl"
    synthesizer.save(model_path)

    # Reload the stored synthesizer and sample from the reloaded copy.
    restored = RegularSynthesizer.load(model_path)
    restored_sample = restored.sample(100)

    # Second synthesizer, fitted with no segment strategy (segment_by=False).
    # The result of fit() is bound back to the same name.
    unsegmented = RegularSynthesizer()
    unsegmented = unsegmented.fit(
        X=training_data, metadata=dataset_metadata, segment_by=False
    )