Tabular Synthetic Data with Conditions
This section demonstrates how to use the Tabular Synthetic Data Generator module together with the advanced Conditions feature in ydata-sdk.
Read more about Conditional Synthetic Data in this blog post.
Example Code
"""Example using YData's SDK regular & Conditions data synthesizer."""
import os
from ydata.connectors import GCSConnector
from ydata.connectors.filetype import FileType
from ydata.profiling import ProfileReport
from ydata.metadata import Metadata
from ydata.synthesizers.regular.model import RegularSynthesizer
from ydata.report import SyntheticDataProfile
from ydata.utils.formats import read_json
def get_token(token_path: str):
    """Load a token from the JSON file at ``token_path``.

    Thin convenience wrapper: the path is handed to ``read_json`` and whatever
    that helper returns is passed back to the caller unchanged.
    """
    token = read_json(token_path)
    return token
if __name__ == "__main__":
    # Register the YData SDK license key in the environment.
    # NOTE(review): the value below looks like a placeholder - replace it with
    # a real key before running.
    os.environ["YDATA_LICENSE_KEY"] = "YDATA_LICENSE_KEY"

    # Load the Google Cloud Storage access token and open a connector with it.
    gcs_token = get_token("gcs_credentials.json")
    gcs = GCSConnector("bucket_name", keyfile_dict=gcs_token)

    # Read the original dataset (CSV) through the connector.
    data = gcs.read_file("gs://path-to-file/data.csv", file_type=FileType.CSV)

    # Create the synthesizer and compute the metadata of the original data.
    synthesizer = RegularSynthesizer()
    metadata = Metadata(data)

    # Profile the original data to understand what changes need to be made,
    # and save the report as a shareable HTML file.
    original_profile = ProfileReport(
        data, title="My first Profile Report using YData"
    )
    original_profile.to_file("data_profiling.html")

    # Fit the synthesizer, declaring the columns that will be conditioned on.
    conditioned_columns = ["sex", "native-country", "age"]
    synthesizer.fit(data, metadata, condition_on=conditioned_columns)

    # Conditions applied when sampling: category/percentage pairs for the two
    # categorical columns and a minimum/maximum range for "age".
    sampling_conditions = {
        "sex": {
            "categories": [
                {"category": "Female", "percentage": 0.7},
            ],
        },
        "native-country": {
            "categories": [
                {"category": "United-States", "percentage": 0.6},
                {"category": "Mexico", "percentage": 0.4},
            ],
        },
        "age": {"minimum": 55, "maximum": 60},
    }

    # Draw 500 synthetic samples under those conditions.
    synth_sample = synthesizer.sample(
        n_samples=500,
        condition_on=sampling_conditions,
    )

    # Write the synthetic sample through the same connector used for reading.
    gcs.write_file(
        data=synth_sample,
        path="gs://file-to-path/synth_sample.csv",
        file_type=FileType.CSV,
    )

    # Profile the synthetic data and save the report as a shareable HTML file.
    synthetic_profile = ProfileReport(
        synth_sample,
        title="My first Profile Report for Synthetic Data using YData",
    )
    synthetic_profile.to_file("synthetic_profiling.html")

    # Compare the original profile with the synthetic one and save the
    # comparison as a shareable HTML file.
    profile_comparison = original_profile.compare(synthetic_profile)
    profile_comparison.to_file("comparison_profiling_report.html")

    # Generate the Synthetic Data Metrics Report.
    # If your dataset has a TARGET variable, you should pass it as an argument.
    quality_report = SyntheticDataProfile(
        data,
        synth_sample,
        metadata=metadata,
        data_types=synthesizer.data_types,
    )
    # Save the metrics report as a shareable PDF file.
    quality_report.generate_report(
        output_path="./synthetic_data_metrics_report.pdf",
    )