Example with profiling and synthetic data generation
This section demonstrates how to use the Profile Report and the advanced features of Tabular Synthetic Data Generation with ydata-sdk.
Example Code
"""Example using YData's profiling report, tabular synthetic data generator and metrics report."""
import os
from ydata.connectors import LocalConnector
from ydata.connectors.filetype import FileType
from ydata.profiling import ProfileReport
from ydata.metadata import Metadata
from ydata.synthesizers.regular.model import RegularSynthesizer
from ydata.report import SyntheticDataProfile
import pandas as pd
import numpy as np
# Fields to be anonymized before the synthesis, keyed by column name.
# `fnlwgt` uses a rule of type 'regex' whose pattern is exactly six digits.
_fnlwgt_rule = {'type': 'regex', 'regex': r'[0-9]{6}'}
anonymizer_config = {'fnlwgt': _fnlwgt_rule}
# Definition for the business rules
def get_education_mapping(education: pd.Series, education_num: pd.Series) -> pd.Series:
    """Map each education label to its education level.

    The relation between the education label and the education level is
    static, so the level is looked up in a fixed table.

    Args:
        education: Education labels, e.g. ``'Bachelors'`` or ``'HS-grad'``.
        education_num: Companion education-level column. It is placed next to
            ``education`` in the working frame, but its values are not used
            to compute the result.

    Returns:
        A Series named ``'Education'`` holding the mapped level for every
        label present in the table, and the string ``'Unknown'`` for every
        label that is not.
    """
    code_mapping = {
        'Preschool': 1, '1st-4th': 2, '5th-6th': 3, '7th-8th': 4,
        '9th': 5, '10th': 6, '11th': 7, '12th': 8, 'HS-grad': 9,
        'Some-college': 10, 'Assoc-voc': 11, 'Assoc-acdm': 12,
        'Bachelors': 13, 'Masters': 14, 'Prof-school': 15, 'Doctorate': 16,
    }

    # Build a frame from the input series
    df = pd.DataFrame({'Education': education, 'education.num': education_num})

    # Map the 'Education' column through the lookup table; labels without a
    # match come back as NaN and are deliberately replaced with "Unknown".
    return df['Education'].map(code_mapping).fillna('Unknown')
if __name__ == "__main__":
    # Set up my YData License Key
    # NOTE: the value below is a placeholder; replace it with your own key.
    os.environ["YDATA_LICENSE_KEY"] = "YDATA_LICENSE_KEY"

    # Example using the LocalConnector
    # Check the list of Connectors to connect directly to your data source
    connector = LocalConnector()

    # List all the directories/files
    files_dirs = connector.list(path="./data")

    # Read a file from a given path
    # Dataset used: https://www.kaggle.com/datasets/uciml/adult-census-income
    data = connector.read_file("./data/census.csv", file_type=FileType.CSV)
    print(data.head())

    # Calculate the metadata and print each entry of its summary
    metadata = Metadata(data)
    print(metadata)
    for item, values in metadata.summary.items():
        # The ANSI escape codes underline the entry title in the terminal
        print('\n\033[4m'+item+'\033[0m')
        print(values)

    # Profile your data to understand what changes need to be made
    report = ProfileReport(data, title='My first Profile Report using YData')
    report.to_file('data_profiling.html')  # This will save the report as a shareable HTML file

    # Choose the privacy level from
    # PRIVACY_LEVELS = ["HIGH_FIDELITY",
    #                   "BALANCED_PRIVACY_FIDELITY",
    #                   "HIGH_PRIVACY"]
    privacy_level="HIGH_FIDELITY"

    # Add your Calculated Features to the configuration
    calculated_features = [
        {
            "calculated_features": "education_mapping",
            "function": get_education_mapping,
            "calculated_from": ["Education", "education.num"],
        },
    ]

    # Instantiate a synthesizer
    synth = RegularSynthesizer()

    # Fit the model to the provided data
    synth.fit(data,
              metadata=metadata,
              anonymize=anonymizer_config,
              calculated_features=calculated_features,
              privacy_level=privacy_level)

    # Generate data samples by the end of the synth process
    synth_sample = synth.sample(n_samples=len(data))
    synth_metadata = Metadata(synth_sample)
    print(synth_metadata)

    # Update the Dataset variable type and the Metadata datatype, because the
    # anonymized column became a string and it is supposed to be numerical
    synth_sample.update_types([{'column':"fnlwgt", 'vartype': "int"}])
    synth_metadata = Metadata(synth_sample)
    synth_metadata.update_datatypes({"fnlwgt": "numerical"})
    print(synth_metadata)

    # Profile your synthetic data
    report_synth = ProfileReport(synth_sample, title='My first Profile Report for Synthetic Data using YData')
    report_synth.to_file('synthetic_profiling.html')  # This will save the report as a shareable HTML file

    # Compare the synthetic data with the original data
    comparison_report = report.compare(report_synth)
    comparison_report.to_file("comparison_profiling_report.html")  # This will save the report as a shareable HTML file

    # Write parquet file to your destination
    connector.write_file(synth_sample.to_pandas(), path="./data/destination/data.parquet", file_type=FileType.PARQUET)

    # Generate the Synthetic Data Metrics Report
    # If your dataset has a TARGET variable, you should pass it as an argument
    metrics_report = SyntheticDataProfile(
        data,
        synth_sample,
        metadata=metadata,
        data_types=synth.data_types)
    metrics_report.generate_report(
        output_path="./synthetic_data_metrics_report.pdf",  # This will save the report as a shareable PDF file
    )