Skip to content

Tabular Data Report

This section demonstrates how to use the Tabular Data Profiling Report module in ydata-sdk to generate the profiling report and the PDF report for synthetic data quality.

Example Code

"""
    Example of synthetic data profile report for the Cardio Dataset
"""
from ydata.connectors import GCSConnector
from ydata.metadata.metadata import Metadata
from ydata.report import SyntheticDataProfile
from ydata.synthesizers.regular.model import RegularSynthesizer
from ydata.utils.formats import read_json

def get_token(token_path: str):
    """Load a token from the JSON file at ``token_path``.

    Args:
        token_path: Path of the JSON file that holds the token.

    Returns:
        Whatever ``read_json`` produces for that file; this example passes
        it to ``GCSConnector`` as ``keyfile_dict``.
    """
    token = read_json(token_path)
    return token


if __name__ == "__main__":
    # Load the credentials token, then open the GCS connection with it.
    credentials = get_token("gcs_credentials.json")
    connector = GCSConnector(
        project_id="ydatasynthetic",
        keyfile_dict=credentials,
    )

    # Read the Cardio .csv file from the bucket and build its metadata.
    real_data = connector.read_file("gs://ydata_testdata/tabular/cardio/data.csv")
    real_metadata = Metadata(real_data)

    # Fit a synthesizer on the original data and draw 20,000 synthetic rows.
    synthesizer = RegularSynthesizer()
    synthesizer.fit(real_data, metadata=real_metadata)
    synthetic_data = synthesizer.sample(20_000)

    # TODO: get the metadata from the synth, or have an example of how to
    # create a metadata object
    # TODO target variable validation

    # Compare the original and synthetic datasets, using "cardio" as the
    # target column and the data types exposed by the fitted synthesizer.
    report = SyntheticDataProfile(
        real_data,
        synthetic_data,
        metadata=real_metadata,
        target="cardio",
        data_types=synthesizer.data_types,
    )

    # Write the PDF report to the current working directory.
    report.generate_report(output_path="./cardio_report_example.pdf")