Skip to content

MultiTable Synthetic Data with Attribute Tables

This section demonstrates how to use attribute tables in the MultiTable Synthesis module of ydata-sdk.

Don't forget to set up your license key

    import os

    # Replace the '{add-your-key}' placeholder with your actual license key.
    os.environ['YDATA_LICENSE_KEY'] = '{add-your-key}'

Example Code

from pathlib import Path

from ydata.connectors.storages.rdbms_connector import PostgreSQLConnector
from ydata.dataset.multidataset import MultiDataset
from ydata.metadata.multimetadata import MultiMetadata
from ydata.synthesizers.multitable.model import MultiTableSynthesizer
from ydata.utils.formats import read_json

def get_token(token_name: str):
    """Load the token file ``token_name`` from the ``.secrets`` directory.

    The ``.secrets`` directory is resolved three directory levels above this
    file rather than from the current working directory, so the example can
    be launched from anywhere.

    Args:
        token_name: File name of the token inside ``.secrets``.

    Returns:
        Whatever ``read_json`` returns for the resolved token path.
    """
    this_file = Path(__file__).absolute()
    # Climb three directory levels from this file to reach the folder that
    # holds ``.secrets``.
    base_dir = this_file.parent.parent.parent
    return read_json(base_dir / ".secrets" / token_name)


def check_composite_keys(sample_dfs):
    """Print how many ``appearances`` rows agree with ``games`` on ``leagueID``.

    For every row of ``sample_dfs['appearances']`` the ``leagueID`` of the
    game with the same ``gameID`` is looked up in ``sample_dfs['games']`` and
    compared with the row's own ``leagueID``. The number of matching and
    non-matching rows is printed; nothing is returned.

    Args:
        sample_dfs: Mapping of table name to a pandas DataFrame. It must
            contain the ``'appearances'`` and ``'games'`` tables, each with
            ``gameID`` and ``leagueID`` columns.
    """
    appearances = sample_dfs['appearances']
    sampled_league = appearances['leagueID']
    games = sample_dfs['games']

    # gameID -> leagueID lookup built from the parent table.
    league_by_game = games.set_index("gameID")["leagueID"].to_dict()

    # A gameID that is absent from ``games`` maps to NaN and therefore never
    # compares equal to the sampled value.
    expected_league = appearances['gameID'].map(league_by_game)

    is_match = sampled_league == expected_league
    n_matching = is_match.sum()
    n_unmatching = (~is_match).sum()
    print(f"matching keys: {n_matching}, unmatching keys {n_unmatching}")


def main():
    """Run the attribute-table example end to end.

    Steps performed:

    1. Load PostgreSQL credentials via ``get_token`` and set the database
       name on them.
    2. Read the database lazily into a ``MultiDataset`` and build its
       ``MultiMetadata``.
    3. Register a composite key linking ``appearances`` to ``games`` on
       ``(leagueID, gameID)``.
    4. Fit a ``MultiTableSynthesizer`` with ``leagues`` passed as the
       attribute table, then draw one sample.
    5. Assert that the sampled ``leagues`` table has as many rows as the
       original, print it, and report the composite-key consistency.
    """
    credentials = get_token("postgresql_credentials.json")
    credentials["database"] = "database-name"
    connector = PostgreSQLConnector(conn_string=credentials)

    dataset: MultiDataset = connector.read_database(lazy=True)
    metadata = MultiMetadata(dataset)

    # Composite key: (leagueID, gameID) in "appearances" references the same
    # pair of columns in "games".
    dataset.schema.add_composite_keys(
        table="appearances",
        columns=['leagueID', 'gameID'],
        parent_table='games',
        parent_columns=['leagueID', 'gameID'],
    )

    synthesizer = MultiTableSynthesizer()
    # ``attribute_tables`` takes a single table name or a list, e.g. ["leagues"].
    synthesizer.fit(X=dataset, metadata=metadata, attribute_tables="leagues")

    sample = synthesizer.sample(n_samples=1)
    sample_dfs = {}
    for table_name, table in sample.items():
        sample_dfs[table_name] = table.to_pandas()

    # The example expects the sampled attribute table to have the same number
    # of rows as the original one.
    original_leagues = dataset["leagues"].to_pandas()
    synthetic_leagues = sample_dfs["leagues"]
    assert len(original_leagues) == len(synthetic_leagues)
    print("attribute table")
    print(synthetic_leagues)

    check_composite_keys(sample_dfs)

# Run the example only when this file is executed as a script.
if __name__ == '__main__':
    main()