MultiTable Synthetic Data with Attribute Tables
This section demonstrates how to use the Attribute Tables
in the MultiTable Synthesis
module in ydata-sdk
.
Don't forget to set up your license key
Example Code
from pathlib import Path
from ydata.connectors.storages.rdbms_connector import PostgreSQLConnector
from ydata.dataset.multidataset import MultiDataset
from ydata.metadata.multimetadata import MultiMetadata
from ydata.synthesizers.multitable.model import MultiTableSynthesizer
from ydata.utils.formats import read_json
def get_token(token_name: str):
    """Load a token file stored in the ``.secrets`` directory.

    The directory is resolved relative to this file (three levels up), so
    the lookup does not depend on the current working directory.

    Args:
        token_name: File name of the token inside ``.secrets``.

    Returns:
        Whatever ``read_json`` returns for that file.
    """
    this_file = Path(__file__).absolute()
    # Three hops: the folder holding this file, then two levels above it.
    base_dir = this_file.parent.parent.parent
    secrets_file = base_dir / ".secrets" / token_name
    return read_json(secrets_file)
def check_composite_keys(sample_dfs):
    """Print how many ``appearances`` rows agree with ``games`` on ``leagueID``.

    For every row of the ``appearances`` table, the ``leagueID`` recorded for
    the same ``gameID`` in the ``games`` table is looked up and compared with
    the row's own ``leagueID``. The counts of agreeing and disagreeing rows
    are printed; nothing is returned.

    Args:
        sample_dfs: Mapping from table name to a pandas DataFrame. It must
            contain ``"appearances"`` and ``"games"``, each with ``gameID``
            and ``leagueID`` columns.
    """
    games = sample_dfs["games"]
    appearances = sample_dfs["appearances"]

    # gameID -> leagueID lookup taken from the parent (games) table.
    league_by_game = games.set_index("gameID")["leagueID"].to_dict()

    expected_league = appearances["gameID"].map(league_by_game)
    actual_league = appearances["leagueID"]
    is_match = actual_league == expected_league

    n_match = is_match.sum()
    n_mismatch = (~is_match).sum()
    print(f"matching keys: {n_match}, unmatching keys {n_mismatch}")
def main():
    """Run the multi-table synthesis example with an attribute table.

    Steps, in order: read the PostgreSQL credentials token, open the
    database lazily, build the metadata, register the composite key between
    ``appearances`` and ``games``, fit the synthesizer with ``leagues`` as an
    attribute table, sample, and finally verify and print the results.
    """
    credentials = get_token("postgresql_credentials.json")
    credentials["database"] = "database-name"

    connector = PostgreSQLConnector(conn_string=credentials)
    dataset: MultiDataset = connector.read_database(lazy=True)
    metadata = MultiMetadata(dataset)

    # (leagueID, gameID) in `appearances` references the same pair in `games`.
    dataset.schema.add_composite_keys(
        table="appearances",
        columns=["leagueID", "gameID"],
        parent_table="games",
        parent_columns=["leagueID", "gameID"],
    )

    synthesizer = MultiTableSynthesizer()
    synthesizer.fit(
        X=dataset,
        metadata=metadata,
        attribute_tables="leagues",  # or ["leagues"]
    )

    sample = synthesizer.sample(n_samples=1)

    # Convert every sampled table to pandas for inspection.
    sample_dfs = {}
    for table_name, table in sample.items():
        sample_dfs[table_name] = table.to_pandas()

    # The sampled attribute table must have as many rows as the original one.
    original_leagues = dataset["leagues"].to_pandas()
    assert len(original_leagues) == len(sample_dfs["leagues"])

    print("attribute table")
    print(sample_dfs["leagues"])

    check_composite_keys(sample_dfs)
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()