Skip to content

Text to Dataset (LLM Synthesizer)

The LLM Synthesizer generates single tables or multi-table databases from natural language prompts—no existing dataset is required. Define your tables and columns in text; the model produces synthetic data that matches your schema. This is useful for demos, test fixtures, and rapid prototyping when you only have a description of the data you need.

  • Generate single-table or multi-table (relational) data from prompts
  • Define schemas with table and column prompts; optional primary and foreign keys
  • No source data required—purely prompt-driven generation
  • Configurable LLM model (e.g. OpenAI via LiteLLM)
  • Control relationship cardinality with foreign key prompts

LLM provider API key required

Set the API key for your chosen LLM provider (e.g. OPENAI_API_KEY for OpenAI).

Don't forget to set both your LLM provider API key and your YData license key before running the example:

import os

# Placeholder credentials: replace each value with your real key before running.
os.environ.update(
    {
        'OPENAI_API_KEY': '{add-your-key}',
        'YDATA_LICENSE_KEY': '{add-your-key}',
    }
)

Example Code

"""
Example for the LLM synthesizer: prompt-based synthetic data generation without an existing dataset.

Requires an API key for the LLM provider (e.g. OPENAI_API_KEY for OpenAI).

Use a small sample_size for a quick run and to limit API usage.
"""
import json
from pathlib import Path

from ydata.synthesizers import LLMSynthesizer

if __name__ == "__main__":
    # One model identifier shared by both examples below.
    model_name = "openai/gpt-5-nano"

    # ------------------------------------------------------------------
    # Example 1: generate a single table from a prompt-based schema
    # (credit card transactions).
    # ------------------------------------------------------------------
    tables = {
        "transactions": {
            "prompt": "Credit card transactions for a financial services dataset",
            "columns": {
                "transaction_id": {"prompt": "unique identifier for the transaction", "dtype": "string"},
                "card_id": {"prompt": "identifier of the credit card", "dtype": "string"},
                "date": {"prompt": "transaction date", "dtype": "date"},
                "merchant": {"prompt": "merchant or vendor name", "dtype": "string"},
                "amount": {"prompt": "transaction amount", "dtype": "float"},
                "currency": {"prompt": "currency code of the transaction", "dtype": "category", "values": ["USD", "EUR", "GBP"]},
                "category": {"prompt": "spending category", "dtype": "category", "values": ["retail", "travel", "dining", "utilities", "other"]},
            },
        }
    }

    # Optionally load the prompts from a JSON file instead. The file is only
    # read when it exists, so the example still runs with the inline schema
    # above rather than failing with FileNotFoundError.
    schema_path = Path("transactions_schema.json")
    if schema_path.exists():
        with schema_path.open(encoding="utf-8") as f:
            tables = json.load(f)

    synth = LLMSynthesizer(model=model_name)
    synth.fit(tables=tables)
    # Keep sample_size small for a quick run and to limit API usage.
    data = synth.sample(sample_size=6)
    print("Single-table output (Dataset):")
    print(data.head())

    # ------------------------------------------------------------------
    # Example 2: generate multiple related tables with primary and foreign
    # keys (5-table credit card domain).
    # ------------------------------------------------------------------
    tables = {
        "customers": {
            "prompt": "Customers of a financial services provider",
            "columns": {
                "customer_id": {"prompt": "unique identifier for the customer", "dtype": "string"},
                "name": {"prompt": "full name of the customer", "dtype": "string"},
                "email": {"prompt": "customer email address", "dtype": "string"},
            },
            "primary_key": "customer_id",
        },
        "merchants": {
            "prompt": "Merchants or vendors where card transactions can occur",
            "columns": {
                "merchant_id": {"prompt": "unique identifier for the merchant", "dtype": "string"},
                "name": {"prompt": "business or merchant name", "dtype": "string"},
                "category": {"prompt": "merchant category", "dtype": "category", "values": ["retail", "travel", "dining", "utilities", "other"]},
            },
            "primary_key": "merchant_id",
        },
        "cards": {
            "prompt": "Credit cards issued by a financial services provider",
            "columns": {
                "card_id": {"prompt": "the unique id of the credit card", "dtype": "string"},
                "customer_id": {"prompt": "the customer who owns this card", "dtype": "string"},
                "holder_name": {"prompt": "first name and last name of the cardholder", "dtype": "string"},
                "card_type": {"prompt": "type of card (debit or credit)", "dtype": "category", "values": ["debit", "credit"]},
            },
            "primary_key": "card_id",
            # The foreign-key prompt steers the relationship cardinality.
            "foreign_keys": [
                {"column": "customer_id", "referenced_table": "customers", "prompt": "each customer has between 1 and 3 cards"},
            ],
        },
        "transactions": {
            "prompt": "Credit card transactions",
            "columns": {
                "transaction_id": {"prompt": "the unique id of the transaction", "dtype": "string"},
                "card_id": {"prompt": "the card used for this transaction", "dtype": "string"},
                "merchant_id": {"prompt": "the merchant where the transaction occurred", "dtype": "string"},
                "date": {"prompt": "transaction date", "dtype": "date"},
                "amount": {"prompt": "transaction amount", "dtype": "float"},
                "category": {"prompt": "spending category", "dtype": "category", "values": ["retail", "travel", "dining", "utilities", "other"]},
            },
            "primary_key": "transaction_id",
            "foreign_keys": [
                {"column": "card_id", "referenced_table": "cards", "prompt": "each card has between 2 and 5 transactions"},
                {"column": "merchant_id", "referenced_table": "merchants", "prompt": "transactions reference existing merchants"},
            ],
        },
        "statements": {
            "prompt": "Monthly billing statements for each card",
            "columns": {
                "statement_id": {"prompt": "unique identifier for the statement", "dtype": "string"},
                "card_id": {"prompt": "the card this statement is for", "dtype": "string"},
                "period_start": {"prompt": "billing period start date", "dtype": "date"},
                "period_end": {"prompt": "billing period end date", "dtype": "date"},
                "amount_due": {"prompt": "total amount due for the period", "dtype": "float"},
            },
            "primary_key": "statement_id",
            "foreign_keys": [
                {"column": "card_id", "referenced_table": "cards", "prompt": "each card has one statement per billing period"},
            ],
        },
    }

    synth = LLMSynthesizer(model=model_name)
    synth.fit(tables=tables)
    data = synth.sample(sample_size=2)
    print("Multi-table output (MultiDataset):")
    # Iterate the schema itself so the printed tables always match what was
    # defined above (dicts preserve insertion order).
    for table_name in tables:
        print(f"{table_name.capitalize()}:")
        print(data[table_name].head())
        print()