Text to Dataset (LLM Synthesizer)
The LLM Synthesizer generates single tables or multi-table databases from natural language prompts—no existing dataset is required. Define your tables and columns in text; the model produces synthetic data that matches your schema. This is useful for demos, test fixtures, and rapid prototyping when you only have a description of the data you need.
- Generate single-table or multi-table (relational) data from prompts
- Define schemas with table and column prompts; optional primary and foreign keys
- No source data required—purely prompt-driven generation
- Configurable LLM model (e.g. OpenAI via LiteLLM)
- Control relationship cardinality with foreign key prompts
LLM provider API key required
Set the API key for your chosen LLM provider (e.g. OPENAI_API_KEY for OpenAI).
Don't forget to set your LLM provider API key and your YData license key before running the example:
import os
# Credentials are passed through environment variables; replace both
# placeholders with your own keys before running the example.
os.environ.update(
    {
        "OPENAI_API_KEY": "{add-your-key}",
        "YDATA_LICENSE_KEY": "{add-your-key}",
    }
)
Example Code
"""
Example for the LLM synthesizer: prompt-based synthetic data generation without an existing dataset.
Requires an API key for the LLM provider (e.g. OPENAI_API_KEY for OpenAI).
Use a small sample_size for a quick run and to limit API usage.
"""
import json
from pathlib import Path
from ydata.synthesizers import LLMSynthesizer
if __name__ == "__main__":
    # Single-table example: generate credit card transactions from a
    # prompt-based schema. Every table and column is described by a "prompt";
    # columns also declare a "dtype", and the "category" columns carry a
    # "values" list.
    tables = {
        "transactions": {
            "prompt": "Credit card transactions for a financial services dataset",
            "columns": {
                "transaction_id": {
                    "prompt": "unique identifier for the transaction",
                    "dtype": "string",
                },
                "card_id": {
                    "prompt": "identifier of the credit card",
                    "dtype": "string",
                },
                "date": {"prompt": "transaction date", "dtype": "date"},
                "merchant": {
                    "prompt": "merchant or vendor name",
                    "dtype": "string",
                },
                "amount": {"prompt": "transaction amount", "dtype": "float"},
                "currency": {
                    "prompt": "currency code of the transaction",
                    "dtype": "category",
                    "values": ["USD", "EUR", "GBP"],
                },
                "category": {
                    "prompt": "spending category",
                    "dtype": "category",
                    "values": ["retail", "travel", "dining", "utilities", "other"],
                },
            },
        }
    }

    # Or load the prompts from a JSON file. The file is optional: when it is
    # missing, the inline schema above is used, so the example neither crashes
    # with FileNotFoundError nor silently throws the inline definition away.
    schema_path = Path("transactions_schema.json")
    if schema_path.is_file():
        with schema_path.open(encoding="utf-8") as f:
            tables = json.load(f)

    synth = LLMSynthesizer(model="openai/gpt-5-nano")
    synth.fit(tables=tables)
    # A small sample_size keeps the run quick and limits API usage.
    data = synth.sample(sample_size=6)

    print("Single-table output (Dataset):")
    print(data.head())
"""Generate multiple related tables with primary and foreign keys (5-table credit card domain)."""
tables = {
"customers": {
"prompt": "Customers of a financial services provider",
"columns": {
"customer_id": {"prompt": "unique identifier for the customer", "dtype": "string"},
"name": {"prompt": "full name of the customer", "dtype": "string"},
"email": {"prompt": "customer email address", "dtype": "string"},
},
"primary_key": "customer_id",
},
"merchants": {
"prompt": "Merchants or vendors where card transactions can occur",
"columns": {
"merchant_id": {"prompt": "unique identifier for the merchant", "dtype": "string"},
"name": {"prompt": "business or merchant name", "dtype": "string"},
"category": {"prompt": "merchant category", "dtype": "category", "values": ["retail", "travel", "dining", "utilities", "other"]},
},
"primary_key": "merchant_id",
},
"cards": {
"prompt": "Credit cards issued by a financial services provider",
"columns": {
"card_id": {"prompt": "the unique id of the credit card", "dtype": "string"},
"customer_id": {"prompt": "the customer who owns this card", "dtype": "string"},
"holder_name": {"prompt": "first name and last name of the cardholder", "dtype": "string"},
"card_type": {"prompt": "type of card (debit or credit)", "dtype": "category", "values": ["debit", "credit"]},
},
"primary_key": "card_id",
"foreign_keys": [
{"column": "customer_id", "referenced_table": "customers", "prompt": "each customer has between 1 and 3 cards"},
],
},
"transactions": {
"prompt": "Credit card transactions",
"columns": {
"transaction_id": {"prompt": "the unique id of the transaction", "dtype": "string"},
"card_id": {"prompt": "the card used for this transaction", "dtype": "string"},
"merchant_id": {"prompt": "the merchant where the transaction occurred", "dtype": "string"},
"date": {"prompt": "transaction date", "dtype": "date"},
"amount": {"prompt": "transaction amount", "dtype": "float"},
"category": {"prompt": "spending category", "dtype": "category", "values": ["retail", "travel", "dining", "utilities", "other"]},
},
"primary_key": "transaction_id",
"foreign_keys": [
{"column": "card_id", "referenced_table": "cards", "prompt": "each card has between 2 and 5 transactions"},
{"column": "merchant_id", "referenced_table": "merchants", "prompt": "transactions reference existing merchants"},
],
},
"statements": {
"prompt": "Monthly billing statements for each card",
"columns": {
"statement_id": {"prompt": "unique identifier for the statement", "dtype": "string"},
"card_id": {"prompt": "the card this statement is for", "dtype": "string"},
"period_start": {"prompt": "billing period start date", "dtype": "date"},
"period_end": {"prompt": "billing period end date", "dtype": "date"},
"amount_due": {"prompt": "total amount due for the period", "dtype": "float"},
},
"primary_key": "statement_id",
"foreign_keys": [
{"column": "card_id", "referenced_table": "cards", "prompt": "each card has one statement per billing period"},
],
},
}
synth = LLMSynthesizer(model="openai/gpt-5-nano")
synth.fit(tables=tables)
data = synth.sample(sample_size=2)
print("Multi-table output (MultiDataset):")
for table_name in ["customers", "merchants", "cards", "transactions", "statements"]:
print(f"{table_name.capitalize()}:")
print(data[table_name].head())
print()
