Skip to content

Timeseries Synthetic Data

This section demonstrates how to use the Time-series Synthetic Data Generator module in ydata-sdk.

Don't forget to set your license key before running the example:

    import os

    # Replace the '{add-your-key}' placeholder with your own license key.
    os.environ['YDATA_LICENSE_KEY'] = '{add-your-key}'

Example Code

"""Example using YData's time-series synthesizer."""
from pathlib import Path

from examples.local import setting_dask_env
from ydata.connectors import GCSConnector
from ydata.connectors.filetype import FileType
from ydata.metadata import Metadata
from ydata.synthesizers.timeseries.model import TimeSeriesSynthesizer
from ydata.utils.formats import read_json

# When running locally, do not forget to set the environment variable
# RUNNING_ENV to "LOCAL" before executing this script.
# NOTE(review): setting_dask_env comes from examples.local; presumably it
# configures Dask based on that variable -- confirm.
setting_dask_env()


def get_token(token_name: str):
    """Load a token file from the ``.secrets`` directory.

    The location is derived from this file's absolute path rather than from
    the current working directory, so the lookup does not depend on where
    the script is launched from.

    Args:
        token_name: File name of the token inside ``.secrets``.

    Returns:
        Whatever ``read_json`` returns for that file (presumably the parsed
        JSON content -- confirm against ``ydata.utils.formats``).
    """
    # Climb four ``.parent`` hops from this file to reach the directory
    # that contains ``.secrets``.
    anchor = Path(__file__).absolute()
    for _ in range(4):
        anchor = anchor.parent
    secrets_file = anchor / ".secrets" / token_name
    return read_json(secrets_file)


if __name__ == "__main__":
    # Stage toggles: fit-and-save a model, and/or load the saved model and
    # draw synthetic samples from it. Each stage can be run on its own.
    TRAIN = True
    SYNTHESIZE = True

    # Read the source CSV from Google Cloud Storage, authenticating with the
    # credentials loaded from .secrets/gcs_credentials.json.
    token = get_token("gcs_credentials.json")
    connector = GCSConnector("bucketname", keyfile_dict=token)
    original = connector.read_file(
        "gs://path-to-file/data.csv", file_type=FileType.CSV
    )

    # Keep only the columns used by this example.
    original = original.select_columns(
        columns=[
            "step",
            "action",
            "amount",
            "nameOrig",
            "nameDest",
            "isFraud",
            "isFlaggedFraud",
            "isUnauthorizedOverdraft",
        ]
    )

    X = original
    # Time-series attributes handed to the metadata computation: the column
    # to sort by and the columns that identify entities.
    dataset_attrs = {
        "sortbykey": "step",
        "entities": ["nameOrig", "nameDest"],
    }

    metadata = Metadata()
    # The return value is intentionally not bound: the `metadata` instance
    # itself is what gets passed to `fit` below.
    # NOTE(review): presumably this call populates `metadata` in place -- confirm.
    metadata(X, dataset_attrs=dataset_attrs)

    out_path = "./test_trained_model.pkl"

    if TRAIN:
        synth = TimeSeriesSynthesizer()
        synth.fit(X, metadata=metadata)
        synth.save(out_path)

    if SYNTHESIZE:
        # NOTE(review): the .pkl extension suggests pickle serialization;
        # only load model files that come from a trusted source.
        synth = TimeSeriesSynthesizer.load(out_path)
        n_samples = 1000
        sample = synth.sample(n_samples=n_samples)
        print(sample)

        sample.to_csv("test_synth_samples.csv")