Time-series Synthetic Data

This section demonstrates how to use the Time-series Synthetic Data Generator module in ydata-sdk.
Don't forget to set up your license key.
Example Code
"""Example using YData's time-series synthesizer."""
from pathlib import Path
from examples.local import setting_dask_env
from ydata.connectors import GCSConnector
from ydata.connectors.filetype import FileType
from ydata.metadata import Metadata
from ydata.synthesizers.timeseries.model import TimeSeriesSynthesizer
from ydata.utils.formats import read_json
# If running locally, do not forget to set the environment variable
# RUNNING_ENV="LOCAL" before executing this script.
setting_dask_env()
def get_token(token_name: str):
    """Load and parse a credentials token from the repository's ``.secrets`` dir.

    Args:
        token_name: File name of the token inside ``.secrets``
            (e.g. ``"gcs_credentials.json"``).

    Returns:
        The parsed JSON content of the token file, as returned by
        ``read_json``.
    """
    # Resolve the path relative to this file (four levels up to the repo
    # root) so the script works regardless of the current working directory.
    token_path = Path(__file__).absolute().parents[3].joinpath(
        ".secrets", token_name
    )
    return read_json(token_path)
if __name__ == "__main__":
    # Toggle the two stages independently: training fits and persists the
    # model; synthesis loads the persisted model and samples from it.
    TRAIN = True
    SYNTHESIZE = True

    # Authenticate against GCS with a service-account keyfile from .secrets.
    token = get_token("gcs_credentials.json")
    connector = GCSConnector("bucketname", keyfile_dict=token)
    original = connector.read_file(
        "gs://path-to-file/data.csv", file_type=FileType.CSV
    )

    # Keep only the columns relevant for this fraud time-series example.
    original = original.select_columns(
        columns=[
            "step",
            "action",
            "amount",
            "nameOrig",
            "nameDest",
            "isFraud",
            "isFlaggedFraud",
            "isUnauthorizedOverdraft",
        ]
    )

    # Column-name -> variable-type mapping derived from the dataset schema.
    schema = {col: vartype.value for col, vartype in original.schema.items()}
    X = original

    # "sortbykey" orders events within each series; "entities" identifies
    # the columns that distinguish independent time series.
    dataset_attrs = {
        "sortbykey": "step",
        "entities": ["nameOrig", "nameDest"],
    }
    metadata = Metadata()
    m = metadata(X, dataset_attrs=dataset_attrs)

    out_path = "./test_trained_model.pkl"
    if TRAIN:  # truthiness check instead of the non-idiomatic `is True`
        synth = TimeSeriesSynthesizer()
        synth.fit(X, metadata=metadata)
        synth.save(out_path)

    if SYNTHESIZE:
        synth = TimeSeriesSynthesizer.load(out_path)
        n_samples = 1000
        sample = synth.sample(n_samples=n_samples)
        print(sample)
        sample.to_csv("test_synth_samples.csv")