Clustering

This section demonstrates how to use the Clustering module in ydata-sdk.

Before running the example, don't forget to set your license key in the `YDATA_LICENSE_KEY` environment variable:

    import os

    os.environ['YDATA_LICENSE_KEY'] = '{add-your-key}'

Example Code

import os

import pandas as pd

from ydata.dataset.filetype import FileType

from ydata.connectors import GCSConnector

from ydata.preprocessing.clustering import TimeSeriesClustering
from ydata.utils.formats import read_json

# Environment variables set before the example runs, applied in a single call.
# The values are the ones used by this example; "url" looks like a placeholder
# for a real Dask Gateway address -- replace as appropriate for your setup.
os.environ.update(
    {
        "DATASOURCE_API_URL": "",
        "HOSTNAME": "",
        "RUNNING_ENV": "LOCAL",
        "DASK_GATEWAY_URL": "url",
    }
)


if __name__ == "__main__":
    # Load the JSON key file and use it to open a connector to the bucket
    # that holds the dataset (remote storage).
    credentials = read_json("/path-to-file/gcs_credentials.json")
    gcs = GCSConnector("ydatasynthetic", keyfile_dict=credentials)

    # Read the CSV file from the remote storage.
    dataset = gcs.read_file("gs://path-to-file/data.csv", file_type=FileType.CSV)

    # Clustering does not yet support the YData Dataset object, so convert to
    # pandas first, then parse the "timestamp" column and use it as the index.
    frame = dataset.to_pandas()
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    frame = frame.set_index("timestamp")

    # Create the TimeSeriesClustering model (DTW metric, 3 clusters) and
    # fit + transform in one step, segmenting the series by "device".
    # fit_transform returns a pair; only the first element is kept here.
    model = TimeSeriesClustering(metric="dtw", n_clusters=3)
    clusters, _ = model.fit_transform(
        X=frame[["device", "winddirection"]],
        segment=["device"],
    )

    # The fit method can also be called on its own, e.g. when we want to save
    # the model first and compute the clusters later.
    model.fit(
        X=frame[["device", "winddirection"]],
        segment=["device"],
    )

    # Save the clustering model to disk.
    model.save("tscluster.pkl")

    # Load the saved model back.
    # NOTE(review): the return value of load() is discarded here; if load()
    # returns the restored model rather than updating this instance in place,
    # it should be assigned to a variable -- confirm against the library API.
    model.load("tscluster.pkl")