Clustering
This section demonstrates how to use the Clustering module in ydata-sdk.
Don't forget to set up your license key
Example Code
import os
import pandas as pd
from ydata.connectors import GCSConnector
from ydata.connectors.filetype import FileType
from ydata.preprocessing.clustering import TimeSeriesClustering
from ydata.utils.formats import read_json
# Environment configuration for the example, applied in one call.
# The values are the placeholders from the original example; replace them
# with the settings for your own deployment.
os.environ.update(
    {
        "DATASOURCE_API_URL": "",
        "HOSTNAME": "",
        "RUNNING_ENV": "LOCAL",
        "DASK_GATEWAY_URL": "url",
    }
)
if __name__ == "__main__":
    # Read the dataset from remote storage: load the credentials file, build
    # the connector, and fetch the CSV.
    token = read_json("/path-to-file/gcs_credentials.json")
    connector = GCSConnector("ydatasynthetic", keyfile_dict=token)
    data = connector.read_file(
        "gs://path-to-file/data.csv",
        file_type=FileType.CSV,
    )

    # Convert to pandas, as clustering does not yet support the YData
    # Dataset object, then index the frame by its parsed timestamp column.
    data = data.to_pandas()
    data["timestamp"] = pd.to_datetime(data["timestamp"])
    data = data.set_index("timestamp")

    # Initialise the TimeSeriesClustering model and obtain the clusters in a
    # single step. The second value returned by fit_transform is not used here.
    clustering = TimeSeriesClustering(metric="dtw", n_clusters=3)
    clusters, _ = clustering.fit_transform(
        X=data[["device", "winddirection"]], segment=["device"]
    )

    # We can also use the fit method alone, in case we want to save the model
    # before getting the clusters.
    clustering.fit(X=data[["device", "winddirection"]], segment=["device"])

    # Save the clustering model.
    clustering.save("tscluster.pkl")

    # Load the saved model.
    # NOTE(review): the return value of load() is discarded here. Confirm
    # against the ydata-sdk API whether load() restores state in place or
    # returns a new model that should be assigned to a variable.
    clustering.load("tscluster.pkl")