Source code for conformalopt.data

import numpy as np
import pandas as pd
import time
import os


def construct_file_path(data_name):

    # Get the directory where this script is located
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))

    # Construct the absolute path to the file
    return os.path.join(BASE_DIR, "data", f"{data_name}.csv")



[docs]
def get_scores(data_name, scores=None):
    """
    Loads and processes scores for a specified dataset. The scores are calculated as
    |Y_t - hat Y_t| for various base forecasters hat Y_t. The datasets are all described
    in detail in the paper Online Conformal Prediction via
    Online Optimization. The datasets of the form name* should be input as f"{name}_{base_forecaster}_absolute-residual_scores"
    for base_forecaster as `ar`, `prophet`, `theta`, or `transformer`.


    Args:
        data_name (str): The name of the dataset. Supported options:

            - `elec`: Elec2 data with base forecaster being a one-day delayed moving average. \n
            - `daily-climate*`: Daily climate data.
            - `AMZN*`, `GOOGL*`, `MSFT*`: Stock data.
            - `synthetic_AR_2_1M`: 1_000_000 synthetic AR(2) data generated with [0.3, -0.3] AR parameters and standard normal noise.
            - `gaussian`: 10_000 i.i.d. Gaussian-distributed synthetic scores.
            - `ercot_preregistered`: ERCOT load and forecast data. This is the preregistered dataset used in the paper.

    Returns:
        np.ndarray: A processed score array.
    """
    if data_name == "elec":  # length 45264
        # Score is |Y_t - \hat Y_t| where \hat Y_t is a one-day delayed moving average
        data = pd.read_csv(construct_file_path("electricity-normalized.csv"))
        Y = data["nswdemand"].to_numpy()
        # Bug in PID paper code: actually predicting one-day delayed moving average now, as paper claims.
        Yhat = [np.mean(Y[i : i + 24]) for i in range(len(Y[48:]))]
        Y = Y[48:]
        scores = np.abs(Y - Yhat)
    elif data_name.startswith("synthetic_AR_2_1M"):

        np.random.seed(int(data_name.split("_")[-1]))
        # Parameters
        n = 1_000_000  # Length of the time series
        phi = [0.3, -0.3]  # AR(2) parameters
        sigma = 1.0  # Standard deviation of the noise

        # Generate white noise
        noise = np.random.normal(0, sigma, n)

        # Initialize the time series
        y = np.zeros(n)

        # Generate the AR(2) time series
        for t in range(2, n):
            y[t] = phi[0] * y[t - 1] + phi[1] * y[t - 2] + noise[t]

        scores = y
    elif data_name == "gaussian":
        scores = []
        for i in range(10_000):
            scores.append(np.random.normal(scale=i // 10))
        scores = np.array(scores)
    elif data_name == "ercot_preregistered":
        import gridstatusio as gs

        API_KEY = "1a692d6abfa547bfb58911dd29a3f088"
        START_TIME = "2024-12-18"
        END_TIME = "2025-01-04"

        # Collect scores via API.
        client = gs.GridStatusClient(API_KEY)
        df_data = client.get_dataset(dataset="ercot_load", start=START_TIME, end=END_TIME)
        time.sleep(1)  # To avoid API rate limit hit.
        df_forecasts = client.get_dataset(dataset="ercot_load_forecast", start=START_TIME, end=END_TIME)
        df_merged = pd.merge(df_data, df_forecasts, on="interval_start_utc", how="inner")
        scores = np.abs(df_merged["load"] - df_merged["load_forecast"]).values[200:]

    else:
        # Stocks (daily-climate, AMZN, GOOGL, MSFT) routed here.
        filename = construct_file_path(data_name)
        scores = np.loadtxt(filename)

    # Sometimes the first few scores in these datasets are nonsense.
    return scores[30:]