# quant-trader-service/training/trader_training/labels.py

from __future__ import annotations

import logging
from typing import Any

import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import sliding_window_view

from trader_training.io_utils import (
    manifest,
    read_json,
    read_parquet,
    require_columns,
    run_root,
    sha256_json,
    to_utc_series,
    write_json,
    write_parquet,
    write_text,
)
from trader_training.schemas import LABEL_VERSION


# Default labelling parameters, one section per label family.  A JSON file
# loaded through _load_config may override individual keys inside a section.
DEFAULT_LABEL_CONFIG = {
    # Fixed-horizon return classification (LONG / SHORT / NEUTRAL).
    "direction": {
        "horizon_minutes": 45,
        "long_threshold_bps": 5.0,
        "short_threshold_bps": -5.0,
    },
    # Target/stop price plan and the minimum net edge for a positive entry label.
    "entry": {
        "max_hold_minutes": 45,
        "target_bps": 12.0,
        "stop_bps": 8.0,
        "min_expected_net_edge_bps": 3.0,
    },
    # Horizon and minimum net edge for the "keep holding" label.
    "continue": {
        "horizon_minutes": 30,
        "min_expected_continue_edge_bps": 2.0,
    },
    # Thresholds for the exit label family.
    "exit": {
        "horizon_minutes": 30,
        "adverse_move_bps": 8.0,
        "stagnation_abs_return_bps": 2.0,
    },
    # Thresholds for the market/position risk label family.
    "risk": {
        "horizon_minutes": 30,
        "market_drawdown_bps": 12.0,
        "vol_expansion_ratio": 1.6,
        "spike_bps": 20.0,
    },
}


# Cost components in basis points; the label builders add the three values
# together into a single cost figure.
DEFAULT_COST_CONFIG = {
    "fee_bps": 4.0,
    "slippage_bps": 2.0,
    "funding_cost_bps": 0.5,
}

# Identifier stamped on entry labels and included in the price-plan hash input.
ENTRY_LABEL_METHOD = "MAX_FUTURE_EDGE_V1"


def _load_config(path: Any, default: dict[str, Any]) -> dict[str, Any]:
    """Return ``default`` overlaid with the JSON document at ``path``.

    The result is always an independent deep copy, so callers can never
    mutate the shared default through it (previously the default object
    itself was returned when ``path`` was ``None``, and nested sections were
    shared otherwise).

    Args:
        path: Location of a JSON override file, or ``None`` to use the
            defaults unchanged.
        default: Baseline configuration mapping.

    Returns:
        A new dict.  For keys where both the default and the override hold a
        dict, the two are merged one level deep with override values winning;
        any other override value replaces the default outright.
    """
    from copy import deepcopy

    merged = deepcopy(default)
    if path is None:
        return merged
    for key, item in read_json(path).items():
        if isinstance(item, dict) and isinstance(merged.get(key), dict):
            # Section-level merge: keep default keys the override omits.
            merged[key] = {**merged[key], **item}
        else:
            merged[key] = item
    return merged


def _base_frames(args: Any) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load the feature frame and the 1-minute replay frame for a run.

    Paths come from ``args.feature_path`` / ``args.replay_path`` when set and
    otherwise default to files under the run root.  Both frames are checked
    for their required columns, their ``event_time`` column is passed through
    ``to_utc_series``, and the replay frame is ordered by symbol then time.

    Returns:
        ``(features, replay)`` as fresh frames; the frames returned by
        ``read_parquet`` are not modified.
    """
    root = run_root(args)
    feature_source = args.feature_path or root / "feature" / "feature_frame.parquet"
    replay_source = args.replay_path or root / "replay" / "replay_1m.parquet"

    raw_features = read_parquet(feature_source)
    raw_replay = read_parquet(replay_source)

    feature_required = (
        "sample_id",
        "symbol",
        "event_time",
        "open_time_ms",
        "split_id",
        "walk_forward_fold",
        "data_quality_flag",
    )
    replay_required = (
        "symbol",
        "event_time",
        "open_time_ms",
        "open",
        "high",
        "low",
        "close",
        "spread_bps",
    )
    require_columns(raw_features, feature_required, "feature_frame")
    require_columns(raw_replay, replay_required, "replay_1m")

    # assign() works on a copy, so the loaded frames stay untouched.
    features = raw_features.assign(event_time=to_utc_series(raw_features["event_time"]))
    replay = raw_replay.assign(event_time=to_utc_series(raw_replay["event_time"]))
    replay = replay.sort_values(["symbol", "event_time"]).reset_index(drop=True)
    return features, replay


# Column layout of every path-statistics frame, in output order.
PATH_STAT_COLUMNS = [
    # Row identity.
    *("symbol", "open_time_ms", "side"),
    # Outcome flags for the target/stop race.
    *("target_hit", "stop_hit", "timeout_hit", "ambiguous_hit"),
    # Time until the first target / stop touch.
    *("time_to_target_ms", "time_to_stop_ms"),
    # Edge and excursion measures in basis points.
    *("gross_edge_bps", "future_return_bps", "mfe_bps", "mae_bps"),
    # Spread and volatility over the look-ahead window.
    *("future_spread_p80", "future_realized_vol_bps"),
]


def _empty_path_stats_frame() -> pd.DataFrame:
    """Return a zero-row frame carrying the PATH_STAT_COLUMNS layout."""
    placeholder = pd.DataFrame(columns=PATH_STAT_COLUMNS)
    return placeholder


def _first_hit_index(hit_window: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Locate the first True in each row of a 2-D boolean window.

    Returns:
        ``(row_has_hit, first_position)``.  ``first_position`` holds the
        column of the first True per row; rows without any True get the
        sentinel ``hit_window.shape[1] + 1``, which is larger than every
        real column index.
    """
    row_has_hit = np.any(hit_window, axis=1)
    no_hit_sentinel = hit_window.shape[1] + 1
    # argmax yields 0 for all-False rows, hence the explicit sentinel.
    leading_true = hit_window.argmax(axis=1)
    return row_has_hit, np.where(row_has_hit, leading_true, no_hit_sentinel)


def _path_stats_for_group(group: pd.DataFrame, side: str, horizon: int, target_bps: float, stop_bps: float) -> pd.DataFrame:
    """Compute forward-looking path statistics for one symbol and one trade side.

    For every bar that has ``horizon`` following bars, the bar's close is
    treated as the entry price and the next ``horizon`` bars are scanned for
    the first touch of a target and a stop price.

    Args:
        group: Bars of a single symbol.  Reads the columns ``event_time``,
            ``open_time_ms``, ``close``, ``high``, ``low``, ``spread_bps`` and
            ``symbol``.
        side: ``"LONG"`` selects the long formulas; any other value takes the
            short branch.
        horizon: Look-ahead length in rows.  Rows are only kept when every
            consecutive ``open_time_ms`` step inside the window is exactly
            60_000, so for kept rows this equals minutes.
        target_bps: Target distance from the entry price, in basis points.
        stop_bps: Stop distance from the entry price, in basis points.

    Returns:
        A frame with the ``PATH_STAT_COLUMNS`` layout holding one row per
        entry bar whose window is gap-free and whose entry and exit closes
        are finite.  An empty frame is returned when ``group`` has
        ``horizon`` rows or fewer.
    """
    if len(group) <= horizon:
        return _empty_path_stats_frame()

    grouped = group.sort_values("event_time").reset_index(drop=True)
    open_ms = grouped["open_time_ms"].astype("int64").to_numpy()
    close = grouped["close"].astype("float64").to_numpy()
    high = grouped["high"].astype("float64").to_numpy()
    low = grouped["low"].astype("float64").to_numpy()
    spread = grouped["spread_bps"].astype("float64").to_numpy()

    # Row i enters at close[i] and exits at close[i + horizon]; the last
    # `horizon` rows have no complete window and are dropped.
    entry = close[:-horizon]
    exit_price = close[horizon:]
    current_open_ms = open_ms[:-horizon]

    # bad_gap[j] marks a step between rows j and j+1 that is not exactly one
    # minute.  The prefix sum lets the number of bad steps between row i and
    # row i + horizon be read as a difference of two entries.
    bad_gap = (np.diff(open_ms) != 60_000).astype("int64")
    gap_cumsum = np.concatenate(([0], np.cumsum(bad_gap)))
    contiguous = (gap_cumsum[horizon:] - gap_cumsum[:-horizon]) == 0
    finite = np.isfinite(entry) & np.isfinite(exit_price)
    valid = contiguous & finite

    # Slicing from index 1 makes window i cover rows i+1 .. i+horizon, i.e.
    # strictly after the entry bar.
    future_high = sliding_window_view(high[1:], horizon)
    future_low = sliding_window_view(low[1:], horizon)
    future_spread = sliding_window_view(spread[1:], horizon)

    # NaN-aware reductions; numpy warnings (e.g. all-NaN windows) are silenced.
    with np.errstate(all="ignore"):
        high_max = np.nanmax(future_high, axis=1)
        low_min = np.nanmin(future_low, axis=1)
        spread_p80 = np.nanquantile(future_spread, 0.8, axis=1)

    if horizon > 1:
        # Closes are clipped to a tiny positive floor before taking the log.
        log_close = np.log(np.clip(close, 1e-12, None))
        log_return = np.diff(log_close)
        # Window i holds the horizon-1 returns between rows i+1 and i+horizon;
        # the return from row i to row i+1 is not part of it.
        # NOTE(review): confirm that skipping the first future return is intended.
        future_log_return = sliding_window_view(log_return[1:], horizon - 1)
        with np.errstate(all="ignore"):
            realized_vol_bps = np.nanstd(future_log_return, axis=1, ddof=1) * 10000.0
    else:
        # A one-row horizon leaves no returns to measure.
        realized_vol_bps = np.full(len(entry), np.nan)

    if side == "LONG":
        # Long: target above entry (touched by a high), stop below (touched by a low).
        target_price = entry * (1.0 + target_bps / 10000.0)
        stop_price = entry * (1.0 - stop_bps / 10000.0)
        target_window = future_high >= target_price[:, None]
        stop_window = future_low <= stop_price[:, None]
        future_return_bps = (exit_price / entry - 1.0) * 10000.0
        mfe_bps = (high_max / entry - 1.0) * 10000.0
        # Adverse excursion is expressed as entry over the lowest low, minus one.
        mae_bps = (entry / low_min - 1.0) * 10000.0
    else:
        # Short: mirrored prices; returns are expressed as entry over exit.
        target_price = entry * (1.0 - target_bps / 10000.0)
        stop_price = entry * (1.0 + stop_bps / 10000.0)
        target_window = future_low <= target_price[:, None]
        stop_window = future_high >= stop_price[:, None]
        future_return_bps = (entry / exit_price - 1.0) * 10000.0
        mfe_bps = (entry / low_min - 1.0) * 10000.0
        mae_bps = (high_max / entry - 1.0) * 10000.0

    target_any, first_target_idx = _first_hit_index(target_window)
    stop_any, first_stop_idx = _first_hit_index(stop_window)
    # Both levels first touched in the same bar: flagged as ambiguous and,
    # because of the <= below, counted as a stop rather than a target.
    ambiguous_hit = target_any & stop_any & (first_target_idx == first_stop_idx)
    target_hit = target_any & (first_target_idx < first_stop_idx)
    stop_hit = stop_any & (first_stop_idx <= first_target_idx)
    timeout_hit = ~(target_hit | stop_hit)
    # Realised edge: +target on a target exit, -stop on a stop exit, otherwise
    # the plain return at the end of the horizon.
    gross_edge_bps = np.where(target_hit, target_bps, np.where(stop_hit, -stop_bps, future_return_bps))

    out = pd.DataFrame(
        {
            "symbol": grouped["symbol"].iloc[0],
            "open_time_ms": current_open_ms,
            "side": side,
            "target_hit": target_hit.astype("int8"),
            "stop_hit": stop_hit.astype("int8"),
            "timeout_hit": timeout_hit.astype("int8"),
            "ambiguous_hit": ambiguous_hit.astype("int8"),
            # Window position 0 is the bar after entry, hence the +1; -1 means "not hit".
            "time_to_target_ms": np.where(target_hit, (first_target_idx + 1) * 60_000, -1).astype("int64"),
            "time_to_stop_ms": np.where(stop_hit, (first_stop_idx + 1) * 60_000, -1).astype("int64"),
            "gross_edge_bps": gross_edge_bps.astype("float64"),
            "future_return_bps": future_return_bps.astype("float64"),
            "mfe_bps": mfe_bps.astype("float64"),
            "mae_bps": mae_bps.astype("float64"),
            "future_spread_p80": spread_p80.astype("float64"),
            "future_realized_vol_bps": realized_vol_bps.astype("float64"),
        }
    )
    # Drop entry bars whose window crosses a time gap or has a non-finite close.
    return out.loc[valid, PATH_STAT_COLUMNS].reset_index(drop=True)


def _build_path_stats(replay: pd.DataFrame, horizon: int, target_bps: float, stop_bps: float) -> pd.DataFrame:
    """Compute LONG and SHORT path statistics for every symbol in ``replay``.

    Args:
        replay: Replay bars for all symbols; grouped on its ``symbol`` column
            in order of first appearance.
        horizon: Look-ahead length in rows, forwarded to the per-symbol
            computation.
        target_bps: Target distance in basis points.
        stop_bps: Stop distance in basis points.

    Returns:
        The per-symbol, per-side results stacked into one frame with a fresh
        index, or an empty frame with the path-statistics columns when no
        symbol produced any row.
    """
    frames: list[pd.DataFrame] = []
    for symbol, group in replay.groupby("symbol", sort=False, observed=False):
        logging.info(
            "trader.training.path_stats_group_start symbol=%s horizonMinutes=%s rowCount=%s",
            symbol,
            horizon,
            len(group),
        )
        for side in ("LONG", "SHORT"):
            stats = _path_stats_for_group(group, side, horizon, target_bps, stop_bps)
            # Symbols with too little history yield a zero-row frame without
            # typed columns.  Feeding those to pd.concat next to typed frames
            # relies on pandas' deprecated empty-entry dtype handling and can
            # degrade the join-key dtypes, so they are left out.
            if not stats.empty:
                frames.append(stats)
            logging.info(
                "trader.training.path_stats_side_done symbol=%s side=%s horizonMinutes=%s rowCount=%s",
                symbol,
                side,
                horizon,
                len(stats),
            )
    out = pd.concat(frames, ignore_index=True) if frames else _empty_path_stats_frame()
    logging.info("trader.training.path_stats_built horizonMinutes=%s rowCount=%s", horizon, len(out))
    return out


def write_price_plan_context(args: Any) -> None:
    """Write the run's price-plan context as JSON and as a one-row parquet.

    The context combines ``args.price_plan_id``, a hash over the entry
    section, the cost config and the entry label method, the entry section's
    distances and hold time, and the summed cost in basis points.  Both files
    are written to the ``label`` directory under the run root.
    """
    root = run_root(args)
    cost = _load_config(args.cost_config_path, DEFAULT_COST_CONFIG)
    labels = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)
    entry = labels["entry"]

    fee = float(cost["fee_bps"])
    slippage = float(cost["slippage_bps"])
    funding = float(cost["funding_cost_bps"])
    cost_bps = fee + slippage + funding

    plan_id = args.price_plan_id
    # Key order of the hash payload is kept stable on purpose.
    hash_payload = {"entry": entry, "cost": cost, "entry_label_method": ENTRY_LABEL_METHOD}
    context = {
        "pricePlanId": plan_id,
        "pricePlanConfigHash": sha256_json(hash_payload),
        "stopDistanceBps": float(entry["stop_bps"]),
        "targetDistanceBps": float(entry["target_bps"]),
        "maxHoldMinutes": int(entry["max_hold_minutes"]),
        "minExpectedNetEdgeBps": float(entry["min_expected_net_edge_bps"]),
        "costBps": cost_bps,
        "entryLabelMethod": ENTRY_LABEL_METHOD,
    }

    label_dir = root / "label"
    path = label_dir / "price_plan_context.json"
    write_json(path, context)

    # Tabular mirror of the JSON document: (parquet column, context key).
    column_sources = (
        ("price_plan_id", "pricePlanId"),
        ("price_plan_hash", "pricePlanConfigHash"),
        ("target_bps", "targetDistanceBps"),
        ("stop_bps", "stopDistanceBps"),
        ("max_hold_minutes", "maxHoldMinutes"),
        ("min_expected_net_edge_bps", "minExpectedNetEdgeBps"),
        ("cost_bps", "costBps"),
        ("entry_label_method", "entryLabelMethod"),
    )
    row = {column: context[key] for column, key in column_sources}
    write_parquet(label_dir / "price_plan_context.parquet", pd.DataFrame([row]))
    logging.info("trader.training.price_plan_written runId=%s path=%s", args.run_id, path)


def build_direction_labels(args: Any) -> None:
    """Label each feature row LONG / SHORT / NEUTRAL from its fixed-horizon return.

    The close ``horizon_minutes`` later is found by shifting the replay
    timestamps back by the horizon and joining on ``(symbol, open_time_ms)``.
    Rows without a computable future return are dropped.  The labels, a
    manifest and a distribution report are written to the ``label`` directory
    under the run root.
    """
    root = run_root(args)
    config = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)["direction"]
    features, replay = _base_frames(args)
    horizon = int(config["horizon_minutes"])

    join_keys = ["symbol", "open_time_ms"]
    prices = replay[["symbol", "open_time_ms", "close"]]
    # Moving each bar's timestamp back by the horizon lines its close up with
    # the bar that sits `horizon` minutes earlier.
    shifted_ms = prices["open_time_ms"].astype("int64") - horizon * 60_000
    ahead = prices.assign(open_time_ms=shifted_ms).rename(columns={"close": "future_close"})

    merged = features.merge(prices, on=join_keys, how="left")
    merged = merged.merge(ahead, on=join_keys, how="left")
    merged["future_return_bps"] = (merged["future_close"] / merged["close"] - 1.0) * 10000.0

    long_threshold = float(config["long_threshold_bps"])
    short_threshold = float(config["short_threshold_bps"])
    is_long = merged["future_return_bps"] >= long_threshold
    is_short = merged["future_return_bps"] <= short_threshold
    merged["direction_label"] = np.select([is_long, is_short], ["LONG", "SHORT"], default="NEUTRAL")

    label = merged["direction_label"]
    out = pd.DataFrame(
        {
            "sample_id": merged["sample_id"],
            "symbol": merged["symbol"],
            "event_time": merged["event_time"],
            "horizon_minutes": horizon,
            "future_return_bps": merged["future_return_bps"],
            "direction_label": label,
            "long_target": label.eq("LONG").astype("int8"),
            "short_target": label.eq("SHORT").astype("int8"),
            "neutral_target": label.eq("NEUTRAL").astype("int8"),
            "split_id": merged["split_id"],
            "walk_forward_fold": merged["walk_forward_fold"],
            "label_version": LABEL_VERSION,
        }
    )
    # Rows with no matching current or future close have a NaN return.
    out = out.dropna(subset=["future_return_bps"])

    label_dir = root / "label"
    path = label_dir / "direction_labels.parquet"
    data_hash = write_parquet(path, out)
    _write_label_manifest(label_dir / "direction_labels.manifest.json", path, out, data_hash)
    _write_distribution_report(label_dir / "direction_label_report.md", out, "direction_label")
    logging.info("trader.training.direction_labels_written runId=%s rowCount=%s", args.run_id, len(out))


def build_entry_labels(args: Any) -> None:
    """Build per-side entry labels and write them with a manifest and report.

    Path statistics are computed from the replay bars using the entry
    section's ``max_hold_minutes``, ``target_bps`` and ``stop_bps``, then
    joined to the feature rows on ``(symbol, open_time_ms)``.  A row is a
    positive entry (``entry_target == 1``) when its maximum favourable
    excursion minus the summed cost reaches ``min_expected_net_edge_bps``.

    Only the identifying and split columns are taken from the feature frame.
    Earlier this step also selected ``spread_bps``, ``spread_rank_24h_pct``
    and ``realized_vol_15m_bps`` without using them, which made it fail with
    ``KeyError`` when the feature frame lacked those columns.

    Files are written to the ``label`` directory under the run root.
    """
    root = run_root(args)
    labels = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)
    cost = _load_config(args.cost_config_path, DEFAULT_COST_CONFIG)
    plan_path = args.price_plan_context_path or root / "label" / "price_plan_context.json"
    plan = read_json(plan_path)
    features, replay = _base_frames(args)
    entry_conf = labels["entry"]
    cost_bps = float(cost["fee_bps"]) + float(cost["slippage_bps"]) + float(cost["funding_cost_bps"])
    # NOTE(review): distances come from the label config while the id/hash
    # stamped below come from the plan file; confirm the two always match.
    stats = _build_path_stats(
        replay,
        int(entry_conf["max_hold_minutes"]),
        float(entry_conf["target_bps"]),
        float(entry_conf["stop_bps"]),
    )
    # Join keys plus the columns that are carried into the output.
    feature_columns = [
        "sample_id",
        "symbol",
        "event_time",
        "open_time_ms",
        "split_id",
        "walk_forward_fold",
    ]
    # Inner join: feature rows without path statistics are dropped, and each
    # remaining feature row appears once per side present in the statistics.
    merged = features[feature_columns].merge(stats, on=["symbol", "open_time_ms"], how="inner")
    # The label method takes the best favourable excursion as the achievable edge.
    merged["max_achievable_gross_edge_bps"] = merged["mfe_bps"]
    merged["max_achievable_net_edge_bps"] = merged["max_achievable_gross_edge_bps"] - cost_bps
    merged["expected_net_edge_bps"] = merged["max_achievable_net_edge_bps"]
    merged["entry_target"] = (merged["max_achievable_net_edge_bps"] >= float(entry_conf["min_expected_net_edge_bps"])).astype("int8")
    merged["price_plan_id"] = plan["pricePlanId"]
    merged["price_plan_hash"] = plan["pricePlanConfigHash"]
    merged["cost_bps"] = cost_bps
    merged["label_method"] = ENTRY_LABEL_METHOD
    merged["label_version"] = LABEL_VERSION
    out = merged[
        [
            "sample_id",
            "symbol",
            "event_time",
            "side",
            "price_plan_id",
            "price_plan_hash",
            "target_hit",
            "stop_hit",
            "timeout_hit",
            "ambiguous_hit",
            "time_to_target_ms",
            "time_to_stop_ms",
            "gross_edge_bps",
            "future_return_bps",
            "mfe_bps",
            "mae_bps",
            "max_achievable_gross_edge_bps",
            "max_achievable_net_edge_bps",
            "cost_bps",
            "expected_net_edge_bps",
            "entry_target",
            "label_method",
            "split_id",
            "walk_forward_fold",
            "label_version",
        ]
    ].copy()
    path = root / "label" / "entry_labels.parquet"
    data_hash = write_parquet(path, out)
    _write_label_manifest(root / "label" / "entry_labels.manifest.json", path, out, data_hash)
    _write_distribution_report(root / "label" / "entry_label_report.md", out, "entry_target")
    logging.info("trader.training.entry_labels_written runId=%s rowCount=%s", args.run_id, len(out))


def build_position_state_samples(args: Any) -> None:
    """Derive initial position-state samples from the positive entry labels.

    Keeps the entry-label rows whose ``entry_target`` equals 1, adds a zero
    position age and zero unrealised PnL, and replaces the two excursion
    columns with numeric values where missing becomes 0 and negatives are
    raised to 0.  The samples and a manifest are written to the ``label``
    directory under the run root.

    Raises:
        ValueError: If the entry label file holds no rows.
    """
    root = run_root(args)
    entry_path = args.entry_label_path or root / "label" / "entry_labels.parquet"
    entry = read_parquet(entry_path)
    if entry.empty:
        raise ValueError("entry labels are required before building position samples")

    opened = entry.loc[entry["entry_target"].eq(1)].copy()
    opened["position_age_minutes"] = 0
    opened["unrealized_pnl_bps"] = 0.0
    for excursion in ("mfe_bps", "mae_bps"):
        numeric = pd.to_numeric(opened[excursion], errors="coerce")
        opened[excursion] = numeric.fillna(0.0).clip(lower=0)

    label_dir = root / "label"
    path = label_dir / "position_state_samples.parquet"
    data_hash = write_parquet(path, opened)
    extra = {"row_count": len(opened), "data_hash_sha256": data_hash}
    write_json(label_dir / "position_state_samples.manifest.json", manifest(path, extra))
    logging.info("trader.training.position_samples_written runId=%s rowCount=%s", args.run_id, len(opened))


def build_continue_exit_risk_labels(args: Any) -> None:
    """Build the continue, exit and risk label tables and write them to disk.

    One set of path statistics is computed over the ``continue`` horizon with
    the target/stop distances read from the price-plan context, split into
    long- and short-prefixed columns, and joined to the feature rows.  Three
    tables are derived from that join and written to the ``label`` directory
    under the run root as ``{name}_labels.parquet`` with a manifest each,
    followed by one combined markdown report.
    """
    root = run_root(args)
    labels = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)
    cost = _load_config(args.cost_config_path, DEFAULT_COST_CONFIG)
    plan = read_json(args.price_plan_context_path or root / "label" / "price_plan_context.json")
    features, replay = _base_frames(args)
    cost_bps = float(cost["fee_bps"]) + float(cost["slippage_bps"]) + float(cost["funding_cost_bps"])
    # The continue horizon drives all three label families here; the
    # horizon_minutes values of the exit and risk sections are not read.
    horizon = int(labels["continue"]["horizon_minutes"])
    target_bps = float(plan["targetDistanceBps"])
    stop_bps = float(plan["stopDistanceBps"])
    stats = _build_path_stats(replay, horizon, target_bps, stop_bps)
    # Pivot the per-side rows into side-prefixed columns, then restore the
    # plain names of the two join keys.
    long_stats = stats[stats["side"] == "LONG"].drop(columns=["side"]).add_prefix("long_")
    short_stats = stats[stats["side"] == "SHORT"].drop(columns=["side"]).add_prefix("short_")
    long_stats = long_stats.rename(columns={"long_symbol": "symbol", "long_open_time_ms": "open_time_ms"})
    short_stats = short_stats.rename(columns={"short_symbol": "symbol", "short_open_time_ms": "open_time_ms"})
    # NOTE(review): spread_bps is selected here but not read below.
    feature_columns = [
        "sample_id",
        "symbol",
        "event_time",
        "open_time_ms",
        "split_id",
        "walk_forward_fold",
        "spread_bps",
        "spread_rank_24h_pct",
        "realized_vol_15m_bps",
    ]
    # ret_15m_bps is optional; it only feeds the reversal label.
    if "ret_15m_bps" in features.columns:
        feature_columns.append("ret_15m_bps")
    # Inner joins: only feature rows with both long and short statistics remain.
    merged = features[feature_columns].merge(long_stats, on=["symbol", "open_time_ms"], how="inner")
    merged = merged.merge(short_stats, on=["symbol", "open_time_ms"], how="inner")
    min_continue = float(labels["continue"]["min_expected_continue_edge_bps"])
    adverse_threshold = float(labels["exit"]["adverse_move_bps"])
    # Missing current volatility becomes 0 and is then floored at 1.0.
    current_vol = merged["realized_vol_15m_bps"].astype(float).fillna(0.0).clip(lower=1.0)

    # Net edge of holding to the end of the horizon, per side.
    long_edge = merged["long_future_return_bps"] - cost_bps
    short_edge = merged["short_future_return_bps"] - cost_bps
    # Worst adverse excursion over both sides, and the largest move of any kind.
    path_risk = np.maximum(merged["long_mae_bps"], merged["short_mae_bps"])
    max_path_move = np.maximum.reduce([merged["long_mfe_bps"], merged["short_mfe_bps"], path_risk])
    if "ret_15m_bps" in merged.columns:
        # Reversal: the future long return and the trailing return differ in sign.
        # NOTE(review): a NaN in ret_15m_bps compares unequal and so would be
        # labelled 1, and a trailing return of exactly 0 counts as a reversal
        # against any non-zero future return - confirm both are intended.
        reversal = (np.sign(merged["long_future_return_bps"]) != np.sign(merged["ret_15m_bps"])).astype("int8")
    else:
        reversal = pd.Series(0, index=merged.index, dtype="int8")
    # Only the long-side volatility column is read.
    future_vol = merged["long_future_realized_vol_bps"].fillna(0.0)
    volatility_expansion = future_vol >= current_vol * float(labels["risk"]["vol_expansion_ratio"])
    # Based on the current row's spread rank, not on a future value.
    liquidity_deterioration = merged["spread_rank_24h_pct"].astype(float).fillna(0.0) >= 0.90

    rows_continue = pd.DataFrame(
        {
            "sample_id": merged["sample_id"],
            "symbol": merged["symbol"],
            "event_time": merged["event_time"],
            # Continue: enough net edge and an adverse excursion below the stop distance.
            "long_continue_target": ((long_edge >= min_continue) & (merged["long_mae_bps"] < stop_bps)).astype("int8"),
            "short_continue_target": ((short_edge >= min_continue) & (merged["short_mae_bps"] < stop_bps)).astype("int8"),
            "long_expected_continue_edge_bps": long_edge,
            "short_expected_continue_edge_bps": short_edge,
            "split_id": merged["split_id"],
            "walk_forward_fold": merged["walk_forward_fold"],
            "label_version": LABEL_VERSION,
        }
    )
    rows_exit = pd.DataFrame(
        {
            "sample_id": merged["sample_id"],
            "symbol": merged["symbol"],
            "event_time": merged["event_time"],
            # Exit: the stop was hit or the adverse excursion reached the threshold.
            "long_exit_target": ((merged["long_stop_hit"] == 1) | (merged["long_mae_bps"] >= adverse_threshold)).astype("int8"),
            "short_exit_target": ((merged["short_stop_hit"] == 1) | (merged["short_mae_bps"] >= adverse_threshold)).astype("int8"),
            "long_adverse_move_bps": merged["long_mae_bps"],
            "short_adverse_move_bps": merged["short_mae_bps"],
            "adverse_move_prob_label": (path_risk >= adverse_threshold).astype("int8"),
            "reversal_prob_label": reversal,
            "stop_hit_prob_label": ((merged["long_stop_hit"] == 1) | (merged["short_stop_hit"] == 1)).astype("int8"),
            "stagnation_prob_label": (merged["long_future_return_bps"].abs() <= float(labels["exit"]["stagnation_abs_return_bps"])).astype("int8"),
            "split_id": merged["split_id"],
            "walk_forward_fold": merged["walk_forward_fold"],
            "label_version": LABEL_VERSION,
        }
    )
    rows_risk = pd.DataFrame(
        {
            "sample_id": merged["sample_id"],
            "symbol": merged["symbol"],
            "event_time": merged["event_time"],
            # market_risk_target and market_drawdown_prob_label share one expression.
            "market_risk_target": (path_risk >= float(labels["risk"]["market_drawdown_bps"])).astype("int8"),
            "market_path_risk_bps": path_risk,
            "long_position_path_risk_bps": merged["long_mae_bps"],
            "short_position_path_risk_bps": merged["short_mae_bps"],
            "long_position_risk_target": (merged["long_mae_bps"] >= stop_bps).astype("int8"),
            "short_position_risk_target": (merged["short_mae_bps"] >= stop_bps).astype("int8"),
            "market_drawdown_prob_label": (path_risk >= float(labels["risk"]["market_drawdown_bps"])).astype("int8"),
            "volatility_expansion_prob_label": volatility_expansion.astype("int8"),
            "spike_prob_label": (max_path_move >= float(labels["risk"]["spike_bps"])).astype("int8"),
            "liquidity_deterioration_prob_label": liquidity_deterioration.astype("int8"),
            "position_drawdown_prob_label": (path_risk >= stop_bps).astype("int8"),
            "split_id": merged["split_id"],
            "walk_forward_fold": merged["walk_forward_fold"],
            "label_version": LABEL_VERSION,
        }
    )
    # (file stem, table, column summarised in the report)
    outputs = [
        ("continue", pd.DataFrame(rows_continue), "long_continue_target"),
        ("exit", pd.DataFrame(rows_exit), "long_exit_target"),
        ("risk", pd.DataFrame(rows_risk), "market_risk_target"),
    ]
    report_parts = ["# Continue Exit Risk Label Report", ""]
    for name, frame, target in outputs:
        path = root / "label" / f"{name}_labels.parquet"
        data_hash = write_parquet(path, frame)
        _write_label_manifest(root / "label" / f"{name}_labels.manifest.json", path, frame, data_hash)
        report_parts.append(f"## {name}")
        report_parts.append("")
        # Value counts of the table's headline target; {} for an empty table.
        report_parts.append(str(frame[target].value_counts(dropna=False).to_dict() if not frame.empty else {}))
        report_parts.append("")
        logging.info("trader.training.%s_labels_written runId=%s rowCount=%s", name, args.run_id, len(frame))
    write_text(root / "label" / "continue_exit_risk_label_report.md", "\n".join(report_parts) + "\n")


def _write_label_manifest(path, parquet_path, frame: pd.DataFrame, data_hash: str) -> None:
    """Write the manifest JSON for a label parquet file.

    The manifest is built by ``manifest`` from ``parquet_path`` plus the row
    count of ``frame``, the label version and the parquet data hash.
    """
    extra = {
        "row_count": len(frame),
        "label_version": LABEL_VERSION,
        "data_hash_sha256": data_hash,
    }
    document = manifest(parquet_path, extra)
    write_json(path, document)


def _write_distribution_report(path, frame: pd.DataFrame, column: str) -> None:
    """Write a short markdown report with the value distribution of ``column``.

    The report lists the row count, the column name and the value counts
    (missing values included); an empty frame reports an empty distribution.
    """
    if frame.empty:
        counts = {}
    else:
        counts = frame[column].value_counts(dropna=False).to_dict()
    report = "\n".join(
        [
            "# Label Report",
            "",
            f"- row_count: {len(frame)}",
            f"- target_column: {column}",
            f"- distribution: {counts}",
            "",
        ]
    )
    write_text(path, report)