Improve Trader V4 training pipeline

Align entry labels with max future edge, tune direction labeling, and harden regression evaluation. Add training diagnostics, price-plan search, feature screening, and nonlinear benchmark scripts.
2026-06-27 19:57:29 +08:00
parent e58e4a5572
commit 9acb3460a1
27 changed files with 2059 additions and 341 deletions
@@ -1,12 +1,14 @@
 from __future__ import annotations

 import logging
+from pathlib import Path
 from typing import Any

 import numpy as np
 import pandas as pd

 from trader_training.io_utils import (
+    DEFAULT_RAW_ROOT,
    manifest,
    read_parquet,
    require_columns,
@@ -49,7 +51,7 @@ def _rolling_rank_last(values: pd.Series, window: int) -> pd.Series:
 def _complete_days(frame: pd.DataFrame) -> pd.DataFrame:
    frame = frame.copy()
    frame["event_date"] = frame["event_time"].dt.strftime("%Y-%m-%d")
-    counts = frame.groupby(["symbol", "event_date"])["event_time"].count()
+    counts = frame.groupby(["symbol", "event_date"], observed=False)["event_time"].count()
    complete = counts[counts == 1440].reset_index()[["symbol", "event_date"]]
    return frame.merge(complete, on=["symbol", "event_date"], how="inner").drop(columns=["event_date"])

@@ -90,9 +92,18 @@ def build_feature_frame(args: Any) -> None:
        before = len(replay)
        replay = _complete_days(replay)
        logging.info("trader.training.feature_complete_days rowBefore=%s rowAfter=%s", before, len(replay))
+    raw_root = Path(getattr(args, "raw_root", None) or DEFAULT_RAW_ROOT)
+    book_features = _load_book_minute_features(raw_root, replay[["symbol", "event_time", "open_time_ms"]])
+    replay = replay.merge(book_features, on=["symbol", "open_time_ms"], how="left")
+    logging.info(
+        "trader.training.book_features_merged rowCount=%s bookRows=%s bookAvailableRows=%s",
+        len(replay),
+        len(book_features),
+        int(replay["book_top_imbalance"].notna().sum()),
+    )

    frames: list[pd.DataFrame] = []
-    for symbol, group in replay.groupby("symbol", sort=False):
+    for symbol, group in replay.groupby("symbol", sort=False, observed=False):
        group = group.sort_values("event_time").reset_index(drop=True).copy()
        close = group["close"].astype(float)
        high = group["high"].astype(float)
@@ -140,6 +151,14 @@ def build_feature_frame(args: Any) -> None:
        group["oi_delta_15m_bps"] = (group["open_interest"].astype(float) / group["open_interest"].astype(float).shift(15) - 1.0) * 10000.0
        group["oi_delta_60m_bps"] = (group["open_interest"].astype(float) / group["open_interest"].astype(float).shift(60) - 1.0) * 10000.0
        group["mark_index_basis_bps"] = (group["mark_price"].astype(float) / group["index_price"].astype(float) - 1.0) * 10000.0
+        group["book_pressure_spread_ratio"] = _safe_divide(group["book_microprice_basis_bps"].astype(float), group["spread_bps"].astype(float).abs().clip(lower=0.01))
+        group["book_pressure_taker_1m"] = group["book_microprice_basis_bps"].astype(float) * group["taker_imbalance_1m"].astype(float)
+        group["book_pressure_taker_5m"] = group["book_microprice_basis_bps"].astype(float) * group["taker_imbalance_5m"].astype(float)
+        group["book_l20_imbalance_taker_15m"] = group["book_depth_imbalance_l20"].astype(float) * group["taker_imbalance_15m"].astype(float)
+        group["book_l20_imbalance_ret_15m"] = group["book_depth_imbalance_l20"].astype(float) * group["ret_15m_bps"].astype(float)
+        group["book_pressure_vol_adjusted"] = _safe_divide(group["book_microprice_basis_bps"].astype(float), group["realized_vol_15m_bps"].astype(float).clip(lower=1.0))
+        group["book_depth_pressure_gap"] = group["book_depth_imbalance_l5"].astype(float) - group["book_depth_imbalance_l20"].astype(float)
+        group["book_pressure_reversal_15m"] = -group["book_microprice_basis_bps"].astype(float) * group["ret_15m_bps"].astype(float)
        liq_buy = group["liquidation_buy_notional_1m"].astype(float)
        liq_sell = group["liquidation_sell_notional_1m"].astype(float)
        liq_total_15 = (liq_buy + liq_sell).rolling(15, min_periods=1).sum()
@@ -159,12 +178,13 @@ def build_feature_frame(args: Any) -> None:
    frame["split_id"] = assign_split(frame["event_time"], split_manifest_path)
    frame["walk_forward_fold"] = np.where(frame["split_id"].eq(FIT_SPLIT), "fold_01", "NO_FOLD")
    frame["feature_version"] = FEATURE_VERSION
-    hard_na = frame[FEATURE_ORDER].isna().any(axis=1)
+    numeric_features = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
+    hard_na = numeric_features.isna().any(axis=1)
    optional_missing = frame["liquidation_available"].fillna(0).eq(0)
    frame["data_quality_flag"] = np.where(hard_na, "WARMUP", np.where(optional_missing, "PARTIAL_OPTIONAL", "OK"))
    ordered = frame[META_COLUMNS + FEATURE_ORDER].copy()
    for feature in FEATURE_ORDER:
-        ordered[feature] = pd.to_numeric(ordered[feature], errors="coerce").astype("float32")
+        ordered[feature] = pd.to_numeric(ordered[feature], errors="coerce").replace([np.inf, -np.inf], np.nan).astype("float32")

    feature_dir = root / "feature"
    data_hash = write_parquet(feature_dir / "feature_frame.parquet", ordered)
@@ -202,7 +222,7 @@ def build_feature_frame(args: Any) -> None:

 def write_feature_report(path, frame: pd.DataFrame, feature_schema_hash: str, feature_order_hash: str) -> None:
    split_rows = []
-    for split_id, group in frame.groupby("split_id", sort=True):
+    for split_id, group in frame.groupby("split_id", sort=True, observed=False):
        split_rows.append(
            {
                "split_id": split_id,
@@ -246,6 +266,8 @@ def write_feature_report(path, frame: pd.DataFrame, feature_schema_hash: str, fe
        "",
        f"- replay_1m_required_columns: present",
        f"- liquidation_available_share: {float(frame['liquidation_available'].mean()):.6f}",
+        f"- book_available_share: {float(frame['book_top_imbalance'].notna().mean()):.6f}",
+        f"- feature_rows_with_book_missing: {int(frame['book_top_imbalance'].isna().sum())}",
        f"- feature_rows_with_optional_liquidation_missing: {int(frame['data_quality_flag'].eq('PARTIAL_OPTIONAL').sum())}",
        "",
        "## Leakage Check",
@@ -286,6 +308,105 @@ def feature_order_hash() -> str:
    return sha256_json(FEATURE_ORDER)


+def _load_book_minute_features(raw_root: Path, replay_keys: pd.DataFrame) -> pd.DataFrame:
+    """Load per-minute order-book features for the (symbol, day) pairs in *replay_keys*.
+
+    For every distinct ``(symbol, event_date)`` pair, the partition
+    ``raw_root/table=book/exchange=BINANCE_FUTURES/symbol=<symbol>/dt=<event_date>/data.parquet``
+    is read through ``_read_book_day``. Missing partitions are logged and skipped.
+
+    Args:
+        raw_root: Root directory that contains the ``table=book`` partitions.
+        replay_keys: Frame with ``symbol``, ``event_time`` and ``open_time_ms``
+            columns; ``event_time`` is normalized with ``to_utc_series`` and
+            formatted as ``%Y-%m-%d`` to pick the daily partition.
+
+    Returns:
+        A frame with ``symbol``, ``open_time_ms`` and the columns listed by
+        ``_book_feature_columns()``. Only keys that have a matching book row are
+        returned (inner join); the frame is empty when there are no keys or no
+        partition produced any row.
+    """
+    output_columns = ["symbol", "open_time_ms", *_book_feature_columns()]
+    if replay_keys.empty:
+        return pd.DataFrame(columns=output_columns)
+    keys = replay_keys.copy()
+    keys["event_time"] = to_utc_series(keys["event_time"])
+    keys["event_date"] = keys["event_time"].dt.strftime("%Y-%m-%d")
+    frames: list[pd.DataFrame] = []
+    for (symbol, event_date), _ in keys.groupby(["symbol", "event_date"], sort=True, observed=False):
+        path = raw_root / "table=book" / "exchange=BINANCE_FUTURES" / f"symbol={symbol}" / f"dt={event_date}" / "data.parquet"
+        if not path.is_file():
+            logging.warning("trader.training.book_partition_missing symbol=%s eventDate=%s path=%s", symbol, event_date, path)
+            continue
+        day_features = _read_book_day(path, symbol)
+        logging.info(
+            "trader.training.book_partition_loaded symbol=%s eventDate=%s minuteRows=%s path=%s",
+            symbol,
+            event_date,
+            len(day_features),
+            path,
+        )
+        # Empty day frames carry object-typed columns only; keeping them out of
+        # pd.concat avoids the empty-entry deprecation path and keeps the
+        # numeric dtypes of the populated days intact.
+        if not day_features.empty:
+            frames.append(day_features)
+    if not frames:
+        return pd.DataFrame(columns=output_columns)
+    # Partitions may overlap on a minute; the row from the later partition wins.
+    out = pd.concat(frames, ignore_index=True).drop_duplicates(["symbol", "open_time_ms"], keep="last")
+    wanted = keys[["symbol", "open_time_ms"]].drop_duplicates()
+    return wanted.merge(out, on=["symbol", "open_time_ms"], how="inner")
+
+
+def _read_book_day(path: Path, symbol: str) -> pd.DataFrame:
+    """Read one daily book partition and reduce it to one feature row per minute.
+
+    Reads ``origin_time`` plus price/size for levels 0-19 of both sides, drops
+    rows missing the top-of-book fields, keeps the last snapshot inside each
+    minute, and derives top-of-book and depth features from it.
+
+    Args:
+        path: Parquet file of a single symbol/day book partition.
+        symbol: Symbol written into the ``symbol`` column of the result.
+
+    Returns:
+        A frame with ``symbol``, ``open_time_ms`` (epoch milliseconds of the
+        minute start) and the columns listed by ``_book_feature_columns()``;
+        an empty frame with those columns when no usable snapshot exists.
+    """
+    columns = ["origin_time"]
+    for side in ("bid", "ask"):
+        for level in range(20):
+            columns.extend([f"{side}_{level}_price", f"{side}_{level}_size"])
+    book = pd.read_parquet(path, columns=columns)
+    if book.empty:
+        return pd.DataFrame(columns=["symbol", "open_time_ms", *_book_feature_columns()])
+    book = book.dropna(subset=["origin_time", "bid_0_price", "ask_0_price", "bid_0_size", "ask_0_size"]).copy()
+    book["origin_time"] = to_utc_series(book["origin_time"])
+    # A timestamp that became NaT during normalization has no minute bucket and
+    # could never match a replay key, so it is dropped before the key is built.
+    book = book.dropna(subset=["origin_time"]).copy()
+    book["minute"] = book["origin_time"].dt.floor("min")
+    # Keep only the latest snapshot observed inside each minute.
+    # NOTE(review): the row is keyed by the minute start but holds the last
+    # snapshot of that minute - confirm consumers treat it as known at bar close.
+    book = book.sort_values("origin_time").drop_duplicates("minute", keep="last").reset_index(drop=True)
+    if book.empty:
+        return pd.DataFrame(columns=["symbol", "open_time_ms", *_book_feature_columns()])
+
+    bid0 = book["bid_0_price"].astype("float64").to_numpy()
+    ask0 = book["ask_0_price"].astype("float64").to_numpy()
+    bid0_size = book["bid_0_size"].astype("float64").to_numpy()
+    ask0_size = book["ask_0_size"].astype("float64").to_numpy()
+    mid = (bid0 + ask0) / 2.0
+    # Floor the denominator so an empty top of book cannot divide by zero.
+    top_denominator = np.maximum(bid0_size + ask0_size, 1e-12)
+    # Size-weighted price: each side's price is weighted by the opposite side's size.
+    microprice = (bid0 * ask0_size + ask0 * bid0_size) / top_denominator
+
+    bid_depth_l5 = _book_level_notional(book, "bid", 5)
+    ask_depth_l5 = _book_level_notional(book, "ask", 5)
+    bid_depth_l20 = _book_level_notional(book, "bid", 20)
+    ask_depth_l20 = _book_level_notional(book, "ask", 20)
+    total_l5 = bid_depth_l5 + ask_depth_l5
+    total_l20 = bid_depth_l20 + ask_depth_l20
+
+    # Convert to epoch milliseconds without assuming nanosecond storage: the raw
+    # int64 view of a datetime column depends on its resolution (s/ms/us/ns),
+    # so dividing it by a fixed 1_000_000 is only right for nanoseconds.
+    minute = book["minute"]
+    epoch = pd.Timestamp(0, tz=minute.dt.tz)
+    minute_ms = ((minute - epoch) // pd.Timedelta(milliseconds=1)).astype("int64")
+    return pd.DataFrame(
+        {
+            "symbol": symbol,
+            "open_time_ms": minute_ms,
+            "book_top_imbalance": (bid0_size - ask0_size) / top_denominator,
+            "book_microprice_basis_bps": (microprice / mid - 1.0) * 10000.0,
+            "book_bid_depth_l5_quote": bid_depth_l5,
+            "book_ask_depth_l5_quote": ask_depth_l5,
+            "book_depth_imbalance_l5": _depth_imbalance(bid_depth_l5, ask_depth_l5),
+            "book_depth_imbalance_l20": _depth_imbalance(bid_depth_l20, ask_depth_l20),
+            "book_depth_concentration_l5_l20": total_l5 / np.maximum(total_l20, 1e-12),
+        }
+    )
+
+
+def _book_level_notional(book: pd.DataFrame, side: str, level_count: int) -> np.ndarray:
+    """Return, per book row, the summed ``price * size`` over the first *level_count* levels of *side*.
+
+    Columns are looked up as ``{side}_{level}_price`` and ``{side}_{level}_size``
+    for ``level`` in ``range(level_count)``. Entries that are missing or fail
+    numeric coercion contribute 0.0.
+    """
+
+    def _as_float64(column: str) -> np.ndarray:
+        # Non-numeric and missing entries are coerced to NaN, then zeroed.
+        return pd.to_numeric(book[column], errors="coerce").fillna(0.0).to_numpy(dtype="float64")
+
+    notional = np.zeros(len(book), dtype="float64")
+    for level in range(level_count):
+        level_price = _as_float64(f"{side}_{level}_price")
+        level_size = _as_float64(f"{side}_{level}_size")
+        notional = notional + level_price * level_size
+    return notional
+
+
+def _depth_imbalance(bid_depth: np.ndarray, ask_depth: np.ndarray) -> np.ndarray:
+    """Return ``(bid - ask) / (bid + ask)`` element-wise, with the denominator floored at 1e-12."""
+    net_depth = bid_depth - ask_depth
+    # The floor keeps rows with no depth on either side from dividing by zero.
+    gross_depth = np.maximum(bid_depth + ask_depth, 1e-12)
+    return net_depth / gross_depth
+
+
+def _book_feature_columns() -> list[str]:
+    """Return the order-book feature column names, in output order, as a fresh list."""
+    names = (
+        "book_top_imbalance",
+        "book_microprice_basis_bps",
+        "book_bid_depth_l5_quote",
+        "book_ask_depth_l5_quote",
+        "book_depth_imbalance_l5",
+        "book_depth_imbalance_l20",
+        "book_depth_concentration_l5_l20",
+    )
+    return list(names)
+
+
 def _high_correlation_rows(frame: pd.DataFrame) -> list[dict[str, object]]:
    sample = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").dropna()
    if len(sample) > 5000: