Improve Trader V4 training pipeline

Align entry labels with max future edge, tune direction labeling, and harden regression evaluation.

Add training diagnostics, price-plan search, feature screening, and nonlinear benchmark scripts.
This commit is contained in:
Codex
2026-06-27 19:57:29 +08:00
parent e58e4a5572
commit 9acb3460a1
27 changed files with 2059 additions and 341 deletions
+126 -5
View File
@@ -1,12 +1,14 @@
from __future__ import annotations
import logging
from pathlib import Path
from typing import Any
import numpy as np
import pandas as pd
from trader_training.io_utils import (
DEFAULT_RAW_ROOT,
manifest,
read_parquet,
require_columns,
@@ -49,7 +51,7 @@ def _rolling_rank_last(values: pd.Series, window: int) -> pd.Series:
def _complete_days(frame: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows that belong to a fully populated (symbol, date) day.

    A day is derived from ``event_time`` formatted as ``%Y-%m-%d`` and is
    considered complete when it holds exactly 1440 non-null ``event_time``
    values (one per minute) for that symbol.

    Args:
        frame: Frame with at least ``symbol`` and a datetime-like
            ``event_time`` column. It is copied, never mutated.

    Returns:
        The rows of ``frame`` whose (symbol, date) pair is complete, without
        the temporary ``event_date`` helper column. The result comes from an
        inner merge, so it carries a fresh index.
    """
    minutes_per_day = 1440
    frame = frame.copy()
    frame["event_date"] = frame["event_time"].dt.strftime("%Y-%m-%d")
    # Group once; ``observed=False`` is spelled out so the behaviour for
    # categorical keys does not depend on the pandas default.
    counts = frame.groupby(["symbol", "event_date"], observed=False)["event_time"].count()
    complete = counts[counts == minutes_per_day].reset_index()[["symbol", "event_date"]]
    return frame.merge(complete, on=["symbol", "event_date"], how="inner").drop(columns=["event_date"])
@@ -90,9 +92,18 @@ def build_feature_frame(args: Any) -> None:
before = len(replay)
replay = _complete_days(replay)
logging.info("trader.training.feature_complete_days rowBefore=%s rowAfter=%s", before, len(replay))
raw_root = Path(getattr(args, "raw_root", None) or DEFAULT_RAW_ROOT)
book_features = _load_book_minute_features(raw_root, replay[["symbol", "event_time", "open_time_ms"]])
replay = replay.merge(book_features, on=["symbol", "open_time_ms"], how="left")
logging.info(
"trader.training.book_features_merged rowCount=%s bookRows=%s bookAvailableRows=%s",
len(replay),
len(book_features),
int(replay["book_top_imbalance"].notna().sum()),
)
frames: list[pd.DataFrame] = []
for symbol, group in replay.groupby("symbol", sort=False):
for symbol, group in replay.groupby("symbol", sort=False, observed=False):
group = group.sort_values("event_time").reset_index(drop=True).copy()
close = group["close"].astype(float)
high = group["high"].astype(float)
@@ -140,6 +151,14 @@ def build_feature_frame(args: Any) -> None:
group["oi_delta_15m_bps"] = (group["open_interest"].astype(float) / group["open_interest"].astype(float).shift(15) - 1.0) * 10000.0
group["oi_delta_60m_bps"] = (group["open_interest"].astype(float) / group["open_interest"].astype(float).shift(60) - 1.0) * 10000.0
group["mark_index_basis_bps"] = (group["mark_price"].astype(float) / group["index_price"].astype(float) - 1.0) * 10000.0
group["book_pressure_spread_ratio"] = _safe_divide(group["book_microprice_basis_bps"].astype(float), group["spread_bps"].astype(float).abs().clip(lower=0.01))
group["book_pressure_taker_1m"] = group["book_microprice_basis_bps"].astype(float) * group["taker_imbalance_1m"].astype(float)
group["book_pressure_taker_5m"] = group["book_microprice_basis_bps"].astype(float) * group["taker_imbalance_5m"].astype(float)
group["book_l20_imbalance_taker_15m"] = group["book_depth_imbalance_l20"].astype(float) * group["taker_imbalance_15m"].astype(float)
group["book_l20_imbalance_ret_15m"] = group["book_depth_imbalance_l20"].astype(float) * group["ret_15m_bps"].astype(float)
group["book_pressure_vol_adjusted"] = _safe_divide(group["book_microprice_basis_bps"].astype(float), group["realized_vol_15m_bps"].astype(float).clip(lower=1.0))
group["book_depth_pressure_gap"] = group["book_depth_imbalance_l5"].astype(float) - group["book_depth_imbalance_l20"].astype(float)
group["book_pressure_reversal_15m"] = -group["book_microprice_basis_bps"].astype(float) * group["ret_15m_bps"].astype(float)
liq_buy = group["liquidation_buy_notional_1m"].astype(float)
liq_sell = group["liquidation_sell_notional_1m"].astype(float)
liq_total_15 = (liq_buy + liq_sell).rolling(15, min_periods=1).sum()
@@ -159,12 +178,13 @@ def build_feature_frame(args: Any) -> None:
frame["split_id"] = assign_split(frame["event_time"], split_manifest_path)
frame["walk_forward_fold"] = np.where(frame["split_id"].eq(FIT_SPLIT), "fold_01", "NO_FOLD")
frame["feature_version"] = FEATURE_VERSION
hard_na = frame[FEATURE_ORDER].isna().any(axis=1)
numeric_features = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
hard_na = numeric_features.isna().any(axis=1)
optional_missing = frame["liquidation_available"].fillna(0).eq(0)
frame["data_quality_flag"] = np.where(hard_na, "WARMUP", np.where(optional_missing, "PARTIAL_OPTIONAL", "OK"))
ordered = frame[META_COLUMNS + FEATURE_ORDER].copy()
for feature in FEATURE_ORDER:
ordered[feature] = pd.to_numeric(ordered[feature], errors="coerce").astype("float32")
ordered[feature] = pd.to_numeric(ordered[feature], errors="coerce").replace([np.inf, -np.inf], np.nan).astype("float32")
feature_dir = root / "feature"
data_hash = write_parquet(feature_dir / "feature_frame.parquet", ordered)
@@ -202,7 +222,7 @@ def build_feature_frame(args: Any) -> None:
def write_feature_report(path, frame: pd.DataFrame, feature_schema_hash: str, feature_order_hash: str) -> None:
split_rows = []
for split_id, group in frame.groupby("split_id", sort=True):
for split_id, group in frame.groupby("split_id", sort=True, observed=False):
split_rows.append(
{
"split_id": split_id,
@@ -246,6 +266,8 @@ def write_feature_report(path, frame: pd.DataFrame, feature_schema_hash: str, fe
"",
f"- replay_1m_required_columns: present",
f"- liquidation_available_share: {float(frame['liquidation_available'].mean()):.6f}",
f"- book_available_share: {float(frame['book_top_imbalance'].notna().mean()):.6f}",
f"- feature_rows_with_book_missing: {int(frame['book_top_imbalance'].isna().sum())}",
f"- feature_rows_with_optional_liquidation_missing: {int(frame['data_quality_flag'].eq('PARTIAL_OPTIONAL').sum())}",
"",
"## Leakage Check",
@@ -286,6 +308,105 @@ def feature_order_hash() -> str:
return sha256_json(FEATURE_ORDER)
def _load_book_minute_features(raw_root: Path, replay_keys: pd.DataFrame) -> pd.DataFrame:
    """Gather per-minute book features for every (symbol, day) in ``replay_keys``.

    For each distinct symbol/date pair the matching daily parquet partition
    under ``raw_root`` is read through ``_read_book_day``. Partitions that do
    not exist on disk are logged and skipped.

    Args:
        raw_root: Root directory holding the ``table=book`` partitions.
        replay_keys: Frame with ``symbol``, ``event_time`` and ``open_time_ms``.

    Returns:
        One row per requested (symbol, open_time_ms) key that has book data,
        carrying the book feature columns. An empty frame with the expected
        columns is returned when there are no keys or no partition was loaded.
    """

    def _empty_result() -> pd.DataFrame:
        return pd.DataFrame(columns=["symbol", "open_time_ms", *_book_feature_columns()])

    if replay_keys.empty:
        return _empty_result()

    keyed = replay_keys.copy()
    keyed["event_time"] = to_utc_series(keyed["event_time"])
    keyed["event_date"] = keyed["event_time"].dt.strftime("%Y-%m-%d")

    loaded: list[pd.DataFrame] = []
    partitions = keyed.groupby(["symbol", "event_date"], sort=True, observed=False)
    for (symbol, event_date), _ in partitions:
        path = raw_root.joinpath(
            "table=book",
            "exchange=BINANCE_FUTURES",
            f"symbol={symbol}",
            f"dt={event_date}",
            "data.parquet",
        )
        if path.is_file():
            day_features = _read_book_day(path, symbol)
            loaded.append(day_features)
            logging.info(
                "trader.training.book_partition_loaded symbol=%s eventDate=%s minuteRows=%s path=%s",
                symbol,
                event_date,
                len(day_features),
                path,
            )
        else:
            logging.warning("trader.training.book_partition_missing symbol=%s eventDate=%s path=%s", symbol, event_date, path)

    if not loaded:
        return _empty_result()

    # When the same minute shows up more than once, the later-loaded row wins.
    combined = pd.concat(loaded, ignore_index=True)
    combined = combined.drop_duplicates(["symbol", "open_time_ms"], keep="last")
    requested = keyed[["symbol", "open_time_ms"]].drop_duplicates()
    return requested.merge(combined, on=["symbol", "open_time_ms"], how="inner")
def _read_book_day(path: Path, symbol: str) -> pd.DataFrame:
    """Reduce one daily book parquet partition to one feature row per minute.

    Reads ``origin_time`` plus 20 bid and 20 ask price/size levels, drops
    snapshots missing the timestamp or any top-of-book field, keeps the last
    snapshot inside each minute and derives the book feature columns.

    Args:
        path: Parquet file of a single symbol/day book partition.
        symbol: Symbol written into the ``symbol`` column of the result.

    Returns:
        Frame with ``symbol``, ``open_time_ms`` (minute start as epoch
        milliseconds) and the columns named by ``_book_feature_columns``.
        An empty frame with those columns when no usable snapshot exists.
    """
    empty_columns = ["symbol", "open_time_ms", *_book_feature_columns()]
    columns = ["origin_time"]
    for side in ("bid", "ask"):
        for level in range(20):
            columns.extend([f"{side}_{level}_price", f"{side}_{level}_size"])
    book = pd.read_parquet(path, columns=columns)
    if book.empty:
        return pd.DataFrame(columns=empty_columns)

    book = book.dropna(subset=["origin_time", "bid_0_price", "ask_0_price", "bid_0_size", "ask_0_size"]).copy()
    # NOTE(review): to_utc_series is defined elsewhere; it is expected to
    # return a datetime Series (the .dt accessor below relies on that).
    book["origin_time"] = to_utc_series(book["origin_time"])
    book["minute"] = book["origin_time"].dt.floor("min")
    # Sorting first makes keep="last" select the latest snapshot of each minute.
    book = book.sort_values("origin_time").drop_duplicates("minute", keep="last").reset_index(drop=True)
    if book.empty:
        return pd.DataFrame(columns=empty_columns)

    bid0 = book["bid_0_price"].astype("float64").to_numpy()
    ask0 = book["ask_0_price"].astype("float64").to_numpy()
    bid0_size = book["bid_0_size"].astype("float64").to_numpy()
    ask0_size = book["ask_0_size"].astype("float64").to_numpy()
    mid = (bid0 + ask0) / 2.0
    # Clamp so an empty top of book cannot divide by zero.
    top_denominator = np.maximum(bid0_size + ask0_size, 1e-12)
    # Each price is weighted by the size resting on the opposite side.
    microprice = (bid0 * ask0_size + ask0 * bid0_size) / top_denominator
    bid_depth_l5 = _book_level_notional(book, "bid", 5)
    ask_depth_l5 = _book_level_notional(book, "ask", 5)
    bid_depth_l20 = _book_level_notional(book, "bid", 20)
    ask_depth_l20 = _book_level_notional(book, "ask", 20)
    total_l5 = bid_depth_l5 + ask_depth_l5
    total_l20 = bid_depth_l20 + ask_depth_l20

    # Convert through timedelta arithmetic instead of a raw int64 cast: the
    # cast returns counts in the column's storage unit, which is only
    # nanoseconds for datetime64[ns]. Dividing a timedelta by one millisecond
    # gives epoch milliseconds for any resolution, tz-aware or naive.
    minute = book["minute"]
    epoch = pd.Timestamp(0, tz=minute.dt.tz)
    minute_ms = ((minute - epoch) // pd.Timedelta(milliseconds=1)).astype("int64")

    return pd.DataFrame(
        {
            "symbol": symbol,
            "open_time_ms": minute_ms,
            "book_top_imbalance": (bid0_size - ask0_size) / top_denominator,
            "book_microprice_basis_bps": (microprice / mid - 1.0) * 10000.0,
            "book_bid_depth_l5_quote": bid_depth_l5,
            "book_ask_depth_l5_quote": ask_depth_l5,
            "book_depth_imbalance_l5": _depth_imbalance(bid_depth_l5, ask_depth_l5),
            "book_depth_imbalance_l20": _depth_imbalance(bid_depth_l20, ask_depth_l20),
            "book_depth_concentration_l5_l20": total_l5 / np.maximum(total_l20, 1e-12),
        }
    )
def _book_level_notional(book: pd.DataFrame, side: str, level_count: int) -> np.ndarray:
    """Sum price * size over the first ``level_count`` levels of one book side.

    Args:
        book: Frame holding ``{side}_{level}_price`` / ``{side}_{level}_size``
            columns for every requested level.
        side: Column prefix of the book side, e.g. ``"bid"`` or ``"ask"``.
        level_count: Number of levels, starting at level 0, to accumulate.

    Returns:
        A float64 array with one notional total per row of ``book``. Values
        that are missing or not numeric contribute zero.
    """

    def _as_float(column: str) -> np.ndarray:
        # Unparseable entries become NaN and are then treated as zero.
        return pd.to_numeric(book[column], errors="coerce").fillna(0.0).to_numpy(dtype="float64")

    notional = np.zeros(len(book), dtype="float64")
    for depth in range(level_count):
        prices = _as_float(f"{side}_{depth}_price")
        sizes = _as_float(f"{side}_{depth}_size")
        notional += prices * sizes
    return notional
def _depth_imbalance(bid_depth: np.ndarray, ask_depth: np.ndarray) -> np.ndarray:
    """Return the signed bid/ask depth imbalance, element-wise.

    The result is ``(bid - ask) / (bid + ask)``. The denominator is floored
    at 1e-12 so rows with no depth on either side produce 0 rather than a
    division by zero.
    """
    net_depth = bid_depth - ask_depth
    gross_depth = np.maximum(bid_depth + ask_depth, 1e-12)
    return net_depth / gross_depth
def _book_feature_columns() -> list[str]:
    """Return the names of the per-minute book feature columns, in order.

    A new list is built on every call, so callers may modify the result.
    """
    top_of_book = ["book_top_imbalance", "book_microprice_basis_bps"]
    depth_quote = ["book_bid_depth_l5_quote", "book_ask_depth_l5_quote"]
    depth_shape = [
        "book_depth_imbalance_l5",
        "book_depth_imbalance_l20",
        "book_depth_concentration_l5_l20",
    ]
    return [*top_of_book, *depth_quote, *depth_shape]
def _high_correlation_rows(frame: pd.DataFrame) -> list[dict[str, object]]:
sample = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").dropna()
if len(sample) > 5000: