Improve Trader V4 training pipeline
Align entry labels with max future edge, tune direction labeling, and harden regression evaluation. Add training diagnostics, price-plan search, feature screening, and nonlinear benchmark scripts.
This commit is contained in:
@@ -1,12 +1,14 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from trader_training.io_utils import (
|
||||
DEFAULT_RAW_ROOT,
|
||||
manifest,
|
||||
read_parquet,
|
||||
require_columns,
|
||||
@@ -49,7 +51,7 @@ def _rolling_rank_last(values: pd.Series, window: int) -> pd.Series:
|
||||
def _complete_days(frame: pd.DataFrame) -> pd.DataFrame:
|
||||
frame = frame.copy()
|
||||
frame["event_date"] = frame["event_time"].dt.strftime("%Y-%m-%d")
|
||||
counts = frame.groupby(["symbol", "event_date"])["event_time"].count()
|
||||
counts = frame.groupby(["symbol", "event_date"], observed=False)["event_time"].count()
|
||||
complete = counts[counts == 1440].reset_index()[["symbol", "event_date"]]
|
||||
return frame.merge(complete, on=["symbol", "event_date"], how="inner").drop(columns=["event_date"])
|
||||
|
||||
@@ -90,9 +92,18 @@ def build_feature_frame(args: Any) -> None:
|
||||
before = len(replay)
|
||||
replay = _complete_days(replay)
|
||||
logging.info("trader.training.feature_complete_days rowBefore=%s rowAfter=%s", before, len(replay))
|
||||
raw_root = Path(getattr(args, "raw_root", None) or DEFAULT_RAW_ROOT)
|
||||
book_features = _load_book_minute_features(raw_root, replay[["symbol", "event_time", "open_time_ms"]])
|
||||
replay = replay.merge(book_features, on=["symbol", "open_time_ms"], how="left")
|
||||
logging.info(
|
||||
"trader.training.book_features_merged rowCount=%s bookRows=%s bookAvailableRows=%s",
|
||||
len(replay),
|
||||
len(book_features),
|
||||
int(replay["book_top_imbalance"].notna().sum()),
|
||||
)
|
||||
|
||||
frames: list[pd.DataFrame] = []
|
||||
for symbol, group in replay.groupby("symbol", sort=False):
|
||||
for symbol, group in replay.groupby("symbol", sort=False, observed=False):
|
||||
group = group.sort_values("event_time").reset_index(drop=True).copy()
|
||||
close = group["close"].astype(float)
|
||||
high = group["high"].astype(float)
|
||||
@@ -140,6 +151,14 @@ def build_feature_frame(args: Any) -> None:
|
||||
group["oi_delta_15m_bps"] = (group["open_interest"].astype(float) / group["open_interest"].astype(float).shift(15) - 1.0) * 10000.0
|
||||
group["oi_delta_60m_bps"] = (group["open_interest"].astype(float) / group["open_interest"].astype(float).shift(60) - 1.0) * 10000.0
|
||||
group["mark_index_basis_bps"] = (group["mark_price"].astype(float) / group["index_price"].astype(float) - 1.0) * 10000.0
|
||||
group["book_pressure_spread_ratio"] = _safe_divide(group["book_microprice_basis_bps"].astype(float), group["spread_bps"].astype(float).abs().clip(lower=0.01))
|
||||
group["book_pressure_taker_1m"] = group["book_microprice_basis_bps"].astype(float) * group["taker_imbalance_1m"].astype(float)
|
||||
group["book_pressure_taker_5m"] = group["book_microprice_basis_bps"].astype(float) * group["taker_imbalance_5m"].astype(float)
|
||||
group["book_l20_imbalance_taker_15m"] = group["book_depth_imbalance_l20"].astype(float) * group["taker_imbalance_15m"].astype(float)
|
||||
group["book_l20_imbalance_ret_15m"] = group["book_depth_imbalance_l20"].astype(float) * group["ret_15m_bps"].astype(float)
|
||||
group["book_pressure_vol_adjusted"] = _safe_divide(group["book_microprice_basis_bps"].astype(float), group["realized_vol_15m_bps"].astype(float).clip(lower=1.0))
|
||||
group["book_depth_pressure_gap"] = group["book_depth_imbalance_l5"].astype(float) - group["book_depth_imbalance_l20"].astype(float)
|
||||
group["book_pressure_reversal_15m"] = -group["book_microprice_basis_bps"].astype(float) * group["ret_15m_bps"].astype(float)
|
||||
liq_buy = group["liquidation_buy_notional_1m"].astype(float)
|
||||
liq_sell = group["liquidation_sell_notional_1m"].astype(float)
|
||||
liq_total_15 = (liq_buy + liq_sell).rolling(15, min_periods=1).sum()
|
||||
@@ -159,12 +178,13 @@ def build_feature_frame(args: Any) -> None:
|
||||
frame["split_id"] = assign_split(frame["event_time"], split_manifest_path)
|
||||
frame["walk_forward_fold"] = np.where(frame["split_id"].eq(FIT_SPLIT), "fold_01", "NO_FOLD")
|
||||
frame["feature_version"] = FEATURE_VERSION
|
||||
hard_na = frame[FEATURE_ORDER].isna().any(axis=1)
|
||||
numeric_features = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
|
||||
hard_na = numeric_features.isna().any(axis=1)
|
||||
optional_missing = frame["liquidation_available"].fillna(0).eq(0)
|
||||
frame["data_quality_flag"] = np.where(hard_na, "WARMUP", np.where(optional_missing, "PARTIAL_OPTIONAL", "OK"))
|
||||
ordered = frame[META_COLUMNS + FEATURE_ORDER].copy()
|
||||
for feature in FEATURE_ORDER:
|
||||
ordered[feature] = pd.to_numeric(ordered[feature], errors="coerce").astype("float32")
|
||||
ordered[feature] = pd.to_numeric(ordered[feature], errors="coerce").replace([np.inf, -np.inf], np.nan).astype("float32")
|
||||
|
||||
feature_dir = root / "feature"
|
||||
data_hash = write_parquet(feature_dir / "feature_frame.parquet", ordered)
|
||||
@@ -202,7 +222,7 @@ def build_feature_frame(args: Any) -> None:
|
||||
|
||||
def write_feature_report(path, frame: pd.DataFrame, feature_schema_hash: str, feature_order_hash: str) -> None:
|
||||
split_rows = []
|
||||
for split_id, group in frame.groupby("split_id", sort=True):
|
||||
for split_id, group in frame.groupby("split_id", sort=True, observed=False):
|
||||
split_rows.append(
|
||||
{
|
||||
"split_id": split_id,
|
||||
@@ -246,6 +266,8 @@ def write_feature_report(path, frame: pd.DataFrame, feature_schema_hash: str, fe
|
||||
"",
|
||||
f"- replay_1m_required_columns: present",
|
||||
f"- liquidation_available_share: {float(frame['liquidation_available'].mean()):.6f}",
|
||||
f"- book_available_share: {float(frame['book_top_imbalance'].notna().mean()):.6f}",
|
||||
f"- feature_rows_with_book_missing: {int(frame['book_top_imbalance'].isna().sum())}",
|
||||
f"- feature_rows_with_optional_liquidation_missing: {int(frame['data_quality_flag'].eq('PARTIAL_OPTIONAL').sum())}",
|
||||
"",
|
||||
"## Leakage Check",
|
||||
@@ -286,6 +308,105 @@ def feature_order_hash() -> str:
|
||||
return sha256_json(FEATURE_ORDER)
|
||||
|
||||
|
||||
def _load_book_minute_features(raw_root: Path, replay_keys: pd.DataFrame) -> pd.DataFrame:
    """Load per-minute order-book features for the requested replay keys.

    For every distinct (symbol, date) pair in ``replay_keys`` the matching
    ``table=book/exchange=BINANCE_FUTURES/symbol=<symbol>/dt=<date>/data.parquet``
    partition under ``raw_root`` is read through ``_read_book_day``. Missing
    partitions are logged as warnings and skipped.

    Args:
        raw_root: Root directory holding the raw partitioned tables.
        replay_keys: Frame with ``symbol``, ``event_time`` and ``open_time_ms``
            columns identifying the minutes for which features are wanted.

    Returns:
        One row per requested (symbol, open_time_ms) pair that has book data,
        with the columns listed by ``_book_feature_columns``. An empty frame
        with those columns is returned when there are no keys or no partition
        could be loaded.
    """

    def _empty_result() -> pd.DataFrame:
        return pd.DataFrame(columns=["symbol", "open_time_ms", *_book_feature_columns()])

    if replay_keys.empty:
        return _empty_result()

    lookup = replay_keys.copy()
    lookup["event_time"] = to_utc_series(lookup["event_time"])
    lookup["event_date"] = lookup["event_time"].dt.strftime("%Y-%m-%d")

    book_root = raw_root / "table=book" / "exchange=BINANCE_FUTURES"
    per_day: list[pd.DataFrame] = []
    partitions = lookup.groupby(["symbol", "event_date"], sort=True, observed=False)
    for (symbol, event_date), _rows in partitions:
        path = book_root / f"symbol={symbol}" / f"dt={event_date}" / "data.parquet"
        if path.is_file():
            day_features = _read_book_day(path, symbol)
            per_day.append(day_features)
            logging.info(
                "trader.training.book_partition_loaded symbol=%s eventDate=%s minuteRows=%s path=%s",
                symbol,
                event_date,
                len(day_features),
                path,
            )
        else:
            logging.warning(
                "trader.training.book_partition_missing symbol=%s eventDate=%s path=%s",
                symbol,
                event_date,
                path,
            )

    if not per_day:
        return _empty_result()

    # When partitions overlap on a key, the row from the later partition wins.
    combined = pd.concat(per_day, ignore_index=True)
    deduped = combined.drop_duplicates(["symbol", "open_time_ms"], keep="last")
    requested = lookup[["symbol", "open_time_ms"]].drop_duplicates()
    return requested.merge(deduped, on=["symbol", "open_time_ms"], how="inner")
|
||||
|
||||
|
||||
def _read_book_day(path: Path, symbol: str) -> pd.DataFrame:
    """Read one order-book parquet partition and reduce it to per-minute features.

    The partition must provide ``origin_time`` plus ``{bid,ask}_{0..19}_price``
    and ``{bid,ask}_{0..19}_size`` columns. Rows missing the timestamp or any
    top-of-book field are dropped, and for every minute only the latest
    snapshot (by ``origin_time``) is kept.

    Args:
        path: Parquet file of a single symbol/day book partition.
        symbol: Symbol written into the ``symbol`` column of the result.

    Returns:
        A frame with ``symbol``, ``open_time_ms`` (epoch milliseconds of the
        floored minute) and the columns listed by ``_book_feature_columns``.
        It is empty, with those columns, when the partition has no usable rows.
    """
    result_columns = ["symbol", "open_time_ms", *_book_feature_columns()]

    columns = ["origin_time"]
    for side in ("bid", "ask"):
        for level in range(20):
            columns.extend([f"{side}_{level}_price", f"{side}_{level}_size"])
    book = pd.read_parquet(path, columns=columns)
    if book.empty:
        return pd.DataFrame(columns=result_columns)

    book = book.dropna(subset=["origin_time", "bid_0_price", "ask_0_price", "bid_0_size", "ask_0_size"]).copy()
    book["origin_time"] = to_utc_series(book["origin_time"])
    book["minute"] = book["origin_time"].dt.floor("min")
    # Timestamps that turned into NaT during conversion cannot be keyed to a
    # minute and would break the integer conversion below.
    book = book.dropna(subset=["minute"])
    # A stable sort makes the "last snapshot of the minute" choice deterministic
    # when several rows share the same origin_time.
    # NOTE(review): the kept snapshot is the latest one inside the minute, yet
    # it is keyed by the minute's start -- confirm this matches the point at
    # which the bar's features are considered known.
    book = (
        book.sort_values("origin_time", kind="mergesort")
        .drop_duplicates("minute", keep="last")
        .reset_index(drop=True)
    )
    if book.empty:
        return pd.DataFrame(columns=result_columns)

    bid0 = book["bid_0_price"].astype("float64").to_numpy()
    ask0 = book["ask_0_price"].astype("float64").to_numpy()
    bid0_size = book["bid_0_size"].astype("float64").to_numpy()
    ask0_size = book["ask_0_size"].astype("float64").to_numpy()
    mid = (bid0 + ask0) / 2.0
    top_denominator = np.maximum(bid0_size + ask0_size, 1e-12)
    # Size-weighted price: each side's price is weighted by the opposite size.
    microprice = (bid0 * ask0_size + ask0 * bid0_size) / top_denominator

    bid_depth_l5 = _book_level_notional(book, "bid", 5)
    ask_depth_l5 = _book_level_notional(book, "ask", 5)
    bid_depth_l20 = _book_level_notional(book, "bid", 20)
    ask_depth_l20 = _book_level_notional(book, "ask", 20)
    total_l5 = bid_depth_l5 + ask_depth_l5
    total_l20 = bid_depth_l20 + ask_depth_l20

    # Convert through a timedelta so the result is epoch milliseconds whatever
    # the datetime resolution is (ns, us, ms). A raw int64 cast followed by
    # ``// 1_000_000`` is only correct for nanosecond data.
    epoch = pd.Timestamp(0, tz=book["minute"].dt.tz)
    minute_ms = ((book["minute"] - epoch) // pd.Timedelta(milliseconds=1)).astype("int64")
    return pd.DataFrame(
        {
            "symbol": symbol,
            "open_time_ms": minute_ms,
            "book_top_imbalance": (bid0_size - ask0_size) / top_denominator,
            "book_microprice_basis_bps": (microprice / mid - 1.0) * 10000.0,
            "book_bid_depth_l5_quote": bid_depth_l5,
            "book_ask_depth_l5_quote": ask_depth_l5,
            "book_depth_imbalance_l5": _depth_imbalance(bid_depth_l5, ask_depth_l5),
            "book_depth_imbalance_l20": _depth_imbalance(bid_depth_l20, ask_depth_l20),
            "book_depth_concentration_l5_l20": total_l5 / np.maximum(total_l20, 1e-12),
        }
    )
|
||||
|
||||
|
||||
def _book_level_notional(book: pd.DataFrame, side: str, level_count: int) -> np.ndarray:
|
||||
total = np.zeros(len(book), dtype="float64")
|
||||
for level in range(level_count):
|
||||
price = pd.to_numeric(book[f"{side}_{level}_price"], errors="coerce").fillna(0.0).to_numpy(dtype="float64")
|
||||
size = pd.to_numeric(book[f"{side}_{level}_size"], errors="coerce").fillna(0.0).to_numpy(dtype="float64")
|
||||
total += price * size
|
||||
return total
|
||||
|
||||
|
||||
def _depth_imbalance(bid_depth: np.ndarray, ask_depth: np.ndarray) -> np.ndarray:
|
||||
return (bid_depth - ask_depth) / np.maximum(bid_depth + ask_depth, 1e-12)
|
||||
|
||||
|
||||
def _book_feature_columns() -> list[str]:
|
||||
return [
|
||||
"book_top_imbalance",
|
||||
"book_microprice_basis_bps",
|
||||
"book_bid_depth_l5_quote",
|
||||
"book_ask_depth_l5_quote",
|
||||
"book_depth_imbalance_l5",
|
||||
"book_depth_imbalance_l20",
|
||||
"book_depth_concentration_l5_l20",
|
||||
]
|
||||
|
||||
|
||||
def _high_correlation_rows(frame: pd.DataFrame) -> list[dict[str, object]]:
|
||||
sample = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").dropna()
|
||||
if len(sample) > 5000:
|
||||
|
||||
Reference in New Issue
Block a user