9acb3460a1
Align entry labels with max future edge, tune direction labeling, and harden regression evaluation. Add training diagnostics, price-plan search, feature screening, and nonlinear benchmark scripts.
464 lines
23 KiB
Python
464 lines
23 KiB
Python
from __future__ import annotations
|
|
|
|
import logging
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from trader_training.io_utils import (
|
|
DEFAULT_RAW_ROOT,
|
|
manifest,
|
|
read_parquet,
|
|
require_columns,
|
|
run_root,
|
|
sha256_json,
|
|
to_utc_series,
|
|
write_json,
|
|
write_parquet,
|
|
write_text,
|
|
)
|
|
from trader_training.replay import assign_split
|
|
from trader_training.schemas import FEATURE_ORDER, FEATURE_VERSION, FEATURES, FIT_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
|
|
|
|
|
# Identifier / bookkeeping columns that are placed ahead of the feature columns
# when the feature frame is assembled (`META_COLUMNS + FEATURE_ORDER`).
META_COLUMNS = [
    "sample_id", "symbol", "event_time", "open_time_ms",
    "split_id", "walk_forward_fold", "feature_version", "data_quality_flag",
]
|
|
|
|
|
|
def _safe_divide(numerator: pd.Series, denominator: pd.Series, default: float = 0.0) -> pd.Series:
    """Divide two series element-wise, using ``default`` wherever the result is undefined.

    Zero denominators are masked to NaN before the division; any quotient that
    is infinite or NaN afterwards is replaced by ``default``.
    """
    masked_denominator = denominator.replace(0, np.nan)
    quotient = numerator / masked_denominator
    quotient = quotient.replace([np.inf, -np.inf], np.nan)
    return quotient.fillna(default)
|
|
|
|
|
|
def _rolling_rank_last(values: pd.Series, window: int) -> pd.Series:
    """Return, per row, the fraction of the trailing window that is <= the newest value.

    The window covers ``window`` rows ending at the current row. Rows that do
    not yet have a full window of observations yield NaN (``min_periods`` equals
    ``window``).
    """

    def _share_at_or_below_last(window_values: np.ndarray) -> float:
        newest = window_values[-1]
        at_or_below = np.sum(window_values <= newest)
        return float(at_or_below / len(window_values))

    roller = values.rolling(window, min_periods=window)
    return roller.apply(_share_at_or_below_last, raw=True)
|
|
|
|
|
|
def _complete_days(frame: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows whose (symbol, calendar date) has exactly 1440 rows.

    The date is the ``%Y-%m-%d`` rendering of ``event_time``; rows are counted
    via their non-null ``event_time``. The input frame is not modified and the
    temporary ``event_date`` column is removed from the result.
    """
    day_keys = ["symbol", "event_date"]
    dated = frame.copy()
    dated["event_date"] = dated["event_time"].dt.strftime("%Y-%m-%d")
    rows_per_day = dated.groupby(day_keys, observed=False)["event_time"].count()
    # 1440 = one row for every minute of the day.
    full_days = rows_per_day[rows_per_day == 1440].reset_index()[day_keys]
    kept = dated.merge(full_days, on=day_keys, how="inner")
    return kept.drop(columns=["event_date"])
|
|
|
|
|
|
def _symbol_feature_frame(symbol: Any, group: pd.DataFrame) -> pd.DataFrame:
    """Derive all per-minute features for the rows of a single symbol.

    Args:
        symbol: Group key; written back into the ``symbol`` column.
        group: Replay rows of that symbol, already merged with the book columns.

    Returns:
        A copy of ``group`` sorted by ``event_time`` with the feature columns added.
    """
    group = group.sort_values("event_time").reset_index(drop=True).copy()
    close = group["close"].astype(float)
    high = group["high"].astype(float)
    low = group["low"].astype(float)
    volume = group["volume"].astype(float)

    # --- returns and realized volatility ---
    log_ret = np.log(close / close.shift(1))
    group["ret_1m_bps"] = (close / close.shift(1) - 1.0) * 10000.0
    group["ret_5m_bps"] = (close / close.shift(5) - 1.0) * 10000.0
    group["ret_15m_bps"] = (close / close.shift(15) - 1.0) * 10000.0
    group["ret_60m_bps"] = (close / close.shift(60) - 1.0) * 10000.0
    group["ret_240m_bps"] = (close / close.shift(240) - 1.0) * 10000.0
    group["realized_vol_15m_bps"] = log_ret.rolling(15, min_periods=15).std() * 10000.0
    group["realized_vol_60m_bps"] = log_ret.rolling(60, min_periods=60).std() * 10000.0
    group["vol_ratio_15m_60m"] = _safe_divide(
        group["realized_vol_15m_bps"], group["realized_vol_60m_bps"].clip(lower=1.0)
    )

    # --- ranges, volume and channel position (windows include the current row) ---
    recent_high15 = high.rolling(15, min_periods=15).max()
    recent_low15 = low.rolling(15, min_periods=15).min()
    high60 = high.rolling(60, min_periods=60).max()
    low60 = low.rolling(60, min_periods=60).min()
    group["range_15m_bps"] = (recent_high15 / recent_low15 - 1.0) * 10000.0
    group["range_60m_bps"] = (high60 / low60 - 1.0) * 10000.0
    vol_mean = volume.rolling(60, min_periods=60).mean()
    vol_std = volume.rolling(60, min_periods=60).std().replace(0, np.nan)
    group["volume_zscore_60m"] = ((volume - vol_mean) / vol_std).fillna(0.0)
    group["trend_consistency_15m"] = np.sign(group["ret_1m_bps"]).rolling(15, min_periods=15).mean()
    group["channel_position_60m_pct"] = ((close - low60) / (high60 - low60).clip(lower=1e-12)).clip(0.0, 1.0)

    # --- breakouts measured against the 60 rows *before* the current one ---
    prev_high60 = high.shift(1).rolling(60, min_periods=60).max()
    prev_low60 = low.shift(1).rolling(60, min_periods=60).min()
    group["upper_breakout_60m_bps"] = ((close / prev_high60 - 1.0).clip(lower=0.0)) * 10000.0
    group["lower_breakout_60m_bps"] = ((prev_low60 / close - 1.0).clip(lower=0.0)) * 10000.0
    broke_up = recent_high15 > prev_high60
    broke_down = recent_low15 < prev_low60
    group["upper_failed_break_reclaim_15m_bps"] = np.where(
        broke_up, ((prev_high60 - close).clip(lower=0.0) / close) * 10000.0, 0.0
    )
    group["lower_failed_break_reclaim_15m_bps"] = np.where(
        broke_down, ((close - prev_low60).clip(lower=0.0) / close) * 10000.0, 0.0
    )
    group["sweep_up_15m_bps"] = ((recent_high15 / close - 1.0).clip(lower=0.0)) * 10000.0
    group["sweep_down_15m_bps"] = ((close / recent_low15 - 1.0).clip(lower=0.0)) * 10000.0

    # --- compression: rank of the 15m range within the trailing 240 rows ---
    rank = _rolling_rank_last(group["range_15m_bps"], 240)
    group["compression_score_4h_pct"] = 1.0 - rank
    group["compression_release_15m_bps"] = (
        group["range_15m_bps"] - group["range_15m_bps"].rolling(240, min_periods=240).median()
    ).clip(lower=0.0)

    # --- taker flow ---
    buy = group["taker_buy_volume"].astype(float)
    sell = group["taker_sell_volume"].astype(float)
    group["taker_imbalance_1m"] = _safe_divide(buy - sell, buy + sell)
    group["taker_imbalance_5m"] = _safe_divide(
        buy.rolling(5, min_periods=5).sum() - sell.rolling(5, min_periods=5).sum(),
        (buy + sell).rolling(5, min_periods=5).sum(),
    )
    group["taker_imbalance_15m"] = _safe_divide(
        buy.rolling(15, min_periods=15).sum() - sell.rolling(15, min_periods=15).sum(),
        (buy + sell).rolling(15, min_periods=15).sum(),
    )

    # --- spread, open interest, basis ---
    group["spread_rank_24h_pct"] = _rolling_rank_last(group["spread_bps"].astype(float), 1440)
    open_interest = group["open_interest"].astype(float)
    group["oi_delta_15m_bps"] = (open_interest / open_interest.shift(15) - 1.0) * 10000.0
    group["oi_delta_60m_bps"] = (open_interest / open_interest.shift(60) - 1.0) * 10000.0
    group["mark_index_basis_bps"] = (
        group["mark_price"].astype(float) / group["index_price"].astype(float) - 1.0
    ) * 10000.0

    # --- order-book interactions (book columns are NaN where no book row matched) ---
    micro = group["book_microprice_basis_bps"].astype(float)
    depth_l20 = group["book_depth_imbalance_l20"].astype(float)
    group["book_pressure_spread_ratio"] = _safe_divide(
        micro, group["spread_bps"].astype(float).abs().clip(lower=0.01)
    )
    group["book_pressure_taker_1m"] = micro * group["taker_imbalance_1m"].astype(float)
    group["book_pressure_taker_5m"] = micro * group["taker_imbalance_5m"].astype(float)
    group["book_l20_imbalance_taker_15m"] = depth_l20 * group["taker_imbalance_15m"].astype(float)
    group["book_l20_imbalance_ret_15m"] = depth_l20 * group["ret_15m_bps"].astype(float)
    group["book_pressure_vol_adjusted"] = _safe_divide(
        micro, group["realized_vol_15m_bps"].astype(float).clip(lower=1.0)
    )
    group["book_depth_pressure_gap"] = group["book_depth_imbalance_l5"].astype(float) - depth_l20
    group["book_pressure_reversal_15m"] = -micro * group["ret_15m_bps"].astype(float)

    # --- liquidations ---
    liq_buy = group["liquidation_buy_notional_1m"].astype(float)
    liq_sell = group["liquidation_sell_notional_1m"].astype(float)
    liq_total_15 = (liq_buy + liq_sell).rolling(15, min_periods=1).sum()
    group["liquidation_imbalance_15m"] = _safe_divide(
        liq_buy.rolling(15, min_periods=1).sum() - liq_sell.rolling(15, min_periods=1).sum(),
        liq_total_15,
    )
    liq_mean = liq_total_15.rolling(1440, min_periods=60).mean()
    liq_std = liq_total_15.rolling(1440, min_periods=60).std().replace(0, np.nan)
    group["liquidation_notional_zscore_15m"] = ((liq_total_15 - liq_mean) / liq_std).fillna(0.0)

    # --- clock features ---
    minute_of_day = group["event_time"].dt.hour * 60 + group["event_time"].dt.minute
    group["minute_of_day_sin"] = np.sin(2 * np.pi * minute_of_day / 1440.0)
    group["minute_of_day_cos"] = np.cos(2 * np.pi * minute_of_day / 1440.0)
    group["minutes_to_next_funding"] = (
        (group["next_funding_time"] - group["event_time"]).dt.total_seconds() / 60.0
    ).clip(0.0, 480.0)
    group["symbol"] = symbol
    return group


def build_feature_frame(args: Any) -> None:
    """Build the per-minute feature frame for a run and write it with its metadata.

    Reads the 1-minute replay parquet, optionally drops incomplete days, merges
    the per-minute order-book features, derives the per-symbol features, assigns
    splits and a data-quality flag, then writes the parquet, the feature
    order/schema JSON files, a manifest and a markdown quality report under
    ``<run root>/feature``.

    Args:
        args: Namespace-like object. Attributes read here: ``replay_path``,
            ``split_manifest_path``, ``allow_incomplete_days``, ``run_id`` and,
            optionally, ``raw_root``. It is also handed to ``run_root``.

    Raises:
        ValueError: If no replay rows are left to build features from (for
            example after the complete-day filter removed everything).
    """
    root = run_root(args)
    replay_path = args.replay_path or root / "replay" / "replay_1m.parquet"
    split_manifest_path = args.split_manifest_path or root / "split" / "split_manifest.json"
    replay = read_parquet(replay_path)
    required = [
        "symbol",
        "event_time",
        "open_time_ms",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "taker_buy_volume",
        "taker_sell_volume",
        "funding_bps",
        "mark_price",
        "index_price",
        "next_funding_time",
        "open_interest",
        "spread_bps",
        "level1_ofi_1m",
        "liquidation_buy_notional_1m",
        "liquidation_sell_notional_1m",
        "liquidation_available",
    ]
    require_columns(replay, required, "replay_1m")
    replay = replay.copy()
    replay["event_time"] = to_utc_series(replay["event_time"])
    replay["next_funding_time"] = to_utc_series(replay["next_funding_time"])
    replay = replay.sort_values(["symbol", "event_time"]).reset_index(drop=True)
    if not args.allow_incomplete_days:
        before = len(replay)
        replay = _complete_days(replay)
        logging.info("trader.training.feature_complete_days rowBefore=%s rowAfter=%s", before, len(replay))
    raw_root = Path(getattr(args, "raw_root", None) or DEFAULT_RAW_ROOT)
    book_features = _load_book_minute_features(raw_root, replay[["symbol", "event_time", "open_time_ms"]])
    # Left join: replay rows without a matching book minute keep NaN book columns.
    replay = replay.merge(book_features, on=["symbol", "open_time_ms"], how="left")
    logging.info(
        "trader.training.book_features_merged rowCount=%s bookRows=%s bookAvailableRows=%s",
        len(replay),
        len(book_features),
        int(replay["book_top_imbalance"].notna().sum()),
    )

    frames: list[pd.DataFrame] = [
        _symbol_feature_frame(symbol, group)
        for symbol, group in replay.groupby("symbol", sort=False, observed=False)
    ]
    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the same exception type but an actionable message.
        raise ValueError(
            "replay_1m has no rows to build features from "
            "(empty input, or every day was dropped by the complete-day filter)"
        )

    frame = pd.concat(frames, ignore_index=True)
    frame["sample_id"] = frame["symbol"].astype(str) + ":" + frame["open_time_ms"].astype(str)
    frame["split_id"] = assign_split(frame["event_time"], split_manifest_path)
    frame["walk_forward_fold"] = np.where(frame["split_id"].eq(FIT_SPLIT), "fold_01", "NO_FOLD")
    frame["feature_version"] = FEATURE_VERSION
    numeric_features = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan)
    # Any non-finite feature marks the row WARMUP; otherwise missing optional
    # liquidation data downgrades it to PARTIAL_OPTIONAL.
    hard_na = numeric_features.isna().any(axis=1)
    optional_missing = frame["liquidation_available"].fillna(0).eq(0)
    frame["data_quality_flag"] = np.where(hard_na, "WARMUP", np.where(optional_missing, "PARTIAL_OPTIONAL", "OK"))
    ordered = frame[META_COLUMNS + FEATURE_ORDER].copy()
    for feature in FEATURE_ORDER:
        ordered[feature] = (
            pd.to_numeric(ordered[feature], errors="coerce").replace([np.inf, -np.inf], np.nan).astype("float32")
        )

    feature_dir = root / "feature"
    frame_path = feature_dir / "feature_frame.parquet"
    data_hash = write_parquet(frame_path, ordered)
    schema = [feature.as_json() for feature in FEATURES]
    # Locals are named *_digest so they do not shadow the module-level
    # feature_order_hash() function.
    order_digest = write_json(feature_dir / "feature_order.json", FEATURE_ORDER)
    schema_digest = write_json(feature_dir / "feature_schema.json", schema)
    flags = ordered["data_quality_flag"]
    write_json(
        feature_dir / "feature_frame.manifest.json",
        manifest(
            frame_path,
            {
                "row_count": len(ordered),
                "ok_row_count": int(flags.eq("OK").sum()),
                "partial_optional_row_count": int(flags.eq("PARTIAL_OPTIONAL").sum()),
                "warmup_row_count": int(flags.eq("WARMUP").sum()),
                "feature_count": len(FEATURE_ORDER),
                "feature_version": FEATURE_VERSION,
                "feature_order_hash": order_digest,
                "feature_schema_hash": schema_digest,
                "data_hash_sha256": data_hash,
            },
        ),
    )
    write_feature_report(feature_dir / "feature_quality_report.md", ordered, schema_digest, order_digest)
    logging.info(
        "trader.training.feature_written runId=%s rowCount=%s splitCounts=%s eventFrom=%s eventTo=%s path=%s",
        args.run_id,
        len(ordered),
        ordered["split_id"].value_counts().to_dict(),
        ordered["event_time"].min(),
        ordered["event_time"].max(),
        frame_path,
    )
|
|
|
|
|
|
def write_feature_report(path, frame: pd.DataFrame, feature_schema_hash: str, feature_order_hash: str) -> None:
    """Render the markdown feature-quality report for ``frame`` and write it to ``path``.

    Sections: overall quality-flag counts, per-split coverage, source coverage,
    a fixed leakage statement, NaN/inf counts, highly correlated feature pairs,
    a train/tune/validation drift table and per-feature distribution quantiles.

    Args:
        path: Destination passed to ``write_text``.
        frame: Feature frame; must carry ``split_id``, ``event_time``,
            ``data_quality_flag``, ``liquidation_available``,
            ``book_top_imbalance`` and every column in ``FEATURE_ORDER``.
        feature_schema_hash: Value printed verbatim in the report header.
        feature_order_hash: Value printed verbatim in the report header.
    """

    def _flag_total(flag_series: pd.Series, label: str) -> int:
        return int(flag_series.eq(label).sum())

    split_rows = [
        {
            "split_id": split_id,
            "rows": len(part),
            "start": str(part["event_time"].min()),
            "end": str(part["event_time"].max()),
            "ok": _flag_total(part["data_quality_flag"], "OK"),
            "partial_optional": _flag_total(part["data_quality_flag"], "PARTIAL_OPTIONAL"),
            "warmup": _flag_total(part["data_quality_flag"], "WARMUP"),
        }
        for split_id, part in frame.groupby("split_id", sort=True, observed=False)
    ]

    finite_rows = []
    for feature in FEATURE_ORDER:
        numeric = pd.to_numeric(frame[feature], errors="coerce")
        as_float = numeric.to_numpy(dtype=float)
        finite_rows.append(
            {
                "feature": feature,
                "nan_count": int(numeric.isna().sum()),
                "inf_count": int(np.isinf(as_float).sum()),
                "finite_count": int(np.isfinite(as_float).sum()),
            }
        )

    correlation_rows = _high_correlation_rows(frame)
    drift_rows = _drift_rows(frame)
    flags = frame["data_quality_flag"]

    lines = [
        "# Trader Feature Quality Report",
        "",
        f"- row_count: {len(frame)}",
        f"- OK: {_flag_total(flags, 'OK')}",
        f"- PARTIAL_OPTIONAL: {_flag_total(flags, 'PARTIAL_OPTIONAL')}",
        f"- WARMUP: {_flag_total(flags, 'WARMUP')}",
        f"- feature_schema_hash: {feature_schema_hash}",
        f"- feature_order_hash: {feature_order_hash}",
        "",
    ]
    lines += [
        "## Split Coverage",
        "",
        _markdown_table(split_rows, ["split_id", "rows", "start", "end", "ok", "partial_optional", "warmup"]),
        "",
    ]
    lines += [
        "## Source Coverage",
        "",
        "- replay_1m_required_columns: present",
        f"- liquidation_available_share: {float(frame['liquidation_available'].mean()):.6f}",
        f"- book_available_share: {float(frame['book_top_imbalance'].notna().mean()):.6f}",
        f"- feature_rows_with_book_missing: {int(frame['book_top_imbalance'].isna().sum())}",
        f"- feature_rows_with_optional_liquidation_missing: {_flag_total(flags, 'PARTIAL_OPTIONAL')}",
        "",
    ]
    # The two bullet strings below are report content (kept verbatim). In English:
    # "all features only use data known after the current minute has closed;
    # rolling windows only look at <= t" and "future prices, future returns and
    # target labels never enter feature_frame.parquet".
    lines += [
        "## Leakage Check",
        "",
        "- 所有特征只使用当前分钟收盘后已经知道的数据,滚动窗口都只看 `<= t`。",
        "- 未来价格、未来收益、目标标签不进入 `feature_frame.parquet`。",
        "",
    ]
    lines += [
        "## Extreme Value Check",
        "",
        _markdown_table(finite_rows, ["feature", "nan_count", "inf_count", "finite_count"]),
        "",
        "## High Correlation Check",
        "",
        _markdown_table(correlation_rows, ["feature_a", "feature_b", "corr_abs"]),
        "",
        "## Drift Check",
        "",
        _markdown_table(
            drift_rows,
            ["feature", "train_p50", "tune_p50", "validation_p50", "p50_diff", "train_p99", "tune_p99", "validation_p99", "p99_diff"],
        ),
        "",
    ]
    lines += [
        "## Distribution",
        "",
        "| feature | null_count | min | p01 | p50 | p99 | max |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
    ]
    for feature in FEATURE_ORDER:
        numeric = pd.to_numeric(frame[feature], errors="coerce")
        null_count = int(numeric.isna().sum())
        quantiles = numeric.quantile([0.01, 0.5, 0.99])
        stats = [numeric.min(), quantiles.loc[0.01], quantiles.loc[0.5], quantiles.loc[0.99], numeric.max()]
        rendered_stats = " | ".join(f"{stat:.6g}" for stat in stats)
        lines.append(f"| {feature} | {null_count} | {rendered_stats} |")

    write_text(path, "\n".join(lines) + "\n")
|
|
|
|
|
|
def feature_order_hash() -> str:
    """Return ``sha256_json`` applied to the current ``FEATURE_ORDER``."""
    digest = sha256_json(FEATURE_ORDER)
    return digest
|
|
|
|
|
|
def _load_book_minute_features(raw_root: Path, replay_keys: pd.DataFrame) -> pd.DataFrame:
    """Load per-minute book features for the (symbol, minute) keys of the replay.

    One parquet partition is looked up per (symbol, event date) under
    ``raw_root``; missing partitions are logged and skipped.

    Args:
        raw_root: Root directory of the raw ``table=book`` partitions.
        replay_keys: Frame with ``symbol``, ``event_time`` and ``open_time_ms``.

    Returns:
        Frame with ``symbol``, ``open_time_ms`` and the book feature columns,
        restricted to keys present in ``replay_keys``. Empty (columns only)
        when there are no keys or no partition could be loaded.
    """
    join_keys = ["symbol", "open_time_ms"]
    if replay_keys.empty:
        return pd.DataFrame(columns=[*join_keys, *_book_feature_columns()])

    dated = replay_keys.copy()
    dated["event_time"] = to_utc_series(dated["event_time"])
    dated["event_date"] = dated["event_time"].dt.strftime("%Y-%m-%d")

    daily: list[pd.DataFrame] = []
    for (symbol, event_date), _rows in dated.groupby(["symbol", "event_date"], sort=True, observed=False):
        partition = (
            raw_root
            / "table=book"
            / "exchange=BINANCE_FUTURES"
            / f"symbol={symbol}"
            / f"dt={event_date}"
            / "data.parquet"
        )
        if partition.is_file():
            loaded = _read_book_day(partition, symbol)
            daily.append(loaded)
            logging.info(
                "trader.training.book_partition_loaded symbol=%s eventDate=%s minuteRows=%s path=%s",
                symbol,
                event_date,
                len(loaded),
                partition,
            )
        else:
            logging.warning(
                "trader.training.book_partition_missing symbol=%s eventDate=%s path=%s",
                symbol,
                event_date,
                partition,
            )

    if not daily:
        return pd.DataFrame(columns=[*join_keys, *_book_feature_columns()])

    # If the same minute shows up in more than one partition, the later one wins.
    combined = pd.concat(daily, ignore_index=True).drop_duplicates(join_keys, keep="last")
    requested = dated[join_keys].drop_duplicates()
    return requested.merge(combined, on=join_keys, how="inner")
|
|
|
|
|
|
def _read_book_day(path: Path, symbol: str) -> pd.DataFrame:
    """Read one book partition and reduce it to one feature row per minute.

    Only the last snapshot (by ``origin_time``) inside each minute is kept, and
    the row is keyed by that minute's start expressed as epoch milliseconds.

    Args:
        path: Parquet file holding ``origin_time`` plus 20 bid/ask price and
            size levels (``{side}_{level}_price`` / ``{side}_{level}_size``).
        symbol: Value written into the ``symbol`` column of the result.

    Returns:
        Frame with ``symbol``, ``open_time_ms`` and the book feature columns;
        empty (columns only) when the file has no usable rows.
    """
    columns = ["origin_time"]
    for side in ("bid", "ask"):
        for level in range(20):
            columns.extend([f"{side}_{level}_price", f"{side}_{level}_size"])
    book = pd.read_parquet(path, columns=columns)
    if book.empty:
        return pd.DataFrame(columns=["symbol", "open_time_ms", *_book_feature_columns()])
    # Rows without a timestamp or a complete top-of-book cannot be used.
    book = book.dropna(subset=["origin_time", "bid_0_price", "ask_0_price", "bid_0_size", "ask_0_size"]).copy()
    book["origin_time"] = to_utc_series(book["origin_time"])
    book["minute"] = book["origin_time"].dt.floor("min")
    book = book.sort_values("origin_time").drop_duplicates("minute", keep="last").reset_index(drop=True)
    if book.empty:
        return pd.DataFrame(columns=["symbol", "open_time_ms", *_book_feature_columns()])

    bid0 = book["bid_0_price"].astype("float64").to_numpy()
    ask0 = book["ask_0_price"].astype("float64").to_numpy()
    bid0_size = book["bid_0_size"].astype("float64").to_numpy()
    ask0_size = book["ask_0_size"].astype("float64").to_numpy()
    mid = (bid0 + ask0) / 2.0
    # Floor the denominator so an empty top-of-book does not divide by zero.
    top_denominator = np.maximum(bid0_size + ask0_size, 1e-12)
    # Size-weighted price: each side's price is weighted by the opposite size.
    microprice = (bid0 * ask0_size + ask0 * bid0_size) / top_denominator

    bid_depth_l5 = _book_level_notional(book, "bid", 5)
    ask_depth_l5 = _book_level_notional(book, "ask", 5)
    bid_depth_l20 = _book_level_notional(book, "bid", 20)
    ask_depth_l20 = _book_level_notional(book, "ask", 20)
    total_l5 = bid_depth_l5 + ask_depth_l5
    total_l20 = bid_depth_l20 + ask_depth_l20

    # Epoch milliseconds, independent of the column's datetime resolution.
    # `astype("int64") // 1_000_000` is only correct for nanosecond data, and
    # pandas 2.x can carry us/ms resolution read from parquet.
    # NOTE(review): to_utc_series may already normalise to ns - this form is
    # correct either way.
    minute = book["minute"]
    epoch = pd.Timestamp(0, tz=minute.dt.tz)
    minute_ms = ((minute - epoch) // pd.Timedelta(milliseconds=1)).astype("int64")
    return pd.DataFrame(
        {
            "symbol": symbol,
            "open_time_ms": minute_ms,
            "book_top_imbalance": (bid0_size - ask0_size) / top_denominator,
            "book_microprice_basis_bps": (microprice / mid - 1.0) * 10000.0,
            "book_bid_depth_l5_quote": bid_depth_l5,
            "book_ask_depth_l5_quote": ask_depth_l5,
            "book_depth_imbalance_l5": _depth_imbalance(bid_depth_l5, ask_depth_l5),
            "book_depth_imbalance_l20": _depth_imbalance(bid_depth_l20, ask_depth_l20),
            "book_depth_concentration_l5_l20": total_l5 / np.maximum(total_l20, 1e-12),
        }
    )
|
|
|
|
|
|
def _book_level_notional(book: pd.DataFrame, side: str, level_count: int) -> np.ndarray:
    """Sum ``price * size`` over the first ``level_count`` levels of one book side.

    Values that are missing or cannot be parsed as numbers count as 0.0.

    Args:
        book: Frame with ``{side}_{level}_price`` / ``{side}_{level}_size`` columns.
        side: Column prefix, e.g. ``"bid"`` or ``"ask"``.
        level_count: Number of levels (starting at level 0) to include.

    Returns:
        float64 array with one total per row of ``book``.
    """

    def _as_float(column: str) -> np.ndarray:
        return pd.to_numeric(book[column], errors="coerce").fillna(0.0).to_numpy(dtype="float64")

    notional = np.zeros(len(book), dtype="float64")
    for depth in range(level_count):
        prefix = f"{side}_{depth}"
        notional = notional + _as_float(f"{prefix}_price") * _as_float(f"{prefix}_size")
    return notional
|
|
|
|
|
|
def _depth_imbalance(bid_depth: np.ndarray, ask_depth: np.ndarray) -> np.ndarray:
    """Return ``(bid - ask) / (bid + ask)`` element-wise.

    The denominator is floored at 1e-12 so rows with no depth on either side
    yield 0.0 instead of dividing by zero.
    """
    net_depth = bid_depth - ask_depth
    gross_depth = np.maximum(bid_depth + ask_depth, 1e-12)
    return net_depth / gross_depth
|
|
|
|
|
|
def _book_feature_columns() -> list[str]:
    """Return the names of the per-minute book feature columns, in output order.

    A fresh list is returned on every call.
    """
    names = (
        "book_top_imbalance",
        "book_microprice_basis_bps",
        "book_bid_depth_l5_quote",
        "book_ask_depth_l5_quote",
        "book_depth_imbalance_l5",
        "book_depth_imbalance_l20",
        "book_depth_concentration_l5_l20",
    )
    return list(names)
|
|
|
|
|
|
def _high_correlation_rows(frame: pd.DataFrame) -> list[dict[str, object]]:
    """List feature pairs whose absolute correlation is at least 0.95.

    Only rows where every feature is numeric and non-null are used, sampled
    down to 5000 rows with a fixed seed. At most the first 30 pairs (in
    ``FEATURE_ORDER`` order) are returned; a single ``NONE`` placeholder row is
    returned when there is no usable data or no pair qualifies.
    """
    complete_rows = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").dropna()
    if len(complete_rows) > 5000:
        complete_rows = complete_rows.sample(5000, random_state=7)
    if complete_rows.empty:
        return [{"feature_a": "NONE", "feature_b": "NONE", "corr_abs": 0.0}]

    abs_corr = complete_rows.corr().abs()
    pairs = []
    # Visit each unordered pair once: `first` against every feature after it.
    for next_position, first in enumerate(FEATURE_ORDER, start=1):
        for second in FEATURE_ORDER[next_position:]:
            strength = abs_corr.loc[first, second]
            if pd.isna(strength) or strength < 0.95:
                continue
            pairs.append({"feature_a": first, "feature_b": second, "corr_abs": round(float(strength), 6)})

    top_pairs = pairs[:30]
    if top_pairs:
        return top_pairs
    return [{"feature_a": "NONE", "feature_b": "NONE", "corr_abs": 0.0}]
|
|
|
|
|
|
def _drift_rows(frame: pd.DataFrame) -> list[dict[str, object]]:
    """Compare each feature's median and 99th percentile across splits.

    For every feature in ``FEATURE_ORDER`` the p50 and p99 are computed on the
    fit, tune and locked-validation splits (0.0 when a split has no rows). The
    ``*_diff`` columns are validation minus train. All values are rounded to
    six decimals.
    """

    def _quantile_or_zero(series: pd.Series, q: float) -> float:
        if series.empty:
            return 0.0
        return float(series.quantile(q))

    split_frames = {
        "train": frame[frame["split_id"].eq(FIT_SPLIT)],
        "validation": frame[frame["split_id"].eq(VALIDATION_LOCKED_SPLIT)],
        "tune": frame[frame["split_id"].eq(TUNE_SPLIT)],
    }

    rows = []
    for feature in FEATURE_ORDER:
        numeric = {
            name: pd.to_numeric(part[feature], errors="coerce")
            for name, part in split_frames.items()
        }
        p50 = {name: _quantile_or_zero(series, 0.5) for name, series in numeric.items()}
        p99 = {name: _quantile_or_zero(series, 0.99) for name, series in numeric.items()}
        rows.append(
            {
                "feature": feature,
                "train_p50": round(p50["train"], 6),
                "tune_p50": round(p50["tune"], 6),
                "validation_p50": round(p50["validation"], 6),
                "p50_diff": round(p50["validation"] - p50["train"], 6),
                "train_p99": round(p99["train"], 6),
                "tune_p99": round(p99["tune"], 6),
                "validation_p99": round(p99["validation"], 6),
                "p99_diff": round(p99["validation"] - p99["train"], 6),
            }
        )
    return rows
|
|
|
|
|
|
def _markdown_table(rows: list[dict[str, object]], columns: list[str]) -> str:
    """Render ``rows`` as a markdown table with the given column order.

    Missing keys render as empty cells, and an empty ``rows`` produces a table
    with a single blank row. The result has no trailing newline.
    """

    def _render(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    body_rows = rows if rows else [{column: "" for column in columns}]
    rendered = [_render(columns), _render("---" for _ in columns)]
    rendered.extend(
        _render(str(row.get(column, "")) for column in columns)
        for row in body_rows
    )
    return "\n".join(rendered)
|