Files
quant-trader-service/training/trader_training/entry_feature_screen.py
T
Codex 9acb3460a1 Improve Trader V4 training pipeline
Align entry labels with max future edge, tune direction labeling, and harden regression evaluation.

Add training diagnostics, price-plan search, feature screening, and nonlinear benchmark scripts.
2026-06-27 19:57:29 +08:00

307 lines
13 KiB
Python

from __future__ import annotations
import logging
from typing import Any
import numpy as np
import pandas as pd
from trader_training.io_utils import read_parquet, run_root, write_json, write_text
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
# The three non-fit splits (tune, locked validation, latest stress), in that order.
EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
# Every split including the fit split; _split_baselines iterates this tuple to
# compute one baseline per split.
ALL_SPLITS = (FIT_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
def screen_entry_features(args: Any) -> None:
    """Run the single-feature bucket screen for entry labels and write diagnostics.

    Loads ``dataset/entry_train.parquet`` from the run root, computes per-bucket
    metrics for every feature in ``FEATURE_ORDER`` on both the LONG and SHORT
    side, selects one candidate bucket per (side, feature), and writes a JSON
    summary, two CSV files and a markdown report under ``diagnostics/``.

    Args:
        args: Run arguments; ``run_id`` and ``min_bucket_rows`` are read here and
            the whole object is passed to ``run_root``. A falsy
            ``min_bucket_rows`` falls back to 300.

    Raises:
        ValueError: If the dataset lacks a required column, or if no bucket
            metrics could be produced for any feature.
    """
    root = run_root(args)
    dataset_path = root / "dataset" / "entry_train.parquet"
    diagnostics_dir = root / "diagnostics"
    report_path = diagnostics_dir / "entry_feature_screen_report.md"

    dataset = read_parquet(dataset_path)

    # Target / edge column pair for each trade side, in processing order.
    side_columns = {
        "LONG": ("long_entry_target", "long_expected_net_edge_bps"),
        "SHORT": ("short_entry_target", "short_expected_net_edge_bps"),
    }
    required = {"split_id", *FEATURE_ORDER}
    for column_pair in side_columns.values():
        required.update(column_pair)
    missing = sorted(required - set(dataset.columns))
    if missing:
        raise ValueError(f"entry feature screen missing required columns: {missing}")

    min_bucket_rows = int(args.min_bucket_rows or 300)

    rows: list[dict[str, Any]] = []
    for side, (target_col, edge_col) in side_columns.items():
        # Baselines depend only on the side's label columns, so compute once per side.
        baselines = _split_baselines(dataset, target_col, edge_col)
        for feature in FEATURE_ORDER:
            rows += _feature_rows(dataset, feature, side, target_col, edge_col, baselines)

    bucket_metrics = pd.DataFrame(rows)
    if bucket_metrics.empty:
        raise ValueError("entry feature screen produced no bucket metrics")

    candidates = _select_candidates(bucket_metrics, min_bucket_rows)
    result = {
        "run_id": args.run_id,
        "dataset_path": str(dataset_path),
        "feature_count": len(FEATURE_ORDER),
        "bucket_metric_count": len(bucket_metrics),
        "candidate_count": len(candidates),
        "min_bucket_rows": min_bucket_rows,
        "selection_rule": "bucket boundaries are learned on fit_inner; candidate is picked by tune_inner and checked on validation_locked/latest_stress",
    }

    write_json(diagnostics_dir / "entry_feature_screen_result.json", result)
    write_text(diagnostics_dir / "entry_feature_bucket_metrics.csv", bucket_metrics.to_csv(index=False))
    write_text(diagnostics_dir / "entry_feature_signal_candidates.csv", candidates.to_csv(index=False))
    write_text(report_path, _markdown_report(result, candidates))

    logging.info(
        "trader.training.entry_feature_screened runId=%s featureCount=%s bucketMetricCount=%s candidateCount=%s reportPath=%s",
        args.run_id,
        len(FEATURE_ORDER),
        len(bucket_metrics),
        len(candidates),
        report_path,
    )
def _split_baselines(dataset: pd.DataFrame, target_col: str, edge_col: str) -> dict[str, dict[str, float]]:
    """Compute whole-split reference statistics for one trade side.

    Args:
        dataset: Frame containing ``split_id``, ``target_col`` and ``edge_col``.
        target_col: Column whose mean is reported as ``positive_rate``.
        edge_col: Column whose mean is reported as ``avg_edge_bps``.

    Returns:
        Mapping from split id to ``rows``, ``positive_rate`` and
        ``avg_edge_bps``. Splits in ``ALL_SPLITS`` with no rows are omitted.
    """
    split_column = dataset["split_id"]
    summary: dict[str, dict[str, float]] = {}
    for name in ALL_SPLITS:
        subset = dataset.loc[split_column == name]
        if not subset.empty:
            summary[name] = {
                "rows": float(len(subset)),
                "positive_rate": float(subset[target_col].mean()),
                "avg_edge_bps": float(subset[edge_col].mean()),
            }
    return summary
def _feature_rows(
    dataset: pd.DataFrame,
    feature: str,
    side: str,
    target_col: str,
    edge_col: str,
    baselines: dict[str, dict[str, float]],
) -> list[dict[str, Any]]:
    """Build per-(split, bucket) metric rows for one feature on one side.

    Bucket edges come from the feature's finite numeric values in the fit
    split only; every row of ``dataset`` is then assigned to a bucket using
    those edges. Rows whose feature value is non-numeric, NaN or infinite get
    no bucket and are dropped.

    Args:
        dataset: Full dataset with ``split_id``, ``feature``, ``target_col``
            and ``edge_col`` columns.
        feature: Name of the feature column to bucket.
        side: Side label copied verbatim into each output row.
        target_col: Column averaged into ``positive_rate``.
        edge_col: Column averaged into ``avg_edge_bps`` and ``median_edge_bps``.
        baselines: Per-split baselines; splits missing from it are skipped.

    Returns:
        One dict per non-empty (split, bucket) group, or an empty list when
        no usable bucket edges exist or no row falls into any bucket.
    """

    def _finite_numeric(series: pd.Series) -> pd.Series:
        # Coerce to numbers and turn +/-inf into NaN so they never land in a bucket.
        return pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan)

    is_fit = dataset["split_id"].eq(FIT_SPLIT)
    fit_values = _finite_numeric(dataset.loc[is_fit, feature]).dropna()
    edges = _bucket_edges(fit_values.to_numpy(dtype="float64"))
    if len(edges) < 3:
        logging.info("trader.training.entry_feature_screen_skipped feature=%s reason=not_enough_unique_values", feature)
        return []

    all_values = _finite_numeric(dataset[feature])
    codes = pd.cut(all_values, bins=edges, include_lowest=True, labels=False, duplicates="drop")

    working = dataset[["split_id", target_col, edge_col]].copy()
    working["bucket_index"] = codes.astype("float")
    working = working.dropna(subset=["bucket_index"])
    if working.empty:
        return []
    working["bucket_index"] = working["bucket_index"].astype(int)

    bucket_count = int(len(edges) - 1)
    grouped = working.groupby(["split_id", "bucket_index"], sort=True, observed=False)

    rows: list[dict[str, Any]] = []
    for (split_id, bucket_index), part in grouped:
        baseline = baselines.get(split_id)
        if baseline is None:
            continue
        edge_series = part[edge_col]
        avg_edge = float(edge_series.mean())
        positive_rate = float(part[target_col].mean())
        row = {
            "side": side,
            "feature": feature,
            "split_id": split_id,
            "bucket_index": int(bucket_index),
            "bucket_count": bucket_count,
            "bucket_lower": float(edges[bucket_index]),
            "bucket_upper": float(edges[bucket_index + 1]),
            "row_count": int(len(part)),
            "positive_rate": positive_rate,
            "baseline_positive_rate": baseline["positive_rate"],
            "positive_rate_lift": positive_rate - baseline["positive_rate"],
            "avg_edge_bps": avg_edge,
            "baseline_avg_edge_bps": baseline["avg_edge_bps"],
            "avg_edge_lift_bps": avg_edge - baseline["avg_edge_bps"],
            "median_edge_bps": float(edge_series.median()),
        }
        rows.append(row)
    return rows
def _bucket_edges(values: np.ndarray) -> np.ndarray:
clean = values[np.isfinite(values)]
if clean.size < 1000:
return np.array([], dtype="float64")
quantiles = np.linspace(0.0, 1.0, 11)
edges = np.quantile(clean, quantiles)
edges = np.unique(edges)
if edges.size < 3:
return np.array([], dtype="float64")
edges[0] = -np.inf
edges[-1] = np.inf
return edges
def _select_candidates(bucket_metrics: pd.DataFrame, min_bucket_rows: int) -> pd.DataFrame:
    """Pick one candidate bucket per (side, feature) and score its stability.

    The candidate is the tune-split bucket with the highest
    ``avg_edge_lift_bps`` (ties broken by ``positive_rate_lift``) among buckets
    with at least ``min_bucket_rows`` tune rows. The same bucket's metrics on
    the locked-validation and latest-stress splits are then attached.

    A bucket that has no rows in one of those two splits is treated as
    unverified: its ``min_eval_edge_bps`` / ``mean_eval_edge_bps`` are NaN, the
    ``-999`` penalty applies to ``screen_score``, and ``min_eval_rows`` is 0.
    (The pandas row-wise ``min``/``mean`` skip NaN by default, which would
    otherwise score such a bucket on the remaining splits only.)

    Args:
        bucket_metrics: Per-(side, feature, split, bucket) metric rows.
        min_bucket_rows: Minimum tune-split row count for a bucket to be eligible.

    Returns:
        Candidates sorted by ``screen_score`` descending with a fresh index,
        or an empty column-less DataFrame when no tune bucket is eligible.
    """
    tune = bucket_metrics[bucket_metrics["split_id"].eq(TUNE_SPLIT) & (bucket_metrics["row_count"] >= min_bucket_rows)].copy()
    if tune.empty:
        return pd.DataFrame()

    # Best bucket first within each (side, feature); head(1) then keeps that one.
    tune = tune.sort_values(["side", "feature", "avg_edge_lift_bps", "positive_rate_lift"], ascending=[True, True, False, False])
    picked = tune.groupby(["side", "feature"], as_index=False, observed=False).head(1)
    candidates = picked[
        [
            "side",
            "feature",
            "bucket_index",
            "bucket_count",
            "bucket_lower",
            "bucket_upper",
            "row_count",
            "positive_rate",
            "positive_rate_lift",
            "avg_edge_bps",
            "avg_edge_lift_bps",
        ]
    ].rename(
        columns={
            "row_count": "tune_rows",
            "positive_rate": "tune_positive_rate",
            "positive_rate_lift": "tune_positive_rate_lift",
            "avg_edge_bps": "tune_avg_edge_bps",
            "avg_edge_lift_bps": "tune_avg_edge_lift_bps",
        }
    )

    key_columns = ["side", "feature", "bucket_index"]
    metric_columns = ["row_count", "positive_rate", "positive_rate_lift", "avg_edge_bps", "avg_edge_lift_bps"]
    for split_id in (VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT):
        split_rows = bucket_metrics[bucket_metrics["split_id"].eq(split_id)][key_columns + metric_columns].rename(
            columns={
                "row_count": f"{split_id}_rows",
                "positive_rate": f"{split_id}_positive_rate",
                "positive_rate_lift": f"{split_id}_positive_rate_lift",
                "avg_edge_bps": f"{split_id}_avg_edge_bps",
                "avg_edge_lift_bps": f"{split_id}_avg_edge_lift_bps",
            }
        )
        # Left join: candidates with no matching bucket in this split get NaN metrics.
        candidates = candidates.merge(split_rows, on=key_columns, how="left")

    # Defensive: guarantee the columns used below exist even if a merge added none.
    for column in (
        f"{VALIDATION_LOCKED_SPLIT}_rows",
        f"{LATEST_STRESS_SPLIT}_rows",
        f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps",
        f"{LATEST_STRESS_SPLIT}_avg_edge_bps",
        f"{VALIDATION_LOCKED_SPLIT}_avg_edge_lift_bps",
        f"{LATEST_STRESS_SPLIT}_avg_edge_lift_bps",
    ):
        if column not in candidates.columns:
            candidates[column] = np.nan

    edge_columns = ["tune_avg_edge_bps", f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps", f"{LATEST_STRESS_SPLIT}_avg_edge_bps"]
    lift_columns = ["tune_avg_edge_lift_bps", f"{VALIDATION_LOCKED_SPLIT}_avg_edge_lift_bps", f"{LATEST_STRESS_SPLIT}_avg_edge_lift_bps"]
    row_columns = ["tune_rows", f"{VALIDATION_LOCKED_SPLIT}_rows", f"{LATEST_STRESS_SPLIT}_rows"]

    # NaN > 0 is False, so a bucket missing from any split is never "stable".
    candidates["stable_positive_edge"] = (
        (candidates[edge_columns[0]] > 0.0)
        & (candidates[edge_columns[1]] > 0.0)
        & (candidates[edge_columns[2]] > 0.0)
    )
    candidates["stable_lift"] = (
        (candidates[lift_columns[0]] > 0.0)
        & (candidates[lift_columns[1]] > 0.0)
        & (candidates[lift_columns[2]] > 0.0)
    )

    # skipna=False: a missing split must yield NaN here so the -999 penalty
    # below applies, rather than silently scoring on the available splits only.
    candidates["min_eval_edge_bps"] = candidates[edge_columns].min(axis=1, skipna=False)
    candidates["mean_eval_edge_bps"] = candidates[edge_columns].mean(axis=1, skipna=False)
    # A bucket absent from a split has zero rows there, not "unknown" rows.
    candidates["min_eval_rows"] = candidates[row_columns].fillna(0).min(axis=1)

    candidates["screen_score"] = (
        candidates["min_eval_edge_bps"].fillna(-999.0)
        + candidates["mean_eval_edge_bps"].fillna(-999.0) * 0.25
        + candidates["stable_lift"].astype(float) * 2.0
    )
    return candidates.sort_values("screen_score", ascending=False).reset_index(drop=True)
def _markdown_report(result: dict[str, Any], candidates: pd.DataFrame) -> str:
lines = [
"# Entry 特征筛查报告",
"",
"## 结论怎么读",
"",
"这份报告只回答一个问题:历史数据里,单个特征的某些区间有没有稳定变好。",
"",
"- `tune_inner` 用来挑候选区间。",
"- `validation_locked` 和 `latest_stress` 用来检查这个区间是不是出了训练样本也还能站住。",
"- `stable_positive_edge=true` 代表这个区间在三个检查集里的平均净收益都大于 0。",
"- `stable_lift=true` 代表这个区间在三个检查集里都比对应大盘样本平均值更好。",
"",
"## 本次结果",
"",
f"- run_id: `{result['run_id']}`",
f"- 特征数: `{result['feature_count']}`",
f"- 分桶明细数: `{result['bucket_metric_count']}`",
f"- 候选数: `{result['candidate_count']}`",
f"- 最小分桶行数: `{result['min_bucket_rows']}`",
"",
]
if candidates.empty:
lines.extend(
[
"## 候选特征",
"",
"没有找到满足最小样本数的候选区间。下一步应先扩大数据或重新检查标签/价格计划,不建议直接继续调模型。",
"",
]
)
return "\n".join(lines)
stable = candidates[candidates["stable_positive_edge"] & candidates["stable_lift"]]
lines.extend(
[
"## 稳定候选",
"",
f"- 同时满足正收益和正提升的候选数: `{len(stable)}`",
"",
]
)
display_columns = [
"side",
"feature",
"bucket_index",
"bucket_lower",
"bucket_upper",
"tune_avg_edge_bps",
f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps",
f"{LATEST_STRESS_SPLIT}_avg_edge_bps",
"stable_positive_edge",
"stable_lift",
"screen_score",
]
lines.append(_markdown_table(candidates[display_columns].head(20)))
lines.extend(
[
"",
"## 文件",
"",
"- `diagnostics/entry_feature_bucket_metrics.csv`: 每个特征、每个桶、每个数据段的完整明细。",
"- `diagnostics/entry_feature_signal_candidates.csv`: 每个特征按调参集挑出的最好区间,以及封存验证/压力检查结果。",
"",
]
)
return "\n".join(lines)
def _markdown_table(frame: pd.DataFrame) -> str:
if frame.empty:
return "_无_"
columns = list(frame.columns)
lines = ["| " + " | ".join(columns) + " |", "| " + " | ".join(["---"] * len(columns)) + " |"]
for _, row in frame.iterrows():
values = [_format_cell(row[column]) for column in columns]
lines.append("| " + " | ".join(values) + " |")
return "\n".join(lines)
def _format_cell(value: Any) -> str:
if pd.isna(value):
return ""
if isinstance(value, (float, np.floating)):
return f"{float(value):.6g}"
if isinstance(value, (bool, np.bool_)):
return "true" if bool(value) else "false"
return str(value)