Improve Trader entry quality training diagnostics

This commit is contained in:
Codex
2026-06-28 00:50:37 +08:00
parent 87849a66a7
commit 340d1dd91b
11 changed files with 1895 additions and 110 deletions
@@ -24,9 +24,11 @@ def screen_entry_features(args: Any) -> None:
min_bucket_rows = int(args.min_bucket_rows or 300)
rows: list[dict[str, Any]] = []
edge_source_by_side: dict[str, str] = {}
for side in ("LONG", "SHORT"):
target_col = "long_entry_target" if side == "LONG" else "short_entry_target"
edge_col = "long_expected_net_edge_bps" if side == "LONG" else "short_expected_net_edge_bps"
edge_col = _screen_edge_column(dataset, side)
edge_source_by_side[side] = edge_col
baselines = _split_baselines(dataset, target_col, edge_col)
for feature in FEATURE_ORDER:
rows.extend(_feature_rows(dataset, feature, side, target_col, edge_col, baselines))
@@ -43,6 +45,7 @@ def screen_entry_features(args: Any) -> None:
"bucket_metric_count": int(len(bucket_metrics)),
"candidate_count": int(len(candidates)),
"min_bucket_rows": min_bucket_rows,
"edge_source_by_side": edge_source_by_side,
"selection_rule": "bucket boundaries are learned on fit_inner; candidate is picked by tune_inner and checked on validation_locked/latest_stress",
}
write_json(root / "diagnostics" / "entry_feature_screen_result.json", result)
@@ -59,6 +62,14 @@ def screen_entry_features(args: Any) -> None:
)
def _screen_edge_column(dataset: pd.DataFrame, side: str) -> str:
    """Choose which net-edge column the feature screen reads for one side.

    The column ``<side>_actual_plan_net_edge_bps`` is preferred whenever it is
    present in ``dataset.columns``; otherwise the name
    ``<side>_expected_net_edge_bps`` is returned instead.

    Args:
        dataset: Frame whose column labels are inspected.
        side: ``"LONG"`` selects the ``long_`` prefix; any other value
            selects the ``short_`` prefix.

    Returns:
        The chosen column name. The fallback name is returned without
        checking that it exists in ``dataset``.
    """
    side_key = "short" if side != "LONG" else "long"
    realized_col = f"{side_key}_actual_plan_net_edge_bps"
    fallback_col = f"{side_key}_expected_net_edge_bps"
    return realized_col if realized_col in dataset.columns else fallback_col
def _split_baselines(dataset: pd.DataFrame, target_col: str, edge_col: str) -> dict[str, dict[str, float]]:
baselines: dict[str, dict[str, float]] = {}
for split_id in ALL_SPLITS:
@@ -225,6 +236,7 @@ def _markdown_report(result: dict[str, Any], candidates: pd.DataFrame) -> str:
"",
"这份报告只回答一个问题:历史数据里,单个特征的某些区间有没有稳定变好。",
"",
"- 如果数据里有真实出场净收益,本报告用真实出场净收益;没有时才退回训练收益标签。",
"- `tune_inner` 用来挑候选区间。",
"- `validation_locked` 和 `latest_stress` 用来检查这个区间是不是出了训练样本也还能站住。",
"- `stable_positive_edge=true` 代表这个区间在三个检查集里的平均净收益都大于 0。",
@@ -237,6 +249,8 @@ def _markdown_report(result: dict[str, Any], candidates: pd.DataFrame) -> str:
f"- 分桶明细数: `{result['bucket_metric_count']}`",
f"- 候选数: `{result['candidate_count']}`",
f"- 最小分桶行数: `{result['min_bucket_rows']}`",
f"- 做多收益来源: `{result['edge_source_by_side'].get('LONG')}`",
f"- 做空收益来源: `{result['edge_source_by_side'].get('SHORT')}`",
"",
]
if candidates.empty: