9acb3460a1
Align entry labels with max future edge, tune direction labeling, and harden regression evaluation. Add training diagnostics, price-plan search, feature screening, and nonlinear benchmark scripts.
307 lines
13 KiB
Python
307 lines
13 KiB
Python
from __future__ import annotations
|
|
|
|
import logging
|
|
from typing import Any
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from trader_training.io_utils import read_parquet, run_root, write_json, write_text
|
|
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
|
|
|
|
|
# Splits other than the fit split, in the order tune -> locked validation -> latest stress.
EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
# Every split this module reports on: the fit split followed by the evaluation splits.
ALL_SPLITS = (FIT_SPLIT, *EVAL_SPLITS)
|
|
|
|
|
|
def screen_entry_features(args: Any) -> None:
    """Screen every entry feature bucket-by-bucket and write the diagnostics.

    Reads ``dataset/entry_train.parquet`` below the run root, computes
    per-split bucket metrics for each feature in ``FEATURE_ORDER`` on the LONG
    and SHORT side, selects candidate buckets, and writes a JSON summary, two
    CSV files and a markdown report into the ``diagnostics`` directory.

    Args:
        args: Run arguments. ``run_id`` and ``min_bucket_rows`` are read here;
            the whole object is also handed to ``run_root``. A falsy
            ``min_bucket_rows`` falls back to 300.

    Raises:
        ValueError: If the dataset lacks a required column, or if no bucket
            metrics were produced at all.
    """
    root = run_root(args)
    dataset_path = root / "dataset" / "entry_train.parquet"
    diagnostics_dir = root / "diagnostics"
    report_path = diagnostics_dir / "entry_feature_screen_report.md"

    # side -> (binary target column, expected net edge column)
    side_columns = {
        "LONG": ("long_entry_target", "long_expected_net_edge_bps"),
        "SHORT": ("short_entry_target", "short_expected_net_edge_bps"),
    }

    dataset = read_parquet(dataset_path)
    required = {"split_id", *FEATURE_ORDER}
    for column_pair in side_columns.values():
        required.update(column_pair)
    missing = sorted(required.difference(dataset.columns))
    if missing:
        raise ValueError(f"entry feature screen missing required columns: {missing}")

    min_bucket_rows = int(args.min_bucket_rows or 300)

    metric_rows: list[dict[str, Any]] = []
    for side, (target_col, edge_col) in side_columns.items():
        # Baselines depend only on the side, so compute them once per side.
        baselines = _split_baselines(dataset, target_col, edge_col)
        for feature in FEATURE_ORDER:
            metric_rows += _feature_rows(dataset, feature, side, target_col, edge_col, baselines)

    bucket_metrics = pd.DataFrame(metric_rows)
    if bucket_metrics.empty:
        raise ValueError("entry feature screen produced no bucket metrics")

    candidates = _select_candidates(bucket_metrics, min_bucket_rows)
    feature_count = len(FEATURE_ORDER)
    result = {
        "run_id": args.run_id,
        "dataset_path": str(dataset_path),
        "feature_count": feature_count,
        "bucket_metric_count": int(len(bucket_metrics)),
        "candidate_count": int(len(candidates)),
        "min_bucket_rows": min_bucket_rows,
        "selection_rule": "bucket boundaries are learned on fit_inner; candidate is picked by tune_inner and checked on validation_locked/latest_stress",
    }

    write_json(diagnostics_dir / "entry_feature_screen_result.json", result)
    write_text(diagnostics_dir / "entry_feature_bucket_metrics.csv", bucket_metrics.to_csv(index=False))
    write_text(diagnostics_dir / "entry_feature_signal_candidates.csv", candidates.to_csv(index=False))
    write_text(report_path, _markdown_report(result, candidates))

    logging.info(
        "trader.training.entry_feature_screened runId=%s featureCount=%s bucketMetricCount=%s candidateCount=%s reportPath=%s",
        args.run_id,
        feature_count,
        len(bucket_metrics),
        len(candidates),
        report_path,
    )
|
|
|
|
|
|
def _split_baselines(dataset: pd.DataFrame, target_col: str, edge_col: str) -> dict[str, dict[str, float]]:
    """Compute whole-split reference statistics for one side.

    Args:
        dataset: Frame with a ``split_id`` column plus the two value columns.
        target_col: Column whose mean is reported as ``positive_rate``.
        edge_col: Column whose mean is reported as ``avg_edge_bps``.

    Returns:
        Mapping of split id to ``rows``, ``positive_rate`` and
        ``avg_edge_bps`` (all floats). Splits from ``ALL_SPLITS`` that have no
        rows in ``dataset`` are left out of the mapping.
    """
    split_column = dataset["split_id"]
    summary: dict[str, dict[str, float]] = {}
    for split_id in ALL_SPLITS:
        subset = dataset[split_column.eq(split_id)]
        if not subset.empty:
            summary[split_id] = {
                "rows": float(len(subset)),
                "positive_rate": float(subset[target_col].mean()),
                "avg_edge_bps": float(subset[edge_col].mean()),
            }
    return summary
|
|
|
|
|
|
def _feature_rows(
    dataset: pd.DataFrame,
    feature: str,
    side: str,
    target_col: str,
    edge_col: str,
    baselines: dict[str, dict[str, float]],
) -> list[dict[str, Any]]:
    """Build per-split, per-bucket metric rows for a single feature.

    Bucket boundaries come from ``_bucket_edges`` applied to the feature's
    finite values on the ``FIT_SPLIT`` rows only; the same boundaries are then
    used to bucket every row of ``dataset``.

    Args:
        dataset: Frame holding ``split_id``, ``feature``, ``target_col`` and
            ``edge_col``.
        feature: Name of the feature column to bucket.
        side: Side label copied verbatim into each output row.
        target_col: Column averaged into ``positive_rate``.
        edge_col: Column averaged into ``avg_edge_bps`` / ``median_edge_bps``.
        baselines: Per-split reference statistics (see ``_split_baselines``);
            groups whose split id is not a key here are skipped.

    Returns:
        One dict per (split, bucket) group. Empty when fewer than three edges
        were produced (an info line is logged) or when no row received a
        bucket.
    """

    def _as_finite_numeric(series):
        # Non-numeric entries and +/-inf both become NaN.
        return pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan)

    fit_mask = dataset["split_id"].eq(FIT_SPLIT)
    fit_values = _as_finite_numeric(dataset.loc[fit_mask, feature]).dropna()
    edges = _bucket_edges(fit_values.to_numpy(dtype="float64"))
    if len(edges) < 3:
        logging.info("trader.training.entry_feature_screen_skipped feature=%s reason=not_enough_unique_values", feature)
        return []

    all_values = _as_finite_numeric(dataset[feature])
    # labels=False yields integer bucket positions; rows with NaN values get NaN.
    bucket_codes = pd.cut(all_values, bins=edges, include_lowest=True, labels=False, duplicates="drop")
    working = (
        dataset[["split_id", target_col, edge_col]]
        .assign(bucket_index=bucket_codes.astype("float"))
        .dropna(subset=["bucket_index"])
    )
    if working.empty:
        return []
    working["bucket_index"] = working["bucket_index"].astype(int)

    bucket_total = int(len(edges) - 1)
    out: list[dict[str, Any]] = []
    grouped = working.groupby(["split_id", "bucket_index"], sort=True, observed=False)
    for (split_id, bucket_index), group in grouped:
        reference = baselines.get(split_id)
        if reference is None:
            continue
        edge_mean = float(group[edge_col].mean())
        hit_rate = float(group[target_col].mean())
        out.append(
            {
                "side": side,
                "feature": feature,
                "split_id": split_id,
                "bucket_index": int(bucket_index),
                "bucket_count": bucket_total,
                "bucket_lower": float(edges[bucket_index]),
                "bucket_upper": float(edges[bucket_index + 1]),
                "row_count": int(len(group)),
                "positive_rate": hit_rate,
                "baseline_positive_rate": reference["positive_rate"],
                "positive_rate_lift": hit_rate - reference["positive_rate"],
                "avg_edge_bps": edge_mean,
                "baseline_avg_edge_bps": reference["avg_edge_bps"],
                "avg_edge_lift_bps": edge_mean - reference["avg_edge_bps"],
                "median_edge_bps": float(group[edge_col].median()),
            }
        )
    return out
|
|
|
|
|
|
def _bucket_edges(values: np.ndarray) -> np.ndarray:
|
|
clean = values[np.isfinite(values)]
|
|
if clean.size < 1000:
|
|
return np.array([], dtype="float64")
|
|
quantiles = np.linspace(0.0, 1.0, 11)
|
|
edges = np.quantile(clean, quantiles)
|
|
edges = np.unique(edges)
|
|
if edges.size < 3:
|
|
return np.array([], dtype="float64")
|
|
edges[0] = -np.inf
|
|
edges[-1] = np.inf
|
|
return edges
|
|
|
|
|
|
def _select_candidates(bucket_metrics: pd.DataFrame, min_bucket_rows: int) -> pd.DataFrame:
    """Pick the best tune-split bucket per (side, feature) and score it.

    For every (side, feature) pair the tune-split bucket with at least
    ``min_bucket_rows`` rows and the highest ``avg_edge_lift_bps`` (ties broken
    by ``positive_rate_lift``) is chosen. The metrics of that same bucket on
    the locked-validation and latest-stress splits are then left-joined in,
    stability flags are derived, and candidates are ordered by
    ``screen_score`` descending.

    Args:
        bucket_metrics: Per (side, feature, split, bucket) metric rows.
        min_bucket_rows: Minimum tune-split ``row_count`` for a bucket to be
            eligible.

    Returns:
        One row per (side, feature) candidate, or an empty ``DataFrame`` with
        no columns when no tune-split bucket is large enough.
    """
    key_columns = ["side", "feature", "bucket_index"]
    metric_columns = ["row_count", "positive_rate", "positive_rate_lift", "avg_edge_bps", "avg_edge_lift_bps"]
    check_splits = (VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)

    def _prefixed(prefix):
        # ``row_count`` is shortened to ``<prefix>_rows``; the rest keep their name.
        return {name: f"{prefix}_rows" if name == "row_count" else f"{prefix}_{name}" for name in metric_columns}

    split_column = bucket_metrics["split_id"]
    eligible = split_column.eq(TUNE_SPLIT) & (bucket_metrics["row_count"] >= min_bucket_rows)
    tune = bucket_metrics[eligible].copy()
    if tune.empty:
        return pd.DataFrame()

    ranked = tune.sort_values(
        ["side", "feature", "avg_edge_lift_bps", "positive_rate_lift"],
        ascending=[True, True, False, False],
    )
    best = ranked.groupby(["side", "feature"], as_index=False, observed=False).head(1)
    selected_columns = [*key_columns, "bucket_count", "bucket_lower", "bucket_upper", *metric_columns]
    candidates = best[selected_columns].rename(columns=_prefixed("tune"))

    for split_id in check_splits:
        split_view = bucket_metrics[split_column.eq(split_id)][[*key_columns, *metric_columns]]
        candidates = candidates.merge(
            split_view.rename(columns=_prefixed(split_id)),
            on=key_columns,
            how="left",
        )

    # Defensive backfill so the derived columns below can always be computed.
    for suffix in ("rows", "avg_edge_bps", "avg_edge_lift_bps"):
        for split_id in check_splits:
            column = f"{split_id}_{suffix}"
            if column not in candidates.columns:
                candidates[column] = np.nan

    edge_columns = ["tune_avg_edge_bps", *(f"{split_id}_avg_edge_bps" for split_id in check_splits)]
    lift_columns = ["tune_avg_edge_lift_bps", *(f"{split_id}_avg_edge_lift_bps" for split_id in check_splits)]
    row_columns = ["tune_rows", *(f"{split_id}_rows" for split_id in check_splits)]

    # A NaN compares as False, so a bucket missing from any split is never "stable".
    candidates["stable_positive_edge"] = (candidates[edge_columns] > 0.0).all(axis=1)
    candidates["stable_lift"] = (candidates[lift_columns] > 0.0).all(axis=1)

    # NOTE(review): min/mean skip NaN by default, so a candidate missing from
    # one check split is summarised over the remaining splits; the -999 fill
    # below only applies when every split value is NaN. Confirm this is intended.
    candidates["min_eval_edge_bps"] = candidates[edge_columns].min(axis=1)
    candidates["mean_eval_edge_bps"] = candidates[edge_columns].mean(axis=1)
    candidates["min_eval_rows"] = candidates[row_columns].min(axis=1)

    worst_edge = candidates["min_eval_edge_bps"].fillna(-999.0)
    weighted_mean_edge = candidates["mean_eval_edge_bps"].fillna(-999.0) * 0.25
    lift_bonus = candidates["stable_lift"].astype(float) * 2.0
    candidates["screen_score"] = worst_edge + weighted_mean_edge + lift_bonus

    return candidates.sort_values("screen_score", ascending=False).reset_index(drop=True)
|
|
|
|
|
|
def _markdown_report(result: dict[str, Any], candidates: pd.DataFrame) -> str:
    """Render the feature-screen summary as a markdown document.

    Args:
        result: Summary dict; the keys ``run_id``, ``feature_count``,
            ``bucket_metric_count``, ``candidate_count`` and
            ``min_bucket_rows`` are read.
        candidates: Candidate table. When it is empty the report ends with a
            "no candidates" section instead of the candidate table.

    Returns:
        The report text, lines joined with ``"\\n"``. At most the first 20
        candidate rows are tabulated.
    """
    summary_fields = (
        ("run_id", "run_id"),
        ("特征数", "feature_count"),
        ("分桶明细数", "bucket_metric_count"),
        ("候选数", "candidate_count"),
        ("最小分桶行数", "min_bucket_rows"),
    )
    report = [
        "# Entry 特征筛查报告",
        "",
        "## 结论怎么读",
        "",
        "这份报告只回答一个问题:历史数据里,单个特征的某些区间有没有稳定变好。",
        "",
        "- `tune_inner` 用来挑候选区间。",
        "- `validation_locked` 和 `latest_stress` 用来检查这个区间是不是出了训练样本也还能站住。",
        "- `stable_positive_edge=true` 代表这个区间在三个检查集里的平均净收益都大于 0。",
        "- `stable_lift=true` 代表这个区间在三个检查集里都比对应大盘样本平均值更好。",
        "",
        "## 本次结果",
        "",
    ]
    report += [f"- {label}: `{result[key]}`" for label, key in summary_fields]
    report.append("")

    if candidates.empty:
        report += [
            "## 候选特征",
            "",
            "没有找到满足最小样本数的候选区间。下一步应先扩大数据或重新检查标签/价格计划,不建议直接继续调模型。",
            "",
        ]
        return "\n".join(report)

    both_stable = candidates["stable_positive_edge"] & candidates["stable_lift"]
    stable = candidates[both_stable]
    report += [
        "## 稳定候选",
        "",
        f"- 同时满足正收益和正提升的候选数: `{len(stable)}`",
        "",
    ]

    display_columns = [
        "side",
        "feature",
        "bucket_index",
        "bucket_lower",
        "bucket_upper",
        "tune_avg_edge_bps",
        f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps",
        f"{LATEST_STRESS_SPLIT}_avg_edge_bps",
        "stable_positive_edge",
        "stable_lift",
        "screen_score",
    ]
    top_rows = candidates[display_columns].head(20)
    report.append(_markdown_table(top_rows))

    report += [
        "",
        "## 文件",
        "",
        "- `diagnostics/entry_feature_bucket_metrics.csv`: 每个特征、每个桶、每个数据段的完整明细。",
        "- `diagnostics/entry_feature_signal_candidates.csv`: 每个特征按调参集挑出的最好区间,以及封存验证/压力检查结果。",
        "",
    ]
    return "\n".join(report)
|
|
|
|
|
|
def _markdown_table(frame: pd.DataFrame) -> str:
|
|
if frame.empty:
|
|
return "_无_"
|
|
columns = list(frame.columns)
|
|
lines = ["| " + " | ".join(columns) + " |", "| " + " | ".join(["---"] * len(columns)) + " |"]
|
|
for _, row in frame.iterrows():
|
|
values = [_format_cell(row[column]) for column in columns]
|
|
lines.append("| " + " | ".join(values) + " |")
|
|
return "\n".join(lines)
|
|
|
|
|
|
def _format_cell(value: Any) -> str:
|
|
if pd.isna(value):
|
|
return ""
|
|
if isinstance(value, (float, np.floating)):
|
|
return f"{float(value):.6g}"
|
|
if isinstance(value, (bool, np.bool_)):
|
|
return "true" if bool(value) else "false"
|
|
return str(value)
|