from __future__ import annotations import logging from typing import Any import numpy as np import pandas as pd from trader_training.io_utils import read_parquet, run_root, write_json, write_text from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT) ALL_SPLITS = (FIT_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT) def screen_entry_features(args: Any) -> None: root = run_root(args) dataset = read_parquet(root / "dataset" / "entry_train.parquet") required = {"split_id", *FEATURE_ORDER, "long_entry_target", "short_entry_target", "long_expected_net_edge_bps", "short_expected_net_edge_bps"} missing = sorted(required.difference(dataset.columns)) if missing: raise ValueError(f"entry feature screen missing required columns: {missing}") min_bucket_rows = int(args.min_bucket_rows or 300) rows: list[dict[str, Any]] = [] for side in ("LONG", "SHORT"): target_col = "long_entry_target" if side == "LONG" else "short_entry_target" edge_col = "long_expected_net_edge_bps" if side == "LONG" else "short_expected_net_edge_bps" baselines = _split_baselines(dataset, target_col, edge_col) for feature in FEATURE_ORDER: rows.extend(_feature_rows(dataset, feature, side, target_col, edge_col, baselines)) bucket_metrics = pd.DataFrame(rows) if bucket_metrics.empty: raise ValueError("entry feature screen produced no bucket metrics") candidates = _select_candidates(bucket_metrics, min_bucket_rows) result = { "run_id": args.run_id, "dataset_path": str(root / "dataset" / "entry_train.parquet"), "feature_count": len(FEATURE_ORDER), "bucket_metric_count": int(len(bucket_metrics)), "candidate_count": int(len(candidates)), "min_bucket_rows": min_bucket_rows, "selection_rule": "bucket boundaries are learned on fit_inner; candidate is picked by tune_inner and checked on validation_locked/latest_stress", } write_json(root / "diagnostics" / "entry_feature_screen_result.json", result) write_text(root / "diagnostics" / "entry_feature_bucket_metrics.csv", bucket_metrics.to_csv(index=False)) write_text(root / "diagnostics" / "entry_feature_signal_candidates.csv", candidates.to_csv(index=False)) write_text(root / "diagnostics" / "entry_feature_screen_report.md", _markdown_report(result, candidates)) logging.info( "trader.training.entry_feature_screened runId=%s featureCount=%s bucketMetricCount=%s candidateCount=%s reportPath=%s", args.run_id, len(FEATURE_ORDER), len(bucket_metrics), len(candidates), root / "diagnostics" / "entry_feature_screen_report.md", ) def _split_baselines(dataset: pd.DataFrame, target_col: str, edge_col: str) -> dict[str, dict[str, float]]: baselines: dict[str, dict[str, float]] = {} for split_id in ALL_SPLITS: part = dataset[dataset["split_id"].eq(split_id)] if part.empty: continue baselines[split_id] = { "rows": float(len(part)), "positive_rate": float(part[target_col].mean()), "avg_edge_bps": float(part[edge_col].mean()), } return baselines def _feature_rows( dataset: pd.DataFrame, feature: str, side: str, target_col: str, edge_col: str, baselines: dict[str, dict[str, float]], ) -> list[dict[str, Any]]: train_values = pd.to_numeric(dataset.loc[dataset["split_id"].eq(FIT_SPLIT), feature], errors="coerce").replace([np.inf, -np.inf], np.nan).dropna() edges = _bucket_edges(train_values.to_numpy(dtype="float64")) if len(edges) < 3: logging.info("trader.training.entry_feature_screen_skipped feature=%s reason=not_enough_unique_values", feature) return [] values = pd.to_numeric(dataset[feature], errors="coerce").replace([np.inf, -np.inf], np.nan) bucket = pd.cut(values, bins=edges, include_lowest=True, labels=False, duplicates="drop") working = dataset[["split_id", target_col, edge_col]].copy() working["bucket_index"] = bucket.astype("float") working = working.dropna(subset=["bucket_index"]) if working.empty: return [] working["bucket_index"] = working["bucket_index"].astype(int) rows: list[dict[str, Any]] = [] for (split_id, bucket_index), part in working.groupby(["split_id", "bucket_index"], sort=True, observed=False): if split_id not in baselines: continue lower = float(edges[bucket_index]) upper = float(edges[bucket_index + 1]) baseline = baselines[split_id] avg_edge = float(part[edge_col].mean()) positive_rate = float(part[target_col].mean()) rows.append( { "side": side, "feature": feature, "split_id": split_id, "bucket_index": int(bucket_index), "bucket_count": int(len(edges) - 1), "bucket_lower": lower, "bucket_upper": upper, "row_count": int(len(part)), "positive_rate": positive_rate, "baseline_positive_rate": baseline["positive_rate"], "positive_rate_lift": positive_rate - baseline["positive_rate"], "avg_edge_bps": avg_edge, "baseline_avg_edge_bps": baseline["avg_edge_bps"], "avg_edge_lift_bps": avg_edge - baseline["avg_edge_bps"], "median_edge_bps": float(part[edge_col].median()), } ) return rows def _bucket_edges(values: np.ndarray) -> np.ndarray: clean = values[np.isfinite(values)] if clean.size < 1000: return np.array([], dtype="float64") quantiles = np.linspace(0.0, 1.0, 11) edges = np.quantile(clean, quantiles) edges = np.unique(edges) if edges.size < 3: return np.array([], dtype="float64") edges[0] = -np.inf edges[-1] = np.inf return edges def _select_candidates(bucket_metrics: pd.DataFrame, min_bucket_rows: int) -> pd.DataFrame: tune = bucket_metrics[bucket_metrics["split_id"].eq(TUNE_SPLIT) & (bucket_metrics["row_count"] >= min_bucket_rows)].copy() if tune.empty: return pd.DataFrame() tune = tune.sort_values(["side", "feature", "avg_edge_lift_bps", "positive_rate_lift"], ascending=[True, True, False, False]) picked = tune.groupby(["side", "feature"], as_index=False, observed=False).head(1) candidates = picked[ [ "side", "feature", "bucket_index", "bucket_count", "bucket_lower", "bucket_upper", "row_count", "positive_rate", "positive_rate_lift", "avg_edge_bps", "avg_edge_lift_bps", ] ].rename( columns={ "row_count": "tune_rows", "positive_rate": "tune_positive_rate", "positive_rate_lift": "tune_positive_rate_lift", "avg_edge_bps": "tune_avg_edge_bps", "avg_edge_lift_bps": "tune_avg_edge_lift_bps", } ) for split_id in (VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT): split_rows = bucket_metrics[bucket_metrics["split_id"].eq(split_id)][ ["side", "feature", "bucket_index", "row_count", "positive_rate", "positive_rate_lift", "avg_edge_bps", "avg_edge_lift_bps"] ].rename( columns={ "row_count": f"{split_id}_rows", "positive_rate": f"{split_id}_positive_rate", "positive_rate_lift": f"{split_id}_positive_rate_lift", "avg_edge_bps": f"{split_id}_avg_edge_bps", "avg_edge_lift_bps": f"{split_id}_avg_edge_lift_bps", } ) candidates = candidates.merge(split_rows, on=["side", "feature", "bucket_index"], how="left") for column in ( f"{VALIDATION_LOCKED_SPLIT}_rows", f"{LATEST_STRESS_SPLIT}_rows", f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps", f"{LATEST_STRESS_SPLIT}_avg_edge_bps", f"{VALIDATION_LOCKED_SPLIT}_avg_edge_lift_bps", f"{LATEST_STRESS_SPLIT}_avg_edge_lift_bps", ): if column not in candidates.columns: candidates[column] = np.nan candidates["stable_positive_edge"] = ( (candidates["tune_avg_edge_bps"] > 0.0) & (candidates[f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps"] > 0.0) & (candidates[f"{LATEST_STRESS_SPLIT}_avg_edge_bps"] > 0.0) ) candidates["stable_lift"] = ( (candidates["tune_avg_edge_lift_bps"] > 0.0) & (candidates[f"{VALIDATION_LOCKED_SPLIT}_avg_edge_lift_bps"] > 0.0) & (candidates[f"{LATEST_STRESS_SPLIT}_avg_edge_lift_bps"] > 0.0) ) candidates["min_eval_edge_bps"] = candidates[["tune_avg_edge_bps", f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps", f"{LATEST_STRESS_SPLIT}_avg_edge_bps"]].min(axis=1) candidates["mean_eval_edge_bps"] = candidates[["tune_avg_edge_bps", f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps", f"{LATEST_STRESS_SPLIT}_avg_edge_bps"]].mean(axis=1) candidates["min_eval_rows"] = candidates[["tune_rows", f"{VALIDATION_LOCKED_SPLIT}_rows", f"{LATEST_STRESS_SPLIT}_rows"]].min(axis=1) candidates["screen_score"] = ( candidates["min_eval_edge_bps"].fillna(-999.0) + candidates["mean_eval_edge_bps"].fillna(-999.0) * 0.25 + candidates["stable_lift"].astype(float) * 2.0 ) return candidates.sort_values("screen_score", ascending=False).reset_index(drop=True) def _markdown_report(result: dict[str, Any], candidates: pd.DataFrame) -> str: lines = [ "# Entry 特征筛查报告", "", "## 结论怎么读", "", "这份报告只回答一个问题:历史数据里,单个特征的某些区间有没有稳定变好。", "", "- `tune_inner` 用来挑候选区间。", "- `validation_locked` 和 `latest_stress` 用来检查这个区间是不是出了训练样本也还能站住。", "- `stable_positive_edge=true` 代表这个区间在三个检查集里的平均净收益都大于 0。", "- `stable_lift=true` 代表这个区间在三个检查集里都比对应大盘样本平均值更好。", "", "## 本次结果", "", f"- run_id: `{result['run_id']}`", f"- 特征数: `{result['feature_count']}`", f"- 分桶明细数: `{result['bucket_metric_count']}`", f"- 候选数: `{result['candidate_count']}`", f"- 最小分桶行数: `{result['min_bucket_rows']}`", "", ] if candidates.empty: lines.extend( [ "## 候选特征", "", "没有找到满足最小样本数的候选区间。下一步应先扩大数据或重新检查标签/价格计划,不建议直接继续调模型。", "", ] ) return "\n".join(lines) stable = candidates[candidates["stable_positive_edge"] & candidates["stable_lift"]] lines.extend( [ "## 稳定候选", "", f"- 同时满足正收益和正提升的候选数: `{len(stable)}`", "", ] ) display_columns = [ "side", "feature", "bucket_index", "bucket_lower", "bucket_upper", "tune_avg_edge_bps", f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps", f"{LATEST_STRESS_SPLIT}_avg_edge_bps", "stable_positive_edge", "stable_lift", "screen_score", ] lines.append(_markdown_table(candidates[display_columns].head(20))) lines.extend( [ "", "## 文件", "", "- `diagnostics/entry_feature_bucket_metrics.csv`: 每个特征、每个桶、每个数据段的完整明细。", "- `diagnostics/entry_feature_signal_candidates.csv`: 每个特征按调参集挑出的最好区间,以及封存验证/压力检查结果。", "", ] ) return "\n".join(lines) def _markdown_table(frame: pd.DataFrame) -> str: if frame.empty: return "_无_" columns = list(frame.columns) lines = ["| " + " | ".join(columns) + " |", "| " + " | ".join(["---"] * len(columns)) + " |"] for _, row in frame.iterrows(): values = [_format_cell(row[column]) for column in columns] lines.append("| " + " | ".join(values) + " |") return "\n".join(lines) def _format_cell(value: Any) -> str: if pd.isna(value): return "" if isinstance(value, (float, np.floating)): return f"{float(value):.6g}" if isinstance(value, (bool, np.bool_)): return "true" if bool(value) else "false" return str(value)