# Diagnostic-only nonlinear benchmark.
#
# Fits scikit-learn HistGradientBoostingClassifier models on the FIT split of
# the direction and entry datasets, scores them on the evaluation splits and
# writes a JSON result plus a Markdown report under "<run root>/diagnostics".
# The result is tagged "diagnostic_only_not_exported"; no model is persisted.
from __future__ import annotations

import json
import logging
from typing import Any

import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, brier_score_loss, log_loss, roc_auc_score

from trader_training.io_utils import read_parquet, run_root, write_json, write_text
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT

EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)

# One-hot direction target columns; the column position is the class id
# (0 = long, 1 = short, 2 = neutral).
_DIRECTION_TARGET_COLUMNS = ("long_target", "short_target", "neutral_target")
_ENTRY_TARGET_COLUMNS = ("long_entry_target", "short_entry_target")

# Minimum one-vs-rest / binary AUC required before a split is flagged tradable.
_MIN_TRADABLE_AUC = 0.56
# Fraction of highest-scored rows used for the "top10" hit rate.
_TOP_FRACTION = 0.10

_SKIPPED_ONE_CLASS_TRAIN = "SKIPPED_ONE_CLASS_TRAIN"


def benchmark_nonlinear_models(args: Any) -> None:
    """Run the direction and entry benchmarks and write the diagnostics files.

    Args:
        args: Run arguments. ``args.run_id`` is read directly and ``args`` is
            passed to ``run_root`` to resolve the run directory.

    Side effects:
        Writes ``nonlinear_benchmark_result.json`` and
        ``nonlinear_benchmark_report.md`` into ``<run root>/diagnostics`` and
        logs the report path at INFO level.
    """
    root = run_root(args)
    result = {
        "run_id": args.run_id,
        "purpose": "diagnostic_only_not_exported",
        "model_family": "sklearn_hist_gradient_boosting",
        "feature_count": len(FEATURE_ORDER),
        "direction": _benchmark_direction(root),
        "entry": _benchmark_entry(root),
    }
    out_dir = root / "diagnostics"
    report_path = out_dir / "nonlinear_benchmark_report.md"
    write_json(out_dir / "nonlinear_benchmark_result.json", result)
    _write_report(report_path, result)
    logging.info(
        "trader.training.nonlinear_benchmark_written runId=%s path=%s",
        args.run_id,
        report_path,
    )


def _benchmark_direction(root) -> dict[str, Any]:
    """Fit a 3-class direction model on the FIT split and score each eval split.

    Args:
        root: Run directory as returned by ``run_root``; must support ``/``.

    Returns:
        ``{"metrics": {split: metrics}}`` with one entry per non-empty split in
        ``EVAL_SPLITS``. When the FIT split holds fewer than two distinct
        classes (including an empty FIT split) nothing can be learned, so the
        result is ``{"status": "SKIPPED_ONE_CLASS_TRAIN", "metrics": {}}``,
        mirroring how the entry benchmark reports the same situation.
    """
    dataset = read_parquet(root / "dataset" / "direction_train.parquet")
    train = dataset[dataset["split_id"] == FIT_SPLIT]
    y_train = _direction_labels(train)
    if len(np.unique(y_train)) < 2:
        return {"status": _SKIPPED_ONE_CLASS_TRAIN, "metrics": {}}

    model = HistGradientBoostingClassifier(
        max_iter=120,
        learning_rate=0.05,
        max_leaf_nodes=31,
        l2_regularization=0.01,
        early_stopping=True,
        random_state=7,
    )
    model.fit(_x(train), y_train)

    # Class frequencies of the training labels: the constant baseline that the
    # model's log loss is compared against.
    train_prior = np.bincount(y_train, minlength=3).astype(float)
    train_prior = train_prior / train_prior.sum()

    metrics = {}
    for split in EVAL_SPLITS:
        frame = dataset[dataset["split_id"] == split]
        if frame.empty:
            continue
        y_true = _direction_labels(frame)
        # predict_proba only has columns for classes seen during fit; map them
        # back onto the fixed 3-class layout the metrics index by class id.
        proba = _align_class_columns(model.predict_proba(_x(frame)), model.classes_, 3)
        metrics[split] = _multiclass_metrics(y_true, proba, train_prior)
    return {"metrics": metrics}


def _direction_labels(frame: pd.DataFrame) -> np.ndarray:
    """Return class ids (argmax over the one-hot direction target columns)."""
    return frame[list(_DIRECTION_TARGET_COLUMNS)].to_numpy().argmax(axis=1)


def _align_class_columns(proba: np.ndarray, classes: Any, n_classes: int) -> np.ndarray:
    """Spread ``proba`` columns onto a fixed ``n_classes``-wide matrix.

    Column ``j`` of ``proba`` belongs to class id ``classes[j]``. Classes the
    model never saw get probability 0, so rows still sum to 1.
    """
    aligned = np.zeros((proba.shape[0], n_classes), dtype=float)
    aligned[:, np.asarray(classes, dtype=int)] = proba
    return aligned


def _benchmark_entry(root) -> dict[str, Any]:
    """Fit one binary entry model per target and score each eval split.

    Args:
        root: Run directory as returned by ``run_root``; must support ``/``.

    Returns:
        A dict keyed by target column. Each value is either
        ``{"metrics": {split: metrics}}`` or
        ``{"status": "SKIPPED_ONE_CLASS_TRAIN"}`` when the FIT split does not
        contain two distinct label values for that target.
    """
    dataset = read_parquet(root / "dataset" / "entry_train.parquet")
    train = dataset[dataset["split_id"] == FIT_SPLIT]
    # The training feature matrix is identical for every target: build it at
    # most once, and only when some target is actually fitted.
    x_train: np.ndarray | None = None

    result: dict[str, Any] = {}
    for target in _ENTRY_TARGET_COLUMNS:
        y_train = train[target].astype(int).to_numpy()
        if len(np.unique(y_train)) < 2:
            result[target] = {"status": _SKIPPED_ONE_CLASS_TRAIN}
            continue
        if x_train is None:
            x_train = _x(train)

        model = HistGradientBoostingClassifier(
            max_iter=160,
            learning_rate=0.04,
            max_leaf_nodes=31,
            l2_regularization=0.02,
            early_stopping=True,
            random_state=11,
        )
        model.fit(x_train, y_train)
        train_prior = float(y_train.mean())

        split_metrics = {}
        for split in EVAL_SPLITS:
            frame = dataset[dataset["split_id"] == split]
            if frame.empty:
                continue
            y_true = frame[target].astype(int).to_numpy()
            # Second predict_proba column, used as the positive-class score.
            proba = model.predict_proba(_x(frame))[:, 1]
            split_metrics[split] = _binary_metrics(y_true, proba, train_prior)
        result[target] = {"metrics": split_metrics}
    return result


def _x(frame: pd.DataFrame) -> np.ndarray:
    """Build the float32 feature matrix in ``FEATURE_ORDER`` column order.

    Values that cannot be parsed as numbers become NaN, and so does every
    non-finite value. The infinity mask is applied after the float32 cast
    because a finite float64 beyond float32 range turns into ``inf`` during
    the cast and would otherwise slip through.
    """
    numeric = frame[list(FEATURE_ORDER)].apply(pd.to_numeric, errors="coerce")
    values = numeric.astype("float32").to_numpy()
    # np.where allocates a fresh array, so the frame's own data is untouched.
    return np.where(np.isfinite(values), values, np.float32(np.nan))


def _binary_metrics(y_true: np.ndarray, proba: np.ndarray, train_prior: float) -> dict[str, Any]:
    """Score binary predictions against a constant train-prior baseline.

    Args:
        y_true: Integer labels.
        proba: Positive-class score for each row of ``y_true``.
        train_prior: Positive rate of the training labels; clipped to
            ``[1e-6, 1 - 1e-6]`` before being used as the constant prediction.

    Returns:
        Row count, positive rate, Brier score of the model and of the constant
        baseline, hit rate of the top-scored 10% of rows (at least one row),
        overall hit rate, ``auc`` (only when both label values are present) and
        the ``tradable_signal`` flag. Empty input yields zeros and ``False``.
    """
    row_count = int(len(y_true))
    if row_count == 0:
        return {
            "row_count": 0,
            "positive_rate": 0.0,
            "brier": 0.0,
            "constant_brier": 0.0,
            "top10_hit_rate": 0.0,
            "all_hit_rate": 0.0,
            "tradable_signal": False,
        }

    prior = float(np.clip(train_prior, 1e-6, 1 - 1e-6))
    constant = np.full(row_count, prior)
    # Indices sorted from highest to lowest score.
    order = np.argsort(-proba)
    top_n = max(1, int(row_count * _TOP_FRACTION))
    base_rate = float(y_true.mean())

    metrics = {
        "row_count": row_count,
        "positive_rate": base_rate,
        "brier": float(brier_score_loss(y_true, proba)),
        "constant_brier": float(brier_score_loss(y_true, constant)),
        "top10_hit_rate": float(y_true[order[:top_n]].mean()),
        "all_hit_rate": base_rate,
    }
    # AUC is only computed when both label values occur in this split.
    if len(np.unique(y_true)) == 2:
        metrics["auc"] = float(roc_auc_score(y_true, proba))
    metrics["tradable_signal"] = bool(
        metrics.get("auc", 0.0) >= _MIN_TRADABLE_AUC
        and metrics["brier"] < metrics["constant_brier"]
        and metrics["top10_hit_rate"] > metrics["all_hit_rate"]
    )
    return metrics


def _multiclass_metrics(y_true: np.ndarray, proba: np.ndarray, train_prior: np.ndarray) -> dict[str, Any]:
    """Score 3-class predictions against a constant train-prior baseline.

    Args:
        y_true: Class ids in ``{0, 1, 2}``.
        proba: Probability matrix with three columns, column ``i`` holding the
            probability of class id ``i``.
        train_prior: Length-3 class frequencies of the training labels, used as
            the constant prediction for every row.

    Returns:
        Row count, accuracy, log loss of the model and of the constant
        baseline, a one-vs-rest AUC per class (only for classes that are
        neither absent nor universal in ``y_true``) and the
        ``tradable_signal`` flag.
    """
    constant = np.tile(train_prior.reshape(1, -1), (len(y_true), 1))
    pred = proba.argmax(axis=1)
    metrics = {
        "row_count": int(len(y_true)),
        "accuracy": float(accuracy_score(y_true, pred)),
        "logloss": float(log_loss(y_true, proba, labels=[0, 1, 2])),
        "constant_logloss": float(log_loss(y_true, constant, labels=[0, 1, 2])),
    }
    for class_id, name in enumerate(("long_auc", "short_auc", "neutral_auc")):
        binary = (y_true == class_id).astype(int)
        if len(np.unique(binary)) == 2:
            metrics[name] = float(roc_auc_score(binary, proba[:, class_id]))
    # Only the long and short AUCs gate the flag; neutral_auc is informational.
    metrics["tradable_signal"] = bool(
        metrics.get("long_auc", 0.0) >= _MIN_TRADABLE_AUC
        and metrics.get("short_auc", 0.0) >= _MIN_TRADABLE_AUC
        and metrics["logloss"] < metrics["constant_logloss"]
    )
    return metrics


def _write_report(path, result: dict[str, Any]) -> None:
    """Render ``result`` as a Markdown report and write it to ``path``.

    Reads ``result["run_id"]``, ``result["feature_count"]``,
    ``result["direction"]["metrics"]`` and ``result["entry"]``.
    """
    lines = [
        "# Nonlinear Benchmark Report",
        "",
        # Report text (Chinese): "This report is diagnostic only and does not
        # export a production model. It answers: given the same features, can a
        # somewhat stronger tree model find a stable signal?"
        "这份报告只做诊断,不导出上线模型。它回答:同样的特征给更强一点的树模型,能不能找到稳定信号。",
        "",
        f"- run_id: `{result['run_id']}`",
        f"- feature_count: `{result['feature_count']}`",
        "",
        "## Direction",
        "",
        _json_block(result["direction"]["metrics"]),
        "",
        "## Entry",
        "",
        _json_block(result["entry"]),
        "",
    ]
    write_text(path, "\n".join(lines))


def _json_block(value: Any) -> str:
    """Return ``value`` as pretty-printed JSON inside a fenced ``json`` block."""
    return "```json\n" + json.dumps(value, ensure_ascii=False, indent=2) + "\n```"