9acb3460a1
Align entry labels with max future edge, tune direction labeling, and harden regression evaluation. Add training diagnostics, price-plan search, feature screening, and nonlinear benchmark scripts.
168 lines
6.3 KiB
Python
168 lines
6.3 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from typing import Any
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.ensemble import HistGradientBoostingClassifier
|
|
from sklearn.metrics import accuracy_score, brier_score_loss, log_loss, roc_auc_score
|
|
|
|
from trader_training.io_utils import read_parquet, run_root, write_json, write_text
|
|
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
|
|
|
|
|
# Splits on which the fitted benchmark models are scored, in the order their
# metrics are inserted into the result. FIT_SPLIT is not included: it is used
# only to fit the models.
EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
|
|
|
|
|
def benchmark_nonlinear_models(args: Any) -> None:
    """Run the direction and entry benchmarks and persist the diagnostics.

    Writes ``nonlinear_benchmark_result.json`` and
    ``nonlinear_benchmark_report.md`` into the ``diagnostics`` directory under
    the run root resolved from ``args``, then logs the report location.

    Args:
        args: Run arguments; ``args.run_id`` is read here and the whole
            object is handed to ``run_root``.
    """
    run_dir = run_root(args)
    run_id = args.run_id

    # Keys are inserted in the same order as the serialized result expects;
    # the direction benchmark runs before the entry benchmark.
    summary: dict[str, Any] = {}
    summary["run_id"] = run_id
    summary["purpose"] = "diagnostic_only_not_exported"
    summary["model_family"] = "sklearn_hist_gradient_boosting"
    summary["feature_count"] = len(FEATURE_ORDER)
    summary["direction"] = _benchmark_direction(run_dir)
    summary["entry"] = _benchmark_entry(run_dir)

    diagnostics_dir = run_dir / "diagnostics"
    report_path = diagnostics_dir / "nonlinear_benchmark_report.md"
    write_json(diagnostics_dir / "nonlinear_benchmark_result.json", summary)
    _write_report(report_path, summary)

    logging.info(
        "trader.training.nonlinear_benchmark_written runId=%s path=%s",
        run_id,
        report_path,
    )
|
|
|
|
|
|
def _benchmark_direction(root) -> dict[str, Any]:
    """Fit a three-class boosting model on the direction dataset and score it.

    Class ids are the argmax over the ``long_target``, ``short_target`` and
    ``neutral_target`` columns, i.e. 0 = long, 1 = short, 2 = neutral. The
    model is fitted on the ``FIT_SPLIT`` rows and evaluated on every
    non-empty split in ``EVAL_SPLITS``.

    Args:
        root: Run root directory; ``dataset/direction_train.parquet`` is read
            from beneath it.

    Returns:
        ``{"metrics": {split: metrics}}``. When the fit split is empty or
        contains a single class, the model is not fitted and
        ``{"status": "SKIPPED_ONE_CLASS_TRAIN", "metrics": {}}`` is returned,
        mirroring the status used by the entry benchmark while keeping the
        ``metrics`` key the report writer reads.
    """
    target_columns = ["long_target", "short_target", "neutral_target"]
    n_classes = len(target_columns)

    dataset = read_parquet(root / "dataset" / "direction_train.parquet")
    train = dataset[dataset["split_id"] == FIT_SPLIT].copy()
    y_train = train[target_columns].to_numpy().argmax(axis=1)

    # A classifier cannot be fitted meaningfully on zero or one class.
    if len(np.unique(y_train)) < 2:
        return {"status": "SKIPPED_ONE_CLASS_TRAIN", "metrics": {}}

    model = HistGradientBoostingClassifier(
        max_iter=120,
        learning_rate=0.05,
        max_leaf_nodes=31,
        l2_regularization=0.01,
        early_stopping=True,
        random_state=7,
    )
    model.fit(_x(train), y_train)

    # Class frequencies of the fit split: the constant-prediction baseline.
    train_prior = np.bincount(y_train, minlength=n_classes).astype(float)
    train_prior = train_prior / train_prior.sum()

    # predict_proba emits one column per class seen during fit. Map those
    # columns onto the fixed 0/1/2 ids so that a class missing from the fit
    # split gets probability 0 instead of shifting the remaining columns.
    seen_classes = np.asarray(model.classes_, dtype=int)

    metrics = {}
    for split in EVAL_SPLITS:
        frame = dataset[dataset["split_id"] == split].copy()
        if frame.empty:
            continue
        y_true = frame[target_columns].to_numpy().argmax(axis=1)
        raw_proba = model.predict_proba(_x(frame))
        proba = np.zeros((raw_proba.shape[0], n_classes), dtype=float)
        proba[:, seen_classes] = raw_proba
        metrics[split] = _multiclass_metrics(y_true, proba, train_prior)
    return {"metrics": metrics}
|
|
|
|
|
|
def _benchmark_entry(root) -> dict[str, Any]:
    """Fit one binary boosting model per entry target and score each of them.

    For ``long_entry_target`` and ``short_entry_target`` a separate model is
    fitted on the ``FIT_SPLIT`` rows and evaluated on every non-empty split in
    ``EVAL_SPLITS``.

    Args:
        root: Run root directory; ``dataset/entry_train.parquet`` is read from
            beneath it.

    Returns:
        A dict keyed by target name. Each value is ``{"metrics": {split:
        metrics}}``, or ``{"status": "SKIPPED_ONE_CLASS_TRAIN"}`` when the fit
        split holds fewer than two distinct labels for that target.
    """
    dataset = read_parquet(root / "dataset" / "entry_train.parquet")
    train = dataset[dataset["split_id"] == FIT_SPLIT].copy()
    result: dict[str, Any] = {}

    # The feature matrices do not depend on the target, so they are built
    # once and shared by both models. They are built lazily, on the first
    # trainable target, so a run where every target is skipped never touches
    # the feature columns.
    x_train = None
    eval_sets = []

    for target in ("long_entry_target", "short_entry_target"):
        y_train = train[target].astype(int).to_numpy()
        if len(np.unique(y_train)) < 2:
            result[target] = {"status": "SKIPPED_ONE_CLASS_TRAIN"}
            continue

        if x_train is None:
            x_train = _x(train)
            for split in EVAL_SPLITS:
                frame = dataset[dataset["split_id"] == split]
                if not frame.empty:
                    eval_sets.append((split, frame, _x(frame)))

        model = HistGradientBoostingClassifier(
            max_iter=160,
            learning_rate=0.04,
            max_leaf_nodes=31,
            l2_regularization=0.02,
            early_stopping=True,
            random_state=11,
        )
        model.fit(x_train, y_train)

        # Positive rate of the fit split: the constant-prediction baseline.
        train_prior = float(y_train.mean())

        split_metrics = {}
        for split, frame, x_eval in eval_sets:
            y_true = frame[target].astype(int).to_numpy()
            # Column 1 is the probability of the positive (label 1) class.
            proba = model.predict_proba(x_eval)[:, 1]
            split_metrics[split] = _binary_metrics(y_true, proba, train_prior)
        result[target] = {"metrics": split_metrics}
    return result
|
|
|
|
|
|
def _x(frame: pd.DataFrame) -> np.ndarray:
    """Build the model input matrix from *frame*.

    Selects the ``FEATURE_ORDER`` columns in that order, coerces each one to
    numeric (unparseable values become NaN), replaces positive and negative
    infinity with NaN, and returns the result as a ``float32`` array.
    """
    numeric = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce")
    without_inf = numeric.replace([np.inf, -np.inf], np.nan)
    as_float32 = without_inf.astype("float32")
    return as_float32.to_numpy()
|
|
|
|
|
|
def _binary_metrics(y_true: np.ndarray, proba: np.ndarray, train_prior: float) -> dict[str, Any]:
|
|
prior = float(np.clip(train_prior, 1e-6, 1 - 1e-6))
|
|
constant = np.full(len(y_true), prior)
|
|
order = np.argsort(-proba)
|
|
top_n = max(1, int(len(y_true) * 0.10))
|
|
metrics = {
|
|
"row_count": int(len(y_true)),
|
|
"positive_rate": float(y_true.mean()) if len(y_true) else 0.0,
|
|
"brier": float(brier_score_loss(y_true, proba)) if len(y_true) else 0.0,
|
|
"constant_brier": float(brier_score_loss(y_true, constant)) if len(y_true) else 0.0,
|
|
"top10_hit_rate": float(y_true[order[:top_n]].mean()) if len(y_true) else 0.0,
|
|
"all_hit_rate": float(y_true.mean()) if len(y_true) else 0.0,
|
|
}
|
|
if len(np.unique(y_true)) == 2:
|
|
metrics["auc"] = float(roc_auc_score(y_true, proba))
|
|
metrics["tradable_signal"] = bool(
|
|
metrics.get("auc", 0.0) >= 0.56
|
|
and metrics["brier"] < metrics["constant_brier"]
|
|
and metrics["top10_hit_rate"] > metrics["all_hit_rate"]
|
|
)
|
|
return metrics
|
|
|
|
|
|
def _multiclass_metrics(y_true: np.ndarray, proba: np.ndarray, train_prior: np.ndarray) -> dict[str, Any]:
|
|
constant = np.tile(train_prior.reshape(1, -1), (len(y_true), 1))
|
|
pred = proba.argmax(axis=1)
|
|
metrics = {
|
|
"row_count": int(len(y_true)),
|
|
"accuracy": float(accuracy_score(y_true, pred)),
|
|
"logloss": float(log_loss(y_true, proba, labels=[0, 1, 2])),
|
|
"constant_logloss": float(log_loss(y_true, constant, labels=[0, 1, 2])),
|
|
}
|
|
for class_id, name in enumerate(("long_auc", "short_auc", "neutral_auc")):
|
|
binary = (y_true == class_id).astype(int)
|
|
if len(np.unique(binary)) == 2:
|
|
metrics[name] = float(roc_auc_score(binary, proba[:, class_id]))
|
|
metrics["tradable_signal"] = bool(
|
|
metrics.get("long_auc", 0.0) >= 0.56
|
|
and metrics.get("short_auc", 0.0) >= 0.56
|
|
and metrics["logloss"] < metrics["constant_logloss"]
|
|
)
|
|
return metrics
|
|
|
|
|
|
def _write_report(path, result: dict[str, Any]) -> None:
    """Write the Markdown benchmark report for *result* to *path*.

    The report has a title, a short explanatory note, the run id and feature
    count, then the direction metrics and the entry results, each rendered as
    a fenced JSON block.
    """
    # The note below is emitted verbatim. In English: "This report is
    # diagnostic only and does not export a production model. It answers:
    # given the same features, can a somewhat stronger tree model find a
    # stable signal?"
    header = [
        "# Nonlinear Benchmark Report",
        "",
        "这份报告只做诊断,不导出上线模型。它回答:同样的特征给更强一点的树模型,能不能找到稳定信号。",
        "",
        f"- run_id: `{result['run_id']}`",
        f"- feature_count: `{result['feature_count']}`",
        "",
    ]
    direction_section = [
        "## Direction",
        "",
        _json_block(result["direction"]["metrics"]),
        "",
    ]
    # The trailing empty string makes the joined text end with a newline.
    entry_section = [
        "## Entry",
        "",
        _json_block(result["entry"]),
        "",
    ]
    body = "\n".join(header + direction_section + entry_section)
    write_text(path, body)
|
|
|
|
|
|
def _json_block(value: Any) -> str:
|
|
return "```json\n" + json.dumps(value, ensure_ascii=False, indent=2) + "\n```"
|