9acb3460a1
Align entry labels with max future edge, tune direction labeling, and harden regression evaluation. Add training diagnostics, price-plan search, feature screening, and nonlinear benchmark scripts.
369 lines
16 KiB
Python
369 lines
16 KiB
Python
from __future__ import annotations

import itertools
import json
import logging
from typing import Any

import numpy as np
import pandas as pd

from trader_training.io_utils import read_parquet, run_root, write_json, write_text
from trader_training.labels import DEFAULT_COST_CONFIG, DEFAULT_LABEL_CONFIG, ENTRY_LABEL_METHOD, _load_config
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
|
|
|
|
|
# Splits the search reports on, in the order rows are emitted per plan.
EVAL_SPLITS = (
    FIT_SPLIT,
    TUNE_SPLIT,
    VALIDATION_LOCKED_SPLIT,
    LATEST_STRESS_SPLIT,
)

# Default search grid, used when the caller supplies no explicit values.
# Horizons are counted in 1-minute bars; targets and stops are in basis points.
DEFAULT_HORIZONS: tuple[int, ...] = (30, 45, 60, 90, 120)
DEFAULT_TARGETS: tuple[float, ...] = (12.0, 16.0, 20.0, 24.0, 32.0, 40.0)
DEFAULT_STOPS: tuple[float, ...] = (6.0, 8.0, 10.0, 12.0, 16.0)
|
|
|
|
|
|
def search_price_plans(args: Any) -> None:
    """Grid-search price plans over the replay bars and write the ranked results.

    Reads the replay frame and the feature frame (explicit paths taken from
    ``args``, otherwise the defaults under the run root), evaluates every
    horizon/target/stop/side combination per symbol and split, and writes a
    JSON result, two CSV files and a Markdown report into
    ``<run root>/price-plan-search``.

    Args:
        args: Namespace-like object. This function reads ``replay_path``,
            ``feature_path``, ``label_config_path``, ``cost_config_path``,
            ``horizons``, ``targets``, ``stops`` and ``run_id``; the whole
            object is also handed to ``run_root``.

    Raises:
        ValueError: If no feature row passes the quality/split filter, or if
            the search produces no candidate rows.
    """
    root = run_root(args)
    replay = read_parquet(args.replay_path or root / "replay" / "replay_1m.parquet")
    features = read_parquet(args.feature_path or root / "feature" / "feature_frame.parquet")
    label_config = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)
    cost_config = _load_config(args.cost_config_path, DEFAULT_COST_CONFIG)

    # Round-trip cost is the plain sum of the three configured components.
    fee_bps = float(cost_config["fee_bps"])
    slippage_bps = float(cost_config["slippage_bps"])
    funding_bps = float(cost_config["funding_cost_bps"])
    cost_bps = fee_bps + slippage_bps + funding_bps
    min_expected_edge_bps = float(label_config["entry"]["min_expected_net_edge_bps"])

    # Only rows with acceptable data quality that belong to an evaluated split
    # may serve as entry points.
    quality_ok = features["data_quality_flag"].isin(["OK", "PARTIAL_OPTIONAL"])
    in_eval_split = features["split_id"].isin(EVAL_SPLITS)
    trainable = features[quality_ok & in_eval_split][["symbol", "open_time_ms", "split_id"]].copy()
    if trainable.empty:
        raise ValueError("price plan search needs trainable feature rows")

    rows: list[dict[str, Any]] = []
    for symbol, symbol_bars in replay.groupby("symbol", sort=False, observed=False):
        # Map each feature bar (by open time) to its split for this symbol.
        symbol_features = trainable[trainable["symbol"].eq(symbol)]
        split_series = symbol_features.drop_duplicates("open_time_ms").set_index("open_time_ms")["split_id"]
        feature_split_by_ms = split_series.to_dict()
        if not feature_split_by_ms:
            continue
        ordered_bars = symbol_bars.sort_values("event_time").reset_index(drop=True)
        rows.extend(
            _symbol_plan_rows(
                symbol,
                ordered_bars,
                feature_split_by_ms,
                cost_bps,
                min_expected_edge_bps,
                args.horizons or DEFAULT_HORIZONS,
                args.targets or DEFAULT_TARGETS,
                args.stops or DEFAULT_STOPS,
            )
        )

    result = pd.DataFrame(rows)
    if result.empty:
        raise ValueError("price plan search produced no candidate rows")

    summary = _plan_summary(result)
    best = _select_best_plan(summary)
    payload = {
        "run_id": args.run_id,
        "cost_bps": cost_bps,
        "min_expected_net_edge_bps": min_expected_edge_bps,
        "entry_label_method": ENTRY_LABEL_METHOD,
        "candidate_count": int(summary["plan_id"].nunique()),
        "best_plan": best,
    }

    out_dir = root / "price-plan-search"
    write_json(out_dir / "price_plan_search_result.json", _jsonable(payload))
    write_text(out_dir / "price_plan_search_rows.csv", result.to_csv(index=False))
    write_text(out_dir / "price_plan_search_summary.csv", summary.to_csv(index=False))
    write_text(out_dir / "price_plan_search_report.md", _markdown_report(payload, summary))

    logging.info(
        "trader.training.price_plan_searched runId=%s candidateCount=%s bestPlan=%s bestScore=%.6f",
        args.run_id,
        payload["candidate_count"],
        best["plan_id"],
        best["score"],
    )
|
|
|
|
|
|
def _symbol_plan_rows(
    symbol: str,
    replay: pd.DataFrame,
    feature_split_by_ms: dict[int, str],
    cost_bps: float,
    min_expected_edge_bps: float,
    horizons: tuple[int, ...],
    targets: tuple[float, ...],
    stops: tuple[float, ...],
) -> list[dict[str, Any]]:
    """Evaluate every horizon/target/stop plan, long and short, for one symbol.

    Args:
        symbol: Symbol label copied into each output row.
        replay: Bars for this symbol with ``close``, ``high``, ``low`` and
            ``open_time_ms`` columns. Rows are used in the order given
            (NOTE(review): assumed to be time-ordered by the caller).
        feature_split_by_ms: Split id keyed by bar open time in milliseconds;
            bars without an entry are not used as entry points.
        cost_bps: Round-trip cost subtracted from every gross return.
        min_expected_edge_bps: Minimum net edge; plans whose target cannot
            clear it after costs are skipped.
        horizons: Look-ahead lengths in bars.
        targets: Take-profit distances in basis points.
        stops: Stop-loss distances in basis points.

    Returns:
        The concatenated per-split rows produced by ``_plan_side_rows``, LONG
        before SHORT for each (target, stop) pair.
    """
    closes = replay["close"].astype("float64").to_numpy()
    highs = replay["high"].astype("float64").to_numpy()
    lows = replay["low"].astype("float64").to_numpy()
    bar_open_ms = replay["open_time_ms"].astype("int64").to_numpy()
    window_view = np.lib.stride_tricks.sliding_window_view

    # A plan is only worth evaluating when hitting its target would clear the
    # minimum edge after costs; this does not depend on the horizon.
    viable_plans = [
        (target_bps, stop_bps)
        for target_bps, stop_bps in itertools.product(targets, stops)
        if not (target_bps - cost_bps < min_expected_edge_bps)
    ]

    collected: list[dict[str, Any]] = []
    bar_count = len(replay)
    for horizon in horizons:
        if bar_count <= horizon:
            continue
        span = horizon + 1
        # Column 0 of each window is the entry bar itself; keep only the
        # `horizon` bars that follow it.
        high_window = window_view(highs, span)[:, 1:]
        low_window = window_view(lows, span)[:, 1:]
        time_window = window_view(bar_open_ms, span)[:, 1:]
        entry_count = len(high_window)
        entry_price = closes[:entry_count]
        exit_price = closes[horizon:]
        entry_ms = bar_open_ms[:entry_count]

        # A window is usable only if its bars follow the entry bar at exact
        # 60_000 ms steps, i.e. there is no gap inside the look-ahead.
        step_offsets = np.arange(1, span, dtype=np.int64).reshape(1, -1) * 60_000
        expected_times = entry_ms.reshape(-1, 1) + step_offsets
        contiguous = (time_window == expected_times).all(axis=1)

        split_values = pd.Series(entry_ms).map(feature_split_by_ms).to_numpy()
        usable = contiguous & pd.notna(split_values)
        if not usable.any():
            continue

        for target_bps, stop_bps in viable_plans:
            for side in ("LONG", "SHORT"):
                collected.extend(
                    _plan_side_rows(
                        symbol,
                        horizon,
                        target_bps,
                        stop_bps,
                        side,
                        entry_price,
                        exit_price,
                        high_window,
                        low_window,
                        split_values,
                        usable,
                        cost_bps,
                        min_expected_edge_bps,
                    )
                )
    return collected
|
|
|
|
|
|
def _plan_side_rows(
    symbol: str,
    horizon: int,
    target_bps: float,
    stop_bps: float,
    side: str,
    entry_price: np.ndarray,
    exit_price: np.ndarray,
    high_window: np.ndarray,
    low_window: np.ndarray,
    split_values: np.ndarray,
    usable: np.ndarray,
    cost_bps: float,
    min_expected_edge_bps: float,
) -> list[dict[str, Any]]:
    """Summarise one (horizon, target, stop, side) plan per evaluation split.

    Args:
        symbol: Symbol label copied into each row.
        horizon: Look-ahead length, reported as ``horizon_minutes``.
        target_bps: Take-profit distance from the entry price, in basis points.
        stop_bps: Stop-loss distance from the entry price, in basis points.
        side: ``"LONG"``; any other value is treated as the short side.
        entry_price: Entry price per candidate entry (1-D).
        exit_price: Price used when neither level is hit (1-D, same length).
        high_window: Future highs, one row per entry and one column per bar.
        low_window: Future lows, same layout as ``high_window``.
        split_values: Split id per entry; compared against ``EVAL_SPLITS``.
        usable: Boolean mask of entries that may be counted.
        cost_bps: Cost subtracted from every gross return.
        min_expected_edge_bps: Net-edge threshold for the positive label.

    Returns:
        One dict per split in ``EVAL_SPLITS`` that has at least one usable
        entry; splits without usable entries are omitted.
    """
    entry_column = entry_price.reshape(-1, 1)
    target_fraction = target_bps / 10000.0
    stop_fraction = stop_bps / 10000.0
    if side == "LONG":
        target_matrix = high_window >= entry_column * (1.0 + target_fraction)
        stop_matrix = low_window <= entry_column * (1.0 - stop_fraction)
        timeout_return = (exit_price / entry_price - 1.0) * 10000.0
        max_achievable_gross = (np.nanmax(high_window, axis=1) / entry_price - 1.0) * 10000.0
    else:
        target_matrix = low_window <= entry_column * (1.0 - target_fraction)
        stop_matrix = high_window >= entry_column * (1.0 + stop_fraction)
        # NOTE(review): short returns are measured as entry/price - 1, whereas
        # the short target level is entry * (1 - target). The two conventions
        # differ slightly; confirm this matches the label definition.
        timeout_return = (entry_price / exit_price - 1.0) * 10000.0
        max_achievable_gross = (entry_price / np.nanmin(low_window, axis=1) - 1.0) * 10000.0

    # Index of the first bar touching each level; `never` is a sentinel larger
    # than any real bar index and stands for "not touched inside the window".
    never = target_matrix.shape[1] + 1
    target_any = target_matrix.any(axis=1)
    stop_any = stop_matrix.any(axis=1)
    target_index = np.where(target_any, target_matrix.argmax(axis=1), never)
    stop_index = np.where(stop_any, stop_matrix.argmax(axis=1), never)

    # Thanks to the sentinel a strict comparison is enough for the target.
    # A tie on the same bar is resolved in favour of the stop.
    target_first = target_index < stop_index
    stop_first = stop_any & (stop_index <= target_index)
    timeout = ~(target_first | stop_first)
    ambiguous = (target_index == stop_index) & target_any & stop_any

    gross_without_target = np.where(stop_first, -stop_bps, timeout_return)
    gross = np.where(target_first, target_bps, gross_without_target)
    price_plan_net = gross - cost_bps
    expected_net = max_achievable_gross - cost_bps
    positive = expected_net >= min_expected_edge_bps

    plan_id = f"h{horizon}_t{target_bps:g}_s{stop_bps:g}"
    summaries: list[dict[str, Any]] = []
    for split_id in EVAL_SPLITS:
        selected = usable & (split_values == split_id)
        if not selected.any():
            continue
        split_expected = expected_net[selected]
        split_plan_net = price_plan_net[selected]
        target_rate = float(target_first[selected].mean())
        stop_rate = float(stop_first[selected].mean())
        timeout_rate = float(timeout[selected].mean())
        # Target hit rate at which target wins and stop losses net to zero
        # after costs.
        break_even_rate = (stop_bps + cost_bps) / (target_bps + stop_bps)
        summaries.append(
            {
                "plan_id": plan_id,
                "symbol": symbol,
                "split_id": split_id,
                "side": side,
                "horizon_minutes": horizon,
                "target_bps": target_bps,
                "stop_bps": stop_bps,
                "cost_bps": cost_bps,
                "positive_net_bps": target_bps - cost_bps,
                "stop_net_bps": -stop_bps - cost_bps,
                "rows": int(selected.sum()),
                "target_hit_rate": target_rate,
                "stop_hit_rate": stop_rate,
                "timeout_rate": timeout_rate,
                "ambiguous_rate": float(ambiguous[selected].mean()),
                "positive_label_rate": float(positive[selected].mean()),
                "avg_expected_net_edge_bps": float(split_expected.mean()),
                "median_expected_net_edge_bps": float(np.median(split_expected)),
                "p95_expected_net_edge_bps": float(np.quantile(split_expected, 0.95)),
                "avg_price_plan_net_edge_bps": float(split_plan_net.mean()),
                "required_target_hit_rate": float(break_even_rate),
                "target_rate_margin": float(target_rate - break_even_rate),
            }
        )
    return summaries
|
|
|
|
|
|
def _plan_summary(rows: pd.DataFrame) -> pd.DataFrame:
    """Collapse per-symbol plan rows into one scored row per plan and side.

    Args:
        rows: Frame of per-symbol, per-split plan rows containing the plan
            key columns, ``split_id`` and the metric columns pivoted below.

    Returns:
        One row per (plan_id, horizon, target, stop, side) with each metric
        averaged per split (``<metric>_<split>`` columns), aggregate columns
        over the tune/validation/stress splits, and a ``score`` column. Rows
        are sorted by ``score`` in descending order.
    """
    key_columns = ["plan_id", "horizon_minutes", "target_bps", "stop_bps", "side"]
    metrics = (
        "positive_label_rate",
        "avg_expected_net_edge_bps",
        "avg_price_plan_net_edge_bps",
        "target_rate_margin",
        "target_hit_rate",
        "stop_hit_rate",
    )
    holdout_splits = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)

    wide = rows.pivot_table(
        index=key_columns,
        columns="split_id",
        values=list(metrics),
        aggfunc="mean",
    )
    wide.columns = [f"{metric}_{split}" for metric, split in wide.columns]
    wide = wide.reset_index()

    # Guarantee every metric/split column exists even if a split had no rows.
    for split_id in EVAL_SPLITS:
        for metric in metrics:
            name = f"{metric}_{split_id}"
            if name not in wide.columns:
                wide[name] = np.nan

    def holdout_columns(metric: str) -> list[str]:
        return [f"{metric}_{split}" for split in holdout_splits]

    positive_rate_columns = holdout_columns("positive_label_rate")
    wide["min_positive_label_rate_eval"] = wide[positive_rate_columns].min(axis=1)
    wide["max_positive_label_rate_eval"] = wide[positive_rate_columns].max(axis=1)
    wide["avg_edge_eval"] = wide[holdout_columns("avg_expected_net_edge_bps")].mean(axis=1)
    wide["avg_price_plan_edge_eval"] = wide[holdout_columns("avg_price_plan_net_edge_bps")].mean(axis=1)
    wide["min_margin_eval"] = wide[holdout_columns("target_rate_margin")].min(axis=1)

    # The search score is not a go-live gate. It only chooses the next
    # experiment: enough positive samples, less negative average edge, and
    # stable behavior across tune/validation/stress.
    too_few_positives = (0.08 - wide["min_positive_label_rate_eval"]).clip(lower=0.0) * 80.0
    too_many_positives = (wide["max_positive_label_rate_eval"] - 0.45).clip(lower=0.0) * 30.0
    positive_rate_penalty = too_few_positives + too_many_positives
    spread_bonus = np.log1p((wide["target_bps"] - wide["stop_bps"]).clip(lower=0.0))
    wide["score"] = (
        wide["avg_edge_eval"]
        + wide["avg_price_plan_edge_eval"] * 0.5
        + wide["min_margin_eval"] * 20.0
        - positive_rate_penalty
        + spread_bonus
    )
    return wide.sort_values("score", ascending=False).reset_index(drop=True)
|
|
|
|
|
|
def _select_best_plan(summary: pd.DataFrame) -> dict[str, Any]:
    """Pick the highest-scoring plan from the most restrictive non-empty pool.

    Pools are tried in order:

    1. plans whose eval positive-label rate stays within [0.08, 0.45] and
       whose target exceeds the stop;
    2. plans whose target exceeds the stop;
    3. every plan.

    Args:
        summary: Scored plan summary with the columns read below.

    Returns:
        The winning plan's identifying fields and aggregate metrics as plain
        Python ``str``/``int``/``float`` values.

    Raises:
        IndexError: If ``summary`` has no rows.
    """
    target_beats_stop = summary["target_bps"] > summary["stop_bps"]
    rate_in_band = (summary["min_positive_label_rate_eval"] >= 0.08) & (
        summary["max_positive_label_rate_eval"] <= 0.45
    )

    # Fall through from the strictest pool to the whole summary; the last
    # pool is kept even when it is empty.
    for pool in (summary[rate_in_band & target_beats_stop], summary[target_beats_stop], summary):
        if not pool.empty:
            break

    winner = pool.sort_values("score", ascending=False).iloc[0]
    field_casts = (
        ("plan_id", str),
        ("horizon_minutes", int),
        ("target_bps", float),
        ("stop_bps", float),
        ("side", str),
        ("score", float),
        ("avg_edge_eval", float),
        ("avg_price_plan_edge_eval", float),
        ("min_margin_eval", float),
        ("min_positive_label_rate_eval", float),
        ("max_positive_label_rate_eval", float),
    )
    return {field: cast(winner[field]) for field, cast in field_casts}
|
|
|
|
|
|
def _markdown_report(payload: dict[str, Any], summary: pd.DataFrame) -> str:
    """Render the Markdown report for one price-plan search.

    Args:
        payload: Result payload; reads ``run_id``, ``cost_bps``,
            ``min_expected_net_edge_bps``, ``entry_label_method``,
            ``candidate_count`` and ``best_plan``.
        summary: Scored plan summary; only its first 20 rows are tabulated.

    Returns:
        The report text with lines joined by ``"\\n"``.
    """
    top = summary.head(20)
    # Serialise the best plan with the JSON encoder instead of patching the
    # quotes of a Python repr: the block is fenced as JSON, and a repr turns
    # None/True/False or strings containing quotes into invalid JSON.
    # `default=str` keeps the report best-effort for any value the encoder
    # does not know, rather than failing the whole run over a report.
    best_plan_json = json.dumps(_jsonable(payload["best_plan"]), ensure_ascii=False, default=str)
    lines = [
        "# Price Plan Search Report",
        "",
        f"- run_id: `{payload['run_id']}`",
        f"- cost_bps: {payload['cost_bps']}",
        f"- min_expected_net_edge_bps: {payload['min_expected_net_edge_bps']}",
        f"- entry_label_method: `{payload['entry_label_method']}`",
        f"- candidate_count: {payload['candidate_count']}",
        "",
        "## Best Plan For Next Experiment",
        "",
        "```json",
        best_plan_json,
        "```",
        "",
        "## Top Plans",
        "",
        _markdown_table(top),
        "",
        "说明:positive_label_rate 和 avg_expected_net_edge_bps 按“未来窗口最大可拿净收益”统计;target_hit_rate、stop_hit_rate、avg_price_plan_net_edge_bps 只用来检查固定止盈止损计划是否顺手。这里选的是下一轮实验用的价格计划,不是上线结论。真正能不能上线仍然看模型训练、PM 搜索、validation_locked 和 latest_stress 回测。",
        "",
    ]
    return "\n".join(lines)
|
|
|
|
|
|
def _markdown_table(frame: pd.DataFrame) -> str:
    """Render a frame as a pipe-delimited Markdown table.

    Float cells are rounded to six decimals; every other cell is rendered
    with ``str``. An empty frame yields a fixed placeholder sentence instead
    of a table.

    Args:
        frame: Frame to render; its column labels are joined as strings.

    Returns:
        Header line, separator line and one line per row, joined by ``"\\n"``.
    """
    if frame.empty:
        return "无数据。"

    columns = list(frame.columns)

    def render_cell(value: Any) -> str:
        if isinstance(value, float):
            return str(round(value, 6))
        return str(value)

    def render_line(cells: Any) -> str:
        return "| " + " | ".join(cells) + " |"

    table = [render_line(columns), render_line("---" for _ in columns)]
    table.extend(
        render_line(render_cell(record.get(column, "")) for column in columns)
        for record in frame.to_dict("records")
    )
    return "\n".join(table)
|
|
|
|
|
|
def _jsonable(value: Any) -> Any:
    """Recursively convert containers and NumPy values to plain Python types.

    Dict keys are stringified, lists and tuples become lists, NumPy arrays
    become (nested) lists, and NumPy bool/integer/floating scalars become
    ``bool``/``int``/``float``. Anything else is returned unchanged.

    Args:
        value: Arbitrary value, typically a nested result payload.

    Returns:
        A structure made of ``dict``/``list`` and plain scalars wherever the
        input used the types listed above.
    """
    if isinstance(value, dict):
        return {str(key): _jsonable(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_jsonable(item) for item in value]
    if isinstance(value, np.ndarray):
        # tolist() already unboxes numeric dtypes; recurse so object arrays
        # holding NumPy scalars are converted as well.
        return _jsonable(value.tolist())
    if isinstance(value, np.bool_):
        # np.bool_ is not a Python bool and the standard json encoder rejects it.
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    return value
|