2026-06-27 16:15:23 +08:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
import logging
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
|
import pandas as pd
|
2026-06-27 19:57:29 +08:00
|
|
|
from numpy.lib.stride_tricks import sliding_window_view
|
2026-06-27 16:15:23 +08:00
|
|
|
|
|
|
|
|
from trader_training.io_utils import (
|
|
|
|
|
manifest,
|
|
|
|
|
read_json,
|
|
|
|
|
read_parquet,
|
|
|
|
|
require_columns,
|
|
|
|
|
run_root,
|
|
|
|
|
sha256_json,
|
|
|
|
|
to_utc_series,
|
|
|
|
|
write_json,
|
|
|
|
|
write_parquet,
|
|
|
|
|
write_text,
|
|
|
|
|
)
|
|
|
|
|
from trader_training.schemas import LABEL_VERSION
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Built-in label configuration, one section per label family. A JSON config
# loaded through _load_config is overlaid on top of these values.
DEFAULT_LABEL_CONFIG = {
    # Direction labels: look-ahead horizon and the return thresholds used to
    # classify LONG / SHORT.
    "direction": {
        "horizon_minutes": 45,
        "long_threshold_bps": 5.0,
        "short_threshold_bps": -5.0,
    },
    # Entry labels: price-plan distances, label method names and the
    # parameters read by the dynamic (partial take / trailing stop) plan.
    "entry": {
        "max_hold_minutes": 45,
        "target_bps": 12.0,
        "stop_bps": 8.0,
        "min_expected_net_edge_bps": 3.0,
        "plan_method": "FIXED_TARGET_STOP_V1",
        "target_method": "PRICE_PLAN_OUTCOME_V1",
        "min_plan_net_edge_bps": 0.0,
        "max_entry_mae_bps": 12.0,
        "partial_take_1_ratio": 0.50,
        "partial_take_2_ratio": 0.25,
        "second_target_bps": 24.0,
        "trailing_stop_bps": 10.0,
        "breakeven_after_first_target": True,
    },
    "continue": {
        "horizon_minutes": 45,
        "min_expected_continue_edge_bps": 5.0,
    },
    "exit": {
        "horizon_minutes": 45,
        "adverse_move_bps": 20.0,
        "stagnation_abs_return_bps": 5.0,
    },
    "risk": {
        "horizon_minutes": 45,
        "market_drawdown_bps": 60.0,
        "position_path_risk_bps": 35.0,
        "vol_expansion_ratio": 1.8,
        "spike_bps": 80.0,
    },
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Built-in cost components in basis points. The label builders in this module
# add the three values together into one total cost per trade.
DEFAULT_COST_CONFIG = {"fee_bps": 4.0, "slippage_bps": 2.0, "funding_cost_bps": 0.5}
|
|
|
|
|
|
2026-06-27 23:53:58 +08:00
|
|
|
# Name of the default entry label method; it is the fallback wherever the
# entry config does not provide its own "target_method".
ENTRY_LABEL_METHOD = "PRICE_PLAN_OUTCOME_V1"
|
2026-06-27 19:57:29 +08:00
|
|
|
|
2026-06-27 16:15:23 +08:00
|
|
|
|
|
|
|
|
def _load_config(path, default):
|
|
|
|
|
if path is None:
|
|
|
|
|
return default
|
|
|
|
|
value = read_json(path)
|
|
|
|
|
merged = default.copy()
|
|
|
|
|
for key, item in value.items():
|
|
|
|
|
if isinstance(item, dict) and isinstance(merged.get(key), dict):
|
|
|
|
|
merged[key] = {**merged[key], **item}
|
|
|
|
|
else:
|
|
|
|
|
merged[key] = item
|
|
|
|
|
return merged
|
|
|
|
|
|
|
|
|
|
|
2026-06-27 23:53:58 +08:00
|
|
|
def _config_number(config: dict[str, Any], keys: tuple[str, ...], default: float) -> float:
|
|
|
|
|
for key in keys:
|
|
|
|
|
if key in config:
|
|
|
|
|
return float(config[key])
|
|
|
|
|
return default
|
|
|
|
|
|
|
|
|
|
|
2026-06-27 16:15:23 +08:00
|
|
|
def _base_frames(args: Any) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load and validate the feature frame and the 1-minute replay frame.

    Paths come from ``args.feature_path`` / ``args.replay_path`` when set,
    otherwise from fixed locations under the run root. Both frames get their
    ``event_time`` column passed through ``to_utc_series``; the replay frame
    is additionally sorted by ``(symbol, event_time)`` with a fresh index.

    Returns:
        ``(features, replay)`` as new frames independent of the loaded ones.
    """
    feature_required = (
        "sample_id",
        "symbol",
        "event_time",
        "open_time_ms",
        "split_id",
        "walk_forward_fold",
        "data_quality_flag",
    )
    replay_required = ("symbol", "event_time", "open_time_ms", "open", "high", "low", "close", "spread_bps")

    root = run_root(args)
    feature_path = args.feature_path or root / "feature" / "feature_frame.parquet"
    replay_path = args.replay_path or root / "replay" / "replay_1m.parquet"

    raw_features = read_parquet(feature_path)
    raw_replay = read_parquet(replay_path)
    require_columns(raw_features, feature_required, "feature_frame")
    require_columns(raw_replay, replay_required, "replay_1m")

    # assign() returns a new frame, so the loaded frames stay untouched.
    features = raw_features.assign(event_time=to_utc_series(raw_features["event_time"]))
    replay = (
        raw_replay.assign(event_time=to_utc_series(raw_replay["event_time"]))
        .sort_values(["symbol", "event_time"])
        .reset_index(drop=True)
    )
    return features, replay
|
|
|
|
|
|
|
|
|
|
|
2026-06-27 19:57:29 +08:00
|
|
|
# Column order shared by every path-statistics frame built in this module.
PATH_STAT_COLUMNS = [
    # row identity
    "symbol", "open_time_ms", "side",
    # outcome flags
    "target_hit", "stop_hit", "timeout_hit", "ambiguous_hit",
    # timings in milliseconds
    "time_to_target_ms", "time_to_stop_ms", "time_to_exit_ms",
    # edge and excursion figures in basis points
    "gross_edge_bps", "future_return_bps", "mfe_bps", "mae_bps",
    # statistics of the forward window
    "future_spread_p80", "future_realized_vol_bps",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _empty_path_stats_frame() -> pd.DataFrame:
    """Return a zero-row frame whose columns are ``PATH_STAT_COLUMNS``."""
    placeholder = pd.DataFrame(columns=PATH_STAT_COLUMNS)
    return placeholder
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _first_hit_index(hit_window: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
|
|
|
|
|
hit_any = hit_window.any(axis=1)
|
|
|
|
|
first_idx = np.argmax(hit_window, axis=1)
|
|
|
|
|
first_idx = np.where(hit_any, first_idx, hit_window.shape[1] + 1)
|
|
|
|
|
return hit_any, first_idx
|
|
|
|
|
|
|
|
|
|
|
2026-06-28 00:50:37 +08:00
|
|
|
def _path_stats_for_group(
    group: pd.DataFrame,
    side: str,
    horizon: int,
    target_bps: float,
    stop_bps: float,
    plan_config: dict[str, Any] | None = None,
) -> pd.DataFrame:
    """Compute forward-path statistics for one symbol and one side.

    Every bar except the last ``horizon`` ones is treated as an entry at its
    close. The following ``horizon`` bars form the forward window that is
    scanned for target / stop touches.

    Args:
        group: Replay rows of one symbol; must provide ``event_time``,
            ``open_time_ms``, ``close``, ``high``, ``low``, ``spread_bps`` and
            ``symbol``.
        side: ``"LONG"``; any other value is evaluated as a short position.
        horizon: Number of forward bars, at least 1.
        target_bps: Target distance from the entry close in basis points.
        stop_bps: Stop distance from the entry close in basis points.
        plan_config: Optional plan settings. ``plan_method`` selects
            ``FIXED_TARGET_STOP_V1`` (default) or ``DYNAMIC_TRAILING_V1``.

    Returns:
        A frame with ``PATH_STAT_COLUMNS``, restricted to entries whose
        forward window has no gap (consecutive ``open_time_ms`` differ by
        exactly 60_000) and whose entry and exit closes are finite. Empty when
        the group has no more than ``horizon`` rows.

    Raises:
        ValueError: If ``plan_method`` is unsupported or ``horizon < 1``.
    """
    plan = plan_config or {}
    method = str(plan.get("plan_method", "FIXED_TARGET_STOP_V1"))
    # Validate up front: a misconfigured plan must fail even for groups that
    # are too short to be evaluated, and before any window work is done.
    if method not in ("FIXED_TARGET_STOP_V1", "DYNAMIC_TRAILING_V1"):
        raise ValueError(f"unsupported entry plan_method: {method}")
    if horizon < 1:
        # The negative-offset slices below (``[:-horizon]``) are only
        # meaningful for a positive horizon.
        raise ValueError(f"horizon must be a positive number of bars: {horizon}")

    if len(group) <= horizon:
        return _empty_path_stats_frame()

    grouped = group.sort_values("event_time").reset_index(drop=True)
    open_ms = grouped["open_time_ms"].astype("int64").to_numpy()
    close = grouped["close"].astype("float64").to_numpy()
    high = grouped["high"].astype("float64").to_numpy()
    low = grouped["low"].astype("float64").to_numpy()
    spread = grouped["spread_bps"].astype("float64").to_numpy()

    entry = close[:-horizon]
    exit_price = close[horizon:]
    current_open_ms = open_ms[:-horizon]

    # Prefix sums of "bad gap" flags give the number of gaps inside each
    # window in O(1) per entry.
    bad_gap = (np.diff(open_ms) != 60_000).astype("int64")
    gap_cumsum = np.concatenate(([0], np.cumsum(bad_gap)))
    contiguous = (gap_cumsum[horizon:] - gap_cumsum[:-horizon]) == 0
    finite = np.isfinite(entry) & np.isfinite(exit_price)
    valid = contiguous & finite

    # Row i of each view covers bars i+1 .. i+horizon (the bars after entry).
    future_high = sliding_window_view(high[1:], horizon)
    future_low = sliding_window_view(low[1:], horizon)
    future_spread = sliding_window_view(spread[1:], horizon)

    if horizon > 1:
        log_close = np.log(np.clip(close, 1e-12, None))
        log_return = np.diff(log_close)
        # Sample std of the horizon - 1 one-bar log returns between
        # consecutive bars of the forward window, scaled to basis points.
        future_log_return = sliding_window_view(log_return[1:], horizon - 1)
        with np.errstate(all="ignore"):
            realized_vol_bps = np.nanstd(future_log_return, axis=1, ddof=1) * 10000.0
    else:
        realized_vol_bps = np.full(len(entry), np.nan)

    if method == "DYNAMIC_TRAILING_V1":
        return _dynamic_path_stats_for_group(
            grouped,
            side,
            horizon,
            target_bps,
            stop_bps,
            close,
            high,
            low,
            spread,
            open_ms,
            valid,
            future_high,
            future_low,
            future_spread,
            future_realized_vol_bps=realized_vol_bps,
            plan_config=plan,
        )

    # Window aggregates are only needed by the fixed plan; the dynamic helper
    # derives its own from the views it receives.
    with np.errstate(all="ignore"):
        high_max = np.nanmax(future_high, axis=1)
        low_min = np.nanmin(future_low, axis=1)
        spread_p80 = np.nanquantile(future_spread, 0.8, axis=1)

    if side == "LONG":
        target_price = entry * (1.0 + target_bps / 10000.0)
        stop_price = entry * (1.0 - stop_bps / 10000.0)
        target_window = future_high >= target_price[:, None]
        stop_window = future_low <= stop_price[:, None]
        future_return_bps = (exit_price / entry - 1.0) * 10000.0
        mfe_bps = np.maximum((high_max / entry - 1.0) * 10000.0, 0.0)
        mae_bps = np.maximum((entry / low_min - 1.0) * 10000.0, 0.0)
    else:
        target_price = entry * (1.0 - target_bps / 10000.0)
        stop_price = entry * (1.0 + stop_bps / 10000.0)
        target_window = future_low <= target_price[:, None]
        stop_window = future_high >= stop_price[:, None]
        future_return_bps = (entry / exit_price - 1.0) * 10000.0
        mfe_bps = np.maximum((entry / low_min - 1.0) * 10000.0, 0.0)
        mae_bps = np.maximum((high_max / entry - 1.0) * 10000.0, 0.0)

    target_any, first_target_idx = _first_hit_index(target_window)
    stop_any, first_stop_idx = _first_hit_index(stop_window)
    # Target and stop touched inside the same bar: flagged as ambiguous and,
    # through the ``<=`` below, resolved as a stop.
    ambiguous_hit = target_any & stop_any & (first_target_idx == first_stop_idx)
    target_hit = target_any & (first_target_idx < first_stop_idx)
    stop_hit = stop_any & (first_stop_idx <= first_target_idx)
    timeout_hit = ~(target_hit | stop_hit)
    gross_edge_bps = np.where(target_hit, target_bps, np.where(stop_hit, -stop_bps, future_return_bps))
    # Window column k is the bar k + 1 minutes after entry.
    time_to_exit_ms = np.where(
        target_hit,
        (first_target_idx + 1) * 60_000,
        np.where(stop_hit, (first_stop_idx + 1) * 60_000, horizon * 60_000),
    )

    out = pd.DataFrame(
        {
            "symbol": grouped["symbol"].iloc[0],
            "open_time_ms": current_open_ms,
            "side": side,
            "target_hit": target_hit.astype("int8"),
            "stop_hit": stop_hit.astype("int8"),
            "timeout_hit": timeout_hit.astype("int8"),
            "ambiguous_hit": ambiguous_hit.astype("int8"),
            "time_to_target_ms": np.where(target_hit, (first_target_idx + 1) * 60_000, -1).astype("int64"),
            "time_to_stop_ms": np.where(stop_hit, (first_stop_idx + 1) * 60_000, -1).astype("int64"),
            "time_to_exit_ms": time_to_exit_ms.astype("int64"),
            "gross_edge_bps": gross_edge_bps.astype("float64"),
            "future_return_bps": future_return_bps.astype("float64"),
            "mfe_bps": mfe_bps.astype("float64"),
            "mae_bps": mae_bps.astype("float64"),
            "future_spread_p80": spread_p80.astype("float64"),
            "future_realized_vol_bps": realized_vol_bps.astype("float64"),
        }
    )
    return out.loc[valid, PATH_STAT_COLUMNS].reset_index(drop=True)
|
2026-06-27 16:15:23 +08:00
|
|
|
|
|
|
|
|
|
2026-06-28 00:50:37 +08:00
|
|
|
def _dynamic_path_stats_for_group(
    grouped: pd.DataFrame,
    side: str,
    horizon: int,
    target_bps: float,
    stop_bps: float,
    close: np.ndarray,
    high: np.ndarray,
    low: np.ndarray,
    spread: np.ndarray,
    open_ms: np.ndarray,
    valid: np.ndarray,
    future_high: np.ndarray,
    future_low: np.ndarray,
    future_spread: np.ndarray,
    future_realized_vol_bps: np.ndarray,
    plan_config: dict[str, Any],
) -> pd.DataFrame:
    """Simulate the partial-take / trailing-stop plan over every forward window.

    Each entry starts with a full position and an initial stop ``stop_bps``
    away from the entry close. Bars of the forward window are processed in
    order; within one bar the stop is checked first, then the first target,
    then the second target:

    * stop touched: the remaining position is closed at the stop price;
    * first target touched: ``partial_take_1_ratio`` is realised at
      ``target_bps`` and the stop becomes a trailing stop
      ``trailing_stop_bps`` away from the best price seen on earlier bars
      (never worse than entry when ``breakeven_after_first_target`` is set);
    * second target touched: ``partial_take_2_ratio`` is realised at
      ``second_target_bps``;
    * whatever is left after the window is closed at the exit close.

    Args:
        grouped: Sorted rows of one symbol; only ``symbol`` is read here.
        side: ``"LONG"``; any other value is simulated as a short position.
        horizon: Number of forward bars per window.
        target_bps: First target distance in basis points.
        stop_bps: Initial stop distance in basis points.
        close: Close prices of ``grouped``.
        high, low, spread: Accepted for signature compatibility; not read
            here (the window views below are used instead).
        open_ms: Bar open times in milliseconds.
        valid: Boolean mask of entries to keep in the result.
        future_high, future_low, future_spread: Per-entry forward windows,
            one row per entry and one column per forward bar.
        future_realized_vol_bps: Precomputed per-entry value, passed through
            to the output.
        plan_config: Plan settings; missing keys fall back to
            ``0.50`` / ``0.25`` take ratios, ``2 * target_bps`` second target,
            ``stop_bps`` trailing distance and break-even enabled.

    Returns:
        A frame with ``PATH_STAT_COLUMNS`` for the entries selected by
        ``valid``. ``target_hit`` means the first target was reached,
        ``stop_hit`` means the initial stop fired before any target, and
        ``ambiguous_hit`` is always 0 for this plan.
    """
    entry = close[:-horizon]
    exit_price = close[horizon:]
    current_open_ms = open_ms[:-horizon]
    with np.errstate(all="ignore"):
        high_max = np.nanmax(future_high, axis=1)
        low_min = np.nanmin(future_low, axis=1)
        spread_p80 = np.nanquantile(future_spread, 0.8, axis=1)

    take1_ratio = float(plan_config.get("partial_take_1_ratio", 0.50))
    take2_ratio = float(plan_config.get("partial_take_2_ratio", 0.25))
    take1_ratio = float(np.clip(take1_ratio, 0.0, 1.0))
    # The two partial takes together can never exceed the whole position.
    take2_ratio = float(np.clip(take2_ratio, 0.0, max(0.0, 1.0 - take1_ratio)))
    target2_bps = float(plan_config.get("second_target_bps", target_bps * 2.0))
    trailing_stop_bps = float(plan_config.get("trailing_stop_bps", stop_bps))
    breakeven_after_first = bool(plan_config.get("breakeven_after_first_target", True))

    n = len(entry)
    active = np.ones(n, dtype=bool)
    first_target_done = np.zeros(n, dtype=bool)
    second_target_done = np.zeros(n, dtype=bool)
    bad_stop_done = np.zeros(n, dtype=bool)
    trailing_exit_done = np.zeros(n, dtype=bool)
    remaining = np.ones(n, dtype="float64")
    gross = np.zeros(n, dtype="float64")
    # horizon + 1 marks "never happened"; exit_idx is overwritten for every
    # row by either a stop or the timeout handling below.
    first_target_idx = np.full(n, horizon + 1, dtype="int64")
    stop_idx = np.full(n, horizon + 1, dtype="int64")
    exit_idx = np.full(n, horizon, dtype="int64")

    is_long = side == "LONG"
    # Price levels that do not change from bar to bar.
    if is_long:
        first_target_price = entry * (1.0 + target_bps / 10000.0)
        second_target_price = entry * (1.0 + target2_bps / 10000.0)
        initial_stop_price = entry * (1.0 - stop_bps / 10000.0)
        trail_factor = 1.0 - trailing_stop_bps / 10000.0
    else:
        first_target_price = entry * (1.0 - target_bps / 10000.0)
        second_target_price = entry * (1.0 - target2_bps / 10000.0)
        initial_stop_price = entry * (1.0 + stop_bps / 10000.0)
        trail_factor = 1.0 + trailing_stop_bps / 10000.0

    # Best price seen so far on earlier bars: running high for longs,
    # running low for shorts.
    water_mark = entry.copy()
    for step in range(horizon):
        bar_high = future_high[:, step]
        bar_low = future_low[:, step]

        # The trailing stop is derived from bars before this one only.
        trailing_stop_price = water_mark * trail_factor
        if is_long:
            if breakeven_after_first:
                trailing_stop_price = np.maximum(trailing_stop_price, entry)
            stop_price = np.where(first_target_done, trailing_stop_price, initial_stop_price)
            stop_now = active & (bar_low <= stop_price)
            stop_gross = (stop_price / entry - 1.0) * 10000.0
            first_reached = bar_high >= first_target_price
            second_reached = bar_high >= second_target_price
        else:
            if breakeven_after_first:
                trailing_stop_price = np.minimum(trailing_stop_price, entry)
            stop_price = np.where(first_target_done, trailing_stop_price, initial_stop_price)
            stop_now = active & (bar_high >= stop_price)
            stop_gross = (entry / stop_price - 1.0) * 10000.0
            first_reached = bar_low <= first_target_price
            second_reached = bar_low <= second_target_price

        # 1) Stop: close whatever is left at the stop price.
        gross = np.where(stop_now, gross + remaining * stop_gross, gross)
        trailing_exit_done |= stop_now & first_target_done
        bad_stop_done |= stop_now & (~first_target_done)
        stop_idx = np.where(stop_now, step, stop_idx)
        exit_idx = np.where(stop_now, step, exit_idx)
        remaining = np.where(stop_now, 0.0, remaining)
        active &= ~stop_now

        # 2) First target: realise the first partial take.
        first_now = active & (~first_target_done) & first_reached
        gross = np.where(first_now, gross + take1_ratio * target_bps, gross)
        remaining = np.where(first_now, remaining - take1_ratio, remaining)
        first_target_done |= first_now
        first_target_idx = np.where(first_now, step, first_target_idx)

        # 3) Second target: may fire on the same bar as the first one.
        second_now = active & first_target_done & (~second_target_done) & second_reached
        gross = np.where(second_now, gross + take2_ratio * target2_bps, gross)
        remaining = np.where(second_now, remaining - take2_ratio, remaining)
        second_target_done |= second_now

        # fmax / fmin ignore a NaN bar instead of letting it poison the
        # running extreme, which would silently disable the trailing stop for
        # the rest of the window.
        if is_long:
            water_mark = np.fmax(water_mark, bar_high)
        else:
            water_mark = np.fmin(water_mark, bar_low)

    if is_long:
        timeout_return = (exit_price / entry - 1.0) * 10000.0
        mfe_bps = np.maximum((high_max / entry - 1.0) * 10000.0, 0.0)
        mae_bps = np.maximum((entry / low_min - 1.0) * 10000.0, 0.0)
    else:
        timeout_return = (entry / exit_price - 1.0) * 10000.0
        mfe_bps = np.maximum((entry / low_min - 1.0) * 10000.0, 0.0)
        mae_bps = np.maximum((high_max / entry - 1.0) * 10000.0, 0.0)
    future_return_bps = timeout_return

    # Positions still open after the last bar are closed at the exit close.
    timeout_now = active
    gross = np.where(timeout_now, gross + remaining * timeout_return, gross)
    exit_idx = np.where(timeout_now, horizon - 1, exit_idx)
    target_hit = first_target_done
    stop_hit = bad_stop_done
    timeout_hit = timeout_now
    ambiguous_hit = np.zeros(n, dtype=bool)

    out = pd.DataFrame(
        {
            "symbol": grouped["symbol"].iloc[0],
            "open_time_ms": current_open_ms,
            "side": side,
            "target_hit": target_hit.astype("int8"),
            "stop_hit": stop_hit.astype("int8"),
            "timeout_hit": timeout_hit.astype("int8"),
            "ambiguous_hit": ambiguous_hit.astype("int8"),
            "time_to_target_ms": np.where(target_hit, (first_target_idx + 1) * 60_000, -1).astype("int64"),
            "time_to_stop_ms": np.where(stop_hit | trailing_exit_done, (stop_idx + 1) * 60_000, -1).astype("int64"),
            "time_to_exit_ms": ((exit_idx + 1) * 60_000).astype("int64"),
            "gross_edge_bps": gross.astype("float64"),
            "future_return_bps": future_return_bps.astype("float64"),
            "mfe_bps": mfe_bps.astype("float64"),
            "mae_bps": mae_bps.astype("float64"),
            "future_spread_p80": spread_p80.astype("float64"),
            "future_realized_vol_bps": future_realized_vol_bps.astype("float64"),
        }
    )
    return out.loc[valid, PATH_STAT_COLUMNS].reset_index(drop=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_path_stats(replay: pd.DataFrame, horizon: int, target_bps: float, stop_bps: float, plan_config: dict[str, Any] | None = None) -> pd.DataFrame:
    """Build path statistics for every symbol in ``replay`` and both sides.

    Args:
        replay: Replay rows with a ``symbol`` column plus the columns read by
            ``_path_stats_for_group``.
        horizon: Number of forward bars per window.
        target_bps: Target distance in basis points.
        stop_bps: Stop distance in basis points.
        plan_config: Optional plan settings forwarded to
            ``_path_stats_for_group``.

    Returns:
        All per-symbol LONG and SHORT frames concatenated with a fresh index,
        or an empty frame with ``PATH_STAT_COLUMNS`` when ``replay`` has no
        groups.
    """
    plan_method = (plan_config or {}).get("plan_method", "FIXED_TARGET_STOP_V1")
    frames: list[pd.DataFrame] = []
    for symbol, group in replay.groupby("symbol", sort=False, observed=False):
        logging.info(
            "trader.training.path_stats_group_start symbol=%s horizonMinutes=%s planMethod=%s rowCount=%s",
            symbol,
            horizon,
            plan_method,
            len(group),
        )
        for side in ("LONG", "SHORT"):
            stats = _path_stats_for_group(group, side, horizon, target_bps, stop_bps, plan_config=plan_config)
            frames.append(stats)
            logging.info(
                "trader.training.path_stats_side_done symbol=%s side=%s horizonMinutes=%s planMethod=%s rowCount=%s",
                symbol,
                side,
                horizon,
                plan_method,
                len(stats),
            )

    # Too-short groups yield untyped (object-dtype) placeholder frames.
    # Keeping them out of the concat stops them from influencing the result
    # dtypes and avoids pandas' empty-entry concat deprecation warning.
    populated = [frame for frame in frames if not frame.empty]
    if populated:
        out = pd.concat(populated, ignore_index=True)
    elif frames:
        # Nothing but empty frames: concatenate them all, as before.
        out = pd.concat(frames, ignore_index=True)
    else:
        out = _empty_path_stats_frame()
    logging.info(
        "trader.training.path_stats_built horizonMinutes=%s planMethod=%s rowCount=%s",
        horizon,
        plan_method,
        len(out),
    )
    return out
|
2026-06-27 16:15:23 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_price_plan_context(args: Any) -> None:
    """Write the price-plan context of a run as JSON and as a one-row parquet.

    The context combines the ``entry`` section of the label config with the
    summed cost components and a hash over entry config, cost config and
    ``ENTRY_LABEL_METHOD``. Both files are written under ``<run root>/label``.
    """
    root = run_root(args)
    cost = _load_config(args.cost_config_path, DEFAULT_COST_CONFIG)
    labels = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)
    entry = labels["entry"]

    fee_bps = float(cost["fee_bps"])
    slippage_bps = float(cost["slippage_bps"])
    funding_bps = float(cost["funding_cost_bps"])
    cost_bps = fee_bps + slippage_bps + funding_bps

    config_hash = sha256_json({"entry": entry, "cost": cost, "entry_label_method": ENTRY_LABEL_METHOD})
    stop_distance = float(entry["stop_bps"])
    target_distance = float(entry["target_bps"])

    context = {
        "pricePlanId": args.price_plan_id,
        "pricePlanConfigHash": config_hash,
        "stopDistanceBps": stop_distance,
        "targetDistanceBps": target_distance,
        "maxHoldMinutes": int(entry["max_hold_minutes"]),
        "minExpectedNetEdgeBps": float(entry["min_expected_net_edge_bps"]),
        "minPlanNetEdgeBps": float(entry.get("min_plan_net_edge_bps", 0.0)),
        # Raw stop value as fallback, converted together with the lookup.
        "maxEntryMaeBps": float(entry.get("max_entry_mae_bps", entry["stop_bps"])),
        "costBps": cost_bps,
        "entryLabelMethod": ENTRY_LABEL_METHOD,
        "entryTargetMethod": str(entry.get("target_method", ENTRY_LABEL_METHOD)),
        "entryPlanMethod": str(entry.get("plan_method", "FIXED_TARGET_STOP_V1")),
        "partialTake1Ratio": float(entry.get("partial_take_1_ratio", 0.50)),
        "partialTake2Ratio": float(entry.get("partial_take_2_ratio", 0.25)),
        "secondTargetBps": float(entry.get("second_target_bps", target_distance * 2.0)),
        "trailingStopBps": float(entry.get("trailing_stop_bps", stop_distance)),
        "breakevenAfterFirstTarget": bool(entry.get("breakeven_after_first_target", True)),
    }
    path = root / "label" / "price_plan_context.json"
    write_json(path, context)

    # Parquet column name -> context key, in output column order.
    column_sources = (
        ("price_plan_id", "pricePlanId"),
        ("price_plan_hash", "pricePlanConfigHash"),
        ("target_bps", "targetDistanceBps"),
        ("stop_bps", "stopDistanceBps"),
        ("max_hold_minutes", "maxHoldMinutes"),
        ("min_expected_net_edge_bps", "minExpectedNetEdgeBps"),
        ("min_plan_net_edge_bps", "minPlanNetEdgeBps"),
        ("max_entry_mae_bps", "maxEntryMaeBps"),
        ("cost_bps", "costBps"),
        ("entry_label_method", "entryLabelMethod"),
        ("entry_target_method", "entryTargetMethod"),
        ("entry_plan_method", "entryPlanMethod"),
        ("partial_take_1_ratio", "partialTake1Ratio"),
        ("partial_take_2_ratio", "partialTake2Ratio"),
        ("second_target_bps", "secondTargetBps"),
        ("trailing_stop_bps", "trailingStopBps"),
        ("breakeven_after_first_target", "breakevenAfterFirstTarget"),
    )
    row = {column: context[key] for column, key in column_sources}
    frame = pd.DataFrame([row])
    write_parquet(root / "label" / "price_plan_context.parquet", frame)
    logging.info("trader.training.price_plan_written runId=%s path=%s", args.run_id, path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_direction_labels(args: Any) -> None:
    """Build LONG / SHORT / NEUTRAL direction labels and write them to disk.

    For every feature row the close at ``open_time_ms`` is compared with the
    close exactly ``horizon_minutes`` later (matched on ``open_time_ms``).
    Rows without such a future close are dropped. The labels, a manifest and
    a distribution report are written under ``<run root>/label``.
    """
    root = run_root(args)
    config = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)["direction"]
    features, replay = _base_frames(args)
    horizon = int(config["horizon_minutes"])

    join_keys = ["symbol", "open_time_ms"]
    current_close = replay[[*join_keys, "close"]]
    # Shift the key back by the horizon so that a plain equality join attaches
    # the close of the bar `horizon` minutes ahead.
    future_close = current_close.assign(
        open_time_ms=current_close["open_time_ms"].astype("int64") - horizon * 60_000
    ).rename(columns={"close": "future_close"})

    merged = features.merge(current_close, on=join_keys, how="left")
    merged = merged.merge(future_close, on=join_keys, how="left")
    merged["future_return_bps"] = (merged["future_close"] / merged["close"] - 1.0) * 10000.0

    is_long = merged["future_return_bps"] >= float(config["long_threshold_bps"])
    is_short = merged["future_return_bps"] <= float(config["short_threshold_bps"])
    # np.select picks the first matching condition, so LONG wins over SHORT.
    merged["direction_label"] = np.select([is_long, is_short], ["LONG", "SHORT"], default="NEUTRAL")

    label = merged["direction_label"]
    columns = {
        "sample_id": merged["sample_id"],
        "symbol": merged["symbol"],
        "event_time": merged["event_time"],
        "horizon_minutes": horizon,
        "future_return_bps": merged["future_return_bps"],
        "direction_label": label,
        "long_target": label.eq("LONG").astype("int8"),
        "short_target": label.eq("SHORT").astype("int8"),
        "neutral_target": label.eq("NEUTRAL").astype("int8"),
        "split_id": merged["split_id"],
        "walk_forward_fold": merged["walk_forward_fold"],
        "label_version": LABEL_VERSION,
    }
    out = pd.DataFrame(columns).dropna(subset=["future_return_bps"])

    label_dir = root / "label"
    path = label_dir / "direction_labels.parquet"
    data_hash = write_parquet(path, out)
    _write_label_manifest(label_dir / "direction_labels.manifest.json", path, out, data_hash)
    _write_distribution_report(label_dir / "direction_label_report.md", out, "direction_label")
    logging.info("trader.training.direction_labels_written runId=%s rowCount=%s", args.run_id, len(out))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_entry_labels(args: Any) -> None:
    """Build per-side entry labels from forward-path statistics and write them.

    Path statistics are computed from the replay with the ``entry`` section
    of the label config and inner-joined to the feature rows on
    ``(symbol, open_time_ms)``. ``expected_net_edge_bps`` is chosen by the
    configured ``target_method`` and thresholded into ``entry_target``. The
    labels, a manifest and a distribution report go to ``<run root>/label``.

    Raises:
        ValueError: If ``target_method`` is not one of the supported methods.
    """
    root = run_root(args)
    labels = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)
    cost = _load_config(args.cost_config_path, DEFAULT_COST_CONFIG)
    plan_path = args.price_plan_context_path or root / "label" / "price_plan_context.json"
    plan = read_json(plan_path)
    features, replay = _base_frames(args)
    entry_conf = labels["entry"]
    cost_bps = float(cost["fee_bps"]) + float(cost["slippage_bps"]) + float(cost["funding_cost_bps"])

    stats = _build_path_stats(
        replay,
        int(entry_conf["max_hold_minutes"]),
        float(entry_conf["target_bps"]),
        float(entry_conf["stop_bps"]),
        plan_config=entry_conf,
    )

    # NOTE(review): the last three columns are selected here but are not part
    # of the output written by this function.
    feature_columns = [
        "sample_id",
        "symbol",
        "event_time",
        "open_time_ms",
        "split_id",
        "walk_forward_fold",
        "spread_bps",
        "spread_rank_24h_pct",
        "realized_vol_15m_bps",
    ]
    merged = features[feature_columns].merge(stats, on=["symbol", "open_time_ms"], how="inner")
    merged["actual_plan_net_edge_bps"] = merged["gross_edge_bps"] - cost_bps
    merged["max_achievable_gross_edge_bps"] = merged["mfe_bps"]
    merged["max_achievable_net_edge_bps"] = merged["max_achievable_gross_edge_bps"] - cost_bps

    # Which column supplies expected_net_edge_bps for each target method.
    edge_source_by_method = {
        "PRICE_PLAN_OUTCOME_V1": "actual_plan_net_edge_bps",
        "OPPORTUNITY_MFE_V1": "max_achievable_net_edge_bps",
        "OPPORTUNITY_QUALITY_V1": "max_achievable_net_edge_bps",
    }
    target_method = str(entry_conf.get("target_method", ENTRY_LABEL_METHOD))
    edge_source = edge_source_by_method.get(target_method)
    if edge_source is None:
        raise ValueError(f"unsupported entry target_method: {target_method}")
    merged["expected_net_edge_bps"] = merged[edge_source]

    opportunity = merged["expected_net_edge_bps"] >= float(entry_conf["min_expected_net_edge_bps"])
    if target_method == "OPPORTUNITY_QUALITY_V1":
        # MFE only shows that price offered the opportunity at some point; a
        # real entry must also confirm that the plan would actually have
        # captured it, and that the path did not first go through an
        # excessive adverse move.
        min_plan_net_edge_bps = float(entry_conf.get("min_plan_net_edge_bps", 0.0))
        max_entry_mae_bps = float(entry_conf.get("max_entry_mae_bps", entry_conf["stop_bps"]))
        plan_captures_edge = merged["actual_plan_net_edge_bps"] >= min_plan_net_edge_bps
        adverse_move_ok = merged["mae_bps"] <= max_entry_mae_bps
        opportunity = opportunity & plan_captures_edge & adverse_move_ok

    merged["entry_target"] = opportunity.astype("int8")
    merged["price_plan_id"] = plan["pricePlanId"]
    merged["price_plan_hash"] = plan["pricePlanConfigHash"]
    merged["cost_bps"] = cost_bps
    merged["label_method"] = target_method
    merged["label_version"] = LABEL_VERSION

    output_columns = [
        "sample_id",
        "symbol",
        "event_time",
        "side",
        "price_plan_id",
        "price_plan_hash",
        "target_hit",
        "stop_hit",
        "timeout_hit",
        "ambiguous_hit",
        "time_to_target_ms",
        "time_to_stop_ms",
        "time_to_exit_ms",
        "gross_edge_bps",
        "future_return_bps",
        "mfe_bps",
        "mae_bps",
        "actual_plan_net_edge_bps",
        "max_achievable_gross_edge_bps",
        "max_achievable_net_edge_bps",
        "cost_bps",
        "expected_net_edge_bps",
        "entry_target",
        "label_method",
        "split_id",
        "walk_forward_fold",
        "label_version",
    ]
    out = merged[output_columns].copy()

    label_dir = root / "label"
    path = label_dir / "entry_labels.parquet"
    data_hash = write_parquet(path, out)
    _write_label_manifest(label_dir / "entry_labels.manifest.json", path, out, data_hash)
    _write_distribution_report(label_dir / "entry_label_report.md", out, "entry_target")
    logging.info("trader.training.entry_labels_written runId=%s rowCount=%s", args.run_id, len(out))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_position_state_samples(args: Any) -> None:
    """Derive initial position-state samples from the positive entry labels.

    Keeps the entry-label rows with ``entry_target == 1``, adds a zero
    position age and a zero unrealised PnL, coerces ``mfe_bps`` / ``mae_bps``
    to non-negative numbers (missing or unparsable values become 0), and
    writes the result plus a manifest under ``<run root>/label``.

    Raises:
        ValueError: If the entry-label file contains no rows.
    """
    root = run_root(args)
    entry_path = args.entry_label_path or root / "label" / "entry_labels.parquet"
    entry = read_parquet(entry_path)
    if entry.empty:
        raise ValueError("entry labels are required before building position samples")

    positive_entries = entry["entry_target"] == 1
    samples = entry[positive_entries].copy()
    samples["position_age_minutes"] = 0
    samples["unrealized_pnl_bps"] = 0.0
    for excursion_column in ("mfe_bps", "mae_bps"):
        numeric = pd.to_numeric(samples[excursion_column], errors="coerce")
        samples[excursion_column] = numeric.fillna(0.0).clip(lower=0)

    label_dir = root / "label"
    path = label_dir / "position_state_samples.parquet"
    data_hash = write_parquet(path, samples)
    manifest_body = manifest(path, {"row_count": len(samples), "data_hash_sha256": data_hash})
    write_json(label_dir / "position_state_samples.manifest.json", manifest_body)
    logging.info("trader.training.position_samples_written runId=%s rowCount=%s", args.run_id, len(samples))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_continue_exit_risk_labels(args: Any) -> None:
    """Build the continue, exit and risk label tables for a run and write them.

    Forward path statistics are computed once (using the ``continue`` horizon)
    for the LONG and SHORT side, joined onto a subset of feature columns by
    ``symbol`` / ``open_time_ms``, and turned into three label frames. Each
    frame is written to ``label/<name>_labels.parquet`` with a manifest, and a
    combined ``label/continue_exit_risk_label_report.md`` is written last.

    Args:
        args: Run arguments. This function reads ``label_config_path``,
            ``cost_config_path``, ``price_plan_context_path`` and ``run_id``
            from it and passes it to ``run_root`` and ``_base_frames``.

    Raises:
        KeyError: If a config, plan or frame key that is subscripted directly
            below is missing.
    """
    root = run_root(args)
    label_cfg = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)
    cost_cfg = _load_config(args.cost_config_path, DEFAULT_COST_CONFIG)
    plan = read_json(args.price_plan_context_path or root / "label" / "price_plan_context.json")
    features, replay = _base_frames(args)

    # Total cost is the sum of the three cost components, subtracted from gross edges below.
    cost_bps = sum(float(cost_cfg[key]) for key in ("fee_bps", "slippage_bps", "funding_cost_bps"))
    horizon = int(label_cfg["continue"]["horizon_minutes"])
    target_bps = float(plan["targetDistanceBps"])
    stop_bps = float(plan["stopDistanceBps"])

    # Plan settings: the price-plan context wins, then the entry label config,
    # then the hard fallback listed here.
    entry_cfg = label_cfg["entry"]
    plan_fields = (
        ("plan_method", "entryPlanMethod", "FIXED_TARGET_STOP_V1"),
        ("partial_take_1_ratio", "partialTake1Ratio", 0.50),
        ("partial_take_2_ratio", "partialTake2Ratio", 0.25),
        ("second_target_bps", "secondTargetBps", target_bps * 2.0),
        ("trailing_stop_bps", "trailingStopBps", stop_bps),
        ("breakeven_after_first_target", "breakevenAfterFirstTarget", True),
    )
    plan_config = {
        config_key: plan.get(plan_key, entry_cfg.get(config_key, fallback))
        for config_key, plan_key, fallback in plan_fields
    }
    stats = _build_path_stats(replay, horizon, target_bps, stop_bps, plan_config=plan_config)

    # Split the stats per side; prefix every column except the two join keys.
    side_frames = {}
    for side, prefix in (("LONG", "long_"), ("SHORT", "short_")):
        one_side = stats.loc[stats["side"] == side].drop(columns=["side"]).add_prefix(prefix)
        side_frames[side] = one_side.rename(
            columns={f"{prefix}symbol": "symbol", f"{prefix}open_time_ms": "open_time_ms"}
        )

    feature_columns = [
        "sample_id",
        "symbol",
        "event_time",
        "open_time_ms",
        "split_id",
        "walk_forward_fold",
        "spread_bps",
        "spread_rank_24h_pct",
        "realized_vol_15m_bps",
    ]
    if "ret_15m_bps" in features.columns:
        feature_columns.append("ret_15m_bps")

    join_keys = ["symbol", "open_time_ms"]
    merged = (
        features[feature_columns]
        .merge(side_frames["LONG"], on=join_keys, how="inner")
        .merge(side_frames["SHORT"], on=join_keys, how="inner")
    )

    # Thresholds from the label config.
    min_continue = float(label_cfg["continue"]["min_expected_continue_edge_bps"])
    adverse_threshold = float(label_cfg["exit"]["adverse_move_bps"])
    stagnation_band = float(label_cfg["exit"]["stagnation_abs_return_bps"])
    risk_config = label_cfg["risk"]
    market_risk_threshold = _config_number(
        risk_config,
        ("market_path_risk_threshold_bps", "market_drawdown_bps"),
        60.0,
    )
    position_risk_threshold = _config_number(
        risk_config,
        ("position_path_risk_threshold_bps", "position_path_risk_bps"),
        35.0,
    )
    spike_threshold = _config_number(risk_config, ("spike_1m_threshold_bps", "spike_bps"), 80.0)
    vol_expansion_ratio = _config_number(risk_config, ("vol_expansion_ratio",), 1.8)

    # Per-side building blocks.
    long_mae = merged["long_mae_bps"]
    short_mae = merged["short_mae_bps"]
    long_future_return = merged["long_future_return_bps"]
    long_edge = merged["long_gross_edge_bps"] - cost_bps
    short_edge = merged["short_gross_edge_bps"] - cost_bps
    long_stopped = merged["long_stop_hit"] == 1
    short_stopped = merged["short_stop_hit"] == 1
    # Kept separate from ~stopped so that a missing stop flag matches neither mask.
    long_not_stopped = merged["long_stop_hit"] == 0
    short_not_stopped = merged["short_stop_hit"] == 0
    # Under the dynamic trailing plan a stop hit does not veto the continue label.
    dynamic_plan = str(plan_config.get("plan_method")) == "DYNAMIC_TRAILING_V1"

    # Market-level path measures: worst adverse excursion over both sides and
    # the largest excursion of any kind.
    path_risk = np.maximum(long_mae, short_mae)
    max_path_move = np.maximum.reduce([merged["long_mfe_bps"], merged["short_mfe_bps"], path_risk])

    if "ret_15m_bps" in merged.columns:
        # NOTE(review): NaN or zero on either side compares as "different sign"
        # and is therefore labelled as a reversal; confirm this is intended.
        sign_differs = np.sign(long_future_return) != np.sign(merged["ret_15m_bps"])
        reversal = sign_differs.astype("int8")
    else:
        reversal = pd.Series(0, index=merged.index, dtype="int8")

    # Current volatility is floored at 1.0 before being scaled by the expansion ratio.
    current_vol = merged["realized_vol_15m_bps"].astype(float).fillna(0.0).clip(lower=1.0)
    future_vol = merged["long_future_realized_vol_bps"].fillna(0.0)
    volatility_expansion = future_vol >= current_vol * vol_expansion_ratio
    spike = max_path_move >= spike_threshold
    market_drawdown = path_risk >= market_risk_threshold
    market_risk = market_drawdown | spike | volatility_expansion
    # NOTE(review): 0.90 presumes spread_rank_24h_pct is on a 0-1 scale; confirm.
    liquidity_deterioration = merged["spread_rank_24h_pct"].astype(float).fillna(0.0) >= 0.90

    # Columns shared by all three label frames (leading and trailing groups).
    identity = {name: merged[name] for name in ("sample_id", "symbol", "event_time")}
    provenance = {
        "split_id": merged["split_id"],
        "walk_forward_fold": merged["walk_forward_fold"],
        "label_version": LABEL_VERSION,
    }

    continue_frame = pd.DataFrame(
        {
            **identity,
            "long_continue_target": (
                (long_edge >= min_continue) & (long_not_stopped | dynamic_plan)
            ).astype("int8"),
            "short_continue_target": (
                (short_edge >= min_continue) & (short_not_stopped | dynamic_plan)
            ).astype("int8"),
            "long_expected_continue_edge_bps": long_edge,
            "short_expected_continue_edge_bps": short_edge,
            **provenance,
        }
    )
    exit_frame = pd.DataFrame(
        {
            **identity,
            "long_exit_target": (long_stopped | (long_mae >= adverse_threshold)).astype("int8"),
            "short_exit_target": (short_stopped | (short_mae >= adverse_threshold)).astype("int8"),
            "long_adverse_move_bps": long_mae,
            "short_adverse_move_bps": short_mae,
            "adverse_move_prob_label": (path_risk >= adverse_threshold).astype("int8"),
            "reversal_prob_label": reversal,
            "stop_hit_prob_label": (long_stopped | short_stopped).astype("int8"),
            "stagnation_prob_label": (long_future_return.abs() <= stagnation_band).astype("int8"),
            **provenance,
        }
    )
    risk_frame = pd.DataFrame(
        {
            **identity,
            "market_risk_target": market_risk.astype("int8"),
            "market_path_risk_bps": path_risk,
            "long_position_path_risk_bps": long_mae,
            "short_position_path_risk_bps": short_mae,
            "long_position_risk_target": (
                (long_mae >= position_risk_threshold) | long_stopped
            ).astype("int8"),
            "short_position_risk_target": (
                (short_mae >= position_risk_threshold) | short_stopped
            ).astype("int8"),
            "market_drawdown_prob_label": market_drawdown.astype("int8"),
            "volatility_expansion_prob_label": volatility_expansion.astype("int8"),
            "spike_prob_label": spike.astype("int8"),
            "liquidity_deterioration_prob_label": liquidity_deterioration.astype("int8"),
            "position_drawdown_prob_label": (path_risk >= position_risk_threshold).astype("int8"),
            **provenance,
        }
    )

    # The report opens with the resolved thresholds, then one section per label set.
    threshold_summary = {
        "market_risk_threshold_bps": market_risk_threshold,
        "position_risk_threshold_bps": position_risk_threshold,
        "spike_threshold_bps": spike_threshold,
        "vol_expansion_ratio": vol_expansion_ratio,
    }
    report_parts = [
        "# Continue Exit Risk Label Report",
        "",
        "## Risk Thresholds",
        "",
        str(threshold_summary),
        "",
    ]

    label_dir = root / "label"
    outputs = (
        ("continue", continue_frame, "long_continue_target"),
        ("exit", exit_frame, "long_exit_target"),
        ("risk", risk_frame, "market_risk_target"),
    )
    for name, frame, target in outputs:
        parquet_path = label_dir / f"{name}_labels.parquet"
        data_hash = write_parquet(parquet_path, frame)
        _write_label_manifest(label_dir / f"{name}_labels.manifest.json", parquet_path, frame, data_hash)
        if frame.empty:
            distribution = {}
        else:
            distribution = frame[target].value_counts(dropna=False).to_dict()
        report_parts += [f"## {name}", "", str(distribution), ""]
        logging.info("trader.training.%s_labels_written runId=%s rowCount=%s", name, args.run_id, len(frame))

    write_text(label_dir / "continue_exit_risk_label_report.md", "\n".join(report_parts) + "\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _write_label_manifest(path, parquet_path, frame: pd.DataFrame, data_hash: str) -> None:
    """Write the JSON manifest that accompanies a label parquet file.

    Args:
        path: Destination of the manifest JSON.
        parquet_path: Parquet file the manifest describes; passed to ``manifest``.
        frame: The label frame; only its row count is recorded.
        data_hash: Recorded under ``data_hash_sha256``.
    """
    extra_fields = {
        "row_count": len(frame),
        "label_version": LABEL_VERSION,
        "data_hash_sha256": data_hash,
    }
    manifest_doc = manifest(parquet_path, extra_fields)
    write_json(path, manifest_doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _write_distribution_report(path, frame: pd.DataFrame, column: str) -> None:
    """Write a short Markdown report with the value distribution of one column.

    Args:
        path: Destination of the report text.
        frame: Label frame to summarise.
        column: Column whose value counts (NaN included) are reported; an empty
            frame is reported with an empty distribution.
    """
    if frame.empty:
        counts = {}
    else:
        counts = frame[column].value_counts(dropna=False).to_dict()

    report = (
        "# Label Report\n"
        "\n"
        f"- row_count: {len(frame)}\n"
        f"- target_column: {column}\n"
        f"- distribution: {counts}\n"
    )
    write_text(path, report)
|