Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0fe3bd864e | |||
| 6be4bb976a | |||
| e8420f76fe | |||
| 5ad77ffe90 | |||
| 0323fb3caf | |||
| 7268f640a6 | |||
| 3f49af5ba6 | |||
| dc4d00a373 | |||
| 2a86a6e2fa | |||
| 3c0f2d0d91 | |||
| 5a9786d861 | |||
| 340e220b28 | |||
| 1fd46ff3c9 |
@@ -11,6 +11,15 @@ def main() -> None:
|
||||
parser = argparse.ArgumentParser()
|
||||
add_common_args(parser)
|
||||
parser.add_argument("--max-rows", type=int, default=0)
|
||||
parser.add_argument(
|
||||
"--conditional-entry-source",
|
||||
choices=("none", "direction_label", "side_opportunity"),
|
||||
default="none",
|
||||
help="Entry 训练样本人群来源:不筛选、按 Direction 标签筛选、或按本方向未来机会阈值筛选。",
|
||||
)
|
||||
parser.add_argument("--conditional-entry-opportunity-bps", type=float, default=40.0)
|
||||
parser.add_argument("--conditional-entry-direction-labels", action="store_true")
|
||||
parser.add_argument("--conditional-entry-min-fit-rows", type=int, default=1000)
|
||||
args = parser.parse_args()
|
||||
setup_logging()
|
||||
train_small_models(args)
|
||||
|
||||
@@ -0,0 +1,40 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import _bootstrap # noqa: F401
|
||||
from trader_training.dynamic_exit_search import search_dynamic_exit_plans
|
||||
from trader_training.io_utils import add_common_args, setup_logging
|
||||
|
||||
|
||||
def _float_tuple(value: str) -> tuple[float, ...]:
|
||||
return tuple(float(item.strip()) for item in value.split(",") if item.strip())
|
||||
|
||||
|
||||
def _int_tuple(value: str) -> tuple[int, ...]:
|
||||
return tuple(int(item.strip()) for item in value.split(",") if item.strip())
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the dynamic exit plan search.

    Registers the shared options via ``add_common_args``, then the
    script-specific path, grid and output-name options, configures logging
    and hands the parsed namespace to ``search_dynamic_exit_plans``.
    """
    cli = argparse.ArgumentParser()
    add_common_args(cli)

    # Optional filesystem inputs, all parsed as ``Path``.
    for path_flag in (
        "--feature-path",
        "--replay-path",
        "--label-config-path",
        "--cost-config-path",
    ):
        cli.add_argument(path_flag, type=Path)

    # Comma-separated value lists; registration order is kept as in the help output.
    cli.add_argument("--horizons", type=_int_tuple)
    for float_list_flag in (
        "--targets",
        "--stops",
        "--trailing-stops",
        "--second-target-multipliers",
        "--take1-ratios",
        "--take2-ratios",
    ):
        cli.add_argument(float_list_flag, type=_float_tuple)

    cli.add_argument("--output-dir-name", default="dynamic-exit-search")

    options = cli.parse_args()
    setup_logging()
    search_dynamic_exit_plans(options)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,23 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
|
||||
import _bootstrap # noqa: F401
|
||||
from trader_training.entry_condition_pair_screen import screen_entry_condition_pairs
|
||||
from trader_training.io_utils import add_common_args, setup_logging
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the entry condition pair screen.

    Registers the shared options via ``add_common_args`` plus four integer
    options, configures logging and passes the parsed namespace to
    ``screen_entry_condition_pairs``.
    """
    cli = argparse.ArgumentParser()
    add_common_args(cli)

    # Flag -> default; every option here is an integer.
    integer_defaults = {
        "--min-seed-rows": 300,
        "--min-pair-rows": 150,
        "--max-seed-conditions-per-side": 32,
        "--max-buckets-per-feature": 2,
    }
    for flag, default_value in integer_defaults.items():
        cli.add_argument(flag, type=int, default=default_value)

    options = cli.parse_args()
    setup_logging()
    screen_entry_condition_pairs(options)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,33 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
|
||||
import _bootstrap # noqa: F401
|
||||
from trader_training.entry_mae_label_diagnostic import diagnose_entry_mae_labels
|
||||
from trader_training.io_utils import add_common_args, setup_logging
|
||||
|
||||
|
||||
def _float_tuple(value: str) -> tuple[float, ...]:
|
||||
return tuple(float(item.strip()) for item in value.split(",") if item.strip())
|
||||
|
||||
|
||||
def _str_tuple(value: str) -> tuple[str, ...]:
|
||||
return tuple(item.strip() for item in value.split(",") if item.strip())
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the entry MAE label diagnostic.

    Registers the shared options via ``add_common_args`` plus the
    script-specific options, configures logging and passes the parsed
    namespace to ``diagnose_entry_mae_labels``.
    """
    cli = argparse.ArgumentParser()
    add_common_args(cli)

    # (flag, keyword arguments) in registration order.
    option_table = (
        ("--max-mae-bps", {"type": _float_tuple, "default": (4.0, 6.0, 8.0, 12.0)}),
        ("--min-opportunity-bps", {"type": _float_tuple, "default": (6.0, 12.0, 20.0)}),
        ("--model-families", {"type": _str_tuple, "default": ("linear",)}),
        ("--top-fraction", {"type": float, "default": 0.10}),
        # No default is given, so argparse leaves this option as ``None`` when absent.
        ("--top-fractions", {"type": _float_tuple}),
        ("--max-train-rows", {"type": int, "default": 0}),
    )
    for flag, keyword_args in option_table:
        cli.add_argument(flag, **keyword_args)

    options = cli.parse_args()
    setup_logging()
    diagnose_entry_mae_labels(options)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,34 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
|
||||
import _bootstrap # noqa: F401
|
||||
from trader_training.conditional_entry_probe import probe_conditional_entry_training
|
||||
from trader_training.io_utils import add_common_args, setup_logging
|
||||
|
||||
|
||||
def _float_tuple(value: str) -> tuple[float, ...]:
|
||||
return tuple(float(item.strip()) for item in value.split(",") if item.strip())
|
||||
|
||||
|
||||
def _str_tuple(value: str) -> tuple[str, ...]:
|
||||
return tuple(item.strip() for item in value.split(",") if item.strip())
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the conditional entry training probe.

    Registers the shared options via ``add_common_args`` plus the
    script-specific options, configures logging and passes the parsed
    namespace to ``probe_conditional_entry_training``.
    """
    cli = argparse.ArgumentParser()
    add_common_args(cli)

    # Comma-separated list options and their default tuples.
    list_options = (
        ("--condition-opportunity-bps", _float_tuple, (6.0, 12.0, 20.0, 40.0, 60.0)),
        ("--target-edge-bps", _float_tuple, (0.0, 3.0)),
        ("--model-families", _str_tuple, ("linear", "tree")),
        ("--top-fractions", _float_tuple, (0.01, 0.02, 0.05, 0.10)),
    )
    for flag, converter, default_value in list_options:
        cli.add_argument(flag, type=converter, default=default_value)

    # Integer row-count options.
    for flag, default_value in (
        ("--max-train-rows", 0),
        ("--min-train-rows", 1000),
        ("--min-eval-rows", 500),
    ):
        cli.add_argument(flag, type=int, default=default_value)

    options = cli.parse_args()
    setup_logging()
    probe_conditional_entry_training(options)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,28 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import _bootstrap # noqa: F401
|
||||
from trader_training.direction_opportunity_dataset import build_direction_opportunity_dataset
|
||||
from trader_training.io_utils import add_common_args, setup_logging
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and build the direction opportunity dataset.

    Registers the shared options via ``add_common_args`` plus the
    script-specific options (``--opportunity-bps`` is mandatory), configures
    logging and passes the parsed namespace to
    ``build_direction_opportunity_dataset``.
    """
    cli = argparse.ArgumentParser()
    add_common_args(cli)

    # Optional dataset locations, parsed as ``Path``.
    for path_flag in ("--direction-dataset-path", "--entry-dataset-path", "--output-path"):
        cli.add_argument(path_flag, type=Path)

    cli.add_argument("--opportunity-bps", type=float, required=True)
    cli.add_argument("--min-advantage-bps", type=float, default=5.0)

    # Plain string options with their defaults.
    string_defaults = {
        "--long-edge-column": "long_max_achievable_net_edge_bps",
        "--short-edge-column": "short_max_achievable_net_edge_bps",
        "--label-method": "DIRECTION_OPPORTUNITY_FROM_ENTRY_MFE_V1",
    }
    for flag, default_value in string_defaults.items():
        cli.add_argument(flag, default=default_value)

    options = cli.parse_args()
    setup_logging()
    build_direction_opportunity_dataset(options)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,32 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
|
||||
import _bootstrap # noqa: F401
|
||||
from trader_training.io_utils import add_common_args, setup_logging
|
||||
from trader_training.nonlinear_pm_probe import probe_nonlinear_pm
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the diagnostic nonlinear PM probe.

    Registers the shared options via ``add_common_args`` plus the probe mode,
    the Entry training filter and the Entry opportunity threshold, configures
    logging and passes the parsed namespace to ``probe_nonlinear_pm``.
    """
    probe_modes = ("direction_entry_tree", "entry_tree_only")
    train_filters = ("direction_label", "side_opportunity")

    cli = argparse.ArgumentParser(description="Run diagnostic nonlinear model PM probe.")
    add_common_args(cli)

    # Help text (Chinese): diagnostic mode - replace both Direction and Entry,
    # or replace only Entry and keep the current Direction output.
    cli.add_argument(
        "--probe-mode",
        choices=probe_modes,
        default=probe_modes[0],
        help="诊断模式:同时替换 Direction+Entry,或只替换 Entry、保留当前 Direction 输出。",
    )
    # Help text (Chinese): source of the training population for the tree-model Entry.
    cli.add_argument(
        "--entry-train-filter",
        choices=train_filters,
        default=train_filters[0],
        help="树模型 Entry 的训练人群来源。",
    )
    cli.add_argument("--entry-opportunity-bps", type=float, default=40.0)

    options = cli.parse_args()
    setup_logging()
    probe_nonlinear_pm(options)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,26 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
|
||||
import _bootstrap # noqa: F401
|
||||
from trader_training.good_trade_structure import diagnose_good_trade_structure
|
||||
from trader_training.io_utils import add_common_args, setup_logging
|
||||
|
||||
|
||||
def _float_tuple(value: str) -> tuple[float, ...]:
|
||||
return tuple(float(item.strip()) for item in value.split(",") if item.strip())
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the good-trade structure diagnostic.

    Registers the shared options via ``add_common_args`` plus the good/bad
    edge thresholds and the top-fraction list, configures logging and passes
    the parsed namespace to ``diagnose_good_trade_structure``.
    """
    cli = argparse.ArgumentParser(
        description="Diagnose whether existing features separate good and bad Entry trades."
    )
    add_common_args(cli)

    for flag, default_value in (("--min-good-edge-bps", 3.0), ("--bad-edge-bps", -3.0)):
        cli.add_argument(flag, type=float, default=default_value)
    cli.add_argument("--top-fractions", type=_float_tuple, default=(0.01, 0.05, 0.10))

    options = cli.parse_args()
    setup_logging()
    diagnose_good_trade_structure(options)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,6 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
@@ -12,7 +13,7 @@ if str(TRAINING_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(TRAINING_ROOT))
|
||||
|
||||
from trader_training.labels import DEFAULT_LABEL_CONFIG, _path_stats_for_group
|
||||
from trader_training.pm import _probability_implied_edge, _simulate_open_trades, _threshold_candidates, default_pm_config
|
||||
from trader_training.pm import _pm_frame, _probability_implied_edge, _simulate_open_trades, _threshold_candidates, default_pm_config
|
||||
|
||||
|
||||
class RiskPmFixTest(unittest.TestCase):
|
||||
@@ -44,13 +45,13 @@ class RiskPmFixTest(unittest.TestCase):
|
||||
self.assertEqual(80.0, DEFAULT_LABEL_CONFIG["risk"]["spike_bps"])
|
||||
self.assertEqual(1.8, DEFAULT_LABEL_CONFIG["risk"]["vol_expansion_ratio"])
|
||||
|
||||
def test_pm_search_covers_low_entry_probability_without_allowing_negative_edge(self) -> None:
|
||||
def test_pm_search_uses_strict_entry_probability_and_positive_edge(self) -> None:
|
||||
candidates = _threshold_candidates()
|
||||
|
||||
self.assertTrue(candidates)
|
||||
self.assertLessEqual(max(item["max_market_risk_prob"] for item in candidates), 0.98)
|
||||
self.assertLessEqual(min(item["min_entry_prob"] for item in candidates), 0.03)
|
||||
self.assertGreaterEqual(min(item["min_expected_edge_bps"] for item in candidates), 0.0)
|
||||
self.assertLessEqual(max(item["max_market_risk_prob"] for item in candidates), 0.65)
|
||||
self.assertGreaterEqual(min(item["min_entry_prob"] for item in candidates), 0.30)
|
||||
self.assertGreaterEqual(min(item["min_expected_edge_bps"] for item in candidates), 3.0)
|
||||
|
||||
def test_probability_implied_edge_uses_price_plan_payoff(self) -> None:
|
||||
edge = _probability_implied_edge(
|
||||
@@ -61,6 +62,71 @@ class RiskPmFixTest(unittest.TestCase):
|
||||
self.assertAlmostEqual(3.7, float(edge.iloc[0]), places=6)
|
||||
self.assertAlmostEqual(52.5, float(edge.iloc[1]), places=6)
|
||||
|
||||
def test_pm_frame_reads_actual_plan_edge_not_old_opportunity_edge(self) -> None:
    """``_pm_frame`` must report the actual plan edge for both sides.

    The entry dataset written here carries ``*_expected_net_edge_bps``
    (99.0 / 88.0) alongside ``*_actual_plan_net_edge_bps`` (-6.5); the frame
    is expected to expose -6.5 in ``actual_*_plan_edge_bps``.
    """
    with tempfile.TemporaryDirectory() as tmp:
        run_root = Path(tmp)
        for parts in (
            ("model", "direction"),
            ("model", "entry"),
            ("model", "risk"),
            ("dataset",),
            ("label",),
        ):
            run_root.joinpath(*parts).mkdir(parents=True)

        # Identifier columns shared by all three prediction files.
        shared_columns = {
            "sample_id": ["s0"],
            "symbol": ["BTC-USDT-PERP"],
            "event_time": pd.to_datetime(["2026-01-01T00:00:00Z"]),
            "split_id": ["tune_inner"],
        }

        direction_predictions = pd.DataFrame(
            {**shared_columns, "long_prob": [0.70], "short_prob": [0.10], "neutral_prob": [0.20]}
        )
        direction_predictions.to_parquet(
            run_root / "model" / "direction" / "tune_predictions.parquet",
            index=False,
        )

        entry_predictions = pd.DataFrame(
            {
                **shared_columns,
                "long_entry_prob": [0.80],
                "short_entry_prob": [0.20],
                "long_expected_net_edge_bps": [12.0],
                "short_expected_net_edge_bps": [1.0],
            }
        )
        entry_predictions.to_parquet(run_root / "model" / "entry" / "tune_predictions.parquet", index=False)

        risk_predictions = pd.DataFrame(
            {
                **shared_columns,
                "market_risk_prob": [0.20],
                "long_position_risk_prob": [0.10],
                "short_position_risk_prob": [0.10],
            }
        )
        risk_predictions.to_parquet(run_root / "model" / "risk" / "tune_predictions.parquet", index=False)

        entry_dataset = pd.DataFrame(
            {
                "sample_id": ["s0"],
                "long_entry_target": [0],
                "short_entry_target": [0],
                "long_expected_net_edge_bps": [99.0],
                "short_expected_net_edge_bps": [88.0],
                "long_actual_plan_net_edge_bps": [-6.5],
                "short_actual_plan_net_edge_bps": [-6.5],
            }
        )
        entry_dataset.to_parquet(run_root / "dataset" / "entry_train.parquet", index=False)

        entry_labels = pd.DataFrame(
            {
                "sample_id": ["s0", "s0"],
                "side": ["LONG", "SHORT"],
                "gross_edge_bps": [0.0, 0.0],
                "cost_bps": [6.5, 6.5],
                "target_hit": [0, 0],
                "stop_hit": [0, 0],
                "time_to_target_ms": [-1, -1],
                "time_to_stop_ms": [-1, -1],
                "time_to_exit_ms": [2_700_000, 2_700_000],
            }
        )
        entry_labels.to_parquet(run_root / "label" / "entry_labels.parquet", index=False)

        pm_frame = _pm_frame(run_root, "tune_inner")

        for edge_column in ("actual_long_plan_edge_bps", "actual_short_plan_edge_bps"):
            self.assertAlmostEqual(-6.5, float(pm_frame.loc[0, edge_column]))
|
||||
|
||||
def test_pm_backtest_sizing_uses_position_manager_formula_not_fixed_floor(self) -> None:
|
||||
frame = pd.DataFrame(
|
||||
{
|
||||
@@ -78,8 +144,8 @@ class RiskPmFixTest(unittest.TestCase):
|
||||
"short_position_risk_prob": [0.10],
|
||||
"pred_long_expected_net_edge_bps": [40.0],
|
||||
"pred_short_expected_net_edge_bps": [1.0],
|
||||
"actual_long_expected_net_edge_bps": [30.0],
|
||||
"actual_short_expected_net_edge_bps": [-10.0],
|
||||
"actual_long_plan_edge_bps": [30.0],
|
||||
"actual_short_plan_edge_bps": [-10.0],
|
||||
"long_trade_net_edge_bps": [11.0],
|
||||
"short_trade_net_edge_bps": [-14.5],
|
||||
"long_target_hit": [1],
|
||||
@@ -112,7 +178,7 @@ class RiskPmFixTest(unittest.TestCase):
|
||||
|
||||
self.assertEqual(1, len(trades))
|
||||
self.assertAlmostEqual(11.0, float(trades.iloc[0]["actual_edge_bps"]))
|
||||
self.assertAlmostEqual(30.0, float(trades.iloc[0]["label_max_edge_bps"]))
|
||||
self.assertAlmostEqual(30.0, float(trades.iloc[0]["label_actual_plan_edge_bps"]))
|
||||
self.assertGreater(float(trades.iloc[0]["planned_ratio"]), 0.05)
|
||||
self.assertLessEqual(float(trades.iloc[0]["planned_ratio"]), 0.20)
|
||||
|
||||
@@ -133,8 +199,8 @@ class RiskPmFixTest(unittest.TestCase):
|
||||
"short_position_risk_prob": [0.10, 0.10],
|
||||
"pred_long_expected_net_edge_bps": [40.0, 42.0],
|
||||
"pred_short_expected_net_edge_bps": [1.0, 1.0],
|
||||
"actual_long_expected_net_edge_bps": [30.0, 31.0],
|
||||
"actual_short_expected_net_edge_bps": [-10.0, -10.0],
|
||||
"actual_long_plan_edge_bps": [30.0, 31.0],
|
||||
"actual_short_plan_edge_bps": [-10.0, -10.0],
|
||||
"long_trade_net_edge_bps": [11.0, 12.0],
|
||||
"short_trade_net_edge_bps": [-14.5, -14.5],
|
||||
"long_target_hit": [1, 1],
|
||||
|
||||
@@ -14,13 +14,22 @@ if str(TRAINING_ROOT) not in sys.path:
|
||||
sys.path.insert(0, str(TRAINING_ROOT))
|
||||
|
||||
from trader_training.onnx_export import LinearHead, export_heads
|
||||
from trader_training.entry_feature_screen import _screen_edge_column
|
||||
from trader_training.conditional_entry_probe import probe_conditional_entry_training
|
||||
from trader_training.direction_opportunity_dataset import _opportunity_labels
|
||||
from trader_training.dynamic_exit_search import search_dynamic_exit_plans
|
||||
from trader_training.entry_condition_pair_screen import screen_entry_condition_pairs
|
||||
from trader_training.entry_feature_screen import _bucket_edges, _screen_edge_column
|
||||
from trader_training.entry_mae_label_diagnostic import diagnose_entry_mae_labels
|
||||
from trader_training.good_trade_structure import _side_frame, _top_fraction_metrics
|
||||
from trader_training.io_utils import read_json, write_json
|
||||
from trader_training.labels import ENTRY_LABEL_METHOD, _path_stats_for_group, build_entry_labels
|
||||
from trader_training.ofi_feature_experiment import l1_snapshot_diff_ofi_quote
|
||||
from trader_training.nonlinear_pm_probe import _entry_side_fit_frame, _exit_metrics, _expanded_threshold_candidates
|
||||
from trader_training.ofi_feature_experiment import _load_entry_dataset, l1_snapshot_diff_ofi_quote
|
||||
from trader_training.promote import promote_artifact_bundle
|
||||
from trader_training.replay import build_splits
|
||||
from trader_training.schemas import FEATURE_ORDER, LATEST_STRESS_SPLIT, MODEL_OUTPUTS, OUTPUT_MAPPING, TRAINING_SPLITS, VALIDATION_LOCKED_SPLIT
|
||||
from trader_training.training import TARGETS, _head_train_mask
|
||||
from trader_training.diagnostics import _label_summary
|
||||
|
||||
|
||||
class TrainingContractTest(unittest.TestCase):
|
||||
@@ -35,6 +44,72 @@ class TrainingContractTest(unittest.TestCase):
|
||||
self.assertEqual(set(fields), set(OUTPUT_MAPPING[model_name]))
|
||||
self.assertEqual([f"prediction[{idx}]" for idx in range(len(fields))], [OUTPUT_MAPPING[model_name][field] for field in fields])
|
||||
|
||||
def test_nonlinear_pm_probe_expands_low_probability_thresholds(self) -> None:
    """The expanded candidate grid must contain two specific low-threshold rows.

    Both expected rows share every setting except ``long_open_prob``, which is
    0.2 in one and 1.01 in the other.
    """
    grid = _expanded_threshold_candidates()

    shared_settings = {
        "short_open_prob": 0.2,
        "min_entry_prob": 0.05,
        "max_market_risk_prob": 0.45,
        "min_expected_edge_bps": -5.0,
        "min_direction_margin": 0.0,
    }
    for long_open_prob in (0.2, 1.01):
        expected_row = {"long_open_prob": long_open_prob, **shared_settings}
        self.assertIn(expected_row, grid)
|
||||
|
||||
def test_nonlinear_entry_tree_probe_can_use_side_opportunity_rows(self) -> None:
    """``_entry_side_fit_frame`` in ``side_opportunity`` mode at a 40.0 threshold.

    With the edges below, the LONG frame is expected to keep s1 and s3 and the
    SHORT frame s2 and s4, regardless of the direction targets.
    """
    direction_labels = pd.DataFrame(
        {
            "sample_id": ["s1", "s2", "s3", "s4"],
            "long_target": [1, 0, 0, 0],
            "short_target": [0, 1, 0, 0],
        }
    )
    entry_rows = pd.DataFrame(
        {
            "sample_id": ["s1", "s2", "s3", "s4"],
            "split_id": ["fit_inner", "fit_inner", "fit_inner", "fit_inner"],
            "long_max_achievable_net_edge_bps": [45.0, 10.0, 65.0, 39.0],
            "short_max_achievable_net_edge_bps": [8.0, 41.0, 15.0, 70.0],
        }
    )

    selected_ids = {
        side: _entry_side_fit_frame(direction_labels, entry_rows, side, "side_opportunity", 40.0)[
            "sample_id"
        ].tolist()
        for side in ("LONG", "SHORT")
    }

    self.assertEqual(["s1", "s3"], selected_ids["LONG"])
    self.assertEqual(["s2", "s4"], selected_ids["SHORT"])
|
||||
|
||||
def test_nonlinear_pm_probe_exit_metrics_describe_trade_outcomes(self) -> None:
    """``_exit_metrics`` on three trades: one target hit, one stop hit, one neither.

    Each outcome rate is expected to be one third; exit times of 300_000,
    600_000 and 2_700_000 ms are expected to give a 20.0 average and a 10.0
    median in the ``*_min`` metrics.
    """
    outcomes = pd.DataFrame(
        {
            "target_hit": [1, 0, 0],
            "stop_hit": [0, 1, 0],
            "time_to_exit_ms": [300_000, 600_000, 2_700_000],
        }
    )

    metrics = _exit_metrics(outcomes)

    one_third = 1 / 3
    for rate_key in ("target_hit_rate", "stop_hit_rate", "timeout_exit_rate"):
        self.assertAlmostEqual(one_third, metrics[rate_key])
    self.assertAlmostEqual(20.0, metrics["avg_time_to_exit_min"])
    self.assertAlmostEqual(10.0, metrics["p50_time_to_exit_min"])
|
||||
|
||||
def test_entry_feature_screen_prefers_actual_plan_edge(self) -> None:
|
||||
dataset = pd.DataFrame(
|
||||
{
|
||||
@@ -48,6 +123,353 @@ class TrainingContractTest(unittest.TestCase):
|
||||
self.assertEqual("long_actual_plan_net_edge_bps", _screen_edge_column(dataset, "LONG"))
|
||||
self.assertEqual("short_actual_plan_net_edge_bps", _screen_edge_column(dataset, "SHORT"))
|
||||
|
||||
def test_entry_feature_screen_requires_actual_plan_edge(self) -> None:
    """``_screen_edge_column`` must raise ``ValueError`` without an actual plan edge column."""
    only_expected_edge = pd.DataFrame({"long_expected_net_edge_bps": [20.0]})

    self.assertRaises(ValueError, _screen_edge_column, only_expected_edge, "LONG")
|
||||
|
||||
def test_entry_regression_heads_train_on_actual_plan_edge(self) -> None:
    """The ENTRY expected-edge heads must map to the actual plan edge columns.

    Each entry of ``TARGETS["ENTRY"]["heads"]`` is read as a sequence whose
    element 0 is the head name and element 2 the mapped column.
    """
    column_by_head = {}
    for head_spec in TARGETS["ENTRY"]["heads"]:
        column_by_head[head_spec[0]] = head_spec[2]

    expected_columns = {
        "long_expected_net_edge_bps": "long_actual_plan_net_edge_bps",
        "short_expected_net_edge_bps": "short_actual_plan_net_edge_bps",
    }
    for head_name, column_name in expected_columns.items():
        self.assertEqual(column_name, column_by_head[head_name])
|
||||
|
||||
def test_conditional_entry_training_uses_direction_label_rows(self) -> None:
    """``_head_train_mask`` with and without ``conditional_entry_direction_labels``.

    With the flag set, long heads are expected to keep rows with
    ``long_target == 1`` and short heads rows with ``short_target == 1``;
    with it cleared, every row is kept.
    """
    fit_rows = pd.DataFrame({"long_target": [1, 0, 1, 0], "short_target": [0, 1, 0, 1]})

    def flags(enabled: bool) -> Namespace:
        # A fresh namespace per call, as in separate CLI invocations.
        return Namespace(conditional_entry_direction_labels=enabled)

    long_mask, long_filter = _head_train_mask("ENTRY", "long_entry_prob", fit_rows, flags(True))
    short_mask, short_filter = _head_train_mask("ENTRY", "short_expected_net_edge_bps", fit_rows, flags(True))
    default_mask, default_filter = _head_train_mask("ENTRY", "long_entry_prob", fit_rows, flags(False))

    self.assertEqual("DIRECTION_LABEL_LONG_FIT_ROWS", long_filter)
    self.assertEqual([True, False, True, False], long_mask.tolist())
    self.assertEqual("DIRECTION_LABEL_SHORT_FIT_ROWS", short_filter)
    self.assertEqual([False, True, False, True], short_mask.tolist())
    self.assertEqual("ALL_FIT_ROWS", default_filter)
    self.assertEqual([True, True, True, True], default_mask.tolist())
|
||||
|
||||
def test_conditional_entry_training_can_use_side_opportunity_rows(self) -> None:
    """``_head_train_mask`` with ``conditional_entry_source="side_opportunity"``.

    At a 40.0 threshold the long head is expected to keep rows 0 and 2 and the
    short head rows 1 and 3; the filter names embed the side and threshold.
    """
    fit_rows = pd.DataFrame(
        {
            "long_max_achievable_net_edge_bps": [45.0, 10.0, 60.0, 39.0],
            "short_max_achievable_net_edge_bps": [8.0, 42.0, 15.0, 80.0],
        }
    )
    options = Namespace(
        conditional_entry_source="side_opportunity",
        conditional_entry_direction_labels=False,
        conditional_entry_opportunity_bps=40.0,
    )

    long_result = _head_train_mask("ENTRY", "long_entry_prob", fit_rows, options)
    short_result = _head_train_mask("ENTRY", "short_expected_net_edge_bps", fit_rows, options)
    long_mask, long_filter = long_result
    short_mask, short_filter = short_result

    self.assertEqual("SIDE_OPPORTUNITY_LONG_GE_40_BPS_FIT_ROWS", long_filter)
    self.assertEqual([True, False, True, False], long_mask.tolist())
    self.assertEqual("SIDE_OPPORTUNITY_SHORT_GE_40_BPS_FIT_ROWS", short_filter)
    self.assertEqual([False, True, False, True], short_mask.tolist())
|
||||
|
||||
def test_direction_opportunity_labels_choose_clear_path_opportunity(self) -> None:
    """``_opportunity_labels`` on five long/short edge pairs, including a NaN long edge.

    Called with ``opportunity_bps=40.0`` and ``min_advantage_bps=5.0``; the
    expected long/short/neutral targets per row are listed below.
    """
    long_edges = np.array([45.0, 10.0, 45.0, 42.0, np.nan])
    short_edges = np.array([20.0, 50.0, 43.0, 48.0, 50.0])

    labels = _opportunity_labels(
        long_edges,
        short_edges,
        opportunity_bps=40.0,
        min_advantage_bps=5.0,
    )

    expected_targets = {
        "long_target": [1, 0, 0, 0, 0],
        "short_target": [0, 1, 0, 1, 1],
        "neutral_target": [0, 0, 1, 0, 0],
    }
    for column, expected_values in expected_targets.items():
        self.assertEqual(expected_values, labels[column].tolist())
|
||||
|
||||
def test_diagnostics_reads_actual_training_dataset_labels(self) -> None:
    """``_label_summary`` must summarise the parquet files under ``dataset/``.

    Two ``fit_inner`` rows are written to the direction and entry training
    datasets; the summary is expected to name those files as its sources and
    report the label ratios computed from them.
    """
    with tempfile.TemporaryDirectory() as tmp:
        run_root = Path(tmp)
        dataset_dir = run_root / "dataset"
        dataset_dir.mkdir(parents=True)

        direction_rows = pd.DataFrame(
            {
                "sample_id": ["s1", "s2"],
                "split_id": ["fit_inner", "fit_inner"],
                "long_target": [1, 0],
                "short_target": [0, 0],
                "neutral_target": [0, 1],
                "future_return_bps": [5.0, -1.0],
            }
        )
        direction_rows.to_parquet(dataset_dir / "direction_train.parquet", index=False)

        entry_rows = pd.DataFrame(
            {
                "sample_id": ["s1", "s2"],
                "split_id": ["fit_inner", "fit_inner"],
                "long_entry_target": [1, 0],
                "short_entry_target": [0, 1],
                "long_actual_plan_net_edge_bps": [8.0, -6.0],
                "short_actual_plan_net_edge_bps": [-5.0, 7.0],
            }
        )
        entry_rows.to_parquet(dataset_dir / "entry_train.parquet", index=False)

        fit_summary = _label_summary(run_root)["fit_inner"]

        self.assertEqual("dataset/direction_train.parquet", fit_summary["direction"]["source"])
        self.assertEqual({"LONG": 0.5, "SHORT": 0.0, "NEUTRAL": 0.5}, fit_summary["direction"]["label_ratio"])
        self.assertEqual("dataset/entry_train.parquet", fit_summary["entry"]["source"])
        self.assertEqual(0.5, fit_summary["entry"]["target_rate_by_side"]["LONG"])
|
||||
|
||||
def test_good_trade_structure_builds_side_frame_and_top_metrics(self) -> None:
    """``_side_frame`` labels good/bad trades and ``_top_fraction_metrics`` scores the top slice.

    With thresholds of 3.0 / -3.0 on the LONG edges (4.0, -5.0, 1.0), only the
    first row is good and only the second is bad. Selecting the top third by
    the scores below is expected to pick the first row alone.
    """
    columns = {
        "sample_id": ["s1", "s2", "s3"],
        "split_id": ["fit_inner", "fit_inner", "fit_inner"],
        "long_actual_plan_net_edge_bps": [4.0, -5.0, 1.0],
        "short_actual_plan_net_edge_bps": [-5.0, 6.0, -1.0],
    }
    columns.update({feature: [0.1, 0.2, 0.3] for feature in FEATURE_ORDER})
    dataset = pd.DataFrame(columns)

    long_frame = _side_frame(dataset, "LONG", min_good_edge_bps=3.0, bad_edge_bps=-3.0)
    scores = np.array([0.9, 0.1, 0.2])
    top_metrics = _top_fraction_metrics(long_frame, scores, 1 / 3)

    self.assertEqual([1, 0, 0], long_frame["good_trade"].tolist())
    self.assertEqual([0, 1, 0], long_frame["bad_trade"].tolist())
    self.assertEqual(1, top_metrics["rows"])
    self.assertEqual(1.0, top_metrics["good_rate"])
    self.assertEqual(4.0, top_metrics["avg_edge_bps"])
|
||||
|
||||
def test_entry_feature_screen_keeps_zero_inflated_event_features(self) -> None:
    """``_bucket_edges`` on 5000 zeros followed by 500 values from 1.0 to 100.0.

    At least three edges are expected, with infinite outer bounds.
    """
    zero_block = np.zeros(5000)
    event_block = np.linspace(1.0, 100.0, 500)

    edges = _bucket_edges(np.concatenate((zero_block, event_block)))

    self.assertGreaterEqual(len(edges), 3)
    lowest_edge = edges[0]
    self.assertEqual(-np.inf, lowest_edge)
    highest_edge = edges[-1]
    self.assertEqual(np.inf, highest_edge)
|
||||
|
||||
def test_entry_condition_pair_screen_finds_stable_two_feature_filter(self) -> None:
    """End-to-end run of ``screen_entry_condition_pairs`` on a synthetic dataset.

    Every split in ``TRAINING_SPLITS`` gets 1200 rows in which LONG trades are
    good exactly when both ``ret_1m_bps`` and ``ret_5m_bps`` exceed 0.9. The
    written result JSON and candidates CSV are then checked.
    """
    rows_per_split = 1200
    ramp = np.linspace(0.0, 0.999, rows_per_split)

    def build_split(split_id):
        # All features start at zero; the two return features carry the ramp.
        part = pd.DataFrame({feature: 0.0 for feature in FEATURE_ORDER}, index=np.arange(rows_per_split))
        part["split_id"] = split_id
        part["ret_1m_bps"] = ramp
        part["ret_5m_bps"] = ramp
        is_good = (part["ret_1m_bps"] > 0.9) & (part["ret_5m_bps"] > 0.9)
        part["long_entry_target"] = is_good.astype(int)
        part["short_entry_target"] = 0
        part["long_actual_plan_net_edge_bps"] = np.where(is_good, 8.0, -6.0)
        part["short_actual_plan_net_edge_bps"] = -6.0
        part["long_mae_bps"] = np.where(is_good, 2.0, 15.0)
        part["short_mae_bps"] = 15.0
        return part

    with tempfile.TemporaryDirectory() as tmp:
        data_root = Path(tmp)
        run_root = data_root / "trader-v4" / "runs" / "unit-condition-pair"
        dataset_path = run_root / "dataset" / "entry_train.parquet"
        dataset_path.parent.mkdir(parents=True)

        combined = pd.concat([build_split(split_id) for split_id in TRAINING_SPLITS], ignore_index=True)
        combined.to_parquet(dataset_path, index=False)

        options = Namespace(
            data_root=data_root,
            run_id="unit-condition-pair",
            min_seed_rows=50,
            min_pair_rows=50,
            max_seed_conditions_per_side=8,
            max_buckets_per_feature=2,
        )
        screen_entry_condition_pairs(options)

        diagnostics_dir = run_root / "diagnostics"
        result = read_json(diagnostics_dir / "entry_condition_pair_screen_result.json")
        candidates = pd.read_csv(diagnostics_dir / "entry_condition_pair_candidates.csv")

        self.assertGreater(result["stable_candidate_count"], 0)
        self.assertTrue(candidates["usable_candidate"].any())
        top_candidate = candidates.iloc[0]
        self.assertEqual("LONG", top_candidate["side"])
        self.assertGreater(float(top_candidate["min_eval_edge_bps"]), 0.0)
|
||||
|
||||
def test_entry_mae_label_diagnostic_finds_low_drawdown_target(self) -> None:
    """End-to-end run of ``diagnose_entry_mae_labels`` on a synthetic dataset.

    Every split in ``TRAINING_SPLITS`` gets 800 rows in which LONG trades are
    good (positive edge, MAE of 2.0) exactly when ``ret_1m_bps`` exceeds 0.85.
    The written result JSON and candidates CSV are then checked.
    """
    rows_per_split = 800
    ramp = np.linspace(0.0, 0.999, rows_per_split)

    def build_split(split_id):
        # All features start at zero; ``ret_1m_bps`` alone carries the signal.
        part = pd.DataFrame({feature: 0.0 for feature in FEATURE_ORDER}, index=np.arange(rows_per_split))
        part["split_id"] = split_id
        part["ret_1m_bps"] = ramp
        is_good = part["ret_1m_bps"] > 0.85
        part["long_entry_target"] = is_good.astype(int)
        part["short_entry_target"] = 0
        part["long_actual_plan_net_edge_bps"] = np.where(is_good, 9.0, -6.0)
        part["short_actual_plan_net_edge_bps"] = -6.0
        part["long_max_achievable_net_edge_bps"] = np.where(is_good, 18.0, 2.0)
        part["short_max_achievable_net_edge_bps"] = 2.0
        part["long_mae_bps"] = np.where(is_good, 2.0, 15.0)
        part["short_mae_bps"] = 15.0
        return part

    with tempfile.TemporaryDirectory() as tmp:
        data_root = Path(tmp)
        run_root = data_root / "trader-v4" / "runs" / "unit-mae-diagnostic"
        dataset_path = run_root / "dataset" / "entry_train.parquet"
        dataset_path.parent.mkdir(parents=True)

        combined = pd.concat([build_split(split_id) for split_id in TRAINING_SPLITS], ignore_index=True)
        combined.to_parquet(dataset_path, index=False)

        options = Namespace(
            data_root=data_root,
            run_id="unit-mae-diagnostic",
            max_mae_bps=(4.0,),
            min_opportunity_bps=(12.0,),
            model_families=("linear",),
            top_fraction=0.10,
            max_train_rows=0,
        )
        diagnose_entry_mae_labels(options)

        diagnostics_dir = run_root / "diagnostics"
        result = read_json(diagnostics_dir / "entry_mae_label_diagnostic_result.json")
        candidates = pd.read_csv(diagnostics_dir / "entry_mae_label_diagnostic_candidates.csv")

        self.assertGreater(result["positive_top_edge_candidate_count"], 0)
        top_candidate = candidates.iloc[0]
        self.assertEqual("LONG", top_candidate["side"])
        self.assertTrue(bool(top_candidate["stable_top_edge_positive"]))
|
||||
|
||||
def test_conditional_entry_probe_finds_positive_oracle_direction_subset(self) -> None:
    """Check that the conditional Entry probe finds a stable positive subset.

    Every training split gets 900 synthetic rows. LONG rows with
    ``ret_1m_bps`` above 0.50 have a 40 bps achievable edge (so they pass the
    20 bps condition), and those above 0.85 also have a positive plan edge
    (10.0). The probe must report at least one stable-positive candidate, and
    the first candidate in the CSV must be stable with a positive minimum top
    edge.
    """
    run_id = "unit-conditional-entry"
    rows_per_split = 900
    signal = np.linspace(0.0, 0.999, rows_per_split)

    def build_split(split_id):
        # All features are zero except ret_1m_bps, which carries the signal.
        part = pd.DataFrame(dict.fromkeys(FEATURE_ORDER, 0.0), index=np.arange(rows_per_split))
        part["split_id"] = split_id
        part["ret_1m_bps"] = signal
        profitable = part["ret_1m_bps"] > 0.85
        has_opportunity = part["ret_1m_bps"] > 0.50
        part["long_actual_plan_net_edge_bps"] = np.where(profitable, 10.0, -6.0)
        part["short_actual_plan_net_edge_bps"] = -6.0
        part["long_max_achievable_net_edge_bps"] = np.where(has_opportunity, 40.0, 2.0)
        part["short_max_achievable_net_edge_bps"] = 2.0
        return part

    with tempfile.TemporaryDirectory() as tmp:
        data_root = Path(tmp)
        run_dir = data_root / "trader-v4" / "runs" / run_id
        dataset_path = run_dir / "dataset" / "entry_train.parquet"
        dataset_path.parent.mkdir(parents=True)

        dataset = pd.concat([build_split(split_id) for split_id in TRAINING_SPLITS], ignore_index=True)
        dataset.to_parquet(dataset_path, index=False)

        probe_args = Namespace(
            data_root=data_root,
            run_id=run_id,
            condition_opportunity_bps=(20.0,),
            target_edge_bps=(0.0,),
            model_families=("linear",),
            top_fractions=(0.10,),
            max_train_rows=0,
            min_train_rows=50,
            min_eval_rows=50,
        )
        probe_conditional_entry_training(probe_args)

        diagnostics_dir = run_dir / "diagnostics"
        result = read_json(diagnostics_dir / "conditional_entry_probe_result.json")
        candidates = pd.read_csv(diagnostics_dir / "conditional_entry_probe_candidates.csv")

        self.assertGreater(result["stable_positive_count"], 0)
        leader = candidates.iloc[0]
        self.assertTrue(leader["stable_positive"])
        self.assertGreater(float(leader["min_top_edge_bps"]), 0.0)
|
||||
|
||||
def test_dynamic_exit_search_writes_plan_diagnostics(self) -> None:
    """Check that the dynamic exit search writes its result JSON and report.

    Four feature rows (one per split) and seven one-minute replay bars are
    written, then the search runs over a single parameter combination. The
    result must name ``DYNAMIC_TRAILING_V1`` as the best plan method, count
    exactly one candidate, and the markdown report must exist.
    """
    run_id = "unit-dynamic-exit"
    output_dir_name = "dynamic-exit-search"
    symbol = "BTC-USDT-PERP"
    minute_ms = 60_000
    feature_rows = 4
    replay_rows = 7

    with tempfile.TemporaryDirectory() as tmp:
        data_root = Path(tmp)
        run_dir = data_root / "trader-v4" / "runs" / run_id
        feature_path = run_dir / "feature" / "feature_frame.parquet"
        replay_path = run_dir / "replay" / "replay_1m.parquet"
        config_path = data_root / "label_config.json"
        for parquet_path in (feature_path, replay_path):
            parquet_path.parent.mkdir(parents=True)

        bar_times = pd.date_range("2026-01-01", periods=replay_rows, freq="min", tz="UTC")

        feature_frame = pd.DataFrame(
            {
                "sample_id": [f"s{i}" for i in range(feature_rows)],
                "symbol": [symbol] * feature_rows,
                "event_time": bar_times[:feature_rows],
                "open_time_ms": [minute * minute_ms for minute in range(feature_rows)],
                "split_id": ["tune_inner", "validation_locked", "latest_stress", "fit_inner"],
                "walk_forward_fold": [0] * feature_rows,
                "data_quality_flag": ["OK"] * feature_rows,
            }
        )
        feature_frame.to_parquet(feature_path, index=False)

        replay_frame = pd.DataFrame(
            {
                "event_time": bar_times,
                "open_time_ms": [minute * minute_ms for minute in range(replay_rows)],
                "symbol": [symbol] * replay_rows,
                "open": [100.0] * replay_rows,
                "high": [100.0, 100.12, 100.22, 100.24, 100.24, 100.24, 100.24],
                "low": [100.0, 100.00, 100.00, 100.00, 100.00, 100.00, 100.00],
                "close": [100.0, 100.10, 100.18, 100.20, 100.20, 100.20, 100.20],
                "spread_bps": [1.0] * replay_rows,
            }
        )
        replay_frame.to_parquet(replay_path, index=False)

        write_json(config_path, {"entry": {"min_expected_net_edge_bps": 3.0}})

        search_args = Namespace(
            data_root=data_root,
            run_id=run_id,
            feature_path=feature_path,
            replay_path=replay_path,
            label_config_path=config_path,
            cost_config_path=None,
            horizons=(3,),
            targets=(10.0,),
            stops=(5.0,),
            trailing_stops=(4.0,),
            second_target_multipliers=(2.0,),
            take1_ratios=(0.5,),
            take2_ratios=(0.25,),
            output_dir_name=output_dir_name,
        )
        search_dynamic_exit_plans(search_args)

        output_dir = run_dir / output_dir_name
        result = read_json(output_dir / "dynamic_exit_search_result.json")
        self.assertEqual("DYNAMIC_TRAILING_V1", result["best_plan"]["plan_method"])
        self.assertEqual(1, result["candidate_count"])
        self.assertTrue((output_dir / "dynamic_exit_search_report.md").is_file())
|
||||
|
||||
def test_ofi_entry_dataset_uses_actual_plan_edge(self) -> None:
    """Check that the loaded entry dataset carries the actual plan edge columns.

    A one-row ``entry_train.parquet`` is written under a temporary baseline
    root and loaded together with a one-row feature frame. The loaded dataset
    must contain ``long_actual_plan_net_edge_bps`` with value 4.0 and must not
    contain ``long_expected_net_edge_bps``.
    """
    entry_rows = pd.DataFrame(
        {
            "sample_id": ["s1"],
            "long_entry_target": [1],
            "short_entry_target": [0],
            "long_actual_plan_net_edge_bps": [4.0],
            "short_actual_plan_net_edge_bps": [-7.0],
        }
    )
    feature = pd.DataFrame(
        {
            "sample_id": ["s1"],
            "symbol": ["BTC-USDT-PERP"],
            "event_time": pd.to_datetime(["2026-01-01T00:00:00Z"]),
            "open_time_ms": [0],
            "split_id": ["fit_inner"],
            "walk_forward_fold": [0],
            "data_quality_flag": ["OK"],
        }
    )

    with tempfile.TemporaryDirectory() as tmp:
        baseline_root = Path(tmp)
        dataset_path = baseline_root / "dataset" / "entry_train.parquet"
        dataset_path.parent.mkdir(parents=True)
        entry_rows.to_parquet(dataset_path, index=False)

        dataset = _load_entry_dataset(baseline_root, feature)

        loaded_columns = dataset.columns
        self.assertIn("long_actual_plan_net_edge_bps", loaded_columns)
        self.assertNotIn("long_expected_net_edge_bps", loaded_columns)
        self.assertEqual(4.0, float(dataset.loc[0, "long_actual_plan_net_edge_bps"]))
|
||||
|
||||
def test_split_builder_uses_locked_validation_contract(self) -> None:
|
||||
with tempfile.TemporaryDirectory() as tmp:
|
||||
data_root = Path(tmp)
|
||||
|
||||
@@ -0,0 +1,337 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import brier_score_loss, roc_auc_score
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
from trader_training.entry_feature_screen import _markdown_table
|
||||
from trader_training.io_utils import read_parquet, run_root, write_json, write_text
|
||||
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
||||
|
||||
|
||||
# Splits every fitted probe model is scored on; the fit split is used for training only.
EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
||||
|
||||
|
||||
def probe_conditional_entry_training(args: Any) -> None:
    """Fit diagnostic Entry classifiers on an opportunity-filtered population.

    Reads ``dataset/entry_train.parquet`` under the run root. For each side
    (LONG, SHORT) and each opportunity threshold, the fit split is restricted
    to rows whose ``<side>_max_achievable_net_edge_bps`` is at least the
    threshold. For each target edge a binary label
    ``<side>_actual_plan_net_edge_bps >= target`` is built, each model family
    is fitted, and the model is scored on every split in ``EVAL_SPLITS`` under
    the same opportunity condition. Results are written to the run's
    ``diagnostics`` directory as a JSON summary, a metrics CSV, a candidates
    CSV and a markdown report.

    The opportunity filter uses future information, so the output is
    diagnostic only (the result JSON carries that warning).

    Args:
        args: Namespace-like object. Reads ``run_id`` plus whatever
            ``run_root`` needs, and the optional tuning attributes
            ``condition_opportunity_bps``, ``target_edge_bps``,
            ``model_families``, ``top_fractions``, ``max_train_rows``,
            ``min_train_rows`` (default 1000) and ``min_eval_rows``
            (default 500). Falsy values fall back to the built-in defaults.

    Raises:
        ValueError: If required columns are missing, a feature value is
            non-finite, or a model family is unsupported.
    """
    root = run_root(args)
    dataset = read_parquet(root / "dataset" / "entry_train.parquet")
    _require_columns(dataset)

    condition_opportunities = tuple(float(item) for item in (args.condition_opportunity_bps or (6.0, 12.0, 20.0, 40.0, 60.0)))
    target_edges = tuple(float(item) for item in (args.target_edge_bps or (0.0, 3.0)))
    model_families = tuple(str(item).strip().lower() for item in (args.model_families or ("linear", "tree")) if str(item).strip())
    top_fractions = tuple(float(item) for item in (args.top_fractions or (0.01, 0.02, 0.05, 0.10)))
    max_train_rows = int(args.max_train_rows or 0)
    # Row-count floors are parsed once instead of on every loop iteration.
    min_train_rows = int(args.min_train_rows or 1000)
    min_eval_rows = int(getattr(args, "min_eval_rows", None) or 500)

    split_ids = dataset["split_id"]
    is_fit_row = split_ids.eq(FIT_SPLIT)

    rows: list[dict[str, Any]] = []
    skipped: list[dict[str, Any]] = []
    for side in ("LONG", "SHORT"):
        prefix = side.lower()
        actual_edge_col = f"{prefix}_actual_plan_net_edge_bps"
        opportunity_col = f"{prefix}_max_achievable_net_edge_bps"
        # The numeric view of the opportunity column only depends on the side.
        opportunity = pd.to_numeric(dataset[opportunity_col], errors="coerce")
        for condition_opportunity_bps in condition_opportunities:
            in_condition = opportunity >= condition_opportunity_bps
            fit_frame = dataset.loc[is_fit_row & in_condition].copy()
            if max_train_rows > 0 and len(fit_frame) > max_train_rows:
                # Keep the most recent rows when a timestamp is available.
                if "event_time" in fit_frame.columns:
                    fit_frame = fit_frame.sort_values("event_time")
                fit_frame = fit_frame.tail(max_train_rows).copy()
            if len(fit_frame) < min_train_rows:
                skipped.append(
                    {
                        "side": side,
                        "condition_opportunity_bps": condition_opportunity_bps,
                        "reason": "NOT_ENOUGH_TRAIN_ROWS",
                        "train_rows": int(len(fit_frame)),
                    }
                )
                continue
            x_train = _x(fit_frame)
            fit_edge = pd.to_numeric(fit_frame[actual_edge_col], errors="coerce")
            # Eval inputs depend only on (side, condition, split), so they are
            # built on first use and shared across target edges and model
            # families. A value of None marks a split that is too small.
            eval_cache: dict[str, Any] = {}
            for target_edge_bps in target_edges:
                y_train = (fit_edge >= target_edge_bps).astype(int).to_numpy()
                if len(np.unique(y_train)) < 2:
                    skipped.append(
                        {
                            "side": side,
                            "condition_opportunity_bps": condition_opportunity_bps,
                            "target_edge_bps": target_edge_bps,
                            "reason": "ONE_CLASS_TRAIN",
                            "train_rows": int(len(fit_frame)),
                            "train_positive_rate": float(y_train.mean()) if len(y_train) else 0.0,
                        }
                    )
                    continue
                train_positive_rate = float(y_train.mean())
                for model_family in model_families:
                    model, scaler = _fit_model(model_family, x_train, y_train)
                    for split_id in EVAL_SPLITS:
                        if split_id not in eval_cache:
                            eval_frame = dataset.loc[split_ids.eq(split_id) & in_condition].copy()
                            if len(eval_frame) < min_eval_rows:
                                # Record the dropped split so a missing column in
                                # the candidates table can be explained.
                                skipped.append(
                                    {
                                        "side": side,
                                        "condition_opportunity_bps": condition_opportunity_bps,
                                        "split_id": split_id,
                                        "reason": "NOT_ENOUGH_EVAL_ROWS",
                                        "eval_rows": int(len(eval_frame)),
                                    }
                                )
                                eval_cache[split_id] = None
                            else:
                                eval_cache[split_id] = (
                                    eval_frame,
                                    _x(eval_frame),
                                    pd.to_numeric(eval_frame[actual_edge_col], errors="coerce"),
                                )
                        cached = eval_cache[split_id]
                        if cached is None:
                            continue
                        eval_frame, x_eval, eval_edge = cached
                        y_true = (eval_edge >= target_edge_bps).astype(int).to_numpy()
                        proba = _predict(model_family, model, scaler, x_eval)
                        rows.extend(
                            _metric_row(
                                eval_frame,
                                y_true,
                                proba,
                                side,
                                model_family,
                                split_id,
                                condition_opportunity_bps,
                                target_edge_bps,
                                top_fraction,
                                actual_edge_col,
                                train_positive_rate,
                                len(fit_frame),
                            )
                            for top_fraction in top_fractions
                        )
                    logging.info(
                        "trader.training.conditional_entry_probe_fitted side=%s conditionOpportunityBps=%s targetEdgeBps=%s modelFamily=%s trainRows=%s trainPositiveRate=%.6f",
                        side,
                        condition_opportunity_bps,
                        target_edge_bps,
                        model_family,
                        len(fit_frame),
                        train_positive_rate,
                    )

    metrics = pd.DataFrame(rows)
    candidates = _select_candidates(metrics)
    result = {
        "run_id": args.run_id,
        "purpose": "diagnostic_only_not_exported",
        "warning": "condition_opportunity_bps is an oracle future filter; use this only to decide whether conditional Entry training is worth implementing",
        "feature_count": len(FEATURE_ORDER),
        "condition_opportunity_bps": list(condition_opportunities),
        "target_edge_bps": list(target_edges),
        "model_families": list(model_families),
        "top_fractions": list(top_fractions),
        "max_train_rows": max_train_rows,
        "metric_count": int(len(metrics)),
        "candidate_count": int(len(candidates)),
        "stable_positive_count": int(candidates["stable_positive"].sum()) if not candidates.empty else 0,
        "skipped": skipped,
    }
    out_dir = root / "diagnostics"
    write_json(out_dir / "conditional_entry_probe_result.json", result)
    write_text(out_dir / "conditional_entry_probe_metrics.csv", metrics.to_csv(index=False))
    write_text(out_dir / "conditional_entry_probe_candidates.csv", candidates.to_csv(index=False))
    write_text(out_dir / "conditional_entry_probe_report.md", _markdown_report(result, candidates))
    logging.info(
        "trader.training.conditional_entry_probe_written runId=%s metricCount=%s candidateCount=%s stablePositiveCount=%s reportPath=%s",
        args.run_id,
        len(metrics),
        len(candidates),
        result["stable_positive_count"],
        out_dir / "conditional_entry_probe_report.md",
    )
|
||||
|
||||
|
||||
def _require_columns(dataset: pd.DataFrame) -> None:
    """Raise ``ValueError`` unless the dataset has every column the probe reads.

    The required set is ``split_id``, every name in ``FEATURE_ORDER``, and for
    both sides the ``*_actual_plan_net_edge_bps`` and
    ``*_max_achievable_net_edge_bps`` columns. The error lists the missing
    names in sorted order.
    """
    edge_columns = [
        f"{side}_{suffix}"
        for side in ("long", "short")
        for suffix in ("actual_plan_net_edge_bps", "max_achievable_net_edge_bps")
    ]
    expected = {"split_id", *FEATURE_ORDER, *edge_columns}
    absent = sorted(expected - set(dataset.columns))
    if absent:
        raise ValueError(f"conditional entry probe missing required columns: {absent}")
|
||||
|
||||
|
||||
def _x(frame: pd.DataFrame) -> np.ndarray:
    """Return the ``FEATURE_ORDER`` columns of ``frame`` as a float32 matrix.

    Values are coerced to numeric (unparseable entries become NaN) and cast to
    float32. Finiteness is checked after the cast, so NaN, +/-inf, and finite
    float64 values that overflow to inf in float32 are all rejected rather
    than passed on to a model.

    Raises:
        ValueError: If any feature column holds a non-finite value; the
            message lists the affected columns in ``FEATURE_ORDER`` order.
    """
    values = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").astype("float32")
    matrix = values.to_numpy(dtype="float32")
    finite = np.isfinite(matrix)
    if not finite.all():
        missing = values.columns[~finite.all(axis=0)].tolist()
        raise ValueError(f"conditional entry probe found non-finite feature values: {missing}")
    return matrix
|
||||
|
||||
|
||||
def _fit_model(model_family: str, x_train: np.ndarray, y_train: np.ndarray) -> tuple[Any, StandardScaler | None]:
    """Fit the classifier for ``model_family`` and return ``(model, scaler)``.

    ``"linear"`` fits a class-balanced logistic regression on standardised
    features and returns the fitted scaler alongside it. ``"tree"`` fits a
    histogram gradient-boosting classifier on the raw features and returns
    ``None`` as the scaler.

    Raises:
        ValueError: If ``model_family`` is neither ``"linear"`` nor ``"tree"``.
    """
    if model_family not in ("linear", "tree"):
        raise ValueError(f"unsupported model family: {model_family}")

    if model_family == "tree":
        booster = HistGradientBoostingClassifier(
            max_iter=120,
            learning_rate=0.04,
            max_leaf_nodes=31,
            l2_regularization=0.02,
            early_stopping=True,
            random_state=31,
        )
        booster.fit(x_train, y_train)
        return booster, None

    scaler = StandardScaler()
    standardised = scaler.fit_transform(x_train)
    classifier = LogisticRegression(max_iter=500, class_weight="balanced")
    classifier.fit(standardised, y_train)
    return classifier, scaler
|
||||
|
||||
|
||||
def _predict(model_family: str, model: Any, scaler: StandardScaler | None, x: np.ndarray) -> np.ndarray:
    """Return the second ``predict_proba`` column of ``model`` for ``x``.

    For the ``"linear"`` family the features are first passed through
    ``scaler.transform``; every other family is scored on ``x`` as given.

    Raises:
        ValueError: If ``model_family`` is ``"linear"`` and ``scaler`` is None.
    """
    features = x
    if model_family == "linear":
        if scaler is None:
            raise ValueError("linear model missing scaler")
        features = scaler.transform(x)
    return model.predict_proba(features)[:, 1]
|
||||
|
||||
|
||||
def _metric_row(
    frame: pd.DataFrame,
    y_true: np.ndarray,
    proba: np.ndarray,
    side: str,
    model_family: str,
    split_id: str,
    condition_opportunity_bps: float,
    target_edge_bps: float,
    top_fraction: float,
    actual_edge_col: str,
    train_positive_rate: float,
    train_rows: int,
) -> dict[str, Any]:
    """Build one metrics record for a model scored on one eval split.

    Rows are ranked by descending ``proba`` and the top
    ``max(1, int(len(frame) * top_fraction))`` are taken as the "top" bucket.
    The record holds the identifying keys, Brier scores for the model and for
    a constant predictor at the clipped training positive rate, the mean
    ``actual_edge_col`` for all rows and for the top bucket, the top-bucket
    probability range, the AUC (NaN unless both classes are present) and two
    derived fields: ``top_edge_lift_bps`` and ``brier_beats_constant``.
    Count-dependent metrics fall back to 0.0 when the inputs are empty.
    """
    has_labels = len(y_true) > 0
    has_scores = len(proba) > 0

    ranking = np.argsort(-proba)
    top_n = max(1, int(len(frame) * top_fraction))
    top_positions = ranking[:top_n]
    top = frame.iloc[top_positions]
    # Baseline predictor: always output the (clipped) training positive rate.
    baseline = np.full(len(y_true), np.clip(train_positive_rate, 1e-6, 1 - 1e-6))

    if has_labels:
        positive_rate = float(y_true.mean())
        brier = float(brier_score_loss(y_true, proba))
        constant_brier = float(brier_score_loss(y_true, baseline))
    else:
        positive_rate = 0.0
        brier = 0.0
        constant_brier = 0.0

    top_edge = top[actual_edge_col]
    all_edge_mean = float(frame[actual_edge_col].mean())
    top_edge_mean = float(top_edge.mean())

    if has_scores:
        top_scores = proba[top_positions]
        top_probability_min = float(top_scores.min())
        top_probability_max = float(top_scores.max())
    else:
        top_probability_min = 0.0
        top_probability_max = 0.0

    if len(np.unique(y_true)) == 2:
        auc = float(roc_auc_score(y_true, proba))
    else:
        auc = np.nan

    return {
        "side": side,
        "model_family": model_family,
        "split_id": split_id,
        "condition_opportunity_bps": condition_opportunity_bps,
        "target_edge_bps": target_edge_bps,
        "top_fraction": top_fraction,
        "train_rows": int(train_rows),
        "train_positive_rate": train_positive_rate,
        "row_count": int(len(frame)),
        "positive_rate": positive_rate,
        "brier": brier,
        "constant_brier": constant_brier,
        "top_rows": int(len(top)),
        "top_positive_rate": float((top_edge >= target_edge_bps).mean()),
        "all_actual_edge_bps": all_edge_mean,
        "top_actual_edge_bps": top_edge_mean,
        "top_probability_min": top_probability_min,
        "top_probability_max": top_probability_max,
        "auc": auc,
        "top_edge_lift_bps": top_edge_mean - all_edge_mean,
        "brier_beats_constant": bool(brier < constant_brier),
    }
|
||||
|
||||
|
||||
def _select_candidates(metrics: pd.DataFrame) -> pd.DataFrame:
    """Pivot per-split metrics into one ranked row per candidate configuration.

    A candidate is one combination of side, model family, opportunity
    condition, target edge and top fraction. Its tune-split metrics are the
    base row; the locked-validation and latest-stress metrics are left-joined
    next to it with the split id as column prefix. Summary columns and a
    ranking ``score`` are then added and the result is sorted by score,
    best first.

    ``min_top_edge_bps`` and ``mean_top_edge_bps`` are computed with
    ``skipna=False``: a candidate that lacks a top edge for any of the three
    splits gets NaN there and therefore the ``-999`` penalty in ``score``.
    With pandas' default NaN-skipping, such a candidate would be scored on
    the surviving splits only and could outrank fully evaluated candidates.

    Args:
        metrics: One row per (candidate, split) as produced by ``_metric_row``.

    Returns:
        The ranked candidate table, or an empty frame when ``metrics`` is empty.
    """
    if metrics.empty:
        return pd.DataFrame()

    key_columns = ["side", "model_family", "condition_opportunity_bps", "target_edge_bps", "top_fraction"]
    split_columns = [
        "row_count",
        "positive_rate",
        "auc",
        "brier_beats_constant",
        "top_rows",
        "top_positive_rate",
        "all_actual_edge_bps",
        "top_actual_edge_bps",
        "top_edge_lift_bps",
    ]

    def split_view(split_id: str, prefix: str, leading: list[str]) -> pd.DataFrame:
        # Per-split metrics with prefixed names; row_count becomes "<prefix>_rows".
        renames = {column: f"{prefix}_{column}" for column in split_columns}
        renames["row_count"] = f"{prefix}_rows"
        selected = metrics.loc[metrics["split_id"].eq(split_id), key_columns + leading + split_columns]
        return selected.rename(columns=renames)

    candidates = split_view(TUNE_SPLIT, "tune", ["train_rows", "train_positive_rate"])
    for split_id in (VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT):
        candidates = candidates.merge(split_view(split_id, split_id, []), on=key_columns, how="left")

    prefixes = ("tune", VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
    top_edge_columns = [f"{prefix}_top_actual_edge_bps" for prefix in prefixes]
    auc_columns = [f"{prefix}_auc" for prefix in prefixes]
    lift_columns = [f"{prefix}_top_edge_lift_bps" for prefix in prefixes]

    top_edges = candidates[top_edge_columns]
    # skipna=False: a missing split must not be ignored when ranking.
    candidates["min_top_edge_bps"] = top_edges.min(axis=1, skipna=False)
    candidates["mean_top_edge_bps"] = top_edges.mean(axis=1, skipna=False)
    candidates["min_auc"] = candidates[auc_columns].min(axis=1)
    candidates["stable_positive"] = top_edges.gt(0.0).all(axis=1)
    candidates["stable_lift"] = candidates[lift_columns].gt(0.0).all(axis=1)
    candidates["score"] = (
        candidates["min_top_edge_bps"].fillna(-999.0)
        + candidates["mean_top_edge_bps"].fillna(-999.0) * 0.25
        + candidates["stable_positive"].astype(float) * 2.0
    )
    return candidates.sort_values("score", ascending=False).reset_index(drop=True)
|
||||
|
||||
|
||||
def _markdown_report(result: dict[str, Any], candidates: pd.DataFrame) -> str:
    """Render the conditional Entry probe summary as a markdown document.

    The report opens with a title, a note that the probe is diagnostic only,
    and a bullet list of the run settings taken from ``result``. When
    ``candidates`` is empty it ends with a "no candidates" section; otherwise
    it shows the first 30 candidates as a table followed by a list of the
    CSV files written next to the report.
    """
    settings = {
        "run_id": result["run_id"],
        "特征数": result["feature_count"],
        "条件机会阈值": ",".join(str(item) for item in result["condition_opportunity_bps"]),
        "目标真实收益阈值": ",".join(str(item) for item in result["target_edge_bps"]),
        "模型类型": ",".join(result["model_families"]),
        "top 档位": ",".join(str(item) for item in result["top_fractions"]),
        "候选数": result["candidate_count"],
        "三段 top 真实收益都转正": result["stable_positive_count"],
    }
    report = [
        "# 条件化 Entry 训练诊断报告",
        "",
        "这份报告只做诊断,不导出上线模型。它先用未来机会做过滤,模拟“Direction 已经筛过一层”的训练人群。",
        "",
        "**注意:这里的过滤条件用了未来机会,不能直接上线,只能判断条件化 Entry 训练是否值得做。**",
        "",
    ]
    report += [f"- {label}: `{value}`" for label, value in settings.items()]
    report.append("")

    if candidates.empty:
        report += ["## 候选", "", "没有候选。", ""]
        return "\n".join(report)

    display_columns = [
        "side",
        "model_family",
        "condition_opportunity_bps",
        "target_edge_bps",
        "top_fraction",
        "tune_top_actual_edge_bps",
        f"{VALIDATION_LOCKED_SPLIT}_top_actual_edge_bps",
        f"{LATEST_STRESS_SPLIT}_top_actual_edge_bps",
        "min_top_edge_bps",
        "stable_positive",
        "stable_lift",
        "score",
    ]
    # Only the 30 best-ranked candidates are shown in the report.
    candidate_table = _markdown_table(candidates[display_columns].head(30))
    report += [
        "## 候选",
        "",
        candidate_table,
        "",
        "## 文件",
        "",
        "- `diagnostics/conditional_entry_probe_metrics.csv`: 每个组合、每个数据段的完整指标。",
        "- `diagnostics/conditional_entry_probe_candidates.csv`: 汇总后的候选排序。",
        "",
    ]
    return "\n".join(report)
|
||||
|
||||
@@ -36,8 +36,8 @@ def diagnose_training_run(args: Any) -> None:
|
||||
|
||||
|
||||
def _label_summary(root) -> dict[str, Any]:
|
||||
direction = read_parquet(root / "label" / "direction_labels.parquet")
|
||||
entry = read_parquet(root / "label" / "entry_labels.parquet")
|
||||
direction = read_parquet(root / "dataset" / "direction_train.parquet")
|
||||
entry = read_parquet(root / "dataset" / "entry_train.parquet")
|
||||
summary: dict[str, Any] = {}
|
||||
for split_id in DIAGNOSTIC_SPLITS:
|
||||
direction_split = direction[direction["split_id"].eq(split_id)].copy()
|
||||
@@ -45,25 +45,57 @@ def _label_summary(root) -> dict[str, Any]:
|
||||
item: dict[str, Any] = {"direction": {}, "entry": {}}
|
||||
if not direction_split.empty:
|
||||
item["direction"] = {
|
||||
"source": "dataset/direction_train.parquet",
|
||||
"rows": len(direction_split),
|
||||
"label_ratio": direction_split["direction_label"].value_counts(normalize=True).round(6).to_dict(),
|
||||
"label_ratio": _direction_target_ratio(direction_split),
|
||||
"future_return_bps_quantile": _quantiles(direction_split["future_return_bps"], (0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99)),
|
||||
}
|
||||
if not entry_split.empty:
|
||||
grouped = entry_split.groupby("side", observed=False)
|
||||
required = {
|
||||
"long_entry_target",
|
||||
"short_entry_target",
|
||||
"long_actual_plan_net_edge_bps",
|
||||
"short_actual_plan_net_edge_bps",
|
||||
}
|
||||
missing = sorted(required - set(entry_split.columns))
|
||||
if missing:
|
||||
raise ValueError(f"entry_train is missing columns required by diagnostics: {missing}")
|
||||
item["entry"] = {
|
||||
"source": "dataset/entry_train.parquet",
|
||||
"rows": len(entry_split),
|
||||
"target_rate_by_side": grouped["entry_target"].mean().round(6).to_dict(),
|
||||
"edge_mean_by_side": grouped["expected_net_edge_bps"].mean().round(6).to_dict(),
|
||||
"target_rate_by_side": {
|
||||
"LONG": float(entry_split["long_entry_target"].astype(float).mean()),
|
||||
"SHORT": float(entry_split["short_entry_target"].astype(float).mean()),
|
||||
},
|
||||
"edge_column": "actual_plan_net_edge_bps",
|
||||
"edge_mean_by_side": {
|
||||
"LONG": float(entry_split["long_actual_plan_net_edge_bps"].astype(float).mean()),
|
||||
"SHORT": float(entry_split["short_actual_plan_net_edge_bps"].astype(float).mean()),
|
||||
},
|
||||
"edge_quantile_by_side": {
|
||||
str(side): _quantiles(group["expected_net_edge_bps"], (0.05, 0.5, 0.95))
|
||||
for side, group in grouped
|
||||
"LONG": _quantiles(entry_split["long_actual_plan_net_edge_bps"], (0.05, 0.5, 0.95)),
|
||||
"SHORT": _quantiles(entry_split["short_actual_plan_net_edge_bps"], (0.05, 0.5, 0.95)),
|
||||
},
|
||||
}
|
||||
summary[split_id] = item
|
||||
return summary
|
||||
|
||||
|
||||
def _direction_target_ratio(frame: pd.DataFrame) -> dict[str, float]:
    """Return the mean of each direction target column, keyed by label.

    The result maps ``LONG``, ``SHORT`` and ``NEUTRAL`` to the mean of
    ``long_target``, ``short_target`` and ``neutral_target`` respectively.
    A frame with no rows yields 0.0 for all three.

    Raises:
        ValueError: If any of the three target columns is missing.
    """
    column_by_label = {
        "LONG": "long_target",
        "SHORT": "short_target",
        "NEUTRAL": "neutral_target",
    }
    absent = sorted(set(column_by_label.values()).difference(frame.columns))
    if absent:
        raise ValueError(f"direction_train is missing target columns required by diagnostics: {absent}")
    if not len(frame):
        return dict.fromkeys(column_by_label, 0.0)
    return {label: float(frame[column].astype(float).mean()) for label, column in column_by_label.items()}
|
||||
|
||||
|
||||
def _pm_summary(root) -> dict[str, Any]:
|
||||
summary: dict[str, Any] = {}
|
||||
config_path = root / "pm-search" / "position_manager_config.json"
|
||||
@@ -98,8 +130,8 @@ def _score_distribution(frame: pd.DataFrame) -> dict[str, dict[str, float]]:
|
||||
"pred_short_expected_net_edge_bps",
|
||||
"model_pred_long_expected_net_edge_bps",
|
||||
"model_pred_short_expected_net_edge_bps",
|
||||
"actual_long_expected_net_edge_bps",
|
||||
"actual_short_expected_net_edge_bps",
|
||||
"actual_long_plan_edge_bps",
|
||||
"actual_short_plan_edge_bps",
|
||||
]
|
||||
return {column: _quantiles(frame[column], (0.0, 0.05, 0.5, 0.95, 1.0)) for column in columns if column in frame.columns}
|
||||
|
||||
@@ -141,11 +173,10 @@ def _cumulative_gate_counts(steps: dict[str, pd.Series], total_rows: int) -> dic
|
||||
|
||||
def _relaxed_variants(frame: pd.DataFrame) -> dict[str, Any]:
|
||||
variants = {
|
||||
"no_risk_no_edge": {"prob": 0.54, "entry": 0.50, "margin": 0.02, "risk": 1.0, "edge": -99.0},
|
||||
"rare_entry_low_prob": {"prob": 0.50, "entry": 0.03, "margin": 0.02, "risk": 0.98, "edge": 0.0},
|
||||
"entry_only_55": {"prob": 0.0, "entry": 0.55, "margin": -99.0, "risk": 1.0, "edge": -99.0},
|
||||
"direction_only_54": {"prob": 0.54, "entry": 0.0, "margin": 0.02, "risk": 1.0, "edge": -99.0},
|
||||
"very_loose": {"prob": 0.50, "entry": 0.45, "margin": 0.0, "risk": 1.0, "edge": -99.0},
|
||||
"entry_30_positive_edge": {"prob": 0.50, "entry": 0.30, "margin": 0.02, "risk": 0.65, "edge": 3.0},
|
||||
"entry_50_positive_edge": {"prob": 0.50, "entry": 0.50, "margin": 0.02, "risk": 0.65, "edge": 3.0},
|
||||
"entry_70_positive_edge": {"prob": 0.50, "entry": 0.70, "margin": 0.02, "risk": 0.65, "edge": 3.0},
|
||||
"direction_only_control": {"prob": 0.54, "entry": 0.0, "margin": 0.02, "risk": 1.0, "edge": -99.0},
|
||||
}
|
||||
result: dict[str, Any] = {}
|
||||
for name, thresholds in variants.items():
|
||||
@@ -200,8 +231,8 @@ def _top_bucket_edge(frame: pd.DataFrame) -> dict[str, Any]:
|
||||
direction_top[str(fraction)] = _plain_trade_metrics(top.rename(columns={"actual_edge_bps": "actual_edge_bps"}))
|
||||
return {
|
||||
"direction_top_score": direction_top,
|
||||
"long_entry_prob_deciles": _decile_edge(frame, "long_entry_prob", "actual_long_expected_net_edge_bps", "long_entry_target"),
|
||||
"short_entry_prob_deciles": _decile_edge(frame, "short_entry_prob", "actual_short_expected_net_edge_bps", "short_entry_target"),
|
||||
"long_entry_prob_deciles": _decile_edge(frame, "long_entry_prob", "actual_long_plan_edge_bps", "long_entry_target"),
|
||||
"short_entry_prob_deciles": _decile_edge(frame, "short_entry_prob", "actual_short_plan_edge_bps", "short_entry_target"),
|
||||
}
|
||||
|
||||
|
||||
@@ -257,7 +288,7 @@ def _diagnostic_conclusion(pm_summary: dict[str, Any]) -> dict[str, Any]:
|
||||
if validation.get("avg_weighted_edge_bps", 0.0) <= 0 and stress.get("avg_weighted_edge_bps", 0.0) <= 0:
|
||||
return {
|
||||
"status": "PRICE_PLAN_OR_ENTRY_NOT_TRADABLE",
|
||||
"plain_reason": "按固定止盈止损真实收益算,验证集和压力集选出来的交易平均都不赚钱。",
|
||||
"plain_reason": "按当前价格计划真实收益算,验证集和压力集选出来的交易平均都不赚钱。",
|
||||
"next_action": "优先重新搜索价格计划,再重建 Entry 标签和模型;不要只放松 PM 阈值。",
|
||||
}
|
||||
return {
|
||||
@@ -294,10 +325,12 @@ def _markdown_report(payload: dict[str, Any]) -> str:
|
||||
lines.append("")
|
||||
if direction:
|
||||
lines.append(f"- Direction 行数: {direction['rows']}")
|
||||
lines.append(f"- Direction 来源: `{direction['source']}`")
|
||||
lines.append(f"- Direction 标签比例: `{direction['label_ratio']}`")
|
||||
lines.append(f"- 45 分钟未来收益分位: `{direction['future_return_bps_quantile']}`")
|
||||
if entry:
|
||||
lines.append(f"- Entry 行数: {entry['rows']}")
|
||||
lines.append(f"- Entry 来源: `{entry['source']}`")
|
||||
lines.append(f"- Entry 命中率: `{entry['target_rate_by_side']}`")
|
||||
lines.append(f"- Entry 平均净收益: `{entry['edge_mean_by_side']}`")
|
||||
lines.append("")
|
||||
@@ -310,6 +343,7 @@ def _markdown_report(payload: dict[str, Any]) -> str:
|
||||
lines.append(f"- 当前阈值: `{item['active_thresholds']}`")
|
||||
lines.append(f"- 当前阈值选中交易: `{item['selected_trade_metrics']}`")
|
||||
lines.append(f"- 网格里有交易的候选数: {item['grid_search_any_trade']['candidates_with_trade']} / {item['grid_search_any_trade']['candidate_count']}")
|
||||
lines.extend(_score_distribution_markdown(item["score_distribution"]))
|
||||
lines.append("")
|
||||
for side in ("long", "short"):
|
||||
lines.append(f"#### {side.upper()}")
|
||||
@@ -332,6 +366,33 @@ def _markdown_report(payload: dict[str, Any]) -> str:
|
||||
return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def _score_distribution_markdown(distribution: dict[str, dict[str, float]]) -> list[str]:
    """Render the watched score columns as markdown table lines.

    The returned list starts with a blank line, a heading and the table
    header, followed by one row per watched column that has a non-empty entry
    in ``distribution``. Each row prints the values stored under the keys
    ``'0.0'``, ``'0.05'``, ``'0.5'``, ``'0.95'`` and ``'1.0'`` with four
    decimals; a missing key prints as 0.0000.
    """
    watched_columns = (
        "long_prob",
        "short_prob",
        "long_entry_prob",
        "short_entry_prob",
        "market_risk_prob",
        "pred_long_expected_net_edge_bps",
        "pred_short_expected_net_edge_bps",
    )
    quantile_keys = ("0.0", "0.05", "0.5", "0.95", "1.0")

    table = ["", "#### 分数分布", "", "| 字段 | 最小 | 5% | 中位数 | 95% | 最大 |", "| --- | ---: | ---: | ---: | ---: | ---: |"]
    for column in watched_columns:
        quantiles = distribution.get(column)
        if quantiles:
            cells = "".join(f" | {quantiles.get(key, 0.0):.4f}" for key in quantile_keys)
            table.append("| " + column + cells + " |")
    return table
|
||||
|
||||
|
||||
def _jsonable(value: Any) -> Any:
|
||||
if isinstance(value, dict):
|
||||
return {str(key): _jsonable(item) for key, item in value.items()}
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from trader_training.io_utils import manifest, read_parquet, require_columns, run_root, write_json, write_parquet, write_text
|
||||
from trader_training.schemas import FEATURE_ORDER
|
||||
|
||||
|
||||
def build_direction_opportunity_dataset(args: Any) -> None:
    """Relabel the direction dataset with opportunity-based targets and write it out.

    Reads the direction and entry datasets, joins the two edge columns from the
    entry dataset onto the direction rows by ``sample_id``, derives
    long/short/neutral targets via ``_opportunity_labels``, and writes the
    relabelled parquet plus a JSON result and a markdown report under
    ``<run root>/dataset``.

    Args:
        args: Parsed CLI namespace. Reads ``direction_dataset_path``,
            ``entry_dataset_path``, ``output_path``, ``opportunity_bps``,
            ``min_advantage_bps``, ``long_edge_column``, ``short_edge_column``,
            ``label_method`` and ``run_id``.

    Raises:
        ValueError: If the join does not keep every direction row.
    """
    root = run_root(args)
    dataset_dir = root / "dataset"
    direction_path = args.direction_dataset_path or dataset_dir / "direction_train.parquet"
    entry_path = args.entry_dataset_path or dataset_dir / "entry_train.parquet"
    output_path = args.output_path or dataset_dir / "direction_train.parquet"
    opportunity_bps = float(args.opportunity_bps)
    min_advantage_bps = float(args.min_advantage_bps)
    long_edge_column = str(args.long_edge_column)
    short_edge_column = str(args.short_edge_column)
    label_method = str(args.label_method)
    target_columns = ["long_target", "short_target", "neutral_target"]

    direction = read_parquet(direction_path)
    entry = read_parquet(entry_path)
    require_columns(direction, ("sample_id", "split_id", *FEATURE_ORDER), "direction_train")
    require_columns(entry, ("sample_id", long_edge_column, short_edge_column), "entry_train")

    opportunity = entry[["sample_id", long_edge_column, short_edge_column]].copy()
    # The default output path is the default input path, and the edge columns are
    # written into the output. On a rerun the direction frame therefore already
    # carries them; without dropping them the merge would suffix both copies
    # (_x/_y) and the edge-column lookups below would raise KeyError.
    stale_columns = [*target_columns, long_edge_column, short_edge_column]
    merged = direction.drop(columns=stale_columns, errors="ignore").merge(
        opportunity, on="sample_id", how="inner", validate="one_to_one"
    )
    if len(merged) != len(direction):
        raise ValueError(f"direction opportunity dataset lost rows: before={len(direction)} after={len(merged)}")

    labels = _opportunity_labels(
        pd.to_numeric(merged[long_edge_column], errors="coerce").to_numpy(dtype="float64"),
        pd.to_numeric(merged[short_edge_column], errors="coerce").to_numpy(dtype="float64"),
        opportunity_bps,
        min_advantage_bps,
    )
    for target in target_columns:
        merged[target] = labels[target]
    # Keep future_return_bps as a debugging field; the training targets are the three target columns.
    ordered = [column for column in direction.columns if column in merged.columns and column not in set(target_columns)]
    ordered.extend(target_columns)
    for column in (long_edge_column, short_edge_column):
        if column not in ordered:
            ordered.append(column)
    out = merged[ordered].copy()
    data_hash = write_parquet(output_path, out)
    result = {
        "dataset": manifest(
            output_path,
            {
                "row_count": len(out),
                "feature_count": len(FEATURE_ORDER),
                "data_hash_sha256": data_hash,
                "split_counts": out["split_id"].value_counts().to_dict(),
            },
        ),
        "label_method": label_method,
        "long_edge_column": long_edge_column,
        "short_edge_column": short_edge_column,
        "opportunity_bps": opportunity_bps,
        "min_advantage_bps": min_advantage_bps,
        "target_counts": {
            "long": int(out["long_target"].sum()),
            "short": int(out["short_target"].sum()),
            "neutral": int(out["neutral_target"].sum()),
        },
        "target_rates_by_split": _target_rates_by_split(out),
    }
    write_json(dataset_dir / "direction_opportunity_dataset_result.json", result)
    write_text(dataset_dir / "direction_opportunity_dataset_report.md", _markdown_report(result))
    logging.info(
        "trader.training.direction_opportunity_dataset_written runId=%s opportunityBps=%.6f minAdvantageBps=%.6f rowCount=%s outputPath=%s",
        args.run_id,
        opportunity_bps,
        min_advantage_bps,
        len(out),
        output_path,
    )
|
||||
|
||||
|
||||
def _opportunity_labels(long_edge: np.ndarray, short_edge: np.ndarray, opportunity_bps: float, min_advantage_bps: float) -> dict[str, np.ndarray]:
|
||||
long_clean = np.nan_to_num(long_edge, nan=-np.inf)
|
||||
short_clean = np.nan_to_num(short_edge, nan=-np.inf)
|
||||
long_ok = long_clean >= opportunity_bps
|
||||
short_ok = short_clean >= opportunity_bps
|
||||
long_wins = long_ok & ((long_clean - short_clean) >= min_advantage_bps)
|
||||
short_wins = short_ok & ((short_clean - long_clean) >= min_advantage_bps)
|
||||
neutral = ~(long_wins | short_wins)
|
||||
return {
|
||||
"long_target": long_wins.astype("int8"),
|
||||
"short_target": short_wins.astype("int8"),
|
||||
"neutral_target": neutral.astype("int8"),
|
||||
}
|
||||
|
||||
|
||||
def _target_rates_by_split(frame: pd.DataFrame) -> dict[str, dict[str, float]]:
|
||||
result: dict[str, dict[str, float]] = {}
|
||||
for split_id, part in frame.groupby("split_id", observed=False):
|
||||
rows = len(part)
|
||||
result[str(split_id)] = {
|
||||
"rows": float(rows),
|
||||
"long_rate": float(part["long_target"].mean()) if rows else 0.0,
|
||||
"short_rate": float(part["short_target"].mean()) if rows else 0.0,
|
||||
"neutral_rate": float(part["neutral_target"].mean()) if rows else 0.0,
|
||||
}
|
||||
return result
|
||||
|
||||
|
||||
def _markdown_report(result: dict[str, Any]) -> str:
|
||||
lines = [
|
||||
"# Direction 机会标签数据集报告",
|
||||
"",
|
||||
"这份数据集把 Direction 目标从“未来收盘收益方向”改为“未来路径里哪边有可交易空间”。",
|
||||
"",
|
||||
f"- label_method: `{result['label_method']}`",
|
||||
f"- long_edge_column: `{result['long_edge_column']}`",
|
||||
f"- short_edge_column: `{result['short_edge_column']}`",
|
||||
f"- opportunity_bps: `{result['opportunity_bps']}`",
|
||||
f"- min_advantage_bps: `{result['min_advantage_bps']}`",
|
||||
f"- row_count: `{result['dataset']['row_count']}`",
|
||||
"",
|
||||
"## 标签数量",
|
||||
"",
|
||||
f"- long: `{result['target_counts']['long']}`",
|
||||
f"- short: `{result['target_counts']['short']}`",
|
||||
f"- neutral: `{result['target_counts']['neutral']}`",
|
||||
"",
|
||||
"## 分段比例",
|
||||
"",
|
||||
"| split | rows | long | short | neutral |",
|
||||
"| --- | ---: | ---: | ---: | ---: |",
|
||||
]
|
||||
for split_id, item in result["target_rates_by_split"].items():
|
||||
lines.append(f"| {split_id} | {int(item['rows'])} | {item['long_rate']:.4f} | {item['short_rate']:.4f} | {item['neutral_rate']:.4f} |")
|
||||
lines.append("")
|
||||
return "\n".join(lines)
|
||||
@@ -0,0 +1,359 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from trader_training.io_utils import read_parquet, run_root, write_json, write_text
|
||||
from trader_training.labels import DEFAULT_COST_CONFIG, DEFAULT_LABEL_CONFIG, ENTRY_LABEL_METHOD, _build_path_stats, _load_config
|
||||
from trader_training.schemas import FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
||||
|
||||
|
||||
# Splits for which per-plan metrics are computed (also used to filter trainable feature rows).
EVAL_SPLITS = (FIT_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
||||
# Splits whose metrics feed the *_eval aggregates and the score in _plan_summary.
GATE_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
||||
# Grid defaults used by search_dynamic_exit_plans when the matching args value is falsy.
# Horizons are in minutes (reported as horizon_minutes).
DEFAULT_HORIZONS = (30, 45, 60)
|
||||
# First take-profit distances in bps (reported as target_bps).
DEFAULT_TARGETS = (8.0, 12.0, 16.0)
|
||||
# Stop distances in bps (reported as stop_bps).
DEFAULT_STOPS = (4.0, 6.0, 8.0)
|
||||
# Trailing-stop distances in bps (reported as trailing_stop_bps).
DEFAULT_TRAILING_STOPS = (4.0, 8.0, 12.0)
|
||||
# Second target = target_bps * multiplier.
DEFAULT_SECOND_TARGET_MULTIPLIERS = (2.0,)
|
||||
# Passed to the plan config as partial_take_1_ratio.
DEFAULT_TAKE1_RATIOS = (0.50,)
|
||||
# Passed to the plan config as partial_take_2_ratio.
DEFAULT_TAKE2_RATIOS = (0.25,)
|
||||
|
||||
|
||||
def search_dynamic_exit_plans(args: Any) -> None:
    """Grid-search dynamic exit plans and write the ranked results under the run root.

    For every combination of horizon, target, stop, trailing stop, second-target
    multiplier and partial-take ratios, path statistics are built from the
    replay frame, joined to the trainable feature rows, and reduced to
    per-split, per-side metrics net of the configured costs. The per-plan
    summary, the selected best plan, the raw rows and a markdown report are
    written to ``<run root>/<output_dir_name>``.

    Args:
        args: Parsed CLI namespace. Reads ``replay_path``, ``feature_path``,
            ``label_config_path``, ``cost_config_path``, the grid attributes
            (``horizons``, ``targets``, ``stops``, ``trailing_stops``,
            ``second_target_multipliers``, ``take1_ratios``, ``take2_ratios``),
            ``run_id`` and the optional ``output_dir_name``.

    Raises:
        ValueError: If ``output_dir_name`` is not a plain directory name, if
            there are no trainable feature rows, if the grid is empty, or if
            no candidate produced any rows.
    """
    # Validate the output directory name before doing any work, so a bad value
    # fails immediately instead of after the whole grid search has run.
    output_dir_name = str(getattr(args, "output_dir_name", None) or "dynamic-exit-search")
    if output_dir_name in {"", ".", ".."} or "/" in output_dir_name or "\\" in output_dir_name:
        raise ValueError(f"output_dir_name must be a run-local directory name: {output_dir_name}")

    root = run_root(args)
    replay = read_parquet(args.replay_path or root / "replay" / "replay_1m.parquet")
    features = read_parquet(args.feature_path or root / "feature" / "feature_frame.parquet")
    label_config = _load_config(args.label_config_path, DEFAULT_LABEL_CONFIG)
    cost_config = _load_config(args.cost_config_path, DEFAULT_COST_CONFIG)
    entry_config = label_config["entry"]
    # Total cost deducted from every gross edge: fee + slippage + funding.
    cost_bps = float(cost_config["fee_bps"]) + float(cost_config["slippage_bps"]) + float(cost_config["funding_cost_bps"])
    min_expected_edge_bps = float(entry_config["min_expected_net_edge_bps"])

    trainable = features[
        features["data_quality_flag"].isin(["OK", "PARTIAL_OPTIONAL"])
        & features["split_id"].isin(EVAL_SPLITS)
    ][["symbol", "open_time_ms", "split_id"]].copy()
    if trainable.empty:
        raise ValueError("dynamic exit search needs trainable feature rows")

    grid = list(
        itertools.product(
            args.horizons or DEFAULT_HORIZONS,
            args.targets or DEFAULT_TARGETS,
            args.stops or DEFAULT_STOPS,
            args.trailing_stops or DEFAULT_TRAILING_STOPS,
            args.second_target_multipliers or DEFAULT_SECOND_TARGET_MULTIPLIERS,
            args.take1_ratios or DEFAULT_TAKE1_RATIOS,
            args.take2_ratios or DEFAULT_TAKE2_RATIOS,
        )
    )
    if not grid:
        raise ValueError("dynamic exit search grid is empty")

    logging.info(
        "trader.training.dynamic_exit_search_started runId=%s candidateCount=%s",
        args.run_id,
        len(grid),
    )
    rows: list[dict[str, Any]] = []
    for index, (horizon, target_bps, stop_bps, trailing_stop_bps, second_multiplier, take1_ratio, take2_ratio) in enumerate(grid, start=1):
        second_target_bps = float(target_bps) * float(second_multiplier)
        plan_id = _plan_id(horizon, target_bps, stop_bps, trailing_stop_bps, second_multiplier, take1_ratio, take2_ratio)
        plan_config = {
            "plan_method": "DYNAMIC_TRAILING_V1",
            "partial_take_1_ratio": float(take1_ratio),
            "partial_take_2_ratio": float(take2_ratio),
            "second_target_bps": second_target_bps,
            "trailing_stop_bps": float(trailing_stop_bps),
            "breakeven_after_first_target": True,
        }
        logging.info(
            "trader.training.dynamic_exit_candidate_start runId=%s candidateIndex=%s candidateCount=%s planId=%s",
            args.run_id,
            index,
            len(grid),
            plan_id,
        )
        stats = _build_path_stats(replay, int(horizon), float(target_bps), float(stop_bps), plan_config=plan_config)
        merged = stats.merge(trainable, on=["symbol", "open_time_ms"], how="inner")
        if merged.empty:
            logging.info("trader.training.dynamic_exit_candidate_skipped runId=%s planId=%s reason=no_trainable_rows", args.run_id, plan_id)
            continue
        merged["actual_net_edge_bps"] = merged["gross_edge_bps"].astype("float64") - cost_bps
        rows.extend(
            _candidate_rows(
                merged,
                plan_id,
                int(horizon),
                float(target_bps),
                float(stop_bps),
                float(trailing_stop_bps),
                second_target_bps,
                float(second_multiplier),
                float(take1_ratio),
                float(take2_ratio),
                cost_bps,
                min_expected_edge_bps,
            )
        )
        logging.info(
            "trader.training.dynamic_exit_candidate_done runId=%s planId=%s mergedRows=%s",
            args.run_id,
            plan_id,
            len(merged),
        )

    result = pd.DataFrame(rows)
    if result.empty:
        raise ValueError("dynamic exit search produced no candidate rows")

    summary = _plan_summary(result)
    best = _select_best_plan(summary)
    payload = {
        "run_id": args.run_id,
        "cost_bps": cost_bps,
        "min_expected_net_edge_bps": min_expected_edge_bps,
        "entry_label_method": ENTRY_LABEL_METHOD,
        "candidate_count": int(summary["plan_id"].nunique()),
        "robust_candidate_found": bool(best["robust_candidate_found"]),
        "best_plan": best,
    }
    out_dir = root / output_dir_name
    write_json(out_dir / "dynamic_exit_search_result.json", _jsonable(payload))
    write_text(out_dir / "dynamic_exit_search_rows.csv", result.to_csv(index=False))
    write_text(out_dir / "dynamic_exit_search_summary.csv", summary.to_csv(index=False))
    write_text(out_dir / "dynamic_exit_search_report.md", _markdown_report(payload, summary))
    logging.info(
        "trader.training.dynamic_exit_search_finished runId=%s candidateCount=%s bestPlan=%s robust=%s bestScore=%.6f",
        args.run_id,
        payload["candidate_count"],
        best["plan_id"],
        best["robust_candidate_found"],
        best["score"],
    )
|
||||
|
||||
|
||||
def _candidate_rows(
    frame: pd.DataFrame,
    plan_id: str,
    horizon: int,
    target_bps: float,
    stop_bps: float,
    trailing_stop_bps: float,
    second_target_bps: float,
    second_target_multiplier: float,
    take1_ratio: float,
    take2_ratio: float,
    cost_bps: float,
    min_expected_edge_bps: float,
) -> list[dict[str, Any]]:
    """Build one metrics record per (split, side) that has rows in *frame*.

    Each record carries the plan parameters plus the row count, statistics of
    ``actual_net_edge_bps`` (mean, median, p10, p90, share at or above
    ``min_expected_edge_bps``, share at or above zero), and the means of the
    hit flags, exit time (converted from ms to minutes), MFE and MAE.
    Splits are visited in ``EVAL_SPLITS`` order, LONG before SHORT.
    """
    plan_fields = {
        "horizon_minutes": horizon,
        "target_bps": target_bps,
        "stop_bps": stop_bps,
        "trailing_stop_bps": trailing_stop_bps,
        "second_target_bps": second_target_bps,
        "second_target_multiplier": second_target_multiplier,
        "partial_take_1_ratio": take1_ratio,
        "partial_take_2_ratio": take2_ratio,
        "cost_bps": cost_bps,
    }
    records: list[dict[str, Any]] = []
    for split_id in EVAL_SPLITS:
        for side in ("LONG", "SHORT"):
            selector = frame["split_id"].eq(split_id) & frame["side"].eq(side)
            if not selector.any():
                continue
            subset = frame.loc[selector]
            net_edge = subset["actual_net_edge_bps"].astype("float64")
            records.append(
                {
                    "plan_id": plan_id,
                    "split_id": split_id,
                    "side": side,
                    **plan_fields,
                    "rows": int(len(subset)),
                    "avg_actual_net_edge_bps": float(net_edge.mean()),
                    "median_actual_net_edge_bps": float(net_edge.median()),
                    "p10_actual_net_edge_bps": float(net_edge.quantile(0.10)),
                    "p90_actual_net_edge_bps": float(net_edge.quantile(0.90)),
                    "positive_label_rate": float((net_edge >= min_expected_edge_bps).mean()),
                    "breakeven_rate": float((net_edge >= 0.0).mean()),
                    "target_hit_rate": float(subset["target_hit"].mean()),
                    "stop_hit_rate": float(subset["stop_hit"].mean()),
                    "timeout_rate": float(subset["timeout_hit"].mean()),
                    "avg_time_to_exit_min": float(subset["time_to_exit_ms"].mean() / 60_000.0),
                    "avg_mfe_bps": float(subset["mfe_bps"].mean()),
                    "avg_mae_bps": float(subset["mae_bps"].mean()),
                }
            )
    return records
|
||||
|
||||
|
||||
def _plan_summary(rows: pd.DataFrame) -> pd.DataFrame:
    """Pivot per-split candidate rows into one scored row per plan and side.

    Every metric becomes a ``<metric>_<split>`` column; columns for splits in
    ``EVAL_SPLITS`` that are absent from the data are added as NaN. The
    ``*_eval`` aggregates and the score are computed over ``GATE_SPLITS`` only.
    The result is sorted by score, best first.
    """
    plan_keys = [
        "plan_id",
        "horizon_minutes",
        "target_bps",
        "stop_bps",
        "trailing_stop_bps",
        "second_target_bps",
        "second_target_multiplier",
        "partial_take_1_ratio",
        "partial_take_2_ratio",
        "side",
    ]
    metric_names = [
        "avg_actual_net_edge_bps",
        "median_actual_net_edge_bps",
        "positive_label_rate",
        "breakeven_rate",
        "target_hit_rate",
        "stop_hit_rate",
        "timeout_rate",
        "avg_time_to_exit_min",
        "avg_mfe_bps",
        "avg_mae_bps",
    ]
    pivoted = rows.pivot_table(index=plan_keys, columns="split_id", values=metric_names, aggfunc="mean")
    pivoted.columns = [f"{metric}_{split}" for metric, split in pivoted.columns]
    summary = pivoted.reset_index()
    for split_id, metric in itertools.product(EVAL_SPLITS, metric_names):
        name = f"{metric}_{split_id}"
        if name not in summary.columns:
            summary[name] = np.nan

    def gate_columns(metric: str) -> list[str]:
        return [f"{metric}_{split}" for split in GATE_SPLITS]

    gate_edges = summary[gate_columns("avg_actual_net_edge_bps")]
    gate_breakeven = summary[gate_columns("breakeven_rate")]
    gate_positive = summary[gate_columns("positive_label_rate")]
    gate_stops = summary[gate_columns("stop_hit_rate")]
    summary["avg_actual_edge_eval"] = gate_edges.mean(axis=1)
    summary["min_actual_edge_eval"] = gate_edges.min(axis=1)
    summary["min_breakeven_rate_eval"] = gate_breakeven.min(axis=1)
    summary["min_positive_label_rate_eval"] = gate_positive.min(axis=1)
    summary["max_positive_label_rate_eval"] = gate_positive.max(axis=1)
    summary["max_stop_hit_rate_eval"] = gate_stops.max(axis=1)

    # Missing aggregates are penalised: edges fall back to -999, rates to the worst case.
    score = summary["avg_actual_edge_eval"].fillna(-999.0) * 8.0
    score = score + summary["min_actual_edge_eval"].fillna(-999.0) * 4.0
    score = score + summary["min_breakeven_rate_eval"].fillna(0.0) * 20.0
    score = score + summary["min_positive_label_rate_eval"].fillna(0.0) * 20.0
    score = score - summary["max_stop_hit_rate_eval"].fillna(1.0) * 8.0
    summary["score"] = score
    return summary.sort_values("score", ascending=False).reset_index(drop=True)
|
||||
|
||||
|
||||
def _select_best_plan(summary: pd.DataFrame) -> dict[str, Any]:
|
||||
robust = summary[
|
||||
(summary["avg_actual_edge_eval"] > 0.0)
|
||||
& (summary["min_actual_edge_eval"] > -1.0)
|
||||
& (summary["min_breakeven_rate_eval"] >= 0.45)
|
||||
& (summary["min_positive_label_rate_eval"] >= 0.03)
|
||||
& (summary["max_positive_label_rate_eval"] <= 0.55)
|
||||
].copy()
|
||||
robust_found = not robust.empty
|
||||
candidates = robust if robust_found else summary
|
||||
row = candidates.sort_values("score", ascending=False, na_position="last").iloc[0]
|
||||
return {
|
||||
"plan_id": str(row["plan_id"]),
|
||||
"plan_method": "DYNAMIC_TRAILING_V1",
|
||||
"side": str(row["side"]),
|
||||
"horizon_minutes": int(row["horizon_minutes"]),
|
||||
"target_bps": float(row["target_bps"]),
|
||||
"stop_bps": float(row["stop_bps"]),
|
||||
"trailing_stop_bps": float(row["trailing_stop_bps"]),
|
||||
"second_target_bps": float(row["second_target_bps"]),
|
||||
"second_target_multiplier": float(row["second_target_multiplier"]),
|
||||
"partial_take_1_ratio": float(row["partial_take_1_ratio"]),
|
||||
"partial_take_2_ratio": float(row["partial_take_2_ratio"]),
|
||||
"breakeven_after_first_target": True,
|
||||
"score": float(row["score"]),
|
||||
"avg_actual_edge_eval": float(row["avg_actual_edge_eval"]),
|
||||
"min_actual_edge_eval": float(row["min_actual_edge_eval"]),
|
||||
"min_breakeven_rate_eval": float(row["min_breakeven_rate_eval"]),
|
||||
"min_positive_label_rate_eval": float(row["min_positive_label_rate_eval"]),
|
||||
"max_positive_label_rate_eval": float(row["max_positive_label_rate_eval"]),
|
||||
"max_stop_hit_rate_eval": float(row["max_stop_hit_rate_eval"]),
|
||||
"robust_candidate_found": bool(robust_found),
|
||||
}
|
||||
|
||||
|
||||
def _plan_id(
|
||||
horizon: int,
|
||||
target_bps: float,
|
||||
stop_bps: float,
|
||||
trailing_stop_bps: float,
|
||||
second_target_multiplier: float,
|
||||
take1_ratio: float,
|
||||
take2_ratio: float,
|
||||
) -> str:
|
||||
return (
|
||||
f"dyn_h{int(horizon)}_t{target_bps:g}_s{stop_bps:g}"
|
||||
f"_trail{trailing_stop_bps:g}_t2x{second_target_multiplier:g}"
|
||||
f"_p{int(round(take1_ratio * 100))}_{int(round(take2_ratio * 100))}"
|
||||
)
|
||||
|
||||
|
||||
def _markdown_report(payload: dict[str, Any], summary: pd.DataFrame) -> str:
    """Render the search payload and the top 20 summary rows as a markdown report.

    The report contains the run parameters, a verdict line that depends on
    ``payload["robust_candidate_found"]``, the best plan as a JSON block, and
    the top-plans table produced by ``_markdown_table``.
    """
    if payload["robust_candidate_found"]:
        verdict = "找到可继续训练的稳定出场候选。"
    else:
        verdict = "没有找到稳定为正的出场候选;只能把最高分组合当成下一轮排查对象。"
    best_plan_json = json.dumps(payload["best_plan"], ensure_ascii=False, sort_keys=False)
    top_plans_table = _markdown_table(summary.head(20))
    sections = [
        "# Dynamic Exit Search Report",
        "",
        f"- run_id: `{payload['run_id']}`",
        f"- cost_bps: {payload['cost_bps']}",
        f"- min_expected_net_edge_bps: {payload['min_expected_net_edge_bps']}",
        f"- entry_label_method: `{payload['entry_label_method']}`",
        f"- candidate_count: {payload['candidate_count']}",
        f"- verdict: {verdict}",
        "",
        "## Best Plan For Next Experiment",
        "",
        "```json",
        best_plan_json,
        "```",
        "",
        "## Top Plans",
        "",
        top_plans_table,
        "",
        "说明:这里统计的是动态出场后的真实计划收益,已经扣掉手续费、滑点、资金费。它不是上线结论,只用来决定下一轮训练是否值得换出场参数。",
        "",
    ]
    return "\n".join(sections)
|
||||
|
||||
|
||||
def _markdown_table(frame: pd.DataFrame) -> str:
|
||||
if frame.empty:
|
||||
return "无数据。"
|
||||
columns = list(frame.columns)
|
||||
lines = ["| " + " | ".join(columns) + " |", "| " + " | ".join("---" for _ in columns) + " |"]
|
||||
for row in frame.to_dict("records"):
|
||||
values = []
|
||||
for column in columns:
|
||||
value = row.get(column, "")
|
||||
if isinstance(value, float):
|
||||
value = round(value, 6)
|
||||
values.append(str(value))
|
||||
lines.append("| " + " | ".join(values) + " |")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _jsonable(value: Any) -> Any:
|
||||
if isinstance(value, dict):
|
||||
return {str(key): _jsonable(item) for key, item in value.items()}
|
||||
if isinstance(value, list):
|
||||
return [_jsonable(item) for item in value]
|
||||
if isinstance(value, tuple):
|
||||
return [_jsonable(item) for item in value]
|
||||
if isinstance(value, (np.integer,)):
|
||||
return int(value)
|
||||
if isinstance(value, (np.floating,)):
|
||||
return float(value)
|
||||
return value
|
||||
@@ -0,0 +1,377 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from itertools import combinations
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
from trader_training.entry_feature_screen import _bucket_edges, _markdown_table
|
||||
from trader_training.io_utils import read_parquet, run_root, write_json, write_text
|
||||
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
||||
|
||||
|
||||
# Out-of-fit splits. NOTE(review): not referenced in the visible part of this module; confirm where it is used.
EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
||||
# Every split for which baselines and pair metrics are computed.
ALL_SPLITS = (FIT_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
||||
|
||||
|
||||
def screen_entry_condition_pairs(args: Any) -> None:
    """Screen pairs of single-feature bucket conditions on the entry dataset.

    For each side (LONG, SHORT) the function picks seed buckets, evaluates
    the intersections of seed pairs per split, selects candidates, and writes
    a JSON result, three CSV files and a markdown report under
    ``<run root>/diagnostics``.

    Args:
        args: Parsed CLI namespace. Reads ``min_seed_rows``, ``min_pair_rows``,
            ``max_seed_conditions_per_side``, ``max_buckets_per_feature``
            (a falsy value selects the default 300 / 150 / 32 / 2) and ``run_id``.
    """
    root = run_root(args)
    dataset_path = root / "dataset" / "entry_train.parquet"
    diagnostics_dir = root / "diagnostics"
    report_path = diagnostics_dir / "entry_condition_pair_screen_report.md"

    dataset = read_parquet(dataset_path)
    _require_columns(dataset)

    min_seed_rows = int(args.min_seed_rows or 300)
    min_pair_rows = int(args.min_pair_rows or 150)
    max_seed_conditions_per_side = int(args.max_seed_conditions_per_side or 32)
    max_buckets_per_feature = int(args.max_buckets_per_feature or 2)

    pair_rows: list[dict[str, Any]] = []
    seed_frames: list[pd.DataFrame] = []
    bucketed_features = _bucketed_features(dataset)
    side_specs = (
        ("LONG", "long_entry_target", "long_mae_bps"),
        ("SHORT", "short_entry_target", "short_mae_bps"),
    )
    for side, target_col, mae_col in side_specs:
        edge_col = _actual_edge_column(side)
        baselines = _split_baselines(dataset, target_col, edge_col, mae_col)
        seeds = _seed_conditions(
            dataset,
            bucketed_features,
            side,
            target_col,
            edge_col,
            mae_col,
            baselines,
            min_seed_rows,
            max_buckets_per_feature,
            max_seed_conditions_per_side,
        )
        seed_frames.append(seeds)
        side_rows = _condition_pair_rows(
            dataset,
            bucketed_features,
            seeds,
            side,
            target_col,
            edge_col,
            mae_col,
            baselines,
            min_pair_rows,
        )
        pair_rows.extend(side_rows)
        logging.info(
            "trader.training.entry_condition_pair_side_screened side=%s seedCount=%s pairMetricRows=%s",
            side,
            len(seeds),
            len(side_rows),
        )

    pair_metrics = pd.DataFrame(pair_rows)
    if pair_metrics.empty:
        candidates = pd.DataFrame()
    else:
        candidates = _select_candidates(pair_metrics, min_pair_rows)
    seeds_all = pd.concat(seed_frames, ignore_index=True) if seed_frames else pd.DataFrame()

    if candidates.empty:
        stable_candidate_count = 0
    else:
        # A candidate counts as stable only when both flags are set; a missing
        # flag column contributes an empty boolean series.
        stable_edge = candidates.get("stable_positive_edge", pd.Series(dtype=bool))
        stable_lift = candidates.get("stable_lift", pd.Series(dtype=bool))
        stable_candidate_count = int((stable_edge & stable_lift).sum())

    result = {
        "run_id": args.run_id,
        "dataset_path": str(dataset_path),
        "feature_count": len(FEATURE_ORDER),
        "seed_count": int(len(seeds_all)),
        "pair_metric_count": int(len(pair_metrics)),
        "candidate_count": int(len(candidates)),
        "stable_candidate_count": stable_candidate_count,
        "min_seed_rows": min_seed_rows,
        "min_pair_rows": min_pair_rows,
        "max_seed_conditions_per_side": max_seed_conditions_per_side,
        "max_buckets_per_feature": max_buckets_per_feature,
        "selection_rule": "single buckets are chosen on tune_inner, then feature-pair intersections are checked on tune_inner/validation_locked/latest_stress",
    }
    write_json(diagnostics_dir / "entry_condition_pair_screen_result.json", result)
    write_text(diagnostics_dir / "entry_condition_pair_seeds.csv", seeds_all.to_csv(index=False))
    write_text(diagnostics_dir / "entry_condition_pair_metrics.csv", pair_metrics.to_csv(index=False))
    write_text(diagnostics_dir / "entry_condition_pair_candidates.csv", candidates.to_csv(index=False))
    write_text(report_path, _markdown_report(result, candidates))
    logging.info(
        "trader.training.entry_condition_pair_screened runId=%s seedCount=%s pairMetricCount=%s candidateCount=%s reportPath=%s",
        args.run_id,
        len(seeds_all),
        len(pair_metrics),
        len(candidates),
        report_path,
    )
|
||||
|
||||
|
||||
def _require_columns(dataset: pd.DataFrame) -> None:
    """Check that *dataset* has every column the pair screen reads.

    Raises:
        ValueError: Listing the missing column names in sorted order.
    """
    expected = {
        "split_id",
        *FEATURE_ORDER,
        "long_entry_target",
        "short_entry_target",
        "long_actual_plan_net_edge_bps",
        "short_actual_plan_net_edge_bps",
        "long_mae_bps",
        "short_mae_bps",
    }
    missing = sorted(expected - set(dataset.columns))
    if missing:
        raise ValueError(f"entry condition pair screen missing required columns: {missing}")
|
||||
|
||||
|
||||
def _actual_edge_column(side: str) -> str:
|
||||
if side == "LONG":
|
||||
return "long_actual_plan_net_edge_bps"
|
||||
if side == "SHORT":
|
||||
return "short_actual_plan_net_edge_bps"
|
||||
raise ValueError(f"unsupported side: {side}")
|
||||
|
||||
|
||||
def _bucketed_features(dataset: pd.DataFrame) -> dict[str, pd.Series]:
    """Assign every row a bucket index per feature, using edges fitted on the fit split.

    Bucket edges come from ``_bucket_edges`` applied to the finite fit-split
    values of each feature. Features that yield fewer than three edges are
    left out. Non-numeric and infinite values are treated as NaN, and values
    that ``pd.cut`` cannot place in a bin get NaN as their bucket.

    Returns:
        Maps feature name to a float series of bucket indices aligned with *dataset*.
    """

    def as_finite_numeric(series: pd.Series) -> pd.Series:
        return pd.to_numeric(series, errors="coerce").replace([np.inf, -np.inf], np.nan)

    in_fit = dataset["split_id"].eq(FIT_SPLIT)
    buckets_by_feature: dict[str, pd.Series] = {}
    for feature in FEATURE_ORDER:
        fit_values = as_finite_numeric(dataset.loc[in_fit, feature]).dropna()
        edges = _bucket_edges(fit_values.to_numpy(dtype="float64"))
        if len(edges) >= 3:
            codes = pd.cut(
                as_finite_numeric(dataset[feature]),
                bins=edges,
                include_lowest=True,
                labels=False,
                duplicates="drop",
            )
            buckets_by_feature[feature] = codes.astype("float")
    logging.info("trader.training.entry_condition_pair_bucketed featureCount=%s", len(buckets_by_feature))
    return buckets_by_feature
|
||||
|
||||
|
||||
def _split_baselines(dataset: pd.DataFrame, target_col: str, edge_col: str, mae_col: str) -> dict[str, dict[str, float]]:
    """Compute the unconditioned per-split row count and column means.

    Returns:
        Maps each split in ``ALL_SPLITS`` that has rows to ``rows``,
        ``positive_rate``, ``avg_edge_bps`` and ``avg_mae_bps``. Splits
        without rows are omitted.
    """
    mean_sources = (
        ("positive_rate", target_col),
        ("avg_edge_bps", edge_col),
        ("avg_mae_bps", mae_col),
    )
    per_split: dict[str, dict[str, float]] = {}
    for split_id in ALL_SPLITS:
        subset = dataset[dataset["split_id"].eq(split_id)]
        if subset.empty:
            continue
        stats = {"rows": float(len(subset))}
        for stat_name, column in mean_sources:
            stats[stat_name] = float(subset[column].mean())
        per_split[split_id] = stats
    return per_split
|
||||
|
||||
|
||||
def _seed_conditions(
    dataset: pd.DataFrame,
    bucketed_features: dict[str, pd.Series],
    side: str,
    target_col: str,
    edge_col: str,
    mae_col: str,
    baselines: dict[str, dict[str, float]],
    min_seed_rows: int,
    max_buckets_per_feature: int,
    max_seed_conditions_per_side: int,
) -> pd.DataFrame:
    """Pick the single-feature buckets with the best tune-split edge lift.

    Every (feature, bucket) with at least ``min_seed_rows`` tune-split rows is
    scored against the tune-split baseline. The top
    ``max_buckets_per_feature`` buckets per feature are kept, then the top
    ``max_seed_conditions_per_side`` overall.

    Returns:
        One row per selected seed, or an empty frame when nothing qualifies.
    """
    tune_mask = dataset["split_id"].eq(TUNE_SPLIT)
    baseline = baselines[TUNE_SPLIT]
    tune_metrics = dataset.loc[tune_mask, [target_col, edge_col, mae_col]]
    records: list[dict[str, Any]] = []
    for feature, bucket in bucketed_features.items():
        scoped = tune_metrics.copy()
        scoped["bucket_index"] = bucket.loc[tune_mask].to_numpy()
        scoped = scoped.dropna(subset=["bucket_index"])
        if scoped.empty:
            continue
        scoped["bucket_index"] = scoped["bucket_index"].astype(int)
        for bucket_index, group in scoped.groupby("bucket_index", sort=True, observed=False):
            group_size = len(group)
            if group_size < min_seed_rows:
                continue
            avg_edge = float(group[edge_col].mean())
            positive_rate = float(group[target_col].mean())
            avg_mae = float(group[mae_col].mean())
            records.append(
                {
                    "side": side,
                    "feature": feature,
                    "bucket_index": int(bucket_index),
                    "tune_rows": int(group_size),
                    "tune_positive_rate": positive_rate,
                    "tune_positive_rate_lift": positive_rate - baseline["positive_rate"],
                    "tune_avg_edge_bps": avg_edge,
                    "tune_avg_edge_lift_bps": avg_edge - baseline["avg_edge_bps"],
                    "tune_avg_mae_bps": avg_mae,
                    "tune_avg_mae_lift_bps": avg_mae - baseline["avg_mae_bps"],
                }
            )
    if not records:
        return pd.DataFrame()
    # First rank within each feature so one feature cannot fill all the seed slots.
    ranked_within_feature = pd.DataFrame(records).sort_values(
        ["feature", "tune_avg_edge_lift_bps", "tune_avg_edge_bps"],
        ascending=[True, False, False],
    )
    per_feature_best = ranked_within_feature.groupby("feature", as_index=False, observed=False).head(max_buckets_per_feature)
    ranked_overall = per_feature_best.sort_values(
        ["tune_avg_edge_lift_bps", "tune_avg_edge_bps", "tune_rows"],
        ascending=[False, False, False],
    )
    return ranked_overall.head(max_seed_conditions_per_side).reset_index(drop=True)
|
||||
|
||||
|
||||
def _condition_pair_rows(
    dataset: pd.DataFrame,
    bucketed_features: dict[str, pd.Series],
    seeds: pd.DataFrame,
    side: str,
    target_col: str,
    edge_col: str,
    mae_col: str,
    baselines: dict[str, dict[str, float]],
    min_pair_rows: int,
) -> list[dict[str, Any]]:
    """Evaluate the intersection of every pair of seed conditions on each split.

    Pairs built from the same feature are skipped, as are pairs whose
    intersection has fewer than ``min_pair_rows`` rows on the tune split. For
    each remaining pair, one record is produced per split in ``ALL_SPLITS``
    that has a baseline and at least one matching row.

    Args:
        dataset: Entry dataset holding ``split_id`` and the three metric columns.
        bucketed_features: Per-feature bucket index series aligned with *dataset*.
        seeds: Seed conditions with ``feature`` and ``bucket_index`` columns.
        side: Side label copied into every record.
        target_col: Column averaged into ``positive_rate``.
        edge_col: Column averaged into ``avg_edge_bps`` and ``median_edge_bps``.
        mae_col: Column averaged into ``avg_mae_bps``.
        baselines: Per-split baseline statistics used to compute the lifts.
        min_pair_rows: Minimum tune-split rows for a pair to be evaluated.

    Returns:
        One record per (pair, split); an empty list when *seeds* is empty.
    """
    if seeds.empty:
        return []
    metric_columns = [target_col, edge_col, mae_col]
    # The split masks do not depend on the pair, so build them once rather than
    # once per pair and split.
    tune_mask = dataset["split_id"].eq(TUNE_SPLIT)
    split_masks = {
        split_id: dataset["split_id"].eq(split_id)
        for split_id in ALL_SPLITS
        if split_id in baselines
    }
    rows: list[dict[str, Any]] = []
    seed_records = seeds.to_dict("records")
    for left, right in combinations(seed_records, 2):
        left_feature = str(left["feature"])
        right_feature = str(right["feature"])
        if left_feature == right_feature:
            continue
        left_bucket = int(left["bucket_index"])
        right_bucket = int(right["bucket_index"])
        left_mask = bucketed_features[left_feature].eq(left_bucket)
        right_mask = bucketed_features[right_feature].eq(right_bucket)
        pair_mask = left_mask & right_mask
        tune_rows = int((pair_mask & tune_mask).sum())
        if tune_rows < min_pair_rows:
            continue
        for split_id in ALL_SPLITS:
            if split_id not in split_masks:
                continue
            part = dataset.loc[pair_mask & split_masks[split_id], metric_columns]
            if part.empty:
                continue
            baseline = baselines[split_id]
            avg_edge = float(part[edge_col].mean())
            positive_rate = float(part[target_col].mean())
            avg_mae = float(part[mae_col].mean())
            rows.append(
                {
                    "side": side,
                    "left_feature": left_feature,
                    "left_bucket_index": left_bucket,
                    "right_feature": right_feature,
                    "right_bucket_index": right_bucket,
                    "split_id": split_id,
                    "row_count": int(len(part)),
                    "positive_rate": positive_rate,
                    "baseline_positive_rate": baseline["positive_rate"],
                    "positive_rate_lift": positive_rate - baseline["positive_rate"],
                    "avg_edge_bps": avg_edge,
                    "baseline_avg_edge_bps": baseline["avg_edge_bps"],
                    "avg_edge_lift_bps": avg_edge - baseline["avg_edge_bps"],
                    "avg_mae_bps": avg_mae,
                    "baseline_avg_mae_bps": baseline["avg_mae_bps"],
                    "avg_mae_lift_bps": avg_mae - baseline["avg_mae_bps"],
                    "median_edge_bps": float(part[edge_col].median()),
                }
            )
    return rows
|
||||
|
||||
|
||||
def _select_candidates(pair_metrics: pd.DataFrame, min_pair_rows: int) -> pd.DataFrame:
    """Rank pair conditions picked on the tune split and checked on the held-out splits.

    Args:
        pair_metrics: One row per (pair condition, split) with ``row_count``,
            ``positive_rate``, ``avg_edge_bps``, ``avg_mae_bps`` and their lift columns.
        min_pair_rows: Minimum row count a pair needs on the tune split to become a
            candidate, and on every checked split to be flagged ``stable_enough_rows``.

    Returns:
        One row per candidate pair with tune / validation / stress metrics side by
        side, stability flags and ``screen_score``, sorted by score descending.
        A column-less empty frame is returned when no pair qualifies on the tune split.
    """
    on_tune = pair_metrics["split_id"].eq(TUNE_SPLIT) & (pair_metrics["row_count"] >= min_pair_rows)
    tune = pair_metrics[on_tune].copy()
    if tune.empty:
        return pd.DataFrame()
    key_columns = ["side", "left_feature", "left_bucket_index", "right_feature", "right_bucket_index"]
    metric_columns = [
        "row_count",
        "positive_rate",
        "positive_rate_lift",
        "avg_edge_bps",
        "avg_edge_lift_bps",
        "avg_mae_bps",
        "avg_mae_lift_bps",
    ]

    def _renamed(prefix: str) -> dict[str, str]:
        # ``row_count`` becomes ``<prefix>_rows``; every other metric is simply prefixed.
        return {name: f"{prefix}_rows" if name == "row_count" else f"{prefix}_{name}" for name in metric_columns}

    candidates = tune[key_columns + metric_columns].rename(columns=_renamed("tune"))
    for split_id in (VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT):
        split_rows = pair_metrics[pair_metrics["split_id"].eq(split_id)][key_columns + metric_columns].rename(
            columns=_renamed(split_id)
        )
        # Left join: a pair without rows in this split keeps NaN in the split's columns.
        candidates = candidates.merge(split_rows, on=key_columns, how="left")

    edge_columns = ["tune_avg_edge_bps", f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps", f"{LATEST_STRESS_SPLIT}_avg_edge_bps"]
    lift_columns = ["tune_avg_edge_lift_bps", f"{VALIDATION_LOCKED_SPLIT}_avg_edge_lift_bps", f"{LATEST_STRESS_SPLIT}_avg_edge_lift_bps"]
    row_columns = ["tune_rows", f"{VALIDATION_LOCKED_SPLIT}_rows", f"{LATEST_STRESS_SPLIT}_rows"]
    positive_columns = ["tune_positive_rate", f"{VALIDATION_LOCKED_SPLIT}_positive_rate", f"{LATEST_STRESS_SPLIT}_positive_rate"]
    # NaN > 0 is False, so a missing split already fails both stability flags.
    candidates["stable_positive_edge"] = candidates[edge_columns].gt(0.0).all(axis=1)
    candidates["stable_lift"] = candidates[lift_columns].gt(0.0).all(axis=1)
    # skipna=False: a pair that is missing from any split must not be summarised on
    # the remaining splits only; it gets NaN here and the -999 penalty in the score.
    candidates["min_eval_edge_bps"] = candidates[edge_columns].min(axis=1, skipna=False)
    candidates["mean_eval_edge_bps"] = candidates[edge_columns].mean(axis=1, skipna=False)
    # A split with no matching rows has zero samples, not "unknown" samples.
    candidates["min_eval_rows"] = candidates[row_columns].fillna(0).min(axis=1)
    candidates["min_eval_positive_rate"] = candidates[positive_columns].min(axis=1, skipna=False)
    candidates["stable_enough_rows"] = candidates["min_eval_rows"].ge(min_pair_rows)
    candidates["usable_candidate"] = candidates["stable_positive_edge"] & candidates["stable_lift"] & candidates["stable_enough_rows"]
    candidates["screen_score"] = (
        candidates["min_eval_edge_bps"].fillna(-999.0)
        + candidates["mean_eval_edge_bps"].fillna(-999.0) * 0.25
        + candidates["stable_lift"].astype(float) * 2.0
        + candidates["stable_enough_rows"].astype(float)
    )
    return candidates.sort_values("screen_score", ascending=False).reset_index(drop=True)
|
||||
|
||||
|
||||
def _markdown_report(result: dict[str, Any], candidates: pd.DataFrame) -> str:
    """Render the pair-condition screening report as Markdown.

    Args:
        result: Run summary; the keys read here are ``run_id``, ``feature_count``,
            ``seed_count``, ``pair_metric_count``, ``candidate_count``,
            ``stable_candidate_count``, ``min_seed_rows`` and ``min_pair_rows``.
        candidates: Candidate table; its first 25 rows are shown.

    Returns:
        The report text, lines joined with ``"\\n"``.
    """
    reading_guide = [
        "# Entry 组合条件筛查报告",
        "",
        "## 结论怎么读",
        "",
        "这份报告只回答一个问题:两个特征条件同时出现时,能不能稳定筛掉坏开仓点。",
        "",
        "- 只使用真实计划收益,不使用旧的最大可拿收益。",
        "- `tune_inner` 用来挑条件组合。",
        "- `validation_locked` 和 `latest_stress` 用来检查组合是否还能站住。",
        "- `usable_candidate=true` 才表示这个组合既三段正收益、三段比大盘好、三段样本数也够。",
        "",
    ]
    # (label, result key) pairs rendered as "- label: `value`" bullets.
    summary_fields = (
        ("run_id", "run_id"),
        ("特征数", "feature_count"),
        ("种子条件数", "seed_count"),
        ("组合明细数", "pair_metric_count"),
        ("候选组合数", "candidate_count"),
        ("稳定候选数", "stable_candidate_count"),
        ("单条件最小行数", "min_seed_rows"),
        ("组合最小行数", "min_pair_rows"),
    )
    summary = ["## 本次结果", ""]
    summary += [f"- {label}: `{result[key]}`" for label, key in summary_fields]
    summary.append("")

    lines = reading_guide + summary
    if candidates.empty:
        lines += ["## 候选组合", "", "没有找到满足最小样本数的组合条件。", ""]
        return "\n".join(lines)

    display_columns = [
        "side",
        "left_feature",
        "left_bucket_index",
        "right_feature",
        "right_bucket_index",
        "tune_avg_edge_bps",
        f"{VALIDATION_LOCKED_SPLIT}_avg_edge_bps",
        f"{LATEST_STRESS_SPLIT}_avg_edge_bps",
        "min_eval_edge_bps",
        "min_eval_rows",
        "stable_positive_edge",
        "stable_lift",
        "usable_candidate",
        "screen_score",
    ]
    table = _markdown_table(candidates[display_columns].head(25))
    lines += ["## 候选组合", "", table, ""]
    lines += [
        "## 文件",
        "",
        "- `diagnostics/entry_condition_pair_seeds.csv`: 进入组合筛查的单条件。",
        "- `diagnostics/entry_condition_pair_metrics.csv`: 每个组合在每个数据段的完整明细。",
        "- `diagnostics/entry_condition_pair_candidates.csv`: 按调参集挑出的组合候选,以及封存验证/压力检查结果。",
        "",
    ]
    return "\n".join(lines)
|
||||
@@ -17,7 +17,7 @@ ALL_SPLITS = (FIT_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLI
|
||||
def screen_entry_features(args: Any) -> None:
|
||||
root = run_root(args)
|
||||
dataset = read_parquet(root / "dataset" / "entry_train.parquet")
|
||||
required = {"split_id", *FEATURE_ORDER, "long_entry_target", "short_entry_target", "long_expected_net_edge_bps", "short_expected_net_edge_bps"}
|
||||
required = {"split_id", *FEATURE_ORDER, "long_entry_target", "short_entry_target", "long_actual_plan_net_edge_bps", "short_actual_plan_net_edge_bps"}
|
||||
missing = sorted(required.difference(dataset.columns))
|
||||
if missing:
|
||||
raise ValueError(f"entry feature screen missing required columns: {missing}")
|
||||
@@ -67,7 +67,7 @@ def _screen_edge_column(dataset: pd.DataFrame, side: str) -> str:
|
||||
actual_col = f"{prefix}_actual_plan_net_edge_bps"
|
||||
if actual_col in dataset.columns:
|
||||
return actual_col
|
||||
return f"{prefix}_expected_net_edge_bps"
|
||||
raise ValueError(f"entry feature screen requires actual plan edge column: {actual_col}")
|
||||
|
||||
|
||||
def _split_baselines(dataset: pd.DataFrame, target_col: str, edge_col: str) -> dict[str, dict[str, float]]:
|
||||
@@ -146,7 +146,18 @@ def _bucket_edges(values: np.ndarray) -> np.ndarray:
|
||||
edges = np.quantile(clean, quantiles)
|
||||
edges = np.unique(edges)
|
||||
if edges.size < 3:
|
||||
return np.array([], dtype="float64")
|
||||
non_zero = clean[clean != 0.0]
|
||||
if non_zero.size < 300:
|
||||
return np.array([], dtype="float64")
|
||||
# 突破/扫单类特征常常绝大多数为 0。普通十分位会全挤在 0,
|
||||
# 这里单独保留“没有事件”和“有事件强弱”两类桶,避免漏掉稀有但可能有用的信号。
|
||||
event_edges = np.unique(np.quantile(non_zero, np.linspace(0.0, 1.0, 6)))
|
||||
if event_edges.size < 2:
|
||||
return np.array([-np.inf, 0.0, np.inf], dtype="float64")
|
||||
edges = np.unique(np.concatenate(([-np.inf, 0.0], event_edges[1:-1], [np.inf]))).astype("float64")
|
||||
if edges.size < 3:
|
||||
return np.array([], dtype="float64")
|
||||
return edges
|
||||
edges[0] = -np.inf
|
||||
edges[-1] = np.inf
|
||||
return edges
|
||||
@@ -236,7 +247,7 @@ def _markdown_report(result: dict[str, Any], candidates: pd.DataFrame) -> str:
|
||||
"",
|
||||
"这份报告只回答一个问题:历史数据里,单个特征的某些区间有没有稳定变好。",
|
||||
"",
|
||||
"- 如果数据里有真实出场净收益,本报告用真实出场净收益;没有时才退回训练收益标签。",
|
||||
"- 本报告只使用真实出场净收益;缺少真实收益列时直接失败。",
|
||||
"- `tune_inner` 用来挑候选区间。",
|
||||
"- `validation_locked` 和 `latest_stress` 用来检查这个区间是不是出了训练样本也还能站住。",
|
||||
"- `stable_positive_edge=true` 代表这个区间在三个检查集里的平均净收益都大于 0。",
|
||||
|
||||
@@ -0,0 +1,366 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.metrics import brier_score_loss, roc_auc_score
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
from trader_training.entry_feature_screen import _markdown_table
|
||||
from trader_training.io_utils import read_parquet, run_root, write_json, write_text
|
||||
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
||||
|
||||
|
||||
EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
||||
|
||||
|
||||
def diagnose_entry_mae_labels(args: Any) -> None:
    """Fit diagnostic models for "low MAE + enough opportunity" labels and write a report.

    For each side and each (max MAE, min opportunity) threshold pair a binary target
    is built, every requested model family is fitted on the fit split, and the top
    predicted fraction of each evaluation split is measured. Metrics, candidates,
    a JSON summary and a Markdown report are written under ``<run root>/diagnostics``.

    Args:
        args: Parsed CLI namespace. Reads ``run_id``, ``max_mae_bps``,
            ``min_opportunity_bps``, ``model_families``, ``max_train_rows`` and,
            when present, ``top_fractions`` / ``top_fraction``.

    Raises:
        ValueError: If required columns are missing, the fit split is empty, a
            feature value is non-finite, or a model family is unsupported.
    """
    root = run_root(args)
    dataset = read_parquet(root / "dataset" / "entry_train.parquet")
    _require_columns(dataset)

    max_mae_values = tuple(float(item) for item in (args.max_mae_bps or (4.0, 6.0, 8.0, 12.0)))
    min_opportunity_values = tuple(float(item) for item in (args.min_opportunity_bps or (6.0, 12.0, 20.0)))
    model_families = tuple(str(item).strip().lower() for item in (args.model_families or ("linear",)) if str(item).strip())
    # Both fraction options are optional on the namespace; fall back to a single 10% cut.
    single_fraction = float(getattr(args, "top_fraction", None) or 0.10)
    top_fractions = tuple(float(item) for item in (getattr(args, "top_fractions", None) or (single_fraction,)))
    max_train_rows = int(args.max_train_rows or 0)

    x_train_frame = dataset[dataset["split_id"].eq(FIT_SPLIT)].copy()
    if x_train_frame.empty:
        raise ValueError("entry mae label diagnostic needs fit_inner rows")
    if max_train_rows > 0 and len(x_train_frame) > max_train_rows:
        # Keep the most recent rows when a time column is available, else the last rows.
        if "event_time" in x_train_frame.columns:
            x_train_frame = x_train_frame.sort_values("event_time").tail(max_train_rows).copy()
        else:
            x_train_frame = x_train_frame.tail(max_train_rows).copy()
    x_train = _x(x_train_frame)

    # The evaluation frames and their feature matrices do not depend on side,
    # thresholds or model family, so build each one once, on first use.
    eval_cache: dict[str, Any] = {}

    rows: list[dict[str, Any]] = []
    for side in ("LONG", "SHORT"):
        actual_edge_col = f"{side.lower()}_actual_plan_net_edge_bps"
        mae_col = f"{side.lower()}_mae_bps"
        opportunity_col = f"{side.lower()}_max_achievable_net_edge_bps"
        for max_mae_bps in max_mae_values:
            for min_opportunity_bps in min_opportunity_values:
                target_name = f"{side.lower()}_mae_le_{max_mae_bps:g}_opp_ge_{min_opportunity_bps:g}"
                y_train = _target(x_train_frame, mae_col, opportunity_col, max_mae_bps, min_opportunity_bps)
                if len(np.unique(y_train)) < 2:
                    # A one-class target cannot be fitted; record the skip and move on.
                    rows.append(
                        {
                            "side": side,
                            "target_name": target_name,
                            "model_family": "SKIPPED",
                            "max_mae_bps": max_mae_bps,
                            "min_opportunity_bps": min_opportunity_bps,
                            "status": "SKIPPED_ONE_CLASS_TRAIN",
                            "train_rows": int(len(y_train)),
                            "train_positive_rate": float(y_train.mean()) if len(y_train) else 0.0,
                        }
                    )
                    continue
                train_positive_rate = float(y_train.mean())
                for model_family in model_families:
                    model, scaler = _fit_model(model_family, x_train, y_train)
                    for split_id in EVAL_SPLITS:
                        if split_id not in eval_cache:
                            candidate_frame = dataset[dataset["split_id"].eq(split_id)].copy()
                            eval_cache[split_id] = None if candidate_frame.empty else (candidate_frame, _x(candidate_frame))
                        cached = eval_cache[split_id]
                        if cached is None:
                            continue
                        split_frame, x_split = cached
                        y_true = _target(split_frame, mae_col, opportunity_col, max_mae_bps, min_opportunity_bps)
                        proba = _predict(model_family, model, scaler, x_split)
                        for top_fraction in top_fractions:
                            rows.append(
                                _metric_row(
                                    split_frame,
                                    y_true,
                                    proba,
                                    side,
                                    target_name,
                                    model_family,
                                    split_id,
                                    max_mae_bps,
                                    min_opportunity_bps,
                                    top_fraction,
                                    actual_edge_col,
                                    mae_col,
                                    opportunity_col,
                                    train_positive_rate,
                                )
                            )
                    logging.info(
                        "trader.training.entry_mae_label_diagnosed side=%s target=%s modelFamily=%s trainRows=%s trainPositiveRate=%.6f",
                        side,
                        target_name,
                        model_family,
                        len(y_train),
                        train_positive_rate,
                    )

    metrics = pd.DataFrame(rows)
    candidates = _select_candidates(metrics)
    result = {
        "run_id": args.run_id,
        "feature_count": len(FEATURE_ORDER),
        "max_mae_bps": list(max_mae_values),
        "min_opportunity_bps": list(min_opportunity_values),
        "model_families": list(model_families),
        "top_fractions": list(top_fractions),
        "max_train_rows": max_train_rows,
        "metric_count": int(len(metrics)),
        "candidate_count": int(len(candidates)),
        "positive_top_edge_candidate_count": int(candidates["stable_top_edge_positive"].sum()) if not candidates.empty else 0,
        "purpose": "diagnostic_only_not_exported",
        "selection_rule": "fit on fit_inner; rank by top predicted low-MAE opportunity samples on tune_inner/validation_locked/latest_stress",
    }
    out_dir = root / "diagnostics"
    write_json(out_dir / "entry_mae_label_diagnostic_result.json", result)
    write_text(out_dir / "entry_mae_label_diagnostic_metrics.csv", metrics.to_csv(index=False))
    write_text(out_dir / "entry_mae_label_diagnostic_candidates.csv", candidates.to_csv(index=False))
    write_text(out_dir / "entry_mae_label_diagnostic_report.md", _markdown_report(result, candidates))
    logging.info(
        "trader.training.entry_mae_label_diagnostic_written runId=%s metricCount=%s candidateCount=%s reportPath=%s",
        args.run_id,
        len(metrics),
        len(candidates),
        out_dir / "entry_mae_label_diagnostic_report.md",
    )
|
||||
|
||||
|
||||
def _require_columns(dataset: pd.DataFrame) -> None:
    """Raise ``ValueError`` naming every column this diagnostic reads but the dataset lacks.

    Required are ``split_id``, every name in ``FEATURE_ORDER`` and, for both
    ``long`` and ``short``, the actual plan edge, MAE and max achievable edge columns.
    """
    per_side_suffixes = ("actual_plan_net_edge_bps", "mae_bps", "max_achievable_net_edge_bps")
    required = {"split_id", *FEATURE_ORDER}
    required |= {f"{side}_{suffix}" for side in ("long", "short") for suffix in per_side_suffixes}
    missing = sorted(required.difference(dataset.columns))
    if not missing:
        return
    raise ValueError(f"entry mae label diagnostic missing required columns: {missing}")
|
||||
|
||||
|
||||
def _x(frame: pd.DataFrame) -> np.ndarray:
    """Return the ``FEATURE_ORDER`` columns of ``frame`` as a float32 matrix.

    Values are coerced to numeric and infinities are treated as missing.

    Raises:
        ValueError: If any feature column holds a missing or non-finite value;
            the message lists the affected columns.
    """
    numeric = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce")
    values = numeric.replace([np.inf, -np.inf], np.nan).astype("float32")
    column_has_gap = values.isna().any()
    if column_has_gap.any():
        missing = values.columns[column_has_gap].tolist()
        raise ValueError(f"entry mae label diagnostic found non-finite feature values: {missing}")
    return values.to_numpy(dtype="float32")
|
||||
|
||||
|
||||
def _target(frame: pd.DataFrame, mae_col: str, opportunity_col: str, max_mae_bps: float, min_opportunity_bps: float) -> np.ndarray:
|
||||
mae = pd.to_numeric(frame[mae_col], errors="coerce")
|
||||
opportunity = pd.to_numeric(frame[opportunity_col], errors="coerce")
|
||||
return ((mae <= max_mae_bps) & (opportunity >= min_opportunity_bps)).astype(int).to_numpy()
|
||||
|
||||
|
||||
def _fit_model(model_family: str, x_train: np.ndarray, y_train: np.ndarray) -> tuple[Any, StandardScaler | None]:
    """Fit the requested diagnostic classifier.

    Args:
        model_family: ``"linear"`` (standardised logistic regression with balanced
            class weights) or ``"tree"`` (histogram gradient boosting).
        x_train: Feature matrix.
        y_train: Binary labels.

    Returns:
        ``(model, scaler)``; ``scaler`` is ``None`` for the tree family.

    Raises:
        ValueError: For any other ``model_family``.
    """
    if model_family == "tree":
        booster = HistGradientBoostingClassifier(
            max_iter=120,
            learning_rate=0.04,
            max_leaf_nodes=31,
            l2_regularization=0.02,
            early_stopping=True,
            random_state=23,
        )
        booster.fit(x_train, y_train)
        return booster, None
    if model_family != "linear":
        raise ValueError(f"unsupported model family: {model_family}")
    scaler = StandardScaler()
    standardised = scaler.fit_transform(x_train)
    classifier = LogisticRegression(max_iter=500, class_weight="balanced")
    classifier.fit(standardised, y_train)
    return classifier, scaler
|
||||
|
||||
|
||||
def _predict(model_family: str, model: Any, scaler: StandardScaler | None, x: np.ndarray) -> np.ndarray:
|
||||
if model_family == "linear":
|
||||
if scaler is None:
|
||||
raise ValueError("linear model missing scaler")
|
||||
return model.predict_proba(scaler.transform(x))[:, 1]
|
||||
return model.predict_proba(x)[:, 1]
|
||||
|
||||
|
||||
def _metric_row(
    frame: pd.DataFrame,
    y_true: np.ndarray,
    proba: np.ndarray,
    side: str,
    target_name: str,
    model_family: str,
    split_id: str,
    max_mae_bps: float,
    min_opportunity_bps: float,
    top_fraction: float,
    actual_edge_col: str,
    mae_col: str,
    opportunity_col: str,
    train_positive_rate: float,
) -> dict[str, Any]:
    """Score one model on one split and summarise its top-probability rows.

    Args:
        frame: Rows of the evaluated split; ``y_true`` and ``proba`` are aligned
            with it by position.
        y_true: Binary labels for ``frame``.
        proba: Predicted positive-class probabilities for ``frame``.
        side, target_name, model_family, split_id: Identifiers copied into the row.
        max_mae_bps, min_opportunity_bps: Label thresholds copied into the row.
        top_fraction: Share of rows, ranked by ``proba`` descending, treated as
            the "top" group (at least one row).
        actual_edge_col, mae_col, opportunity_col: Columns averaged over the top group.
        train_positive_rate: Positive rate on the training rows; used as the
            constant-prediction Brier baseline.

    Returns:
        A flat metrics dict with status ``"OK"``. ``auc`` is NaN when ``y_true``
        holds a single class.
    """
    order = np.argsort(-proba)
    top_n = max(1, int(len(frame) * top_fraction))
    top_positions = order[:top_n]
    # Select by position: label-based ``.loc`` would return extra rows whenever
    # the frame's index contains repeated labels.
    top = frame.iloc[top_positions]
    top_proba = proba[top_positions]
    has_rows = bool(len(y_true))
    # Clip so the constant baseline is a valid probability strictly inside (0, 1).
    constant = np.full(len(y_true), np.clip(train_positive_rate, 1e-6, 1 - 1e-6))
    row: dict[str, Any] = {
        "side": side,
        "target_name": target_name,
        "model_family": model_family,
        "split_id": split_id,
        "status": "OK",
        "max_mae_bps": max_mae_bps,
        "min_opportunity_bps": min_opportunity_bps,
        "row_count": int(len(frame)),
        "positive_rate": float(y_true.mean()) if has_rows else 0.0,
        "train_positive_rate": train_positive_rate,
        "brier": float(brier_score_loss(y_true, proba)) if has_rows else 0.0,
        "constant_brier": float(brier_score_loss(y_true, constant)) if has_rows else 0.0,
        "top_fraction": top_fraction,
        "top_rows": int(len(top)),
        "top_target_rate": float(y_true[top_positions].mean()) if has_rows else 0.0,
        "all_actual_edge_bps": float(frame[actual_edge_col].mean()),
        "top_actual_edge_bps": float(top[actual_edge_col].mean()),
        "top_mae_bps": float(top[mae_col].mean()),
        "top_opportunity_bps": float(top[opportunity_col].mean()),
        "top_probability_min": float(top_proba.min()) if len(proba) else 0.0,
        "top_probability_max": float(top_proba.max()) if len(proba) else 0.0,
    }
    # ROC AUC is undefined unless both classes are present.
    if len(np.unique(y_true)) == 2:
        row["auc"] = float(roc_auc_score(y_true, proba))
    else:
        row["auc"] = np.nan
    row["brier_beats_constant"] = bool(row["brier"] < row["constant_brier"])
    row["top_edge_lift_bps"] = row["top_actual_edge_bps"] - row["all_actual_edge_bps"]
    row["top_target_lift"] = row["top_target_rate"] - row["positive_rate"]
    return row
|
||||
|
||||
|
||||
def _select_candidates(metrics: pd.DataFrame) -> pd.DataFrame:
|
||||
ok = metrics[metrics["status"].eq("OK")].copy()
|
||||
if ok.empty:
|
||||
return pd.DataFrame()
|
||||
key_columns = ["side", "target_name", "model_family", "max_mae_bps", "min_opportunity_bps", "top_fraction"]
|
||||
tune = ok[ok["split_id"].eq(TUNE_SPLIT)].copy()
|
||||
candidates = tune[
|
||||
key_columns
|
||||
+ [
|
||||
"row_count",
|
||||
"positive_rate",
|
||||
"auc",
|
||||
"brier",
|
||||
"constant_brier",
|
||||
"brier_beats_constant",
|
||||
"top_target_rate",
|
||||
"top_actual_edge_bps",
|
||||
"top_edge_lift_bps",
|
||||
"top_mae_bps",
|
||||
"top_opportunity_bps",
|
||||
]
|
||||
].rename(
|
||||
columns={
|
||||
"row_count": "tune_rows",
|
||||
"positive_rate": "tune_positive_rate",
|
||||
"auc": "tune_auc",
|
||||
"brier": "tune_brier",
|
||||
"constant_brier": "tune_constant_brier",
|
||||
"brier_beats_constant": "tune_brier_beats_constant",
|
||||
"top_target_rate": "tune_top_target_rate",
|
||||
"top_actual_edge_bps": "tune_top_actual_edge_bps",
|
||||
"top_edge_lift_bps": "tune_top_edge_lift_bps",
|
||||
"top_mae_bps": "tune_top_mae_bps",
|
||||
"top_opportunity_bps": "tune_top_opportunity_bps",
|
||||
}
|
||||
)
|
||||
for split_id in (VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT):
|
||||
split_rows = ok[ok["split_id"].eq(split_id)][
|
||||
key_columns + ["row_count", "positive_rate", "auc", "brier", "constant_brier", "brier_beats_constant", "top_target_rate", "top_actual_edge_bps", "top_edge_lift_bps", "top_mae_bps", "top_opportunity_bps"]
|
||||
].rename(
|
||||
columns={
|
||||
"row_count": f"{split_id}_rows",
|
||||
"positive_rate": f"{split_id}_positive_rate",
|
||||
"auc": f"{split_id}_auc",
|
||||
"brier": f"{split_id}_brier",
|
||||
"constant_brier": f"{split_id}_constant_brier",
|
||||
"brier_beats_constant": f"{split_id}_brier_beats_constant",
|
||||
"top_target_rate": f"{split_id}_top_target_rate",
|
||||
"top_actual_edge_bps": f"{split_id}_top_actual_edge_bps",
|
||||
"top_edge_lift_bps": f"{split_id}_top_edge_lift_bps",
|
||||
"top_mae_bps": f"{split_id}_top_mae_bps",
|
||||
"top_opportunity_bps": f"{split_id}_top_opportunity_bps",
|
||||
}
|
||||
)
|
||||
candidates = candidates.merge(split_rows, on=key_columns, how="left")
|
||||
|
||||
top_edge_columns = ["tune_top_actual_edge_bps", f"{VALIDATION_LOCKED_SPLIT}_top_actual_edge_bps", f"{LATEST_STRESS_SPLIT}_top_actual_edge_bps"]
|
||||
auc_columns = ["tune_auc", f"{VALIDATION_LOCKED_SPLIT}_auc", f"{LATEST_STRESS_SPLIT}_auc"]
|
||||
lift_columns = ["tune_top_edge_lift_bps", f"{VALIDATION_LOCKED_SPLIT}_top_edge_lift_bps", f"{LATEST_STRESS_SPLIT}_top_edge_lift_bps"]
|
||||
candidates["min_eval_top_edge_bps"] = candidates[top_edge_columns].min(axis=1)
|
||||
candidates["mean_eval_top_edge_bps"] = candidates[top_edge_columns].mean(axis=1)
|
||||
candidates["min_eval_auc"] = candidates[auc_columns].min(axis=1)
|
||||
candidates["stable_top_edge_positive"] = candidates[top_edge_columns].gt(0.0).all(axis=1)
|
||||
candidates["stable_lift"] = candidates[lift_columns].gt(0.0).all(axis=1)
|
||||
brier_flag_columns = ["tune_brier_beats_constant", f"{VALIDATION_LOCKED_SPLIT}_brier_beats_constant", f"{LATEST_STRESS_SPLIT}_brier_beats_constant"]
|
||||
for column in brier_flag_columns:
|
||||
candidates[column] = candidates[column].map(lambda value: bool(value) if pd.notna(value) else False)
|
||||
candidates["stable_brier_beats_constant"] = candidates[brier_flag_columns].all(axis=1)
|
||||
candidates["diagnostic_score"] = (
|
||||
candidates["min_eval_top_edge_bps"].fillna(-999.0)
|
||||
+ candidates["mean_eval_top_edge_bps"].fillna(-999.0) * 0.25
|
||||
+ candidates["min_eval_auc"].fillna(0.0) * 2.0
|
||||
+ candidates["stable_lift"].astype(float)
|
||||
)
|
||||
return candidates.sort_values("diagnostic_score", ascending=False).reset_index(drop=True)
|
||||
|
||||
|
||||
def _markdown_report(result: dict[str, Any], candidates: pd.DataFrame) -> str:
    """Render the low-MAE label diagnostic report as Markdown.

    Args:
        result: Run summary; the keys read here are ``run_id``, ``feature_count``,
            ``model_families``, ``top_fractions``, ``metric_count``,
            ``candidate_count`` and ``positive_top_edge_candidate_count``.
        candidates: Candidate table; its first 25 rows are shown.

    Returns:
        The report text, lines joined with ``"\\n"``.
    """
    families_text = ",".join(result["model_families"])
    fractions_text = ",".join(str(item) for item in result["top_fractions"])
    lines = [
        "# Entry 低回撤标签诊断报告",
        "",
        "这份报告只做诊断,不导出上线模型。它回答:现有特征能不能识别“回撤小、同时有足够空间”的开仓点。",
        "",
        f"- run_id: `{result['run_id']}`",
        f"- 特征数: `{result['feature_count']}`",
        f"- 模型类型: `{families_text}`",
        f"- top_fractions: `{fractions_text}`",
        f"- 指标行数: `{result['metric_count']}`",
        f"- 候选数: `{result['candidate_count']}`",
        f"- top 真实收益三段都转正的候选数: `{result['positive_top_edge_candidate_count']}`",
        "",
    ]
    if candidates.empty:
        lines += ["## 候选", "", "没有候选。", ""]
        return "\n".join(lines)

    display_columns = [
        "side",
        "model_family",
        "top_fraction",
        "max_mae_bps",
        "min_opportunity_bps",
        "tune_auc",
        f"{VALIDATION_LOCKED_SPLIT}_auc",
        f"{LATEST_STRESS_SPLIT}_auc",
        "tune_top_actual_edge_bps",
        f"{VALIDATION_LOCKED_SPLIT}_top_actual_edge_bps",
        f"{LATEST_STRESS_SPLIT}_top_actual_edge_bps",
        "min_eval_top_edge_bps",
        "stable_top_edge_positive",
        "stable_lift",
        "stable_brier_beats_constant",
        "diagnostic_score",
    ]
    table = _markdown_table(candidates[display_columns].head(25))
    lines += ["## 候选", "", table, ""]
    lines += [
        "## 文件",
        "",
        "- `diagnostics/entry_mae_label_diagnostic_metrics.csv`: 每个标签、方向、模型、数据段的完整指标。",
        "- `diagnostics/entry_mae_label_diagnostic_candidates.csv`: 按三段 top 真实收益排序的候选。",
        "",
    ]
    return "\n".join(lines)
|
||||
@@ -0,0 +1,363 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
from trader_training.io_utils import read_parquet, run_root, write_json, write_text
|
||||
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
||||
|
||||
|
||||
ALL_SPLITS = (FIT_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
||||
EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
||||
|
||||
|
||||
def diagnose_good_trade_structure(args: Any) -> None:
    """Run the good-trade structure diagnostic and write its artefacts.

    Reads ``dataset/entry_train.parquet`` under the run root, labels trades per
    side by actual plan edge, then writes a split summary, single-feature
    candidates, tree-model top-fraction rows, a JSON result and a Markdown report
    under ``diagnostics``.

    Args:
        args: Parsed CLI namespace. Reads ``run_id``, ``min_good_edge_bps``,
            ``bad_edge_bps`` and ``top_fractions``.
    """
    root = run_root(args)
    dataset = read_parquet(root / "dataset" / "entry_train.parquet")
    min_good_edge_bps = float(args.min_good_edge_bps)
    bad_edge_bps = float(args.bad_edge_bps)
    top_fractions = tuple(float(item) for item in args.top_fractions)
    _require_columns(dataset)

    side_frames: dict[str, pd.DataFrame] = {}
    for side in ("LONG", "SHORT"):
        side_frames[side] = _side_frame(dataset, side, min_good_edge_bps, bad_edge_bps)

    # Each table stacks the LONG rows first, then the SHORT rows.
    summary_parts = [_split_summary(frame, side) for side, frame in side_frames.items()]
    split_summary = pd.concat(summary_parts, ignore_index=True)
    feature_parts = [_feature_candidates(frame, side, top_fractions) for side, frame in side_frames.items()]
    feature_rows = pd.concat(feature_parts, ignore_index=True)
    model_parts = [_tree_model_top_rows(frame, side, top_fractions) for side, frame in side_frames.items()]
    model_rows = pd.concat(model_parts, ignore_index=True)

    if feature_rows.empty:
        stable_feature_count = 0
        stable_positive_top_feature_count = 0
    else:
        stable_feature_count = int(feature_rows["stable_auc"].sum())
        stable_positive_top_feature_count = int(feature_rows["stable_positive_top_edge"].sum())

    result = {
        "run_id": args.run_id,
        "min_good_edge_bps": min_good_edge_bps,
        "bad_edge_bps": bad_edge_bps,
        "feature_count": len(FEATURE_ORDER),
        "feature_candidate_count": int(len(feature_rows)),
        "stable_feature_count": stable_feature_count,
        "stable_positive_top_feature_count": stable_positive_top_feature_count,
        "tree_model_verdict": _tree_verdict(model_rows),
    }

    out_dir = root / "diagnostics"
    write_json(out_dir / "good_trade_structure_result.json", _jsonable(result))
    csv_outputs = (
        ("good_trade_split_summary.csv", split_summary),
        ("good_trade_feature_candidates.csv", feature_rows),
        ("good_trade_tree_model_top.csv", model_rows),
    )
    for file_name, table in csv_outputs:
        write_text(out_dir / file_name, table.to_csv(index=False))
    write_text(out_dir / "good_trade_structure_report.md", _markdown_report(result, split_summary, feature_rows, model_rows))
    logging.info(
        "trader.training.good_trade_structure_written runId=%s stableFeatureCount=%s stablePositiveTopFeatureCount=%s treeVerdict=%s",
        args.run_id,
        result["stable_feature_count"],
        result["stable_positive_top_feature_count"],
        result["tree_model_verdict"]["status"],
    )
|
||||
|
||||
|
||||
def _require_columns(dataset: pd.DataFrame) -> None:
    """Raise ``ValueError`` naming every column the good-trade diagnostic needs but lacks.

    Required are ``sample_id``, ``split_id``, every name in ``FEATURE_ORDER`` and
    the long / short actual plan edge columns.
    """
    required = {
        # ``_side_frame`` selects ``sample_id``; validating it here turns a bare
        # pandas KeyError into the same explicit message as the other columns.
        "sample_id",
        "split_id",
        *FEATURE_ORDER,
        "long_actual_plan_net_edge_bps",
        "short_actual_plan_net_edge_bps",
    }
    missing = sorted(required - set(dataset.columns))
    if missing:
        raise ValueError(f"good trade structure diagnostic missing required columns: {missing}")
|
||||
|
||||
|
||||
def _side_frame(dataset: pd.DataFrame, side: str, min_good_edge_bps: float, bad_edge_bps: float) -> pd.DataFrame:
    """Build the per-side working frame with trade-quality flags.

    Args:
        dataset: Entry dataset holding ``sample_id``, ``split_id``, the features
            and the per-side actual plan edge columns.
        side: ``"LONG"`` selects the long edge column; any other value selects
            the short one.
        min_good_edge_bps: Edge at or above which a row is flagged ``good_trade``.
        bad_edge_bps: Edge at or below which a row is flagged ``bad_trade``.

    Returns:
        A frame with ``actual_edge_bps``, ``side`` and int8 ``good_trade`` /
        ``breakeven_trade`` / ``bad_trade`` flags; rows whose edge is not numeric
        are dropped and the index is reset.
    """
    prefix = "long" if side == "LONG" else "short"
    edge_col = f"{prefix}_actual_plan_net_edge_bps"
    selected = dataset[["sample_id", "split_id", edge_col, *FEATURE_ORDER]].rename(columns={edge_col: "actual_edge_bps"})
    edge = pd.to_numeric(selected["actual_edge_bps"], errors="coerce")
    flagged = selected.assign(
        side=side,
        actual_edge_bps=edge,
        good_trade=edge.ge(min_good_edge_bps).astype("int8"),
        breakeven_trade=edge.ge(0.0).astype("int8"),
        bad_trade=edge.le(bad_edge_bps).astype("int8"),
    )
    # Rows with an unparseable edge got 0 flags above and are removed here.
    usable = flagged.dropna(subset=["actual_edge_bps"])
    return usable.reset_index(drop=True)
|
||||
|
||||
|
||||
def _split_summary(frame: pd.DataFrame, side: str) -> pd.DataFrame:
    """Summarise trade-quality rates and edge quantiles per split for one side.

    Splits with no rows are omitted.

    Returns:
        One row per non-empty split in ``ALL_SPLITS`` order with row count,
        good / breakeven / bad rates, mean edge and the 50th / 90th / 99th
        edge percentiles.
    """
    records: list[dict[str, Any]] = []
    for split_id in ALL_SPLITS:
        in_split = frame["split_id"].eq(split_id)
        if not in_split.any():
            continue
        part = frame[in_split]
        edge = part["actual_edge_bps"].astype(float)
        record: dict[str, Any] = {"side": side, "split_id": split_id, "rows": int(len(part))}
        record["good_rate"] = float(part["good_trade"].mean())
        record["breakeven_rate"] = float(part["breakeven_trade"].mean())
        record["bad_rate"] = float(part["bad_trade"].mean())
        record["avg_edge_bps"] = float(edge.mean())
        record["p50_edge_bps"] = float(edge.quantile(0.50))
        record["p90_edge_bps"] = float(edge.quantile(0.90))
        record["p99_edge_bps"] = float(edge.quantile(0.99))
        records.append(record)
    return pd.DataFrame(records)
|
||||
|
||||
|
||||
def _feature_candidates(frame: pd.DataFrame, side: str, top_fractions: tuple[float, ...]) -> pd.DataFrame:
    """Score every single feature as a raw ranker of good trades.

    The tune-split AUC decides whether high or low values of a feature are
    "better"; that direction is then checked on each evaluation split through a
    directional AUC and the metrics of the top fraction of rows.

    Args:
        frame: Per-side frame with ``split_id``, ``good_trade``,
            ``actual_edge_bps`` and the feature columns.
        side: Side label copied into every row.
        top_fractions: Only the first fraction is used for the top-row metrics.

    Returns:
        One row per feature that has a usable tune AUC, sorted by ``score``
        descending; a column-less empty frame when no feature qualifies.
    """
    rows: list[dict[str, Any]] = []
    tune = frame[frame["split_id"].eq(TUNE_SPLIT)]
    # The per-split slices do not depend on the feature, so filter once up front
    # instead of once per feature.
    eval_parts = {split_id: frame[frame["split_id"].eq(split_id)] for split_id in EVAL_SPLITS}
    for feature in FEATURE_ORDER:
        tune_auc = _raw_auc(tune, feature)
        if tune_auc is None:
            continue
        direction = "HIGH" if tune_auc >= 0.5 else "LOW"
        fraction = top_fractions[0]
        top_label = f"top{_fraction_label(fraction)}"
        row: dict[str, Any] = {
            "side": side,
            "feature": feature,
            "better_when": direction,
            "tune_raw_auc": float(tune_auc),
        }
        directional_aucs: list[float] = []
        top_edges: list[float] = []
        top_good_rates: list[float] = []
        for split_id, part in eval_parts.items():
            directional_auc = _directional_auc(part, feature, direction)
            top_metrics = _feature_top_metrics(part, feature, direction, fraction)
            row[f"{split_id}_directional_auc"] = directional_auc
            row[f"{split_id}_{top_label}_rows"] = top_metrics["rows"]
            row[f"{split_id}_{top_label}_good_rate"] = top_metrics["good_rate"]
            row[f"{split_id}_{top_label}_avg_edge_bps"] = top_metrics["avg_edge_bps"]
            if directional_auc is not None:
                directional_aucs.append(float(directional_auc))
            if top_metrics["rows"] > 0:
                top_edges.append(float(top_metrics["avg_edge_bps"]))
                top_good_rates.append(float(top_metrics["good_rate"]))
        row["min_eval_directional_auc"] = min(directional_aucs) if directional_aucs else np.nan
        row["min_top_avg_edge_bps"] = min(top_edges) if top_edges else np.nan
        row["min_top_good_rate"] = min(top_good_rates) if top_good_rates else np.nan
        # "Stable" requires a value on every evaluation split, not just the available ones.
        row["stable_auc"] = bool(len(directional_aucs) == len(EVAL_SPLITS) and min(directional_aucs) >= 0.53)
        row["stable_positive_top_edge"] = bool(len(top_edges) == len(EVAL_SPLITS) and min(top_edges) > 0.0)
        if np.isfinite(row["min_eval_directional_auc"]) and np.isfinite(row["min_top_avg_edge_bps"]):
            row["score"] = (
                float(row["min_eval_directional_auc"]) * 10.0
                + float(row["min_top_avg_edge_bps"]) * 0.10
                + (2.0 if row["stable_auc"] else 0.0)
                + (3.0 if row["stable_positive_top_edge"] else 0.0)
            )
        else:
            # Features that could not be evaluated sort to the bottom.
            row["score"] = -999.0
        rows.append(row)
    if not rows:
        return pd.DataFrame()
    return pd.DataFrame(rows).sort_values("score", ascending=False).reset_index(drop=True)
|
||||
|
||||
|
||||
def _raw_auc(frame: pd.DataFrame, feature: str) -> float | None:
|
||||
values = pd.to_numeric(frame[feature], errors="coerce").replace([np.inf, -np.inf], np.nan)
|
||||
working = pd.DataFrame({"x": values, "y": frame["good_trade"].astype(int)}).dropna()
|
||||
if len(working) < 1000 or working["x"].nunique() < 2 or working["y"].nunique() < 2:
|
||||
return None
|
||||
return float(roc_auc_score(working["y"].to_numpy(), working["x"].to_numpy()))
|
||||
|
||||
|
||||
def _directional_auc(frame: pd.DataFrame, feature: str, direction: str) -> float | None:
    """Return the feature's AUC oriented so that the "better" direction scores high.

    For ``direction == "HIGH"`` this is the raw AUC; for any other direction it
    is ``1 - AUC``. ``None`` is passed through when the raw AUC is unavailable.
    """
    raw = _raw_auc(frame, feature)
    if raw is None:
        return None
    if direction == "HIGH":
        return float(raw)
    return float(1.0 - raw)
|
||||
|
||||
|
||||
def _feature_top_metrics(frame: pd.DataFrame, feature: str, direction: str, fraction: float) -> dict[str, Any]:
|
||||
values = pd.to_numeric(frame[feature], errors="coerce").replace([np.inf, -np.inf], np.nan)
|
||||
working = pd.DataFrame(
|
||||
{
|
||||
"x": values,
|
||||
"good_trade": frame["good_trade"].astype(int),
|
||||
"actual_edge_bps": frame["actual_edge_bps"].astype(float),
|
||||
}
|
||||
).dropna()
|
||||
if working.empty:
|
||||
return {"rows": 0, "good_rate": 0.0, "avg_edge_bps": 0.0}
|
||||
ascending = direction == "LOW"
|
||||
top = working.sort_values("x", ascending=ascending).head(max(1, int(len(working) * fraction)))
|
||||
return {
|
||||
"rows": int(len(top)),
|
||||
"good_rate": float(top["good_trade"].mean()),
|
||||
"avg_edge_bps": float(top["actual_edge_bps"].mean()),
|
||||
}
|
||||
|
||||
|
||||
def _tree_model_top_rows(frame: pd.DataFrame, side: str, top_fractions: tuple[float, ...]) -> pd.DataFrame:
    """Fit a gradient-boosting classifier on the fit split and score each eval split.

    Returns one row per (eval split, top fraction) holding the split AUC and
    the top-bucket metrics, or an empty frame when the fit split is empty or
    contains a single ``good_trade`` class.
    """
    fit_part = frame[frame["split_id"].eq(FIT_SPLIT)].copy()
    if fit_part.empty or fit_part["good_trade"].nunique() < 2:
        return pd.DataFrame()
    # Each side gets its own fixed seed.
    seed = 71 if side == "LONG" else 73
    classifier = HistGradientBoostingClassifier(
        max_iter=180,
        learning_rate=0.04,
        max_leaf_nodes=31,
        l2_regularization=0.02,
        early_stopping=True,
        random_state=seed,
    )
    classifier.fit(_x(fit_part), fit_part["good_trade"].astype(int).to_numpy())
    records: list[dict[str, Any]] = []
    for split_id in EVAL_SPLITS:
        eval_part = frame[frame["split_id"].eq(split_id)].copy()
        if eval_part.empty:
            continue
        scores = classifier.predict_proba(_x(eval_part))[:, 1]
        split_auc = _model_auc(eval_part["good_trade"].astype(int).to_numpy(), scores)
        records.extend(
            {
                "side": side,
                "split_id": split_id,
                "model": "HistGradientBoostingClassifier",
                "auc": split_auc,
                "top_fraction": fraction,
                **_top_fraction_metrics(eval_part, scores, fraction),
            }
            for fraction in top_fractions
        )
    return pd.DataFrame(records)
|
||||
|
||||
|
||||
def _model_auc(y_true: np.ndarray, proba: np.ndarray) -> float | None:
|
||||
if len(np.unique(y_true)) < 2:
|
||||
return None
|
||||
return float(roc_auc_score(y_true, proba))
|
||||
|
||||
|
||||
def _top_fraction_metrics(frame: pd.DataFrame, score: np.ndarray, fraction: float) -> dict[str, Any]:
|
||||
working = frame[["good_trade", "actual_edge_bps"]].copy()
|
||||
working["score"] = score
|
||||
top = working.sort_values("score", ascending=False).head(max(1, int(len(working) * fraction)))
|
||||
return {
|
||||
"rows": int(len(top)),
|
||||
"good_rate": float(top["good_trade"].mean()),
|
||||
"avg_edge_bps": float(top["actual_edge_bps"].mean()),
|
||||
"p50_edge_bps": float(top["actual_edge_bps"].quantile(0.50)),
|
||||
"p90_edge_bps": float(top["actual_edge_bps"].quantile(0.90)),
|
||||
}
|
||||
|
||||
|
||||
def _tree_verdict(model_rows: pd.DataFrame) -> dict[str, Any]:
|
||||
if model_rows.empty:
|
||||
return {"status": "NO_MODEL_ROWS", "reason": "没有足够样本训练树模型诊断。"}
|
||||
top10 = model_rows[model_rows["top_fraction"].eq(0.10)].copy()
|
||||
if top10.empty:
|
||||
return {"status": "NO_TOP10_ROWS", "reason": "没有 top10 诊断结果。"}
|
||||
grouped = top10.groupby("side", observed=False)
|
||||
promising = []
|
||||
for side, part in grouped:
|
||||
if set(part["split_id"]) >= set(EVAL_SPLITS) and part["avg_edge_bps"].min() > 0.0 and part["auc"].min() >= 0.56:
|
||||
promising.append(str(side))
|
||||
if promising:
|
||||
return {"status": "PROMISING_TREE_STRUCTURE", "reason": f"树模型 top10 在这些方向三段为正: {promising}"}
|
||||
return {"status": "NO_STABLE_TREE_STRUCTURE", "reason": "树模型 top10 也没有在 tune/validation/latest 三段同时转正。"}
|
||||
|
||||
|
||||
def _x(frame: pd.DataFrame) -> np.ndarray:
    """Build the float32 model input matrix from the ``FEATURE_ORDER`` columns.

    Values that cannot be parsed as numbers, and infinities, become NaN.
    """
    features = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce")
    finite = features.replace([np.inf, -np.inf], np.nan)
    return finite.astype("float32").to_numpy()
|
||||
|
||||
|
||||
def _markdown_report(result: dict[str, Any], split_summary: pd.DataFrame, feature_rows: pd.DataFrame, model_rows: pd.DataFrame) -> str:
    """Assemble the Markdown diagnostics report for the good-trade structure analysis.

    The report has a header with the run settings and tree verdict, the split
    summary table, the 25 leading single-feature rows, the tree-model top-bucket
    table, and a list of the CSV artefacts.
    """
    top_fraction = 0.10
    verdict = result["tree_model_verdict"]
    header = [
        "# 好单结构诊断报告",
        "",
        "这份报告只看一件事:现有 54 个特征能不能把真实赚钱单和亏钱单分开。",
        "",
        f"- run_id: `{result['run_id']}`",
        f"- 好单定义: 当前价格计划真实净收益 >= `{result['min_good_edge_bps']}` bps",
        f"- 坏单辅助定义: 当前价格计划真实净收益 <= `{result['bad_edge_bps']}` bps",
        f"- 树模型诊断结论: `{verdict['status']}`",
        f"- 结论说明: {verdict['reason']}",
        "",
    ]
    distribution = ["## 基础分布", "", _markdown_table(split_summary), ""]
    single_feature = [
        "## 单特征分辨力",
        "",
        f"- 稳定 AUC 特征数: `{result['stable_feature_count']}`",
        f"- top {_fraction_label(top_fraction)} 平均收益三段都为正的特征数: `{result['stable_positive_top_feature_count']}`",
        "",
        _markdown_table(_feature_display(feature_rows, top_fraction).head(25)),
    ]
    if model_rows.empty:
        model_display = pd.DataFrame()
    else:
        model_display = model_rows.sort_values(["side", "top_fraction", "split_id"]).copy()
    tree_section = ["", "## 树模型 top 分桶", "", _markdown_table(model_display)]
    files_section = [
        "",
        "## 文件",
        "",
        "- `diagnostics/good_trade_split_summary.csv`: 好单/坏单基础分布。",
        "- `diagnostics/good_trade_feature_candidates.csv`: 单特征分辨力明细。",
        "- `diagnostics/good_trade_tree_model_top.csv`: 树模型 top 分桶明细。",
        "",
    ]
    return "\n".join([*header, *distribution, *single_feature, *tree_section, *files_section])
|
||||
|
||||
|
||||
def _feature_display(feature_rows: pd.DataFrame, top_fraction: float) -> pd.DataFrame:
|
||||
if feature_rows.empty:
|
||||
return pd.DataFrame()
|
||||
label = _fraction_label(top_fraction)
|
||||
columns = [
|
||||
"side",
|
||||
"feature",
|
||||
"better_when",
|
||||
"min_eval_directional_auc",
|
||||
f"{TUNE_SPLIT}_top{label}_avg_edge_bps",
|
||||
f"{VALIDATION_LOCKED_SPLIT}_top{label}_avg_edge_bps",
|
||||
f"{LATEST_STRESS_SPLIT}_top{label}_avg_edge_bps",
|
||||
"min_top_avg_edge_bps",
|
||||
"min_top_good_rate",
|
||||
"stable_auc",
|
||||
"stable_positive_top_edge",
|
||||
"score",
|
||||
]
|
||||
return feature_rows[[column for column in columns if column in feature_rows.columns]].copy()
|
||||
|
||||
|
||||
def _markdown_table(frame: pd.DataFrame) -> str:
|
||||
if frame.empty:
|
||||
return "_无_"
|
||||
columns = list(frame.columns)
|
||||
lines = ["| " + " | ".join(columns) + " |", "| " + " | ".join(["---"] * len(columns)) + " |"]
|
||||
for _, row in frame.iterrows():
|
||||
lines.append("| " + " | ".join(_format_cell(row[column]) for column in columns) + " |")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _format_cell(value: Any) -> str:
|
||||
if value is None or pd.isna(value):
|
||||
return ""
|
||||
if isinstance(value, (float, np.floating)):
|
||||
return f"{float(value):.6g}"
|
||||
if isinstance(value, (bool, np.bool_)):
|
||||
return "true" if bool(value) else "false"
|
||||
return str(value)
|
||||
|
||||
|
||||
def _fraction_label(fraction: float) -> str:
|
||||
return str(int(round(fraction * 100)))
|
||||
|
||||
|
||||
def _jsonable(value: Any) -> Any:
|
||||
if isinstance(value, dict):
|
||||
return {str(key): _jsonable(item) for key, item in value.items()}
|
||||
if isinstance(value, list):
|
||||
return [_jsonable(item) for item in value]
|
||||
if isinstance(value, (np.integer,)):
|
||||
return int(value)
|
||||
if isinstance(value, (np.floating,)):
|
||||
return float(value)
|
||||
if isinstance(value, np.ndarray):
|
||||
return value.tolist()
|
||||
return value
|
||||
@@ -0,0 +1,474 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import itertools
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
|
||||
|
||||
from trader_training.io_utils import read_parquet, run_root, write_json, write_parquet, write_text
|
||||
from trader_training.pm import _pm_config_from_thresholds, _pm_frame, _price_plan_context, _simulate_open_trades, _trade_metrics
|
||||
from trader_training.schemas import FEATURE_ORDER, FIT_SPLIT, LATEST_STRESS_SPLIT, TUNE_SPLIT, VALIDATION_LOCKED_SPLIT
|
||||
|
||||
|
||||
# Splits the probe evaluates on: tune, locked validation, then latest stress.
EVAL_SPLITS = (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
|
||||
|
||||
|
||||
def probe_nonlinear_pm(args: Any) -> None:
    """Run the diagnostic-only nonlinear (tree model) PM probe and write its artefacts.

    Fits gradient-boosting Entry heads (plus a Direction model in
    ``direction_entry_tree`` mode), scores every threshold candidate on the
    tune split, replays the best candidate on each evaluation split, and
    writes JSON/CSV/Parquet/Markdown outputs under ``<run root>/diagnostics``.

    Args:
        args: Parsed CLI namespace. ``run_id`` is read directly; ``probe_mode``,
            ``entry_train_filter`` and ``entry_opportunity_bps`` are optional.

    Raises:
        ValueError: If no threshold candidate produced a best score.
    """
    root = run_root(args)
    direction_dataset = read_parquet(root / "dataset" / "direction_train.parquet")
    entry_dataset = read_parquet(root / "dataset" / "entry_train.parquet")
    probe_mode = _probe_mode(args)
    entry_train_filter = _entry_train_filter(args)
    # Only fall back to 40 bps when the option is absent; an explicit 0.0 is a
    # legitimate threshold and must not be replaced by the default.
    raw_opportunity = getattr(args, "entry_opportunity_bps", None)
    if raw_opportunity is None or raw_opportunity == "":
        entry_opportunity_bps = 40.0
    else:
        entry_opportunity_bps = float(raw_opportunity)
    direction_model = _fit_direction_model(direction_dataset) if probe_mode == "direction_entry_tree" else None
    entry_models = _fit_entry_models(direction_dataset, entry_dataset, entry_train_filter, entry_opportunity_bps)
    frames = {
        split_id: _prediction_frame(root, split_id, direction_dataset, entry_dataset, direction_model, entry_models)
        for split_id in EVAL_SPLITS
    }
    price_plan = _price_plan_context(root)
    candidates = _expanded_threshold_candidates()
    tune_rows: list[dict[str, Any]] = []
    best_thresholds: dict[str, float] | None = None
    best_tune_metrics: dict[str, Any] | None = None
    best_score = -float("inf")

    # Threshold search runs on the tune split only; the first candidate that
    # reaches the highest score wins ties.
    for thresholds in candidates:
        trades = _simulate_open_trades(frames[TUNE_SPLIT], thresholds, _pm_config_from_thresholds(thresholds), price_plan)
        metrics = _trade_metrics(trades)
        score = _probe_score(metrics)
        tune_rows.append({**thresholds, **metrics, "score": score})
        if score > best_score:
            best_score = score
            best_thresholds = thresholds
            best_tune_metrics = metrics

    if best_thresholds is None or best_tune_metrics is None:
        raise ValueError("nonlinear PM probe did not evaluate any threshold candidate")

    # Replay the selected thresholds on every evaluation split.
    split_metrics: dict[str, Any] = {}
    split_trade_frames: dict[str, pd.DataFrame] = {}
    for split_id, frame in frames.items():
        trades = _simulate_open_trades(frame, best_thresholds, _pm_config_from_thresholds(best_thresholds), price_plan)
        trades = trades.copy()
        trades["eval_split"] = split_id
        split_trade_frames[split_id] = trades
        split_metrics[split_id] = _trade_metrics(trades)
    side_metrics = _side_metrics(split_trade_frames)

    tune_frame = pd.DataFrame(tune_rows).sort_values("score", ascending=False).reset_index(drop=True)
    result = {
        "run_id": args.run_id,
        "purpose": "diagnostic_only_not_exported",
        "model_family": "sklearn_hist_gradient_boosting",
        "probe_mode": probe_mode,
        "entry_train_filter": entry_train_filter,
        "entry_opportunity_bps": entry_opportunity_bps,
        "candidate_count": len(candidates),
        "candidate_summary": _candidate_summary(tune_frame),
        "best_thresholds": best_thresholds,
        "best_tune_metrics": best_tune_metrics,
        "split_metrics": split_metrics,
        "side_metrics": side_metrics,
        "verdict": _verdict(split_metrics),
    }
    out_dir = root / "diagnostics"
    output_stem = _output_stem(probe_mode)
    # Empty per-split trade frames are left out of the concatenation.
    trade_parts = [trades for trades in split_trade_frames.values() if not trades.empty]
    best_trade_frame = pd.concat(trade_parts, ignore_index=True) if trade_parts else pd.DataFrame()
    write_json(out_dir / f"{output_stem}_result.json", _jsonable(result))
    write_text(out_dir / f"{output_stem}_candidates.csv", tune_frame.head(200).to_csv(index=False))
    write_parquet(out_dir / f"{output_stem}_best_trades.parquet", best_trade_frame)
    write_text(out_dir / f"{output_stem}_side_metrics.csv", _side_metrics_frame(side_metrics).to_csv(index=False))
    write_text(out_dir / f"{output_stem}_report.md", _markdown_report(result, tune_frame.head(20)))
    logging.info(
        "trader.training.nonlinear_pm_probe_written runId=%s probeMode=%s entryTrainFilter=%s verdict=%s tuneTrades=%s validationTrades=%s stressTrades=%s",
        args.run_id,
        probe_mode,
        entry_train_filter,
        result["verdict"]["status"],
        split_metrics[TUNE_SPLIT]["trade_count"],
        split_metrics[VALIDATION_LOCKED_SPLIT]["trade_count"],
        split_metrics[LATEST_STRESS_SPLIT]["trade_count"],
    )
|
||||
|
||||
|
||||
def _probe_mode(args: Any) -> str:
|
||||
mode = str(getattr(args, "probe_mode", "direction_entry_tree") or "direction_entry_tree").strip().lower()
|
||||
allowed = {"direction_entry_tree", "entry_tree_only"}
|
||||
if mode not in allowed:
|
||||
raise ValueError(f"unsupported nonlinear PM probe mode: {mode}")
|
||||
return mode
|
||||
|
||||
|
||||
def _entry_train_filter(args: Any) -> str:
|
||||
value = str(getattr(args, "entry_train_filter", "direction_label") or "direction_label").strip().lower()
|
||||
allowed = {"direction_label", "side_opportunity"}
|
||||
if value not in allowed:
|
||||
raise ValueError(f"unsupported nonlinear Entry train filter: {value}")
|
||||
return value
|
||||
|
||||
|
||||
def _output_stem(probe_mode: str) -> str:
|
||||
return "nonlinear_pm_probe" if probe_mode == "direction_entry_tree" else f"nonlinear_pm_probe_{probe_mode}"
|
||||
|
||||
|
||||
def _fit_direction_model(dataset: pd.DataFrame) -> HistGradientBoostingClassifier:
    """Fit the three-class Direction classifier on the fit split.

    The class label is the arg-max position over the ``long_target``,
    ``short_target`` and ``neutral_target`` columns (0, 1 and 2 respectively).
    """
    fit_rows = dataset[dataset["split_id"].eq(FIT_SPLIT)].copy()
    target_columns = ["long_target", "short_target", "neutral_target"]
    labels = fit_rows[target_columns].to_numpy().argmax(axis=1)
    hyperparams = {
        "max_iter": 160,
        "learning_rate": 0.04,
        "max_leaf_nodes": 31,
        "l2_regularization": 0.02,
        "early_stopping": True,
        "random_state": 41,
    }
    classifier = HistGradientBoostingClassifier(**hyperparams)
    classifier.fit(_x(fit_rows), labels)
    return classifier
|
||||
|
||||
|
||||
def _fit_entry_models(direction_dataset: pd.DataFrame, entry_dataset: pd.DataFrame, entry_train_filter: str, opportunity_bps: float) -> dict[str, Any]:
    """Fit the four Entry heads: a probability head and an edge regressor per side.

    Each side is trained on its own filtered fit frame. The returned dict is
    keyed by head name.
    """
    fit_frames = {
        side: _entry_side_fit_frame(direction_dataset, entry_dataset, side, entry_train_filter, opportunity_bps)
        for side in ("LONG", "SHORT")
    }
    long_rows = fit_frames["LONG"]
    short_rows = fit_frames["SHORT"]
    heads: dict[str, Any] = {}
    heads["long_entry_prob"] = _fit_binary_head(long_rows, "long_entry_target", seed=43)
    heads["short_entry_prob"] = _fit_binary_head(short_rows, "short_entry_target", seed=47)
    heads["long_expected_net_edge_bps"] = _fit_regression_head(long_rows, "long_actual_plan_net_edge_bps", seed=53)
    heads["short_expected_net_edge_bps"] = _fit_regression_head(short_rows, "short_actual_plan_net_edge_bps", seed=59)
    return heads
|
||||
|
||||
|
||||
def _entry_side_fit_frame(
    direction_dataset: pd.DataFrame,
    entry_dataset: pd.DataFrame,
    side: str,
    entry_train_filter: str,
    opportunity_bps: float,
) -> pd.DataFrame:
    """Select the fit-split Entry rows used to train one side's heads.

    ``direction_label`` keeps rows whose Direction label for ``side`` equals 1
    (joined on ``sample_id``); ``side_opportunity`` keeps rows whose
    ``<side>_max_achievable_net_edge_bps`` is at least ``opportunity_bps``.

    Raises:
        ValueError: On missing columns, an empty direction-label join, or an
            unsupported filter name.
    """
    prefix = side.lower()
    fit_rows = entry_dataset[entry_dataset["split_id"].eq(FIT_SPLIT)].copy()
    if entry_train_filter == "direction_label":
        label_column = f"{prefix}_target"
        required = {"sample_id", label_column}
        missing = sorted(required - set(direction_dataset.columns))
        if missing:
            raise ValueError(f"direction dataset missing columns for nonlinear Entry filter: {missing}")
        direction_labels = direction_dataset[list(required)]
        fit_rows = fit_rows.merge(direction_labels, on="sample_id", how="inner", validate="one_to_one")
        if not len(fit_rows):
            raise ValueError(f"nonlinear Entry {side} direction-label filter produced no rows")
        # Unparseable or missing labels are treated as 0 (not selected).
        label_values = pd.to_numeric(fit_rows[label_column], errors="coerce").fillna(0).astype(int)
        keep = label_values.eq(1)
        filter_name = f"DIRECTION_LABEL_{side}_FIT_ROWS"
    elif entry_train_filter == "side_opportunity":
        opportunity_column = f"{prefix}_max_achievable_net_edge_bps"
        if opportunity_column not in fit_rows.columns:
            raise ValueError(f"entry dataset missing {opportunity_column} for nonlinear Entry side-opportunity filter")
        opportunity = pd.to_numeric(fit_rows[opportunity_column], errors="coerce")
        keep = opportunity.ge(opportunity_bps).fillna(False)
        filter_name = f"SIDE_OPPORTUNITY_{side}_GE_{opportunity_bps:g}_BPS_FIT_ROWS"
    else:
        raise ValueError(f"unsupported nonlinear Entry train filter: {entry_train_filter}")
    selected = fit_rows.loc[keep].copy()
    logging.info(
        "trader.training.nonlinear_entry_fit_frame side=%s filter=%s rows=%s totalFitRows=%s",
        side,
        filter_name,
        len(selected),
        len(fit_rows),
    )
    return selected
|
||||
|
||||
|
||||
def _fit_binary_head(train: pd.DataFrame, target: str, seed: int) -> HistGradientBoostingClassifier:
|
||||
if len(train) < 1000:
|
||||
raise ValueError(f"not enough rows to train nonlinear Entry head {target}: {len(train)}")
|
||||
y = train[target].astype(int).to_numpy()
|
||||
if len(np.unique(y)) < 2:
|
||||
raise ValueError(f"nonlinear Entry head {target} has only one class")
|
||||
model = HistGradientBoostingClassifier(
|
||||
max_iter=180,
|
||||
learning_rate=0.04,
|
||||
max_leaf_nodes=31,
|
||||
l2_regularization=0.02,
|
||||
early_stopping=True,
|
||||
random_state=seed,
|
||||
)
|
||||
model.fit(_x(train), y)
|
||||
return model
|
||||
|
||||
|
||||
def _fit_regression_head(train: pd.DataFrame, target: str, seed: int) -> HistGradientBoostingRegressor:
|
||||
if len(train) < 1000:
|
||||
raise ValueError(f"not enough rows to train nonlinear Entry head {target}: {len(train)}")
|
||||
model = HistGradientBoostingRegressor(
|
||||
max_iter=180,
|
||||
learning_rate=0.04,
|
||||
max_leaf_nodes=31,
|
||||
l2_regularization=0.02,
|
||||
early_stopping=True,
|
||||
random_state=seed,
|
||||
)
|
||||
model.fit(_x(train), train[target].astype(float).to_numpy())
|
||||
return model
|
||||
|
||||
|
||||
def _prediction_frame(
    root,
    split_id: str,
    direction_dataset: pd.DataFrame,
    entry_dataset: pd.DataFrame,
    direction_model: HistGradientBoostingClassifier | None,
    entry_models: dict[str, Any],
) -> pd.DataFrame:
    """Build the PM input frame for ``split_id`` with tree-model predictions swapped in.

    The Entry probability and expected-edge columns are always replaced. The
    Direction probability columns are replaced only when ``direction_model``
    is given; otherwise the existing ones in the PM frame are kept.

    Raises:
        ValueError: If joining the predictions changes the PM frame's row count.
    """
    frame = _pm_frame(root, split_id).copy()
    entry_split = entry_dataset[entry_dataset["split_id"].eq(split_id)].copy()

    # Build the feature matrix once; all four heads score the same rows.
    entry_features = _x(entry_split)
    entry_pred = entry_split[["sample_id"]].copy()
    entry_pred["long_entry_prob"] = entry_models["long_entry_prob"].predict_proba(entry_features)[:, 1]
    entry_pred["short_entry_prob"] = entry_models["short_entry_prob"].predict_proba(entry_features)[:, 1]
    entry_pred["pred_long_expected_net_edge_bps"] = entry_models["long_expected_net_edge_bps"].predict(entry_features)
    entry_pred["pred_short_expected_net_edge_bps"] = entry_models["short_expected_net_edge_bps"].predict(entry_features)
    replacements = entry_pred
    drop_columns = [
        "long_entry_prob",
        "short_entry_prob",
        "pred_long_expected_net_edge_bps",
        "pred_short_expected_net_edge_bps",
    ]
    if direction_model is not None:
        direction_split = direction_dataset[direction_dataset["split_id"].eq(split_id)].copy()
        direction_proba = direction_model.predict_proba(_x(direction_split))
        direction_pred = direction_split[["sample_id"]].copy()
        # predict_proba only has columns for classes seen during fit, ordered
        # as classes_. Look each label (0=long, 1=short, 2=neutral) up by value
        # so a class missing from the fit split yields 0.0 instead of a
        # mislabelled column or an IndexError.
        column_of_class = {int(label): position for position, label in enumerate(direction_model.classes_)}
        for class_id, column in enumerate(("long_prob", "short_prob", "neutral_prob")):
            position = column_of_class.get(class_id)
            direction_pred[column] = 0.0 if position is None else direction_proba[:, position]
        replacements = direction_pred.merge(entry_pred, on="sample_id", how="inner", validate="one_to_one")
        drop_columns.extend(["long_prob", "short_prob", "neutral_prob"])
    out = frame.drop(columns=drop_columns, errors="ignore").merge(replacements, on="sample_id", how="inner", validate="one_to_one")
    if len(out) != len(frame):
        raise ValueError(f"nonlinear prediction frame lost rows for {split_id}: before={len(frame)} after={len(out)}")
    return out
|
||||
|
||||
|
||||
def _expanded_threshold_candidates() -> list[dict[str, float]]:
|
||||
# 多头和空头在不同市场段里的可靠性可能完全不同;这里分开搜,
|
||||
# 1.01 表示这一侧不开仓,用来检查只做多或只做空是否更稳。
|
||||
values = itertools.product(
|
||||
[0.20, 0.30, 0.40, 0.50, 0.60, 1.01],
|
||||
[0.20, 0.30, 0.40, 0.50, 0.60, 1.01],
|
||||
[0.05, 0.10, 0.20, 0.30, 0.40, 0.50],
|
||||
[0.45, 0.65, 0.85, 1.00],
|
||||
[-5.0, 0.0, 3.0, 5.0, 8.0],
|
||||
[0.00, 0.01, 0.02, 0.05],
|
||||
)
|
||||
return [
|
||||
{
|
||||
"long_open_prob": long_prob,
|
||||
"short_open_prob": short_prob,
|
||||
"min_entry_prob": entry_prob,
|
||||
"max_market_risk_prob": risk_prob,
|
||||
"min_expected_edge_bps": edge_bps,
|
||||
"min_direction_margin": margin,
|
||||
}
|
||||
for long_prob, short_prob, entry_prob, risk_prob, edge_bps, margin in values
|
||||
]
|
||||
|
||||
|
||||
def _probe_score(metrics: dict[str, Any]) -> float:
|
||||
if metrics["trade_count"] == 0:
|
||||
return -1_000_000.0
|
||||
sample_penalty = max(0, 80 - int(metrics["trade_count"])) * 2.0
|
||||
return (
|
||||
float(metrics["avg_weighted_edge_bps"]) * np.sqrt(float(metrics["trade_count"]))
|
||||
+ float(metrics["total_weighted_edge_bps"]) * 0.03
|
||||
- float(metrics["max_drawdown_bps"]) * 0.20
|
||||
- sample_penalty
|
||||
)
|
||||
|
||||
|
||||
def _side_metrics(split_trade_frames: dict[str, pd.DataFrame]) -> dict[str, dict[str, dict[str, Any]]]:
    """Compute trade and exit metrics per split and per side (LONG, SHORT)."""
    result: dict[str, dict[str, dict[str, Any]]] = {}
    for split_id, trades in split_trade_frames.items():
        per_side: dict[str, dict[str, Any]] = {}
        for side in ("LONG", "SHORT"):
            # An empty trade frame is copied as-is, without touching its "side" column.
            if trades.empty:
                subset = trades.copy()
            else:
                subset = trades[trades["side"].eq(side)].copy()
            per_side[side] = {**_trade_metrics(subset), **_exit_metrics(subset)}
        result[split_id] = per_side
    return result
|
||||
|
||||
|
||||
def _exit_metrics(trades: pd.DataFrame) -> dict[str, float]:
|
||||
if trades.empty:
|
||||
return {
|
||||
"target_hit_rate": 0.0,
|
||||
"stop_hit_rate": 0.0,
|
||||
"timeout_exit_rate": 0.0,
|
||||
"avg_time_to_exit_min": 0.0,
|
||||
"p50_time_to_exit_min": 0.0,
|
||||
}
|
||||
target_hit = pd.to_numeric(trades["target_hit"], errors="coerce").fillna(0).astype(int)
|
||||
stop_hit = pd.to_numeric(trades["stop_hit"], errors="coerce").fillna(0).astype(int)
|
||||
time_to_exit_min = pd.to_numeric(trades["time_to_exit_ms"], errors="coerce").fillna(0.0).astype(float) / 60_000.0
|
||||
return {
|
||||
"target_hit_rate": float(target_hit.eq(1).mean()),
|
||||
"stop_hit_rate": float(stop_hit.eq(1).mean()),
|
||||
"timeout_exit_rate": float((target_hit.ne(1) & stop_hit.ne(1)).mean()),
|
||||
"avg_time_to_exit_min": float(time_to_exit_min.mean()),
|
||||
"p50_time_to_exit_min": float(time_to_exit_min.median()),
|
||||
}
|
||||
|
||||
|
||||
def _side_metrics_frame(side_metrics: dict[str, dict[str, dict[str, Any]]]) -> pd.DataFrame:
|
||||
rows: list[dict[str, Any]] = []
|
||||
for split_id, split_metrics in side_metrics.items():
|
||||
for side, metrics in split_metrics.items():
|
||||
rows.append({"split_id": split_id, "side": side, **metrics})
|
||||
return pd.DataFrame(rows)
|
||||
|
||||
|
||||
def _candidate_summary(tune_frame: pd.DataFrame) -> dict[str, Any]:
|
||||
if tune_frame.empty:
|
||||
return {
|
||||
"positive_avg_weighted_candidates": 0,
|
||||
"positive_total_weighted_candidates": 0,
|
||||
"best_avg_weighted_edge_bps": 0.0,
|
||||
"best_total_weighted_edge_bps": 0.0,
|
||||
"min_viable_trade_count": 80,
|
||||
"positive_avg_weighted_viable_candidates": 0,
|
||||
"positive_total_weighted_viable_candidates": 0,
|
||||
"best_viable_avg_weighted_edge_bps": 0.0,
|
||||
"best_viable_total_weighted_edge_bps": 0.0,
|
||||
}
|
||||
viable = tune_frame[tune_frame["trade_count"] >= 80]
|
||||
return {
|
||||
"positive_avg_weighted_candidates": int((tune_frame["avg_weighted_edge_bps"] > 0).sum()),
|
||||
"positive_total_weighted_candidates": int((tune_frame["total_weighted_edge_bps"] > 0).sum()),
|
||||
"best_avg_weighted_edge_bps": float(tune_frame["avg_weighted_edge_bps"].max()),
|
||||
"best_total_weighted_edge_bps": float(tune_frame["total_weighted_edge_bps"].max()),
|
||||
"min_viable_trade_count": 80,
|
||||
"positive_avg_weighted_viable_candidates": int((viable["avg_weighted_edge_bps"] > 0).sum()),
|
||||
"positive_total_weighted_viable_candidates": int((viable["total_weighted_edge_bps"] > 0).sum()),
|
||||
"best_viable_avg_weighted_edge_bps": float(viable["avg_weighted_edge_bps"].max()) if not viable.empty else 0.0,
|
||||
"best_viable_total_weighted_edge_bps": float(viable["total_weighted_edge_bps"].max()) if not viable.empty else 0.0,
|
||||
}
|
||||
|
||||
|
||||
def _verdict(metrics: dict[str, Any]) -> dict[str, Any]:
    """Return the probe verdict from the per-split trade metrics.

    The probe passes when tune/validation/stress have at least 80/40/10 trades,
    tune and validation average weighted edges are positive, and the stress
    average weighted edge is above -1.0 bps.
    """
    tune, validation, stress = (
        metrics[split_id] for split_id in (TUNE_SPLIT, VALIDATION_LOCKED_SPLIT, LATEST_STRESS_SPLIT)
    )
    enough_trades = (
        tune["trade_count"] >= 80
        and validation["trade_count"] >= 40
        and stress["trade_count"] >= 10
    )
    passed = enough_trades and (
        tune["avg_weighted_edge_bps"] > 0
        and validation["avg_weighted_edge_bps"] > 0
        and stress["avg_weighted_edge_bps"] > -1.0
    )
    if passed:
        status = "PROMISING_DIAGNOSTIC_ONLY"
    else:
        status = "NO_STABLE_NONLINEAR_PM_EDGE"
    return {
        "status": status,
        "reason": "只用于判断树模型方向是否值得继续工程化,不代表可上线。",
    }
|
||||
|
||||
|
||||
def _x(frame: pd.DataFrame) -> np.ndarray:
    """Build the float32 model input matrix from the ``FEATURE_ORDER`` columns.

    Values that cannot be parsed as numbers, and infinities, become NaN.
    """
    features = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce")
    finite = features.replace([np.inf, -np.inf], np.nan)
    return finite.astype("float32").to_numpy()
|
||||
|
||||
|
||||
def _markdown_report(result: dict[str, Any], top_candidates: pd.DataFrame) -> str:
    """Render the nonlinear PM probe result as a Markdown report.

    Sections: a run summary, per-split metrics, a per-split/per-side breakdown,
    and a table of the leading tune candidates.
    """
    if result["probe_mode"] == "entry_tree_only":
        mode_text = "只替换 Entry,Direction 使用当前模型输出。"
    else:
        mode_text = "Direction 和 Entry 都替换成树模型。"
    summary = result["candidate_summary"]

    def _row(cells: list[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    header = [
        "# Nonlinear PM Probe Report",
        "",
        "这份报告只做诊断,不导出上线模型。它回答:不加新特征,换成树模型后,PM 能不能筛出稳定正收益。",
        "",
        f"- run_id: `{result['run_id']}`",
        f"- probe_mode: `{result['probe_mode']}`",
        f"- 说明: {mode_text}",
        f"- Entry 训练人群: `{result['entry_train_filter']}`",
        f"- Entry 机会阈值: `{result['entry_opportunity_bps']}` bps",
        f"- verdict: `{result['verdict']['status']}`",
        f"- candidate_count: `{result['candidate_count']}`",
        f"- 正收益候选数: `{summary['positive_avg_weighted_candidates']}`",
        f"- 至少 80 单的正收益候选数: `{summary['positive_avg_weighted_viable_candidates']}`",
        f"- 至少 80 单的最好单笔加权收益: `{summary['best_viable_avg_weighted_edge_bps']:.4f}` bps",
        f"- best_thresholds: `{result['best_thresholds']}`",
        "",
        "## Split Metrics",
        "",
        "| split | trades | win_rate | avg_actual_bps | avg_weighted_bps | total_weighted_bps | profit_factor |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
    ]
    split_rows = [
        _row(
            [
                f"{split_id}",
                f"{metrics['trade_count']}",
                f"{metrics['win_rate']:.4f}",
                f"{metrics['avg_actual_edge_bps']:.4f}",
                f"{metrics['avg_weighted_edge_bps']:.4f}",
                f"{metrics['total_weighted_edge_bps']:.4f}",
                f"{metrics['profit_factor']:.4f}",
            ]
        )
        for split_id, metrics in result["split_metrics"].items()
    ]
    side_header = [
        "",
        "## Side Breakdown",
        "",
        "| split | side | trades | win_rate | avg_actual_bps | avg_weighted_bps | target_hit_rate | stop_hit_rate | timeout_rate | avg_exit_min |",
        "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
    ]
    side_rows = [
        _row(
            [
                f"{split_id}",
                f"{side}",
                f"{metrics['trade_count']}",
                f"{metrics['win_rate']:.4f}",
                f"{metrics['avg_actual_edge_bps']:.4f}",
                f"{metrics['avg_weighted_edge_bps']:.4f}",
                f"{metrics['target_hit_rate']:.4f}",
                f"{metrics['stop_hit_rate']:.4f}",
                f"{metrics['timeout_exit_rate']:.4f}",
                f"{metrics['avg_time_to_exit_min']:.2f}",
            ]
        )
        for split_id, per_side in result["side_metrics"].items()
        for side, metrics in per_side.items()
    ]
    footer = ["", "## Top Tune Candidates", "", _candidate_table(top_candidates), ""]
    return "\n".join([*header, *split_rows, *side_header, *side_rows, *footer])
|
||||
|
||||
|
||||
def _candidate_table(frame: pd.DataFrame) -> str:
|
||||
if frame.empty:
|
||||
return "无候选。"
|
||||
columns = [
|
||||
"long_open_prob",
|
||||
"short_open_prob",
|
||||
"min_entry_prob",
|
||||
"max_market_risk_prob",
|
||||
"min_expected_edge_bps",
|
||||
"min_direction_margin",
|
||||
"trade_count",
|
||||
"avg_weighted_edge_bps",
|
||||
"total_weighted_edge_bps",
|
||||
"profit_factor",
|
||||
"score",
|
||||
]
|
||||
available = [column for column in columns if column in frame.columns]
|
||||
lines = [
|
||||
"| " + " | ".join(available) + " |",
|
||||
"| " + " | ".join(["---" for _ in available]) + " |",
|
||||
]
|
||||
for _, row in frame[available].iterrows():
|
||||
values = []
|
||||
for column in available:
|
||||
value = row[column]
|
||||
if isinstance(value, (float, np.floating)):
|
||||
values.append(f"{float(value):.6f}")
|
||||
else:
|
||||
values.append(str(value))
|
||||
lines.append("| " + " | ".join(values) + " |")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _jsonable(value: Any) -> Any:
|
||||
if isinstance(value, dict):
|
||||
return {str(key): _jsonable(item) for key, item in value.items()}
|
||||
if isinstance(value, list):
|
||||
return [_jsonable(item) for item in value]
|
||||
if isinstance(value, (np.integer,)):
|
||||
return int(value)
|
||||
if isinstance(value, (np.floating,)):
|
||||
return float(value)
|
||||
if isinstance(value, np.ndarray):
|
||||
return value.tolist()
|
||||
return value
|
||||
@@ -337,20 +337,22 @@ def _load_direction_dataset(baseline_root: Path, feature: pd.DataFrame) -> pd.Da
|
||||
|
||||
|
||||
def _load_entry_dataset(baseline_root: Path, feature: pd.DataFrame) -> pd.DataFrame:
|
||||
labels = read_parquet(baseline_root / "label" / "entry_labels.parquet")
|
||||
required = {"sample_id", "side", "entry_target", "expected_net_edge_bps"}
|
||||
dataset_path = baseline_root / "dataset" / "entry_train.parquet"
|
||||
if not dataset_path.is_file():
|
||||
raise FileNotFoundError(f"entry_train dataset is required for OFI experiment: {dataset_path}")
|
||||
labels = read_parquet(dataset_path)
|
||||
required = {
|
||||
"sample_id",
|
||||
"long_entry_target",
|
||||
"short_entry_target",
|
||||
"long_actual_plan_net_edge_bps",
|
||||
"short_actual_plan_net_edge_bps",
|
||||
}
|
||||
missing = sorted(required.difference(labels.columns))
|
||||
if missing:
|
||||
raise ValueError(f"entry labels missing columns: {missing}")
|
||||
long = labels[labels["side"].eq("LONG")][["sample_id", "entry_target", "expected_net_edge_bps"]].rename(
|
||||
columns={"entry_target": "long_entry_target", "expected_net_edge_bps": "long_expected_net_edge_bps"}
|
||||
)
|
||||
short = labels[labels["side"].eq("SHORT")][["sample_id", "entry_target", "expected_net_edge_bps"]].rename(
|
||||
columns={"entry_target": "short_entry_target", "expected_net_edge_bps": "short_expected_net_edge_bps"}
|
||||
)
|
||||
pivot = long.merge(short, on="sample_id", how="inner")
|
||||
dataset = feature.merge(pivot, on="sample_id", how="inner")
|
||||
logging.info("trader.training.ofi_entry_dataset_loaded rowCount=%s", len(dataset))
|
||||
raise ValueError(f"entry_train dataset missing columns: {missing}")
|
||||
dataset = feature.merge(labels[list(required)], on="sample_id", how="inner")
|
||||
logging.info("trader.training.ofi_entry_dataset_loaded source=entry_train rowCount=%s", len(dataset))
|
||||
return dataset
|
||||
|
||||
|
||||
@@ -407,8 +409,8 @@ def _train_entry(frame: pd.DataFrame, feature_columns: list[str]) -> tuple[dict[
|
||||
specs = [
|
||||
("long_entry_prob", "binary", "long_entry_target"),
|
||||
("short_entry_prob", "binary", "short_entry_target"),
|
||||
("long_expected_net_edge_bps", "regression", "long_expected_net_edge_bps"),
|
||||
("short_expected_net_edge_bps", "regression", "short_expected_net_edge_bps"),
|
||||
("long_actual_plan_net_edge_bps", "regression", "long_actual_plan_net_edge_bps"),
|
||||
("short_actual_plan_net_edge_bps", "regression", "short_actual_plan_net_edge_bps"),
|
||||
]
|
||||
results: dict[str, Any] = {"feature_count": len(feature_columns), "feature_hash": sha256_json(feature_columns)}
|
||||
split_predictions: dict[str, pd.DataFrame] = {
|
||||
@@ -785,8 +787,8 @@ def _model_compare_report(args: Any, baseline_root: Path, results: dict[str, Any
|
||||
f"| Direction | neutral_auc | {baseline_direction.get('neutral_auc')} |",
|
||||
f"| Entry | long_auc | {baseline_entry['long_entry_prob'].get('auc')} |",
|
||||
f"| Entry | short_auc | {baseline_entry['short_entry_prob'].get('auc')} |",
|
||||
f"| Entry | long_edge_mae_ratio | {baseline_entry['long_expected_net_edge_bps'].get('mae_vs_constant_ratio')} |",
|
||||
f"| Entry | short_edge_mae_ratio | {baseline_entry['short_expected_net_edge_bps'].get('mae_vs_constant_ratio')} |",
|
||||
f"| Entry | long_exported_edge_mae_ratio | {baseline_entry['long_expected_net_edge_bps'].get('mae_vs_constant_ratio')} |",
|
||||
f"| Entry | short_exported_edge_mae_ratio | {baseline_entry['short_expected_net_edge_bps'].get('mae_vs_constant_ratio')} |",
|
||||
"",
|
||||
"## Diagnostic Direction Result",
|
||||
"",
|
||||
@@ -817,7 +819,7 @@ def _model_compare_report(args: Any, baseline_root: Path, results: dict[str, Any
|
||||
lines.append(
|
||||
f"| {head} | {feature_set_name} | {split_id} | {metric.get('auc')} | {metric.get('brier_vs_constant_ratio')} | {metric.get('top10_hit_rate')} |"
|
||||
)
|
||||
for head in ("long_expected_net_edge_bps", "short_expected_net_edge_bps"):
|
||||
for head in ("long_actual_plan_net_edge_bps", "short_actual_plan_net_edge_bps"):
|
||||
for split_id in EVAL_SPLITS:
|
||||
metric = entry.get(head, {}).get(split_id, {})
|
||||
lines.append(f"| {head} | {feature_set_name} | {split_id} | {metric.get('mae_vs_constant_ratio')} | | |")
|
||||
@@ -827,6 +829,7 @@ def _model_compare_report(args: Any, baseline_root: Path, results: dict[str, Any
|
||||
"## Verdict Rule",
|
||||
"",
|
||||
"只有 `market_plus_ofi` 在 validation_locked 和 latest_stress 上同时好过 `market_only`,才进入正式特征链路。",
|
||||
"Entry 的收益回归诊断使用 `actual_plan_net_edge_bps`,也就是真实按价格计划出场后的净收益。",
|
||||
"",
|
||||
]
|
||||
)
|
||||
|
||||
@@ -165,7 +165,8 @@ def integrated_backtest(args: Any) -> None:
|
||||
price_plan,
|
||||
)
|
||||
stress_trades["eval_split"] = LATEST_STRESS_SPLIT
|
||||
trades = pd.concat([tune_trades, validation_locked_trades, stress_trades], ignore_index=True)
|
||||
trade_parts = [part for part in (tune_trades, validation_locked_trades, stress_trades) if not part.empty]
|
||||
trades = pd.concat(trade_parts, ignore_index=True) if trade_parts else _empty_trade_frame()
|
||||
metrics = {
|
||||
TUNE_SPLIT: _trade_metrics(tune_trades),
|
||||
VALIDATION_LOCKED_SPLIT: _trade_metrics(validation_locked_trades),
|
||||
@@ -232,8 +233,8 @@ def _pm_frame(root, split_id: str) -> pd.DataFrame:
|
||||
price_plan = _price_plan_context(root)
|
||||
entry_dataset = read_parquet(root / "dataset" / "entry_train.parquet").rename(
|
||||
columns={
|
||||
"long_expected_net_edge_bps": "actual_long_expected_net_edge_bps",
|
||||
"short_expected_net_edge_bps": "actual_short_expected_net_edge_bps",
|
||||
"long_actual_plan_net_edge_bps": "actual_long_plan_edge_bps",
|
||||
"short_actual_plan_net_edge_bps": "actual_short_plan_edge_bps",
|
||||
}
|
||||
)
|
||||
entry_plan_outcome = _entry_plan_outcome_frame(root)
|
||||
@@ -245,7 +246,10 @@ def _pm_frame(root, split_id: str) -> pd.DataFrame:
|
||||
"pred_short_expected_net_edge_bps",
|
||||
]
|
||||
risk_cols = ["sample_id", "market_risk_prob", "long_position_risk_prob", "short_position_risk_prob"]
|
||||
actual_cols = ["sample_id", "actual_long_expected_net_edge_bps", "actual_short_expected_net_edge_bps", "long_entry_target", "short_entry_target"]
|
||||
actual_cols = ["sample_id", "actual_long_plan_edge_bps", "actual_short_plan_edge_bps", "long_entry_target", "short_entry_target"]
|
||||
missing_actual_cols = sorted(set(actual_cols) - set(entry_dataset.columns))
|
||||
if missing_actual_cols:
|
||||
raise ValueError(f"entry_train is missing actual plan edge columns for PM: {missing_actual_cols}")
|
||||
frame = (
|
||||
direction[["sample_id", "symbol", "event_time", "split_id", "long_prob", "short_prob", "neutral_prob"]]
|
||||
.merge(entry[entry_cols], on="sample_id", how="inner")
|
||||
@@ -257,7 +261,7 @@ def _pm_frame(root, split_id: str) -> pd.DataFrame:
|
||||
raise ValueError(f"PM frame is empty for {split_id}; check model predictions and entry dataset")
|
||||
frame["model_pred_long_expected_net_edge_bps"] = frame["pred_long_expected_net_edge_bps"]
|
||||
frame["model_pred_short_expected_net_edge_bps"] = frame["pred_short_expected_net_edge_bps"]
|
||||
edge_mode = "MODEL_EXPECTED_NET_EDGE"
|
||||
edge_mode = "MODEL_ACTUAL_PLAN_EDGE"
|
||||
if price_plan.get("entryTargetMethod") not in {"OPPORTUNITY_MFE_V1", "OPPORTUNITY_QUALITY_V1"}:
|
||||
frame["pred_long_expected_net_edge_bps"] = _probability_implied_edge(frame["long_entry_prob"], price_plan)
|
||||
frame["pred_short_expected_net_edge_bps"] = _probability_implied_edge(frame["short_entry_prob"], price_plan)
|
||||
@@ -278,7 +282,7 @@ def _probability_implied_edge(entry_prob: pd.Series, price_plan: dict[str, Any])
|
||||
price_plan.get("costBps", DEFAULT_BACKTEST_PRICE_PLAN["costBps"])
|
||||
)
|
||||
probability = pd.to_numeric(entry_prob, errors="coerce").fillna(0.0).clip(lower=0.0, upper=1.0)
|
||||
# Entry 的概率头比收益回归头稳定。这里用固定止盈止损的盈亏比把概率换成期望收益,
|
||||
# Entry 的概率头比收益回归头稳定。这里用当前价格计划的盈亏比把概率换成期望收益,
|
||||
# 让低命中、高赔率计划也能被 PM 正常搜索;真实结果仍由标签里的实际路径收益评估。
|
||||
return probability * target_net_bps + (1.0 - probability) * stop_net_bps
|
||||
|
||||
@@ -333,9 +337,9 @@ def _threshold_candidates() -> list[dict[str, float]]:
|
||||
values = itertools.product(
|
||||
[0.50, 0.60, 0.70, 1.01],
|
||||
[0.50, 0.60, 0.70, 1.01],
|
||||
[0.03, 0.50, 0.70, 0.85],
|
||||
[0.45, 0.65, 0.85],
|
||||
[0.0, 8.0, 15.0, 25.0],
|
||||
[0.30, 0.50, 0.70, 0.85],
|
||||
[0.45, 0.65],
|
||||
[3.0, 8.0, 15.0, 25.0],
|
||||
[0.02, 0.06, 0.10],
|
||||
)
|
||||
return [
|
||||
@@ -394,8 +398,10 @@ def _simulate_open_trades(
|
||||
trades["entry_prob"] = np.where(is_long, trades["long_entry_prob"], trades["short_entry_prob"])
|
||||
trades["predicted_edge_bps"] = np.where(is_long, trades["pred_long_expected_net_edge_bps"], trades["pred_short_expected_net_edge_bps"])
|
||||
trades["actual_edge_bps"] = np.where(is_long, trades["long_trade_net_edge_bps"], trades["short_trade_net_edge_bps"])
|
||||
trades["label_max_edge_bps"] = np.where(is_long, trades["actual_long_expected_net_edge_bps"], trades["actual_short_expected_net_edge_bps"])
|
||||
trades["label_actual_plan_edge_bps"] = np.where(is_long, trades["actual_long_plan_edge_bps"], trades["actual_short_plan_edge_bps"])
|
||||
trades["entry_target"] = np.where(is_long, trades["long_entry_target"], trades["short_entry_target"])
|
||||
trades["target_hit"] = np.where(is_long, trades["long_target_hit"], trades["short_target_hit"]).astype(int)
|
||||
trades["stop_hit"] = np.where(is_long, trades["long_stop_hit"], trades["short_stop_hit"]).astype(int)
|
||||
effective_pm_config = pm_config or _pm_config_from_thresholds(thresholds)
|
||||
effective_price_plan = price_plan or DEFAULT_BACKTEST_PRICE_PLAN
|
||||
trades["time_to_exit_ms"] = _time_to_exit_ms(trades, is_long, effective_price_plan)
|
||||
@@ -417,9 +423,11 @@ def _simulate_open_trades(
|
||||
"entry_prob",
|
||||
"market_risk_prob",
|
||||
"predicted_edge_bps",
|
||||
"label_max_edge_bps",
|
||||
"label_actual_plan_edge_bps",
|
||||
"actual_edge_bps",
|
||||
"entry_target",
|
||||
"target_hit",
|
||||
"stop_hit",
|
||||
"time_to_exit_ms",
|
||||
"planned_ratio",
|
||||
"weighted_edge_bps",
|
||||
@@ -440,9 +448,11 @@ def _empty_trade_frame() -> pd.DataFrame:
|
||||
"entry_prob",
|
||||
"market_risk_prob",
|
||||
"predicted_edge_bps",
|
||||
"label_max_edge_bps",
|
||||
"label_actual_plan_edge_bps",
|
||||
"actual_edge_bps",
|
||||
"entry_target",
|
||||
"target_hit",
|
||||
"stop_hit",
|
||||
"time_to_exit_ms",
|
||||
"planned_ratio",
|
||||
"weighted_edge_bps",
|
||||
@@ -687,7 +697,7 @@ def _write_pm_report(path, candidates: pd.DataFrame, best_thresholds: dict[str,
|
||||
lines = [
|
||||
"# PM Threshold Report",
|
||||
"",
|
||||
"本次不是固定写死阈值,而是在调参集上试一组可复现的阈值。PM 回测使用固定止盈止损后的真实净收益,并且开仓后按持仓结束时间加冷却时间阻止重叠开仓。",
|
||||
"本次不是固定写死阈值,而是在调参集上试一组可复现的阈值。PM 回测使用当前价格计划的真实净收益,并且开仓后按持仓结束时间加冷却时间阻止重叠开仓。",
|
||||
"",
|
||||
"## Best Thresholds",
|
||||
"",
|
||||
@@ -713,7 +723,7 @@ def _write_backtest_report(path, result: dict[str, Any]) -> None:
|
||||
lines = [
|
||||
"# Integrated Backtest Report",
|
||||
"",
|
||||
"这里用验证集模型输出和 PM 阈值生成交易明细,统计净收益、胜率、回撤和分段表现。收益按固定止盈止损计划的真实净收益计算,不使用窗口内最大可拿收益。",
|
||||
"这里用验证集模型输出和 PM 阈值生成交易明细,统计净收益、胜率、回撤和分段表现。收益按当前价格计划的真实净收益计算,不使用窗口内最大可拿收益。",
|
||||
"",
|
||||
"```json",
|
||||
str(result).replace("'", '"'),
|
||||
|
||||
@@ -339,7 +339,7 @@ def _markdown_report(payload: dict[str, Any], summary: pd.DataFrame) -> str:
|
||||
"",
|
||||
_markdown_table(top),
|
||||
"",
|
||||
"说明:positive_label_rate 和 avg_price_plan_net_edge_bps 按固定止盈止损计划统计;avg_expected_net_edge_bps 只是辅助观察未来最大可拿空间,不能单独决定价格计划。这里选的是下一轮实验用的价格计划,不是上线结论。真正能不能上线仍然看模型训练、PM 搜索、validation_locked 和 latest_stress 回测。",
|
||||
"说明:positive_label_rate 和 avg_price_plan_net_edge_bps 按当前价格计划统计;avg_expected_net_edge_bps 只是辅助观察未来最大可拿空间,不能单独决定价格计划。这里选的是下一轮实验用的价格计划,不是上线结论。真正能不能上线仍然看模型训练、PM 搜索、validation_locked 和 latest_stress 回测。",
|
||||
"",
|
||||
]
|
||||
return "\n".join(lines)
|
||||
|
||||
@@ -39,8 +39,8 @@ TARGETS = {
|
||||
"heads": [
|
||||
("long_entry_prob", "binary", "long_entry_target", ["long_entry_prob"], ["longEntryProb"]),
|
||||
("short_entry_prob", "binary", "short_entry_target", ["short_entry_prob"], ["shortEntryProb"]),
|
||||
("long_expected_net_edge_bps", "regression", "long_expected_net_edge_bps", ["long_expected_net_edge_bps"], [None]),
|
||||
("short_expected_net_edge_bps", "regression", "short_expected_net_edge_bps", ["short_expected_net_edge_bps"], [None]),
|
||||
("long_expected_net_edge_bps", "regression", "long_actual_plan_net_edge_bps", ["long_expected_net_edge_bps"], [None]),
|
||||
("short_expected_net_edge_bps", "regression", "short_actual_plan_net_edge_bps", ["short_expected_net_edge_bps"], [None]),
|
||||
],
|
||||
},
|
||||
"CONTINUE": {
|
||||
@@ -89,6 +89,8 @@ def train_small_models(args: Any) -> None:
|
||||
model_manifest: dict[str, Any] = {}
|
||||
for model_name, spec in TARGETS.items():
|
||||
dataset = read_parquet(root / "dataset" / spec["dataset"])
|
||||
if model_name == "ENTRY" and _conditional_entry_source(args) == "direction_label":
|
||||
dataset = _attach_direction_fit_labels(root, dataset)
|
||||
if args.max_rows and len(dataset) > args.max_rows:
|
||||
dataset = dataset.sort_values("event_time").tail(args.max_rows).copy()
|
||||
if dataset.empty:
|
||||
@@ -116,14 +118,17 @@ def train_small_models(args: Any) -> None:
|
||||
heads: list[LinearHead] = []
|
||||
head_results: list[HeadResult] = []
|
||||
for item in spec["heads"]:
|
||||
head_results.extend(_fit_head(item, x_train_scaled, x_tune_scaled, train, tune, scaler))
|
||||
head_name = item[0]
|
||||
head_train_mask, head_filter = _head_train_mask(model_name, head_name, train, args)
|
||||
head_results.extend(_fit_head(item, x_train_scaled, x_tune_scaled, train, tune, scaler, head_train_mask, head_filter, args))
|
||||
for result in head_results:
|
||||
logging.info(
|
||||
"trader.training.model_head_trained runId=%s model=%s head=%s kind=%s metrics=%s",
|
||||
"trader.training.model_head_trained runId=%s model=%s head=%s kind=%s targetSource=%s metrics=%s",
|
||||
args.run_id,
|
||||
model_name,
|
||||
result.field,
|
||||
result.kind,
|
||||
result.metrics.get("target_source"),
|
||||
result.metrics,
|
||||
)
|
||||
for result in head_results:
|
||||
@@ -183,20 +188,103 @@ def train_small_models(args: Any) -> None:
|
||||
write_json(root / "model" / "model_train_manifest.json", model_manifest)
|
||||
|
||||
|
||||
def _fit_head(item, x_train, x_tune, train: pd.DataFrame, tune: pd.DataFrame, scaler: StandardScaler) -> list[HeadResult]:
|
||||
def _conditional_entry_enabled(args: Any) -> bool:
    """Tell whether any conditional Entry row filter is requested.

    Args:
        args: Parsed CLI namespace (or any object) forwarded to
            ``_conditional_entry_source``.

    Returns:
        ``True`` when the resolved source is anything other than ``"none"``.
    """
    resolved_source = _conditional_entry_source(args)
    return resolved_source != "none"
|
||||
|
||||
|
||||
def _conditional_entry_source(args: Any) -> str:
|
||||
source = str(getattr(args, "conditional_entry_source", "none") or "none").strip().lower()
|
||||
if bool(getattr(args, "conditional_entry_direction_labels", False)):
|
||||
source = "direction_label"
|
||||
allowed = {"none", "direction_label", "side_opportunity"}
|
||||
if source not in allowed:
|
||||
raise ValueError(f"unsupported conditional Entry source: {source}")
|
||||
return source
|
||||
|
||||
|
||||
def _attach_direction_fit_labels(root: Path, entry_dataset: pd.DataFrame) -> pd.DataFrame:
    """Attach Direction labels to the Entry dataset for conditional Entry training.

    Reads ``dataset/direction_train.parquet`` under ``root`` and joins its
    ``long_target`` / ``short_target`` columns onto ``entry_dataset`` by
    ``sample_id``.

    Args:
        root: Directory that contains ``dataset/direction_train.parquet``.
        entry_dataset: Entry rows; joined on their ``sample_id`` column.

    Returns:
        ``entry_dataset`` with ``long_target`` and ``short_target`` appended,
        always in that order.

    Raises:
        ValueError: if ``direction_train`` lacks a required column, or if the
            inner join returns a different number of rows than ``entry_dataset``.
        pandas.errors.MergeError: if ``sample_id`` is not unique on either side
            (enforced by ``validate="one_to_one"``).
    """
    direction = read_parquet(root / "dataset" / "direction_train.parquet")
    # Ordered tuple rather than a set: selecting columns by iterating a set makes
    # the order of the attached columns depend on string hash randomization, so
    # the merged frame's layout could differ from one process to the next.
    required = ("sample_id", "long_target", "short_target")
    missing = sorted(set(required) - set(direction.columns))
    if missing:
        raise ValueError(f"direction_train is missing columns required by conditional Entry training: {missing}")
    merged = entry_dataset.merge(direction[list(required)], on="sample_id", how="inner", validate="one_to_one")
    # An inner join silently drops Entry rows that have no Direction label; fail loudly instead.
    if len(merged) != len(entry_dataset):
        raise ValueError(
            f"conditional Entry training lost rows while attaching direction labels: before={len(entry_dataset)} after={len(merged)}"
        )
    logging.info(
        "trader.training.entry_direction_labels_attached rowCount=%s longDirectionRows=%s shortDirectionRows=%s",
        len(merged),
        int(pd.to_numeric(merged["long_target"], errors="coerce").fillna(0).astype(int).sum()),
        int(pd.to_numeric(merged["short_target"], errors="coerce").fillna(0).astype(int).sum()),
    )
    return merged
|
||||
|
||||
|
||||
def _head_train_mask(model_name: str, head_name: str, train: pd.DataFrame, args: Any) -> tuple[np.ndarray, str]:
    """Build the boolean fit-row mask for one model head.

    Only ENTRY heads whose name starts with ``long_`` or ``short_`` are ever
    filtered; every other head is fitted on all rows.

    Args:
        model_name: Model the head belongs to; filtering applies to ``"ENTRY"`` only.
        head_name: Head name; its ``long_`` / ``short_`` prefix selects the side.
        train: Fit rows. Must contain ``<side>_target`` for the
            ``direction_label`` source, or ``<side>_max_achievable_net_edge_bps``
            for the ``side_opportunity`` source.
        args: Namespace providing the conditional Entry source and, optionally,
            ``conditional_entry_opportunity_bps`` (defaults to 40.0 when absent
            or ``None``).

    Returns:
        ``(mask, filter_name)`` where ``mask`` has one boolean per row of
        ``train`` and ``filter_name`` labels the filter that produced it.

    Raises:
        ValueError: if the column required by the selected source is missing.
    """
    source = _conditional_entry_source(args)
    if model_name != "ENTRY" or source == "none":
        return np.ones(len(train), dtype=bool), "ALL_FIT_ROWS"
    if head_name.startswith("long_"):
        side = "LONG"
        direction_label_column = "long_target"
        opportunity_column = "long_max_achievable_net_edge_bps"
    elif head_name.startswith("short_"):
        side = "SHORT"
        direction_label_column = "short_target"
        opportunity_column = "short_max_achievable_net_edge_bps"
    else:
        # Heads without a side prefix cannot be tied to one direction.
        return np.ones(len(train), dtype=bool), "ALL_FIT_ROWS"
    if source == "direction_label":
        if direction_label_column not in train.columns:
            raise ValueError(f"conditional Entry training requires {direction_label_column} for head {head_name}")
        # Non-numeric or missing labels are coerced to 0 and therefore excluded.
        mask = pd.to_numeric(train[direction_label_column], errors="coerce").fillna(0).astype(int).eq(1).to_numpy()
        return mask, f"DIRECTION_LABEL_{side}_FIT_ROWS"
    # Fall back to the default only when the option is truly unset. The former
    # `value or 40.0` idiom also replaced an explicit 0.0 threshold with 40 bps
    # and reported the wrong threshold in the filter name.
    raw_threshold = getattr(args, "conditional_entry_opportunity_bps", None)
    threshold = 40.0 if raw_threshold is None else float(raw_threshold)
    if opportunity_column not in train.columns:
        raise ValueError(f"side opportunity Entry training requires {opportunity_column} for head {head_name}")
    # Rows whose opportunity value is missing or non-numeric are excluded.
    mask = pd.to_numeric(train[opportunity_column], errors="coerce").ge(threshold).fillna(False).to_numpy()
    filter_name = f"SIDE_OPPORTUNITY_{side}_GE_{threshold:g}_BPS_FIT_ROWS"
    return mask, filter_name
|
||||
|
||||
|
||||
def _fit_head(
|
||||
item,
|
||||
x_train,
|
||||
x_tune,
|
||||
train: pd.DataFrame,
|
||||
tune: pd.DataFrame,
|
||||
scaler: StandardScaler,
|
||||
head_train_mask: np.ndarray | None = None,
|
||||
head_filter: str = "ALL_FIT_ROWS",
|
||||
args: Any | None = None,
|
||||
) -> list[HeadResult]:
|
||||
name, kind, target, fields, target_names = item
|
||||
if head_train_mask is None:
|
||||
head_train_mask = np.ones(len(train), dtype=bool)
|
||||
head_train_mask = np.asarray(head_train_mask, dtype=bool)
|
||||
if len(head_train_mask) != len(train):
|
||||
raise ValueError(f"head train mask length mismatch for {name}: mask={len(head_train_mask)} train={len(train)}")
|
||||
min_fit_rows = int(getattr(args, "conditional_entry_min_fit_rows", 1000) or 1000) if head_filter != "ALL_FIT_ROWS" else 1
|
||||
head_fit_rows = int(head_train_mask.sum())
|
||||
if head_fit_rows < min_fit_rows:
|
||||
raise ValueError(f"{name} has too few fit rows after {head_filter}: {head_fit_rows} < {min_fit_rows}")
|
||||
head_train = train.loc[head_train_mask].copy()
|
||||
x_head_train = x_train[head_train_mask]
|
||||
if kind == "multiclass":
|
||||
y_train = train[target].to_numpy().argmax(axis=1)
|
||||
y_train = head_train[target].to_numpy().argmax(axis=1)
|
||||
y_val = tune[target].to_numpy().argmax(axis=1)
|
||||
model = LogisticRegression(max_iter=500)
|
||||
model.fit(x_train, y_train)
|
||||
model.fit(x_head_train, y_train)
|
||||
proba = model.predict_proba(x_tune)
|
||||
weight, bias = _fold_scaler(model.coef_.T, model.intercept_, scaler)
|
||||
train_prior = train[target].to_numpy().mean(axis=0)
|
||||
train_prior = head_train[target].to_numpy().mean(axis=0)
|
||||
metrics = _multiclass_metrics(y_train, y_val, proba, train_prior)
|
||||
_add_fit_filter_metrics(metrics, head_filter, head_fit_rows, len(train))
|
||||
return [HeadResult("direction", target_names[0], "softmax", weight, bias, metrics, proba, y_val)]
|
||||
if kind == "binary":
|
||||
y_train = pd.to_numeric(train[target], errors="coerce").fillna(0).astype(int).to_numpy()
|
||||
y_train = pd.to_numeric(head_train[target], errors="coerce").fillna(0).astype(int).to_numpy()
|
||||
y_val = pd.to_numeric(tune[target], errors="coerce").fillna(0).astype(int).to_numpy()
|
||||
if len(np.unique(y_train)) < 2:
|
||||
prevalence = float(np.clip(y_train.mean(), 1e-6, 1 - 1e-6))
|
||||
@@ -205,7 +293,7 @@ def _fit_head(item, x_train, x_tune, train: pd.DataFrame, tune: pd.DataFrame, sc
|
||||
proba = np.full(len(y_val), prevalence, dtype=np.float32)
|
||||
else:
|
||||
model = LogisticRegression(max_iter=500)
|
||||
model.fit(x_train, y_train)
|
||||
model.fit(x_head_train, y_train)
|
||||
coef = model.coef_
|
||||
intercept = model.intercept_
|
||||
proba = model.predict_proba(x_tune)[:, 1]
|
||||
@@ -213,18 +301,29 @@ def _fit_head(item, x_train, x_tune, train: pd.DataFrame, tune: pd.DataFrame, sc
|
||||
metrics = _binary_metrics(y_train, y_val, proba)
|
||||
if len(np.unique(y_val)) == 2:
|
||||
metrics["auc"] = float(roc_auc_score(y_val, proba))
|
||||
_add_fit_filter_metrics(metrics, head_filter, head_fit_rows, len(train))
|
||||
return [HeadResult(fields[0], target_names[0], "sigmoid", weight, bias, metrics, proba.reshape(-1, 1), y_val)]
|
||||
if kind == "regression":
|
||||
y_train = pd.to_numeric(train[target], errors="coerce").fillna(0.0).to_numpy()
|
||||
y_train = pd.to_numeric(head_train[target], errors="coerce").fillna(0.0).to_numpy()
|
||||
y_val = pd.to_numeric(tune[target], errors="coerce").fillna(0.0).to_numpy()
|
||||
model = HuberRegressor(alpha=0.001, epsilon=1.35, max_iter=500)
|
||||
model.fit(x_train, y_train)
|
||||
model.fit(x_head_train, y_train)
|
||||
pred = model.predict(x_tune)
|
||||
weight, bias = _fold_scaler(model.coef_.reshape(1, -1).T, np.array([model.intercept_]), scaler)
|
||||
return [HeadResult(fields[0], None, "identity", weight, bias, _regression_metrics(y_train, y_val, pred), pred.reshape(-1, 1), y_val)]
|
||||
metrics = _regression_metrics(y_train, y_val, pred)
|
||||
metrics["target_source"] = target
|
||||
_add_fit_filter_metrics(metrics, head_filter, head_fit_rows, len(train))
|
||||
return [HeadResult(fields[0], None, "identity", weight, bias, metrics, pred.reshape(-1, 1), y_val)]
|
||||
raise ValueError(f"unsupported head kind: {kind}")
|
||||
|
||||
|
||||
def _add_fit_filter_metrics(metrics: dict[str, Any], fit_filter: str, fit_rows: int, total_fit_rows: int) -> None:
|
||||
metrics["fit_filter"] = fit_filter
|
||||
metrics["fit_rows"] = int(fit_rows)
|
||||
metrics["fit_total_rows"] = int(total_fit_rows)
|
||||
metrics["fit_row_ratio"] = float(fit_rows / total_fit_rows) if total_fit_rows else 0.0
|
||||
|
||||
|
||||
def _fold_scaler(weight_scaled: np.ndarray, bias_scaled: np.ndarray, scaler: StandardScaler) -> tuple[np.ndarray, np.ndarray]:
|
||||
scale = np.where(scaler.scale_ == 0, 1.0, scaler.scale_)
|
||||
weight = weight_scaled / scale.reshape(-1, 1)
|
||||
|
||||
Reference in New Issue
Block a user