@@ -8,7 +8,7 @@ import numpy as np
import pandas as pd
from sklearn . ensemble import HistGradientBoostingClassifier , HistGradientBoostingRegressor
from trader_training . io_utils import read_parquet , run_root , write_json , write_text
from trader_training . io_utils import read_parquet , run_root , write_json , write_parquet , write_text
from trader_training . pm import _pm_config_from_thresholds , _pm_frame , _price_plan_context , _simulate_open_trades , _trade_metrics
from trader_training . schemas import FEATURE_ORDER , FIT_SPLIT , LATEST_STRESS_SPLIT , TUNE_SPLIT , VALIDATION_LOCKED_SPLIT
@@ -20,8 +20,11 @@ def probe_nonlinear_pm(args: Any) -> None:
root = run_root ( args )
direction_dataset = read_parquet ( root / " dataset " / " direction_train.parquet " )
entry_dataset = read_parquet ( root / " dataset " / " entry_train.parquet " )
direction_model = _fit_direction_model ( direction_dataset )
entry_models = _fit_entry_models ( direction_dataset , entry_dataset )
probe_mode = _probe_mode ( args )
entry_train_filter = _entry_train_filter ( args )
entry_opportunity_bps = float ( getattr ( args , " entry_opportunity_bps " , 40.0 ) or 40.0 )
direction_model = _fit_direction_model ( direction_dataset ) if probe_mode == " direction_entry_tree " else None
entry_models = _fit_entry_models ( direction_dataset , entry_dataset , entry_train_filter , entry_opportunity_bps )
frames = {
split_id : _prediction_frame ( root , split_id , direction_dataset , entry_dataset , direction_model , entry_models )
for split_id in EVAL_SPLITS
@@ -47,28 +50,45 @@ def probe_nonlinear_pm(args: Any) -> None:
raise ValueError ( " nonlinear PM probe did not evaluate any threshold candidate " )
split_metrics : dict [ str , Any ] = { }
split_trade_frames : dict [ str , pd . DataFrame ] = { }
for split_id , frame in frames . items ( ) :
trades = _simulate_open_trades ( frame , best_thresholds , _pm_config_from_thresholds ( best_thresholds ) , price_plan )
trades = trades . copy ( )
trades [ " eval_split " ] = split_id
split_trade_frames [ split_id ] = trades
split_metrics [ split_id ] = _trade_metrics ( trades )
side_metrics = _side_metrics ( split_trade_frames )
tune_frame = pd . DataFrame ( tune_rows ) . sort_values ( " score " , ascending = False ) . reset_index ( drop = True )
result = {
" run_id " : args . run_id ,
" purpose " : " diagnostic_only_not_exported " ,
" model_family " : " sklearn_hist_gradient_boosting " ,
" probe_mode " : probe_mode ,
" entry_train_filter " : entry_train_filter ,
" entry_opportunity_bps " : entry_opportunity_bps ,
" candidate_count " : len ( candidates ) ,
" candidate_summary " : _candidate_summary ( tune_frame ) ,
" best_thresholds " : best_thresholds ,
" best_tune_metrics " : best_tune_metrics ,
" split_metrics " : split_metrics ,
" side_metrics " : side_metrics ,
" verdict " : _verdict ( split_metrics ) ,
}
out_dir = root / " diagnostics "
write_json ( out_dir / " nonlinear_pm_probe_result.json " , _jsonable ( result ) )
write_text ( out_dir / " nonlinear_pm_probe_candidates.csv " , tune_frame . head ( 200 ) . to_csv ( index = False ) )
write_text ( out_dir / " nonlinear_pm_probe_report.md " , _markdown_report ( result , tune_frame . head ( 20 ) ) )
output_stem = _output_stem ( probe_mode )
trade_parts = [ trades for trades in split_trade_frames . values ( ) if not trades . empty ]
best_trade_frame = pd . concat ( trade_parts , ignore_index = True ) if trade_parts else pd . DataFrame ( )
write_json ( out_dir / f " { output_stem } _result.json " , _jsonable ( result ) )
write_text ( out_dir / f " { output_stem } _candidates.csv " , tune_frame . head ( 200 ) . to_csv ( index = False ) )
write_parquet ( out_dir / f " { output_stem } _best_trades.parquet " , best_trade_frame )
write_text ( out_dir / f " { output_stem } _side_metrics.csv " , _side_metrics_frame ( side_metrics ) . to_csv ( index = False ) )
write_text ( out_dir / f " { output_stem } _report.md " , _markdown_report ( result , tune_frame . head ( 20 ) ) )
logging . info (
" trader.training.nonlinear_pm_probe_written runId= %s verdict= %s tuneTrades= %s validationTrades= %s stressTrades= %s " ,
" trader.training.nonlinear_pm_probe_written runId= %s probeMode= %s entryTrainFilter= %s verdict= %s tuneTrades= %s validationTrades= %s stressTrades= %s " ,
args . run_id ,
probe_mode ,
entry_train_filter ,
result [ " verdict " ] [ " status " ] ,
split_metrics [ TUNE_SPLIT ] [ " trade_count " ] ,
split_metrics [ VALIDATION_LOCKED_SPLIT ] [ " trade_count " ] ,
@@ -76,6 +96,26 @@ def probe_nonlinear_pm(args: Any) -> None:
)
def _probe_mode ( args : Any ) - > str :
mode = str ( getattr ( args , " probe_mode " , " direction_entry_tree " ) or " direction_entry_tree " ) . strip ( ) . lower ( )
allowed = { " direction_entry_tree " , " entry_tree_only " }
if mode not in allowed :
raise ValueError ( f " unsupported nonlinear PM probe mode: { mode } " )
return mode
def _entry_train_filter ( args : Any ) - > str :
value = str ( getattr ( args , " entry_train_filter " , " direction_label " ) or " direction_label " ) . strip ( ) . lower ( )
allowed = { " direction_label " , " side_opportunity " }
if value not in allowed :
raise ValueError ( f " unsupported nonlinear Entry train filter: { value } " )
return value
def _output_stem ( probe_mode : str ) - > str :
return " nonlinear_pm_probe " if probe_mode == " direction_entry_tree " else f " nonlinear_pm_probe_ { probe_mode } "
def _fit_direction_model ( dataset : pd . DataFrame ) - > HistGradientBoostingClassifier :
train = dataset [ dataset [ " split_id " ] . eq ( FIT_SPLIT ) ] . copy ( )
y = train [ [ " long_target " , " short_target " , " neutral_target " ] ] . to_numpy ( ) . argmax ( axis = 1 )
@@ -91,22 +131,56 @@ def _fit_direction_model(dataset: pd.DataFrame) -> HistGradientBoostingClassifie
return model
def _fit_entry_models(
    direction_dataset: pd.DataFrame,
    entry_dataset: pd.DataFrame,
    entry_train_filter: str = "direction_label",
    opportunity_bps: float = 40.0,
) -> dict[str, Any]:
    """Fit the four per-side Entry heads (probability and expected edge).

    The training rows for each side are chosen by ``_entry_side_fit_frame``;
    each side's frame feeds both its binary head and its regression head.

    Args:
        direction_dataset: Direction dataset, forwarded to the row filter.
        entry_dataset: Entry dataset, forwarded to the row filter.
        entry_train_filter: Row-filter name understood by
            ``_entry_side_fit_frame``. Defaults to ``"direction_label"`` so
            two-argument callers keep working.
        opportunity_bps: Threshold forwarded to the row filter.

    Returns:
        Mapping from prediction column name to the fitted model for it.
    """
    long_train = _entry_side_fit_frame(direction_dataset, entry_dataset, "LONG", entry_train_filter, opportunity_bps)
    short_train = _entry_side_fit_frame(direction_dataset, entry_dataset, "SHORT", entry_train_filter, opportunity_bps)
    return {
        "long_entry_prob": _fit_binary_head(long_train, "long_entry_target", seed=43),
        "short_entry_prob": _fit_binary_head(short_train, "short_entry_target", seed=47),
        "long_expected_net_edge_bps": _fit_regression_head(long_train, "long_actual_plan_net_edge_bps", seed=53),
        "short_expected_net_edge_bps": _fit_regression_head(short_train, "short_actual_plan_net_edge_bps", seed=59),
    }
def _entry_side_fit_frame(
    direction_dataset: pd.DataFrame,
    entry_dataset: pd.DataFrame,
    side: str,
    entry_train_filter: str,
    opportunity_bps: float,
) -> pd.DataFrame:
    """Select the fit-split entry rows used to train one side's Entry heads.

    Args:
        direction_dataset: Frame providing ``sample_id`` and ``<side>_target``;
            only read by the ``"direction_label"`` filter.
        entry_dataset: Frame with a ``split_id`` column; only rows whose
            ``split_id`` equals ``FIT_SPLIT`` are considered.
        side: Side name. It is lower-cased to build column names and used
            verbatim in log and error text.
        entry_train_filter: ``"direction_label"`` keeps rows whose direction
            label equals 1; ``"side_opportunity"`` keeps rows whose
            ``<side>_max_achievable_net_edge_bps`` is at least
            ``opportunity_bps``.
        opportunity_bps: Threshold used by the ``"side_opportunity"`` filter.

    Returns:
        A copy of the selected rows.

    Raises:
        ValueError: If a required column is missing, if the direction-label
            merge leaves no rows, or if ``entry_train_filter`` is unknown.
    """
    side_lower = side.lower()
    train = entry_dataset[entry_dataset["split_id"].eq(FIT_SPLIT)].copy()

    if entry_train_filter == "direction_label":
        label_column = f"{side_lower}_target"
        # Ordered list rather than a set so the merged column order is deterministic.
        required = ["sample_id", label_column]
        missing = sorted(set(required) - set(direction_dataset.columns))
        if missing:
            raise ValueError(f"direction dataset missing columns for nonlinear Entry filter: {missing}")
        train = train.merge(direction_dataset[required], on="sample_id", how="inner", validate="one_to_one")
        if train.empty:
            raise ValueError(f"nonlinear Entry {side} direction-label filter produced no rows")
        # Unparseable or missing labels are treated as 0, i.e. not selected.
        mask = pd.to_numeric(train[label_column], errors="coerce").fillna(0).astype(int).eq(1)
        filter_name = f"DIRECTION_LABEL_{side}_FIT_ROWS"
    elif entry_train_filter == "side_opportunity":
        opportunity_column = f"{side_lower}_max_achievable_net_edge_bps"
        if opportunity_column not in train.columns:
            raise ValueError(f"entry dataset missing {opportunity_column} for nonlinear Entry side-opportunity filter")
        mask = pd.to_numeric(train[opportunity_column], errors="coerce").ge(opportunity_bps).fillna(False)
        filter_name = f"SIDE_OPPORTUNITY_{side}_GE_{opportunity_bps:g}_BPS_FIT_ROWS"
    else:
        raise ValueError(f"unsupported nonlinear Entry train filter: {entry_train_filter}")

    out = train.loc[mask].copy()
    logging.info(
        "trader.training.nonlinear_entry_fit_frame side=%s filter=%s rows=%s totalFitRows=%s",
        side,
        filter_name,
        len(out),
        len(train),
    )
    return out
def _fit_binary_head ( train : pd . DataFrame , target : str , seed : int ) - > HistGradientBoostingClassifier :
if len ( train ) < 1000 :
raise ValueError ( f " not enough rows to train nonlinear Entry head { target } : { len ( train ) } " )
@@ -145,44 +219,45 @@ def _prediction_frame(
split_id : str ,
direction_dataset : pd . DataFrame ,
entry_dataset : pd . DataFrame ,
direction_model : HistGradientBoostingClassifier ,
direction_model : HistGradientBoostingClassifier | None ,
entry_models : dict [ str , Any ] ,
) - > pd . DataFrame :
frame = _pm_frame ( root , split_id ) . copy ( )
direction_split = direction_dataset [ direction_dataset [ " split_id " ] . eq ( split_id ) ] . copy ( )
entry_split = entry_dataset [ entry_dataset [ " split_id " ] . eq ( split_id ) ] . copy ( )
direction_proba = direction_model . predict_proba ( _x ( direction_split ) )
direction_pred = direction_split [ [ " sample_id " ] ] . copy ( )
direction_pred [ " long_prob " ] = direction_proba [ : , 0 ]
direction_pred [ " short_prob " ] = direction_proba [ : , 1 ]
direction_pred [ " neutral_prob " ] = direction_proba [ : , 2 ]
entry_pred = entry_split [ [ " sample_id " ] ] . copy ( )
entry_pred [ " long_entry_prob " ] = entry_models [ " long_entry_prob " ] . predict_proba ( _x ( entry_split ) ) [ : , 1 ]
entry_pred [ " short_entry_prob " ] = entry_models [ " short_entry_prob " ] . predict_proba ( _x ( entry_split ) ) [ : , 1 ]
entry_pred [ " pred_long_expected_net_edge_bps " ] = entry_models [ " long_expected_net_edge_bps " ] . predict ( _x ( entry_split ) )
entry_pred [ " pred_short_expected_net_edge_bps " ] = entry_models [ " short_expected_net_edge_bps " ] . predict ( _x ( entry_split ) )
replacements = direction_pred . merge ( entry_pred , on = " sample_id " , how = " inner " , validate = " one_to_one " )
out = frame . drop (
columns = [
" long_prob " ,
" short_prob " ,
" neutral_prob " ,
" long_entry_prob " ,
" short_entry_prob " ,
" pred_long_expected_net_edge_bps " ,
" pred_short_expected_net_edge_bps " ,
] ,
errors = " ignore " ,
) . merge ( replacements , on = " sample_id " , how = " inner " , validate = " one_to_one " )
replacements = entry_pred
drop_columns = [
" long_entry_prob " ,
" short_entry_prob " ,
" pred_long_expected_net_edge_bps " ,
" pred_short_expected_net_edge_bps " ,
]
if direction_model is not None :
direction_split = direction_dataset [ direction_dataset [ " split_id " ] . eq ( split_id ) ] . copy ( )
direction_proba = direction_model . predict_proba ( _x ( direction_split ) )
direction_pred = direction_split [ [ " sample_id " ] ] . copy ( )
direction_pred [ " long_prob " ] = direction_proba [ : , 0 ]
direction_pred [ " short_prob " ] = direction_proba [ : , 1 ]
direction_pred [ " neutral_prob " ] = direction_proba [ : , 2 ]
replacements = direction_pred . merge ( entry_pred , on = " sample_id " , how = " inner " , validate = " one_to_one " )
drop_columns . extend ( [ " long_prob " , " short_prob " , " neutral_prob " ] )
out = frame . drop ( columns = drop_columns , errors = " ignore " ) . merge ( replacements , on = " sample_id " , how = " inner " , validate = " one_to_one " )
if len ( out ) != len ( frame ) :
raise ValueError ( f " nonlinear prediction frame lost rows for { split_id } : before= { len ( frame ) } after= { len ( out ) } " )
return out
def _expanded_threshold_candidates ( ) - > list [ dict [ str , float ] ] :
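# Long and short reliability can differ completely across market regimes, so the two sides are searched separately;
# 1.01 means that side never opens, which lets us check whether long-only or short-only is more stable.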
values = itertools . product (
[ 0.20 , 0.25 , 0.30 , 0.35 , 0.40 , 0.45 , 0.50 , 0.60 ] ,
[ 0.20 , 0.30 , 0.40 , 0.50 , 0.60 , 1.01 ] ,
[ 0.20 , 0.30 , 0.40 , 0.50 , 0.60 , 1.01 ] ,
[ 0.05 , 0.10 , 0.20 , 0.30 , 0.40 , 0.50 ] ,
[ 0.45 , 0.65 , 0.85 , 1.00 ] ,
[ - 5.0 , 0.0 , 3.0 , 5.0 , 8.0 ] ,
@@ -190,14 +265,14 @@ def _expanded_threshold_candidates() -> list[dict[str, float]]:
)
return [
{
" long_open_prob " : direction_prob ,
" short_open_prob " : direction_prob ,
" long_open_prob " : long_prob ,
" short_open_prob " : short_prob ,
" min_entry_prob " : entry_prob ,
" max_market_risk_prob " : risk_prob ,
" min_expected_edge_bps " : edge_bps ,
" min_direction_margin " : margin ,
}
for direction_prob , entry_prob , risk_prob , edge_bps , margin in values
for long_prob , short_prob , entry_prob , risk_prob , edge_bps , margin in values
]
@@ -213,6 +288,72 @@ def _probe_score(metrics: dict[str, Any]) -> float:
)
def _side_metrics(split_trade_frames: dict[str, pd.DataFrame]) -> dict[str, dict[str, dict[str, Any]]]:
    """Compute per-split, per-side trade and exit metrics.

    Args:
        split_trade_frames: Mapping from split id to that split's trades frame.

    Returns:
        ``{split_id: {"LONG": {...}, "SHORT": {...}}}`` where each inner dict
        merges ``_trade_metrics`` with ``_exit_metrics``; on a key collision
        the exit metric wins.
    """
    metrics: dict[str, dict[str, dict[str, Any]]] = {}
    for split_id, trades in split_trade_frames.items():
        per_side: dict[str, dict[str, Any]] = {}
        for side in ("LONG", "SHORT"):
            if trades.empty:
                # Skip the "side" lookup on an empty frame (presumably it may have no columns).
                side_trades = trades.copy()
            else:
                side_trades = trades[trades["side"].eq(side)].copy()
            per_side[side] = {**_trade_metrics(side_trades), **_exit_metrics(side_trades)}
        metrics[split_id] = per_side
    return metrics
def _exit_metrics ( trades : pd . DataFrame ) - > dict [ str , float ] :
if trades . empty :
return {
" target_hit_rate " : 0.0 ,
" stop_hit_rate " : 0.0 ,
" timeout_exit_rate " : 0.0 ,
" avg_time_to_exit_min " : 0.0 ,
" p50_time_to_exit_min " : 0.0 ,
}
target_hit = pd . to_numeric ( trades [ " target_hit " ] , errors = " coerce " ) . fillna ( 0 ) . astype ( int )
stop_hit = pd . to_numeric ( trades [ " stop_hit " ] , errors = " coerce " ) . fillna ( 0 ) . astype ( int )
time_to_exit_min = pd . to_numeric ( trades [ " time_to_exit_ms " ] , errors = " coerce " ) . fillna ( 0.0 ) . astype ( float ) / 60_000.0
return {
" target_hit_rate " : float ( target_hit . eq ( 1 ) . mean ( ) ) ,
" stop_hit_rate " : float ( stop_hit . eq ( 1 ) . mean ( ) ) ,
" timeout_exit_rate " : float ( ( target_hit . ne ( 1 ) & stop_hit . ne ( 1 ) ) . mean ( ) ) ,
" avg_time_to_exit_min " : float ( time_to_exit_min . mean ( ) ) ,
" p50_time_to_exit_min " : float ( time_to_exit_min . median ( ) ) ,
}
def _side_metrics_frame ( side_metrics : dict [ str , dict [ str , dict [ str , Any ] ] ] ) - > pd . DataFrame :
rows : list [ dict [ str , Any ] ] = [ ]
for split_id , split_metrics in side_metrics . items ( ) :
for side , metrics in split_metrics . items ( ) :
rows . append ( { " split_id " : split_id , " side " : side , * * metrics } )
return pd . DataFrame ( rows )
def _candidate_summary ( tune_frame : pd . DataFrame ) - > dict [ str , Any ] :
if tune_frame . empty :
return {
" positive_avg_weighted_candidates " : 0 ,
" positive_total_weighted_candidates " : 0 ,
" best_avg_weighted_edge_bps " : 0.0 ,
" best_total_weighted_edge_bps " : 0.0 ,
" min_viable_trade_count " : 80 ,
" positive_avg_weighted_viable_candidates " : 0 ,
" positive_total_weighted_viable_candidates " : 0 ,
" best_viable_avg_weighted_edge_bps " : 0.0 ,
" best_viable_total_weighted_edge_bps " : 0.0 ,
}
viable = tune_frame [ tune_frame [ " trade_count " ] > = 80 ]
return {
" positive_avg_weighted_candidates " : int ( ( tune_frame [ " avg_weighted_edge_bps " ] > 0 ) . sum ( ) ) ,
" positive_total_weighted_candidates " : int ( ( tune_frame [ " total_weighted_edge_bps " ] > 0 ) . sum ( ) ) ,
" best_avg_weighted_edge_bps " : float ( tune_frame [ " avg_weighted_edge_bps " ] . max ( ) ) ,
" best_total_weighted_edge_bps " : float ( tune_frame [ " total_weighted_edge_bps " ] . max ( ) ) ,
" min_viable_trade_count " : 80 ,
" positive_avg_weighted_viable_candidates " : int ( ( viable [ " avg_weighted_edge_bps " ] > 0 ) . sum ( ) ) ,
" positive_total_weighted_viable_candidates " : int ( ( viable [ " total_weighted_edge_bps " ] > 0 ) . sum ( ) ) ,
" best_viable_avg_weighted_edge_bps " : float ( viable [ " avg_weighted_edge_bps " ] . max ( ) ) if not viable . empty else 0.0 ,
" best_viable_total_weighted_edge_bps " : float ( viable [ " total_weighted_edge_bps " ] . max ( ) ) if not viable . empty else 0.0 ,
}
def _verdict ( metrics : dict [ str , Any ] ) - > dict [ str , Any ] :
tune = metrics [ TUNE_SPLIT ]
validation = metrics [ VALIDATION_LOCKED_SPLIT ]
@@ -236,14 +377,22 @@ def _x(frame: pd.DataFrame) -> np.ndarray:
def _markdown_report ( result : dict [ str , Any ] , top_candidates : pd . DataFrame ) - > str :
mode_text = " 只替换 Entry, Direction 使用当前模型输出。 " if result [ " probe_mode " ] == " entry_tree_only " else " Direction 和 Entry 都替换成树模型。 "
lines = [
" # Nonlinear PM Probe Report " ,
" " ,
" 这份报告只做诊断,不导出上线模型。它回答:不加新特征,只 换成树模型后,PM 能不能筛出稳定正收益。 " ,
" 这份报告只做诊断,不导出上线模型。它回答:不加新特征,换成树模型后,PM 能不能筛出稳定正收益。 " ,
" " ,
f " - run_id: ` { result [ ' run_id ' ] } ` " ,
f " - probe_mode: ` { result [ ' probe_mode ' ] } ` " ,
f " - 说明: { mode_text } " ,
f " - Entry 训练人群: ` { result [ ' entry_train_filter ' ] } ` " ,
f " - Entry 机会阈值: ` { result [ ' entry_opportunity_bps ' ] } ` bps " ,
f " - verdict: ` { result [ ' verdict ' ] [ ' status ' ] } ` " ,
f " - candidate_count: ` { result [ ' candidate_count ' ] } ` " ,
f " - 正收益候选数: ` { result [ ' candidate_summary ' ] [ ' positive_avg_weighted_candidates ' ] } ` " ,
f " - 至少 80 单的正收益候选数: ` { result [ ' candidate_summary ' ] [ ' positive_avg_weighted_viable_candidates ' ] } ` " ,
f " - 至少 80 单的最好单笔加权收益: ` { result [ ' candidate_summary ' ] [ ' best_viable_avg_weighted_edge_bps ' ] : .4f } ` bps " ,
f " - best_thresholds: ` { result [ ' best_thresholds ' ] } ` " ,
" " ,
" ## Split Metrics " ,
@@ -257,6 +406,23 @@ def _markdown_report(result: dict[str, Any], top_candidates: pd.DataFrame) -> st
f " { metrics [ ' avg_actual_edge_bps ' ] : .4f } | { metrics [ ' avg_weighted_edge_bps ' ] : .4f } | "
f " { metrics [ ' total_weighted_edge_bps ' ] : .4f } | { metrics [ ' profit_factor ' ] : .4f } | "
)
lines . extend (
[
" " ,
" ## Side Breakdown " ,
" " ,
" | split | side | trades | win_rate | avg_actual_bps | avg_weighted_bps | target_hit_rate | stop_hit_rate | timeout_rate | avg_exit_min | " ,
" | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | " ,
]
)
for split_id , side_metrics in result [ " side_metrics " ] . items ( ) :
for side , metrics in side_metrics . items ( ) :
lines . append (
f " | { split_id } | { side } | { metrics [ ' trade_count ' ] } | { metrics [ ' win_rate ' ] : .4f } | "
f " { metrics [ ' avg_actual_edge_bps ' ] : .4f } | { metrics [ ' avg_weighted_edge_bps ' ] : .4f } | "
f " { metrics [ ' target_hit_rate ' ] : .4f } | { metrics [ ' stop_hit_rate ' ] : .4f } | "
f " { metrics [ ' timeout_exit_rate ' ] : .4f } | { metrics [ ' avg_time_to_exit_min ' ] : .2f } | "
)
lines . extend ( [ " " , " ## Top Tune Candidates " , " " , _candidate_table ( top_candidates ) , " " ] )
return " \n " . join ( lines )