Improve Trader V4 training pipeline
Align entry labels with max future edge, tune direction labeling, and harden regression evaluation. Add training diagnostics, price-plan search, feature screening, and nonlinear benchmark scripts.
This commit is contained in:
@@ -275,107 +275,161 @@ def _asof_column(
|
||||
return merged
|
||||
|
||||
|
||||
def build_replay_1m(args: Any) -> None:
|
||||
root = run_root(args)
|
||||
raw_root = args.raw_root or DEFAULT_RAW_ROOT
|
||||
logging.info("trader.training.replay_started runId=%s symbol=%s rawRoot=%s", args.run_id, args.symbol, raw_root)
|
||||
replay = _read_candles(raw_root, args.symbol, args.start_date, args.end_date)
|
||||
trades = _read_trades(raw_root, args.symbol, args.start_date, args.end_date)
|
||||
level1 = _read_level1(raw_root, args.symbol, args.start_date, args.end_date)
|
||||
liquidations = _read_liquidations(raw_root, args.symbol, args.start_date, args.end_date)
|
||||
# Market fields that must be non-null on every row of a day for that day to
# count as replay-ready. Kept as a list because it is used directly as a
# DataFrame column selector.
REPLAY_REQUIRED_COLUMNS = [
    "open",
    "high",
    "low",
    "close",
    "volume",
    "best_bid_price",
    "best_ask_price",
    "spread_bps",
    "level1_ofi_1m",
    "funding_bps",
    "mark_price",
    "index_price",
    "open_interest",
]
|
||||
|
||||
# Column selection (and column order) applied to the replay frame before it is
# written to replay_1m.parquet. Kept as a list because it is used directly as
# a DataFrame column selector.
REPLAY_OUTPUT_COLUMNS = [
    "symbol",
    "timeframe",
    "event_time",
    "open_time_ms",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "taker_buy_volume",
    "taker_sell_volume",
    "funding_bps",
    "mark_price",
    "index_price",
    "next_funding_time",
    "open_interest",
    "best_bid_price",
    "best_ask_price",
    "spread_bps",
    "level1_ofi_1m",
    "liquidation_buy_notional_1m",
    "liquidation_sell_notional_1m",
    "liquidation_available",
    "source_coverage",
]
|
||||
|
||||
|
||||
def _replay_date_texts(raw_root: Path, symbol: str, start_date: str | None, end_date: str | None) -> list[str]:
|
||||
if start_date and end_date:
|
||||
return [day.strftime("%Y-%m-%d") for day in pd.date_range(pd.Timestamp(start_date), pd.Timestamp(end_date), freq="D")]
|
||||
files = partition_files(raw_root, "candles", symbol, start_date, end_date)
|
||||
dates = sorted({next((part.split("=", 1)[1] for part in file.parts if part.startswith("dt=")), "") for file in files})
|
||||
return [date for date in dates if date]
|
||||
|
||||
|
||||
def _previous_date_text(day: str) -> str:
|
||||
return (pd.Timestamp(day) - pd.Timedelta(days=1)).strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
def _build_replay_day(raw_root: Path, symbol: str, day: str) -> pd.DataFrame:
    """Build the 1-minute replay frame for a single day.

    Reads the day's candles, left-joins trades, level-1 and liquidation data
    onto them, then attaches as-of funding and open-interest values.
    Readiness of the day (row count, missing required fields) is not judged
    here; the caller evaluates the returned frame.

    Args:
        raw_root: Root directory of the raw partitions.
        symbol: Symbol to read.
        day: Day to build, as ``YYYY-MM-DD``.

    Returns:
        The merged frame for ``day`` with ``timeframe`` and
        ``source_coverage`` columns added.
    """
    join_keys = ["symbol", "event_time", "open_time_ms"]

    replay = _read_candles(raw_root, symbol, day, day)
    # Keep only rows whose event_time falls on the requested day.
    replay = replay[replay["event_time"].dt.strftime("%Y-%m-%d").eq(day)].copy()
    trades = _read_trades(raw_root, symbol, day, day)
    level1 = _read_level1(raw_root, symbol, day, day)
    liquidations = _read_liquidations(raw_root, symbol, day, day)
    replay = replay.merge(trades, on=join_keys, how="left")
    replay = replay.merge(level1, on=join_keys, how="left")
    replay = replay.merge(liquidations, on=join_keys, how="left")

    # Minutes with no matching trade/liquidation row count as zero activity.
    replay[["taker_buy_volume", "taker_sell_volume"]] = replay[["taker_buy_volume", "taker_sell_volume"]].fillna(0.0)
    for column in ("liquidation_buy_notional_1m", "liquidation_sell_notional_1m", "liquidation_available"):
        replay[column] = pd.to_numeric(replay[column], errors="coerce").fillna(0.0)

    # Funding and open interest are as-of values. Include the previous UTC day so
    # the first minutes of a day can use the last known value without reading the
    # whole training window into memory.
    lookback_start = _previous_date_text(day)
    funding = _asof_column(replay, raw_root, "funding", symbol, lookback_start, day, ("rate", "mark_price", "index_price", "next_funding_time"))
    funding = funding.rename(columns={"rate": "funding_rate"})
    # Express the funding rate in basis points (rate * 10000).
    funding["funding_bps"] = pd.to_numeric(funding["funding_rate"], errors="coerce") * 10000.0
    replay = replay.merge(funding.drop(columns=["funding_rate"]), on=["symbol", "event_time"], how="left")
    replay["next_funding_time"] = to_utc_series(replay["next_funding_time"])

    oi = _asof_column(replay, raw_root, "open_interest", symbol, lookback_start, day, ("open_interest",))
    replay = replay.merge(oi, on=["symbol", "event_time"], how="left")
    replay["timeframe"] = "1m"
    replay["source_coverage"] = "crypto_lake_raw"
    return replay
|
||||
|
||||
|
||||
def build_replay_1m(args: Any) -> None:
|
||||
root = run_root(args)
|
||||
raw_root = args.raw_root or DEFAULT_RAW_ROOT
|
||||
logging.info("trader.training.replay_started runId=%s symbol=%s rawRoot=%s", args.run_id, args.symbol, raw_root)
|
||||
dates = _replay_date_texts(raw_root, args.symbol, args.start_date, args.end_date)
|
||||
if not dates:
|
||||
raise ValueError("no candle dates are available for replay_1m")
|
||||
|
||||
ready_days: list[str] = []
|
||||
excluded_days: list[dict[str, Any]] = []
|
||||
ready_frames: list[pd.DataFrame] = []
|
||||
row_before_filter = 0
|
||||
for index, day in enumerate(dates, start=1):
|
||||
logging.info("trader.training.replay_day_started runId=%s day=%s index=%s total=%s", args.run_id, day, index, len(dates))
|
||||
try:
|
||||
day_replay = _build_replay_day(raw_root, args.symbol, day)
|
||||
except Exception as exc:
|
||||
excluded_days.append(
|
||||
{
|
||||
"date": day,
|
||||
"row_count": 0,
|
||||
"missing_required_rows": 0,
|
||||
"reason": "DAY_BUILD_FAILED",
|
||||
"error": str(exc),
|
||||
}
|
||||
)
|
||||
logging.warning("trader.training.replay_day_failed runId=%s day=%s error=%s", args.run_id, day, exc)
|
||||
continue
|
||||
|
||||
row_count = len(day_replay)
|
||||
row_before_filter += row_count
|
||||
missing_required_rows = int(day_replay[REPLAY_REQUIRED_COLUMNS].isna().any(axis=1).sum())
|
||||
ready = row_count >= int(args.min_minutes_per_day) and missing_required_rows == 0
|
||||
if ready:
|
||||
ready_days.append(day)
|
||||
ready_frames.append(day_replay[REPLAY_OUTPUT_COLUMNS].copy())
|
||||
else:
|
||||
excluded_days.append(
|
||||
{
|
||||
"date": day,
|
||||
"row_count": int(row_count),
|
||||
"missing_required_rows": missing_required_rows,
|
||||
"reason": "MISSING_REQUIRED_MARKET_FIELDS" if missing_required_rows else "INCOMPLETE_MINUTE_COUNT",
|
||||
}
|
||||
)
|
||||
logging.info(
|
||||
"trader.training.replay_day_finished runId=%s day=%s ready=%s rows=%s missingRequiredRows=%s",
|
||||
args.run_id,
|
||||
day,
|
||||
ready,
|
||||
row_count,
|
||||
missing_required_rows,
|
||||
)
|
||||
|
||||
if len(ready_days) < int(args.min_replay_ready_days):
|
||||
write_json(root / "replay" / "excluded_days.json", excluded_days)
|
||||
write_text(root / "replay" / "replay_ready_days.txt", "\n".join(ready_days) + ("\n" if ready_days else ""))
|
||||
raise ValueError(f"replay_1m has only {len(ready_days)} replay-ready days, required {args.min_replay_ready_days}")
|
||||
before_filter = len(replay)
|
||||
replay = replay[replay["event_date"].isin(ready_days)].copy()
|
||||
replay = pd.concat(ready_frames, ignore_index=True)
|
||||
logging.info(
|
||||
"trader.training.replay_ready_days_selected runId=%s readyDays=%s excludedDays=%s rowBefore=%s rowAfter=%s",
|
||||
args.run_id,
|
||||
len(ready_days),
|
||||
len(excluded_days),
|
||||
before_filter,
|
||||
row_before_filter,
|
||||
len(replay),
|
||||
)
|
||||
|
||||
columns = [
|
||||
"symbol",
|
||||
"timeframe",
|
||||
"event_time",
|
||||
"open_time_ms",
|
||||
"open",
|
||||
"high",
|
||||
"low",
|
||||
"close",
|
||||
"volume",
|
||||
"taker_buy_volume",
|
||||
"taker_sell_volume",
|
||||
"funding_bps",
|
||||
"mark_price",
|
||||
"index_price",
|
||||
"next_funding_time",
|
||||
"open_interest",
|
||||
"best_bid_price",
|
||||
"best_ask_price",
|
||||
"spread_bps",
|
||||
"level1_ofi_1m",
|
||||
"liquidation_buy_notional_1m",
|
||||
"liquidation_sell_notional_1m",
|
||||
"liquidation_available",
|
||||
"source_coverage",
|
||||
]
|
||||
replay = replay[columns].sort_values(["symbol", "event_time"]).reset_index(drop=True)
|
||||
replay = replay[REPLAY_OUTPUT_COLUMNS].sort_values(["symbol", "event_time"]).reset_index(drop=True)
|
||||
path = root / "replay" / "replay_1m.parquet"
|
||||
data_hash = write_parquet(path, replay)
|
||||
write_json(
|
||||
|
||||
Reference in New Issue
Block a user