from __future__ import annotations

import logging
from typing import Any

import numpy as np
import pandas as pd

from trader_training.io_utils import (
    manifest,
    read_parquet,
    require_columns,
    run_root,
    sha256_json,
    to_utc_series,
    write_json,
    write_parquet,
    write_text,
)
from trader_training.replay import assign_split
from trader_training.schemas import (
    FEATURE_ORDER,
    FEATURE_VERSION,
    FEATURES,
    FIT_SPLIT,
    TUNE_SPLIT,
    VALIDATION_LOCKED_SPLIT,
)

# Builds the per-minute feature frame from the 1-minute replay table and writes
# the frame, its manifest, the feature order/schema files and a markdown
# quality report.

# Non-feature columns carried into the output frame, in output order.
META_COLUMNS = [
    "sample_id",
    "symbol",
    "event_time",
    "open_time_ms",
    "split_id",
    "walk_forward_fold",
    "feature_version",
    "data_quality_flag",
]

# Multiplier used to express a ratio in basis points.
_BPS = 10000.0

# Number of 1-minute rows a (symbol, date) pair must have to count as complete.
_MINUTES_PER_DAY = 1440

# Columns that must be present in the replay table.
_REQUIRED_REPLAY_COLUMNS = [
    "symbol",
    "event_time",
    "open_time_ms",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "taker_buy_volume",
    "taker_sell_volume",
    "funding_bps",
    "mark_price",
    "index_price",
    "next_funding_time",
    "open_interest",
    "spread_bps",
    "level1_ofi_1m",
    "liquidation_buy_notional_1m",
    "liquidation_sell_notional_1m",
    "liquidation_available",
]


def _safe_divide(numerator: pd.Series, denominator: pd.Series, default: float = 0.0) -> pd.Series:
    """Divide two series, mapping zero denominators, infinities and NaN to ``default``."""
    result = numerator / denominator.replace(0, np.nan)
    return result.replace([np.inf, -np.inf], np.nan).fillna(default)


def _rolling_rank_last(values: pd.Series, window: int) -> pd.Series:
    """Return, per row, the share of the trailing ``window`` values that are <= the latest value.

    The rolling call uses ``min_periods=window``, so rows without a full
    window are NaN.
    """

    def calc(raw: np.ndarray) -> float:
        last = raw[-1]
        return float(np.sum(raw <= last) / len(raw))

    return values.rolling(window, min_periods=window).apply(calc, raw=True)


def _complete_days(frame: pd.DataFrame) -> pd.DataFrame:
    """Keep only rows whose (symbol, calendar date) pair has exactly 1440 rows.

    The date is derived from ``event_time`` formatted as ``%Y-%m-%d``; the
    temporary ``event_date`` column is dropped before returning.
    """
    frame = frame.copy()
    frame["event_date"] = frame["event_time"].dt.strftime("%Y-%m-%d")
    counts = frame.groupby(["symbol", "event_date"])["event_time"].count()
    complete = counts[counts == _MINUTES_PER_DAY].reset_index()[["symbol", "event_date"]]
    return frame.merge(complete, on=["symbol", "event_date"], how="inner").drop(columns=["event_date"])


def _symbol_features(symbol: Any, group: pd.DataFrame) -> pd.DataFrame:
    """Compute every derived feature column for the rows of a single symbol.

    Args:
        symbol: Group key; written back into the ``symbol`` column.
        group: Replay rows of that symbol.

    Returns:
        A copy of ``group`` sorted by ``event_time`` with the derived columns
        added. Every rolling window is trailing, and the ``prev_*`` channels
        are shifted by one bar so they exclude the current bar.
    """
    group = group.sort_values("event_time").reset_index(drop=True).copy()
    close = group["close"].astype(float)
    high = group["high"].astype(float)
    low = group["low"].astype(float)
    volume = group["volume"].astype(float)

    # Returns and realized volatility.
    log_ret = np.log(close / close.shift(1))
    for horizon in (1, 5, 15, 60, 240):
        group[f"ret_{horizon}m_bps"] = (close / close.shift(horizon) - 1.0) * _BPS
    group["realized_vol_15m_bps"] = log_ret.rolling(15, min_periods=15).std() * _BPS
    group["realized_vol_60m_bps"] = log_ret.rolling(60, min_periods=60).std() * _BPS
    group["vol_ratio_15m_60m"] = _safe_divide(
        group["realized_vol_15m_bps"],
        group["realized_vol_60m_bps"].clip(lower=1.0),
    )

    # Rolling extremes are computed once and reused by range, channel,
    # failed-break and sweep features.
    high15 = high.rolling(15, min_periods=15).max()
    low15 = low.rolling(15, min_periods=15).min()
    high60 = high.rolling(60, min_periods=60).max()
    low60 = low.rolling(60, min_periods=60).min()
    group["range_15m_bps"] = (high15 / low15 - 1.0) * _BPS
    group["range_60m_bps"] = (high60 / low60 - 1.0) * _BPS

    vol_mean = volume.rolling(60, min_periods=60).mean()
    # A zero standard deviation becomes NaN so the z-score falls back to 0.0.
    vol_std = volume.rolling(60, min_periods=60).std().replace(0, np.nan)
    group["volume_zscore_60m"] = ((volume - vol_mean) / vol_std).fillna(0.0)
    group["trend_consistency_15m"] = np.sign(group["ret_1m_bps"]).rolling(15, min_periods=15).mean()

    # Channel position and breakouts against the previous 60 bars.
    group["channel_position_60m_pct"] = ((close - low60) / (high60 - low60).clip(lower=1e-12)).clip(0.0, 1.0)
    prev_high60 = high.shift(1).rolling(60, min_periods=60).max()
    prev_low60 = low.shift(1).rolling(60, min_periods=60).min()
    group["upper_breakout_60m_bps"] = ((close / prev_high60 - 1.0).clip(lower=0.0)) * _BPS
    group["lower_breakout_60m_bps"] = ((prev_low60 / close - 1.0).clip(lower=0.0)) * _BPS

    # Comparisons against NaN are False, so these are 0.0 while the channels
    # are still warming up.
    broke_up = high15 > prev_high60
    broke_down = low15 < prev_low60
    group["upper_failed_break_reclaim_15m_bps"] = np.where(
        broke_up, ((prev_high60 - close).clip(lower=0.0) / close) * _BPS, 0.0
    )
    group["lower_failed_break_reclaim_15m_bps"] = np.where(
        broke_down, ((close - prev_low60).clip(lower=0.0) / close) * _BPS, 0.0
    )
    group["sweep_up_15m_bps"] = ((high15 / close - 1.0).clip(lower=0.0)) * _BPS
    group["sweep_down_15m_bps"] = ((close / low15 - 1.0).clip(lower=0.0)) * _BPS

    # Compression: one minus the rank of the current 15m range in the last 240 bars.
    rank = _rolling_rank_last(group["range_15m_bps"], 240)
    group["compression_score_4h_pct"] = 1.0 - rank
    range_median_4h = group["range_15m_bps"].rolling(240, min_periods=240).median()
    group["compression_release_15m_bps"] = (group["range_15m_bps"] - range_median_4h).clip(lower=0.0)

    # Taker flow imbalance.
    buy = group["taker_buy_volume"].astype(float)
    sell = group["taker_sell_volume"].astype(float)
    group["taker_imbalance_1m"] = _safe_divide(buy - sell, buy + sell)
    for window in (5, 15):
        net = buy.rolling(window, min_periods=window).sum() - sell.rolling(window, min_periods=window).sum()
        gross = (buy + sell).rolling(window, min_periods=window).sum()
        group[f"taker_imbalance_{window}m"] = _safe_divide(net, gross)

    group["spread_rank_24h_pct"] = _rolling_rank_last(group["spread_bps"].astype(float), _MINUTES_PER_DAY)

    # Open interest and mark/index basis.
    open_interest = group["open_interest"].astype(float)
    group["oi_delta_15m_bps"] = (open_interest / open_interest.shift(15) - 1.0) * _BPS
    group["oi_delta_60m_bps"] = (open_interest / open_interest.shift(60) - 1.0) * _BPS
    group["mark_index_basis_bps"] = (
        group["mark_price"].astype(float) / group["index_price"].astype(float) - 1.0
    ) * _BPS

    # Liquidations: the 15-bar sums use min_periods=1, so they are defined from
    # the first row onwards.
    liq_buy = group["liquidation_buy_notional_1m"].astype(float)
    liq_sell = group["liquidation_sell_notional_1m"].astype(float)
    liq_total_15 = (liq_buy + liq_sell).rolling(15, min_periods=1).sum()
    group["liquidation_imbalance_15m"] = _safe_divide(
        liq_buy.rolling(15, min_periods=1).sum() - liq_sell.rolling(15, min_periods=1).sum(),
        liq_total_15,
    )
    liq_mean = liq_total_15.rolling(_MINUTES_PER_DAY, min_periods=60).mean()
    liq_std = liq_total_15.rolling(_MINUTES_PER_DAY, min_periods=60).std().replace(0, np.nan)
    group["liquidation_notional_zscore_15m"] = ((liq_total_15 - liq_mean) / liq_std).fillna(0.0)

    # Time-of-day encoding and minutes until the next funding timestamp.
    minute_of_day = group["event_time"].dt.hour * 60 + group["event_time"].dt.minute
    group["minute_of_day_sin"] = np.sin(2 * np.pi * minute_of_day / 1440.0)
    group["minute_of_day_cos"] = np.cos(2 * np.pi * minute_of_day / 1440.0)
    minutes_to_funding = (group["next_funding_time"] - group["event_time"]).dt.total_seconds() / 60.0
    group["minutes_to_next_funding"] = minutes_to_funding.clip(0.0, 480.0)

    group["symbol"] = symbol
    return group


def build_feature_frame(args: Any) -> None:
    """Build the feature frame from the replay table and write all feature artifacts.

    Reads ``args.replay_path`` and ``args.split_manifest_path`` (each falling
    back to a default location under ``run_root(args)`` when falsy),
    ``args.allow_incomplete_days`` and ``args.run_id``.

    Under ``<root>/feature`` it writes ``feature_frame.parquet``,
    ``feature_order.json``, ``feature_schema.json``,
    ``feature_frame.manifest.json`` and ``feature_quality_report.md``.

    Rows are flagged ``WARMUP`` when any feature is NaN, otherwise
    ``PARTIAL_OPTIONAL`` when ``liquidation_available`` is missing or 0,
    otherwise ``OK``.

    Raises:
        ValueError: If no replay rows remain to build features from.
    """
    root = run_root(args)
    replay_path = args.replay_path or root / "replay" / "replay_1m.parquet"
    split_manifest_path = args.split_manifest_path or root / "split" / "split_manifest.json"

    replay = read_parquet(replay_path)
    require_columns(replay, _REQUIRED_REPLAY_COLUMNS, "replay_1m")
    replay = replay.copy()
    replay["event_time"] = to_utc_series(replay["event_time"])
    replay["next_funding_time"] = to_utc_series(replay["next_funding_time"])
    replay = replay.sort_values(["symbol", "event_time"]).reset_index(drop=True)

    if not args.allow_incomplete_days:
        before = len(replay)
        replay = _complete_days(replay)
        logging.info("trader.training.feature_complete_days rowBefore=%s rowAfter=%s", before, len(replay))

    frames: list[pd.DataFrame] = [
        _symbol_features(symbol, group) for symbol, group in replay.groupby("symbol", sort=False)
    ]
    if not frames:
        # pd.concat([]) would raise the opaque "No objects to concatenate";
        # raise the same exception type with an actionable message instead.
        raise ValueError(
            f"replay_1m has no rows to build features from (path={replay_path}, "
            f"allow_incomplete_days={bool(args.allow_incomplete_days)})"
        )

    frame = pd.concat(frames, ignore_index=True)
    frame["sample_id"] = frame["symbol"].astype(str) + ":" + frame["open_time_ms"].astype(str)
    frame["split_id"] = assign_split(frame["event_time"], split_manifest_path)
    frame["walk_forward_fold"] = np.where(frame["split_id"].eq(FIT_SPLIT), "fold_01", "NO_FOLD")
    frame["feature_version"] = FEATURE_VERSION

    hard_na = frame[FEATURE_ORDER].isna().any(axis=1)
    optional_missing = frame["liquidation_available"].fillna(0).eq(0)
    frame["data_quality_flag"] = np.where(
        hard_na, "WARMUP", np.where(optional_missing, "PARTIAL_OPTIONAL", "OK")
    )

    ordered = frame[META_COLUMNS + FEATURE_ORDER].copy()
    for feature in FEATURE_ORDER:
        ordered[feature] = pd.to_numeric(ordered[feature], errors="coerce").astype("float32")

    feature_dir = root / "feature"
    frame_path = feature_dir / "feature_frame.parquet"
    data_hash = write_parquet(frame_path, ordered)
    schema = [feature.as_json() for feature in FEATURES]
    # Named *_hash locals avoid shadowing the module-level feature_order_hash().
    order_hash = write_json(feature_dir / "feature_order.json", FEATURE_ORDER)
    schema_hash = write_json(feature_dir / "feature_schema.json", schema)
    write_json(
        feature_dir / "feature_frame.manifest.json",
        manifest(
            frame_path,
            {
                "row_count": len(ordered),
                "ok_row_count": _flag_count(ordered, "OK"),
                "partial_optional_row_count": _flag_count(ordered, "PARTIAL_OPTIONAL"),
                "warmup_row_count": _flag_count(ordered, "WARMUP"),
                "feature_count": len(FEATURE_ORDER),
                "feature_version": FEATURE_VERSION,
                "feature_order_hash": order_hash,
                "feature_schema_hash": schema_hash,
                "data_hash_sha256": data_hash,
            },
        ),
    )
    write_feature_report(feature_dir / "feature_quality_report.md", ordered, schema_hash, order_hash)
    logging.info(
        "trader.training.feature_written runId=%s rowCount=%s splitCounts=%s eventFrom=%s eventTo=%s path=%s",
        args.run_id,
        len(ordered),
        ordered["split_id"].value_counts().to_dict(),
        ordered["event_time"].min(),
        ordered["event_time"].max(),
        frame_path,
    )


def _flag_count(frame: pd.DataFrame, flag: str) -> int:
    """Count the rows of ``frame`` whose ``data_quality_flag`` equals ``flag``."""
    return int(frame["data_quality_flag"].eq(flag).sum())


def write_feature_report(path, frame: pd.DataFrame, feature_schema_hash: str, feature_order_hash: str) -> None:
    """Write the markdown feature quality report to ``path``.

    Args:
        path: Destination passed to ``write_text``.
        frame: Feature frame holding ``split_id``, ``event_time``,
            ``data_quality_flag`` and every column in ``FEATURE_ORDER``.
        feature_schema_hash: Hash printed in the report header.
        feature_order_hash: Hash printed in the report header. This parameter
            shadows the module-level function of the same name inside this
            function.

    The report contains split coverage, source coverage, a leakage statement,
    per-feature NaN/inf/finite counts, highly correlated feature pairs,
    per-split quantile drift and per-feature distribution quantiles.
    """
    split_rows = [
        {
            "split_id": split_id,
            "rows": len(group),
            "start": str(group["event_time"].min()),
            "end": str(group["event_time"].max()),
            "ok": _flag_count(group, "OK"),
            "partial_optional": _flag_count(group, "PARTIAL_OPTIONAL"),
            "warmup": _flag_count(group, "WARMUP"),
        }
        for split_id, group in frame.groupby("split_id", sort=True)
    ]

    finite_rows = []
    for feature in FEATURE_ORDER:
        series = pd.to_numeric(frame[feature], errors="coerce")
        values = series.to_numpy(dtype=float)
        finite_rows.append(
            {
                "feature": feature,
                "nan_count": int(series.isna().sum()),
                "inf_count": int(np.isinf(values).sum()),
                "finite_count": int(np.isfinite(values).sum()),
            }
        )

    correlation_rows = _high_correlation_rows(frame)
    drift_rows = _drift_rows(frame)
    partial_optional_total = _flag_count(frame, "PARTIAL_OPTIONAL")

    # The frame handed in by build_feature_frame is restricted to
    # META_COLUMNS + FEATURE_ORDER, so this column is only present when it is
    # itself a feature. Report "n/a" rather than failing with KeyError.
    if "liquidation_available" in frame.columns:
        liquidation_share = f"{float(frame['liquidation_available'].mean()):.6f}"
    else:
        liquidation_share = "n/a"

    lines = [
        "# Trader Feature Quality Report",
        "",
        f"- row_count: {len(frame)}",
        f"- OK: {_flag_count(frame, 'OK')}",
        f"- PARTIAL_OPTIONAL: {partial_optional_total}",
        f"- WARMUP: {_flag_count(frame, 'WARMUP')}",
        f"- feature_schema_hash: {feature_schema_hash}",
        f"- feature_order_hash: {feature_order_hash}",
        "",
        "## Split Coverage",
        "",
        _markdown_table(split_rows, ["split_id", "rows", "start", "end", "ok", "partial_optional", "warmup"]),
        "",
        "## Source Coverage",
        "",
        "- replay_1m_required_columns: present",
        f"- liquidation_available_share: {liquidation_share}",
        f"- feature_rows_with_optional_liquidation_missing: {partial_optional_total}",
        "",
        "## Leakage Check",
        "",
        "- 所有特征只使用当前分钟收盘后已经知道的数据,滚动窗口都只看 `<= t`。",
        "- 未来价格、未来收益、目标标签不进入 `feature_frame.parquet`。",
        "",
        "## Extreme Value Check",
        "",
        _markdown_table(finite_rows, ["feature", "nan_count", "inf_count", "finite_count"]),
        "",
        "## High Correlation Check",
        "",
        _markdown_table(correlation_rows, ["feature_a", "feature_b", "corr_abs"]),
        "",
        "## Drift Check",
        "",
        _markdown_table(
            drift_rows,
            [
                "feature",
                "train_p50",
                "tune_p50",
                "validation_p50",
                "p50_diff",
                "train_p99",
                "tune_p99",
                "validation_p99",
                "p99_diff",
            ],
        ),
        "",
        "## Distribution",
        "",
        "| feature | null_count | min | p01 | p50 | p99 | max |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
    ]
    for feature in FEATURE_ORDER:
        series = pd.to_numeric(frame[feature], errors="coerce")
        quantiles = series.quantile([0.01, 0.5, 0.99])
        lines.append(
            f"| {feature} | {int(series.isna().sum())} | {series.min():.6g} | {quantiles.loc[0.01]:.6g} | "
            f"{quantiles.loc[0.5]:.6g} | {quantiles.loc[0.99]:.6g} | {series.max():.6g} |"
        )
    write_text(path, "\n".join(lines) + "\n")


def feature_order_hash() -> str:
    """Return ``sha256_json`` of ``FEATURE_ORDER``."""
    return sha256_json(FEATURE_ORDER)


def _high_correlation_rows(frame: pd.DataFrame) -> list[dict[str, object]]:
    """List feature pairs whose absolute correlation is at least 0.95.

    Rows with any NaN feature are dropped first, and at most 5000 rows are
    sampled with a fixed seed. At most 30 pairs are returned; when there are
    none (or no usable rows) a single ``NONE`` placeholder row is returned.
    """
    placeholder = [{"feature_a": "NONE", "feature_b": "NONE", "corr_abs": 0.0}]
    sample = frame[FEATURE_ORDER].apply(pd.to_numeric, errors="coerce").dropna()
    if len(sample) > 5000:
        sample = sample.sample(5000, random_state=7)
    if sample.empty:
        return placeholder

    corr = sample.corr().abs()
    rows = []
    for left_index, left in enumerate(FEATURE_ORDER):
        # Only pairs (left, right) with right after left: the upper triangle.
        for right in FEATURE_ORDER[left_index + 1 :]:
            value = corr.loc[left, right]
            if pd.notna(value) and value >= 0.95:
                rows.append({"feature_a": left, "feature_b": right, "corr_abs": round(float(value), 6)})
    return rows[:30] or placeholder


def _quantile_or_zero(series: pd.Series, q: float) -> float:
    """Return the ``q`` quantile of ``series`` as a float, or 0.0 when the series is empty."""
    return float(series.quantile(q)) if not series.empty else 0.0


def _drift_rows(frame: pd.DataFrame) -> list[dict[str, object]]:
    """Compare the p50 and p99 of every feature across the fit, tune and validation splits.

    The ``*_diff`` values are validation minus train. A split with no rows
    contributes 0.0 for its quantiles.
    """
    train = frame[frame["split_id"].eq(FIT_SPLIT)]
    validation = frame[frame["split_id"].eq(VALIDATION_LOCKED_SPLIT)]
    tune = frame[frame["split_id"].eq(TUNE_SPLIT)]

    rows = []
    for feature in FEATURE_ORDER:
        train_series = pd.to_numeric(train[feature], errors="coerce")
        validation_series = pd.to_numeric(validation[feature], errors="coerce")
        tune_series = pd.to_numeric(tune[feature], errors="coerce")

        train_p50 = _quantile_or_zero(train_series, 0.5)
        tune_p50 = _quantile_or_zero(tune_series, 0.5)
        validation_p50 = _quantile_or_zero(validation_series, 0.5)
        train_p99 = _quantile_or_zero(train_series, 0.99)
        tune_p99 = _quantile_or_zero(tune_series, 0.99)
        validation_p99 = _quantile_or_zero(validation_series, 0.99)

        rows.append(
            {
                "feature": feature,
                "train_p50": round(train_p50, 6),
                "tune_p50": round(tune_p50, 6),
                "validation_p50": round(validation_p50, 6),
                "p50_diff": round(validation_p50 - train_p50, 6),
                "train_p99": round(train_p99, 6),
                "tune_p99": round(tune_p99, 6),
                "validation_p99": round(validation_p99, 6),
                "p99_diff": round(validation_p99 - train_p99, 6),
            }
        )
    return rows


def _markdown_table(rows: list[dict[str, object]], columns: list[str]) -> str:
    """Render ``rows`` as a markdown table with the given ``columns``.

    Keys missing from a row render as an empty cell; an empty ``rows`` list
    renders a single row of empty cells.
    """
    if not rows:
        rows = [{column: "" for column in columns}]
    lines = [
        "| " + " | ".join(columns) + " |",
        "| " + " | ".join("---" for _ in columns) + " |",
    ]
    for row in rows:
        lines.append("| " + " | ".join(str(row.get(column, "")) for column in columns) + " |")
    return "\n".join(lines)