Handle sparse event buckets in entry screening
This commit is contained in:
@@ -146,7 +146,18 @@ def _bucket_edges(values: np.ndarray) -> np.ndarray:
|
||||
edges = np.quantile(clean, quantiles)
|
||||
edges = np.unique(edges)
|
||||
if edges.size < 3:
|
||||
return np.array([], dtype="float64")
|
||||
non_zero = clean[clean != 0.0]
|
||||
if non_zero.size < 300:
|
||||
return np.array([], dtype="float64")
|
||||
# 突破/扫单类特征常常绝大多数为 0。普通十分位会全挤在 0,
|
||||
# 这里单独保留“没有事件”和“有事件强弱”两类桶,避免漏掉稀有但可能有用的信号。
|
||||
event_edges = np.unique(np.quantile(non_zero, np.linspace(0.0, 1.0, 6)))
|
||||
if event_edges.size < 2:
|
||||
return np.array([-np.inf, 0.0, np.inf], dtype="float64")
|
||||
edges = np.unique(np.concatenate(([-np.inf, 0.0], event_edges[1:-1], [np.inf]))).astype("float64")
|
||||
if edges.size < 3:
|
||||
return np.array([], dtype="float64")
|
||||
return edges
|
||||
edges[0] = -np.inf
|
||||
edges[-1] = np.inf
|
||||
return edges
|
||||
|
||||
Reference in New Issue
Block a user