refactor: remove_small_objects用Otsu替代中位数25%

对连通域面积分布做 Otsu,自动确定噪声与真斑点之间的面积分界,不再使用人为设定的固定比例(原先为面积中位数的 25%)。
This commit is contained in:
2026-05-08 15:31:47 +08:00
parent b07e7a1182
commit 1041a66270
4 changed files with 57 additions and 17 deletions
+3
View File
@@ -15,3 +15,6 @@ Thumbs.db
# Obsidian # Obsidian
.obsidian/ .obsidian/
# Flask
.playwright-mcp/
+25 -7
View File
@@ -200,23 +200,41 @@ def remove_small_objects(binary: np.ndarray) -> np.ndarray:
""" """
自动去除小连通域(噪声)。 自动去除小连通域(噪声)。
统计所有连通域面积的中位数, 连通域面积分布做 Otsu 阈值检测:
小于中位数 25% 的视为噪声,直接剔除 面积分布天然双峰——噪声区(几个像素) 和 真斑点区(几百像素)
Otsu 自动找到两峰之间的最佳分界;面积严格小于该分界值的连通域视为噪声(面积恰好等于分界值的连通域会被保留)。
换图换分辨率都自动适应,不需要手动调参。 换图换分辨率都自动适应,不需要手动调参。
""" """
labeled, num = ndimage.label(binary) labeled, num = ndimage.label(binary)
if num == 0: if num == 0:
return binary # 全黑,直接返回 return binary
# 收集所有连通域的面积 # 收集所有连通域的面积
areas = [int(np.sum(labeled == i)) for i in range(1, num + 1)] areas = np.array([int(np.sum(labeled == i)) for i in range(1, num + 1)])
median = np.median(areas) # 面积中位数 if len(areas) < 2:
min_size = max(1, int(median * 0.25)) # 中位数的25%,最少1像素 return binary
# 对面积数组做 Otsu(与像素 Otsu 完全相同的原理)
# 将面积值当作"灰度",找到最小类内方差的分界点
best_T, best_cost, n_total = 0, float('inf'), len(areas)
for T in np.unique(areas):
small = areas[areas <= T] # 候选噪声组
large = areas[areas > T] # 候选真斑点组
w_s = len(small) / n_total
w_l = len(large) / n_total
if w_s == 0 or w_l == 0:
continue
cost = w_s * np.var(small) + w_l * np.var(large)
if cost < best_cost:
best_cost = cost
best_T = T
min_size = best_T # Otsu 自动找到的面积分界线
# 面积不达标的连通域整块置0 # 面积不达标的连通域整块置0
result = binary.copy() result = binary.copy()
for i in range(1, num + 1): for i in range(1, num + 1):
if areas[i - 1] < min_size: if int(np.sum(labeled == i)) < min_size:
result[labeled == i] = 0 result[labeled == i] = 0
return result return result
+19 -7
View File
@@ -207,21 +207,33 @@ def remove_small_objects(binary: np.ndarray) -> np.ndarray:
""" """
自动去除小连通域。 自动去除小连通域。
统计所有连通域面积中位数,小于中位数 25% 的视为噪声 连通域面积分布做 Otsu 阈值检测——面积天然双峰
自动剔除,不需要人设定阈值 Otsu 自动找到噪声峰和真斑点峰之间的最佳分界,零人工参数
""" """
labeled, num = ndimage.label(binary) labeled, num = ndimage.label(binary)
if num == 0: if num == 0:
return binary return binary
# 统计每个连通域的面积 areas = np.array([int(np.sum(labeled == i)) for i in range(1, num + 1)])
areas = [int(np.sum(labeled == i)) for i in range(1, num + 1)] if len(areas) < 2:
median_area = np.median(areas) return binary
min_size = max(1, int(median_area * 0.25)) # 中位数的25%,最少1像素
best_T, best_cost, n_total = 0, float('inf'), len(areas)
for T_val in np.unique(areas):
small = areas[areas <= T_val]
large = areas[areas > T_val]
w_s = len(small) / n_total
w_l = len(large) / n_total
if w_s == 0 or w_l == 0:
continue
cost = w_s * np.var(small) + w_l * np.var(large)
if cost < best_cost:
best_cost = cost
best_T = T_val
result = binary.copy() result = binary.copy()
for i in range(1, num + 1): for i in range(1, num + 1):
if areas[i - 1] < min_size: if int(np.sum(labeled == i)) < best_T:
result[labeled == i] = 0 result[labeled == i] = 0
return result return result
+10 -3
View File
@@ -72,11 +72,18 @@ def keep_largest_object(binary):
def remove_small_objects(binary): def remove_small_objects(binary):
L, n = ndimage.label(binary) L, n = ndimage.label(binary)
if n == 0: return binary if n == 0: return binary
areas = [int(np.sum(L==i)) for i in range(1,n+1)] areas = np.array([int(np.sum(L==i)) for i in range(1,n+1)])
minsz = max(1, int(np.median(areas)*0.25)) if len(areas) < 2: return binary
best_T, best_cost, n_total = 0, float('inf'), len(areas)
for T in np.unique(areas):
s, l = areas[areas<=T], areas[areas>T]
w_s, w_l = len(s)/n_total, len(l)/n_total
if w_s==0 or w_l==0: continue
cost = w_s*np.var(s) + w_l*np.var(l)
if cost < best_cost: best_cost, best_T = cost, T
r = binary.copy() r = binary.copy()
for i in range(1, n+1): for i in range(1, n+1):
if areas[i-1] < minsz: r[L==i] = 0 if int(np.sum(L==i)) < best_T: r[L==i] = 0
return r return r