From 1041a66270a2f8ccf2746b6b3b92109c04b40c09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=88=98=E8=88=AA=E5=AE=87?= <3364451258@qq.com> Date: Fri, 8 May 2026 15:31:47 +0800 Subject: [PATCH] =?UTF-8?q?refactor:=20remove=5Fsmall=5Fobjects=E7=94=A8Ot?= =?UTF-8?q?su=E6=9B=BF=E4=BB=A3=E4=B8=AD=E4=BD=8D=E6=95=B025%?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 对连通域面积分布做Otsu自动找分界,不再拍脑袋定百分比 --- .gitignore | 3 +++ src/cDNA_gridding_simple.py | 32 +++++++++++++++++++++++++------- src/cDNA_segmentation.py | 26 +++++++++++++++++++------- web/app.py | 13 ++++++++++--- 4 files changed, 57 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 2a95f7e..f88bc1f 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,6 @@ Thumbs.db # Obsidian .obsidian/ + +# Flask +.playwright-mcp/ \ No newline at end of file diff --git a/src/cDNA_gridding_simple.py b/src/cDNA_gridding_simple.py index bdf1cac..0e528a5 100644 --- a/src/cDNA_gridding_simple.py +++ b/src/cDNA_gridding_simple.py @@ -200,23 +200,41 @@ def remove_small_objects(binary: np.ndarray) -> np.ndarray: """ 自动去除小连通域(噪声)。 - 统计所有连通域面积的中位数, - 小于中位数 25% 的视为噪声,直接剔除。 + 对连通域面积分布做 Otsu 阈值检测: + 面积分布天然双峰——噪声区(几个像素) 和 真斑点区(几百像素)。 + Otsu 自动找到两峰之间的最佳分界,小于该值的视为噪声。 换图换分辨率都自动适应,不需要手动调参。 """ labeled, num = ndimage.label(binary) if num == 0: - return binary # 全黑,直接返回 + return binary # 收集所有连通域的面积 - areas = [int(np.sum(labeled == i)) for i in range(1, num + 1)] - median = np.median(areas) # 面积中位数 - min_size = max(1, int(median * 0.25)) # 中位数的25%,最少1像素 + areas = np.array([int(np.sum(labeled == i)) for i in range(1, num + 1)]) + if len(areas) < 2: + return binary + + # 对面积数组做 Otsu(与像素 Otsu 完全相同的原理) + # 将面积值当作"灰度",找到最小类内方差的分界点 + best_T, best_cost, n_total = 0, float('inf'), len(areas) + for T in np.unique(areas): + small = areas[areas <= T] # 候选噪声组 + large = areas[areas > T] # 候选真斑点组 + w_s = len(small) / n_total + w_l = len(large) / n_total + if w_s == 0 or w_l == 0: + continue + cost = w_s * np.var(small) + w_l * np.var(large) + if cost < best_cost: + best_cost = cost + best_T = T + + min_size = best_T # Otsu 自动找到的面积分界线 # 面积不达标的连通域整块置0 result = binary.copy() for i in range(1, num + 1): - if areas[i - 1] < min_size: + if int(np.sum(labeled == i)) < min_size: result[labeled == i] = 0 return result diff --git a/src/cDNA_segmentation.py b/src/cDNA_segmentation.py index 4f07e9e..c3b3860 100644 --- a/src/cDNA_segmentation.py +++ b/src/cDNA_segmentation.py @@ -207,21 +207,33 @@ def remove_small_objects(binary: np.ndarray) -> np.ndarray: """ 自动去除小连通域。 - 统计所有连通域的面积中位数,小于中位数 25% 的视为噪声, - 自动剔除,不需要人设定阈值。 + 对连通域面积分布做 Otsu 阈值检测——面积天然双峰, + Otsu 自动找到噪声峰和真斑点峰之间的最佳分界,零人工参数。 """ labeled, num = ndimage.label(binary) if num == 0: return binary - # 统计每个连通域的面积 - areas = [int(np.sum(labeled == i)) for i in range(1, num + 1)] - median_area = np.median(areas) - min_size = max(1, int(median_area * 0.25)) # 中位数的25%,最少1像素 + areas = np.array([int(np.sum(labeled == i)) for i in range(1, num + 1)]) + if len(areas) < 2: + return binary + + best_T, best_cost, n_total = 0, float('inf'), len(areas) + for T_val in np.unique(areas): + small = areas[areas <= T_val] + large = areas[areas > T_val] + w_s = len(small) / n_total + w_l = len(large) / n_total + if w_s == 0 or w_l == 0: + continue + cost = w_s * np.var(small) + w_l * np.var(large) + if cost < best_cost: + best_cost = cost + best_T = T_val result = binary.copy() for i in range(1, num + 1): - if areas[i - 1] < min_size: + if int(np.sum(labeled == i)) < best_T: result[labeled == i] = 0 return result diff --git a/web/app.py b/web/app.py index 282ec2b..bbeeb45 100644 --- a/web/app.py +++ b/web/app.py @@ -72,11 +72,18 @@ def keep_largest_object(binary): def remove_small_objects(binary): L, n = ndimage.label(binary) if n == 0: return binary - areas = [int(np.sum(L==i)) for i in range(1,n+1)] - minsz = max(1, int(np.median(areas)*0.25)) + areas = np.array([int(np.sum(L==i)) for i in range(1,n+1)]) + if len(areas) < 2: return binary + best_T, best_cost, n_total = 0, float('inf'), len(areas) + for T in np.unique(areas): + s, l = areas[areas<=T], areas[areas>T] + w_s, w_l = len(s)/n_total, len(l)/n_total + if w_s==0 or w_l==0: continue + cost = w_s*np.var(s) + w_l*np.var(l) + if cost < best_cost: best_cost, best_T = cost, T r = binary.copy() for i in range(1, n+1): - if areas[i-1] < minsz: r[L==i] = 0 + if int(np.sum(L==i)) < best_T: r[L==i] = 0 return r