refactor: v2.0 fully decoupled, self-contained within Aliyun

- Remove VOC_DATA_DIR / get_voc_conn (no more cross-cloud direct SQLite reads)
- Case DB carries its own comments table; each case is fully self-contained
- Add POST /import-voc: import comments via the public VOC API (usage sketch below)
- New VOC_API_BASE environment variable controls the API endpoint
- Add httpx dependency
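For orientation, a minimal client-side sketch of the new flow (not part of the commit). It assumes the backend listens locally on PORT=8093 and that LinkVocRequest takes a vocResearchId field, as the handlers in the diff suggest; the case and research ids are made up.

import httpx

BASE = "http://127.0.0.1:8093/api"

with httpx.Client(timeout=60) as client:
    # 1) Link the case to a VOC research id
    client.post(f"{BASE}/cases/demo-case/link-voc", json={"vocResearchId": "r123"})
    # 2) Page through the public VOC API into the local case DB
    print(client.post(f"{BASE}/cases/demo-case/import-voc").json())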
lidf 2026-04-07 19:47:34 +08:00
parent ec8eaa0b36
commit c5e2a58258
5 changed files with 194 additions and 191 deletions

View File

@@ -1,13 +1,19 @@
-# LLM (via the LiteLLM gateway)
+# Mafia Offer backend v2.0 - environment variables
+# Fully independent, self-contained within Aliyun
+# Public VOC API (cross-cloud read-only access, used by import-voc)
+VOC_API_BASE=https://brand.brainwork.club/voc/api/research
+# LLM routing (via LiteLLM on the same host)
 LITELLM_PROXY_URL=http://127.0.0.1:4000/v1
-LITELLM_MASTER_KEY=
+LITELLM_MASTER_KEY=sk-xxx
+# Model
 MODEL_ID=qwen-plus
+TEMPERATURE=0.1
-# Embeddings (DashScope text-embedding-v4)
+# DashScope (for embeddings)
-DASHSCOPE_API_KEY=
+DASHSCOPE_API_KEY=sk-xxx
-# Shared VOC data layer
-VOC_DATA_DIR=/opt/apps/voc-researcher/data
-# Service
+# Port
 PORT=8093
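A sketch of how these variables are typically consumed; the backend's real wiring lives in the modules below, and the OpenAI-client usage here is an assumption based on LiteLLM exposing an OpenAI-compatible endpoint.

import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
# Route chat completions through the co-located LiteLLM proxy
llm = OpenAI(
    base_url=os.getenv("LITELLM_PROXY_URL", "http://127.0.0.1:4000/v1"),
    api_key=os.getenv("LITELLM_MASTER_KEY", ""),
)
resp = llm.chat.completions.create(
    model=os.getenv("MODEL_ID", "qwen-plus"),
    temperature=float(os.getenv("TEMPERATURE", "0.1")),
    messages=[{"role": "user", "content": "ping"}],
)
print(resp.choices[0].message.content)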

View File

@@ -1,9 +1,11 @@
 """
-Mafia Offer proposal - database management
-Dual-database design:
-1. Case DB (read-write): one SQLite file per proposal case, holding analysis results
-2. VOC DB (read-only): reads raw comments from the shared VOC data layer
+Mafia Offer proposal - database management (fully independent, self-contained within Aliyun)
+One SQLite file per proposal case, self-containing all its data:
+- case_card: case metadata
+- comments: local copy of the comments imported from the VOC API
+- ude_sentences / ude_clusters: UDE analysis results
+- conflicts / proposal_sections: later stages
 """
 import os
 import sqlite3
@@ -17,10 +19,11 @@ load_dotenv()
 DATA_DIR = Path(__file__).parent / "data"
 DATA_DIR.mkdir(exist_ok=True)
-VOC_DATA_DIR = Path(os.getenv("VOC_DATA_DIR", ""))
+# Public VOC API (Tencent Cloud, cross-cloud read-only access)
+VOC_API_BASE = os.getenv("VOC_API_BASE", "https://brand.brainwork.club/voc/api/research")
-# ═══════════ Case DB (read-write) ═══════════
+# ═══════════ Schema ═══════════
 CASE_SCHEMA = """
 CREATE TABLE IF NOT EXISTS case_card (
@@ -29,13 +32,25 @@ CREATE TABLE IF NOT EXISTS case_card (
     focus_product TEXT,
     competitors TEXT,
     voc_research_id TEXT,
+    voc_api_base TEXT,
     created_at TEXT DEFAULT (datetime('now')),
     status TEXT DEFAULT 'draft'
 );
+CREATE TABLE IF NOT EXISTS comments (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    voc_id INTEGER,
+    platform TEXT,
+    text TEXT NOT NULL,
+    like_count INTEGER DEFAULT 0,
+    published_at TEXT,
+    imported_at TEXT DEFAULT (datetime('now')),
+    UNIQUE(voc_id)
+);
 CREATE TABLE IF NOT EXISTS ude_sentences (
     id INTEGER PRIMARY KEY AUTOINCREMENT,
-    voc_comment_id INTEGER,
+    comment_id INTEGER REFERENCES comments(id),
     ude_text TEXT NOT NULL,
     confidence REAL DEFAULT 0.5,
     vector TEXT,
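The UNIQUE(voc_id) constraint is what makes repeated imports idempotent: INSERT OR IGNORE skips a voc_id that is already present, so re-running import-voc cannot duplicate rows. A self-contained demonstration (in-memory DB, made-up row):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""CREATE TABLE comments (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    voc_id INTEGER, text TEXT NOT NULL, UNIQUE(voc_id))""")
for _ in range(2):  # simulate importing the same page twice
    conn.execute("INSERT OR IGNORE INTO comments (voc_id, text) VALUES (?, ?)",
                 (42, "battery drains too fast"))
print(conn.execute("SELECT count(*) FROM comments").fetchone()[0])  # -> 1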
@@ -73,8 +88,10 @@ CREATE TABLE IF NOT EXISTS proposal_sections (
 """
+# ═══════════ Case DB ═══════════
 def get_case_conn(case_id: str) -> sqlite3.Connection:
-    """Get a case DB connection (read-write)"""
+    """Get a case DB connection"""
     path = DATA_DIR / f"{case_id}.db"
     if not path.exists():
         raise FileNotFoundError(f"案例 {case_id} 不存在")
@@ -93,8 +110,8 @@ def init_case_db(brand_name: str, category: str = "", focus_product: str = "",
     conn.row_factory = sqlite3.Row
     conn.executescript(CASE_SCHEMA)
     conn.execute(
-        "INSERT INTO case_card (brand_name, category, focus_product, competitors, voc_research_id) VALUES (?,?,?,?,?)",
-        (brand_name, category, focus_product, competitors, voc_research_id)
+        "INSERT INTO case_card (brand_name, category, focus_product, competitors, voc_research_id, voc_api_base) VALUES (?,?,?,?,?,?)",
+        (brand_name, category, focus_product, competitors, voc_research_id, VOC_API_BASE)
     )
     conn.commit()
     conn.close()
@@ -111,11 +128,13 @@ def list_cases() -> list[dict]:
             conn.row_factory = sqlite3.Row
             card = conn.execute("SELECT * FROM case_card LIMIT 1").fetchone()
             if card:
+                comment_count = conn.execute("SELECT count(*) FROM comments").fetchone()[0]
                 ude_count = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
                 cluster_count = conn.execute("SELECT count(*) FROM ude_clusters").fetchone()[0]
                 cases.append({
                     "case_id": case_id,
                     **dict(card),
+                    "comment_count": comment_count,
                     "ude_count": ude_count,
                     "cluster_count": cluster_count,
                 })
@@ -123,45 +142,3 @@ def list_cases() -> list[dict]:
         except Exception:
             pass
     return cases
-# ═══════════ VOC DB (read-only) ═══════════
-def get_voc_conn(voc_research_id: str) -> sqlite3.Connection:
-    """Read-only access to the shared VOC data"""
-    if not VOC_DATA_DIR.exists():
-        raise FileNotFoundError(f"VOC 数据目录不存在: {VOC_DATA_DIR}")
-    path = VOC_DATA_DIR / f"{voc_research_id}.db"
-    if not path.exists():
-        raise FileNotFoundError(f"VOC 研究 {voc_research_id} 不存在")
-    conn = sqlite3.connect(f"file:{path}?mode=ro", uri=True)
-    conn.row_factory = sqlite3.Row
-    return conn
-def list_voc_researches() -> list[dict]:
-    """List all researches in the shared VOC data layer"""
-    if not VOC_DATA_DIR.exists():
-        return []
-    researches = []
-    for db_file in sorted(VOC_DATA_DIR.glob("*.db")):
-        if db_file.name in ("global_cache.db", "agent_sessions.db"):
-            continue
-        rid = db_file.stem
-        try:
-            conn = sqlite3.connect(f"file:{db_file}?mode=ro", uri=True)
-            conn.row_factory = sqlite3.Row
-            card = conn.execute("SELECT brand_name FROM research_card LIMIT 1").fetchone()
-            comment_count = conn.execute(
-                "SELECT count(*) FROM comments WHERE length(text) > 10"
-            ).fetchone()[0]
-            conn.close()
-            if card and comment_count > 0:
-                researches.append({
-                    "research_id": rid,
-                    "brand_name": card["brand_name"],
-                    "comment_count": comment_count,
-                })
-        except Exception:
-            pass
-    return researches
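With the comments table local, each entry returned by list_cases() now also reports comment_count. Roughly what one entry looks like (all values illustrative):

case_entry = {
    "case_id": "demo-case",        # hypothetical id
    "brand_name": "...",           # plus the other case_card columns
    "voc_research_id": "r123",
    "voc_api_base": "https://brand.brainwork.club/voc/api/research",
    "comment_count": 1280,         # new in v2.0: size of the local comments table
    "ude_count": 950,
    "cluster_count": 14,
}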

View File

@@ -5,3 +5,4 @@ python-dotenv>=1.0.0
 numpy>=1.24.0
 scikit-learn>=1.3.0
 gunicorn>=21.2.0
+httpx>=0.27.0

View File

@@ -1,12 +1,12 @@
 """
-Mafia Offer proposal - standalone backend
+Mafia Offer proposal - standalone backend (self-contained within Aliyun)
 FastAPI service, port 8093
-Data source: read-only access to the shared VOC data layer
-Analysis results live in our own case DB
+VOC data is imported through the public API; the VOC DB is never read directly
 """
 import os
 import logging
+import httpx
 from fastapi import FastAPI, Header, HTTPException, Query
 from fastapi.middleware.cors import CORSMiddleware
@@ -15,15 +15,12 @@ from dotenv import load_dotenv
 load_dotenv()
-from db import (
-    get_case_conn, get_voc_conn, init_case_db,
-    list_cases as _list_cases, list_voc_researches as _list_voc_researches,
-)
+from db import get_case_conn, init_case_db, list_cases as _list_cases, DATA_DIR, VOC_API_BASE
 logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s")
 logger = logging.getLogger("mafia")
-app = FastAPI(title="黑手党提案后端", version="1.0.0", description="独立后端:共享 VOC 数据层 + 自有分析存储")
+app = FastAPI(title="黑手党提案后端", version="2.0.0", description="独立后端,阿里云内闭环,VOC 通过 API 导入")
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -70,18 +67,23 @@ async def get_case(case_id: str):
     try:
         with get_case_conn(case_id) as conn:
             card = conn.execute("SELECT * FROM case_card LIMIT 1").fetchone()
+            comment_count = conn.execute("SELECT count(*) FROM comments").fetchone()[0]
             ude_count = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
             cluster_count = conn.execute("SELECT count(*) FROM ude_clusters").fetchone()[0]
         if not card:
             raise HTTPException(404, "案例不存在")
-        return {"caseId": case_id, **dict(card), "udeCount": ude_count, "clusterCount": cluster_count}
+        return {
+            "caseId": case_id, **dict(card),
+            "commentCount": comment_count,
+            "udeCount": ude_count,
+            "clusterCount": cluster_count,
+        }
     except FileNotFoundError:
         raise HTTPException(404, "案例不存在")
 @app.delete("/api/cases/{case_id}")
 async def delete_case(case_id: str):
-    from db import DATA_DIR
     path = DATA_DIR / f"{case_id}.db"
     if path.exists():
         path.unlink()
@@ -89,60 +91,107 @@ async def delete_case(case_id: str):
     raise HTTPException(404, "案例不存在")
-# ═══════════ VOC linking ═══════════
+# ═══════════ VOC import (cross-cloud API) ═══════════
 @app.post("/api/cases/{case_id}/link-voc")
 async def link_voc(case_id: str, req: LinkVocRequest):
-    """Link a VOC research id (verify the VOC research exists before writing)"""
-    try:
-        with get_voc_conn(req.vocResearchId) as voc:
-            count = voc.execute(
-                "SELECT count(*) FROM comments WHERE length(text) > 10 "
-            ).fetchone()[0]
-    except FileNotFoundError as e:
-        raise HTTPException(404, str(e))
+    """Link a VOC research id"""
     try:
         with get_case_conn(case_id) as conn:
             conn.execute("UPDATE case_card SET voc_research_id = ?", (req.vocResearchId,))
             conn.commit()
     except FileNotFoundError:
         raise HTTPException(404, "案例不存在")
-    return {"linked": True, "vocCommentCount": count}
+    return {"linked": True, "vocResearchId": req.vocResearchId}
-@app.get("/api/voc/researches")
-async def get_voc_researches():
-    return _list_voc_researches()
-@app.get("/api/cases/{case_id}/voc-comments")
-async def get_voc_comments(case_id: str, page: int = 1, pageSize: int = 50):
-    """Fetch raw comments read-only from the shared VOC data layer"""
+@app.post("/api/cases/{case_id}/import-voc")
+async def import_voc(case_id: str, page: int = Query(1), pageSize: int = Query(100)):
+    """Pull comment data from the public VOC API and store it in the local case DB"""
     try:
         with get_case_conn(case_id) as conn:
-            card = conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone()
+            card = conn.execute("SELECT voc_research_id, voc_api_base FROM case_card LIMIT 1").fetchone()
     except FileNotFoundError:
         raise HTTPException(404, "案例不存在")
     if not card or not card["voc_research_id"]:
-        raise HTTPException(400, "未关联 VOC 研究")
+        raise HTTPException(400, "未关联 VOC 研究,请先调用 link-voc")
+    voc_rid = card["voc_research_id"]
+    api_base = card["voc_api_base"] or VOC_API_BASE
+    # Pull from the VOC API (read-only, no TikHub key needed)
+    total_imported = 0
+    current_page = page
+    async with httpx.AsyncClient(timeout=30) as client:
+        while True:
+            url = f"{api_base}/{voc_rid}/voc-list?page={current_page}&page_size={pageSize}"
+            try:
+                resp = await client.get(url)
+                if resp.status_code != 200:
+                    logger.warning(f"[Import] VOC API 返回 {resp.status_code}: {resp.text[:100]}")
+                    break
+                data = resp.json()
+            except Exception as e:
+                logger.error(f"[Import] VOC API 请求失败: {e}")
+                break
+            items = data.get("items") or data.get("data") or []
+            if not items:
+                break
+            with get_case_conn(case_id) as conn:
+                for item in items:
+                    text = item.get("text", "")
+                    if len(text) < 10:
+                        continue
+                    try:
+                        conn.execute(
+                            "INSERT OR IGNORE INTO comments (voc_id, platform, text, like_count, published_at) VALUES (?,?,?,?,?)",
+                            (
+                                item.get("id"),
+                                item.get("platform", ""),
+                                text,
+                                item.get("like_count", 0),
+                                item.get("published_at", ""),
+                            )
+                        )
+                        total_imported += 1
+                    except Exception:
+                        pass
+                conn.commit()
+            total = data.get("total", 0)
+            if current_page * pageSize >= total:
+                break
+            current_page += 1
+    # Refresh stats
+    with get_case_conn(case_id) as conn:
+        local_count = conn.execute("SELECT count(*) FROM comments").fetchone()[0]
+    return {
+        "imported": total_imported,
+        "totalLocal": local_count,
+        "vocResearchId": voc_rid,
+        "pagesProcessed": current_page - page + 1,
+    }
+@app.get("/api/cases/{case_id}/comments")
+async def get_comments(case_id: str, page: int = 1, pageSize: int = 50):
+    """View locally imported comments"""
     try:
-        with get_voc_conn(card["voc_research_id"]) as voc:
-            total = voc.execute(
-                "SELECT count(*) FROM comments WHERE length(text) > 10 "
-            ).fetchone()[0]
-            rows = voc.execute("""
-                SELECT id, platform, text, like_count, published_at
-                FROM comments WHERE length(text) > 10
-                ORDER BY like_count DESC
+        with get_case_conn(case_id) as conn:
+            total = conn.execute("SELECT count(*) FROM comments").fetchone()[0]
+            rows = conn.execute("""
+                SELECT id, voc_id, platform, text, like_count, published_at
+                FROM comments ORDER BY like_count DESC
                 LIMIT ? OFFSET ?
             """, (pageSize, (page - 1) * pageSize)).fetchall()
-    except FileNotFoundError as e:
-        raise HTTPException(404, str(e))
+    except FileNotFoundError:
+        raise HTTPException(404, "案例不存在")
     return {"total": total, "page": page, "items": [dict(r) for r in rows]}
@@ -200,11 +249,11 @@ async def get_coverage(case_id: str):
 @app.get("/api/health")
 async def health():
-    from db import VOC_DATA_DIR, DATA_DIR
     return {
         "status": "ok",
-        "vocDataDir": str(VOC_DATA_DIR),
-        "vocDataExists": VOC_DATA_DIR.exists(),
+        "version": "2.0.0",
+        "architecture": "independent (Aliyun self-contained)",
+        "vocApiBase": VOC_API_BASE,
         "caseDataDir": str(DATA_DIR),
     }
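A quick post-deploy probe that a host is running the v2.0 architecture (base URL assumed from the env file above):

import httpx

info = httpx.get("http://127.0.0.1:8093/api/health", timeout=5).json()
assert info["version"] == "2.0.0" and "vocApiBase" in info
print(info["architecture"])  # "independent (Aliyun self-contained)"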

View File

@@ -1,10 +1,9 @@
 """
-Mafia Offer proposal - UDE extraction tools
-Pipeline: raw VOC comments → LLM rewrite into UDEs → DashScope embeddings → DBSCAN clustering → coverage scan
-Data source: read-only access to the shared VOC data layer
-Analysis results are written to this project's case DB
+Mafia Offer proposal - UDE extraction tools (self-contained within Aliyun)
+Pipeline: local comments → LLM rewrite into UDEs → DashScope embeddings → DBSCAN clustering
+All reads and writes stay inside the case DB; nothing crosses clouds
 """
 from __future__ import annotations
@@ -47,7 +46,7 @@ def _get_embed_client(key: str) -> OpenAI:
     )
-# ═══════════ Step 1: VOC → UDE rewrite ═══════════
+# ═══════════ Step 1: local comments → UDE rewrite ═══════════
 async def _call_ude_llm(prompt: str, comments: list[dict]) -> list[dict]:
     """Single-batch LLM rewrite"""
@@ -87,40 +86,33 @@ async def _process_ude_batch(comments, prompt, semaphore):
 async def run_ude_extraction(case_id: str, limit: int = 0) -> dict:
-    """Read raw comments from the shared VOC data, rewrite into UDEs, store in the case DB"""
-    from db import get_case_conn, get_voc_conn
+    """Read comments from the local comments table, rewrite into UDEs, store in ude_sentences"""
+    from db import get_case_conn
     prompt = PROMPT_PATH.read_text("utf-8") if PROMPT_PATH.exists() else ""
     if not prompt:
         return {"error": "UDE 转写 prompt 未找到 (prompts/voc_to_ude.txt)"}
-    with get_case_conn(case_id) as case_conn:
-        card = case_conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone()
-        if not card or not card["voc_research_id"]:
-            return {"error": "未关联 VOC 研究。请先调用 link-voc。"}
-        voc_research_id = card["voc_research_id"]
-        # Collect the voc_comment_ids already rewritten
-        done_ids = {r[0] for r in case_conn.execute(
-            "SELECT voc_comment_id FROM ude_sentences"
+    with get_case_conn(case_id) as conn:
+        # Collect the comment_ids already rewritten
+        done_ids = {r[0] for r in conn.execute(
+            "SELECT comment_id FROM ude_sentences"
         ).fetchall()}
-    # Read raw comments from the VOC DB (read-only)
-    with get_voc_conn(voc_research_id) as voc_conn:
-        rows = voc_conn.execute("""
+        # Read from the local comments table
+        rows = conn.execute("""
             SELECT id, platform, text
-            FROM comments
-            WHERE length(text) > 10
+            FROM comments WHERE length(text) > 10
             ORDER BY id
         """).fetchall()
-    # Filter out the ones already done
+    total_comments = len(rows)
     pending = [r for r in rows if r["id"] not in done_ids]
     if not pending:
         with get_case_conn(case_id) as conn:
             total = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
-        return {"message": "全部已转写完成", "total_udes": total, "new": 0}
+        return {"message": "全部已转写完成", "totalUdes": total, "new": 0}
     if limit > 0:
         pending = pending[:limit]
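The done_ids set is what makes extraction resumable: a rerun only sends comments whose ids are not yet in ude_sentences. The filtering step in miniature (made-up ids):

done_ids = {1, 2, 3}                     # comment_ids already rewritten
rows = [{"id": i} for i in range(1, 7)]  # all local comments
pending = [r for r in rows if r["id"] not in done_ids]
print([r["id"] for r in pending])        # -> [4, 5, 6]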
@@ -137,7 +129,7 @@ async def run_ude_extraction(case_id: str, limit: int = 0) -> dict:
     # Write to the case DB
     ok = 0
-    with get_case_conn(case_id) as case_conn:
+    with get_case_conn(case_id) as conn:
         for results in all_results:
             for r in (results or []):
                 if not isinstance(r, dict):
@@ -149,21 +141,21 @@ async def run_ude_extraction(case_id: str, limit: int = 0) -> dict:
                 if not cid:
                     continue
                 try:
-                    case_conn.execute(
-                        "INSERT OR IGNORE INTO ude_sentences (voc_comment_id, ude_text, confidence) VALUES (?, ?, ?)",
+                    conn.execute(
+                        "INSERT OR IGNORE INTO ude_sentences (comment_id, ude_text, confidence) VALUES (?, ?, ?)",
                         (int(cid), ude_text, r.get("confidence", 0.5))
                     )
                     ok += 1
                 except Exception as e:
                     logger.warning(f"[UDE] 写入失败 id={cid}: {e}")
-        case_conn.commit()
-        total = case_conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
+        conn.commit()
+        total = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
     return {
-        "new_udes": ok,
-        "total_udes": total,
-        "total_voc_comments": len(rows),
-        "remaining": len(rows) - total,
+        "newUdes": ok,
+        "totalUdes": total,
+        "totalComments": total_comments,
+        "remaining": total_comments - total,
         "batches": len(batches),
     }
@@ -181,10 +173,10 @@ def _embed_texts(client: OpenAI, texts: list[str]) -> list[list[float]]:
 def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
                    dashscope_key: str = None) -> dict:
-    """Embeddings + DBSCAN clustering"""
+    """Embeddings + DBSCAN clustering (entirely inside the local case DB)"""
     from sklearn.cluster import DBSCAN
     from sklearn.metrics.pairwise import cosine_distances
-    from db import get_case_conn, get_voc_conn
+    from db import get_case_conn
     key = dashscope_key or os.getenv("DASHSCOPE_API_KEY", "")
     if not key:
@@ -193,13 +185,13 @@ def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
     embed_client = _get_embed_client(key)
     with get_case_conn(case_id) as conn:
-        rows = conn.execute("SELECT id, voc_comment_id, ude_text FROM ude_sentences ORDER BY id").fetchall()
+        rows = conn.execute("SELECT id, comment_id, ude_text FROM ude_sentences ORDER BY id").fetchall()
         if len(rows) < min_samples:
             return {"error": f"UDE 不足 ({len(rows)} 条),至少需要 {min_samples} 条。"}
         ude_texts = [r["ude_text"] for r in rows]
         ude_ids = [r["id"] for r in rows]
-        comment_ids = [r["voc_comment_id"] for r in rows]
+        comment_ids = [r["comment_id"] for r in rows]
         # Embed
         vectors = _embed_texts(embed_client, ude_texts)
@@ -223,10 +215,6 @@ def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
         # Clear old clusters, write new ones
         conn.execute("DELETE FROM ude_clusters")
-        # Fetch the linked VOC research_id for reading original voices
-        card = conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone()
-        voc_rid = card["voc_research_id"] if card else None
         clusters = []
         unique_labels = sorted(set(labels) - {-1})
@@ -241,30 +229,24 @@ def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
             dists = cosine_distances([centroid], member_vectors)[0]
             representative = member_texts[dists.argmin()]
-            # Original voices
+            # Sample original voices (from the local comments table)
             sample_voices = []
-            if voc_rid:
-                try:
-                    voc_conn = get_voc_conn(voc_rid)
-                    for cid in member_cids[:5]:
-                        voice = voc_conn.execute(
-                            "SELECT text, platform FROM comments WHERE id = ?", (cid,)
-                        ).fetchone()
-                        if voice:
-                            sample_voices.append({"text": voice["text"][:200], "platform": voice["platform"]})
-                    voc_conn.close()
-                except Exception:
-                    pass
+            for cid in member_cids[:5]:
+                voice = conn.execute(
+                    "SELECT text, platform FROM comments WHERE id = ?", (cid,)
+                ).fetchone()
+                if voice:
+                    sample_voices.append({"text": voice["text"][:200], "platform": voice["platform"]})
             conn.execute(
                 "INSERT INTO ude_clusters (representative_ude, coverage, sample_voices) VALUES (?, ?, ?)",
                 (representative, len(member_indices), json.dumps(sample_voices, ensure_ascii=False))
             )
             clusters.append({
-                "cluster_id": int(cluster_id),
-                "representative_ude": representative,
+                "clusterId": int(cluster_id),
+                "representativeUde": representative,
                 "coverage": len(member_indices),
-                "sample_voices": sample_voices,
+                "sampleVoices": sample_voices,
             })
         conn.commit()
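The hunks above do not show the DBSCAN call itself; a minimal sketch of the same technique, clustering embedding vectors by cosine distance with the defaults eps=0.25 and min_samples=3 (random stand-in vectors; the commit may equally pass metric="cosine" straight to DBSCAN):

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_distances

vectors = np.random.default_rng(0).random((40, 8))  # stand-in embeddings
dist = cosine_distances(vectors)                    # pairwise cosine distances
labels = DBSCAN(eps=0.25, min_samples=3, metric="precomputed").fit_predict(dist)
print(sorted(set(labels)))  # -1 marks noise; other labels are cluster ids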
@@ -272,24 +254,22 @@ def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
     noise_count = int((labels == -1).sum())
     return {
-        "total_udes": len(labels),
-        "num_clusters": len(clusters),
-        "noise_count": noise_count,
-        "noise_pct": round(noise_count / len(labels) * 100, 1) if len(labels) else 0,
+        "totalUdes": len(labels),
+        "numClusters": len(clusters),
+        "noiseCount": noise_count,
+        "noisePct": round(noise_count / len(labels) * 100, 1) if len(labels) else 0,
         "clusters": clusters,
-        "params": {"eps": eps, "min_samples": min_samples},
+        "params": {"eps": eps, "minSamples": min_samples},
     }
-# ═══════════ Step 5: Coverage scan ═══════════
+# ═══════════ Coverage scan ═══════════
 def run_coverage_scan(case_id: str) -> dict:
-    from db import get_case_conn, get_voc_conn
+    from db import get_case_conn
     with get_case_conn(case_id) as conn:
-        card = conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone()
-        voc_rid = card["voc_research_id"] if card else None
+        total_comments = conn.execute("SELECT count(*) FROM comments").fetchone()[0]
         total_udes = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
         clustered = conn.execute("SELECT count(*) FROM ude_sentences WHERE cluster_id >= 0").fetchone()[0]
         noise = conn.execute("SELECT count(*) FROM ude_sentences WHERE cluster_id = -1").fetchone()[0]
@@ -299,27 +279,17 @@ def run_coverage_scan(case_id: str) -> dict:
         ).fetchall()]
         noise_samples = [dict(r) for r in conn.execute(
-            "SELECT ude_text, voc_comment_id, confidence FROM ude_sentences WHERE cluster_id = -1 ORDER BY confidence DESC LIMIT 10"
+            "SELECT ude_text, comment_id, confidence FROM ude_sentences WHERE cluster_id = -1 ORDER BY confidence DESC LIMIT 10"
         ).fetchall()]
-    total_voc = 0
-    if voc_rid:
-        try:
-            with get_voc_conn(voc_rid) as voc:
-                total_voc = voc.execute(
-                    "SELECT count(*) FROM comments WHERE length(text) > 10 "
-                ).fetchone()[0]
-        except Exception:
-            pass
     return {
-        "total_voc_comments": total_voc,
-        "total_udes": total_udes,
-        "udes_clustered": clustered,
-        "udes_noise": noise,
-        "coverage_rate": round(clustered / total_voc * 100, 1) if total_voc else 0,
-        "cluster_distribution": cluster_stats,
-        "noise_samples": noise_samples,
+        "totalComments": total_comments,
+        "totalUdes": total_udes,
+        "udesClustered": clustered,
+        "udesNoise": noise,
+        "coverageRate": round(clustered / total_comments * 100, 1) if total_comments else 0,
+        "clusterDistribution": cluster_stats,
+        "noiseSamples": noise_samples,
         "verdict": "充分" if (total_udes > 0 and noise / total_udes < 0.1) else
                    ("需关注" if (total_udes > 0 and noise / total_udes < 0.2) else "需调参"),
     }
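The verdict thresholds at the end, restated as a standalone function with the same logic (sample numbers are made up):

def verdict(total_udes: int, noise: int) -> str:
    if total_udes == 0:
        return "需调参"
    ratio = noise / total_udes
    return "充分" if ratio < 0.1 else ("需关注" if ratio < 0.2 else "需调参")

assert verdict(1000, 50) == "充分"     # 5% noise: sufficient
assert verdict(1000, 150) == "需关注"  # 15% noise: needs attention
assert verdict(1000, 300) == "需调参"  # 30% noise: retune parameters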