refactor: v2.0 完全解耦 — 阿里云内闭环
- 删除 VOC_DATA_DIR / get_voc_conn(不再跨云直读 SQLite)
- 案例 DB 自带 comments 表,自包含所有数据
- 新增 POST /import-voc:通过 VOC 公网 API 导入评论
- VOC_API_BASE 环境变量控制 API 地址
- 新增 httpx 依赖
This commit is contained in:
parent
ec8eaa0b36
commit
c5e2a58258
@ -1,13 +1,19 @@
|
||||
# LLM(通过 LiteLLM 网关)
|
||||
# 黑手党提案后端 v2.0 — 环境变量
|
||||
# 完全独立,阿里云内闭环
|
||||
|
||||
# VOC 公网 API(跨云只读访问,用于 import-voc)
|
||||
VOC_API_BASE=https://brand.brainwork.club/voc/api/research
|
||||
|
||||
# LLM 路由(走同机 LiteLLM)
|
||||
LITELLM_PROXY_URL=http://127.0.0.1:4000/v1
|
||||
LITELLM_MASTER_KEY=
|
||||
LITELLM_MASTER_KEY=sk-xxx
|
||||
|
||||
# 模型
|
||||
MODEL_ID=qwen-plus
|
||||
TEMPERATURE=0.1
|
||||
|
||||
# 向量化(DashScope text-embedding-v4)
|
||||
DASHSCOPE_API_KEY=
|
||||
# DashScope(向量化用)
|
||||
DASHSCOPE_API_KEY=sk-xxx
|
||||
|
||||
# 共享 VOC 数据层
|
||||
VOC_DATA_DIR=/opt/apps/voc-researcher/data
|
||||
|
||||
# 服务
|
||||
# 端口
|
||||
PORT=8093
|
||||
|
||||
@ -1,9 +1,11 @@
|
||||
"""
|
||||
黑手党提案 — 数据库管理
|
||||
黑手党提案 — 数据库管理(完全独立,阿里云内闭环)
|
||||
|
||||
双库设计:
|
||||
1. 案例 DB(读写):每个提案案例一个 SQLite,存分析结果
|
||||
2. VOC DB(只读):读取共享 VOC 数据层的原始评论
|
||||
每个提案案例一个 SQLite 文件,自包含所有数据:
|
||||
- case_card:案例元信息
|
||||
- comments:从 VOC API 导入的评论(本地副本)
|
||||
- ude_sentences / ude_clusters:UDE 分析结果
|
||||
- conflicts / proposal_sections:后续阶段
|
||||
"""
|
||||
import os
|
||||
import sqlite3
|
||||
@ -17,10 +19,11 @@ load_dotenv()
|
||||
DATA_DIR = Path(__file__).parent / "data"
|
||||
DATA_DIR.mkdir(exist_ok=True)
|
||||
|
||||
VOC_DATA_DIR = Path(os.getenv("VOC_DATA_DIR", ""))
|
||||
# VOC 公网 API(腾讯云,跨云只读访问)
|
||||
VOC_API_BASE = os.getenv("VOC_API_BASE", "https://brand.brainwork.club/voc/api/research")
|
||||
|
||||
|
||||
# ═══════════ 案例 DB(读写) ═══════════
|
||||
# ═══════════ Schema ═══════════
|
||||
|
||||
CASE_SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS case_card (
|
||||
@ -29,13 +32,25 @@ CREATE TABLE IF NOT EXISTS case_card (
|
||||
focus_product TEXT,
|
||||
competitors TEXT,
|
||||
voc_research_id TEXT,
|
||||
voc_api_base TEXT,
|
||||
created_at TEXT DEFAULT (datetime('now')),
|
||||
status TEXT DEFAULT 'draft'
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS comments (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
voc_id INTEGER,
|
||||
platform TEXT,
|
||||
text TEXT NOT NULL,
|
||||
like_count INTEGER DEFAULT 0,
|
||||
published_at TEXT,
|
||||
imported_at TEXT DEFAULT (datetime('now')),
|
||||
UNIQUE(voc_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS ude_sentences (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
voc_comment_id INTEGER,
|
||||
comment_id INTEGER REFERENCES comments(id),
|
||||
ude_text TEXT NOT NULL,
|
||||
confidence REAL DEFAULT 0.5,
|
||||
vector TEXT,
|
||||
@ -73,8 +88,10 @@ CREATE TABLE IF NOT EXISTS proposal_sections (
|
||||
"""
|
||||
|
||||
|
||||
# ═══════════ 案例 DB ═══════════
|
||||
|
||||
def get_case_conn(case_id: str) -> sqlite3.Connection:
|
||||
"""获取案例 DB 连接(读写)"""
|
||||
"""获取案例 DB 连接"""
|
||||
path = DATA_DIR / f"{case_id}.db"
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"案例 {case_id} 不存在")
|
||||
@ -93,8 +110,8 @@ def init_case_db(brand_name: str, category: str = "", focus_product: str = "",
|
||||
conn.row_factory = sqlite3.Row
|
||||
conn.executescript(CASE_SCHEMA)
|
||||
conn.execute(
|
||||
"INSERT INTO case_card (brand_name, category, focus_product, competitors, voc_research_id) VALUES (?,?,?,?,?)",
|
||||
(brand_name, category, focus_product, competitors, voc_research_id)
|
||||
"INSERT INTO case_card (brand_name, category, focus_product, competitors, voc_research_id, voc_api_base) VALUES (?,?,?,?,?,?)",
|
||||
(brand_name, category, focus_product, competitors, voc_research_id, VOC_API_BASE)
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
@ -111,11 +128,13 @@ def list_cases() -> list[dict]:
|
||||
conn.row_factory = sqlite3.Row
|
||||
card = conn.execute("SELECT * FROM case_card LIMIT 1").fetchone()
|
||||
if card:
|
||||
comment_count = conn.execute("SELECT count(*) FROM comments").fetchone()[0]
|
||||
ude_count = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
|
||||
cluster_count = conn.execute("SELECT count(*) FROM ude_clusters").fetchone()[0]
|
||||
cases.append({
|
||||
"case_id": case_id,
|
||||
**dict(card),
|
||||
"comment_count": comment_count,
|
||||
"ude_count": ude_count,
|
||||
"cluster_count": cluster_count,
|
||||
})
|
||||
@ -123,45 +142,3 @@ def list_cases() -> list[dict]:
|
||||
except Exception:
|
||||
pass
|
||||
return cases
|
||||
|
||||
|
||||
# ═══════════ VOC DB(只读) ═══════════
|
||||
|
||||
def get_voc_conn(voc_research_id: str) -> sqlite3.Connection:
    """Open a read-only SQLite connection to one shared VOC research database.

    Args:
        voc_research_id: Stem of the ``<id>.db`` file inside ``VOC_DATA_DIR``.

    Returns:
        A connection opened with ``mode=ro`` whose rows are ``sqlite3.Row``.
        The caller is responsible for closing it.

    Raises:
        FileNotFoundError: If the data directory or the research database is
            missing, or if the id carries path components.
    """
    if not VOC_DATA_DIR.exists():
        raise FileNotFoundError(f"VOC 数据目录不存在: {VOC_DATA_DIR}")
    # The id is interpolated into a file name; reject anything that carries
    # path components so it cannot address files outside VOC_DATA_DIR.
    if Path(voc_research_id).name != voc_research_id:
        raise FileNotFoundError(f"VOC 研究 {voc_research_id} 不存在")
    path = VOC_DATA_DIR / f"{voc_research_id}.db"
    if not path.exists():
        raise FileNotFoundError(f"VOC 研究 {voc_research_id} 不存在")
    # as_uri() percent-encodes characters such as '?' and '#', which would
    # otherwise be parsed as the query/fragment part of the SQLite URI.
    conn = sqlite3.connect(f"{path.resolve().as_uri()}?mode=ro", uri=True)
    conn.row_factory = sqlite3.Row
    return conn
|
||||
|
||||
|
||||
def list_voc_researches() -> list[dict]:
    """List the research databases found in the shared VOC data directory.

    Every ``*.db`` file in ``VOC_DATA_DIR`` is opened read-only. A file is
    reported only when it has a ``research_card`` row and at least one comment
    whose text is longer than 10 characters.

    Returns:
        One dict per research with ``research_id`` (file stem), ``brand_name``
        and ``comment_count``, ordered by file name. An empty list when the
        directory does not exist.
    """
    from contextlib import closing

    if not VOC_DATA_DIR.exists():
        return []

    # These two file names are excluded explicitly (presumably they are not
    # research databases; confirm against the VOC service).
    skipped = {"global_cache.db", "agent_sessions.db"}

    researches = []
    for db_file in sorted(VOC_DATA_DIR.glob("*.db")):
        if db_file.name in skipped:
            continue
        # as_uri() percent-encodes '?', '#', '%' so they are not parsed as
        # part of the SQLite URI syntax.
        uri = f"{db_file.resolve().as_uri()}?mode=ro"
        try:
            # closing() guarantees the handle is released even when a query
            # fails, e.g. on a file that lacks the expected tables.
            with closing(sqlite3.connect(uri, uri=True)) as conn:
                conn.row_factory = sqlite3.Row
                card = conn.execute(
                    "SELECT brand_name FROM research_card LIMIT 1"
                ).fetchone()
                comment_count = conn.execute(
                    "SELECT count(*) FROM comments WHERE length(text) > 10"
                ).fetchone()[0]
        except sqlite3.Error:
            # Best effort: unreadable or foreign databases are left out.
            continue
        if card and comment_count > 0:
            researches.append({
                "research_id": db_file.stem,
                "brand_name": card["brand_name"],
                "comment_count": comment_count,
            })
    return researches
|
||||
|
||||
@ -5,3 +5,4 @@ python-dotenv>=1.0.0
|
||||
numpy>=1.24.0
|
||||
scikit-learn>=1.3.0
|
||||
gunicorn>=21.2.0
|
||||
httpx>=0.27.0
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
"""
|
||||
黑手党提案 — 独立后端
|
||||
黑手党提案 — 独立后端(阿里云内闭环)
|
||||
|
||||
FastAPI 服务,端口 8093。
|
||||
数据来源:只读访问共享 VOC 数据层。
|
||||
分析结果:存自己的案例 DB。
|
||||
VOC 数据通过公网 API 导入,不直读 VOC DB。
|
||||
"""
|
||||
import os
|
||||
import logging
|
||||
import httpx
|
||||
|
||||
from fastapi import FastAPI, Header, HTTPException, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
@ -15,15 +15,12 @@ from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
from db import (
|
||||
get_case_conn, get_voc_conn, init_case_db,
|
||||
list_cases as _list_cases, list_voc_researches as _list_voc_researches,
|
||||
)
|
||||
from db import get_case_conn, init_case_db, list_cases as _list_cases, DATA_DIR, VOC_API_BASE
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s %(message)s")
|
||||
logger = logging.getLogger("mafia")
|
||||
|
||||
app = FastAPI(title="黑手党提案后端", version="1.0.0", description="独立后端:共享 VOC 数据层 + 自有分析存储")
|
||||
app = FastAPI(title="黑手党提案后端", version="2.0.0", description="独立后端:阿里云内闭环,VOC 通过 API 导入")
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
@ -70,18 +67,23 @@ async def get_case(case_id: str):
|
||||
try:
|
||||
with get_case_conn(case_id) as conn:
|
||||
card = conn.execute("SELECT * FROM case_card LIMIT 1").fetchone()
|
||||
comment_count = conn.execute("SELECT count(*) FROM comments").fetchone()[0]
|
||||
ude_count = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
|
||||
cluster_count = conn.execute("SELECT count(*) FROM ude_clusters").fetchone()[0]
|
||||
if not card:
|
||||
raise HTTPException(404, "案例不存在")
|
||||
return {"caseId": case_id, **dict(card), "udeCount": ude_count, "clusterCount": cluster_count}
|
||||
return {
|
||||
"caseId": case_id, **dict(card),
|
||||
"commentCount": comment_count,
|
||||
"udeCount": ude_count,
|
||||
"clusterCount": cluster_count,
|
||||
}
|
||||
except FileNotFoundError:
|
||||
raise HTTPException(404, "案例不存在")
|
||||
|
||||
|
||||
@app.delete("/api/cases/{case_id}")
|
||||
async def delete_case(case_id: str):
|
||||
from db import DATA_DIR
|
||||
path = DATA_DIR / f"{case_id}.db"
|
||||
if path.exists():
|
||||
path.unlink()
|
||||
@ -89,60 +91,107 @@ async def delete_case(case_id: str):
|
||||
raise HTTPException(404, "案例不存在")
|
||||
|
||||
|
||||
# ═══════════ VOC 关联 ═══════════
|
||||
# ═══════════ VOC 导入(跨云 API) ═══════════
|
||||
|
||||
@app.post("/api/cases/{case_id}/link-voc")
async def link_voc(case_id: str, req: LinkVocRequest):
    """Store a VOC research id on the case card.

    The id is written as given; nothing in this handler checks that the
    research exists on the VOC side.

    Args:
        case_id: Identifier of the case database to update.
        req: Request body; only ``vocResearchId`` is read.

    Returns:
        ``{"linked": True, "vocResearchId": <id>}``.

    Raises:
        HTTPException: 404 when the case database does not exist.
    """
    try:
        conn = get_case_conn(case_id)
    except FileNotFoundError:
        raise HTTPException(404, "案例不存在")
    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it must be closed explicitly or the handle leaks per request.
    try:
        conn.execute("UPDATE case_card SET voc_research_id = ?", (req.vocResearchId,))
        conn.commit()
    finally:
        conn.close()

    return {"linked": True, "vocResearchId": req.vocResearchId}
|
||||
|
||||
|
||||
@app.get("/api/voc/researches")
async def get_voc_researches():
    """Return the research list produced by ``_list_voc_researches``."""
    researches = _list_voc_researches()
    return researches
|
||||
|
||||
|
||||
@app.get("/api/cases/{case_id}/voc-comments")
|
||||
async def get_voc_comments(case_id: str, page: int = 1, pageSize: int = 50):
|
||||
"""从共享 VOC 数据层只读获取原始评论"""
|
||||
@app.post("/api/cases/{case_id}/import-voc")
async def import_voc(case_id: str, page: int = Query(1, ge=1), pageSize: int = Query(100, ge=1)):
    """Pull comments for the linked VOC research over HTTP into the case DB.

    Pages are fetched from ``<api_base>/<voc_research_id>/voc-list`` starting
    at ``page`` and written to the local ``comments`` table with
    ``INSERT OR IGNORE``. Paging stops when the reported ``total`` is reached,
    when a page comes back empty, or when the upstream call fails.

    Args:
        case_id: Identifier of the case database to import into.
        page: First page to request (1-based).
        pageSize: Number of items requested per page.

    Returns:
        A dict with ``imported`` (rows actually inserted by this call),
        ``totalLocal`` (rows now in ``comments``), ``vocResearchId`` and
        ``pagesProcessed`` (pages that yielded items). ``upstreamError`` is
        added when the import stopped early because of an upstream failure.

    Raises:
        HTTPException: 404 when the case does not exist, 400 when no VOC
            research has been linked yet.
    """
    import sqlite3
    from contextlib import closing

    # closing() is needed because a sqlite3 connection used as a context
    # manager only commits or rolls back; it does not close the handle.
    try:
        with closing(get_case_conn(case_id)) as conn:
            card = conn.execute(
                "SELECT voc_research_id, voc_api_base FROM case_card LIMIT 1"
            ).fetchone()
    except FileNotFoundError:
        raise HTTPException(404, "案例不存在")

    if not card or not card["voc_research_id"]:
        raise HTTPException(400, "未关联 VOC 研究,请先调用 link-voc")

    voc_rid = card["voc_research_id"]
    api_base = (card["voc_api_base"] or VOC_API_BASE).rstrip("/")
    url = f"{api_base}/{voc_rid}/voc-list"

    total_imported = 0
    pages_processed = 0
    upstream_error = None
    current_page = page

    async with httpx.AsyncClient(timeout=30) as client:
        while True:
            try:
                # Let httpx build and escape the query string.
                resp = await client.get(
                    url, params={"page": current_page, "page_size": pageSize}
                )
                if resp.status_code != 200:
                    logger.warning(
                        "[Import] VOC API 返回 %s: %s", resp.status_code, resp.text[:100]
                    )
                    upstream_error = f"HTTP {resp.status_code}"
                    break
                data = resp.json()
            except Exception as e:
                # Boundary with an external service: keep what was imported
                # so far, but log the failure and report it to the caller.
                logger.error("[Import] VOC API 请求失败: %s", e)
                upstream_error = str(e)
                break

            if not isinstance(data, dict):
                logger.warning("[Import] VOC API 返回了非对象响应: %s", type(data).__name__)
                upstream_error = "unexpected response shape"
                break

            items = data.get("items") or data.get("data") or []
            if not items:
                break
            pages_processed += 1

            with closing(get_case_conn(case_id)) as conn:
                for item in items:
                    if not isinstance(item, dict):
                        continue
                    text = item.get("text")
                    # A null or non-string text would make len() raise.
                    if not isinstance(text, str) or len(text) < 10:
                        continue
                    try:
                        cur = conn.execute(
                            "INSERT OR IGNORE INTO comments (voc_id, platform, text, like_count, published_at) VALUES (?,?,?,?,?)",
                            (
                                item.get("id"),
                                item.get("platform", ""),
                                text,
                                item.get("like_count", 0),
                                item.get("published_at", ""),
                            ),
                        )
                    except sqlite3.Error as e:
                        logger.warning("[Import] 评论写入失败 voc_id=%s: %s", item.get("id"), e)
                        continue
                    # rowcount is 0 when OR IGNORE skipped a duplicate voc_id,
                    # so only rows that were really inserted are counted.
                    total_imported += cur.rowcount
                conn.commit()

            total = data.get("total") or 0
            if current_page * pageSize >= total:
                break
            current_page += 1

    with closing(get_case_conn(case_id)) as conn:
        local_count = conn.execute("SELECT count(*) FROM comments").fetchone()[0]

    result = {
        "imported": total_imported,
        "totalLocal": local_count,
        "vocResearchId": voc_rid,
        "pagesProcessed": pages_processed,
    }
    if upstream_error is not None:
        result["upstreamError"] = upstream_error
    return result
|
||||
|
||||
|
||||
@app.get("/api/cases/{case_id}/comments")
async def get_comments(case_id: str, page: int = 1, pageSize: int = 50):
    """Return one page of the comments stored in the local case database.

    Rows are ordered by ``like_count`` descending, with ``id`` as a
    tie-breaker so that consecutive pages do not overlap or skip rows.

    Args:
        case_id: Identifier of the case database to read.
        page: 1-based page number; values below 1 are treated as 1.
        pageSize: Rows per page; negative values are treated as 0.

    Returns:
        ``{"total": <row count>, "page": <page used>, "items": [<row dicts>]}``.

    Raises:
        HTTPException: 404 when the case database does not exist.
    """
    # SQLite reads a negative LIMIT as "no limit", which would return the
    # whole table in one response, so both values are clamped first.
    page = max(page, 1)
    pageSize = max(pageSize, 0)

    try:
        conn = get_case_conn(case_id)
    except FileNotFoundError:
        raise HTTPException(404, "案例不存在")
    # The connection is closed explicitly; `with conn` alone would not do it.
    try:
        total = conn.execute("SELECT count(*) FROM comments").fetchone()[0]
        rows = conn.execute("""
            SELECT id, voc_id, platform, text, like_count, published_at
            FROM comments ORDER BY like_count DESC, id
            LIMIT ? OFFSET ?
        """, (pageSize, (page - 1) * pageSize)).fetchall()
    finally:
        conn.close()

    return {"total": total, "page": page, "items": [dict(r) for r in rows]}
|
||||
|
||||
|
||||
@ -200,11 +249,11 @@ async def get_coverage(case_id: str):
|
||||
|
||||
@app.get("/api/health")
async def health():
    """Report a static status plus the configured VOC API base and case data dir."""
    payload = {"status": "ok", "version": "2.0.0"}
    payload["architecture"] = "independent (Aliyun self-contained)"
    payload["vocApiBase"] = VOC_API_BASE
    payload["caseDataDir"] = str(DATA_DIR)
    return payload
|
||||
|
||||
|
||||
@ -1,10 +1,9 @@
|
||||
"""
|
||||
黑手党提案 — UDE 提取工具
|
||||
黑手党提案 — UDE 提取工具(阿里云内闭环)
|
||||
|
||||
流程:VOC 原始评论 → LLM 转写 UDE → DashScope 向量化 → DBSCAN 聚类 → 覆盖扫描
|
||||
流程:本地 comments → LLM 转写 UDE → DashScope 向量化 → DBSCAN 聚类
|
||||
|
||||
数据来源:只读访问共享 VOC 数据层
|
||||
分析结果:写入本项目的案例 DB
|
||||
所有数据读写都在案例 DB 内,不跨云。
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@ -47,7 +46,7 @@ def _get_embed_client(key: str) -> OpenAI:
|
||||
)
|
||||
|
||||
|
||||
# ═══════════ Step 1: VOC → UDE 转写 ═══════════
|
||||
# ═══════════ Step 1: 本地评论 → UDE 转写 ═══════════
|
||||
|
||||
async def _call_ude_llm(prompt: str, comments: list[dict]) -> list[dict]:
|
||||
"""单批 LLM 转写"""
|
||||
@ -87,40 +86,33 @@ async def _process_ude_batch(comments, prompt, semaphore):
|
||||
|
||||
|
||||
async def run_ude_extraction(case_id: str, limit: int = 0) -> dict:
|
||||
"""从共享 VOC 数据读取原始评论,转写为 UDE,存入案例 DB"""
|
||||
from db import get_case_conn, get_voc_conn
|
||||
"""从本地 comments 表读取评论,转写为 UDE,存入 ude_sentences"""
|
||||
from db import get_case_conn
|
||||
|
||||
prompt = PROMPT_PATH.read_text("utf-8") if PROMPT_PATH.exists() else ""
|
||||
if not prompt:
|
||||
return {"error": "UDE 转写 prompt 未找到 (prompts/voc_to_ude.txt)"}
|
||||
|
||||
with get_case_conn(case_id) as case_conn:
|
||||
card = case_conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone()
|
||||
if not card or not card["voc_research_id"]:
|
||||
return {"error": "未关联 VOC 研究。请先调用 link-voc。"}
|
||||
|
||||
voc_research_id = card["voc_research_id"]
|
||||
|
||||
# 获取已转写的 voc_comment_ids
|
||||
done_ids = {r[0] for r in case_conn.execute(
|
||||
"SELECT voc_comment_id FROM ude_sentences"
|
||||
with get_case_conn(case_id) as conn:
|
||||
# 获取已转写的 comment_ids
|
||||
done_ids = {r[0] for r in conn.execute(
|
||||
"SELECT comment_id FROM ude_sentences"
|
||||
).fetchall()}
|
||||
|
||||
# 从 VOC DB 只读获取原始评论
|
||||
with get_voc_conn(voc_research_id) as voc_conn:
|
||||
rows = voc_conn.execute("""
|
||||
# 从本地 comments 表读取
|
||||
rows = conn.execute("""
|
||||
SELECT id, platform, text
|
||||
FROM comments
|
||||
WHERE length(text) > 10
|
||||
FROM comments WHERE length(text) > 10
|
||||
ORDER BY id
|
||||
""").fetchall()
|
||||
|
||||
# 过滤已完成的
|
||||
total_comments = len(rows)
|
||||
pending = [r for r in rows if r["id"] not in done_ids]
|
||||
|
||||
if not pending:
|
||||
with get_case_conn(case_id) as conn:
|
||||
total = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
|
||||
return {"message": "全部已转写完成", "total_udes": total, "new": 0}
|
||||
return {"message": "全部已转写完成", "totalUdes": total, "new": 0}
|
||||
|
||||
if limit > 0:
|
||||
pending = pending[:limit]
|
||||
@ -137,7 +129,7 @@ async def run_ude_extraction(case_id: str, limit: int = 0) -> dict:
|
||||
|
||||
# 写入案例 DB
|
||||
ok = 0
|
||||
with get_case_conn(case_id) as case_conn:
|
||||
with get_case_conn(case_id) as conn:
|
||||
for results in all_results:
|
||||
for r in (results or []):
|
||||
if not isinstance(r, dict):
|
||||
@ -149,21 +141,21 @@ async def run_ude_extraction(case_id: str, limit: int = 0) -> dict:
|
||||
if not cid:
|
||||
continue
|
||||
try:
|
||||
case_conn.execute(
|
||||
"INSERT OR IGNORE INTO ude_sentences (voc_comment_id, ude_text, confidence) VALUES (?, ?, ?)",
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO ude_sentences (comment_id, ude_text, confidence) VALUES (?, ?, ?)",
|
||||
(int(cid), ude_text, r.get("confidence", 0.5))
|
||||
)
|
||||
ok += 1
|
||||
except Exception as e:
|
||||
logger.warning(f"[UDE] 写入失败 id={cid}: {e}")
|
||||
case_conn.commit()
|
||||
total = case_conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
|
||||
conn.commit()
|
||||
total = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
|
||||
|
||||
return {
|
||||
"new_udes": ok,
|
||||
"total_udes": total,
|
||||
"total_voc_comments": len(rows),
|
||||
"remaining": len(rows) - total,
|
||||
"newUdes": ok,
|
||||
"totalUdes": total,
|
||||
"totalComments": total_comments,
|
||||
"remaining": total_comments - total,
|
||||
"batches": len(batches),
|
||||
}
|
||||
|
||||
@ -181,10 +173,10 @@ def _embed_texts(client: OpenAI, texts: list[str]) -> list[list[float]]:
|
||||
|
||||
def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
|
||||
dashscope_key: str = None) -> dict:
|
||||
"""向量化 + DBSCAN 聚类"""
|
||||
"""向量化 + DBSCAN 聚类(全部在本地案例 DB 内)"""
|
||||
from sklearn.cluster import DBSCAN
|
||||
from sklearn.metrics.pairwise import cosine_distances
|
||||
from db import get_case_conn, get_voc_conn
|
||||
from db import get_case_conn
|
||||
|
||||
key = dashscope_key or os.getenv("DASHSCOPE_API_KEY", "")
|
||||
if not key:
|
||||
@ -193,13 +185,13 @@ def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
|
||||
embed_client = _get_embed_client(key)
|
||||
|
||||
with get_case_conn(case_id) as conn:
|
||||
rows = conn.execute("SELECT id, voc_comment_id, ude_text FROM ude_sentences ORDER BY id").fetchall()
|
||||
rows = conn.execute("SELECT id, comment_id, ude_text FROM ude_sentences ORDER BY id").fetchall()
|
||||
if len(rows) < min_samples:
|
||||
return {"error": f"UDE 不足 ({len(rows)} 条),至少需要 {min_samples} 条。"}
|
||||
|
||||
ude_texts = [r["ude_text"] for r in rows]
|
||||
ude_ids = [r["id"] for r in rows]
|
||||
comment_ids = [r["voc_comment_id"] for r in rows]
|
||||
comment_ids = [r["comment_id"] for r in rows]
|
||||
|
||||
# 向量化
|
||||
vectors = _embed_texts(embed_client, ude_texts)
|
||||
@ -223,10 +215,6 @@ def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
|
||||
# 清空旧聚类,写入新聚类
|
||||
conn.execute("DELETE FROM ude_clusters")
|
||||
|
||||
# 获取关联的 VOC research_id 用于读取原声
|
||||
card = conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone()
|
||||
voc_rid = card["voc_research_id"] if card else None
|
||||
|
||||
clusters = []
|
||||
unique_labels = sorted(set(labels) - {-1})
|
||||
|
||||
@ -241,30 +229,24 @@ def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
|
||||
dists = cosine_distances([centroid], member_vectors)[0]
|
||||
representative = member_texts[dists.argmin()]
|
||||
|
||||
# 取原声
|
||||
# 原声采样(从本地 comments 表)
|
||||
sample_voices = []
|
||||
if voc_rid:
|
||||
try:
|
||||
voc_conn = get_voc_conn(voc_rid)
|
||||
for cid in member_cids[:5]:
|
||||
voice = voc_conn.execute(
|
||||
"SELECT text, platform FROM comments WHERE id = ?", (cid,)
|
||||
).fetchone()
|
||||
if voice:
|
||||
sample_voices.append({"text": voice["text"][:200], "platform": voice["platform"]})
|
||||
voc_conn.close()
|
||||
except Exception:
|
||||
pass
|
||||
for cid in member_cids[:5]:
|
||||
voice = conn.execute(
|
||||
"SELECT text, platform FROM comments WHERE id = ?", (cid,)
|
||||
).fetchone()
|
||||
if voice:
|
||||
sample_voices.append({"text": voice["text"][:200], "platform": voice["platform"]})
|
||||
|
||||
conn.execute(
|
||||
"INSERT INTO ude_clusters (representative_ude, coverage, sample_voices) VALUES (?, ?, ?)",
|
||||
(representative, len(member_indices), json.dumps(sample_voices, ensure_ascii=False))
|
||||
)
|
||||
clusters.append({
|
||||
"cluster_id": int(cluster_id),
|
||||
"representative_ude": representative,
|
||||
"clusterId": int(cluster_id),
|
||||
"representativeUde": representative,
|
||||
"coverage": len(member_indices),
|
||||
"sample_voices": sample_voices,
|
||||
"sampleVoices": sample_voices,
|
||||
})
|
||||
|
||||
conn.commit()
|
||||
@ -272,24 +254,22 @@ def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3,
|
||||
noise_count = int((labels == -1).sum())
|
||||
|
||||
return {
|
||||
"total_udes": len(labels),
|
||||
"num_clusters": len(clusters),
|
||||
"noise_count": noise_count,
|
||||
"noise_pct": round(noise_count / len(labels) * 100, 1) if len(labels) else 0,
|
||||
"totalUdes": len(labels),
|
||||
"numClusters": len(clusters),
|
||||
"noiseCount": noise_count,
|
||||
"noisePct": round(noise_count / len(labels) * 100, 1) if len(labels) else 0,
|
||||
"clusters": clusters,
|
||||
"params": {"eps": eps, "min_samples": min_samples},
|
||||
"params": {"eps": eps, "minSamples": min_samples},
|
||||
}
|
||||
|
||||
|
||||
# ═══════════ Step 5: 覆盖扫描 ═══════════
|
||||
# ═══════════ 覆盖扫描 ═══════════
|
||||
|
||||
def run_coverage_scan(case_id: str) -> dict:
|
||||
from db import get_case_conn, get_voc_conn
|
||||
from db import get_case_conn
|
||||
|
||||
with get_case_conn(case_id) as conn:
|
||||
card = conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone()
|
||||
voc_rid = card["voc_research_id"] if card else None
|
||||
|
||||
total_comments = conn.execute("SELECT count(*) FROM comments").fetchone()[0]
|
||||
total_udes = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0]
|
||||
clustered = conn.execute("SELECT count(*) FROM ude_sentences WHERE cluster_id >= 0").fetchone()[0]
|
||||
noise = conn.execute("SELECT count(*) FROM ude_sentences WHERE cluster_id = -1").fetchone()[0]
|
||||
@ -299,27 +279,17 @@ def run_coverage_scan(case_id: str) -> dict:
|
||||
).fetchall()]
|
||||
|
||||
noise_samples = [dict(r) for r in conn.execute(
|
||||
"SELECT ude_text, voc_comment_id, confidence FROM ude_sentences WHERE cluster_id = -1 ORDER BY confidence DESC LIMIT 10"
|
||||
"SELECT ude_text, comment_id, confidence FROM ude_sentences WHERE cluster_id = -1 ORDER BY confidence DESC LIMIT 10"
|
||||
).fetchall()]
|
||||
|
||||
total_voc = 0
|
||||
if voc_rid:
|
||||
try:
|
||||
with get_voc_conn(voc_rid) as voc:
|
||||
total_voc = voc.execute(
|
||||
"SELECT count(*) FROM comments WHERE length(text) > 10 "
|
||||
).fetchone()[0]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
"total_voc_comments": total_voc,
|
||||
"total_udes": total_udes,
|
||||
"udes_clustered": clustered,
|
||||
"udes_noise": noise,
|
||||
"coverage_rate": round(clustered / total_voc * 100, 1) if total_voc else 0,
|
||||
"cluster_distribution": cluster_stats,
|
||||
"noise_samples": noise_samples,
|
||||
"totalComments": total_comments,
|
||||
"totalUdes": total_udes,
|
||||
"udesClustered": clustered,
|
||||
"udesNoise": noise,
|
||||
"coverageRate": round(clustered / total_comments * 100, 1) if total_comments else 0,
|
||||
"clusterDistribution": cluster_stats,
|
||||
"noiseSamples": noise_samples,
|
||||
"verdict": "充分" if (total_udes > 0 and noise / total_udes < 0.1) else
|
||||
("需关注" if (total_udes > 0 and noise / total_udes < 0.2) else "需调参"),
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user