""" 黑手党提案 — UDE 提取工具 流程:VOC 原始评论 → LLM 转写 UDE → DashScope 向量化 → DBSCAN 聚类 → 覆盖扫描 数据来源:只读访问共享 VOC 数据层 分析结果:写入本项目的案例 DB """ from __future__ import annotations import json import os import asyncio import logging from pathlib import Path import numpy as np from openai import OpenAI, AsyncOpenAI from dotenv import load_dotenv load_dotenv() logger = logging.getLogger(__name__) MODEL = os.getenv("MODEL_ID", "qwen-plus") TEMPERATURE = float(os.getenv("TEMPERATURE", "0.1")) BATCH_SIZE = 10 CONCURRENCY = 5 EMBED_DIM = 1024 EMBED_BATCH_SIZE = 25 PROMPT_PATH = Path(__file__).parent.parent / "prompts" / "voc_to_ude.txt" def _get_llm_client() -> AsyncOpenAI: return AsyncOpenAI( api_key=os.getenv("LITELLM_MASTER_KEY"), base_url=os.getenv("LITELLM_PROXY_URL"), ) def _get_embed_client(key: str) -> OpenAI: if not key: raise ValueError("DashScope API Key 未配置。请通过 Header 或 .env 传入。") return OpenAI( api_key=key, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", ) # ═══════════ Step 1: VOC → UDE 转写 ═══════════ async def _call_ude_llm(prompt: str, comments: list[dict]) -> list[dict]: """单批 LLM 转写""" client = _get_llm_client() user_msg = "请将以下消费者评论转写为 UDE 格式句,返回 JSON:\n\n" for c in comments: user_msg += f"[{c['id']}] 平台:{c['platform']} 原文: \"{c['text'][:300]}\"\n\n" try: resp = await client.chat.completions.create( model=MODEL, messages=[ {"role": "system", "content": prompt}, {"role": "user", "content": user_msg}, ], temperature=TEMPERATURE, max_tokens=4000, response_format={"type": "json_object"}, ) content = (resp.choices[0].message.content or "").strip() parsed = json.loads(content) if isinstance(parsed, dict): for key in ("results", "data", "items", "udes"): if key in parsed and isinstance(parsed[key], list): return parsed[key] if isinstance(parsed, list): return parsed return [] except Exception as e: logger.warning(f"[UDE] LLM 转写失败: {str(e)[:80]}") return [] async def _process_ude_batch(comments, prompt, semaphore): async with semaphore: return await _call_ude_llm(prompt, comments) async def run_ude_extraction(case_id: str, limit: int = 0) -> dict: """从共享 VOC 数据读取原始评论,转写为 UDE,存入案例 DB""" from db import get_case_conn, get_voc_conn prompt = PROMPT_PATH.read_text("utf-8") if PROMPT_PATH.exists() else "" if not prompt: return {"error": "UDE 转写 prompt 未找到 (prompts/voc_to_ude.txt)"} with get_case_conn(case_id) as case_conn: card = case_conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone() if not card or not card["voc_research_id"]: return {"error": "未关联 VOC 研究。请先调用 link-voc。"} voc_research_id = card["voc_research_id"] # 获取已转写的 voc_comment_ids done_ids = {r[0] for r in case_conn.execute( "SELECT voc_comment_id FROM ude_sentences" ).fetchall()} # 从 VOC DB 只读获取原始评论 with get_voc_conn(voc_research_id) as voc_conn: rows = voc_conn.execute(""" SELECT id, platform, text FROM comments WHERE length(text) > 10 ORDER BY id """).fetchall() # 过滤已完成的 pending = [r for r in rows if r["id"] not in done_ids] if not pending: with get_case_conn(case_id) as conn: total = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0] return {"message": "全部已转写完成", "total_udes": total, "new": 0} if limit > 0: pending = pending[:limit] # 切批 batches = [] for i in range(0, len(pending), BATCH_SIZE): chunk = pending[i:i + BATCH_SIZE] batches.append([{"id": r["id"], "platform": r["platform"], "text": r["text"]} for r in chunk]) semaphore = asyncio.Semaphore(CONCURRENCY) tasks = [asyncio.create_task(_process_ude_batch(b, prompt, semaphore)) for b in batches] all_results = await asyncio.gather(*tasks) # 写入案例 DB ok = 0 with get_case_conn(case_id) as case_conn: for results in all_results: for r in (results or []): if not isinstance(r, dict): continue ude_text = r.get("ude") if not ude_text: continue cid = r.get("id") if not cid: continue try: case_conn.execute( "INSERT OR IGNORE INTO ude_sentences (voc_comment_id, ude_text, confidence) VALUES (?, ?, ?)", (int(cid), ude_text, r.get("confidence", 0.5)) ) ok += 1 except Exception as e: logger.warning(f"[UDE] 写入失败 id={cid}: {e}") case_conn.commit() total = case_conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0] return { "new_udes": ok, "total_udes": total, "total_voc_comments": len(rows), "remaining": len(rows) - total, "batches": len(batches), } # ═══════════ Step 2 & 3: 向量化 + 聚类 ═══════════ def _embed_texts(client: OpenAI, texts: list[str]) -> list[list[float]]: all_vectors = [] for i in range(0, len(texts), EMBED_BATCH_SIZE): batch = texts[i:i + EMBED_BATCH_SIZE] resp = client.embeddings.create(model="text-embedding-v4", input=batch, dimensions=EMBED_DIM) all_vectors.extend([item.embedding for item in resp.data]) return all_vectors def run_clustering(case_id: str, eps: float = 0.25, min_samples: int = 3, dashscope_key: str = None) -> dict: """向量化 + DBSCAN 聚类""" from sklearn.cluster import DBSCAN from sklearn.metrics.pairwise import cosine_distances from db import get_case_conn, get_voc_conn key = dashscope_key or os.getenv("DASHSCOPE_API_KEY", "") if not key: return {"error": "DashScope API Key 未配置。"} embed_client = _get_embed_client(key) with get_case_conn(case_id) as conn: rows = conn.execute("SELECT id, voc_comment_id, ude_text FROM ude_sentences ORDER BY id").fetchall() if len(rows) < min_samples: return {"error": f"UDE 不足 ({len(rows)} 条),至少需要 {min_samples} 条。"} ude_texts = [r["ude_text"] for r in rows] ude_ids = [r["id"] for r in rows] comment_ids = [r["voc_comment_id"] for r in rows] # 向量化 vectors = _embed_texts(embed_client, ude_texts) vec_array = np.array(vectors) # 保存向量 for i, uid in enumerate(ude_ids): conn.execute("UPDATE ude_sentences SET vector = ? WHERE id = ?", (json.dumps(vectors[i]), uid)) # DBSCAN dist_matrix = cosine_distances(vec_array) clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed").fit(dist_matrix) labels = clustering.labels_ # 更新聚类标签 for i, uid in enumerate(ude_ids): conn.execute("UPDATE ude_sentences SET cluster_id = ? WHERE id = ?", (int(labels[i]), uid)) # 清空旧聚类,写入新聚类 conn.execute("DELETE FROM ude_clusters") # 获取关联的 VOC research_id 用于读取原声 card = conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone() voc_rid = card["voc_research_id"] if card else None clusters = [] unique_labels = sorted(set(labels) - {-1}) for cluster_id in unique_labels: member_indices = [i for i, l in enumerate(labels) if l == cluster_id] member_texts = [ude_texts[i] for i in member_indices] member_vectors = vec_array[member_indices] member_cids = [comment_ids[i] for i in member_indices] # 簇中心 centroid = member_vectors.mean(axis=0) dists = cosine_distances([centroid], member_vectors)[0] representative = member_texts[dists.argmin()] # 取原声 sample_voices = [] if voc_rid: try: voc_conn = get_voc_conn(voc_rid) for cid in member_cids[:5]: voice = voc_conn.execute( "SELECT text, platform FROM comments WHERE id = ?", (cid,) ).fetchone() if voice: sample_voices.append({"text": voice["text"][:200], "platform": voice["platform"]}) voc_conn.close() except Exception: pass conn.execute( "INSERT INTO ude_clusters (representative_ude, coverage, sample_voices) VALUES (?, ?, ?)", (representative, len(member_indices), json.dumps(sample_voices, ensure_ascii=False)) ) clusters.append({ "cluster_id": int(cluster_id), "representative_ude": representative, "coverage": len(member_indices), "sample_voices": sample_voices, }) conn.commit() clusters.sort(key=lambda x: x["coverage"], reverse=True) noise_count = int((labels == -1).sum()) return { "total_udes": len(labels), "num_clusters": len(clusters), "noise_count": noise_count, "noise_pct": round(noise_count / len(labels) * 100, 1) if len(labels) else 0, "clusters": clusters, "params": {"eps": eps, "min_samples": min_samples}, } # ═══════════ Step 5: 覆盖扫描 ═══════════ def run_coverage_scan(case_id: str) -> dict: from db import get_case_conn, get_voc_conn with get_case_conn(case_id) as conn: card = conn.execute("SELECT voc_research_id FROM case_card LIMIT 1").fetchone() voc_rid = card["voc_research_id"] if card else None total_udes = conn.execute("SELECT count(*) FROM ude_sentences").fetchone()[0] clustered = conn.execute("SELECT count(*) FROM ude_sentences WHERE cluster_id >= 0").fetchone()[0] noise = conn.execute("SELECT count(*) FROM ude_sentences WHERE cluster_id = -1").fetchone()[0] cluster_stats = [dict(r) for r in conn.execute( "SELECT cluster_id, count(*) as cnt FROM ude_sentences WHERE cluster_id >= 0 GROUP BY cluster_id ORDER BY cnt DESC" ).fetchall()] noise_samples = [dict(r) for r in conn.execute( "SELECT ude_text, voc_comment_id, confidence FROM ude_sentences WHERE cluster_id = -1 ORDER BY confidence DESC LIMIT 10" ).fetchall()] total_voc = 0 if voc_rid: try: with get_voc_conn(voc_rid) as voc: total_voc = voc.execute( "SELECT count(*) FROM comments WHERE length(text) > 10 " ).fetchone()[0] except Exception: pass return { "total_voc_comments": total_voc, "total_udes": total_udes, "udes_clustered": clustered, "udes_noise": noise, "coverage_rate": round(clustered / total_voc * 100, 1) if total_voc else 0, "cluster_distribution": cluster_stats, "noise_samples": noise_samples, "verdict": "充分" if (total_udes > 0 and noise / total_udes < 0.1) else ("需关注" if (total_udes > 0 and noise / total_udes < 0.2) else "需调参"), }