feat(report_agent): enhance interview text processing and response handling; improve quote extraction and formatting for better clarity

2026-02-14 16:56:48 +08:00 · 2026-02-14 16:56:48 +08:00 · 7601d78fd4
commit 7601d78fd4
parent dc0a9261d1
2 changed files with 205 additions and 84 deletions
--- a/backend/app/services/zep_tools.py
+++ b/backend/app/services/zep_tools.py
@ -308,7 +308,30 @@ class AgentInterview:
        if self.key_quotes:
            text += "\n**关键引言:**\n"
            for quote in self.key_quotes:
-                text += f"> \"{quote}\"\n"
+                # 清理各种引号
+                clean_quote = quote.replace('\u201c', '').replace('\u201d', '').replace('"', '')
+                clean_quote = clean_quote.replace('\u300c', '').replace('\u300d', '')
+                clean_quote = clean_quote.strip()
+                # 去掉开头的标点
+                while clean_quote and clean_quote[0] in '，,；;：:、。！？\n\r\t ':
+                    clean_quote = clean_quote[1:]
+                # 过滤包含问题编号的垃圾内容（问题1-9）
+                skip = False
+                for d in '123456789':
+                    if f'\u95ee\u9898{d}' in clean_quote:
+                        skip = True
+                        break
+                if skip:
+                    continue
+                # 截断过长内容（按句号截断，而非硬截断）
+                if len(clean_quote) > 150:
+                    dot_pos = clean_quote.find('\u3002', 80)
+                    if dot_pos > 0:
+                        clean_quote = clean_quote[:dot_pos + 1]
+                    else:
+                        clean_quote = clean_quote[:147] + "..."
+                if clean_quote and len(clean_quote) >= 10:
+                    text += f'> "{clean_quote}"\n'
        return text


@ -350,27 +373,26 @@ class InterviewResult:
    def to_text(self) -> str:
        """转换为详细的文本格式，供LLM理解和报告引用"""
        text_parts = [
-            f"## 🎤 深度采访报告",
+            "## 深度采访报告",
            f"**采访主题:** {self.interview_topic}",
            f"**采访人数:** {self.interviewed_count} / {self.total_agents} 位模拟Agent",
-            f"\n### 采访对象选择理由",
-            f"{self.selection_reasoning}",
-            f"\n---"
+            "\n### 采访对象选择理由",
+            self.selection_reasoning or "（自动选择）",
+            "\n---",
+            "\n### 采访实录",
        ]
-        
-        # 各Agent的采访内容
+
        if self.interviews:
-            text_parts.append(f"\n### 采访实录")
            for i, interview in enumerate(self.interviews, 1):
                text_parts.append(f"\n#### 采访 #{i}: {interview.agent_name}")
                text_parts.append(interview.to_text())
                text_parts.append("\n---")
-        
-        # 采访摘要
-        if self.summary:
-            text_parts.append(f"\n### 采访摘要与核心观点")
-            text_parts.append(self.summary)
-        
+        else:
+            text_parts.append("（无采访记录）\n\n---")
+
+        text_parts.append("\n### 采访摘要与核心观点")
+        text_parts.append(self.summary or "（无摘要）")
+
        return "\n".join(text_parts)


@ -1329,8 +1351,18 @@ class ZepToolsService:
        # 将问题合并为一个采访prompt
        combined_prompt = "\n".join([f"{i+1}. {q}" for i, q in enumerate(result.interview_questions)])
        
-        # 添加优化前缀，避免Agent调用工具而直接回复文本
-        INTERVIEW_PROMPT_PREFIX = "结合你的人设、所有的过往记忆与行动，不调用任何工具直接用文本回复我："
+        # 添加优化前缀，约束Agent回复格式
+        INTERVIEW_PROMPT_PREFIX = (
+            "你正在接受一次采访。请结合你的人设、所有的过往记忆与行动，"
+            "以纯文本方式直接回答以下问题。\n"
+            "回复要求：\n"
+            "1. 直接用自然语言回答，不要调用任何工具\n"
+            "2. 不要返回JSON格式或工具调用格式\n"
+            "3. 不要使用Markdown标题（如#、##、###）\n"
+            "4. 按问题编号逐一回答，每个回答以「问题X：」开头（X为问题编号）\n"
+            "5. 每个问题的回答之间用空行分隔\n"
+            "6. 回答要有实质内容，每个问题至少回答2-3句话\n\n"
+        )
        optimized_prompt = f"{INTERVIEW_PROMPT_PREFIX}{combined_prompt}"
        
        # Step 4: 调用真实的采访API（不指定platform，默认双平台同时采访）
@ -1380,26 +1412,43 @@ class ZepToolsService:
                
                twitter_response = twitter_result.get("response", "")
                reddit_response = reddit_result.get("response", "")
-                
-                # 合并两个平台的回答
-                response_parts = []
-                if twitter_response:
-                    response_parts.append(f"【Twitter平台回答】\n{twitter_response}")
-                if reddit_response:
-                    response_parts.append(f"【Reddit平台回答】\n{reddit_response}")
-                
-                if response_parts:
-                    response_text = "\n\n".join(response_parts)
-                else:
-                    response_text = "[无回复]"
-                
+
+                # 清理可能的工具调用 JSON 包裹
+                twitter_response = self._clean_tool_call_response(twitter_response)
+                reddit_response = self._clean_tool_call_response(reddit_response)
+
+                # 始终输出双平台标记
+                twitter_text = twitter_response if twitter_response else "（该平台未获得回复）"
+                reddit_text = reddit_response if reddit_response else "（该平台未获得回复）"
+                response_text = f"【Twitter平台回答】\n{twitter_text}\n\n【Reddit平台回答】\n{reddit_text}"
+
                # 提取关键引言（从两个平台的回答中）
                import re
                combined_responses = f"{twitter_response} {reddit_response}"
-                key_quotes = re.findall(r'[""「」『』]([^""「」『』]{10,100})[""「」『』]', combined_responses)
+
+                # 清理响应文本：去掉标记、编号、Markdown 等干扰
+                clean_text = re.sub(r'#{1,6}\s+', '', combined_responses)
+                clean_text = re.sub(r'\{[^}]*tool_name[^}]*\}', '', clean_text)
+                clean_text = re.sub(r'[*_`|>~\-]{2,}', '', clean_text)
+                clean_text = re.sub(r'问题\d+[：:]\s*', '', clean_text)
+                clean_text = re.sub(r'【[^】]+】', '', clean_text)
+
+                # 策略1（主）: 提取完整的有实质内容的句子
+                sentences = re.split(r'[。！？]', clean_text)
+                meaningful = [
+                    s.strip() for s in sentences
+                    if 20 <= len(s.strip()) <= 150
+                    and not re.match(r'^[\s\W，,；;：:、]+', s.strip())
+                    and not s.strip().startswith(('{', '问题'))
+                ]
+                meaningful.sort(key=len, reverse=True)
+                key_quotes = [s + "。" for s in meaningful[:3]]
+
+                # 策略2（补充）: 正确配对的中文引号「」内长文本
                if not key_quotes:
-                    sentences = combined_responses.split('。')
-                    key_quotes = [s.strip() + '。' for s in sentences if len(s.strip()) > 20][:3]
+                    paired = re.findall(r'\u201c([^\u201c\u201d]{15,100})\u201d', clean_text)
+                    paired += re.findall(r'\u300c([^\u300c\u300d]{15,100})\u300d', clean_text)
+                    key_quotes = [q for q in paired if not re.match(r'^[，,；;：:、]', q)][:3]
                
                interview = AgentInterview(
                    agent_name=agent_name,
@ -1435,6 +1484,27 @@ class ZepToolsService:
        logger.info(f"InterviewAgents完成: 采访了 {result.interviewed_count} 个Agent（双平台）")
        return result
    
+    @staticmethod
+    def _clean_tool_call_response(response: str) -> str:
+        """Unwrap an agent reply that was emitted as a JSON tool call.
+
+        A reply is treated as a tool call only when, after stripping
+        whitespace, it starts with ``{`` and mentions ``tool_name`` within
+        its first 80 characters. For such replies the first of the fields
+        ``content``, ``text``, ``body``, ``message`` or ``reply`` found in
+        ``arguments`` is returned as a string.
+
+        Args:
+            response: Raw reply text; may be empty or ``None``.
+
+        Returns:
+            The extracted reply text, or ``response`` unchanged when it is
+            not a recognizable tool call or no text field can be found.
+        """
+        if not response:
+            return response
+        text = response.strip()
+        # Cheap pre-checks so ordinary prose replies never reach the parser.
+        if not text.startswith('{') or 'tool_name' not in text[:80]:
+            return response
+
+        try:
+            data = json.loads(text)
+        except json.JSONDecodeError:
+            # Truncated or otherwise malformed JSON; handled by the regex below.
+            data = None
+
+        if isinstance(data, dict):
+            arguments = data.get('arguments')
+            # ``arguments`` may itself be a JSON-encoded string rather than
+            # an object; decode it so the lookup below works on a dict
+            # instead of doing substring checks on a str.
+            if isinstance(arguments, str):
+                try:
+                    arguments = json.loads(arguments)
+                except json.JSONDecodeError:
+                    arguments = None
+            if isinstance(arguments, dict):
+                for key in ('content', 'text', 'body', 'message', 'reply'):
+                    if key in arguments:
+                        return str(arguments[key])
+
+        # Last resort for every failed structured lookup (not only parse
+        # errors): pull the first "content" string literal out of the raw text.
+        import re as _re
+        match = _re.search(r'"content"\s*:\s*"((?:[^"\\]|\\.)*)"', text)
+        if match:
+            raw = match.group(1)
+            try:
+                # Decode as a JSON string so every escape (\n, \", \\, \uXXXX)
+                # is resolved; strict=False tolerates raw control characters.
+                return json.loads(f'"{raw}"', strict=False)
+            except json.JSONDecodeError:
+                # Invalid escape sequence: keep the minimal manual unescape.
+                return raw.replace('\\n', '\n').replace('\\"', '"')
+        return response
+
    def _load_agent_profiles(self, simulation_id: str) -> List[Dict[str, Any]]:
        """加载模拟的Agent人设文件"""
        import os
@ -1581,6 +1651,8 @@ class ZepToolsService:
 2. 针对不同角色可能有不同答案
 3. 涵盖事实、观点、感受等多个维度
 4. 语言自然，像真实采访一样
+5. 每个问题控制在50字以内，简洁明了
+6. 直接提问，不要包含背景说明或前缀

 返回JSON格式：{"questions": ["问题1", "问题2", ...]}"""

@ -1633,7 +1705,14 @@ class ZepToolsService:
 2. 指出观点的共识和分歧
 3. 突出有价值的引言
 4. 客观中立，不偏袒任何一方
-5. 控制在1000字内"""
+5. 控制在1000字内
+
+格式约束（必须遵守）：
+- 使用纯文本段落，用空行分隔不同部分
+- 不要使用Markdown标题（如#、##、###）
+- 不要使用分割线（如---、***）
+- 引用受访者原话时使用中文引号「」
+- 可以使用**加粗**标记关键词，但不要使用其他Markdown语法"""

        user_prompt = f"""采访主题：{interview_requirement}

--- a/frontend/src/components/Step4Report.vue
+++ b/frontend/src/components/Step4Report.vue
@ -849,27 +849,36 @@ const parseInterview = (text) => {
          interview.redditAnswer = redditMatch[1].trim()
        }
        
-        // 如果只有一个平台的回答，将其作为主回答
-        // 这样无论显示哪个平台都能有内容
+        // 平台回退逻辑（兼容旧格式：只有一个平台标记的情况）
        if (!twitterMatch && redditMatch) {
-          // 只有 Reddit 回答，将其也设为 twitterAnswer 作为默认显示
-          interview.twitterAnswer = interview.redditAnswer
+          // 只有 Reddit 回答，仅在非占位文本时复制为默认显示
+          if (interview.redditAnswer && interview.redditAnswer !== '（该平台未获得回复）') {
+            interview.twitterAnswer = interview.redditAnswer
+          }
        } else if (twitterMatch && !redditMatch) {
-          // 只有 Twitter 回答，将其也设为 redditAnswer
-          interview.redditAnswer = interview.twitterAnswer
+          if (interview.twitterAnswer && interview.twitterAnswer !== '（该平台未获得回复）') {
+            interview.redditAnswer = interview.twitterAnswer
+          }
        } else if (!twitterMatch && !redditMatch) {
-          // 如果没有明确分平台，整体作为回答
+          // 没有分平台标记（极旧格式），整体作为回答
          interview.twitterAnswer = answerText
        }
      }
      
-      // 提取关键引言
+      // 提取关键引言（兼容多种引号格式）
      const quotesMatch = block.match(/\*\*关键引言:\*\*\n([\s\S]*?)(?=\n---|\n####|$)/)
      if (quotesMatch) {
        const quotesText = quotesMatch[1]
-        const quoteMatches = quotesText.match(/> "([^"]+)"/g)
+        // 优先匹配 > "text" 格式
+        let quoteMatches = quotesText.match(/> "([^"]+)"/g)
+        // 回退：匹配 > "text" 或 > \u201Ctext\u201D（中文引号）
+        if (!quoteMatches) {
+          quoteMatches = quotesText.match(/> [\u201C""]([^\u201D""]+)[\u201D""]/g)
+        }
        if (quoteMatches) {
-          interview.quotes = quoteMatches.map(q => q.replace(/^> "|"$/g, '').trim())
+          interview.quotes = quoteMatches
+            .map(q => q.replace(/^> [\u201C""]|[\u201D""]$/g, '').trim())
+            .filter(q => q)
        }
      }
      
@ -1314,79 +1323,100 @@ const InterviewDisplay = {
      return text.substring(0, 400) + '...'
    }
    
+    // Treat a missing answer, or one of the known "no reply" marker strings, as a placeholder
+    const isPlaceholderText = (text) => {
+      if (!text) return true
+      const placeholders = ['（该平台未获得回复）', '(该平台未获得回复)', '[无回复]']
+      return placeholders.includes(text.trim())
+    }
+
    // 尝试按问题编号分割回答
    const splitAnswerByQuestions = (answerText, questionCount) => {
      if (!answerText || questionCount <= 0) return [answerText]
-      
-      // 更健壮的分割逻辑：查找所有 "数字." 格式的编号位置
-      // 支持格式：
-      // - "1.  \n内容" （数字+点+空格+换行+内容）
-      // - "\n\n2.  \n内容" （换行+数字+点+空格+换行+内容）
-      // 使用更宽松的匹配：开头或换行后的数字+点+空白
-      const numberPattern = /(?:^|[\r\n]+)(\d+)\.\s+/g
-      const matches = []
+      if (isPlaceholderText(answerText)) return ['']
+
+      // 支持两种编号格式：
+      // 1. "问题X：" 或 "问题X:" （中文格式，后端新格式）
+      // 2. "1. " 或 "\n1. " （数字+点，旧格式兼容）
+      let matches = []
      let match
-      
-      while ((match = numberPattern.exec(answerText)) !== null) {
+
+      // 优先尝试 "问题X：" 格式
+      const cnPattern = /(?:^|[\r\n]+)问题(\d+)[：:]\s*/g
+      while ((match = cnPattern.exec(answerText)) !== null) {
        matches.push({
          num: parseInt(match[1]),
          index: match.index,
          fullMatch: match[0]
        })
      }
-      
+
+      // 如果没匹配到，回退到 "数字." 格式
+      if (matches.length === 0) {
+        const numPattern = /(?:^|[\r\n]+)(\d+)\.\s+/g
+        while ((match = numPattern.exec(answerText)) !== null) {
+          matches.push({
+            num: parseInt(match[1]),
+            index: match.index,
+            fullMatch: match[0]
+          })
+        }
+      }
+
      // 如果没有找到编号或只找到一个，返回整体
      if (matches.length <= 1) {
-        // 尝试移除开头的编号（格式：1.  \n 或 1. ）
-        const cleaned = answerText.replace(/^\d+\.\s+/, '').trim()
+        const cleaned = answerText
+          .replace(/^问题\d+[：:]\s*/, '')
+          .replace(/^\d+\.\s+/, '')
+          .trim()
        return [cleaned || answerText]
      }
-      
+
      // 按编号提取各部分
      const parts = []
      for (let i = 0; i < matches.length; i++) {
        const current = matches[i]
        const next = matches[i + 1]
-        
+
        const startIdx = current.index + current.fullMatch.length
        const endIdx = next ? next.index : answerText.length
-        
+
        let part = answerText.substring(startIdx, endIdx).trim()
-        // 移除末尾可能的多余换行
        part = part.replace(/[\r\n]+$/, '').trim()
        parts.push(part)
      }
-      
-      // 如果分割成功且数量合理，返回分割结果
+
      if (parts.length > 0 && parts.some(p => p)) {
        return parts
      }
-      
+
      return [answerText]
    }
    
    // 获取某个问题对应的回答
    const getAnswerForQuestion = (interview, qIdx, platform) => {
      const answer = platform === 'twitter' ? interview.twitterAnswer : (interview.redditAnswer || interview.twitterAnswer)
-      if (!answer) return ''
-      
+      if (!answer || isPlaceholderText(answer)) return answer || ''
+
      const questionCount = interview.questions?.length || 1
      const answers = splitAnswerByQuestions(answer, questionCount)
-      
-      // 如果只有一个回答部分，或者索引超出，返回完整回答
-      if (answers.length === 1 || qIdx >= answers.length) {
-        return qIdx === 0 ? answer : ''
+
+      // 分割成功且索引有效
+      if (answers.length > 1 && qIdx < answers.length) {
+        return answers[qIdx] || ''
      }
-      
-      return answers[qIdx] || ''
+
+      // 分割失败：第一个问题返回完整回答，其余返回空
+      return qIdx === 0 ? answer : ''
    }
    
-    // 检查某个问题是否有双平台回答
+    // 检查某个问题是否有双平台回答（过滤占位文本）
    const hasMultiplePlatforms = (interview, qIdx) => {
      if (!interview.twitterAnswer || !interview.redditAnswer) return false
      const twitterAnswer = getAnswerForQuestion(interview, qIdx, 'twitter')
      const redditAnswer = getAnswerForQuestion(interview, qIdx, 'reddit')
-      return twitterAnswer && redditAnswer && twitterAnswer !== redditAnswer
+      // 两个平台都有真实回答（非占位文本）且内容不同
+      return !isPlaceholderText(twitterAnswer) && !isPlaceholderText(redditAnswer) && twitterAnswer !== redditAnswer
    }
    
    return () => h('div', { class: 'interview-display' }, [
@ -1453,7 +1483,8 @@ const InterviewDisplay = {
            const hasDualPlatform = hasMultiplePlatforms(interview, qIdx)
            const expandKey = `${activeIndex.value}-${qIdx}`
            const isExpanded = expandedAnswers.value.has(expandKey)
-            
+            const isPlaceholder = isPlaceholderText(answerText)
+
            return h('div', { class: 'qa-pair', key: qIdx }, [
              // Question Block
              h('div', { class: 'qa-question' }, [
@ -1463,14 +1494,14 @@ const InterviewDisplay = {
                  h('div', { class: 'qa-text' }, question)
                ])
              ]),
-              
+
              // Answer Block
-              answerText && h('div', { class: 'qa-answer' }, [
+              answerText && h('div', { class: ['qa-answer', { 'answer-placeholder': isPlaceholder }] }, [
                h('div', { class: 'qa-badge a-badge' }, `A${qIdx + 1}`),
                h('div', { class: 'qa-content' }, [
                  h('div', { class: 'qa-answer-header' }, [
                    h('div', { class: 'qa-sender' }, interview?.name || 'Agent'),
-                    // 双平台切换按钮
+                    // 双平台切换按钮（仅在有真实双平台回答时显示）
                    hasDualPlatform && h('div', { class: 'platform-switch' }, [
                      h('button', {
                        class: ['platform-btn', { active: currentPlatform === 'twitter' }],
@ -1494,14 +1525,16 @@ const InterviewDisplay = {
                      ])
                    ])
                  ]),
-                  h('div', { 
-                    class: 'qa-text answer-text',
-                    innerHTML: formatAnswer(answerText, isExpanded)
-                      .replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
-                      .replace(/\n/g, '<br>')
+                  h('div', {
+                    class: ['qa-text', 'answer-text', { 'placeholder-text': isPlaceholder }],
+                    innerHTML: isPlaceholder
+                      ? answerText
+                      : formatAnswer(answerText, isExpanded)
+                          .replace(/\*\*(.+?)\*\*/g, '<strong>$1</strong>')
+                          .replace(/\n/g, '<br>')
                  }),
-                  // Expand/Collapse Button
-                  answerText.length > 400 && h('button', {
+                  // Expand/Collapse Button（占位文本不显示）
+                  !isPlaceholder && answerText.length > 400 && h('button', {
                    class: 'expand-answer-btn',
                    onClick: () => toggleAnswer(expandKey)
                  }, isExpanded ? 'Show Less' : 'Show More')
@ -3913,6 +3946,15 @@ watch(() => props.reportId, (newId) => {
  margin-top: 0;
 }

+:deep(.interview-display .answer-placeholder) {
+  opacity: 0.6;
+}
+
+:deep(.interview-display .placeholder-text) {
+  font-style: italic;
+  color: #9CA3AF;
+}
+
 :deep(.interview-display .qa-answer-header) {
  display: flex;
  justify-content: space-between;