"""
Ontology generation service.

Interface 1: analyze text content and generate entity and relation type
definitions suitable for social simulation.
"""
|
||
|
||
import json
|
||
from typing import Dict, Any, List, Optional
|
||
from ..utils.llm_client import LLMClient
|
||
|
||
|
||
# System prompt for ontology generation. The text is sent verbatim to the LLM
# as the "system" message, so it must not contain any stray separator lines;
# the fenced JSON example in particular has to stay parseable.
ONTOLOGY_SYSTEM_PROMPT = """你是一个专业的知识图谱本体设计专家。你的任务是分析给定的文本内容和模拟需求,设计适合**社交媒体舆论模拟**的实体类型和关系类型。

**重要:你必须输出有效的JSON格式数据,不要输出任何其他内容。**

## 核心任务背景

我们正在构建一个**社交媒体舆论模拟系统**。在这个系统中:
- 每个实体都是一个可以在社交媒体上发声、互动、传播信息的"账号"或"主体"
- 实体之间会相互影响、转发、评论、回应
- 我们需要模拟舆论事件中各方的反应和信息传播路径

因此,**实体必须是现实中真实存在的、可以在社媒上发声和互动的主体**:

**可以是(鼓励多样化划分)**:
- 具体的个人(公众人物、当事人、意见领袖、专家学者)
- 公司、企业(包括其官方账号)
- 组织机构(大学、协会、NGO、工会等)
- 政府部门、监管机构
- 媒体机构(报纸、电视台、自媒体、网站)
- 社交媒体平台本身
- 特定群体代表(如校友会、粉丝团、维权群体等)

**不可以是**:
- 抽象概念(如"舆论"、"情绪"、"趋势")
- 主题/话题(如"学术诚信"、"教育改革")
- 观点/态度(如"支持方"、"反对方")
- 泛指群体(如"网友"、"公众"、"学生群体")

## 输出格式

请输出JSON格式,包含以下结构:

```json
{
    "entity_types": [
        {
            "name": "实体类型名称(英文,PascalCase)",
            "description": "简短描述(英文,不超过100字符)",
            "attributes": [
                {
                    "name": "属性名(英文,snake_case)",
                    "type": "text",
                    "description": "属性描述"
                }
            ],
            "examples": ["示例实体1", "示例实体2"]
        }
    ],
    "edge_types": [
        {
            "name": "关系类型名称(英文,UPPER_SNAKE_CASE)",
            "description": "简短描述(英文,不超过100字符)",
            "source_targets": [
                {"source": "源实体类型", "target": "目标实体类型"}
            ],
            "attributes": []
        }
    ],
    "analysis_summary": "对文本内容的简要分析说明(中文)"
}
```

## 设计指南

1. **实体类型设计(重要!请尽量多划分)**:
   - **数量要求:至少5个,最多10个实体类型**
   - 每个实体类型代表一类可以在社媒上发声的主体
   - 尽量细分不同角色,例如:
     - 不要只用"Person",可以细分为"PublicFigure"、"Expert"、"Whistleblower"等
     - 不要只用"Organization",可以细分为"University"、"Company"、"NGO"等
   - description必须清晰说明什么样的实体应该被提取
   - 每个类型提供2-3个具体示例

2. **关系类型设计**:
   - 关系应该反映社媒互动中的真实联系
   - 关注可能影响舆论传播的关系:
     - 信息传播:REPORTS_ON, COMMENTS_ON, SHARES
     - 组织关系:AFFILIATED_WITH, WORKS_FOR, REPRESENTS
     - 互动关系:RESPONDS_TO, SUPPORTS, OPPOSES
   - 关系类型:5-10个为宜

3. **属性设计**:
   - 每个实体类型1-3个关键属性
   - 属性应有助于识别实体的社媒影响力(如role、influence_level等)

## 实体类型参考(请根据文本内容灵活选择和扩展)

- Person: 普通个人
- PublicFigure: 公众人物(明星、网红、意见领袖)
- Expert: 专家学者
- Journalist: 记者
- Company: 公司企业
- University: 高校
- GovernmentAgency: 政府机构
- MediaOutlet: 传统媒体
- SelfMedia: 自媒体账号
- SocialPlatform: 社交媒体平台
- NGO: 非政府组织
- IndustryAssociation: 行业协会
- AlumniAssociation: 校友会
- FanGroup: 粉丝群体/支持群体

## 关系类型参考

- WORKS_FOR: 工作于
- AFFILIATED_WITH: 隶属于
- REPRESENTS: 代表
- REGULATES: 监管
- REPORTS_ON: 报道
- COMMENTS_ON: 评论
- RESPONDS_TO: 回应
- SUPPORTS: 支持
- OPPOSES: 反对
- COLLABORATES_WITH: 合作
- COMPETES_WITH: 竞争
"""
|
||
|
||
|
||
class OntologyGenerator:
    """Generate an ontology (entity and edge type definitions) from text.

    The generator sends the documents and the simulation requirement to the
    LLM client, normalizes the JSON object it returns, and can render an
    ontology as Python source code.
    """

    # Maximum number of characters of combined document text sent to the LLM.
    MAX_TEXT_LENGTH_FOR_LLM = 50000

    # Longest entity/edge description kept by ``_validate_and_process``.
    _MAX_DESCRIPTION_LENGTH = 100

    def __init__(self, llm_client: Optional["LLMClient"] = None):
        """Store the LLM client, creating a default ``LLMClient`` if omitted.

        The annotation is quoted so the class definition itself does not need
        ``LLMClient`` to be resolvable at definition time.
        """
        self.llm_client = llm_client or LLMClient()

    def generate(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str] = None
    ) -> Dict[str, Any]:
        """Generate an ontology definition.

        Args:
            document_texts: List of document texts.
            simulation_requirement: Description of the simulation requirement.
            additional_context: Optional extra context appended to the prompt.

        Returns:
            The ontology definition returned by the LLM after normalization
            (``entity_types``, ``edge_types``, ``analysis_summary``).
        """
        user_message = self._build_user_message(
            document_texts,
            simulation_requirement,
            additional_context
        )

        messages = [
            {"role": "system", "content": ONTOLOGY_SYSTEM_PROMPT},
            {"role": "user", "content": user_message}
        ]

        result = self.llm_client.chat_json(
            messages=messages,
            temperature=0.3,
            max_tokens=4096
        )

        return self._validate_and_process(result)

    def _build_user_message(
        self,
        document_texts: List[str],
        simulation_requirement: str,
        additional_context: Optional[str]
    ) -> str:
        """Build the user message sent to the LLM.

        The documents are joined with a ``---`` separator. If the combined
        text is longer than ``MAX_TEXT_LENGTH_FOR_LLM`` characters, only the
        leading part is kept and a note stating the original length is
        appended. The truncation only affects the text placed in this message.
        """
        combined_text = "\n\n---\n\n".join(document_texts)
        original_length = len(combined_text)

        if original_length > self.MAX_TEXT_LENGTH_FOR_LLM:
            combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM]
            combined_text += f"\n\n...(原文共{original_length}字,已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..."

        message = (
            "## 模拟需求\n"
            "\n"
            f"{simulation_requirement}\n"
            "\n"
            "## 文档内容\n"
            "\n"
            f"{combined_text}\n"
        )

        if additional_context:
            message += (
                "\n"
                "## 额外说明\n"
                "\n"
                f"{additional_context}\n"
            )

        message += (
            "\n"
            "请根据以上内容,设计适合社会舆论模拟的实体类型和关系类型。\n"
            "记住:所有实体类型必须是现实中可以发声的主体,不能是抽象概念。\n"
        )

        return message

    def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Fill in missing fields and cap description lengths, in place.

        A field counts as missing when the key is absent or its value is
        ``None`` (a JSON ``null``), so a null from the LLM no longer raises
        ``TypeError``.

        Args:
            result: The JSON object returned by the LLM.

        Returns:
            The same ``result`` dict, mutated.
        """
        if result.get("entity_types") is None:
            result["entity_types"] = []
        if result.get("edge_types") is None:
            result["edge_types"] = []
        if result.get("analysis_summary") is None:
            result["analysis_summary"] = ""

        for entity in result["entity_types"]:
            self._normalize_definition(entity, ("attributes", "examples"))

        for edge in result["edge_types"]:
            self._normalize_definition(edge, ("source_targets", "attributes"))

        return result

    @classmethod
    def _normalize_definition(cls, definition: Dict[str, Any], list_fields) -> None:
        """Default the given list fields to ``[]`` and truncate the description."""
        for field_name in list_fields:
            if definition.get(field_name) is None:
                definition[field_name] = []

        description = definition.get("description")
        limit = cls._MAX_DESCRIPTION_LENGTH
        # Non-string descriptions (e.g. None) are left untouched rather than
        # crashing on len(); the code generator falls back to a default text.
        if isinstance(description, str) and len(description) > limit:
            definition["description"] = description[:limit - 3] + "..."

    @staticmethod
    def _edge_class_name(edge_name: str) -> str:
        """Convert an UPPER_SNAKE_CASE edge name to a PascalCase class name."""
        return ''.join(word.capitalize() for word in edge_name.split('_'))

    @staticmethod
    def _docstring_literal(text: Any) -> str:
        """Return ``text`` as a triple-quoted literal that always parses."""
        # Escape backslashes first, then every double quote, so the text can
        # neither terminate the literal early nor form an escape sequence.
        escaped = str(text).replace('\\', '\\\\').replace('"', '\\"')
        return f'"""{escaped}"""'

    @staticmethod
    def _string_literal(text: Any) -> str:
        """Return ``text`` as a double-quoted string literal.

        ``json.dumps`` escapes quotes, backslashes and control characters
        using escape sequences that are also valid in Python source.
        """
        return json.dumps(str(text), ensure_ascii=False)

    @classmethod
    def _model_class_lines(
        cls,
        class_name: str,
        base_class: str,
        description: Any,
        attributes: List[Dict[str, Any]]
    ) -> List[str]:
        """Render one model class (header, docstring, fields) as code lines."""
        lines = [
            f'class {class_name}({base_class}):',
            f'    {cls._docstring_literal(description)}',
        ]

        if attributes:
            for attr in attributes:
                attr_name = attr["name"]
                attr_desc = attr.get("description") or attr_name
                lines.append(f'    {attr_name}: EntityText = Field(')
                lines.append(f'        description={cls._string_literal(attr_desc)},')
                lines.append('        default=None')
                lines.append('    )')
        else:
            lines.append('    pass')

        # Two blank lines after every class.
        lines.append('')
        lines.append('')
        return lines

    def generate_python_code(self, ontology: Dict[str, Any]) -> str:
        """Convert an ontology definition to Python source code.

        The output defines one ``EntityModel`` subclass per entity type, one
        ``EdgeModel`` subclass per edge type, and the ``ENTITY_TYPES``,
        ``EDGE_TYPES`` and ``EDGE_SOURCE_TARGETS`` dictionaries.

        Descriptions and dictionary keys are emitted as escaped string
        literals, so quotes, backslashes or newlines in LLM-produced text
        cannot break the generated module. Entity and attribute names are
        emitted verbatim as identifiers.

        Args:
            ontology: The ontology definition.

        Returns:
            The generated Python code as a single string.

        Raises:
            KeyError: If an entity, edge or attribute has no ``"name"``.
        """
        entity_types = ontology.get("entity_types") or []
        edge_types = ontology.get("edge_types") or []

        code_lines = [
            '"""',
            '自定义实体类型定义',
            '由MiroFish自动生成,用于社会舆论模拟',
            '"""',
            '',
            'from pydantic import Field',
            'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel',
            '',
            '',
            '# ============== 实体类型定义 ==============',
            '',
        ]

        # Entity classes.
        for entity in entity_types:
            name = entity["name"]
            code_lines.extend(self._model_class_lines(
                name,
                'EntityModel',
                entity.get("description") or f"A {name} entity.",
                entity.get("attributes") or [],
            ))

        code_lines.append('# ============== 关系类型定义 ==============')
        code_lines.append('')

        # Edge classes; the class name is the PascalCase form of the edge name.
        for edge in edge_types:
            name = edge["name"]
            code_lines.extend(self._model_class_lines(
                self._edge_class_name(name),
                'EdgeModel',
                edge.get("description") or f"A {name} relationship.",
                edge.get("attributes") or [],
            ))

        # Lookup dictionaries from type name to generated class.
        code_lines.append('# ============== 类型配置 ==============')
        code_lines.append('')
        code_lines.append('ENTITY_TYPES = {')
        for entity in entity_types:
            name = entity["name"]
            code_lines.append(f'    {self._string_literal(name)}: {name},')
        code_lines.append('}')
        code_lines.append('')
        code_lines.append('EDGE_TYPES = {')
        for edge in edge_types:
            name = edge["name"]
            code_lines.append(
                f'    {self._string_literal(name)}: {self._edge_class_name(name)},'
            )
        code_lines.append('}')
        code_lines.append('')

        # Source/target mapping; edges without source_targets are omitted.
        code_lines.append('EDGE_SOURCE_TARGETS = {')
        for edge in edge_types:
            source_targets = edge.get("source_targets") or []
            if not source_targets:
                continue
            st_list = ', '.join(
                '{"source": %s, "target": %s}' % (
                    self._string_literal(st.get("source") or "Entity"),
                    self._string_literal(st.get("target") or "Entity"),
                )
                for st in source_targets
            )
            code_lines.append(f'    {self._string_literal(edge["name"])}: [{st_list}],')
        code_lines.append('}')

        return '\n'.join(code_lines)
|
||
|