""" 本体生成服务 接口1:分析文本内容,生成适合社会模拟的实体和关系类型定义 """ import json from typing import Dict, Any, List, Optional from ..utils.llm_client import LLMClient # System prompt for ontology generation ONTOLOGY_SYSTEM_PROMPT = """You are a professional knowledge graph ontology design expert. Your task is to analyze the given text content and simulation requirements, and design entity types and relationship types suitable for **social media public opinion simulation**. **Important: You must output valid JSON format data. Do not output anything else.** ## Core Task Background We are building a **social media public opinion simulation system**. In this system: - Each entity is an "account" or "actor" that can post, interact, and spread information on social media - Entities influence each other through reposts, comments, and responses - We need to simulate the reactions and information propagation paths of various parties in public opinion events Therefore, **entities must be real-world actors that can post and interact on social media**: **Can be**: - Specific individuals (public figures, parties involved, opinion leaders, experts, ordinary people) - Companies and enterprises (including their official accounts) - Organizations (universities, associations, NGOs, unions, etc.) - Government departments and regulatory agencies - Media organizations (newspapers, TV stations, self-media, websites) - Social media platforms themselves - Representatives of specific groups (e.g., alumni associations, fan groups, advocacy groups, etc.) **Cannot be**: - Abstract concepts (e.g., "public opinion", "emotion", "trend") - Topics/themes (e.g., "academic integrity", "education reform") - Viewpoints/attitudes (e.g., "supporters", "opponents") ## Output Format Please output in JSON format with the following structure: ```json { "entity_types": [ { "name": "Entity type name (English, PascalCase)", "description": "Brief description (English, no more than 100 characters)", "attributes": [ { "name": "Attribute name (English, snake_case)", "type": "text", "description": "Attribute description" } ], "examples": ["Example entity 1", "Example entity 2"] } ], "edge_types": [ { "name": "Relationship type name (English, UPPER_SNAKE_CASE)", "description": "Brief description (English, no more than 100 characters)", "source_targets": [ {"source": "Source entity type", "target": "Target entity type"} ], "attributes": [] } ], "analysis_summary": "Brief analysis of the text content" } ``` ## Design Guidelines (Extremely Important!) ### 1. Entity Type Design - Must Be Strictly Followed **Quantity requirement: Exactly 10 entity types** **Hierarchy requirements (must include both specific types and fallback types)**: Your 10 entity types must include the following levels: A. **Fallback types (must be included, placed as the last 2 in the list)**: - `Person`: Fallback type for any individual person. When a person doesn't fit other more specific person types, they go here. - `Organization`: Fallback type for any organization. When an organization doesn't fit other more specific organization types, it goes here. B. **Specific types (8 types, designed based on text content)**: - Design more specific types for the main roles appearing in the text - For example: if the text involves academic events, you can have `Student`, `Professor`, `University` - For example: if the text involves business events, you can have `Company`, `CEO`, `Employee` **Why fallback types are needed**: - Various people appear in the text, such as "elementary school teachers", "bystanders", "random netizens" - If there is no specific type match, they should be classified under `Person` - Similarly, small organizations, temporary groups, etc. should be classified under `Organization` **Design principles for specific types**: - Identify high-frequency or key role types from the text - Each specific type should have clear boundaries to avoid overlap - The description must clearly explain how this type differs from the fallback type ### 2. Relationship Type Design - Quantity: 6-10 - Relationships should reflect real connections in social media interactions - Ensure the source_targets of relationships cover the entity types you defined ### 3. Attribute Design - 1-3 key attributes per entity type - **Note**: Attribute names cannot use `name`, `uuid`, `group_id`, `created_at`, `summary` (these are system reserved words) - Recommended: `full_name`, `title`, `role`, `position`, `location`, `description`, etc. ## Entity Type Reference **Individual types (specific)**: - Student: Student - Professor: Professor/Scholar - Journalist: Journalist - Celebrity: Celebrity/Influencer - Executive: Executive - Official: Government official - Lawyer: Lawyer - Doctor: Doctor **Individual types (fallback)**: - Person: Any individual (used when not fitting the above specific types) **Organization types (specific)**: - University: University - Company: Company/Enterprise - GovernmentAgency: Government agency - MediaOutlet: Media organization - Hospital: Hospital - School: School - NGO: Non-governmental organization **Organization types (fallback)**: - Organization: Any organization (used when not fitting the above specific types) ## Relationship Type Reference - WORKS_FOR: Works for - STUDIES_AT: Studies at - AFFILIATED_WITH: Affiliated with - REPRESENTS: Represents - REGULATES: Regulates - REPORTS_ON: Reports on - COMMENTS_ON: Comments on - RESPONDS_TO: Responds to - SUPPORTS: Supports - OPPOSES: Opposes - COLLABORATES_WITH: Collaborates with - COMPETES_WITH: Competes with """ class OntologyGenerator: """ 本体生成器 分析文本内容,生成实体和关系类型定义 """ def __init__(self, llm_client: Optional[LLMClient] = None): self.llm_client = llm_client or LLMClient() def generate( self, document_texts: List[str], simulation_requirement: str, additional_context: Optional[str] = None ) -> Dict[str, Any]: """ 生成本体定义 Args: document_texts: 文档文本列表 simulation_requirement: 模拟需求描述 additional_context: 额外上下文 Returns: 本体定义(entity_types, edge_types等) """ # 构建用户消息 user_message = self._build_user_message( document_texts, simulation_requirement, additional_context ) messages = [ {"role": "system", "content": ONTOLOGY_SYSTEM_PROMPT}, {"role": "user", "content": user_message} ] # 调用LLM result = self.llm_client.chat_json( messages=messages, temperature=0.3, max_tokens=4096 ) # 验证和后处理 result = self._validate_and_process(result) return result # 传给 LLM 的文本最大长度(5万字) MAX_TEXT_LENGTH_FOR_LLM = 50000 def _build_user_message( self, document_texts: List[str], simulation_requirement: str, additional_context: Optional[str] ) -> str: """构建用户消息""" # 合并文本 combined_text = "\n\n---\n\n".join(document_texts) original_length = len(combined_text) # 如果文本超过5万字,截断(仅影响传给LLM的内容,不影响图谱构建) if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM: combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM] combined_text += f"\n\n...(Original text: {original_length} chars, truncated to first {self.MAX_TEXT_LENGTH_FOR_LLM} chars for ontology analysis)..." message = f"""## Simulation Requirements {simulation_requirement} ## Document Content {combined_text} """ if additional_context: message += f""" ## Additional Notes {additional_context} """ message += """ Based on the above content, design entity types and relationship types suitable for social media public opinion simulation. **Mandatory rules**: 1. Output exactly 10 entity types 2. The last 2 must be fallback types: Person (individual fallback) and Organization (organization fallback) 3. The first 8 are specific types designed based on the text content 4. All entity types must be real-world actors that can post on social media, not abstract concepts 5. Attribute names cannot use reserved words like name, uuid, group_id — use full_name, org_name, etc. instead """ return message def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: """验证和后处理结果""" # 确保必要字段存在 if "entity_types" not in result: result["entity_types"] = [] if "edge_types" not in result: result["edge_types"] = [] if "analysis_summary" not in result: result["analysis_summary"] = "" # 验证实体类型 for entity in result["entity_types"]: if "attributes" not in entity: entity["attributes"] = [] if "examples" not in entity: entity["examples"] = [] # 确保description不超过100字符 if len(entity.get("description", "")) > 100: entity["description"] = entity["description"][:97] + "..." # 验证关系类型 for edge in result["edge_types"]: if "source_targets" not in edge: edge["source_targets"] = [] if "attributes" not in edge: edge["attributes"] = [] if len(edge.get("description", "")) > 100: edge["description"] = edge["description"][:97] + "..." # Zep API 限制:最多 10 个自定义实体类型,最多 10 个自定义边类型 MAX_ENTITY_TYPES = 10 MAX_EDGE_TYPES = 10 # 兜底类型定义 person_fallback = { "name": "Person", "description": "Any individual person not fitting other specific person types.", "attributes": [ {"name": "full_name", "type": "text", "description": "Full name of the person"}, {"name": "role", "type": "text", "description": "Role or occupation"} ], "examples": ["ordinary citizen", "anonymous netizen"] } organization_fallback = { "name": "Organization", "description": "Any organization not fitting other specific organization types.", "attributes": [ {"name": "org_name", "type": "text", "description": "Name of the organization"}, {"name": "org_type", "type": "text", "description": "Type of organization"} ], "examples": ["small business", "community group"] } # 检查是否已有兜底类型 entity_names = {e["name"] for e in result["entity_types"]} has_person = "Person" in entity_names has_organization = "Organization" in entity_names # 需要添加的兜底类型 fallbacks_to_add = [] if not has_person: fallbacks_to_add.append(person_fallback) if not has_organization: fallbacks_to_add.append(organization_fallback) if fallbacks_to_add: current_count = len(result["entity_types"]) needed_slots = len(fallbacks_to_add) # 如果添加后会超过 10 个,需要移除一些现有类型 if current_count + needed_slots > MAX_ENTITY_TYPES: # 计算需要移除多少个 to_remove = current_count + needed_slots - MAX_ENTITY_TYPES # 从末尾移除(保留前面更重要的具体类型) result["entity_types"] = result["entity_types"][:-to_remove] # 添加兜底类型 result["entity_types"].extend(fallbacks_to_add) # 最终确保不超过限制(防御性编程) if len(result["entity_types"]) > MAX_ENTITY_TYPES: result["entity_types"] = result["entity_types"][:MAX_ENTITY_TYPES] if len(result["edge_types"]) > MAX_EDGE_TYPES: result["edge_types"] = result["edge_types"][:MAX_EDGE_TYPES] return result def generate_python_code(self, ontology: Dict[str, Any]) -> str: """ 将本体定义转换为Python代码(类似ontology.py) Args: ontology: 本体定义 Returns: Python代码字符串 """ code_lines = [ '"""', '自定义实体类型定义', '由MiroFish自动生成,用于社会舆论模拟', '"""', '', 'from pydantic import Field', 'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel', '', '', '# ============== 实体类型定义 ==============', '', ] # 生成实体类型 for entity in ontology.get("entity_types", []): name = entity["name"] desc = entity.get("description", f"A {name} entity.") code_lines.append(f'class {name}(EntityModel):') code_lines.append(f' """{desc}"""') attrs = entity.get("attributes", []) if attrs: for attr in attrs: attr_name = attr["name"] attr_desc = attr.get("description", attr_name) code_lines.append(f' {attr_name}: EntityText = Field(') code_lines.append(f' description="{attr_desc}",') code_lines.append(f' default=None') code_lines.append(f' )') else: code_lines.append(' pass') code_lines.append('') code_lines.append('') code_lines.append('# ============== 关系类型定义 ==============') code_lines.append('') # 生成关系类型 for edge in ontology.get("edge_types", []): name = edge["name"] # 转换为PascalCase类名 class_name = ''.join(word.capitalize() for word in name.split('_')) desc = edge.get("description", f"A {name} relationship.") code_lines.append(f'class {class_name}(EdgeModel):') code_lines.append(f' """{desc}"""') attrs = edge.get("attributes", []) if attrs: for attr in attrs: attr_name = attr["name"] attr_desc = attr.get("description", attr_name) code_lines.append(f' {attr_name}: EntityText = Field(') code_lines.append(f' description="{attr_desc}",') code_lines.append(f' default=None') code_lines.append(f' )') else: code_lines.append(' pass') code_lines.append('') code_lines.append('') # 生成类型字典 code_lines.append('# ============== 类型配置 ==============') code_lines.append('') code_lines.append('ENTITY_TYPES = {') for entity in ontology.get("entity_types", []): name = entity["name"] code_lines.append(f' "{name}": {name},') code_lines.append('}') code_lines.append('') code_lines.append('EDGE_TYPES = {') for edge in ontology.get("edge_types", []): name = edge["name"] class_name = ''.join(word.capitalize() for word in name.split('_')) code_lines.append(f' "{name}": {class_name},') code_lines.append('}') code_lines.append('') # 生成边的source_targets映射 code_lines.append('EDGE_SOURCE_TARGETS = {') for edge in ontology.get("edge_types", []): name = edge["name"] source_targets = edge.get("source_targets", []) if source_targets: st_list = ', '.join([ f'{{"source": "{st.get("source", "Entity")}", "target": "{st.get("target", "Entity")}"}}' for st in source_targets ]) code_lines.append(f' "{name}": [{st_list}],') code_lines.append('}') return '\n'.join(code_lines)