feat: 人设模板系统、节点调度标签、pydantic-settings收敛、错误处理增强

新增persona_template表和CRUD API，BaseIndividualModel增加node_affinity和template_origin_id字段，WorkerCluster支持多集群Ray资源调度，环境变量收敛到pydantic-settings统一校验，数据库异常转换为结构化BusinessError/RetryableError，系统节点支持custom_system_prompt。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-04 06:07:46 +00:00
parent f3a92a793e
commit 8f1398c591
23 changed files with 582 additions and 48 deletions
@@ -36,10 +36,15 @@ class WorkerCluster:
    """
    工作集群 Actor：管理和调度所有的 worker_individual
    设计理念：按需加载，内存 LRU 淘汰，避免 Actor 爆炸
+
+    分布式模式下每种 node_type 对应一个独立实例，Ray 根据自定义资源
+    ``kilostar_node_cpu`` / ``kilostar_node_core`` / ``kilostar_node_gpu``
+    将 Actor 调度到声明了对应资源的节点上。
    """

-    def __init__(self, max_capacity: int = 200, num_runners: int = 10):
+    def __init__(self, max_capacity: int = 200, num_runners: int = 10, node_type: str = "cpu"):
        self.max_capacity = max_capacity
+        self.node_type = node_type
        self._active_workers: OrderedDict[str, BaseIndividual] = OrderedDict()
        self.status = "running"
        self.task_queue = None
@@ -76,6 +81,8 @@ class WorkerCluster:
            raise ValueError(f"无法唤醒 Agent {agent_id}：数据库中不存在该档案")

        worker_type = agent_config.get("type", "ordinary")
+        node_affinity = agent_config.get("node_affinity", "cpu")
+        self.logger.debug(f"[WorkerCluster] 唤醒 Agent {agent_id}, node_affinity={node_affinity}")
        if worker_type == "skill":
            worker = SkillIndividual(agent_config)
        elif worker_type == "special":