# Copyright 2026 zhaoxi826
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from collections import OrderedDict

import ray

# NOTE(review): BaseWorkerIndividual, OrdinaryIndividual, SkillIndividual and
# SpecialIndividual are used below, but no import for them is visible in this
# file. Add the project-local import before this module is used.


@ray.remote
class WorkerCluster:
    """Worker-cluster actor that manages and dispatches worker individuals.

    Design: workers are built on demand and kept in an in-memory LRU pool
    bounded by ``max_capacity``, so the number of live worker objects does
    not grow without limit.
    """

    def __init__(self, db_actor, max_capacity: int = 200) -> None:
        """Store the database handle and create an empty LRU pool.

        Args:
            db_actor: Handle used to look up agent configuration. It is only
                stored here; the lookup call is currently stubbed out.
            max_capacity: Maximum number of workers kept in the pool before
                the least recently used one is evicted.
        """
        self.db = db_actor
        self.max_capacity = max_capacity
        # Core structure: LRU pool of active agents, oldest entry first.
        self._active_workers: OrderedDict[str, "BaseWorkerIndividual"] = OrderedDict()
        self.status = "running"

    async def _recruit_worker(self, agent_id: str) -> "BaseWorkerIndividual":
        """Return the worker for ``agent_id``, building it on a cache miss.

        Args:
            agent_id: Identifier of the agent to wake up.

        Returns:
            The cached or newly constructed worker object.

        Raises:
            ValueError: If no configuration is available for ``agent_id``.
        """
        # 1. Cache hit: mark the entry as most recently used and return it.
        if agent_id in self._active_workers:
            self._active_workers.move_to_end(agent_id)
            return self._active_workers[agent_id]

        # 2. Cache miss: fetch the agent profile.
        # TODO: replace the mock below with the real lookup, e.g.
        #   agent_config = await self.db.get_agent_config.remote(agent_id)
        agent_config = {
            "agent_id": agent_id,
            "type": "skill",  # one of: ordinary, skill, special
            "prompt": "你是一个资深架构师...",
        }

        # Always false with the mock above; kept for the real lookup, which
        # may return nothing for an unknown agent.
        if not agent_config:
            raise ValueError(f"无法唤醒 Agent {agent_id}:数据库中不存在该档案")

        # 3. Factory: choose the worker class from the configured type,
        #    defaulting to the ordinary worker for missing/unknown types.
        worker_type = agent_config.get("type", "ordinary")
        if worker_type == "skill":
            worker = SkillIndividual(agent_config)
        elif worker_type == "special":
            worker = SpecialIndividual(agent_config)
        else:
            worker = OrdinaryIndividual(agent_config)

        # 4. Add to the pool; if that exceeds capacity, evict the least
        #    recently used entry (the front of the OrderedDict).
        self._active_workers[agent_id] = worker
        if len(self._active_workers) > self.max_capacity:
            evicted_id, _ = self._active_workers.popitem(last=False)
            print(f"[WorkerCluster] 内存池满,休眠老化 Agent: {evicted_id}")

        return worker

    async def execute_task(self, agent_id: str, task_event: dict) -> dict:
        """Run ``task_event`` on the worker for ``agent_id``.

        This is the single public work entry point. ``task_event`` is passed
        unchanged to the worker's ``run`` coroutine; per the original
        author's note it should carry all context (history, memory and the
        current instruction), because ``run`` is expected to be stateless.

        Args:
            agent_id: Identifier of the agent that should handle the task.
            task_event: Task payload forwarded to ``worker.run``.

        Returns:
            On success: ``{"success": True, "agent_id", "data", "metrics"}``
            where ``metrics["cost_time_sec"]`` is the run time rounded to two
            decimals. On failure: ``{"success": False, "agent_id", "error",
            "error_type"}``. Exceptions derived from ``Exception`` are never
            propagated to the caller.
        """
        try:
            # 1. Get the worker (cache hit or fresh construction).
            worker = await self._recruit_worker(agent_id)

            # 2. Run the task. perf_counter() is monotonic, so the measured
            #    duration is not affected by wall-clock adjustments.
            start_time = time.perf_counter()
            result = await worker.run(task_event)
            cost_time = time.perf_counter() - start_time

            # 3. Wrap the result in the standard response envelope.
            return {
                "success": True,
                "agent_id": agent_id,
                "data": result,
                "metrics": {"cost_time_sec": round(cost_time, 2)},
            }
        except Exception as e:
            # Fault isolation: one failing agent must not take down the
            # cluster. str(e) is empty for exceptions raised without a
            # message, so fall back to repr(e) and also report the type.
            return {
                "success": False,
                "agent_id": agent_id,
                "error": str(e) or repr(e),
                "error_type": type(e).__name__,
            }

    def get_cluster_metrics(self) -> dict:
        """Return a monitoring snapshot of the current pool.

        Returns:
            A dict with the number of cached workers, the configured
            capacity, and the cached agent ids ordered from least to most
            recently used.
        """
        return {
            "active_worker_count": len(self._active_workers),
            "max_capacity": self.max_capacity,
            "cached_agent_ids": list(self._active_workers.keys()),
        }