6d658b4f4d
- 工具系统从 kilostar/plugin/tool_plugin/ 迁移到 data/toolset/(manifest.json 声明式) - 新增 plugin_runtime 模块:BaseOrganization / GlobalPluginManager / loader / tool_bridge - 新增 org_task + org_task_event 表及 DAO(alembic 0009) - 新增 /api/v1/plugin 路由(submit/status/stream/install/reload) - 新增 data/plugin/example_dept 示例重型插件 - regulatory_node 支持聊天历史上下文注入 - send_file 改为 artifact 存盘 + SSE 推送下载链接 - 前端 WorkflowFileCard 组件 + ToolSettings README 渲染 - utils 整理:合并 access/role_check、standalone_proxy→ray_compat、删除废弃模块 - 项目结构文档移至 docs/STRUCTURE.md 并详细展开 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
173 lines
7.1 KiB
Python
173 lines
7.1 KiB
Python
# Copyright 2026 zhaoxi826
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import time
|
|
import asyncio
|
|
from collections import OrderedDict
|
|
from kilostar.utils.ray_compat import actor_class, _STANDALONE
|
|
from kilostar.utils.ray_hook import ray_actor_hook
|
|
|
|
if _STANDALONE:
|
|
from asyncio import Queue
|
|
else:
|
|
from ray.util.queue import Queue
|
|
from kilostar.worker_individual.base_individual import BaseIndividual
|
|
from kilostar.worker_individual.skill_individual import SkillIndividual
|
|
from kilostar.worker_individual.ordinary_individual import OrdinaryIndividual
|
|
from kilostar.worker_individual.special_individual import SpecialIndividual
|
|
|
|
|
|
from kilostar.utils.logger import get_logger
|
|
|
|
|
|
@actor_class
class WorkerCluster:
    """Worker-cluster actor that manages and schedules every worker_individual.

    Design: workers are loaded on demand and kept in an in-memory LRU pool so
    the number of live agent objects stays bounded (no "actor explosion").

    In distributed mode each ``node_type`` gets its own instance; Ray uses the
    custom resources ``kilostar_node_cpu`` / ``kilostar_node_core`` /
    ``kilostar_node_gpu`` to place the actor on a node that declares the
    matching resource.
    """

    def __init__(self, max_capacity: int = 200, num_runners: int = 10, node_type: str = "cpu"):
        """Set up an idle cluster; the queue and runners are created by ``start``.

        Args:
            max_capacity: Maximum number of workers kept in the LRU pool.
            num_runners: Number of runner coroutines spawned by ``start``.
            node_type: Node type this instance is associated with (stored only).
        """
        self.max_capacity = max_capacity
        self.node_type = node_type
        # Ordering doubles as recency: the first entry is the least recently used.
        self._active_workers: OrderedDict[str, BaseIndividual] = OrderedDict()
        self.status = "running"
        # Created lazily in ``start``.
        self.task_queue = None
        # task_id -> future resolved by a runner with the response dict.
        self.results_futures = {}
        self.runners = []
        self.num_runners = num_runners
        self.logger = get_logger("worker_cluster")

    async def start(self):
        """Create the task queue if needed and spawn the runner coroutines.

        Calling this again while runners already exist is a no-op, so a second
        call cannot spawn a duplicate runner pool.
        """
        if self.runners:
            return
        if self.task_queue is None:
            self.task_queue = Queue()
        self.runners = [
            asyncio.create_task(self._runner(i)) for i in range(self.num_runners)
        ]
        self.logger.info(f"WorkerCluster 已启动 {self.num_runners} 个 runner 协程。")

    async def _get_task(self):
        """Take the next task from the queue using the API of the active queue type."""
        if _STANDALONE:
            return await self.task_queue.get()
        return await self.task_queue.get_async()

    async def _put_task(self, task: dict):
        """Append ``task`` to the queue using the API of the active queue type."""
        if _STANDALONE:
            await self.task_queue.put(task)
        else:
            await self.task_queue.put_async(task)

    def _queue_size(self) -> int:
        """Return the number of queued tasks, or 0 when the queue is not created yet."""
        if self.task_queue is None:
            return 0
        if _STANDALONE:
            return self.task_queue.qsize()
        return self.task_queue.size()

    async def _recruit_worker(self, agent_id: str) -> BaseIndividual:
        """Return the worker object for ``agent_id``, building it on a cache miss.

        A cache hit refreshes the entry's LRU position. On a miss the agent
        config is read from the global-state-machine snapshot, the matching
        individual class is instantiated, and the least recently used entry is
        dropped when the pool exceeds ``max_capacity``.

        Raises:
            ValueError: If the snapshot holds no config for ``agent_id``.
        """
        if agent_id in self._active_workers:
            self._active_workers.move_to_end(agent_id)
            return self._active_workers[agent_id]

        from kilostar.core.global_state_machine.gsm_snapshot import fetch_snapshot

        global_state_machine = ray_actor_hook(
            "global_state_machine"
        ).global_state_machine
        # Read through the snapshot instead of a GSM actor RPC so the
        # high-frequency wake-up path is not bottlenecked on a single actor.
        snapshot = await fetch_snapshot(gsm_actor=global_state_machine)
        agent_config = snapshot.individuals.get(agent_id)

        if not agent_config:
            raise ValueError(f"无法唤醒 Agent {agent_id}:数据库中不存在该档案")

        # Another runner may have cached this agent while we were awaiting the
        # snapshot; reuse that object instead of building and overwriting it.
        if agent_id in self._active_workers:
            self._active_workers.move_to_end(agent_id)
            return self._active_workers[agent_id]

        worker_type = agent_config.get("type", "ordinary")
        node_affinity = agent_config.get("node_affinity", "cpu")
        self.logger.debug(f"[WorkerCluster] 唤醒 Agent {agent_id}, node_affinity={node_affinity}")
        if worker_type == "skill":
            worker = SkillIndividual(agent_config)
        elif worker_type == "special":
            worker = SpecialIndividual(agent_config)
        else:
            worker = OrdinaryIndividual(agent_config)

        self._active_workers[agent_id] = worker
        if len(self._active_workers) > self.max_capacity:
            # popitem(last=False) removes the oldest (least recently used) entry.
            evicted_id, _ = self._active_workers.popitem(last=False)
            self.logger.info(f"[WorkerCluster] 内存池满,休眠老化 Agent: {evicted_id}")

        return worker

    async def _runner(self, runner_id: int):
        """Runner loop: take a task, run it on its agent's worker, resolve the future.

        Errors raised while recruiting or running the worker are turned into a
        ``{"success": False, ...}`` response; any other error in the loop is
        logged and followed by a one-second pause before the next iteration.
        """
        while True:
            try:
                if self.task_queue is None:
                    # Queue not created yet; poll until ``start`` has run.
                    await asyncio.sleep(0.1)
                    continue
                task = await self._get_task()
                task_id = task.get("task_id")
                agent_id = task.get("agent_id")
                task_event = task.get("task_event")

                self.logger.debug(
                    f"[WorkerCluster Runner {runner_id}] 开始处理任务 {task_id} 给 Agent {agent_id}"
                )
                # perf_counter is monotonic, so the duration is immune to
                # wall-clock adjustments.
                start_time = time.perf_counter()

                try:
                    worker = await self._recruit_worker(agent_id)
                    result = await worker.run(task_event)
                    cost_time = time.perf_counter() - start_time

                    response = {
                        "success": True,
                        "agent_id": agent_id,
                        "data": result,
                        "metrics": {"cost_time_sec": round(cost_time, 2)},
                    }
                except Exception as e:
                    self.logger.exception(
                        f"[WorkerCluster Runner {runner_id}] 执行任务 {task_id} 时发生错误: {e}"
                    )
                    response = {"success": False, "agent_id": agent_id, "error": str(e)}

                # The submitter may already be gone (entry removed or future
                # finished), in which case the response is dropped.
                future = self.results_futures.get(task_id)
                if future is not None and not future.done():
                    future.set_result(response)

            except Exception as e:
                self.logger.error(
                    f"[WorkerCluster Runner {runner_id}] 循环发生异常: {e}"
                )
                await asyncio.sleep(1)

    async def submit_task(self, task_id: str, agent_id: str, task_event: dict):
        """Queue a task and wait until a runner has produced its response.

        Args:
            task_id: Key under which the pending future is registered.
            agent_id: Agent whose worker should run the task.
            task_event: Payload handed to the worker's ``run``.

        Returns:
            The response dict built by the runner: ``success`` and ``agent_id``
            plus either ``data``/``metrics`` or ``error``.
        """
        if not self.runners:
            await self.start()

        future = asyncio.get_running_loop().create_future()
        self.results_futures[task_id] = future

        task = {"task_id": task_id, "agent_id": agent_id, "task_event": task_event}
        try:
            # Enqueue inside the try so a failed put cannot leave the future
            # registered forever.
            await self._put_task(task)
            self.logger.debug(f"[WorkerCluster] 任务 {task_id} 已加入队列。")
            return await future
        finally:
            self.results_futures.pop(task_id, None)

    def get_cluster_metrics(self):
        """Return pool size, capacity, cached agent ids and current queue length.

        Safe to call before ``start``: the queue length is reported as 0 while
        the queue does not exist.
        """
        return {
            "active_worker_count": len(self._active_workers),
            "max_capacity": self.max_capacity,
            "cached_agent_ids": list(self._active_workers.keys()),
            "queue_size": self._queue_size(),
        }
|