feat(standalone): 新增单机模式,KILOSTAR_MODE=standalone 时去掉 Ray 依赖
通过 StandaloneProxy 适配层，让 .remote() 调用在单机模式下透明降级为 asyncio 协程调用；7 个 Actor 和 workflow task 均可在纯 asyncio 环境中运行，启动快、资源占用低。分布式模式行为完全不变。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -12,8 +12,13 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import ray
|
||||
import os
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from kilostar.utils.standalone_proxy import actor_class
|
||||
|
||||
_STANDALONE = os.environ.get("KILOSTAR_MODE", "distributed") == "standalone"
|
||||
if not _STANDALONE:
|
||||
import ray
|
||||
|
||||
from kilostar.core.global_state_machine.individual_manager import (
|
||||
GlobalIndividualManager,
|
||||
@@ -25,7 +30,7 @@ from kilostar.core.global_state_machine.gsm_snapshot import GSMSnapshot
|
||||
from kilostar.core.postgres_database import PostgresDatabase
|
||||
|
||||
|
||||
@ray.remote
|
||||
@actor_class
|
||||
class GlobalStateMachine:
|
||||
"""全局状态机 Actor,统一持有 Provider/Tool/Skill/Individual/MCP/CustomToolset 注册表。
|
||||
|
||||
@@ -44,10 +49,9 @@ class GlobalStateMachine:
|
||||
self._tool_configs: Dict[str, Dict[str, Any]] = {}
|
||||
self._custom_toolsets: Dict[str, Dict[str, Any]] = {}
|
||||
|
||||
# 配置快照与版本号:每次写入 → version+=1 → ray.put 新 snapshot
|
||||
# 读端通过 current_config_ref 拿 ref 后用 ray.get 直读,绕开 actor 单线程瓶颈
|
||||
# 配置快照与版本号:每次写入 → version+=1 → 发布新 snapshot
|
||||
self._config_version: int = 0
|
||||
self._current_ref: Optional[ray.ObjectRef] = None
|
||||
self._current_ref = None
|
||||
|
||||
self.postgres_database = postgres_database
|
||||
|
||||
@@ -113,19 +117,19 @@ class GlobalStateMachine:
|
||||
)
|
||||
|
||||
def _publish_snapshot(self) -> None:
|
||||
"""版本号 +1 并把当前状态 put 到 Ray Object Store。
|
||||
|
||||
旧 ref 会因为引用计数归零而进入回收队列;正在执行的 task 已经把 ref
|
||||
拷贝到了自己的进程,dec 不会影响它们的读取。
|
||||
"""
|
||||
"""版本号 +1 并发布当前状态快照。"""
|
||||
self._config_version += 1
|
||||
self._current_ref = ray.put(self._build_snapshot())
|
||||
snapshot = self._build_snapshot()
|
||||
if _STANDALONE:
|
||||
self._current_ref = snapshot
|
||||
else:
|
||||
self._current_ref = ray.put(snapshot)
|
||||
|
||||
async def current_config_ref(self) -> Tuple[int, ray.ObjectRef]:
|
||||
"""返回 ``(version, ObjectRef)``,调用方拿了 ref 后用 ``ray.get`` 自取。
|
||||
async def current_config_ref(self) -> Tuple[int, Any]:
|
||||
"""返回 ``(version, ObjectRef 或 snapshot)``。
|
||||
|
||||
**不要**直接返回 snapshot 对象 —— 那样会走 actor RPC 反序列化,丧失
|
||||
object store 的共享内存优势。返回 ref 才能让调用方在自己进程里 ray.get。
|
||||
分布式模式返回 ObjectRef,调用方用 ``ray.get`` 自取;
|
||||
单机模式直接返回 snapshot 对象。
|
||||
"""
|
||||
if self._current_ref is None:
|
||||
self._publish_snapshot()
|
||||
|
||||
@@ -30,10 +30,13 @@ GSM 仍然是 source of truth + 写入串行化器,但读路径解耦:
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Callable, Dict, List, Optional, Tuple
|
||||
|
||||
import ray
|
||||
_STANDALONE = os.environ.get("KILOSTAR_MODE", "distributed") == "standalone"
|
||||
if not _STANDALONE:
|
||||
import ray
|
||||
|
||||
from kilostar.core.global_state_machine.model_provider.base_provider import Provider
|
||||
from kilostar.utils.logger import get_logger
|
||||
@@ -113,14 +116,19 @@ async def fetch_snapshot(
|
||||
):
|
||||
return _local_cache["snapshot"]
|
||||
|
||||
version, ref = await gsm_actor.current_config_ref.remote()
|
||||
snapshot = ray.get(ref)
|
||||
version, ref_or_snapshot = await gsm_actor.current_config_ref.remote()
|
||||
if _STANDALONE:
|
||||
snapshot = ref_or_snapshot
|
||||
else:
|
||||
snapshot = ray.get(ref_or_snapshot)
|
||||
_local_cache["version"] = version
|
||||
_local_cache["snapshot"] = snapshot
|
||||
return snapshot
|
||||
|
||||
version, ref = await gsm_actor.current_config_ref.remote()
|
||||
return ray.get(ref)
|
||||
version, ref_or_snapshot = await gsm_actor.current_config_ref.remote()
|
||||
if _STANDALONE:
|
||||
return ref_or_snapshot
|
||||
return ray.get(ref_or_snapshot)
|
||||
|
||||
|
||||
def reset_local_cache() -> None:
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import ray
|
||||
import asyncio
|
||||
from typing import Dict
|
||||
from kilostar.utils.standalone_proxy import actor_class
|
||||
from kilostar.utils.ray_hook import ray_actor_hook
|
||||
from kilostar.utils.logger import get_logger
|
||||
|
||||
@@ -11,7 +11,7 @@ class TraceQueues:
|
||||
self.receive: asyncio.Queue[str] = asyncio.Queue()
|
||||
|
||||
|
||||
@ray.remote
|
||||
@actor_class
|
||||
class GlobalWorkflowManager:
|
||||
def __init__(self):
|
||||
self._traces: Dict[str, TraceQueues] = {}
|
||||
|
||||
@@ -13,8 +13,8 @@
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import ray
|
||||
from typing import Union, overload
|
||||
from kilostar.utils.standalone_proxy import actor_class
|
||||
from kilostar.core.individual.consciousness_node.template import (
|
||||
ConsciousnessNodeDeps,
|
||||
ForregulatoryNode,
|
||||
@@ -32,7 +32,7 @@ from kilostar.utils.ray_hook import ray_actor_hook
|
||||
from kilostar.utils.i18n import agent_prompt
|
||||
|
||||
|
||||
@ray.remote
|
||||
@actor_class
|
||||
class ConsciousnessNode:
|
||||
def __init__(self) -> None:
|
||||
from kilostar.utils.logger import get_logger
|
||||
|
||||
@@ -12,8 +12,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import ray
|
||||
from pydantic_ai import Agent, RunContext
|
||||
from kilostar.utils.standalone_proxy import actor_class
|
||||
from kilostar.core.global_state_machine.global_state_machine import GlobalStateMachine
|
||||
from kilostar.core.global_state_machine.model_provider.base_provider import Provider
|
||||
from kilostar.adapter.model_adapter.agent_factory import AgentFactory
|
||||
@@ -25,7 +25,7 @@ from kilostar.core.individual.control_node.template import (
|
||||
from kilostar.utils.i18n import agent_prompt
|
||||
|
||||
|
||||
@ray.remote
|
||||
@actor_class
|
||||
class ControlNode:
|
||||
"""ControlNode(控制节点):工作流中具体子任务的执行 Actor。
|
||||
|
||||
|
||||
@@ -13,8 +13,8 @@
|
||||
# limitations under the License.
|
||||
|
||||
import datetime
|
||||
import ray
|
||||
from typing import Union
|
||||
from kilostar.utils.standalone_proxy import actor_class
|
||||
from kilostar.adapter.model_adapter.agent_factory import AgentFactory
|
||||
from kilostar.core.global_state_machine.global_state_machine import GlobalStateMachine
|
||||
from kilostar.core.global_state_machine.model_provider import Provider
|
||||
@@ -27,7 +27,7 @@ from pydantic_ai import RunContext, Agent
|
||||
from kilostar.utils.i18n import agent_prompt
|
||||
|
||||
|
||||
@ray.remote
|
||||
@actor_class
|
||||
class RegulatoryNode:
|
||||
"""RegulatoryNode(监管节点):用户请求的入口路由 Actor。
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
|
||||
import ray
|
||||
from kilostar.utils.standalone_proxy import actor_class
|
||||
from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from kilostar.core.postgres_database.model.base import BaseDataModel
|
||||
@@ -55,7 +55,7 @@ from .module.custom_toolset import CustomToolsetDatabase
|
||||
from .module.system_event_log import SystemEventLogDatabase
|
||||
|
||||
|
||||
@ray.remote
|
||||
@actor_class
|
||||
class PostgresDatabase:
|
||||
"""以 Ray Actor 形式暴露的统一数据库门面。
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ import datetime
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Awaitable, Callable, Dict, List, Optional
|
||||
|
||||
import ray
|
||||
from kilostar.utils.standalone_proxy import remote_task, _STANDALONE
|
||||
from pydantic import BaseModel, Field
|
||||
from pydantic_graph import BaseNode, End, Graph, GraphRunContext
|
||||
from pydantic_graph.persistence import BaseStatePersistence
|
||||
@@ -519,7 +519,7 @@ async def resume_workflow_graph(
|
||||
return final_output
|
||||
|
||||
|
||||
@ray.remote
|
||||
@remote_task
|
||||
def run_workflow_task(
|
||||
workflow_data: dict, trace_id: str, resume_only: bool = False
|
||||
):
|
||||
@@ -575,4 +575,7 @@ def run_workflow_task(
|
||||
workflow_data, trace_id, persistence=persistence
|
||||
)
|
||||
|
||||
asyncio.run(_entry())
|
||||
if _STANDALONE:
|
||||
return _entry()
|
||||
else:
|
||||
asyncio.run(_entry())
|
||||
|
||||
Reference in New Issue
Block a user