# Copyright 2026 zhaoxi826
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Helpers for looking up actor handles by name.

In standalone mode (``_STANDALONE`` is truthy) handles come from an
in-process registry; otherwise they are fetched with ``ray.get_actor`` in
the ``"kilostar"`` namespace and memoised.
"""

import time
from functools import lru_cache
from typing import Any, Dict

from kilostar.utils.ray_compat import _STANDALONE

if not _STANDALONE:
    import ray


class ActorList:
    """Minimal attribute-style container.

    Lets callers write ``a.actor_name`` instead of ``d["actor_name"]``.
    Every attribute assignment is stored in the backing mapping, which is
    itself kept as the real instance attribute ``dict``.
    """

    def __init__(self):
        # Bypass our own __setattr__, which would otherwise try to store
        # the backing mapping inside itself before it exists.
        super().__setattr__("dict", {})

    def __setattr__(self, key, value):
        """Store *value* under *key* in the backing mapping."""
        self.dict[key] = value

    def __getattr__(self, key):
        """Return the stored entry for *key*.

        Only invoked when normal attribute lookup fails.

        Raises:
            AttributeError: if *key* has not been stored.
        """
        # Read the backing mapping through __dict__ rather than
        # ``self.dict``: on an instance created without __init__ (as copy
        # and pickle do) ``self.dict`` would re-enter __getattr__ and end in
        # RecursionError instead of the AttributeError that hasattr/getattr
        # callers expect.
        store = self.__dict__.get("dict")
        if store is not None and key in store:
            return store[key]
        raise AttributeError(f"ActorList 对象没有属性 '{key}'")

    def __delattr__(self, key):
        """Remove the stored entry for *key*.

        Raises:
            AttributeError: if *key* has not been stored.
        """
        if key not in self.dict:
            raise AttributeError(f"ActorList对象没有属性 '{key}'")
        del self.dict[key]


# ─── Standalone Registry ───
# Maps actor name -> StandaloneProxy; filled by register_standalone().
_standalone_registry: Dict[str, Any] = {}


def register_standalone(name: str, instance: Any) -> None:
    """Register a standalone-mode actor singleton under *name*.

    The instance is wrapped in ``StandaloneProxy`` before being stored.
    Registering the same name again replaces the previous entry.
    """
    # Imported at call time, as in the original code.
    # NOTE(review): presumably this avoids an import cycle — confirm.
    from kilostar.utils.ray_compat import StandaloneProxy

    _standalone_registry[name] = StandaloneProxy(instance)


# ─── Distributed Mode Helpers ───
if not _STANDALONE:

    @lru_cache(maxsize=128)
    def _get_cached_actor_handle(actor_name: str):
        """Return the handle of *actor_name*, memoised per name.

        Looks the actor up with ``ray.get_actor`` in the ``"kilostar"``
        namespace. Failed lookups are not cached (``lru_cache`` does not
        store exceptions), so a later call retries.
        """
        return ray.get_actor(actor_name, namespace="kilostar")

    def clear_actor_cache():
        """Drop every memoised actor handle."""
        _get_cached_actor_handle.cache_clear()

    def wait_for_actor(
        actor_name: str, *, timeout: float = 10.0, interval: float = 0.5
    ):
        """Block until *actor_name* can be looked up and return its handle.

        Args:
            actor_name: Name of the actor to look up.
            timeout: Maximum number of seconds to keep retrying; negative
                values are treated as 0 (a single attempt).
            interval: Seconds to sleep between attempts; negative values
                are treated as 0.

        Raises:
            TimeoutError: if the lookup still fails once *timeout* has
                elapsed. The last lookup error is chained as the cause.
        """
        deadline = time.monotonic() + max(timeout, 0.0)
        while True:
            try:
                return _get_cached_actor_handle(actor_name)
            except Exception as last_err:
                # Any lookup failure is treated as "not ready yet".
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    raise TimeoutError(
                        f"等待 actor {actor_name!r} 就绪超时({timeout}s):{last_err}"
                    ) from last_err
                # Never sleep past the deadline, and never hand a negative
                # value to time.sleep (it raises ValueError).
                pause = min(max(interval, 0.0), remaining)
            time.sleep(pause)

else:

    def _get_cached_actor_handle(actor_name: str):
        """Standalone stub: always raises ``RuntimeError``."""
        raise RuntimeError("单机模式下不应调用 _get_cached_actor_handle")

    def clear_actor_cache():
        """Standalone stub: there is no handle cache, so this is a no-op."""
        pass

    def wait_for_actor(actor_name: str, **kwargs):
        """Standalone stub: always raises ``RuntimeError``."""
        raise RuntimeError("单机模式下不应调用 wait_for_actor")


# ─── Unified entry point ───
def ray_actor_hook(*actor_names: str, timeout: float = 0.0, interval: float = 0.5):
    """Fetch several actor handles by name and return them as an ActorList.

    In standalone mode the handles come from ``_standalone_registry``; in
    distributed mode they are looked up through ``ray.get_actor``.

    Args:
        *actor_names: Names of the actors to fetch; each becomes an
            attribute of the returned ``ActorList``.
        timeout: Distributed mode only. When greater than 0, each lookup
            goes through ``wait_for_actor`` with this timeout; otherwise a
            single lookup is attempted.
        interval: Distributed mode only. Retry interval passed to
            ``wait_for_actor``.

    Raises:
        ValueError: in standalone mode, if a name is not registered.
    """
    actor_list = ActorList()

    if _STANDALONE:
        for name in actor_names:
            if name not in _standalone_registry:
                raise ValueError(
                    f"Standalone registry: actor {name!r} not registered"
                )
            setattr(actor_list, name, _standalone_registry[name])
        return actor_list

    for actor_name in actor_names:
        if timeout > 0:
            handle = wait_for_actor(
                actor_name, timeout=timeout, interval=interval
            )
        else:
            handle = _get_cached_actor_handle(actor_name)
        setattr(actor_list, actor_name, handle)
    return actor_list


def get_worker_cluster(affinity: str = "cpu"):
    """Return the WorkerCluster actor handle for a node-affinity label.

    In standalone mode the single ``"worker_cluster"`` registry entry is
    returned, or ``None`` when nothing is registered under that name.
    In distributed mode *affinity* selects ``worker_cluster_cpu``,
    ``worker_cluster_core`` or ``worker_cluster_gpu``; an unknown label
    falls back to ``cpu``.
    """
    if _STANDALONE:
        return _standalone_registry.get("worker_cluster")

    _valid = {"cpu", "core", "gpu"}
    node_type = affinity if affinity in _valid else "cpu"
    return _get_cached_actor_handle(f"worker_cluster_{node_type}")