Files
KiloStar/kilostar/utils/ray_hook.py
T
zhaoxi 8f1398c591 feat: 人设模板系统、节点调度标签、pydantic-settings收敛、错误处理增强
新增persona_template表和CRUD API,BaseIndividualModel增加node_affinity和template_origin_id字段,
WorkerCluster支持多集群Ray资源调度,环境变量收敛到pydantic-settings统一校验,
数据库异常转换为结构化BusinessError/RetryableError,系统节点支持custom_system_prompt。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-04 06:07:46 +00:00

143 lines
4.5 KiB
Python

# Copyright 2026 zhaoxi826
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from functools import lru_cache
from typing import Any, Dict
from kilostar.utils.standalone_proxy import _STANDALONE
if not _STANDALONE:
import ray
class ActorList:
    """Minimal container with attribute-style access.

    Lets callers write ``a.actor_name`` instead of ``d["actor_name"]``.
    Every attribute assignment is stored in the backing mapping exposed as
    ``self.dict``; reads and deletes of unknown names raise AttributeError.

    The backing mapping is always reached through ``self.__dict__`` rather
    than ``self.dict``. ``__getattr__`` runs whenever normal lookup fails, so
    on an instance created without ``__init__`` (unpickling, ``copy.copy``,
    ``ActorList.__new__``) a plain ``self.dict`` read would re-enter
    ``__getattr__`` forever and end in RecursionError.
    """

    def __init__(self):
        # Bypass our own __setattr__, which would otherwise try to store
        # "dict" inside a mapping that does not exist yet.
        super().__setattr__("dict", {})

    def __setattr__(self, key, value):
        # Create the backing mapping on demand so instances that skipped
        # __init__ are still usable.
        self.__dict__.setdefault("dict", {})[key] = value

    def __getattr__(self, key):
        # Only reached when normal attribute lookup has already failed.
        store = self.__dict__.get("dict")
        if store is not None and key in store:
            return store[key]
        raise AttributeError(f"ActorList 对象没有属性 '{key}'")

    def __delattr__(self, key):
        store = self.__dict__.get("dict")
        if store is not None and key in store:
            del store[key]
        else:
            raise AttributeError(f"ActorList对象没有属性 '{key}'")
# ─── Standalone Registry ───
# Module-level lookup table keyed by actor name. register_standalone() fills
# it with StandaloneProxy-wrapped instances, and the standalone branches of
# ray_actor_hook() / get_worker_cluster() read from it.
_standalone_registry: Dict[str, Any] = {}
def register_standalone(name: str, instance: Any) -> None:
    """Register ``instance`` as the standalone-mode actor called ``name``.

    The instance is wrapped in a ``StandaloneProxy`` before being stored in
    the module-level registry. Registering the same name again replaces the
    previous entry.
    """
    # Imported at call time rather than at module load.
    from kilostar.utils.standalone_proxy import StandaloneProxy

    wrapped = StandaloneProxy(instance)
    _standalone_registry[name] = wrapped
# ─── Distributed Mode Helpers ───
if not _STANDALONE:

    @lru_cache(maxsize=128)
    def _get_cached_actor_handle(actor_name: str):
        """Fetch the named actor from the "kilostar" namespace and memoize it.

        Only successful lookups are memoized: when ``ray.get_actor`` raises,
        nothing is stored and the next call queries Ray again.
        """
        return ray.get_actor(actor_name, namespace="kilostar")

    def clear_actor_cache():
        """Forget every memoized handle so later lookups query Ray again."""
        _get_cached_actor_handle.cache_clear()

    def wait_for_actor(
        actor_name: str, *, timeout: float = 10.0, interval: float = 0.5
    ):
        """Block until the named actor can be looked up, then return its handle.

        Args:
            actor_name: Name of the actor to look up.
            timeout: Maximum number of seconds to keep retrying. Negative
                values are treated as 0; at least one lookup is always made.
            interval: Seconds to pause between attempts. The pause is
                shortened to the time left before the deadline and is never
                negative.

        Returns:
            The actor handle returned by ``_get_cached_actor_handle``.

        Raises:
            TimeoutError: If no lookup succeeded before the deadline. The
                last lookup error is attached as the cause.
        """
        deadline = time.monotonic() + max(timeout, 0.0)
        while True:
            try:
                return _get_cached_actor_handle(actor_name)
            except Exception as e:
                last_err = e
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                raise TimeoutError(
                    f"等待 actor {actor_name!r} 就绪超时({timeout}s):{last_err}"
                ) from last_err
            # Never sleep past the deadline (a short timeout with a long
            # interval would otherwise overshoot), and never hand
            # time.sleep() a negative value, which raises ValueError.
            time.sleep(max(min(interval, remaining), 0.0))

else:

    def _get_cached_actor_handle(actor_name: str):
        """Standalone stub: there is no Ray cluster to query, so always raise."""
        raise RuntimeError("单机模式下不应调用 _get_cached_actor_handle")

    def clear_actor_cache():
        """Standalone stub: no handle cache exists, so this does nothing."""
        pass

    def wait_for_actor(actor_name: str, **kwargs):
        """Standalone stub: waiting on a Ray actor is not supported, so always raise."""
        raise RuntimeError("单机模式下不应调用 wait_for_actor")
# ─── 统一入口 ───
def ray_actor_hook(*actor_names: str, timeout: float = 0.0, interval: float = 0.5):
    """Resolve several actors by name and return them bundled in an ActorList.

    In standalone mode each name is read from the standalone registry, and an
    unregistered name raises ValueError. Otherwise each name is resolved via
    ``wait_for_actor`` when ``timeout`` is positive, or via a single
    ``_get_cached_actor_handle`` lookup when it is not.
    """
    handles = ActorList()

    if not _STANDALONE:
        for wanted in actor_names:
            if timeout > 0:
                # A positive timeout means "retry until ready".
                resolved = wait_for_actor(
                    wanted, timeout=timeout, interval=interval
                )
            else:
                resolved = _get_cached_actor_handle(wanted)
            setattr(handles, wanted, resolved)
        return handles

    for wanted in actor_names:
        if wanted in _standalone_registry:
            setattr(handles, wanted, _standalone_registry[wanted])
            continue
        raise ValueError(
            f"Standalone registry: actor {wanted!r} not registered"
        )
    return handles
def get_worker_cluster(affinity: str = "cpu"):
    """Return the WorkerCluster handle that matches a node-affinity label.

    In standalone mode the entry registered as ``"worker_cluster"`` is
    returned (``None`` if nothing was registered under that name). Otherwise
    the label selects the actor named ``worker_cluster_cpu``,
    ``worker_cluster_core`` or ``worker_cluster_gpu``; any other label falls
    back to ``cpu``.
    """
    if not _STANDALONE:
        known_labels = {"cpu", "core", "gpu"}
        if affinity in known_labels:
            label = affinity
        else:
            # Unknown labels fall back to the CPU cluster.
            label = "cpu"
        return _get_cached_actor_handle(f"worker_cluster_{label}")

    return _standalone_registry.get("worker_cluster")