feat: 新增工具插件、系统日志、workflow配置及前端优化

1. 新增工具插件(edit_file, python_executor, search_file, shell_executor, write_file)
2. 新增系统事件日志模块和API
3. 新增workflow配置文件和详情API
4. 前端增加SSE、错误边界、设置引导等组件
5. 优化认证加密、速率限制、配置加载等工具模块
6. 删除废弃的cluster和health API
7. 补充单元测试和集成测试

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-03 07:34:43 +00:00
parent f04fef916f
commit a53ffebe0e
57 changed files with 2804 additions and 271 deletions
+52 -6
View File
@@ -61,6 +61,7 @@ class WorkflowGraphState(BaseModel):
# 已发过 put_pending 的 HumanApproval step index 列表;resume 后避免重复推送。
# 用 list(不是 set)是为了 pydantic_graph 序列化 history 时 JSON 友好。
approvals_notified: List[int] = Field(default_factory=list)
jump_counts: Dict[str, int] = Field(default_factory=dict)
# 业务侧执行入口:把 step + state 喂进去,拿到 (output_text, success_bool)
@@ -277,8 +278,13 @@ async def _execute_step(
)
try:
output_text, success = await executor(step_data, state)
except Exception as e: # 执行器抛异常 → 走失败分支
step_timeout = step_data.get("timeout", 300)
output_text, success = await asyncio.wait_for(
executor(step_data, state), timeout=step_timeout
)
except asyncio.TimeoutError:
output_text, success = f"步骤执行超时({step_data.get('timeout', 300)}s", False
except Exception as e:
output_text, success = str(e), False
if success:
@@ -311,6 +317,25 @@ async def _execute_step(
logic_gate = step_data.get("logic_gate") or {}
fail_target = logic_gate.get("if_fail")
if fail_target and "jump_to_step_" in fail_target:
from kilostar.utils.config_loader import get_workflow_config
max_attempts = get_workflow_config().retry.max_attempts
jump_key = f"{state.current_step_index}->{fail_target}"
state.jump_counts[jump_key] = state.jump_counts.get(jump_key, 0) + 1
if state.jump_counts[jump_key] > max_attempts:
state.logs.append(
{
str(state.current_step_index): [
str(datetime.datetime.now()),
"failed",
f"环重试次数超过上限 ({max_attempts}),终止工作流",
]
}
)
await _persist_context(ctx, status=WorkflowStatus.FAILED.value)
return Finalize(status=WorkflowStatus.FAILED.value)
target_step = int(fail_target.split("_")[-1]) - 1
state.current_step_index = target_step
await _persist_context(ctx, status=WorkflowStatus.RUNNING.value)
@@ -495,14 +520,21 @@ async def resume_workflow_graph(
@ray.remote
def run_workflow_task(workflow_data: dict, trace_id: str):
def run_workflow_task(
workflow_data: dict, trace_id: str, resume_only: bool = False
):
"""workflow 的 ray task 入口:一次性执行,跑完即销毁。
生产路径下持久化交给 ``PostgresStatePersistence`` —— 即便进程崩溃,再 fire
一次相同 ``trace_id`` 的任务(或调 ``/workflow/{trace_id}/resume``)即可
续跑。同时为了支持 fresh start先尝试 ``hydrate``
续跑。入口先尝试 ``hydrate``:
- hydrate 拿到内容 → 走 resume 路径
- hydrate 没拿到 → 走全新路径
- hydrate 没拿到 → 走全新路径(用传入的 ``workflow_data``)
``resume_only``:由 ``/resume`` API 显式置 True。此模式下 hydrate 失败
(抛异常或没有持久化记录)必须 fail-fast,而不能 fall through 到全新路径——
否则会拿着空 ``workflow_data`` 空跑一个 ``work_link=[]`` 的 workflow 并误判
为 COMPLETED(静默 bug)。
ray task 是新进程,contextvars 不会从 caller 传过来,所以入口先 bind 一次
``trace_id``,让节点内的日志自动带上它。
@@ -511,6 +543,9 @@ def run_workflow_task(workflow_data: dict, trace_id: str):
from kilostar.core.work.workflow.graph_persistence import (
build_postgres_persistence,
)
from kilostar.utils.logger import get_logger
_logger = get_logger("workflow_task")
async def _entry() -> None:
with trace_id_scope(trace_id):
@@ -519,9 +554,20 @@ def run_workflow_task(workflow_data: dict, trace_id: str):
recovered = False
try:
recovered = await persistence.hydrate()
except Exception: # pragma: no cover - 防御
except Exception as e:
if resume_only:
_logger.error(f"resume 失败:无法 hydrate 图持久化记录: {e}")
raise
recovered = False
if resume_only and not recovered:
msg = (
f"resume 失败:trace {trace_id} 没有可恢复的图持久化记录,"
"拒绝以全新模式空跑"
)
_logger.error(msg)
raise RuntimeError(msg)
if recovered:
await resume_workflow_graph(trace_id, persistence=persistence)
else: