feat: 新增工具插件、系统日志、workflow配置及前端优化

1. 新增工具插件(edit_file, python_executor, search_file, shell_executor, write_file)
2. 新增系统事件日志模块和API
3. 新增workflow配置文件和详情API
4. 前端增加SSE、错误边界、设置引导等组件
5. 优化认证加密、速率限制、配置加载等工具模块
6. 删除废弃的cluster和health API
7. 补充单元测试和集成测试

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-03 07:34:43 +00:00
parent f04fef916f
commit a53ffebe0e
57 changed files with 2804 additions and 271 deletions
@@ -61,6 +61,7 @@ class WorkflowGraphState(BaseModel):
    # 已发过 put_pending 的 HumanApproval step index 列表；resume 后避免重复推送。
    # 用 list（不是 set）是为了 pydantic_graph 序列化 history 时 JSON 友好。
    approvals_notified: List[int] = Field(default_factory=list)
+    jump_counts: Dict[str, int] = Field(default_factory=dict)


 # 业务侧执行入口：把 step + state 喂进去，拿到 (output_text, success_bool)
@@ -277,8 +278,13 @@ async def _execute_step(
    )

    try:
-        output_text, success = await executor(step_data, state)
-    except Exception as e:  # 执行器抛异常 → 走失败分支
+        step_timeout = step_data.get("timeout", 300)
+        output_text, success = await asyncio.wait_for(
+            executor(step_data, state), timeout=step_timeout
+        )
+    except asyncio.TimeoutError:
+        output_text, success = f"步骤执行超时（{step_data.get('timeout', 300)}s）", False
+    except Exception as e:
        output_text, success = str(e), False

    if success:
@@ -311,6 +317,25 @@ async def _execute_step(
    logic_gate = step_data.get("logic_gate") or {}
    fail_target = logic_gate.get("if_fail")
    if fail_target and "jump_to_step_" in fail_target:
+        from kilostar.utils.config_loader import get_workflow_config
+
+        max_attempts = get_workflow_config().retry.max_attempts
+        jump_key = f"{state.current_step_index}->{fail_target}"
+        state.jump_counts[jump_key] = state.jump_counts.get(jump_key, 0) + 1
+
+        if state.jump_counts[jump_key] > max_attempts:
+            state.logs.append(
+                {
+                    str(state.current_step_index): [
+                        str(datetime.datetime.now()),
+                        "failed",
+                        f"环重试次数超过上限 ({max_attempts})，终止工作流",
+                    ]
+                }
+            )
+            await _persist_context(ctx, status=WorkflowStatus.FAILED.value)
+            return Finalize(status=WorkflowStatus.FAILED.value)
+
        target_step = int(fail_target.split("_")[-1]) - 1
        state.current_step_index = target_step
        await _persist_context(ctx, status=WorkflowStatus.RUNNING.value)
@@ -495,14 +520,21 @@ async def resume_workflow_graph(


@ray.remote
-def run_workflow_task(workflow_data: dict, trace_id: str):
+def run_workflow_task(
+    workflow_data: dict, trace_id: str, resume_only: bool = False
+):
    """workflow 的 ray task 入口：一次性执行，跑完即销毁。

    生产路径下持久化交给 ``PostgresStatePersistence`` —— 即便进程崩溃，再 fire
    一次相同 ``trace_id`` 的任务（或调 ``/workflow/{trace_id}/resume``）即可
-    续跑。同时为了支持 fresh start，先尝试 ``hydrate``：
+    续跑。入口先尝试 ``hydrate``：
    - hydrate 拿到内容 → 走 resume 路径
-    - hydrate 没拿到 → 走全新路径
+    - hydrate 没拿到 → 走全新路径（用传入的 ``workflow_data``）
+
+    ``resume_only``：由 ``/resume`` API 显式置 True。此模式下 hydrate 失败
+    （抛异常或没有持久化记录）必须 fail-fast，而不能 fall through 到全新路径——
+    否则会拿着空 ``workflow_data`` 空跑一个 ``work_link=[]`` 的 workflow 并误判
+    为 COMPLETED（静默 bug）。

    ray task 是新进程，contextvars 不会从 caller 传过来，所以入口先 bind 一次
    ``trace_id``，让节点内的日志自动带上它。
@@ -511,6 +543,9 @@ def run_workflow_task(workflow_data: dict, trace_id: str):
    from kilostar.core.work.workflow.graph_persistence import (
        build_postgres_persistence,
    )
+    from kilostar.utils.logger import get_logger
+
+    _logger = get_logger("workflow_task")

    async def _entry() -> None:
        with trace_id_scope(trace_id):
@@ -519,9 +554,20 @@ def run_workflow_task(workflow_data: dict, trace_id: str):
            recovered = False
            try:
                recovered = await persistence.hydrate()
-            except Exception:  # pragma: no cover - 防御
+            except Exception as e:
+                if resume_only:
+                    _logger.error(f"resume 失败：无法 hydrate 图持久化记录: {e}")
+                    raise
                recovered = False

+            if resume_only and not recovered:
+                msg = (
+                    f"resume 失败：trace {trace_id} 没有可恢复的图持久化记录，"
+                    "拒绝以全新模式空跑"
+                )
+                _logger.error(msg)
+                raise RuntimeError(msg)
+
            if recovered:
                await resume_workflow_graph(trace_id, persistence=persistence)
            else: