feat: 新增工具插件、系统日志、workflow配置及前端优化
1. 新增工具插件(edit_file, python_executor, search_file, shell_executor, write_file);2. 新增系统事件日志模块和 API;3. 新增 workflow 配置文件和详情 API;4. 前端增加 SSE、错误边界、设置引导等组件;5. 优化认证加密、速率限制、配置加载等工具模块;6. 删除废弃的 cluster 和 health API;7. 补充单元测试和集成测试。 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -61,6 +61,7 @@ class WorkflowGraphState(BaseModel):
|
||||
# 已发过 put_pending 的 HumanApproval step index 列表;resume 后避免重复推送。
|
||||
# 用 list(不是 set)是为了 pydantic_graph 序列化 history 时 JSON 友好。
|
||||
approvals_notified: List[int] = Field(default_factory=list)
|
||||
jump_counts: Dict[str, int] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# 业务侧执行入口:把 step + state 喂进去,拿到 (output_text, success_bool)
|
||||
@@ -277,8 +278,13 @@ async def _execute_step(
|
||||
)
|
||||
|
||||
try:
|
||||
output_text, success = await executor(step_data, state)
|
||||
except Exception as e: # 执行器抛异常 → 走失败分支
|
||||
step_timeout = step_data.get("timeout", 300)
|
||||
output_text, success = await asyncio.wait_for(
|
||||
executor(step_data, state), timeout=step_timeout
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
output_text, success = f"步骤执行超时({step_data.get('timeout', 300)}s)", False
|
||||
except Exception as e:
|
||||
output_text, success = str(e), False
|
||||
|
||||
if success:
|
||||
@@ -311,6 +317,25 @@ async def _execute_step(
|
||||
logic_gate = step_data.get("logic_gate") or {}
|
||||
fail_target = logic_gate.get("if_fail")
|
||||
if fail_target and "jump_to_step_" in fail_target:
|
||||
from kilostar.utils.config_loader import get_workflow_config
|
||||
|
||||
max_attempts = get_workflow_config().retry.max_attempts
|
||||
jump_key = f"{state.current_step_index}->{fail_target}"
|
||||
state.jump_counts[jump_key] = state.jump_counts.get(jump_key, 0) + 1
|
||||
|
||||
if state.jump_counts[jump_key] > max_attempts:
|
||||
state.logs.append(
|
||||
{
|
||||
str(state.current_step_index): [
|
||||
str(datetime.datetime.now()),
|
||||
"failed",
|
||||
f"环重试次数超过上限 ({max_attempts}),终止工作流",
|
||||
]
|
||||
}
|
||||
)
|
||||
await _persist_context(ctx, status=WorkflowStatus.FAILED.value)
|
||||
return Finalize(status=WorkflowStatus.FAILED.value)
|
||||
|
||||
target_step = int(fail_target.split("_")[-1]) - 1
|
||||
state.current_step_index = target_step
|
||||
await _persist_context(ctx, status=WorkflowStatus.RUNNING.value)
|
||||
@@ -495,14 +520,21 @@ async def resume_workflow_graph(
|
||||
|
||||
|
||||
@ray.remote
|
||||
def run_workflow_task(workflow_data: dict, trace_id: str):
|
||||
def run_workflow_task(
|
||||
workflow_data: dict, trace_id: str, resume_only: bool = False
|
||||
):
|
||||
"""workflow 的 ray task 入口:一次性执行,跑完即销毁。
|
||||
|
||||
生产路径下持久化交给 ``PostgresStatePersistence`` —— 即便进程崩溃,再 fire
|
||||
一次相同 ``trace_id`` 的任务(或调 ``/workflow/{trace_id}/resume``)即可
|
||||
续跑。同时为了支持 fresh start,先尝试 ``hydrate``:
|
||||
续跑。入口先尝试 ``hydrate``:
|
||||
- hydrate 拿到内容 → 走 resume 路径
|
||||
- hydrate 没拿到 → 走全新路径
|
||||
- hydrate 没拿到 → 走全新路径(用传入的 ``workflow_data``)
|
||||
|
||||
``resume_only``:由 ``/resume`` API 显式置 True。此模式下 hydrate 失败
|
||||
(抛异常或没有持久化记录)必须 fail-fast,而不能 fall through 到全新路径——
|
||||
否则会拿着空 ``workflow_data`` 空跑一个 ``work_link=[]`` 的 workflow 并误判
|
||||
为 COMPLETED(静默 bug)。
|
||||
|
||||
ray task 是新进程,contextvars 不会从 caller 传过来,所以入口先 bind 一次
|
||||
``trace_id``,让节点内的日志自动带上它。
|
||||
@@ -511,6 +543,9 @@ def run_workflow_task(workflow_data: dict, trace_id: str):
|
||||
from kilostar.core.work.workflow.graph_persistence import (
|
||||
build_postgres_persistence,
|
||||
)
|
||||
from kilostar.utils.logger import get_logger
|
||||
|
||||
_logger = get_logger("workflow_task")
|
||||
|
||||
async def _entry() -> None:
|
||||
with trace_id_scope(trace_id):
|
||||
@@ -519,9 +554,20 @@ def run_workflow_task(workflow_data: dict, trace_id: str):
|
||||
recovered = False
|
||||
try:
|
||||
recovered = await persistence.hydrate()
|
||||
except Exception: # pragma: no cover - 防御
|
||||
except Exception as e:
|
||||
if resume_only:
|
||||
_logger.error(f"resume 失败:无法 hydrate 图持久化记录: {e}")
|
||||
raise
|
||||
recovered = False
|
||||
|
||||
if resume_only and not recovered:
|
||||
msg = (
|
||||
f"resume 失败:trace {trace_id} 没有可恢复的图持久化记录,"
|
||||
"拒绝以全新模式空跑"
|
||||
)
|
||||
_logger.error(msg)
|
||||
raise RuntimeError(msg)
|
||||
|
||||
if recovered:
|
||||
await resume_workflow_graph(trace_id, persistence=persistence)
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user