feat: 新增工具插件、系统日志、workflow配置及前端优化
1. 新增工具插件(edit_file, python_executor, search_file, shell_executor, write_file) 2. 新增系统事件日志模块和API 3. 新增workflow配置文件和详情API 4. 前端增加SSE、错误边界、设置引导等组件 5. 优化认证加密、速率限制、配置加载等工具模块 6. 删除废弃的cluster和health API 7. 补充单元测试和集成测试 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -203,6 +203,10 @@ class GlobalStateMachine:
|
||||
"""返回某个 scope 下的"系统 + 自定义工具组"toolset 列表(不含 MCP)。"""
|
||||
return self._global_tool_manager.get_toolsets_for_scope(scope)
|
||||
|
||||
def get_retrieval_toolsets_for_scope(self, scope: str) -> List[Any]:
|
||||
"""仅返回 retrieval 工具集(system_node 专用,不包含 generation 工具)。"""
|
||||
return self._global_tool_manager.get_retrieval_toolsets_for_scope(scope)
|
||||
|
||||
# ─── MCP Server Registry ───────────────────────────────────
|
||||
|
||||
async def add_mcp_server(self, server_id: str, config: Dict[str, Any]) -> bool:
|
||||
|
||||
@@ -34,7 +34,9 @@ class GlobalToolManager:
|
||||
def __init__(self) -> None:
|
||||
self.tool_metadata = {}
|
||||
self._tool_funcs = defaultdict(dict)
|
||||
self._retrieval_tool_funcs = defaultdict(dict)
|
||||
self._system_toolsets = {}
|
||||
self._retrieval_toolsets = {}
|
||||
self._custom_toolsets = {}
|
||||
self._third_party_funcs = {}
|
||||
self.tool_mapper = defaultdict(dict)
|
||||
@@ -75,11 +77,14 @@ class GlobalToolManager:
|
||||
is_system = bool(tool_data_cls.model_fields.get("is_system").default)
|
||||
category_field = tool_data_cls.model_fields.get("category")
|
||||
category = (category_field.default if category_field else "other") or "other"
|
||||
toolset_field = tool_data_cls.model_fields.get("toolset")
|
||||
toolset_name = (toolset_field.default if toolset_field else "other") or "other"
|
||||
|
||||
self.tool_metadata[plugin_name] = {
|
||||
"name": plugin_name,
|
||||
"is_system": is_system,
|
||||
"category": category,
|
||||
"toolset": toolset_name,
|
||||
"action_scope": list(action_scopes),
|
||||
}
|
||||
|
||||
@@ -92,12 +97,15 @@ class GlobalToolManager:
|
||||
for scope in scopes:
|
||||
self._tool_funcs[scope][plugin_name] = tool_func
|
||||
self.tool_mapper[scope][plugin_name] = tool_data_cls
|
||||
if toolset_name == "retrieval":
|
||||
self._retrieval_tool_funcs[scope][plugin_name] = tool_func
|
||||
else:
|
||||
self._third_party_funcs[plugin_name] = tool_func
|
||||
for scope in scopes:
|
||||
self.tool_mapper[scope][plugin_name] = tool_data_cls
|
||||
|
||||
self._build_system_toolsets()
|
||||
self._build_retrieval_toolsets()
|
||||
|
||||
def _build_system_toolsets(self) -> None:
|
||||
FunctionToolset = self._import_function_toolset()
|
||||
@@ -114,6 +122,21 @@ class GlobalToolManager:
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to build system toolset {scope}: {e}")
|
||||
|
||||
def _build_retrieval_toolsets(self) -> None:
|
||||
FunctionToolset = self._import_function_toolset()
|
||||
if FunctionToolset is None:
|
||||
return
|
||||
for scope, name_to_func in self._retrieval_tool_funcs.items():
|
||||
if not name_to_func:
|
||||
continue
|
||||
try:
|
||||
self._retrieval_toolsets[scope] = FunctionToolset(
|
||||
tools=list(name_to_func.values()),
|
||||
id=f"retrieval::{scope}",
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to build retrieval toolset {scope}: {e}")
|
||||
|
||||
def rebuild_custom_toolsets(self, custom_defs: Dict[str, Dict[str, Any]]) -> None:
|
||||
"""根据 DB 中的自定义工具组定义重建 custom FunctionToolset。"""
|
||||
FunctionToolset = self._import_function_toolset()
|
||||
@@ -170,6 +193,15 @@ class GlobalToolManager:
|
||||
result.extend(self._custom_toolsets.values())
|
||||
return result
|
||||
|
||||
def get_retrieval_toolsets_for_scope(self, scope: str) -> List[Any]:
|
||||
"""仅返回 retrieval 工具集(system_node 专用)。"""
|
||||
result: List[Any] = []
|
||||
for s in ("default", scope):
|
||||
ts = self._retrieval_toolsets.get(s)
|
||||
if ts is not None:
|
||||
result.append(ts)
|
||||
return result
|
||||
|
||||
# ─── Metadata accessors ───
|
||||
|
||||
def is_third_party_tool(self, tool_name: str) -> bool:
|
||||
|
||||
@@ -33,6 +33,7 @@ from kilostar.core.postgres_database.model.system_node import SystemNodeConfigMo
|
||||
from kilostar.core.postgres_database.model.mcp_server import MCPServerModel
|
||||
from kilostar.core.postgres_database.model.tool_config import ToolConfigModel
|
||||
from kilostar.core.postgres_database.model.custom_toolset import CustomToolsetModel
|
||||
from kilostar.core.postgres_database.model.system_event_log import SystemEventLog
|
||||
|
||||
# 兼容旧代码的别名
|
||||
Provider = ProviderModel
|
||||
@@ -61,5 +62,6 @@ __all__ = [
|
||||
"MCPServerModel",
|
||||
"ToolConfigModel",
|
||||
"CustomToolsetModel",
|
||||
"SystemEventLog",
|
||||
"AgentType",
|
||||
]
|
||||
|
||||
@@ -0,0 +1,35 @@
|
||||
from sqlalchemy import String, DateTime, Integer, func, Text
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy.dialects.postgresql import JSONB
|
||||
from .base import BaseDataModel
|
||||
|
||||
|
||||
class SystemEventLog(BaseDataModel):
|
||||
__tablename__ = "system_event_log"
|
||||
|
||||
id: Mapped[int] = mapped_column(
|
||||
Integer, primary_key=True, autoincrement=True
|
||||
)
|
||||
trace_id: Mapped[str] = mapped_column(
|
||||
String(64), index=True, comment="关联的工作流 trace_id"
|
||||
)
|
||||
event_type: Mapped[str] = mapped_column(
|
||||
String(50), index=True,
|
||||
comment="事件类型: workflow_start/step_enter/step_complete/step_error/workflow_complete/workflow_fail/system"
|
||||
)
|
||||
level: Mapped[str] = mapped_column(
|
||||
String(10), index=True, default="info",
|
||||
comment="日志级别: info/warn/error"
|
||||
)
|
||||
node_name: Mapped[str | None] = mapped_column(
|
||||
String(100), nullable=True, comment="相关节点名称"
|
||||
)
|
||||
message: Mapped[str] = mapped_column(
|
||||
Text, comment="日志消息正文"
|
||||
)
|
||||
extra_data: Mapped[dict | None] = mapped_column(
|
||||
JSONB, nullable=True, comment="附加元数据(step_index/output 等)"
|
||||
)
|
||||
created_at: Mapped[str] = mapped_column(
|
||||
DateTime(timezone=True), server_default=func.now(), index=True
|
||||
)
|
||||
@@ -0,0 +1,72 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import List, Optional
|
||||
from sqlalchemy import select, desc
|
||||
from sqlalchemy.ext.asyncio import async_sessionmaker, AsyncSession
|
||||
|
||||
from kilostar.core.postgres_database.model.system_event_log import SystemEventLog
|
||||
from kilostar.core.postgres_database.database_exception import database_exception
|
||||
|
||||
|
||||
class SystemEventLogDatabase:
|
||||
def __init__(self, async_session_maker: async_sessionmaker[AsyncSession]):
|
||||
self.async_session_maker = async_session_maker
|
||||
|
||||
@database_exception
|
||||
async def insert_event(
|
||||
self,
|
||||
trace_id: str,
|
||||
event_type: str,
|
||||
level: str,
|
||||
message: str,
|
||||
node_name: Optional[str] = None,
|
||||
metadata: Optional[dict] = None,
|
||||
) -> None:
|
||||
async with self.async_session_maker() as session:
|
||||
log = SystemEventLog(
|
||||
trace_id=trace_id,
|
||||
event_type=event_type,
|
||||
level=level,
|
||||
message=message,
|
||||
node_name=node_name,
|
||||
extra_data=metadata,
|
||||
)
|
||||
session.add(log)
|
||||
await session.commit()
|
||||
|
||||
@database_exception
|
||||
async def query_events(
|
||||
self,
|
||||
trace_id: Optional[str] = None,
|
||||
event_type: Optional[str] = None,
|
||||
level: Optional[str] = None,
|
||||
limit: int = 100,
|
||||
offset: int = 0,
|
||||
) -> List[dict]:
|
||||
async with self.async_session_maker() as session:
|
||||
stmt = select(SystemEventLog).order_by(desc(SystemEventLog.created_at))
|
||||
|
||||
if trace_id:
|
||||
stmt = stmt.where(SystemEventLog.trace_id == trace_id)
|
||||
if event_type:
|
||||
stmt = stmt.where(SystemEventLog.event_type == event_type)
|
||||
if level:
|
||||
stmt = stmt.where(SystemEventLog.level == level)
|
||||
|
||||
stmt = stmt.offset(offset).limit(limit)
|
||||
result = await session.execute(stmt)
|
||||
rows = result.scalars().all()
|
||||
|
||||
return [
|
||||
{
|
||||
"id": r.id,
|
||||
"trace_id": r.trace_id,
|
||||
"event_type": r.event_type,
|
||||
"level": r.level,
|
||||
"node_name": r.node_name,
|
||||
"message": r.message,
|
||||
"metadata": r.extra_data,
|
||||
"created_at": str(r.created_at) if r.created_at else None,
|
||||
}
|
||||
for r in rows
|
||||
]
|
||||
@@ -41,6 +41,7 @@ from kilostar.core.postgres_database.model.system_node import SystemNodeConfigMo
|
||||
from kilostar.core.postgres_database.model.mcp_server import MCPServerModel
|
||||
from kilostar.core.postgres_database.model.tool_config import ToolConfigModel
|
||||
from kilostar.core.postgres_database.model.custom_toolset import CustomToolsetModel
|
||||
from kilostar.core.postgres_database.model.system_event_log import SystemEventLog
|
||||
|
||||
from .module.individual import IndividualDatabase
|
||||
from .module.user import AuthDatabase
|
||||
@@ -51,6 +52,7 @@ from .module.chat_history import ChatHistoryDatabase
|
||||
from .module.mcp_server import MCPServerDatabase
|
||||
from .module.tool_config import ToolConfigDatabase
|
||||
from .module.custom_toolset import CustomToolsetDatabase
|
||||
from .module.system_event_log import SystemEventLogDatabase
|
||||
|
||||
|
||||
@ray.remote
|
||||
@@ -85,6 +87,7 @@ class PostgresDatabase:
|
||||
self._mcp_server_database = MCPServerDatabase(self.async_session_maker)
|
||||
self._tool_config_database = ToolConfigDatabase(self.async_session_maker)
|
||||
self._custom_toolset_database = CustomToolsetDatabase(self.async_session_maker)
|
||||
self._system_event_log_database = SystemEventLogDatabase(self.async_session_maker)
|
||||
|
||||
self.ready_event = asyncio.Event()
|
||||
|
||||
@@ -94,11 +97,10 @@ class PostgresDatabase:
|
||||
async with self.async_engine.begin() as conn:
|
||||
await conn.run_sync(BaseDataModel.metadata.create_all)
|
||||
print("✅ 数据库表创建/验证完成")
|
||||
self.ready_event.set()
|
||||
except Exception as e:
|
||||
print(f"❌ 数据库初始化失败: {e}")
|
||||
raise
|
||||
finally:
|
||||
self.ready_event.set()
|
||||
|
||||
async def ping(self) -> bool:
|
||||
"""轻量探活:等待 ready 后执行 ``SELECT 1``。"""
|
||||
@@ -376,3 +378,35 @@ class PostgresDatabase:
|
||||
"""删除一个自定义工具组。"""
|
||||
await self.ready_event.wait()
|
||||
return await self._custom_toolset_database.delete(toolset_id)
|
||||
|
||||
# System Event Log Methods
|
||||
async def insert_event_log(
|
||||
self,
|
||||
trace_id: str,
|
||||
event_type: str,
|
||||
level: str,
|
||||
message: str,
|
||||
node_name=None,
|
||||
metadata=None,
|
||||
):
|
||||
await self.ready_event.wait()
|
||||
return await self._system_event_log_database.insert_event(
|
||||
trace_id=trace_id,
|
||||
event_type=event_type,
|
||||
level=level,
|
||||
message=message,
|
||||
node_name=node_name,
|
||||
metadata=metadata,
|
||||
)
|
||||
|
||||
async def query_event_logs(
|
||||
self, trace_id=None, event_type=None, level=None, limit=100, offset=0
|
||||
):
|
||||
await self.ready_event.wait()
|
||||
return await self._system_event_log_database.query_events(
|
||||
trace_id=trace_id,
|
||||
event_type=event_type,
|
||||
level=level,
|
||||
limit=limit,
|
||||
offset=offset,
|
||||
)
|
||||
|
||||
@@ -61,6 +61,7 @@ class WorkflowGraphState(BaseModel):
|
||||
# 已发过 put_pending 的 HumanApproval step index 列表;resume 后避免重复推送。
|
||||
# 用 list(不是 set)是为了 pydantic_graph 序列化 history 时 JSON 友好。
|
||||
approvals_notified: List[int] = Field(default_factory=list)
|
||||
jump_counts: Dict[str, int] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# 业务侧执行入口:把 step + state 喂进去,拿到 (output_text, success_bool)
|
||||
@@ -277,8 +278,13 @@ async def _execute_step(
|
||||
)
|
||||
|
||||
try:
|
||||
output_text, success = await executor(step_data, state)
|
||||
except Exception as e: # 执行器抛异常 → 走失败分支
|
||||
step_timeout = step_data.get("timeout", 300)
|
||||
output_text, success = await asyncio.wait_for(
|
||||
executor(step_data, state), timeout=step_timeout
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
output_text, success = f"步骤执行超时({step_data.get('timeout', 300)}s)", False
|
||||
except Exception as e:
|
||||
output_text, success = str(e), False
|
||||
|
||||
if success:
|
||||
@@ -311,6 +317,25 @@ async def _execute_step(
|
||||
logic_gate = step_data.get("logic_gate") or {}
|
||||
fail_target = logic_gate.get("if_fail")
|
||||
if fail_target and "jump_to_step_" in fail_target:
|
||||
from kilostar.utils.config_loader import get_workflow_config
|
||||
|
||||
max_attempts = get_workflow_config().retry.max_attempts
|
||||
jump_key = f"{state.current_step_index}->{fail_target}"
|
||||
state.jump_counts[jump_key] = state.jump_counts.get(jump_key, 0) + 1
|
||||
|
||||
if state.jump_counts[jump_key] > max_attempts:
|
||||
state.logs.append(
|
||||
{
|
||||
str(state.current_step_index): [
|
||||
str(datetime.datetime.now()),
|
||||
"failed",
|
||||
f"环重试次数超过上限 ({max_attempts}),终止工作流",
|
||||
]
|
||||
}
|
||||
)
|
||||
await _persist_context(ctx, status=WorkflowStatus.FAILED.value)
|
||||
return Finalize(status=WorkflowStatus.FAILED.value)
|
||||
|
||||
target_step = int(fail_target.split("_")[-1]) - 1
|
||||
state.current_step_index = target_step
|
||||
await _persist_context(ctx, status=WorkflowStatus.RUNNING.value)
|
||||
@@ -495,14 +520,21 @@ async def resume_workflow_graph(
|
||||
|
||||
|
||||
@ray.remote
|
||||
def run_workflow_task(workflow_data: dict, trace_id: str):
|
||||
def run_workflow_task(
|
||||
workflow_data: dict, trace_id: str, resume_only: bool = False
|
||||
):
|
||||
"""workflow 的 ray task 入口:一次性执行,跑完即销毁。
|
||||
|
||||
生产路径下持久化交给 ``PostgresStatePersistence`` —— 即便进程崩溃,再 fire
|
||||
一次相同 ``trace_id`` 的任务(或调 ``/workflow/{trace_id}/resume``)即可
|
||||
续跑。同时为了支持 fresh start,先尝试 ``hydrate``:
|
||||
续跑。入口先尝试 ``hydrate``:
|
||||
- hydrate 拿到内容 → 走 resume 路径
|
||||
- hydrate 没拿到 → 走全新路径
|
||||
- hydrate 没拿到 → 走全新路径(用传入的 ``workflow_data``)
|
||||
|
||||
``resume_only``:由 ``/resume`` API 显式置 True。此模式下 hydrate 失败
|
||||
(抛异常或没有持久化记录)必须 fail-fast,而不能 fall through 到全新路径——
|
||||
否则会拿着空 ``workflow_data`` 空跑一个 ``work_link=[]`` 的 workflow 并误判
|
||||
为 COMPLETED(静默 bug)。
|
||||
|
||||
ray task 是新进程,contextvars 不会从 caller 传过来,所以入口先 bind 一次
|
||||
``trace_id``,让节点内的日志自动带上它。
|
||||
@@ -511,6 +543,9 @@ def run_workflow_task(workflow_data: dict, trace_id: str):
|
||||
from kilostar.core.work.workflow.graph_persistence import (
|
||||
build_postgres_persistence,
|
||||
)
|
||||
from kilostar.utils.logger import get_logger
|
||||
|
||||
_logger = get_logger("workflow_task")
|
||||
|
||||
async def _entry() -> None:
|
||||
with trace_id_scope(trace_id):
|
||||
@@ -519,9 +554,20 @@ def run_workflow_task(workflow_data: dict, trace_id: str):
|
||||
recovered = False
|
||||
try:
|
||||
recovered = await persistence.hydrate()
|
||||
except Exception: # pragma: no cover - 防御
|
||||
except Exception as e:
|
||||
if resume_only:
|
||||
_logger.error(f"resume 失败:无法 hydrate 图持久化记录: {e}")
|
||||
raise
|
||||
recovered = False
|
||||
|
||||
if resume_only and not recovered:
|
||||
msg = (
|
||||
f"resume 失败:trace {trace_id} 没有可恢复的图持久化记录,"
|
||||
"拒绝以全新模式空跑"
|
||||
)
|
||||
_logger.error(msg)
|
||||
raise RuntimeError(msg)
|
||||
|
||||
if recovered:
|
||||
await resume_workflow_graph(trace_id, persistence=persistence)
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user