This commit is contained in:
2026-07-01 09:22:26 +00:00
parent 4aa1dab283
commit aa47a19e98
53 changed files with 4721 additions and 77 deletions
@@ -0,0 +1,16 @@
"""data_analytics 插件本地工具集。
agent 看到这些工具时不带凭证参数,凭证由 organization 通过 ContextVar 注入。
"""
from .s3_list_objects import s3_list_objects
from .s3_peek import s3_peek
from .s3_get_object import s3_get_object
from .ray_submit import ray_submit
__all__ = [
"s3_list_objects",
"s3_peek",
"s3_get_object",
"ray_submit",
]
@@ -0,0 +1,43 @@
"""S3 工具共用辅助:从 ContextVar 拿凭证 + 解析 URI。
所有 s3_* 工具都依赖这个模块,把"明文凭证"的取用集中在一处。
"""
from __future__ import annotations
import re
from typing import Any, Dict, Tuple
def get_s3_creds_or_raise() -> Dict[str, Any]:
    """Return the plaintext S3 credentials read from ``S3_CREDS_VAR``.

    Returns:
        Whatever ``S3_CREDS_VAR.get()`` yields, when that value is truthy.

    Raises:
        RuntimeError: If ``S3_CREDS_VAR.get()`` yields a falsy value, i.e. no
            credentials were provided for the current task context.
    """
    # Imported lazily to avoid a circular import. Per the original note, this
    # relative path is a virtual package path that is injected when the
    # organization subclass is loaded.
    from ..core.organization import S3_CREDS_VAR

    current = S3_CREDS_VAR.get()
    if current:
        return current
    raise RuntimeError(
        "未提供 S3 凭证:本任务上下文中没有 cred_id,请在创建 job 时选择凭证。"
    )
def parse_s3_uri(uri: str) -> Tuple[str, str]:
    """Split an ``s3://bucket/key`` URI into a ``(bucket, key)`` pair.

    Surrounding whitespace is stripped before matching. The bucket is the
    first path segment (no ``/``) and the key is everything after the first
    ``/`` following it; both must be non-empty.

    Raises:
        ValueError: If the text does not have the ``s3://bucket/key`` shape.
    """
    matched = re.match(r"^s3://([^/]+)/(.+)$", uri.strip())
    if matched is None:
        raise ValueError(f"非法 S3 URI{uri!r}(期待 s3://bucket/key 形式)")
    bucket, key = matched.groups()
    return bucket, key
def make_session_kwargs(creds: Dict[str, Any]) -> Dict[str, Any]:
    """Translate a credentials mapping into boto3/aiobotocore client kwargs.

    Args:
        creds: Mapping with required ``access_key`` and ``secret_key`` entries
            and optional ``region`` and ``endpoint_url`` entries.

    Returns:
        A dict with ``aws_access_key_id``, ``aws_secret_access_key`` and
        ``region_name`` (``"us-east-1"`` when ``region`` is missing or falsy),
        plus ``endpoint_url`` only when a truthy one was supplied.

    Raises:
        KeyError: If ``access_key`` or ``secret_key`` is missing.
    """
    endpoint_url = creds.get("endpoint_url")
    # Only forward the endpoint when one is actually configured.
    optional: Dict[str, Any] = {"endpoint_url": endpoint_url} if endpoint_url else {}
    return {
        "aws_access_key_id": creds["access_key"],
        "aws_secret_access_key": creds["secret_key"],
        "region_name": creds.get("region") or "us-east-1",
        **optional,
    }
@@ -0,0 +1,39 @@
{
"name": "data_analytics_internal",
"version": "0.1.0",
"description": "data_analytics 插件内部工具:S3 只读 + Ray 提交。仅限本插件内部 agent 调用。",
"tools": [
{
"name": "s3_list_objects",
"file": "s3_list_objects.py",
"is_system": true,
"action_scope": ["data_analytics_internal"],
"config_args": {},
"category": "system"
},
{
"name": "s3_peek",
"file": "s3_peek.py",
"is_system": true,
"action_scope": ["data_analytics_internal"],
"config_args": {},
"category": "system"
},
{
"name": "s3_get_object",
"file": "s3_get_object.py",
"is_system": true,
"action_scope": ["data_analytics_internal"],
"config_args": {},
"category": "system"
},
{
"name": "ray_submit",
"file": "ray_submit.py",
"is_system": true,
"action_scope": ["data_analytics_internal"],
"config_args": {},
"category": "system"
}
]
}
@@ -0,0 +1,95 @@
"""ray_submit:把分析脚本提交到 Raydistributed)或 subprocessstandalone)执行。
凭证以 ``AWS_*`` 环境变量注入子进程,让 boto3/pandas-s3 自然读到。
脚本走 ``kilostar.utils.sandbox.validate_python_code`` 的静态屏蔽兜底。
"""
from __future__ import annotations
import asyncio
import os
import sys
import tempfile
from kilostar.utils.ray_compat import _STANDALONE
from kilostar.utils.sandbox import (
CodeViolation,
get_python_timeout,
validate_python_code,
)
from ._s3_common import get_s3_creds_or_raise
def _build_env(creds) -> dict:
    """Return a copy of the current process environment with AWS settings added.

    Args:
        creds: Mapping with required ``access_key`` and ``secret_key`` entries
            and optional ``region`` and ``endpoint_url`` entries.

    Returns:
        A new dict based on ``os.environ`` with ``AWS_ACCESS_KEY_ID``,
        ``AWS_SECRET_ACCESS_KEY`` and ``AWS_DEFAULT_REGION`` set
        (``"us-east-1"`` when no region is given). When a truthy
        ``endpoint_url`` is present it is exported as both
        ``AWS_ENDPOINT_URL_S3`` and ``AWS_ENDPOINT_URL``.
    """
    overrides = {
        "AWS_ACCESS_KEY_ID": creds["access_key"],
        "AWS_SECRET_ACCESS_KEY": creds["secret_key"],
        "AWS_DEFAULT_REGION": creds.get("region") or "us-east-1",
    }
    if creds.get("endpoint_url"):
        endpoint_url = creds["endpoint_url"]
        overrides.update(
            AWS_ENDPOINT_URL_S3=endpoint_url,
            AWS_ENDPOINT_URL=endpoint_url,
        )
    # Later entries win, so the AWS values override anything inherited.
    return {**os.environ, **overrides}
async def _reap_process(proc, grace: float = 5.0) -> None:
    """Kill ``proc`` if it is still running and wait briefly for it to exit."""
    if proc.returncode is not None:
        return
    try:
        proc.kill()
    except ProcessLookupError:
        # The child exited between the returncode check and the kill.
        pass
    try:
        # Bounded wait: reaping must never hang the caller.
        await asyncio.wait_for(proc.wait(), timeout=grace)
    except asyncio.TimeoutError:
        pass


async def ray_submit(script: str, timeout: int = 300) -> str:
    """Run a Python script in a child interpreter with S3 credentials in its env.

    Despite the name, this version always executes the script in a local
    subprocess (``sys.executable <tempfile>``); when ``_STANDALONE`` is falsy
    the output is merely prefixed with ``[mode: ray-cluster (subprocess)]``.

    The script may ``import boto3`` to read S3 (credentials are injected as
    ``AWS_*`` environment variables) and may use installed dependencies such
    as pandas / polars / numpy. Read-only: do not attempt put/delete.

    Args:
        script: Python source code. It is passed through
            ``validate_python_code`` before being executed.
        timeout: Timeout in seconds (default 300); the effective value is
            whatever ``get_python_timeout`` returns for it.

    Returns:
        The child's stdout, followed by a ``[stderr]`` section when stderr is
        non-empty and an ``[exit code: N]`` line when the exit code is
        non-zero; ``(no output)`` when nothing was produced. Validation
        failures, timeouts and other errors are returned as ``[Sandbox] ...``
        / ``[Error] ...`` strings instead of being raised.

    Raises:
        RuntimeError: Propagated from ``get_s3_creds_or_raise`` when no
            credentials are available (this call is outside the ``try``).
    """
    try:
        script = validate_python_code(script)
    except CodeViolation as e:
        return f"[Sandbox] {e}"

    creds = get_s3_creds_or_raise()
    env = _build_env(creds)
    timeout = get_python_timeout(timeout)

    # Both standalone and distributed modes go through a subprocess in this
    # first version so that environment-variable passing stays controllable.
    # (Running via ray.remote would need a runtime_env for env vars; the
    # complexity is comparable, but the subprocess is more transparent.)
    tmp_file = None
    proc = None
    try:
        # delete=False: the file must outlive the ``with`` so the child can
        # read it; it is removed in ``finally``.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as f:
            f.write(script)
            tmp_file = f.name

        proc = await asyncio.create_subprocess_exec(
            sys.executable,
            tmp_file,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=env,
        )
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        out = stdout.decode("utf-8", errors="replace")
        err = stderr.decode("utf-8", errors="replace")
        result = ""
        if out:
            result += out
        if err:
            result += f"\n[stderr]\n{err}"
        if proc.returncode != 0:
            result += f"\n[exit code: {proc.returncode}]"
        result = result.strip() or "(no output)"
        if not _STANDALONE:
            result = f"[mode: ray-cluster (subprocess)]\n{result}"
        return result
    except asyncio.TimeoutError:
        return f"[Error] ray_submit 执行超时({timeout}s)"
    except Exception as e:
        return f"[Error] ray_submit 失败:{e}"
    finally:
        # wait_for() only cancels the wait; without this the child would keep
        # running (holding the injected credentials) after a timeout or a
        # cancellation of this coroutine.
        if proc is not None:
            await _reap_process(proc)
        if tmp_file and os.path.exists(tmp_file):
            os.unlink(tmp_file)
@@ -0,0 +1,46 @@
"""s3_get_object:下载到 artifact 目录(路径强校验防穿越)。"""
from __future__ import annotations
import os
from pathlib import Path
from kilostar.utils.settings import get_artifact_dir
from ._s3_common import get_s3_creds_or_raise, make_session_kwargs, parse_s3_uri
def _resolve_download_target(base: Path, save_as: str) -> Path:
    """Map ``save_as`` to an absolute file path strictly inside ``base``, or raise ``ValueError``."""
    relative = Path(save_as)
    as_posix = relative.as_posix()
    # Cheap textual rejection first: absolute paths and any ``..`` segment.
    if relative.is_absolute() or as_posix.startswith("/") or ".." in as_posix.split("/"):
        raise ValueError(f"save_as 必须是相对、不含 .. 的路径,收到 {save_as!r}")

    root = base.resolve()
    target = (root / relative).resolve()
    # Resolved check catches what the textual one cannot: an empty / "." name
    # (target is the directory itself), an existing directory, a symlink that
    # points outside the root, or a drive-qualified path on Windows.
    if target == root or root not in target.parents or target.is_dir():
        raise ValueError(f"save_as 必须指向下载目录内的文件,收到 {save_as!r}")
    return target


async def s3_get_object(uri: str, save_as: str) -> str:
    """Download an S3 object into the artifact workspace and return its local path.

    The file is written below the ``data_analytics_downloads`` directory under
    ``get_artifact_dir()``. ``save_as`` is validated before any network I/O so
    that it cannot name a location outside that directory. The downloaded
    file is intended to be read afterwards (e.g. with pandas/polars) from
    python_executor / ray_submit.

    Args:
        uri: Object location in ``s3://bucket/key`` form.
        save_as: Relative file name to save under; must not be absolute, must
            not contain ``..`` and must resolve to a file inside the download
            directory.

    Returns:
        Absolute local path of the saved file.

    Raises:
        RuntimeError: If no S3 credentials are available.
        ValueError: If ``uri`` is malformed or ``save_as`` is rejected.
    """
    from aiobotocore.session import get_session

    creds = get_s3_creds_or_raise()
    bucket, key = parse_s3_uri(uri)

    base = get_artifact_dir() / "data_analytics_downloads"
    target = _resolve_download_target(base, save_as)
    base.mkdir(parents=True, exist_ok=True)
    target.parent.mkdir(parents=True, exist_ok=True)

    session = get_session()
    async with session.create_client("s3", **make_session_kwargs(creds)) as client:
        resp = await client.get_object(Bucket=bucket, Key=key)
        # NOTE(review): the whole object is buffered in memory before being
        # written; consider chunked streaming if large objects are expected.
        body = await resp["Body"].read()
    target.write_bytes(body)
    return str(target)
@@ -0,0 +1,47 @@
"""s3_list_objects:列出 bucket+prefix 下的对象列表(key/size/last_modified)。"""
from __future__ import annotations
from typing import Any, Dict, List
from ._s3_common import get_s3_creds_or_raise, make_session_kwargs
async def s3_list_objects(
    bucket: str,
    prefix: str = "",
    limit: int = 50,
) -> List[Dict[str, Any]]:
    """List objects in an S3 bucket under a key prefix, up to ``limit`` entries.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix to filter by; empty means the bucket root.
        limit: Maximum number of entries to return; clamped to 1..1000,
            default 50.

    Returns:
        A list of dicts with ``key``, ``size`` and ``last_modified`` (the
        ``LastModified`` value's ``isoformat()`` string, or ``None`` when
        absent).

    Raises:
        RuntimeError: If no S3 credentials are available.
    """
    from aiobotocore.session import get_session

    creds = get_s3_creds_or_raise()
    limit = max(1, min(int(limit), 1000))

    def _summarize(entry: Dict[str, Any]) -> Dict[str, Any]:
        # Reduce one raw listing entry to the three fields exposed to callers.
        modified = entry.get("LastModified")
        return {
            "key": entry.get("Key"),
            "size": entry.get("Size"),
            "last_modified": modified.isoformat() if modified else None,
        }

    session = get_session()
    listing: List[Dict[str, Any]] = []
    async with session.create_client("s3", **make_session_kwargs(creds)) as client:
        pages = client.get_paginator("list_objects_v2").paginate(
            Bucket=bucket,
            Prefix=prefix,
            PaginationConfig={"MaxItems": limit},
        )
        async for page in pages:
            # "Contents" may be missing or empty on a page.
            for entry in page.get("Contents") or []:
                listing.append(_summarize(entry))
                if len(listing) >= limit:
                    return listing
    return listing
@@ -0,0 +1,35 @@
"""s3_peek:读取对象的头若干字节并尝试 UTF-8 解码(看几行用)。"""
from __future__ import annotations
from ._s3_common import get_s3_creds_or_raise, make_session_kwargs, parse_s3_uri
async def s3_peek(uri: str, n_bytes: int = 4096) -> str:
    """Read the first ``n_bytes`` bytes of an S3 object and decode them as UTF-8.

    Intended for a quick preview of the first lines of text objects such as
    csv/json/log. Content that is not valid UTF-8 is reported with a
    ``[binary, ...]`` placeholder containing the first 64 bytes in hex.

    A multi-byte character that is cut in half by the byte limit is dropped
    from the end of the preview instead of making the whole text look binary.

    Args:
        uri: Object location in ``s3://bucket/key`` form.
        n_bytes: Number of bytes to read; clamped to 1..1 MiB, default 4096.

    Returns:
        The decoded text fragment, or the binary placeholder string.

    Raises:
        RuntimeError: If no S3 credentials are available.
        ValueError: If ``uri`` is malformed.
    """
    import codecs

    from aiobotocore.session import get_session

    creds = get_s3_creds_or_raise()
    bucket, key = parse_s3_uri(uri)
    n = max(1, min(int(n_bytes), 1024 * 1024))

    session = get_session()
    async with session.create_client("s3", **make_session_kwargs(creds)) as client:
        # NOTE(review): a ranged GET on a zero-length object may be rejected
        # by the server (InvalidRange) - confirm whether that should be mapped
        # to an empty preview.
        resp = await client.get_object(Bucket=bucket, Key=key, Range=f"bytes=0-{n-1}")
        body = await resp["Body"].read()

    # Fewer bytes than requested means the whole object was read, so an
    # incomplete trailing sequence is a real error (final=True). Otherwise the
    # read was truncated and an incomplete tail is expected; final=False makes
    # the decoder hold it back instead of raising.
    decoder = codecs.getincrementaldecoder("utf-8")()
    try:
        return decoder.decode(body, final=len(body) < n)
    except UnicodeDecodeError:
        return f"[binary, {len(body)} bytes; first 64 hex] {body[:64].hex()}"