存档
This commit is contained in:
@@ -0,0 +1,16 @@
|
||||
"""data_analytics 插件本地工具集。
|
||||
|
||||
agent 看到这些工具时不带凭证参数,凭证由 organization 通过 ContextVar 注入。
|
||||
"""
|
||||
|
||||
from .s3_list_objects import s3_list_objects
|
||||
from .s3_peek import s3_peek
|
||||
from .s3_get_object import s3_get_object
|
||||
from .ray_submit import ray_submit
|
||||
|
||||
__all__ = [
|
||||
"s3_list_objects",
|
||||
"s3_peek",
|
||||
"s3_get_object",
|
||||
"ray_submit",
|
||||
]
|
||||
@@ -0,0 +1,43 @@
|
||||
"""S3 工具共用辅助:从 ContextVar 拿凭证 + 解析 URI。
|
||||
|
||||
所有 s3_* 工具都依赖这个模块,把"明文凭证"的取用集中在一处。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
|
||||
def get_s3_creds_or_raise() -> Dict[str, Any]:
    """Return the plaintext S3 credentials injected for the current context.

    The credentials are read from the ``S3_CREDS_VAR`` context variable
    exported by ``..core.organization``.

    Returns:
        The credential mapping currently stored in the context variable.

    Raises:
        RuntimeError: If no credentials were injected, i.e. the variable is
            unset or holds a falsy value.
    """
    # Imported lazily to avoid a circular import; the path goes through the
    # virtual package that is injected when the organization subclass loads.
    from ..core.organization import S3_CREDS_VAR

    # An explicit default keeps an unset ContextVar (one declared without a
    # default) from raising LookupError, so callers always receive the
    # actionable RuntimeError below instead.
    creds = S3_CREDS_VAR.get(None)
    if not creds:
        raise RuntimeError(
            "未提供 S3 凭证:本任务上下文中没有 cred_id,请在创建 job 时选择凭证。"
        )
    return creds
|
||||
|
||||
|
||||
def parse_s3_uri(uri: str) -> Tuple[str, str]:
    """Split an ``s3://bucket/key`` URI into ``(bucket, key)``.

    Surrounding whitespace is stripped before parsing. The bucket is
    everything up to the first ``/`` after the scheme; the key is the
    (non-empty) remainder and may itself contain ``/``.

    Args:
        uri: Object URI of the form ``s3://bucket/key``.

    Returns:
        A ``(bucket, key)`` tuple.

    Raises:
        ValueError: If ``uri`` is not a string or does not have the
            ``s3://bucket/key`` shape (missing scheme, bucket or key).
    """
    # Non-string input used to fail with AttributeError on ``.strip()``;
    # report it through the documented ValueError instead.
    if not isinstance(uri, str):
        raise ValueError(f"非法 S3 URI:{uri!r}(期待 s3://bucket/key 形式)")

    # DOTALL lets the key part contain newline characters, which are legal
    # (if unusual) in S3 object keys and were previously rejected.
    m = re.fullmatch(r"s3://([^/]+)/(.+)", uri.strip(), flags=re.DOTALL)
    if not m:
        raise ValueError(f"非法 S3 URI:{uri!r}(期待 s3://bucket/key 形式)")
    return m.group(1), m.group(2)
|
||||
|
||||
|
||||
def make_session_kwargs(creds: Dict[str, Any]) -> Dict[str, Any]:
    """Translate a credential mapping into boto3/aiobotocore client kwargs.

    Args:
        creds: Mapping with required ``access_key`` and ``secret_key``
            entries and optional ``region`` / ``endpoint_url`` entries.

    Returns:
        A dict with ``aws_access_key_id``, ``aws_secret_access_key`` and
        ``region_name`` (falling back to ``us-east-1`` when the region is
        missing or empty), plus ``endpoint_url`` when one is configured.

    Raises:
        KeyError: If ``access_key`` or ``secret_key`` is missing.
    """
    session_kwargs: Dict[str, Any] = dict(
        aws_access_key_id=creds["access_key"],
        aws_secret_access_key=creds["secret_key"],
    )
    session_kwargs["region_name"] = creds.get("region") or "us-east-1"

    # Only forward a custom endpoint when one is actually set.
    custom_endpoint = creds.get("endpoint_url")
    if custom_endpoint:
        session_kwargs["endpoint_url"] = custom_endpoint
    return session_kwargs
|
||||
@@ -0,0 +1,39 @@
|
||||
{
|
||||
"name": "data_analytics_internal",
|
||||
"version": "0.1.0",
|
||||
"description": "data_analytics 插件内部工具:S3 只读 + Ray 提交。仅限本插件内部 agent 调用。",
|
||||
"tools": [
|
||||
{
|
||||
"name": "s3_list_objects",
|
||||
"file": "s3_list_objects.py",
|
||||
"is_system": true,
|
||||
"action_scope": ["data_analytics_internal"],
|
||||
"config_args": {},
|
||||
"category": "system"
|
||||
},
|
||||
{
|
||||
"name": "s3_peek",
|
||||
"file": "s3_peek.py",
|
||||
"is_system": true,
|
||||
"action_scope": ["data_analytics_internal"],
|
||||
"config_args": {},
|
||||
"category": "system"
|
||||
},
|
||||
{
|
||||
"name": "s3_get_object",
|
||||
"file": "s3_get_object.py",
|
||||
"is_system": true,
|
||||
"action_scope": ["data_analytics_internal"],
|
||||
"config_args": {},
|
||||
"category": "system"
|
||||
},
|
||||
{
|
||||
"name": "ray_submit",
|
||||
"file": "ray_submit.py",
|
||||
"is_system": true,
|
||||
"action_scope": ["data_analytics_internal"],
|
||||
"config_args": {},
|
||||
"category": "system"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,95 @@
|
||||
"""ray_submit:把分析脚本提交到 Ray(distributed)或 subprocess(standalone)执行。
|
||||
|
||||
凭证以 ``AWS_*`` 环境变量注入子进程,让 boto3/pandas-s3 自然读到。
|
||||
脚本走 ``kilostar.utils.sandbox.validate_python_code`` 的静态屏蔽兜底。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
from kilostar.utils.ray_compat import _STANDALONE
|
||||
from kilostar.utils.sandbox import (
|
||||
CodeViolation,
|
||||
get_python_timeout,
|
||||
validate_python_code,
|
||||
)
|
||||
|
||||
from ._s3_common import get_s3_creds_or_raise
|
||||
|
||||
|
||||
def _build_env(creds) -> dict:
    """Build the environment for the analysis subprocess.

    Starts from a copy of the current process environment and overlays the
    S3 credentials as ``AWS_*`` variables so that AWS SDKs running in the
    child pick them up without any explicit configuration.

    Args:
        creds: Mapping with required ``access_key`` / ``secret_key`` entries
            and optional ``region`` / ``endpoint_url`` entries.

    Returns:
        A new dict suitable for the ``env`` argument of a subprocess call;
        ``os.environ`` itself is not modified.

    Raises:
        KeyError: If ``access_key`` or ``secret_key`` is missing.
    """
    env = os.environ.copy()

    # A session token inherited from the parent process belongs to the
    # parent's own credentials. Left in place, the SDK would send it together
    # with the injected static key pair and the request would be rejected.
    for stale_var in ("AWS_SESSION_TOKEN", "AWS_SECURITY_TOKEN"):
        env.pop(stale_var, None)

    env["AWS_ACCESS_KEY_ID"] = creds["access_key"]
    env["AWS_SECRET_ACCESS_KEY"] = creds["secret_key"]
    env["AWS_DEFAULT_REGION"] = creds.get("region") or "us-east-1"

    endpoint_url = creds.get("endpoint_url")
    if endpoint_url:
        # Set both the S3-specific and the global override so S3-compatible
        # stores are reached regardless of which variable the SDK consults.
        env["AWS_ENDPOINT_URL_S3"] = endpoint_url
        env["AWS_ENDPOINT_URL"] = endpoint_url
    return env
|
||||
|
||||
|
||||
async def ray_submit(script: str, timeout: int = 300) -> str:
    """Run a Python analysis script in a child interpreter and return its output.

    The script may ``import boto3`` and read from S3 directly (credentials
    are injected through environment variables) and may use whatever
    dependencies are installed, e.g. pandas / polars / numpy. Access is meant
    to be **read-only** -- do not attempt put/delete operations.

    Args:
        script: Python source code to execute.
        timeout: Requested timeout in seconds (default 300); the effective
            value is whatever ``get_python_timeout`` returns for it.

    Returns:
        The script's stdout, followed by a ``[stderr]`` section and an
        ``[exit code: N]`` line when applicable, or ``(no output)``. Sandbox
        rejections, timeouts and execution failures are reported as
        ``[Sandbox] ...`` / ``[Error] ...`` strings rather than raised.

    Raises:
        RuntimeError: Propagated from ``get_s3_creds_or_raise`` when no S3
            credentials are available in the current context.
    """
    try:
        script = validate_python_code(script)
    except CodeViolation as e:
        return f"[Sandbox] {e}"

    creds = get_s3_creds_or_raise()
    env = _build_env(creds)
    timeout = get_python_timeout(timeout)

    # In this first version both standalone and distributed mode go through a
    # subprocess so that environment-variable passing stays under control
    # (running via ray.remote would need a runtime_env for the env vars, which
    # is about as complex as a subprocess but less transparent).
    tmp_file = None
    proc = None
    try:
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".py", delete=False, encoding="utf-8"
        ) as f:
            f.write(script)
            tmp_file = f.name

        proc = await asyncio.create_subprocess_exec(
            sys.executable,
            tmp_file,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=env,
        )
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
        out = stdout.decode("utf-8", errors="replace")
        err = stderr.decode("utf-8", errors="replace")

        result = ""
        if out:
            result += out
        if err:
            result += f"\n[stderr]\n{err}"
        if proc.returncode != 0:
            result += f"\n[exit code: {proc.returncode}]"
        result = result.strip() or "(no output)"
        if not _STANDALONE:
            result = f"[mode: ray-cluster (subprocess)]\n{result}"
        return result
    except asyncio.TimeoutError:
        return f"[Error] ray_submit 执行超时({timeout}s)"
    except Exception as e:
        return f"[Error] ray_submit 失败:{e}"
    finally:
        # wait_for() only cancels communicate(); it does not stop the child.
        # Kill and reap it on timeout, cancellation or any other early exit so
        # the script does not keep running with credentials in its env.
        if proc is not None and proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass  # The child exited between the check and the kill.
            try:
                # Bounded wait: never let cleanup itself hang the caller.
                await asyncio.wait_for(proc.wait(), timeout=5)
            except asyncio.TimeoutError:
                pass
        if tmp_file and os.path.exists(tmp_file):
            os.unlink(tmp_file)
|
||||
@@ -0,0 +1,46 @@
|
||||
"""s3_get_object:下载到 artifact 目录(路径强校验防穿越)。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from kilostar.utils.settings import get_artifact_dir
|
||||
|
||||
from ._s3_common import get_s3_creds_or_raise, make_session_kwargs, parse_s3_uri
|
||||
|
||||
|
||||
async def s3_get_object(uri: str, save_as: str) -> str:
    """Download an S3 object into the artifact workspace and return its path.

    The file is written below the ``data_analytics_downloads`` directory
    inside the directory returned by ``get_artifact_dir()``. ``save_as`` must
    be a relative path that stays inside that directory, which prevents
    writes to arbitrary locations. The downloaded file can then be read with
    pandas/polars from python_executor / ray_submit.

    Args:
        uri: Object location of the form ``s3://bucket/key``.
        save_as: Relative file name to save under (no ``..`` components, not
            absolute, not empty, and not an existing directory).

    Returns:
        The absolute local path of the saved file.

    Raises:
        RuntimeError: If no S3 credentials are available.
        ValueError: If ``uri`` is malformed or ``save_as`` is not an
            acceptable relative file path.
    """
    from aiobotocore.session import get_session

    creds = get_s3_creds_or_raise()
    bucket, key = parse_s3_uri(uri)

    # Cheap lexical validation first: reject empty names, absolute paths and
    # any ``..`` component before touching the filesystem.
    save_path = Path(save_as).as_posix()
    if (
        save_path in ("", ".")
        or Path(save_as).is_absolute()
        or save_path.startswith("/")
        or ".." in save_path.split("/")
    ):
        raise ValueError(f"save_as 必须是相对、不含 .. 的路径,收到 {save_as!r}")

    base = get_artifact_dir() / "data_analytics_downloads"
    base.mkdir(parents=True, exist_ok=True)
    target = base / save_path

    # Containment check on resolved paths: a symlinked sub-directory inside
    # the download root could otherwise redirect the write outside of it.
    try:
        target.resolve().relative_to(base.resolve())
    except ValueError:
        raise ValueError(
            f"save_as 必须是相对、不含 .. 的路径,收到 {save_as!r}"
        ) from None

    # Fail before downloading if the target names an existing directory;
    # otherwise the write would only fail after the transfer completed.
    if target.is_dir():
        raise ValueError(f"save_as 必须是相对、不含 .. 的路径,收到 {save_as!r}")

    target.parent.mkdir(parents=True, exist_ok=True)

    session = get_session()
    async with session.create_client("s3", **make_session_kwargs(creds)) as client:
        resp = await client.get_object(Bucket=bucket, Key=key)
        # The body stream must be consumed while the client is still open.
        body = await resp["Body"].read()

    target.write_bytes(body)
    return str(target.resolve())
|
||||
@@ -0,0 +1,47 @@
|
||||
"""s3_list_objects:列出 bucket+prefix 下的对象列表(key/size/last_modified)。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from ._s3_common import get_s3_creds_or_raise, make_session_kwargs
|
||||
|
||||
|
||||
async def s3_list_objects(
    bucket: str,
    prefix: str = "",
    limit: int = 50,
) -> List[Dict[str, Any]]:
    """List objects in an S3 bucket under a prefix, returning at most ``limit``.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Object key prefix; empty means the bucket root.
        limit: Maximum number of entries to return. Values are clamped to
            the range 1-1000; the default is 50.

    Returns:
        A list of dicts, each with ``key``, ``size`` and ``last_modified``
        (ISO-8601 string, or ``None`` when the listing carries no timestamp).

    Raises:
        RuntimeError: If no S3 credentials are available.
    """
    from aiobotocore.session import get_session

    creds = get_s3_creds_or_raise()
    limit = max(1, min(int(limit), 1000))

    session = get_session()
    out: List[Dict[str, Any]] = []
    async with session.create_client("s3", **make_session_kwargs(creds)) as client:
        paginator = client.get_paginator("list_objects_v2")
        # PageSize caps what each request asks S3 for; with MaxItems alone the
        # service would still return a full default-size page that is then
        # truncated client-side.
        pagination = {"MaxItems": limit, "PageSize": limit}
        async for page in paginator.paginate(
            Bucket=bucket, Prefix=prefix, PaginationConfig=pagination
        ):
            # "Contents" is absent when a page has no matching keys.
            for item in page.get("Contents", []) or []:
                modified = item.get("LastModified")
                out.append({
                    "key": item.get("Key"),
                    "size": item.get("Size"),
                    "last_modified": modified.isoformat() if modified else None,
                })
                # Defensive stop in case more than ``limit`` items arrive.
                if len(out) >= limit:
                    return out
    return out
|
||||
@@ -0,0 +1,35 @@
|
||||
"""s3_peek:读取对象的头若干字节并尝试 UTF-8 解码(看几行用)。"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from ._s3_common import get_s3_creds_or_raise, make_session_kwargs, parse_s3_uri
|
||||
|
||||
|
||||
async def s3_peek(uri: str, n_bytes: int = 4096) -> str:
    """Read the first ``n_bytes`` bytes of an S3 object and decode as UTF-8.

    Intended for a quick look at the beginning of text-like objects such as
    csv/json/log files. Content that is not valid UTF-8 is reported as a
    ``[binary, ...]`` placeholder containing a short hex dump.

    Args:
        uri: Object location of the form ``s3://bucket/key``.
        n_bytes: Number of bytes to read. Defaults to 4096 and is clamped to
            the range 1 byte - 1 MiB.

    Returns:
        The decoded text fragment, an empty string for an empty object, or
        the binary placeholder description.

    Raises:
        RuntimeError: If no S3 credentials are available.
        ValueError: If ``uri`` is not a valid ``s3://bucket/key`` URI.
    """
    import codecs

    from aiobotocore.session import get_session

    creds = get_s3_creds_or_raise()
    bucket, key = parse_s3_uri(uri)
    n = max(1, min(int(n_bytes), 1024 * 1024))

    session = get_session()
    async with session.create_client("s3", **make_session_kwargs(creds)) as client:
        try:
            resp = await client.get_object(
                Bucket=bucket, Key=key, Range=f"bytes=0-{n-1}"
            )
        except client.exceptions.ClientError as exc:
            # A zero-byte object cannot satisfy any byte range, so the service
            # rejects the request with InvalidRange; that is simply an empty
            # preview, not a failure.
            if exc.response.get("Error", {}).get("Code") == "InvalidRange":
                return ""
            raise
        body = await resp["Body"].read()

    # When the read filled the whole window the object was most likely cut
    # mid-stream, possibly in the middle of a multi-byte UTF-8 character.
    # Decoding with final=False drops such an incomplete trailing sequence
    # instead of misreporting valid text as binary; genuinely invalid bytes
    # still raise UnicodeDecodeError.
    truncated = len(body) >= n
    try:
        decoder = codecs.getincrementaldecoder("utf-8")()
        return decoder.decode(body, final=not truncated)
    except UnicodeDecodeError:
        return f"[binary, {len(body)} bytes; first 64 hex] {body[:64].hex()}"
|
||||
Reference in New Issue
Block a user