存档

2026-07-01 09:22:26 +00:00
parent 4aa1dab283
commit aa47a19e98
53 changed files with 4721 additions and 77 deletions
@@ -0,0 +1,16 @@
+"""data_analytics 插件本地工具集。
+
+agent 看到这些工具时不带凭证参数，凭证由 organization 通过 ContextVar 注入。
+"""
+
+from .s3_list_objects import s3_list_objects
+from .s3_peek import s3_peek
+from .s3_get_object import s3_get_object
+from .ray_submit import ray_submit
+
+__all__ = [
+    "s3_list_objects",
+    "s3_peek",
+    "s3_get_object",
+    "ray_submit",
+]
@@ -0,0 +1,43 @@
+"""S3 工具共用辅助：从 ContextVar 拿凭证 + 解析 URI。
+
+所有 s3_* 工具都依赖这个模块，把"明文凭证"的取用集中在一处。
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Dict, Tuple
+
+
+def get_s3_creds_or_raise() -> Dict[str, Any]:
+    """从 organization 注入的 ContextVar 中取出明文凭证；未注入则抛错。"""
+    # 延迟 import 避免循环；这里走 organization 子类被加载时注入的虚拟包路径
+    from ..core.organization import S3_CREDS_VAR
+
+    creds = S3_CREDS_VAR.get()
+    if not creds:
+        raise RuntimeError(
+            "未提供 S3 凭证：本任务上下文中没有 cred_id，请在创建 job 时选择凭证。"
+        )
+    return creds
+
+
+def parse_s3_uri(uri: str) -> Tuple[str, str]:
+    """解析 ``s3://bucket/key`` → ``(bucket, key)``；非法格式抛 ValueError。"""
+    m = re.match(r"^s3://([^/]+)/(.+)$", uri.strip())
+    if not m:
+        raise ValueError(f"非法 S3 URI：{uri!r}（期待 s3://bucket/key 形式）")
+    return m.group(1), m.group(2)
+
+
+def make_session_kwargs(creds: Dict[str, Any]) -> Dict[str, Any]:
+    """转 boto3/aiobotocore client 调用所需的 kwargs。"""
+    kw: Dict[str, Any] = {
+        "aws_access_key_id": creds["access_key"],
+        "aws_secret_access_key": creds["secret_key"],
+        "region_name": creds.get("region") or "us-east-1",
+    }
+    endpoint = creds.get("endpoint_url")
+    if endpoint:
+        kw["endpoint_url"] = endpoint
+    return kw
@@ -0,0 +1,39 @@
+{
+  "name": "data_analytics_internal",
+  "version": "0.1.0",
+  "description": "data_analytics 插件内部工具：S3 只读 + Ray 提交。仅限本插件内部 agent 调用。",
+  "tools": [
+    {
+      "name": "s3_list_objects",
+      "file": "s3_list_objects.py",
+      "is_system": true,
+      "action_scope": ["data_analytics_internal"],
+      "config_args": {},
+      "category": "system"
+    },
+    {
+      "name": "s3_peek",
+      "file": "s3_peek.py",
+      "is_system": true,
+      "action_scope": ["data_analytics_internal"],
+      "config_args": {},
+      "category": "system"
+    },
+    {
+      "name": "s3_get_object",
+      "file": "s3_get_object.py",
+      "is_system": true,
+      "action_scope": ["data_analytics_internal"],
+      "config_args": {},
+      "category": "system"
+    },
+    {
+      "name": "ray_submit",
+      "file": "ray_submit.py",
+      "is_system": true,
+      "action_scope": ["data_analytics_internal"],
+      "config_args": {},
+      "category": "system"
+    }
+  ]
+}
@@ -0,0 +1,95 @@
+"""ray_submit：把分析脚本提交到 Ray（distributed）或 subprocess（standalone）执行。
+
+凭证以 ``AWS_*`` 环境变量注入子进程，让 boto3/pandas-s3 自然读到。
+脚本走 ``kilostar.utils.sandbox.validate_python_code`` 的静态屏蔽兜底。
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+import sys
+import tempfile
+
+from kilostar.utils.ray_compat import _STANDALONE
+from kilostar.utils.sandbox import (
+    CodeViolation,
+    get_python_timeout,
+    validate_python_code,
+)
+
+from ._s3_common import get_s3_creds_or_raise
+
+
+def _build_env(creds) -> dict:
+    env = os.environ.copy()
+    env["AWS_ACCESS_KEY_ID"] = creds["access_key"]
+    env["AWS_SECRET_ACCESS_KEY"] = creds["secret_key"]
+    env["AWS_DEFAULT_REGION"] = creds.get("region") or "us-east-1"
+    if creds.get("endpoint_url"):
+        env["AWS_ENDPOINT_URL_S3"] = creds["endpoint_url"]
+        env["AWS_ENDPOINT_URL"] = creds["endpoint_url"]
+    return env
+
+
+async def ray_submit(script: str, timeout: int = 300) -> str:
+    """提交 Python 脚本到 Ray（分布式）或子进程（单机）执行。
+
+    脚本中可直接 ``import boto3`` 读 S3（凭证已通过环境变量注入）；可用
+    pandas / polars / numpy 等已安装的依赖。**只读**——不要尝试 put/delete。
+
+    Args:
+        script: Python 源码
+        timeout: 超时秒数（默认 300）
+
+    Returns:
+        stdout（必要时尾部追加 stderr 与 exit code）
+    """
+    try:
+        script = validate_python_code(script)
+    except CodeViolation as e:
+        return f"[Sandbox] {e}"
+
+    creds = get_s3_creds_or_raise()
+    env = _build_env(creds)
+    timeout = get_python_timeout(timeout)
+
+    # standalone 与 distributed 第一版都走 subprocess，保证环境变量传递可控
+    # （ray.remote 跑函数时 env vars 需另装 runtime_env，复杂度跟 subprocess 持平
+    #  但前者透明可控，先这样落地）
+    tmp_file = None
+    try:
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".py", delete=False, encoding="utf-8"
+        ) as f:
+            f.write(script)
+            tmp_file = f.name
+
+        proc = await asyncio.create_subprocess_exec(
+            sys.executable,
+            tmp_file,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+            env=env,
+        )
+        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
+        out = stdout.decode("utf-8", errors="replace")
+        err = stderr.decode("utf-8", errors="replace")
+        result = ""
+        if out:
+            result += out
+        if err:
+            result += f"\n[stderr]\n{err}"
+        if proc.returncode != 0:
+            result += f"\n[exit code: {proc.returncode}]"
+        result = result.strip() or "(no output)"
+        if not _STANDALONE:
+            result = f"[mode: ray-cluster (subprocess)]\n{result}"
+        return result
+    except asyncio.TimeoutError:
+        return f"[Error] ray_submit 执行超时（{timeout}s）"
+    except Exception as e:
+        return f"[Error] ray_submit 失败：{e}"
+    finally:
+        if tmp_file and os.path.exists(tmp_file):
+            os.unlink(tmp_file)
@@ -0,0 +1,46 @@
+"""s3_get_object：下载到 artifact 目录（路径强校验防穿越）。"""
+
+from __future__ import annotations
+
+import os
+from pathlib import Path
+
+from kilostar.utils.settings import get_artifact_dir
+
+from ._s3_common import get_s3_creds_or_raise, make_session_kwargs, parse_s3_uri
+
+
+async def s3_get_object(uri: str, save_as: str) -> str:
+    """把 S3 对象下载到本进程的 artifact 工作区，返回本地绝对路径。
+
+    ``save_as`` 必须是相对路径，落到 ``data/artifact/data_analytics_downloads/``
+    下面（防越权写入任意目录）。下载后供 python_executor / ray_submit 中以
+    pandas/polars 读取。
+
+    Args:
+        uri: 形如 ``s3://bucket/key`` 的对象路径
+        save_as: 保存的相对文件名（不能含 ``..`` 或绝对路径）
+
+    Returns:
+        本地保存的绝对路径
+    """
+    from aiobotocore.session import get_session
+
+    creds = get_s3_creds_or_raise()
+    bucket, key = parse_s3_uri(uri)
+
+    save_path = Path(save_as).as_posix()
+    if save_path.startswith("/") or ".." in save_path.split("/"):
+        raise ValueError(f"save_as 必须是相对、不含 .. 的路径，收到 {save_as!r}")
+
+    base = get_artifact_dir() / "data_analytics_downloads"
+    base.mkdir(parents=True, exist_ok=True)
+    target = base / save_path
+    target.parent.mkdir(parents=True, exist_ok=True)
+
+    session = get_session()
+    async with session.create_client("s3", **make_session_kwargs(creds)) as client:
+        resp = await client.get_object(Bucket=bucket, Key=key)
+        body = await resp["Body"].read()
+    target.write_bytes(body)
+    return str(target.resolve())
@@ -0,0 +1,47 @@
+"""s3_list_objects：列出 bucket+prefix 下的对象列表（key/size/last_modified）。"""
+
+from __future__ import annotations
+
+from typing import Any, Dict, List
+
+from ._s3_common import get_s3_creds_or_raise, make_session_kwargs
+
+
+async def s3_list_objects(
+    bucket: str,
+    prefix: str = "",
+    limit: int = 50,
+) -> List[Dict[str, Any]]:
+    """列出 S3 bucket 下指定 prefix 的对象（最多 limit 条）。
+
+    Args:
+        bucket: S3 bucket 名
+        prefix: 对象 key 前缀，留空表示根路径
+        limit: 最多返回条数（1-1000），默认 50
+
+    Returns:
+        对象信息列表，每项含 key / size / last_modified（ISO 字符串）
+    """
+    from aiobotocore.session import get_session
+
+    creds = get_s3_creds_or_raise()
+    limit = max(1, min(int(limit), 1000))
+
+    session = get_session()
+    out: List[Dict[str, Any]] = []
+    async with session.create_client("s3", **make_session_kwargs(creds)) as client:
+        paginator = client.get_paginator("list_objects_v2")
+        async for page in paginator.paginate(
+            Bucket=bucket, Prefix=prefix, PaginationConfig={"MaxItems": limit}
+        ):
+            for item in page.get("Contents", []) or []:
+                out.append({
+                    "key": item.get("Key"),
+                    "size": item.get("Size"),
+                    "last_modified": (
+                        item["LastModified"].isoformat() if item.get("LastModified") else None
+                    ),
+                })
+                if len(out) >= limit:
+                    return out
+    return out
@@ -0,0 +1,35 @@
+"""s3_peek：读取对象的头若干字节并尝试 UTF-8 解码（看几行用）。"""
+
+from __future__ import annotations
+
+from ._s3_common import get_s3_creds_or_raise, make_session_kwargs, parse_s3_uri
+
+
+async def s3_peek(uri: str, n_bytes: int = 4096) -> str:
+    """读取 S3 对象的头 ``n_bytes`` 字节，UTF-8 解码后返回。
+
+    适合快速预览 csv/json/log 等文本类对象的开头几行。二进制内容会以
+    ``[binary, ...]`` 占位说明返回。
+
+    Args:
+        uri: 形如 ``s3://bucket/key`` 的对象路径
+        n_bytes: 读取字节数，默认 4096，上限 1MB
+
+    Returns:
+        对象内容片段（解码后的字符串或占位说明）
+    """
+    from aiobotocore.session import get_session
+
+    creds = get_s3_creds_or_raise()
+    bucket, key = parse_s3_uri(uri)
+    n = max(1, min(int(n_bytes), 1024 * 1024))
+
+    session = get_session()
+    async with session.create_client("s3", **make_session_kwargs(creds)) as client:
+        resp = await client.get_object(Bucket=bucket, Key=key, Range=f"bytes=0-{n-1}")
+        body = await resp["Body"].read()
+    try:
+        text = body.decode("utf-8")
+        return text
+    except UnicodeDecodeError:
+        return f"[binary, {len(body)} bytes; first 64 hex] {body[:64].hex()}"