Pretor/pretor/tool_plugin/web_crawler/web_crawler.py

#  Copyright 2026 zhaoxi826
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import httpx
from typing import Dict, Any

class WebCrawlerTool:
    """Fetch a web page over HTTP and return a short preview of its body.

    Failures are never raised to the caller: ``crawl`` reports them as an
    ``"error"`` entry in the returned dictionary instead.
    """

    def __init__(
        self,
        timeout: float = 10,
        *,
        preview_length: int = 500,
        follow_redirects: bool = True,
    ):
        """Configure the crawler.

        Args:
            timeout: Timeout handed to ``httpx.AsyncClient``.
            preview_length: Maximum number of characters of the response
                text returned as ``content_preview``. Negative values are
                treated as 0.
            follow_redirects: Whether redirect responses are followed. When
                disabled, a redirect response is reported as an error
                because ``raise_for_status()`` rejects non-2xx responses.
        """
        self.timeout = timeout
        self.preview_length = max(0, preview_length)
        self.follow_redirects = follow_redirects

    @staticmethod
    def _describe_error(exc: BaseException) -> str:
        """Return a non-empty, human-readable description of *exc*.

        Some exceptions (timeouts in particular) stringify to an empty
        string; fall back to the exception class name so the ``"error"``
        value is always truthy and informative.
        """
        return str(exc) or type(exc).__name__

    async def crawl(self, url: str) -> Dict[str, Any]:
        """Fetch *url* with a GET request and summarize the response.

        Args:
            url: Address of the page to fetch.

        Returns:
            On success, a dictionary with the keys ``"url"`` (the requested
            URL), ``"status_code"`` and ``"content_preview"`` (the leading
            ``preview_length`` characters of the response text).
            On any failure, a dictionary with the keys ``"url"`` and
            ``"error"`` (a non-empty description of the failure).
        """
        try:
            async with httpx.AsyncClient(
                timeout=self.timeout,
                follow_redirects=self.follow_redirects,
            ) as client:
                response = await client.get(url)
                # Non-2xx responses are turned into exceptions and reported
                # through the error branch below.
                response.raise_for_status()
                # The raw response text is returned as-is; no HTML stripping
                # or text extraction is performed here.
                return {
                    "url": url,
                    "status_code": response.status_code,
                    "content_preview": response.text[: self.preview_length],
                }
        except Exception as exc:
            # Deliberately broad: this is the tool boundary, so every failure
            # (invalid URL, network error, timeout, bad status) is reported
            # to the caller as data rather than raised.
            return {"url": url, "error": self._describe_error(exc)}