async def _search_via_bing(query: str, max_results: int) -> str:
    """Search the web via the Bing Web Search API v7.

    Requires ``settings.BING_SEARCH_API_KEY``; honors ``settings.SEARCH_PROXY``
    when set (e.g. ``http://127.0.0.1:7890``).

    Returns a JSON string:
      * ``{"results": [...], "count": N, "source": "bing"}`` on success,
      * ``{"message": ..., "results": []}`` when Bing finds nothing,
      * ``{"error": ...}`` on missing key or non-200 response.
    """
    from app.core.config import settings

    api_key = settings.BING_SEARCH_API_KEY
    if not api_key:
        return json.dumps({"error": "Bing Search API Key 未配置"}, ensure_ascii=False)

    # Optional outbound HTTP proxy for environments where the API is unreachable.
    proxy = settings.SEARCH_PROXY or None
    client_kwargs = {"timeout": 15.0}
    if proxy:
        # NOTE(review): `proxy=` is the httpx >= 0.26 spelling; older httpx
        # versions expect `proxies=` — confirm against the pinned dependency.
        client_kwargs["proxy"] = proxy

    async with httpx.AsyncClient(**client_kwargs) as client:
        resp = await client.get(
            "https://api.bing.microsoft.com/v7.0/search",
            headers={"Ocp-Apim-Subscription-Key": api_key},
            params={
                "q": query,
                # Bing caps `count` at 50; also guard against a non-positive
                # max_results, which the API rejects with HTTP 400.
                "count": max(1, min(max_results, 50)),
                "mkt": "zh-CN",
                "textFormat": "Raw",
            },
        )
        if resp.status_code != 200:
            return json.dumps(
                {"error": f"Bing 搜索失败 HTTP {resp.status_code}: {resp.text[:200]}"},
                ensure_ascii=False,
            )

        data = resp.json()
        web_pages = (data.get("webPages") or {}).get("value", [])
        if not web_pages:
            return json.dumps(
                {"message": "Bing 未找到搜索结果,请尝试更具体的关键词", "results": []},
                ensure_ascii=False,
            )

        results = [
            {
                "index": i + 1,
                "title": page.get("name", ""),
                "url": page.get("url", ""),
                "snippet": (page.get("snippet", "") or "")[:300],
            }
            for i, page in enumerate(web_pages[:max_results])
        ]
        return json.dumps(
            {"results": results, "count": len(results), "source": "bing"},
            ensure_ascii=False,
        )
async def _search_via_duckduckgo(query: str, max_results: int) -> str:
    """Search the web by scraping the DuckDuckGo HTML endpoint (no API key).

    Honors ``settings.SEARCH_PROXY`` when set. Returns a JSON string with up
    to ``max_results`` entries:
    ``{"results": [...], "count": N, "source": "duckduckgo"}``.
    """
    import html as _html
    import re as _re
    import urllib.parse
    from app.core.config import settings

    safe_q = urllib.parse.quote(query, safe="")
    url = f"https://html.duckduckgo.com/html/?q={safe_q}"

    proxy = settings.SEARCH_PROXY or None
    client_kwargs = {"timeout": 15.0, "follow_redirects": True}
    if proxy:
        client_kwargs["proxy"] = proxy

    async with httpx.AsyncClient(**client_kwargs) as client:
        resp = await client.get(url, headers={
            # Browser-like UA: DuckDuckGo serves a degraded/blocked page to
            # unknown user agents.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })
        if resp.status_code != 200:
            return json.dumps(
                {"error": f"DuckDuckGo 搜索失败 HTTP {resp.status_code}"},
                ensure_ascii=False,
            )
        page = resp.text

    # NOTE(review): these patterns were reconstructed from a garbled source;
    # they are inherently brittle against DuckDuckGo markup changes — verify
    # against a live response. Primary pattern: title anchor + snippet anchor.
    hits = _re.findall(
        r'<a[^>]*class="result__a"[^>]*href="([^"]*)"[^>]*>(.*?)</a>'
        r'.*?<a[^>]*class="result__snippet"[^>]*>(.*?)</a>',
        page, _re.DOTALL | _re.IGNORECASE,
    )
    if not hits:
        # Fallback pattern: URL anchor first, then title anchor, then snippet.
        hits = _re.findall(
            r'<a[^>]*rel="nofollow"[^>]*class="result__url"[^>]*href="([^"]*)"[^>]*>'
            r'.*?<a[^>]*class="result__a"[^>]*>(.*?)</a>'
            r'.*?<a[^>]*class="result__snippet"[^>]*>(.*?)</a>',
            page, _re.DOTALL | _re.IGNORECASE,
        )

    results = []
    for i, (link, title, snippet) in enumerate(hits[:max_results]):
        # Strip inline markup (<b> highlights etc.) and decode HTML entities
        # such as &amp; so the LLM sees clean text.
        title_clean = _html.unescape(_re.sub(r"<.*?>", "", title)).strip()
        snippet_clean = _html.unescape(_re.sub(r"<.*?>", "", snippet)).strip()
        results.append({
            "index": i + 1,
            "title": title_clean,
            "url": link,
            "snippet": snippet_clean[:300],
        })

    if not results:
        return json.dumps(
            {"message": "DuckDuckGo 未找到搜索结果,请尝试更具体的关键词", "results": []},
            ensure_ascii=False,
        )

    return json.dumps(
        {"results": results, "count": len(results), "source": "duckduckgo"},
        ensure_ascii=False,
    )


async def web_search_tool(query: str, max_results: int = 8) -> str:
    """Search the web: Bing API first (when configured), DuckDuckGo fallback.

    Both backends honor ``settings.SEARCH_PROXY``. Always returns a JSON
    string; on total failure the payload is ``{"error": ...}`` rather than
    an exception, so the tool-calling loop never crashes.
    """
    from app.core.config import settings

    # 1. Prefer the Bing Search API when a key is configured.
    if settings.BING_SEARCH_API_KEY:
        try:
            result = await _search_via_bing(query, max_results)
            parsed = json.loads(result)
            # Non-empty result list → done. Empty list or an error payload
            # (no "results" key) both degrade to DuckDuckGo below.
            if parsed.get("results"):
                return result
            logger.info("Bing 搜索无结果,降级到 DuckDuckGo")
        except Exception as e:
            logger.warning("Bing 搜索异常,降级到 DuckDuckGo: %s", e)

    # 2. Fallback: DuckDuckGo HTML scrape.
    try:
        return await _search_via_duckduckgo(query, max_results)
    except Exception as e:
        # Lazy %-style args: do not build the message unless the record is emitted.
        logger.error("web_search 全部后端失败: %s", e)
        # Original had a stray f-prefix on a placeholder-free string; plain literal.
        return json.dumps(
            {"error": "搜索失败: 所有搜索后端均不可用(Bing/DuckDuckGo),请检查网络或配置 SEARCH_PROXY"},
            ensure_ascii=False,
        )