security: fix SSRF vulnerabilities (#6682)

2024-07-25 20:50:26 +08:00
parent c5ac004f15
commit 79cb23e8ac
3 changed files with 13 additions and 28 deletions
--- a/api/core/tools/utils/web_reader_tool.py
+++ b/api/core/tools/utils/web_reader_tool.py
@@ -11,11 +11,10 @@ from contextlib import contextmanager
 from urllib.parse import unquote

 import cloudscraper
-import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
-from newspaper import Article
 from regex import regex

+from core.helper import ssrf_proxy
 from core.rag.extractor import extract_processor
 from core.rag.extractor.extract_processor import ExtractProcessor

@@ -45,7 +44,7 @@ def get_url(url: str, user_agent: str = None) -> str:

    main_content_type = None
    supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    response = ssrf_proxy.head(url, headers=headers, follow_redirects=True, timeout=(5, 10))

    if response.status_code == 200:
        # check content-type
@@ -67,10 +66,11 @@ def get_url(url: str, user_agent: str = None) -> str:
        if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
            return ExtractProcessor.load_from_url(url, return_text=True)

-        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
    elif response.status_code == 403:
        scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        scraper.perform_request = ssrf_proxy.make_request
+        response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))

    if response.status_code != 200:
        return "URL returned status code {}.".format(response.status_code)
@@ -78,7 +78,7 @@ def get_url(url: str, user_agent: str = None) -> str:
    a = extract_using_readabilipy(response.text)

    if not a['plain_text'] or not a['plain_text'].strip():
-        return get_url_from_newspaper3k(url)
+        return ''

    res = FULL_TEMPLATE.format(
        title=a['title'],
@@ -91,23 +91,6 @@ def get_url(url: str, user_agent: str = None) -> str:
    return res


-def get_url_from_newspaper3k(url: str) -> str:
-
-    a = Article(url)
-    a.download()
-    a.parse()
-
-    res = FULL_TEMPLATE.format(
-        title=a.title,
-        authors=a.authors,
-        publish_date=a.publish_date,
-        top_image=a.top_image,
-        text=a.text,
-    )
-
-    return res
-
-
 def extract_using_readabilipy(html):
    with tempfile.NamedTemporaryFile(delete=False, mode='w+') as f_html:
        f_html.write(html)