security/SSRF vulns (#6682)

This commit is contained in:
Yeuoly
2024-07-25 20:50:26 +08:00
committed by GitHub
parent c5ac004f15
commit 79cb23e8ac
3 changed files with 13 additions and 28 deletions

View File

@@ -11,11 +11,10 @@ from contextlib import contextmanager
from urllib.parse import unquote
import cloudscraper
import requests
from bs4 import BeautifulSoup, CData, Comment, NavigableString
from newspaper import Article
from regex import regex
from core.helper import ssrf_proxy
from core.rag.extractor import extract_processor
from core.rag.extractor.extract_processor import ExtractProcessor
@@ -45,7 +44,7 @@ def get_url(url: str, user_agent: str = None) -> str:
main_content_type = None
supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
response = ssrf_proxy.head(url, headers=headers, follow_redirects=True, timeout=(5, 10))
if response.status_code == 200:
# check content-type
@@ -67,10 +66,11 @@ def get_url(url: str, user_agent: str = None) -> str:
if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
return ExtractProcessor.load_from_url(url, return_text=True)
response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
elif response.status_code == 403:
scraper = cloudscraper.create_scraper()
response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
scraper.perform_request = ssrf_proxy.make_request
response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
if response.status_code != 200:
return "URL returned status code {}.".format(response.status_code)
@@ -78,7 +78,7 @@ def get_url(url: str, user_agent: str = None) -> str:
a = extract_using_readabilipy(response.text)
if not a['plain_text'] or not a['plain_text'].strip():
return get_url_from_newspaper3k(url)
return ''
res = FULL_TEMPLATE.format(
title=a['title'],
@@ -91,23 +91,6 @@ def get_url(url: str, user_agent: str = None) -> str:
return res
def get_url_from_newspaper3k(url: str) -> str:
a = Article(url)
a.download()
a.parse()
res = FULL_TEMPLATE.format(
title=a.title,
authors=a.authors,
publish_date=a.publish_date,
top_image=a.top_image,
text=a.text,
)
return res
def extract_using_readabilipy(html):
with tempfile.NamedTemporaryFile(delete=False, mode='w+') as f_html:
f_html.write(html)