38
api/core/rag/cleaner/clean_processor.py
Normal file
38
api/core/rag/cleaner/clean_processor.py
Normal file
@@ -0,0 +1,38 @@
|
||||
import re
|
||||
|
||||
|
||||
class CleanProcessor:
    """Text cleanup applied to document content.

    ``clean`` always performs a default cleanup pass and then applies the
    optional pre-processing rules found in the supplied process rule.
    """

    @classmethod
    def clean(cls, text: str, process_rule: dict) -> str:
        """Return ``text`` after default cleanup and enabled pre-processing rules.

        Args:
            text: The raw text to clean.
            process_rule: Mapping that may contain a ``'rules'`` mapping, which
                in turn may contain a ``'pre_processing_rules'`` list. Each
                entry is a mapping with an ``'id'`` and an ``'enabled'`` key.
                ``None`` or an empty mapping means "default cleanup only".

        Returns:
            The cleaned text.
        """
        # default clean
        # remove invalid symbol
        text = re.sub(r'<\|', '<', text)
        text = re.sub(r'\|>', '>', text)
        # Drop control characters but keep \t, \n and \r. The character class
        # deliberately does NOT list \xEF, \xBF and \xBE: on a str those are
        # the letters "ï", "¿" and "¾", not the UTF-8 bytes of U+FFFE.
        text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)
        # Unicode U+FFFE
        text = re.sub('\uFFFE', '', text)

        # Tolerate a missing/empty process rule instead of failing on None.
        rules = (process_rule or {}).get('rules') or {}
        pre_processing_rules = rules.get('pre_processing_rules') or []

        for pre_processing_rule in pre_processing_rules:
            if pre_processing_rule["id"] == "remove_extra_spaces" and pre_processing_rule["enabled"] is True:
                # Collapse runs of 3+ newlines into a single blank line.
                pattern = r'\n{3,}'
                text = re.sub(pattern, '\n\n', text)
                # Collapse runs of 2+ horizontal whitespace characters
                # (ASCII and Unicode spaces) into one plain space.
                pattern = r'[\t\f\r\x20\u00a0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]{2,}'
                text = re.sub(pattern, ' ', text)
            elif pre_processing_rule["id"] == "remove_urls_emails" and pre_processing_rule["enabled"] is True:
                # Remove email
                pattern = r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'
                text = re.sub(pattern, '', text)

                # Remove URL
                pattern = r'https?://[^\s]+'
                text = re.sub(pattern, '', text)
        return text

    def filter_string(self, text):
        """Return ``text`` unchanged."""
        return text
|
||||
12
api/core/rag/cleaner/cleaner_base.py
Normal file
12
api/core/rag/cleaner/cleaner_base.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Abstract interface for document cleaner implementations."""
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class BaseCleaner(ABC):
    """Interface for cleaning chunk content.

    Subclasses must override :meth:`clean`; the base class cannot be
    instantiated directly.
    """

    @abstractmethod
    def clean(self, content: str) -> str:
        """Return the cleaned form of ``content``.

        The base implementation only raises ``NotImplementedError``.
        """
        raise NotImplementedError
|
||||
|
||||
@@ -0,0 +1,12 @@
|
||||
"""Abstract interface for document clean implementations."""
|
||||
from core.rag.cleaner.cleaner_base import BaseCleaner
|
||||
|
||||
|
||||
class UnstructuredNonAsciiCharsCleaner(BaseCleaner):
    """Cleaner that delegates to unstructured's ``clean_extra_whitespace``.

    NOTE(review): the class name mentions non-ASCII characters, but the body
    only calls ``clean_extra_whitespace``. The name looks copy-pasted; it is
    kept because renaming would change the public interface. Confirm with
    callers.
    """

    def clean(self, content) -> str:
        """Pass ``content`` through ``clean_extra_whitespace`` and return the result."""
        # Function-scope import: unstructured is only loaded when clean() runs.
        from unstructured.cleaners.core import clean_extra_whitespace as _strip_extra_whitespace

        cleaned = _strip_extra_whitespace(content)
        return cleaned
|
||||
@@ -0,0 +1,15 @@
|
||||
"""Abstract interface for document clean implementations."""
|
||||
from core.rag.cleaner.cleaner_base import BaseCleaner
|
||||
|
||||
|
||||
class UnstructuredGroupBrokenParagraphsCleaner(BaseCleaner):
    """Cleaner that delegates to unstructured's ``group_broken_paragraphs``."""

    def clean(self, content) -> str:
        """Return ``content`` processed by ``group_broken_paragraphs``.

        A custom ``paragraph_split`` pattern is passed to the library call.
        """
        import re

        from unstructured.cleaners.core import group_broken_paragraphs

        # The split pattern matches three consecutive
        # "optional whitespace, newline, optional whitespace" groups.
        return group_broken_paragraphs(
            content,
            paragraph_split=re.compile(r"(\s*\n\s*){3}"),
        )
|
||||
@@ -0,0 +1,12 @@
|
||||
"""Abstract interface for document clean implementations."""
|
||||
from core.rag.cleaner.cleaner_base import BaseCleaner
|
||||
|
||||
|
||||
class UnstructuredNonAsciiCharsCleaner(BaseCleaner):
    """Cleaner that delegates to unstructured's ``clean_non_ascii_chars``."""

    def clean(self, content) -> str:
        """Pass ``content`` through ``clean_non_ascii_chars`` and return the result."""
        # Function-scope import: unstructured is only loaded when clean() runs.
        from unstructured.cleaners.core import clean_non_ascii_chars as _drop_non_ascii

        ascii_only = _drop_non_ascii(content)
        return ascii_only
|
||||
@@ -0,0 +1,11 @@
|
||||
"""Abstract interface for document clean implementations."""
|
||||
from core.rag.cleaner.cleaner_base import BaseCleaner
|
||||
|
||||
|
||||
class UnstructuredNonAsciiCharsCleaner(BaseCleaner):
    """Cleaner that delegates to unstructured's ``replace_unicode_quotes``.

    NOTE(review): the class name mentions non-ASCII characters, but the body
    only calls ``replace_unicode_quotes``. The name looks copy-pasted; it is
    kept because renaming would change the public interface. Confirm with
    callers.
    """

    def clean(self, content) -> str:
        # Raw docstring so that "\x91" stays literal text instead of becoming
        # an actual U+0091 control character inside __doc__.
        r"""Replaces unicode quote characters, such as the \x91 character in a string."""
        from unstructured.cleaners.core import replace_unicode_quotes

        return replace_unicode_quotes(content)
|
||||
@@ -0,0 +1,11 @@
|
||||
"""Abstract interface for document clean implementations."""
|
||||
from core.rag.cleaner.cleaner_base import BaseCleaner
|
||||
|
||||
|
||||
class UnstructuredTranslateTextCleaner(BaseCleaner):
    """Cleaner that delegates to unstructured's ``translate_text``."""

    def clean(self, content) -> str:
        """Pass ``content`` through ``translate_text`` and return the result.

        Only ``content`` is supplied, so the library's default source and
        target language handling applies.
        NOTE(review): presumably this translates into English by default;
        confirm against the unstructured documentation.
        """
        # Function-scope import: unstructured is only loaded when clean() runs.
        from unstructured.cleaners.translate import translate_text as _translate

        translated = _translate(content)
        return translated
|
||||
Reference in New Issue
Block a user