Feat/dify rag (#2528)

Co-authored-by: jyong <jyong@dify.ai>
2024-02-22 23:31:57 +08:00
parent 97fe817186
commit 6c4e6bf1d6
119 changed files with 3181 additions and 5892 deletions
--- a/api/core/rag/cleaner/clean_processor.py
+++ b/api/core/rag/cleaner/clean_processor.py
@@ -0,0 +1,38 @@
+import re
+
+
+class CleanProcessor:
+    """Clean raw document text before it is processed further.
+
+    ``clean`` always applies a default pass and then whichever optional
+    pre-processing rules are enabled in ``process_rule``.
+    """
+
+    # ASCII control characters (tab, LF and CR are deliberately kept) plus U+FFFE.
+    # Do NOT add \xEF, \xBF or \xBE here: those are the UTF-8 *bytes* of U+FFFE,
+    # but in a str pattern they match the letters "ï", "¿" and "¾" and would
+    # silently delete them from legitimate text.
+    _INVALID_CHARS = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\uFFFE]')
+    # Three or more consecutive newlines.
+    _BLANK_LINE_RUN = re.compile(r'\n{3,}')
+    # Two or more consecutive horizontal whitespace characters (newline excluded).
+    _SPACE_RUN = re.compile(r'[\t\f\r\x20\u00a0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]{2,}')
+    _EMAIL = re.compile(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)')
+    _URL = re.compile(r'https?://[^\s]+')
+
+    @classmethod
+    def clean(cls, text: str, process_rule: dict) -> str:
+        """Return ``text`` with the default and the configured cleaning applied.
+
+        Default cleaning (always performed):
+          * ``<|`` is rewritten to ``<`` and ``|>`` to ``>``
+            (NOTE(review): presumably to defuse special-token markers such as
+            ``<|endoftext|>`` -- confirm with the callers);
+          * ASCII control characters other than tab/LF/CR, and U+FFFE, are removed.
+
+        Optional cleaning is driven by
+        ``process_rule['rules']['pre_processing_rules']``, a list of dicts with
+        an ``id`` and an ``enabled`` flag. A rule runs only when ``enabled`` is
+        exactly ``True``. Recognised ids:
+          * ``remove_extra_spaces`` -- collapse 3+ newlines to two and runs of
+            2+ horizontal whitespace characters to a single space;
+          * ``remove_urls_emails`` -- delete e-mail addresses, then http(s) URLs.
+
+        :param text: the text to clean.
+        :param process_rule: rule configuration; ``None`` or an empty dict
+            means "default cleaning only".
+        :return: the cleaned text.
+        """
+        # default clean: remove invalid symbols
+        text = text.replace('<|', '<').replace('|>', '>')
+        text = cls._INVALID_CHARS.sub('', text)
+
+        # A missing/empty rule set must not crash; only the default pass applies.
+        rules = process_rule.get('rules') if process_rule else None
+        if not rules:
+            return text
+
+        for rule in rules.get('pre_processing_rules') or []:
+            if rule.get('enabled') is not True:
+                continue
+            rule_id = rule.get('id')
+            if rule_id == 'remove_extra_spaces':
+                text = cls._BLANK_LINE_RUN.sub('\n\n', text)
+                text = cls._SPACE_RUN.sub(' ', text)
+            elif rule_id == 'remove_urls_emails':
+                # E-mails first, then URLs (same order as the rules are documented).
+                text = cls._EMAIL.sub('', text)
+                text = cls._URL.sub('', text)
+        return text
+
+    def filter_string(self, text):
+        """Return ``text`` unchanged (currently a no-op placeholder)."""
+        return text
--- a/api/core/rag/cleaner/cleaner_base.py
+++ b/api/core/rag/cleaner/cleaner_base.py
@@ -0,0 +1,12 @@
+"""Abstract interface for document cleaner implementations."""
+from abc import ABC, abstractmethod
+
+
+class BaseCleaner(ABC):
+    """Interface for cleaning chunk content.
+
+    ``clean`` is declared abstract, so a subclass has to override it before
+    it can be instantiated.
+    """
+
+    @abstractmethod
+    def clean(self, content: str):
+        """Clean ``content``; concrete cleaners provide the implementation.
+
+        This base body is only reached through an explicit ``super().clean(...)``
+        call, in which case it raises ``NotImplementedError``.
+        """
+        raise NotImplementedError
+
--- a/api/core/rag/cleaner/unstructured/unstructured_extra_whitespace_cleaner.py
+++ b/api/core/rag/cleaner/unstructured/unstructured_extra_whitespace_cleaner.py
@@ -0,0 +1,12 @@
+"""Cleaner that applies unstructured's ``clean_extra_whitespace`` to document content."""
+from core.rag.cleaner.cleaner_base import BaseCleaner
+
+
+class UnstructuredExtraWhitespaceCleaner(BaseCleaner):
+    """Cleaner that delegates to unstructured's ``clean_extra_whitespace``."""
+
+    def clean(self, content: str) -> str:
+        """Return ``content`` after ``clean_extra_whitespace`` has been applied.
+
+        :param content: the document text to clean.
+        :return: whatever ``clean_extra_whitespace`` returns for ``content``.
+        """
+        # Imported at call time, so `unstructured` is only loaded when this cleaner runs.
+        from unstructured.cleaners.core import clean_extra_whitespace
+
+        return clean_extra_whitespace(content)
+
+
+# Backward-compatible alias: this class used to carry the non-ASCII cleaner's
+# name (a copy-paste slip); keep that name importable from this module.
+UnstructuredNonAsciiCharsCleaner = UnstructuredExtraWhitespaceCleaner
--- a/api/core/rag/cleaner/unstructured/unstructured_group_broken_paragraphs_cleaner.py
+++ b/api/core/rag/cleaner/unstructured/unstructured_group_broken_paragraphs_cleaner.py
@@ -0,0 +1,15 @@
+"""Cleaner that applies unstructured's ``group_broken_paragraphs`` to document content."""
+from core.rag.cleaner.cleaner_base import BaseCleaner
+
+
+class UnstructuredGroupBrokenParagraphsCleaner(BaseCleaner):
+    """Cleaner that delegates to unstructured's ``group_broken_paragraphs``."""
+
+    def clean(self, content) -> str:
+        """Run ``group_broken_paragraphs`` on ``content`` and return its result.
+
+        A custom ``paragraph_split`` pattern is supplied instead of the
+        library default.
+        """
+        import re
+
+        from unstructured.cleaners.core import group_broken_paragraphs
+
+        # The split pattern is three newlines in a row, each of which may be
+        # surrounded by other whitespace.
+        return group_broken_paragraphs(
+            content,
+            paragraph_split=re.compile(r"(\s*\n\s*){3}"),
+        )
--- a/api/core/rag/cleaner/unstructured/unstructured_non_ascii_chars_cleaner.py
+++ b/api/core/rag/cleaner/unstructured/unstructured_non_ascii_chars_cleaner.py
@@ -0,0 +1,12 @@
+"""Cleaner that applies unstructured's ``clean_non_ascii_chars`` to document content."""
+from core.rag.cleaner.cleaner_base import BaseCleaner
+
+
+class UnstructuredNonAsciiCharsCleaner(BaseCleaner):
+    """Cleaner that delegates to unstructured's ``clean_non_ascii_chars``."""
+
+    def clean(self, content) -> str:
+        """Return the result of ``clean_non_ascii_chars`` for ``content``."""
+        # Imported at call time, so `unstructured` is only loaded when this cleaner runs.
+        from unstructured.cleaners.core import clean_non_ascii_chars as strip_non_ascii
+
+        cleaned = strip_non_ascii(content)
+        return cleaned
--- a/api/core/rag/cleaner/unstructured/unstructured_replace_unicode_quotes_cleaner.py
+++ b/api/core/rag/cleaner/unstructured/unstructured_replace_unicode_quotes_cleaner.py
@@ -0,0 +1,11 @@
+"""Cleaner that applies unstructured's ``replace_unicode_quotes`` to document content."""
+from core.rag.cleaner.cleaner_base import BaseCleaner
+
+
+class UnstructuredReplaceUnicodeQuotesCleaner(BaseCleaner):
+    """Cleaner that delegates to unstructured's ``replace_unicode_quotes``."""
+
+    def clean(self, content: str) -> str:
+        r"""Replace unicode quote characters, such as the \x91 character, in ``content``.
+
+        :param content: the document text to clean.
+        :return: whatever ``replace_unicode_quotes`` returns for ``content``.
+        """
+        # Imported at call time, so `unstructured` is only loaded when this cleaner runs.
+        from unstructured.cleaners.core import replace_unicode_quotes
+
+        return replace_unicode_quotes(content)
+
+
+# Backward-compatible alias: this class used to carry the non-ASCII cleaner's
+# name (a copy-paste slip); keep that name importable from this module.
+UnstructuredNonAsciiCharsCleaner = UnstructuredReplaceUnicodeQuotesCleaner
--- a/api/core/rag/cleaner/unstructured/unstructured_translate_text_cleaner.py
+++ b/api/core/rag/cleaner/unstructured/unstructured_translate_text_cleaner.py
@@ -0,0 +1,11 @@
+"""Cleaner that applies unstructured's ``translate_text`` to document content."""
+from core.rag.cleaner.cleaner_base import BaseCleaner
+
+
+class UnstructuredTranslateTextCleaner(BaseCleaner):
+    """Cleaner that delegates to unstructured's ``translate_text``."""
+
+    def clean(self, content) -> str:
+        """Return the result of ``translate_text`` for ``content``.
+
+        NOTE(review): no language arguments are passed, so the library's
+        defaults decide the source and target languages -- confirm these are
+        what callers expect.
+        """
+        # Imported at call time, so the translation dependency is only loaded when used.
+        from unstructured.cleaners.translate import translate_text as translate
+
+        translated = translate(content)
+        return translated