feat: support LLM process document file (#10966)

Co-authored-by: -LAN- <laipz8200@outlook.com>
2024-11-22 19:32:44 +08:00
parent 556de444e8
commit 08ac36812b
37 changed files with 233 additions and 88 deletions
--- a/api/core/memory/token_buffer_memory.py
+++ b/api/core/memory/token_buffer_memory.py
@@ -3,7 +3,6 @@ from typing import Optional

 from core.app.app_config.features.file_upload.manager import FileUploadConfigManager
 from core.file import file_manager
-from core.file.models import FileType
 from core.model_manager import ModelInstance
 from core.model_runtime.entities import (
    AssistantPromptMessage,
@@ -103,12 +102,11 @@ class TokenBufferMemory:
                    prompt_message_contents: list[PromptMessageContent] = []
                    prompt_message_contents.append(TextPromptMessageContent(data=message.query))
                    for file in file_objs:
-                        if file.type in {FileType.IMAGE, FileType.AUDIO}:
-                            prompt_message = file_manager.to_prompt_message_content(
-                                file,
-                                image_detail_config=detail,
-                            )
-                            prompt_message_contents.append(prompt_message)
+                        prompt_message = file_manager.to_prompt_message_content(
+                            file,
+                            image_detail_config=detail,
+                        )
+                        prompt_message_contents.append(prompt_message)

                    prompt_messages.append(UserPromptMessage(content=prompt_message_contents))

--- a/api/core/model_runtime/entities/message_entities.py
+++ b/api/core/model_runtime/entities/message_entities.py
@@ -49,7 +49,7 @@ class PromptMessageFunction(BaseModel):
    function: PromptMessageTool


-class PromptMessageContentType(Enum):
+class PromptMessageContentType(str, Enum):
    """
    Enum class for prompt message content type.
    """
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-001.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-001.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 1048576
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-002.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-002.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 1048576
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0827.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0827.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 1048576
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0924.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-8b-exp-0924.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 1048576
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-exp-0827.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-exp-0827.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 1048576
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-latest.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash-latest.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 1048576
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-flash.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 1048576
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-001.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-001.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 2097152
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-002.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-002.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 2097152
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0801.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0801.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 2097152
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0827.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-exp-0827.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 2097152
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-latest.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro-latest.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 2097152
--- a/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-1.5-pro.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 2097152
--- a/api/core/model_runtime/model_providers/google/llm/gemini-exp-1114.yaml
+++ b/api/core/model_runtime/model_providers/google/llm/gemini-exp-1114.yaml
@@ -7,6 +7,7 @@ features:
  - vision
  - tool-call
  - stream-tool-call
+  - document
 model_properties:
  mode: chat
  context_size: 32767
--- a/api/core/model_runtime/model_providers/google/llm/llm.py
+++ b/api/core/model_runtime/model_providers/google/llm/llm.py
@@ -16,6 +16,7 @@ from PIL import Image
 from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta
 from core.model_runtime.entities.message_entities import (
    AssistantPromptMessage,
+    DocumentPromptMessageContent,
    ImagePromptMessageContent,
    PromptMessage,
    PromptMessageContentType,
@@ -35,6 +36,21 @@ from core.model_runtime.errors.invoke import (
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel

+GOOGLE_AVAILABLE_MIMETYPE = frozenset({  # document MIME types accepted for inline_data parts
+    "application/pdf",
+    "application/x-javascript",
+    "text/javascript",
+    "application/x-python",
+    "text/x-python",
+    "text/plain",
+    "text/html",
+    "text/css",
+    "text/md",
+    "text/csv",
+    "text/xml",
+    "text/rtf",
+})
+

 class GoogleLargeLanguageModel(LargeLanguageModel):
    def _invoke(
@@ -370,6 +386,12 @@ class GoogleLargeLanguageModel(LargeLanguageModel):
                                raise ValueError(f"Failed to fetch image data from url {message_content.data}, {ex}")
                        blob = {"inline_data": {"mime_type": mime_type, "data": base64_data}}
                        glm_content["parts"].append(blob)
+                    elif c.type == PromptMessageContentType.DOCUMENT:
+                        message_content = cast(DocumentPromptMessageContent, c)
+                        if message_content.mime_type not in GOOGLE_AVAILABLE_MIMETYPE:
+                            raise ValueError(f"Unsupported mime type {message_content.mime_type}")
+                        blob = {"inline_data": {"mime_type": message_content.mime_type, "data": message_content.data}}
+                        glm_content["parts"].append(blob)

            return glm_content
        elif isinstance(message, AssistantPromptMessage):
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max-0809.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max-0809.yaml
@@ -6,6 +6,7 @@ model_type: llm
 features:
  - vision
  - agent-thought
+  - video
 model_properties:
  mode: chat
  context_size: 32000
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml
@@ -6,6 +6,7 @@ model_type: llm
 features:
  - vision
  - agent-thought
+  - video
 model_properties:
  mode: chat
  context_size: 32000
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus-0809.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus-0809.yaml
@@ -6,6 +6,7 @@ model_type: llm
 features:
  - vision
  - agent-thought
+  - video
 model_properties:
  mode: chat
  context_size: 32768
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml
@@ -6,6 +6,7 @@ model_type: llm
 features:
  - vision
  - agent-thought
+  - video
 model_properties:
  mode: chat
  context_size: 8000
--- a/api/core/model_runtime/model_providers/zhipuai/llm/glm_4v_plus.yaml
+++ b/api/core/model_runtime/model_providers/zhipuai/llm/glm_4v_plus.yaml
@@ -6,6 +6,7 @@ model_properties:
  mode: chat
 features:
  - vision
+  - video
 parameter_rules:
  - name: temperature
    use_template: temperature
--- a/api/core/workflow/nodes/llm/exc.py
+++ b/api/core/workflow/nodes/llm/exc.py
@@ -26,9 +26,20 @@ class NoPromptFoundError(LLMNodeError):
    """Raised when no prompt is found in the LLM configuration."""


-class NotSupportedPromptTypeError(LLMNodeError):
-    """Raised when the prompt type is not supported."""
+class TemplateTypeNotSupportError(LLMNodeError):
+    """Raised when the prompt template type is not supported."""
+
+    def __init__(self, *, type_name: str):
+        super().__init__(f"Prompt type {type_name} is not supported.")


 class MemoryRolePrefixRequiredError(LLMNodeError):
    """Raised when memory role prefix is required for completion model."""
+
+
+class FileTypeNotSupportError(LLMNodeError):
+    """Raised when a file content type is not supported by the selected model."""
+
+    def __init__(self, *, type_name: str):
+        # Callers may pass an Enum member; on Python 3.11+ it formats as "Class.MEMBER", so prefer its value.
+        super().__init__(f"{getattr(type_name, 'value', type_name)} type is not supported by this model")
--- a/api/core/workflow/nodes/llm/node.py
+++ b/api/core/workflow/nodes/llm/node.py
@@ -65,6 +65,7 @@ from .entities import (
    ModelConfig,
 )
 from .exc import (
+    FileTypeNotSupportError,
    InvalidContextStructureError,
    InvalidVariableTypeError,
    LLMModeRequiredError,
@@ -72,7 +73,7 @@ from .exc import (
    MemoryRolePrefixRequiredError,
    ModelNotExistError,
    NoPromptFoundError,
-    NotSupportedPromptTypeError,
+    TemplateTypeNotSupportError,
    VariableNotFoundError,
 )

@@ -621,9 +622,7 @@ class LLMNode(BaseNode[LLMNodeData]):
                prompt_content = prompt_messages[0].content.replace("#sys.query#", user_query)
                prompt_messages[0].content = prompt_content
        else:
-            errmsg = f"Prompt type {type(prompt_template)} is not supported"
-            logger.warning(errmsg)
-            raise NotSupportedPromptTypeError(errmsg)
+            raise TemplateTypeNotSupportError(type_name=str(type(prompt_template)))

        if vision_enabled and user_files:
            file_prompts = []
@@ -671,7 +670,7 @@ class LLMNode(BaseNode[LLMNodeData]):
                            and ModelFeature.AUDIO not in model_config.model_schema.features
                        )
                    ):
-                        continue
+                        raise FileTypeNotSupportError(type_name=content_item.type)
                    prompt_message_content.append(content_item)
                if len(prompt_message_content) == 1 and prompt_message_content[0].type == PromptMessageContentType.TEXT:
                    prompt_message.content = prompt_message_content[0].data