From cb7c9af25c4794a74c8959231b95049248fb2717 Mon Sep 17 00:00:00 2001
From: "Yaguang.Wang" <ice_cream_butter@126.com>
Date: Fri, 5 Dec 2025 22:33:15 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20Expanded=20WeCom=20message=20parsing=20?=
 =?UTF-8?q?to=20capture=20msgtype,=20inline=20voice/video=E2=80=A6=20(#184?=
 =?UTF-8?q?3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Expanded WeCom message parsing to capture msgtype, inline voice/video/file/link data, bounded base64 downloads, and richer mixed-message attachments (src/langbot/libs/wecom_ai_bot_api/api.py); added event accessors for new fields (src/langbot/libs/wecom_ai_bot_api/wecombotevent.py).
Converter now maps richer WeCom payloads (text, images, files, voice, video, links) into platform message chain with fallbacks when nothing parsable is present (src/langbot/pkg/platform/sources/wecombot.py).
Preprocessor now turns voice inputs into file URLs for downstream runners (src/langbot/pkg/pipeline/preproc/preproc.py).
Dify runner uploads all incoming files (images/audio/video/docs) after downloading or decoding data URLs, infers MIME types, and passes typed file descriptors into chat/workflow calls (src/langbot/pkg/provider/runners/difysvapi.py).

* Update src/langbot/pkg/platform/sources/wecombot.py

Fixed the issue of duplicate text in the comments.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update src/langbot/libs/wecom_ai_bot_api/api.py

Modify the way you approach challenges.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update src/langbot/pkg/platform/sources/wecombot.py

Changing the variable names makes more sense.

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* feat: use from_base64 for the voice file converting

---------

Co-authored-by: tabriswang <tabriswang@finecomn.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Junyan Qin <rockchinq@gmail.com>
---
 pyproject.toml                                |   2 +-
 src/langbot/libs/wecom_ai_bot_api/api.py      | 168 ++++++++++++++++--
 .../libs/wecom_ai_bot_api/wecombotevent.py    |  56 ++++++
 src/langbot/pkg/pipeline/preproc/preproc.py   |   6 +
 src/langbot/pkg/platform/sources/wecombot.py  |  99 ++++++++++-
 src/langbot/pkg/provider/runners/difysvapi.py | 134 +++++++++-----
 6 files changed, 400 insertions(+), 65 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 95ac34f0..39f97919 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,7 @@ dependencies = [
     "langchain-text-splitters>=0.0.1",
     "chromadb>=0.4.24",
     "qdrant-client (>=1.15.1,<2.0.0)",
-    "langbot-plugin==0.2.0",
+    "langbot-plugin==0.2.1b1",
     "asyncpg>=0.30.0",
     "line-bot-sdk>=3.19.0",
     "tboxsdk>=0.0.10",
diff --git a/src/langbot/libs/wecom_ai_bot_api/api.py b/src/langbot/libs/wecom_ai_bot_api/api.py
index bed94d02..a26170e3 100644
--- a/src/langbot/libs/wecom_ai_bot_api/api.py
+++ b/src/langbot/libs/wecom_ai_bot_api/api.py
@@ -458,32 +458,174 @@ class WecomBotClient:
     async def get_message(self, msg_json):
         message_data = {}
 
+        msg_type = msg_json.get('msgtype', '')
+        if msg_type:
+            message_data['msgtype'] = msg_type
+
         if msg_json.get('chattype', '') == 'single':
             message_data['type'] = 'single'
         elif msg_json.get('chattype', '') == 'group':
             message_data['type'] = 'group'
 
-        if msg_json.get('msgtype') == 'text':
+        max_inline_file_size = 5 * 1024 * 1024  # avoid decoding very large payloads by default
+
+        async def _safe_download(url: str):
+            if not url:
+                return None
+            return await self.download_url_to_base64(url, self.EnCodingAESKey)
+
+        if msg_type == 'text':
             message_data['content'] = msg_json.get('text', {}).get('content')
-        elif msg_json.get('msgtype') == 'image':
+        elif msg_type == 'markdown':
+            message_data['content'] = msg_json.get('markdown', {}).get('content') or msg_json.get('text', {}).get(
+                'content', ''
+            )
+        elif msg_type == 'image':
             picurl = msg_json.get('image', {}).get('url', '')
-            base64 = await self.download_url_to_base64(picurl, self.EnCodingAESKey)
-            message_data['picurl'] = base64
-        elif msg_json.get('msgtype') == 'mixed':
+            base64_data = await _safe_download(picurl)
+            if base64_data:
+                message_data['picurl'] = base64_data
+                message_data['images'] = [base64_data]
+        elif msg_type == 'voice':
+            voice_info = msg_json.get('voice', {}) or {}
+            download_url = voice_info.get('url')
+            message_data['voice'] = {
+                'url': download_url,
+                'md5sum': voice_info.get('md5sum') or voice_info.get('md5'),
+                'filesize': voice_info.get('filesize') or voice_info.get('size'),
+                'sdkfileid': voice_info.get('sdkfileid') or voice_info.get('fileid'),
+            }
+            # 企业微信智能转写文本（如果已有）直接复用，避免重复转写
+            if voice_info.get('content'):
+                message_data['content'] = voice_info.get('content')
+            if (message_data['voice'].get('filesize') or 0) <= max_inline_file_size:
+                voice_base64 = await _safe_download(download_url)
+                if voice_base64:
+                    message_data['voice']['base64'] = voice_base64
+        elif msg_type == 'video':
+            video_info = msg_json.get('video', {}) or {}
+            download_url = video_info.get('url')
+            video_data = {
+                'url': download_url,
+                'filesize': video_info.get('filesize') or video_info.get('size'),
+                'sdkfileid': video_info.get('sdkfileid') or video_info.get('fileid'),
+                'md5sum': video_info.get('md5sum') or video_info.get('md5'),
+                'filename': video_info.get('filename') or video_info.get('name'),
+            }
+            if (video_data.get('filesize') or 0) <= max_inline_file_size:
+                video_base64 = await _safe_download(download_url)
+                if video_base64:
+                    video_data['base64'] = video_base64
+            message_data['video'] = video_data
+        elif msg_type == 'file':
+            file_info = msg_json.get('file', {}) or {}
+            download_url = file_info.get('url') or file_info.get('fileurl')
+            file_data = {
+                'filename': file_info.get('filename') or file_info.get('name'),
+                'filesize': file_info.get('filesize') or file_info.get('size'),
+                'md5sum': file_info.get('md5sum') or file_info.get('md5'),
+                'sdkfileid': file_info.get('sdkfileid') or file_info.get('fileid'),
+                'download_url': download_url,
+                'extra': file_info,
+            }
+            if (file_data.get('filesize') or 0) <= max_inline_file_size:
+                file_base64 = await _safe_download(download_url)
+                if file_base64:
+                    file_data['base64'] = file_base64
+            message_data['file'] = file_data
+        elif msg_type == 'link':
+            message_data['link'] = msg_json.get('link', {})
+            if not message_data.get('content'):
+                title = message_data['link'].get('title', '')
+                desc = message_data['link'].get('description') or message_data['link'].get('digest', '')
+                message_data['content'] = '\n'.join(filter(None, [title, desc]))
+        elif msg_type == 'mixed':
             items = msg_json.get('mixed', {}).get('msg_item', [])
             texts = []
-            picurl = None
+            images = []
+            files = []
+            voices = []
+            videos = []
+            links = []
             for item in items:
-                if item.get('msgtype') == 'text':
+                item_type = item.get('msgtype')
+                if item_type == 'text':
                     texts.append(item.get('text', {}).get('content', ''))
-                elif item.get('msgtype') == 'image' and picurl is None:
-                    picurl = item.get('image', {}).get('url')
+                elif item_type == 'image':
+                    img_url = item.get('image', {}).get('url')
+                    base64_data = await _safe_download(img_url)
+                    if base64_data:
+                        images.append(base64_data)
+                elif item_type == 'file':
+                    file_info = item.get('file', {}) or {}
+                    download_url = file_info.get('url') or file_info.get('fileurl')
+                    file_data = {
+                        'filename': file_info.get('filename') or file_info.get('name'),
+                        'filesize': file_info.get('filesize') or file_info.get('size'),
+                        'md5sum': file_info.get('md5sum') or file_info.get('md5'),
+                        'sdkfileid': file_info.get('sdkfileid') or file_info.get('fileid'),
+                        'download_url': download_url,
+                        'extra': file_info,
+                    }
+                    if (file_data.get('filesize') or 0) <= max_inline_file_size:
+                        file_base64 = await _safe_download(download_url)
+                        if file_base64:
+                            file_data['base64'] = file_base64
+                    files.append(file_data)
+                elif item_type == 'voice':
+                    voice_info = item.get('voice', {}) or {}
+                    download_url = voice_info.get('url')
+                    voice_data = {
+                        'url': download_url,
+                        'md5sum': voice_info.get('md5sum') or voice_info.get('md5'),
+                        'filesize': voice_info.get('filesize') or voice_info.get('size'),
+                        'sdkfileid': voice_info.get('sdkfileid') or voice_info.get('fileid'),
+                    }
+                    if voice_info.get('content'):
+                        texts.append(voice_info.get('content'))
+                    if (voice_data.get('filesize') or 0) <= max_inline_file_size:
+                        voice_base64 = await _safe_download(download_url)
+                        if voice_base64:
+                            voice_data['base64'] = voice_base64
+                    voices.append(voice_data)
+                elif item_type == 'video':
+                    video_info = item.get('video', {}) or {}
+                    download_url = video_info.get('url')
+                    video_data = {
+                        'url': download_url,
+                        'filesize': video_info.get('filesize') or video_info.get('size'),
+                        'sdkfileid': video_info.get('sdkfileid') or video_info.get('fileid'),
+                        'md5sum': video_info.get('md5sum') or video_info.get('md5'),
+                        'filename': video_info.get('filename') or video_info.get('name'),
+                    }
+                    if (video_data.get('filesize') or 0) <= max_inline_file_size:
+                        video_base64 = await _safe_download(download_url)
+                        if video_base64:
+                            video_data['base64'] = video_base64
+                    videos.append(video_data)
+                elif item_type == 'link':
+                    links.append(item.get('link', {}))
 
             if texts:
-                message_data['content'] = ''.join(texts)  # 拼接所有 text
-            if picurl:
-                base64 = await self.download_url_to_base64(picurl, self.EnCodingAESKey)
-                message_data['picurl'] = base64  # 只保留第一个 image
+                message_data['content'] = ' '.join(texts)  # 拼接所有 text
+            if images:
+                message_data['images'] = images
+                message_data['picurl'] = images[0]  # 只保留第一个 image
+            if files:
+                message_data['files'] = files
+                message_data['file'] = files[0]
+            if voices:
+                message_data['voices'] = voices
+                message_data['voice'] = voices[0]
+            if videos:
+                message_data['videos'] = videos
+                message_data['video'] = videos[0]
+            if links:
+                message_data['link'] = links[0]
+            if items:
+                message_data['attachments'] = items
+        else:
+            message_data['raw_msg'] = msg_json
 
         # Extract user information
         from_info = msg_json.get('from', {})
diff --git a/src/langbot/libs/wecom_ai_bot_api/wecombotevent.py b/src/langbot/libs/wecom_ai_bot_api/wecombotevent.py
index 099c58bc..75c6bbde 100644
--- a/src/langbot/libs/wecom_ai_bot_api/wecombotevent.py
+++ b/src/langbot/libs/wecom_ai_bot_api/wecombotevent.py
@@ -17,6 +17,13 @@ class WecomBotEvent(dict):
         """
         return self.get('type', '')
 
+    @property
+    def msgtype(self) -> str:
+        """
+        消息 msgtype
+        """
+        return self.get('msgtype', '')
+
     @property
     def userid(self) -> str:
         """
@@ -52,6 +59,55 @@ class WecomBotEvent(dict):
         """
         return self.get('picurl', '')
 
+    @property
+    def images(self):
+        """
+        图片列表（兼容 mixed）
+        """
+        return self.get('images', [])
+
+    @property
+    def file(self):
+        """
+        文件信息
+        """
+        return self.get('file', {})
+
+    @property
+    def voice(self):
+        """
+        语音信息
+        """
+        return self.get('voice', {})
+
+    @property
+    def video(self):
+        """
+        视频信息
+        """
+        return self.get('video', {})
+
+    @property
+    def link(self):
+        """
+        链接消息信息
+        """
+        return self.get('link', {})
+
+    @property
+    def location(self):
+        """
+        位置信息
+        """
+        return self.get('location', {})
+
+    @property
+    def attachments(self):
+        """
+        原始 mixed 中的附件项
+        """
+        return self.get('attachments', [])
+
     @property
     def chatid(self) -> str:
         """
diff --git a/src/langbot/pkg/pipeline/preproc/preproc.py b/src/langbot/pkg/pipeline/preproc/preproc.py
index ace432d8..3f6db6a2 100644
--- a/src/langbot/pkg/pipeline/preproc/preproc.py
+++ b/src/langbot/pkg/pipeline/preproc/preproc.py
@@ -111,6 +111,12 @@ class PreProcessor(stage.PipelineStage):
                 ):
                     if me.base64 is not None:
                         content_list.append(provider_message.ContentElement.from_image_base64(me.base64))
+            elif isinstance(me, platform_message.Voice):
+                # 转成文件链接，让下游 runner 上传到目标模型
+                if me.base64:
+                    content_list.append(provider_message.ContentElement.from_file_base64(me.base64, 'voice.silk'))
+                elif me.url:
+                    content_list.append(provider_message.ContentElement.from_file_url(me.url, 'voice'))
             elif isinstance(me, platform_message.File):
                 # if me.url is not None:
                 content_list.append(provider_message.ContentElement.from_file_url(me.url, me.name))
diff --git a/src/langbot/pkg/platform/sources/wecombot.py b/src/langbot/pkg/platform/sources/wecombot.py
index 77a35ddb..c35c2b18 100644
--- a/src/langbot/pkg/platform/sources/wecombot.py
+++ b/src/langbot/pkg/platform/sources/wecombot.py
@@ -28,9 +28,102 @@ class WecomBotMessageConverter(abstract_platform_adapter.AbstractMessageConverte
         if event.type == 'group':
             yiri_msg_list.append(platform_message.At(target=event.ai_bot_id))
         yiri_msg_list.append(platform_message.Source(id=event.message_id, time=datetime.datetime.now()))
-        yiri_msg_list.append(platform_message.Plain(text=event.content))
-        if event.picurl != '':
-            yiri_msg_list.append(platform_message.Image(base64=event.picurl))
+
+        if event.content:
+            yiri_msg_list.append(platform_message.Plain(text=event.content))
+
+        images = []
+        if event.images:
+            images.extend([img for img in event.images if img])
+        if not images and event.picurl:
+            images.append(event.picurl)
+        for image_base64 in images:
+            if image_base64:
+                yiri_msg_list.append(platform_message.Image(base64=image_base64))
+
+        file_info = event.file or {}
+        if file_info:
+            file_url = (
+                file_info.get('download_url')
+                or file_info.get('url')
+                or file_info.get('fileurl')
+                or file_info.get('path')
+            )
+            file_base64 = file_info.get('base64')
+            file_name = file_info.get('filename') or file_info.get('name')
+            file_size = file_info.get('filesize') or file_info.get('size')
+            file_data = file_url or file_base64
+            if file_data or file_name:
+                file_kwargs = {}
+                if file_data:
+                    file_kwargs['url'] = file_data
+                if file_name:
+                    file_kwargs['name'] = file_name
+                if file_size is not None:
+                    file_kwargs['size'] = file_size
+                try:
+                    yiri_msg_list.append(platform_message.File(**file_kwargs))
+                except Exception:
+                    # 兜底
+                    yiri_msg_list.append(platform_message.Unknown(text='[file message unsupported]'))
+
+        voice_info = event.voice or {}
+        if voice_info:
+            voice_payload = voice_info.get('base64') or voice_info.get('url')
+            if voice_payload:
+                if voice_info.get('base64') and not voice_payload.startswith('data:'):
+                    voice_payload = f"data:audio/mpeg;base64,{voice_info.get('base64')}"
+                try:
+                    yiri_msg_list.append(platform_message.Voice(base64=voice_payload))
+                except Exception:
+                    try:
+                        voice_kwargs = {'url': voice_payload}
+                        voice_name = voice_info.get('filename') or voice_info.get('name')
+                        voice_size = voice_info.get('filesize') or voice_info.get('size')
+                        if voice_name:
+                            voice_kwargs['name'] = voice_name
+                        if voice_size is not None:
+                            voice_kwargs['size'] = voice_size
+                        yiri_msg_list.append(platform_message.File(**voice_kwargs))
+                    except Exception:
+                        yiri_msg_list.append(platform_message.Unknown(text='[voice message unsupported]'))
+
+        video_info = event.video or {}
+        if video_info:
+            video_payload = (
+                video_info.get('base64')
+                or video_info.get('url')
+                or video_info.get('download_url')
+                or video_info.get('fileurl')
+            )
+            if video_payload:
+                video_kwargs = {'url': video_payload}
+                video_name = video_info.get('filename') or video_info.get('name')
+                video_size = video_info.get('filesize') or video_info.get('size')
+                if video_name:
+                    video_kwargs['name'] = video_name
+                if video_size is not None:
+                    video_kwargs['size'] = video_size
+                try:
+                    # 没有专门的视频类型，沿用 File 传递给上层
+                    yiri_msg_list.append(platform_message.File(**video_kwargs))
+                except Exception:
+                    yiri_msg_list.append(platform_message.Unknown(text='[video message unsupported]'))
+
+        if event.msgtype == 'link' and event.link:
+            link = event.link
+            summary = '\n'.join(
+                filter(None, [link.get('title', ''), link.get('description') or link.get('digest', ''), link.get('url', '')])
+            )
+            if summary:
+                yiri_msg_list.append(platform_message.Plain(text=summary))
+
+        has_content_element = any(
+            not isinstance(element, (platform_message.Source, platform_message.At)) for element in yiri_msg_list
+        )
+        if not has_content_element:
+            fallback_type = event.msgtype or 'unknown'
+            yiri_msg_list.append(platform_message.Unknown(text=f'[unsupported wecom msgtype: {fallback_type}]'))
         chain = platform_message.MessageChain(yiri_msg_list)
 
         return chain
diff --git a/src/langbot/pkg/provider/runners/difysvapi.py b/src/langbot/pkg/provider/runners/difysvapi.py
index 21fb471e..8c790f47 100644
--- a/src/langbot/pkg/provider/runners/difysvapi.py
+++ b/src/langbot/pkg/provider/runners/difysvapi.py
@@ -4,6 +4,7 @@ import typing
 import json
 import uuid
 import base64
+import mimetypes
 
 
 from langbot.pkg.provider import runner
@@ -12,6 +13,7 @@ import langbot_plugin.api.entities.builtin.provider.message as provider_message
 from langbot.pkg.utils import image
 import langbot_plugin.api.entities.builtin.pipeline.query as pipeline_query
 from langbot.libs.dify_service_api.v1 import client, errors
+import httpx
 
 
 @runner.runner_class('dify-service-api')
@@ -70,14 +72,43 @@ class DifyServiceAPIRunner(runner.RequestRunner):
                 content = f'<think>\n{thinking_content}\n</think>\n{content}'.strip()
             return content, thinking_content
 
-    async def _preprocess_user_message(self, query: pipeline_query.Query) -> tuple[str, list[str]]:
-        """预处理用户消息，提取纯文本，并将图片上传到 Dify 服务
+    async def _preprocess_user_message(self, query: pipeline_query.Query) -> tuple[str, list[dict]]:
+        """预处理用户消息，提取纯文本，并将图片/文件上传到 Dify 服务
 
         Returns:
-            tuple[str, list[str]]: 纯文本和图片的 Dify 服务图片 ID
+            tuple[str, list[dict]]: 纯文本和上传后的文件描述（包含 type 与 id）
         """
         plain_text = ''
-        file_ids = []
+        upload_files: list[dict] = []
+        user_tag = f'{query.session.launcher_type.value}_{query.session.launcher_id}'
+
+        async def upload_file_bytes(file_name: str, file_bytes: bytes, content_type: str) -> str:
+            file_name = file_name or 'file'
+            content_type = content_type or 'application/octet-stream'
+            file = (file_name, file_bytes, content_type)
+            resp = await self.dify_client.upload_file(file, user_tag)
+            return resp['id']
+
+        async def download_file(file_url: str) -> tuple[bytes, str]:
+            """Download file from url (supports data url)."""
+
+            async with httpx.AsyncClient() as client_session:
+                resp = await client_session.get(file_url)
+                resp.raise_for_status()
+                content_type = (
+                    resp.headers.get('content-type') or mimetypes.guess_type(file_url)[0] or 'application/octet-stream'
+                )
+                return resp.content, content_type
+
+        def _detect_file_type(content_type: str) -> str:
+            """Map MIME to dify file type."""
+            if content_type and content_type.startswith('image/'):
+                return 'image'
+            if content_type and content_type.startswith('audio/'):
+                return 'audio'
+            if content_type and content_type.startswith('video/'):
+                return 'video'
+            return 'document'
 
         if isinstance(query.user_message.content, list):
             for ce in query.user_message.content:
@@ -86,30 +117,36 @@ class DifyServiceAPIRunner(runner.RequestRunner):
                 elif ce.type == 'image_base64':
                     image_b64, image_format = await image.extract_b64_and_format(ce.image_base64)
                     file_bytes = base64.b64decode(image_b64)
-                    file = ('img.png', file_bytes, f'image/{image_format}')
-                    file_upload_resp = await self.dify_client.upload_file(
-                        file,
-                        f'{query.session.launcher_type.value}_{query.session.launcher_id}',
-                    )
-                    image_id = file_upload_resp['id']
-                    file_ids.append(image_id)
-                # elif ce.type == "file_url":
-                #     file_bytes = base64.b64decode(ce.file_url)
-                #     file_upload_resp = await self.dify_client.upload_file(
-                #         file_bytes,
-                #         f'{query.session.launcher_type.value}_{query.session.launcher_id}',
-                #     )
-                #     file_id = file_upload_resp['id']
-                #     file_ids.append(file_id)
+                    image_id = await upload_file_bytes(f'img.{image_format}', file_bytes, f'image/{image_format}')
+                    upload_files.append({'type': 'image', 'id': image_id})
+                elif ce.type == 'file_url':
+                    file_url = getattr(ce, 'file_url', None)
+                    file_name = getattr(ce, 'file_name', None) or 'file'
+                    try:
+                        file_bytes, content_type = await download_file(file_url)
+                        file_id = await upload_file_bytes(file_name, file_bytes, content_type)
+                        file_type = _detect_file_type(content_type)
+                        upload_files.append({'type': file_type, 'id': file_id})
+                    except Exception as e:
+                        self.ap.logger.warning(f'dify file upload failed: {e}')
+                elif ce.type == 'file_base64':
+                    file_name = getattr(ce, 'file_name', None) or 'file'
+
+                    header, b64_data = ce.file_base64.split(',', 1)
+                    content_type = 'application/octet-stream'
+                    if ';' in header:
+                        content_type = header.split(';')[0][5:] or content_type
+                    file_bytes = base64.b64decode(b64_data)
+                    file_id = await upload_file_bytes(file_name, file_bytes, content_type)
+                    file_type = _detect_file_type(content_type)
+                    upload_files.append({'type': file_type, 'id': file_id})
+
         elif isinstance(query.user_message.content, str):
             plain_text = query.user_message.content
-        # plain_text = "When the file content is readable, please read the content of this file. When the file is an image, describe the content of this image." if file_ids and not plain_text else plain_text
-        # plain_text = "The user message type cannot be parsed." if not file_ids and not plain_text else plain_text
-        # plain_text = plain_text if plain_text else "When the file content is readable, please read the content of this file. When the file is an image, describe the content of this image."
-        # print(self.pipeline_config['ai'])
+
         plain_text = plain_text if plain_text else self.pipeline_config['ai']['dify-service-api']['base-prompt']
 
-        return plain_text, file_ids
+        return plain_text, upload_files
 
     async def _chat_messages(
         self, query: pipeline_query.Query
@@ -118,14 +155,15 @@ class DifyServiceAPIRunner(runner.RequestRunner):
         cov_id = query.session.using_conversation.uuid or ''
         query.variables['conversation_id'] = cov_id
 
-        plain_text, image_ids = await self._preprocess_user_message(query)
+        plain_text, upload_files = await self._preprocess_user_message(query)
 
         files = [
             {
-                'type': 'image',
-                'upload_file_id': image_id,
+                'type': f['type'],
+                'transfer_method': 'local_file',
+                'upload_file_id': f['id'],
             }
-            for image_id in image_ids
+            for f in upload_files
         ]
 
         mode = 'basic'  # 标记是基础编排还是工作流编排
@@ -183,15 +221,15 @@ class DifyServiceAPIRunner(runner.RequestRunner):
         cov_id = query.session.using_conversation.uuid or ''
         query.variables['conversation_id'] = cov_id
 
-        plain_text, image_ids = await self._preprocess_user_message(query)
+        plain_text, upload_files = await self._preprocess_user_message(query)
 
         files = [
             {
-                'type': 'image',
+                'type': f['type'],
                 'transfer_method': 'local_file',
-                'upload_file_id': image_id,
+                'upload_file_id': f['id'],
             }
-            for image_id in image_ids
+            for f in upload_files
         ]
 
         ignored_events = []
@@ -280,15 +318,15 @@ class DifyServiceAPIRunner(runner.RequestRunner):
 
         query.variables['conversation_id'] = query.session.using_conversation.uuid
 
-        plain_text, image_ids = await self._preprocess_user_message(query)
+        plain_text, upload_files = await self._preprocess_user_message(query)
 
         files = [
             {
-                'type': 'image',
+                'type': f['type'],
                 'transfer_method': 'local_file',
-                'upload_file_id': image_id,
+                'upload_file_id': f['id'],
             }
-            for image_id in image_ids
+            for f in upload_files
         ]
 
         ignored_events = ['text_chunk', 'workflow_started']
@@ -352,15 +390,15 @@ class DifyServiceAPIRunner(runner.RequestRunner):
         cov_id = query.session.using_conversation.uuid or ''
         query.variables['conversation_id'] = cov_id
 
-        plain_text, image_ids = await self._preprocess_user_message(query)
+        plain_text, upload_files = await self._preprocess_user_message(query)
 
         files = [
             {
-                'type': 'image',
+                'type': f['type'],
                 'transfer_method': 'local_file',
-                'upload_file_id': image_id,
+                'upload_file_id': f['id'],
             }
-            for image_id in image_ids
+            for f in upload_files
         ]
 
         basic_mode_pending_chunk = ''
@@ -436,15 +474,15 @@ class DifyServiceAPIRunner(runner.RequestRunner):
         cov_id = query.session.using_conversation.uuid or ''
         query.variables['conversation_id'] = cov_id
 
-        plain_text, image_ids = await self._preprocess_user_message(query)
+        plain_text, upload_files = await self._preprocess_user_message(query)
 
         files = [
             {
-                'type': 'image',
+                'type': f['type'],
                 'transfer_method': 'local_file',
-                'upload_file_id': image_id,
+                'upload_file_id': f['id'],
             }
-            for image_id in image_ids
+            for f in upload_files
         ]
 
         ignored_events = []
@@ -558,15 +596,15 @@ class DifyServiceAPIRunner(runner.RequestRunner):
 
         query.variables['conversation_id'] = query.session.using_conversation.uuid
 
-        plain_text, image_ids = await self._preprocess_user_message(query)
+        plain_text, upload_files = await self._preprocess_user_message(query)
 
         files = [
             {
-                'type': 'image',
+                'type': f['type'],
                 'transfer_method': 'local_file',
-                'upload_file_id': image_id,
+                'upload_file_id': f['id'],
             }
-            for image_id in image_ids
+            for f in upload_files
         ]
 
         ignored_events = ['workflow_started']