From 75e5af26d0214aafc33ccf65cd34467b69241fdf Mon Sep 17 00:00:00 2001
From: RockChinQ <rockchinq@gmail.com>
Date: Wed, 17 Jun 2026 21:57:09 -0400
Subject: [PATCH] feat(box): support voice/file attachment round-trip
 end-to-end
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends the bidirectional attachment transfer to audio and arbitrary files
through the real webchat UI, and fixes the model-payload errors that
non-image attachments triggered.

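- platform(websocket_adapter): resolve Voice/File component storage keys to
  base64 data URIs (previously only Image), so audio/documents reach the
  sandbox inbox. The MIME type is guessed from the storage key, falling back
  to audio/wav for Voice and application/octet-stream for File.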
- web(debug-dialog): accept audio/* and any file in the uploader (was
  image-only), classify by mimetype, upload Voice/File via the documents
  endpoint, and render non-image staged attachments as a chip.
- provider(litellmchat): drop non-image file parts (file_base64 / file_url)
  when building the OpenAI/LiteLLM payload. These come from Voice/File
  attachments — including ones replayed from conversation history — and the
  agent reads their bytes from the sandbox, not the model. Without this the
  provider rejects the request: 'invalid content type=file_base64'.
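- provider(localagent): also strip file_base64 / file_url parts from the
  current user message, in the same step that builds the sandbox-path note
  (images are kept). This only keeps the current turn clean for the model;
  file parts replayed from conversation history are still caught by the
  litellmchat requester filter.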
- tests: cover the requester strip/keep behavior (file dropped, image kept and
  reshaped to image_url, mixed history, plain-string content).
---
 .../pkg/platform/sources/websocket_adapter.py | 42 ++++----
 .../modelmgr/requesters/litellmchat.py        | 11 +++
 .../pkg/provider/runners/localagent.py        | 15 +++
 .../provider/test_litellm_convert_messages.py | 93 ++++++++++++++++++
 .../components/debug-dialog/DebugDialog.tsx   | 97 ++++++++++++++-----
 5 files changed, 217 insertions(+), 41 deletions(-)
 create mode 100644 tests/unit_tests/provider/test_litellm_convert_messages.py

diff --git a/src/langbot/pkg/platform/sources/websocket_adapter.py b/src/langbot/pkg/platform/sources/websocket_adapter.py
index 9ffcf04ac..0574292f3 100644
--- a/src/langbot/pkg/platform/sources/websocket_adapter.py
+++ b/src/langbot/pkg/platform/sources/websocket_adapter.py
@@ -312,12 +312,18 @@ class WebSocketAdapter(abstract_platform_adapter.AbstractMessagePlatformAdapter)
 
     async def _process_image_components(self, message_chain_obj: list):
         """
-        处理消息链中的图片和文件组件，将path转换为base64
+        处理消息链中的图片、语音和文件组件，将 path 转换为 base64
+
+        Image / Voice / File components uploaded from the web client carry a
+        storage key in ``path``. Resolve it to a base64 data URI so downstream
+        stages (multimodal LLM input and the Box sandbox inbox) have a usable
+        payload, then drop the now-consumed storage object.
 
         Args:
             message_chain_obj: 消息链对象列表
         """
         import base64
+        import mimetypes
 
         storage_mgr = self.ap.storage_mgr
 
@@ -325,31 +331,33 @@ class WebSocketAdapter(abstract_platform_adapter.AbstractMessagePlatformAdapter)
             comp_type = component.get('type', '')
             comp_path = component.get('path', '')
 
-            if not comp_path:
+            if not comp_path or comp_type not in ('Image', 'Voice', 'File'):
                 continue
 
-            if comp_type == 'Image':
-                try:
-                    file_content = await storage_mgr.storage_provider.load(comp_path)
-                    base64_str = base64.b64encode(file_content).decode('utf-8')
+            try:
+                file_content = await storage_mgr.storage_provider.load(comp_path)
+                base64_str = base64.b64encode(file_content).decode('utf-8')
 
-                    file_key = comp_path
-                    if file_key.lower().endswith(('.jpg', '.jpeg')):
+                lowered = comp_path.lower()
+                if comp_type == 'Image':
+                    if lowered.endswith(('.jpg', '.jpeg')):
                         mime_type = 'image/jpeg'
-                    elif file_key.lower().endswith('.png'):
-                        mime_type = 'image/png'
-                    elif file_key.lower().endswith('.gif'):
+                    elif lowered.endswith('.gif'):
                         mime_type = 'image/gif'
-                    elif file_key.lower().endswith('.webp'):
+                    elif lowered.endswith('.webp'):
                         mime_type = 'image/webp'
                     else:
                         mime_type = 'image/png'
+                elif comp_type == 'Voice':
+                    mime_type = mimetypes.guess_type(comp_path)[0] or 'audio/wav'
+                else:  # File
+                    mime_type = mimetypes.guess_type(comp_path)[0] or 'application/octet-stream'
 
-                    component['base64'] = f'data:{mime_type};base64,{base64_str}'
-                    await storage_mgr.storage_provider.delete(comp_path)
-                    component['path'] = ''
-                except Exception as e:
-                    await self.logger.error(f'Failed to load image file {comp_path}: {e}')
+                component['base64'] = f'data:{mime_type};base64,{base64_str}'
+                await storage_mgr.storage_provider.delete(comp_path)
+                component['path'] = ''
+            except Exception as e:
+                await self.logger.error(f'Failed to load {comp_type} file {comp_path}: {e}')
 
     async def handle_websocket_message(
         self,
diff --git a/src/langbot/pkg/provider/modelmgr/requesters/litellmchat.py b/src/langbot/pkg/provider/modelmgr/requesters/litellmchat.py
index 8c750bd7d..d58dd2c5f 100644
--- a/src/langbot/pkg/provider/modelmgr/requesters/litellmchat.py
+++ b/src/langbot/pkg/provider/modelmgr/requesters/litellmchat.py
@@ -216,11 +216,22 @@ class LiteLLMRequester(requester.ProviderAPIRequester):
             content = msg_dict.get('content')
 
             if isinstance(content, list):
+                converted_parts = []
                 for part in content:
                     if isinstance(part, dict) and part.get('type') == 'image_base64':
                         part['image_url'] = {'url': part['image_base64']}
                         part['type'] = 'image_url'
                         del part['image_base64']
+                    # OpenAI-compatible chat models reject non-image file parts
+                    # (audio/document base64 or url). These originate from Voice /
+                    # File attachments — including ones replayed from conversation
+                    # history — and the agent already accesses their bytes via the
+                    # sandbox. Drop them from the model payload to avoid
+                    # "Invalid user message ... invalid content type=file_base64".
+                    if isinstance(part, dict) and part.get('type') in ('file_base64', 'file_url'):
+                        continue
+                    converted_parts.append(part)
+                msg_dict['content'] = converted_parts
 
             req_messages.append(msg_dict)
 
diff --git a/src/langbot/pkg/provider/runners/localagent.py b/src/langbot/pkg/provider/runners/localagent.py
index b724c5a4c..5143e5b08 100644
--- a/src/langbot/pkg/provider/runners/localagent.py
+++ b/src/langbot/pkg/provider/runners/localagent.py
@@ -143,6 +143,21 @@ class LocalAgentRunner(runner.RequestRunner):
         )
         note = '\n'.join(lines)
 
+        # Voice/File attachments are now available to the agent via the sandbox
+        # (exec/read/write tools). Their raw bytes must NOT be forwarded to the
+        # chat model as multimodal content: providers reject non-image file
+        # parts ("Invalid user message ... ensure all user messages are valid
+        # OpenAI chat completion messages"). Strip those content elements and
+        # rely on the sandbox-path note instead. Images are kept so vision
+        # models can still see them.
+        _model_unsafe_types = {'file_base64', 'file_url'}
+        if isinstance(user_message.content, list):
+            user_message.content = [
+                ce
+                for ce in user_message.content
+                if getattr(ce, 'type', None) not in _model_unsafe_types
+            ]
+
         if isinstance(user_message.content, str):
             user_message.content = [
                 provider_message.ContentElement.from_text(user_message.content),
diff --git a/tests/unit_tests/provider/test_litellm_convert_messages.py b/tests/unit_tests/provider/test_litellm_convert_messages.py
new file mode 100644
index 000000000..87ad2e027
--- /dev/null
+++ b/tests/unit_tests/provider/test_litellm_convert_messages.py
@@ -0,0 +1,93 @@
+"""Unit tests for LiteLLMRequester._convert_messages.
+
+Focus: the content-part normalization that (a) converts image_base64 parts to
+the OpenAI image_url shape and (b) drops non-image file parts (file_base64 /
+file_url) which OpenAI-compatible chat models reject. The latter is essential
+for Voice/File attachments — including ones replayed from conversation history —
+since the agent consumes their bytes via the sandbox, not the model payload.
+"""
+
+import langbot_plugin.api.entities.builtin.provider.message as provider_message
+
+from langbot.pkg.provider.modelmgr.requesters.litellmchat import LiteLLMRequester
+
+
+def _make_requester() -> LiteLLMRequester:
+    # _convert_messages does not touch instance config, so bypass __init__.
+    return LiteLLMRequester.__new__(LiteLLMRequester)
+
+
+def test_convert_messages_drops_file_base64_part():
+    req = _make_requester()
+    msg = provider_message.Message(
+        role='user',
+        content=[
+            provider_message.ContentElement.from_text('analyze this audio'),
+            provider_message.ContentElement.from_file_base64('data:audio/wav;base64,AAAA', 'voice.wav'),
+        ],
+    )
+    out = req._convert_messages([msg])
+    parts = out[0]['content']
+    types = [p.get('type') for p in parts]
+    assert 'file_base64' not in types
+    assert types == ['text']
+    assert parts[0]['text'] == 'analyze this audio'
+
+
+def test_convert_messages_drops_file_url_part():
+    req = _make_requester()
+    msg = provider_message.Message(
+        role='user',
+        content=[
+            provider_message.ContentElement.from_text('here is a doc'),
+            provider_message.ContentElement.from_file_url('http://example.com/report.xlsx', 'report.xlsx'),
+        ],
+    )
+    out = req._convert_messages([msg])
+    types = [p.get('type') for p in out[0]['content']]
+    assert types == ['text']
+
+
+def test_convert_messages_keeps_image_and_converts_to_image_url():
+    req = _make_requester()
+    msg = provider_message.Message(
+        role='user',
+        content=[
+            provider_message.ContentElement.from_text('look'),
+            provider_message.ContentElement.from_image_base64('data:image/png;base64,AAAA'),
+        ],
+    )
+    out = req._convert_messages([msg])
+    parts = out[0]['content']
+    types = [p.get('type') for p in parts]
+    # image is preserved and reshaped to the OpenAI image_url form
+    assert types == ['text', 'image_url']
+    img_part = parts[1]
+    assert img_part['image_url'] == {'url': 'data:image/png;base64,AAAA'}
+    assert 'image_base64' not in img_part
+
+
+def test_convert_messages_mixed_history_strips_only_files():
+    req = _make_requester()
+    # Simulate replayed history: an old voice turn + a current text turn.
+    history_voice = provider_message.Message(
+        role='user',
+        content=[
+            provider_message.ContentElement.from_text('old audio turn'),
+            provider_message.ContentElement.from_file_base64('data:audio/wav;base64,BBBB', 'voice.wav'),
+        ],
+    )
+    current = provider_message.Message(
+        role='user',
+        content=[provider_message.ContentElement.from_text('now do the csv')],
+    )
+    out = req._convert_messages([history_voice, current])
+    assert [p.get('type') for p in out[0]['content']] == ['text']
+    assert [p.get('type') for p in out[1]['content']] == ['text']
+
+
+def test_convert_messages_plain_string_content_untouched():
+    req = _make_requester()
+    msg = provider_message.Message(role='user', content='just text')
+    out = req._convert_messages([msg])
+    assert out[0]['content'] == 'just text'
diff --git a/web/src/app/home/pipelines/components/debug-dialog/DebugDialog.tsx b/web/src/app/home/pipelines/components/debug-dialog/DebugDialog.tsx
index 77dce2777..b45e87dd1 100644
--- a/web/src/app/home/pipelines/components/debug-dialog/DebugDialog.tsx
+++ b/web/src/app/home/pipelines/components/debug-dialog/DebugDialog.tsx
@@ -65,7 +65,12 @@ export default function DebugDialog({
   const [isHovering, setIsHovering] = useState(false);
   const [isConnected, setIsConnected] = useState(false);
   const [selectedImages, setSelectedImages] = useState<
-    Array<{ file: File; preview: string; fileKey?: string }>
+    Array<{
+      file: File;
+      preview: string;
+      fileKey?: string;
+      kind: 'image' | 'voice' | 'file';
+    }>
   >([]);
   const [isUploading, setIsUploading] = useState(false);
   const [previewImageUrl, setPreviewImageUrl] = useState<string>('');
@@ -293,23 +298,38 @@ export default function DebugDialog({
     const files = e.target.files;
     if (!files || files.length === 0) return;
 
-    const newImages: Array<{ file: File; preview: string }> = [];
+    const newImages: Array<{
+      file: File;
+      preview: string;
+      kind: 'image' | 'voice' | 'file';
+    }> = [];
 
     for (let i = 0; i < files.length; i++) {
       const file = files[i];
       if (file.type.startsWith('image/')) {
-        const preview = URL.createObjectURL(file);
-        newImages.push({ file, preview });
+        newImages.push({
+          file,
+          preview: URL.createObjectURL(file),
+          kind: 'image',
+        });
+      } else if (file.type.startsWith('audio/')) {
+        newImages.push({ file, preview: '', kind: 'voice' });
+      } else {
+        newImages.push({ file, preview: '', kind: 'file' });
       }
     }
 
     setSelectedImages((prev) => [...prev, ...newImages]);
+    // reset the input so selecting the same file again re-triggers onChange
+    e.target.value = '';
   };
 
   const handleRemoveImage = (index: number) => {
     setSelectedImages((prev) => {
       const newImages = [...prev];
-      URL.revokeObjectURL(newImages[index].preview);
+      if (newImages[index].preview) {
+        URL.revokeObjectURL(newImages[index].preview);
+      }
       newImages.splice(index, 1);
       return newImages;
     });
@@ -373,19 +393,33 @@ export default function DebugDialog({
         });
       }
 
-      // Upload images and add to message chain
-      for (const image of selectedImages) {
+      // Upload attachments and add to message chain
+      for (const attachment of selectedImages) {
         try {
-          const result = await httpClient.uploadWebSocketImage(
-            selectedPipelineId,
-            image.file,
-          );
-          messageChain.push({
-            type: 'Image',
-            path: result.file_key,
-          });
+          if (attachment.kind === 'image') {
+            const result = await httpClient.uploadWebSocketImage(
+              selectedPipelineId,
+              attachment.file,
+            );
+            messageChain.push({
+              type: 'Image',
+              path: result.file_key,
+            });
+          } else {
+            // Voice / File go through the generic document upload endpoint,
+            // which returns a storage key the backend resolves into the
+            // sandbox inbox just like images.
+            const result = await httpClient.uploadDocumentFile(attachment.file);
+            messageChain.push({
+              type: attachment.kind === 'voice' ? 'Voice' : 'File',
+              path: result.file_id,
+              ...(attachment.kind === 'file'
+                ? { name: attachment.file.name }
+                : {}),
+            });
+          }
         } catch (error) {
-          console.error('Image upload failed:', error);
+          console.error('Attachment upload failed:', error);
           toast.error(t('pipelines.debugDialog.imageUploadFailed'));
         }
       }
@@ -394,7 +428,9 @@ export default function DebugDialog({
       setInputValue('');
       setHasAt(false);
       setQuotedMessage(null);
-      selectedImages.forEach((img) => URL.revokeObjectURL(img.preview));
+      selectedImages.forEach((img) => {
+        if (img.preview) URL.revokeObjectURL(img.preview);
+      });
       setSelectedImages([]);
 
       // Send message via WebSocket
@@ -861,17 +897,30 @@ export default function DebugDialog({
           </div>
         )}
 
-        {/* Image preview area */}
+        {/* Attachment preview area */}
         {selectedImages.length > 0 && (
           <div className="px-4 pb-2">
             <div className="flex gap-2 flex-wrap">
               {selectedImages.map((image, index) => (
                 <div key={index} className="relative group">
-                  <img
-                    src={image.preview}
-                    alt={`preview-${index}`}
-                    className="w-20 h-20 object-cover rounded-lg border"
-                  />
+                  {image.kind === 'image' ? (
+                    <img
+                      src={image.preview}
+                      alt={`preview-${index}`}
+                      className="w-20 h-20 object-cover rounded-lg border"
+                    />
+                  ) : (
+                    <div className="w-36 h-20 px-2 rounded-lg border bg-muted/40 flex items-center gap-2 overflow-hidden">
+                      {image.kind === 'voice' ? (
+                        <Music className="size-5 shrink-0 text-muted-foreground" />
+                      ) : (
+                        <Paperclip className="size-5 shrink-0 text-muted-foreground" />
+                      )}
+                      <span className="text-xs text-muted-foreground truncate">
+                        {image.file.name}
+                      </span>
+                    </div>
+                  )}
                   <button
                     type="button"
                     onClick={() => handleRemoveImage(index)}
@@ -900,7 +949,7 @@ export default function DebugDialog({
             <input
               ref={fileInputRef}
               type="file"
-              accept="image/*"
+              accept="image/*,audio/*,*/*"
               multiple
               onChange={handleImageSelect}
               className="hidden"