feat(box): support voice/file attachment round-trip end-to-end

Extends the bidirectional attachment transfer to audio and arbitrary files through the real webchat UI, and fixes the model-payload errors that non-image attachments triggered.

- platform(websocket_adapter): resolve Voice/File component storage keys to base64 (previously only Image), so audio/documents reach the sandbox inbox.
- web(debug-dialog): accept audio/* and any file in the uploader (was image-only), classify by mimetype, upload Voice/File via the documents endpoint, and render non-image staged attachments as a chip.
- provider(litellmchat): drop non-image file parts (file_base64 / file_url) when building the OpenAI/LiteLLM payload. These come from Voice/File attachments — including ones replayed from conversation history — and the agent reads their bytes from the sandbox, not the model. Without this the provider rejects the request: 'invalid content type=file_base64'.
- provider(localagent): also strip those parts from the current user message alongside the sandbox-path note (model-facing clarity; the requester is the real safety net for history).
- tests: cover the requester strip/keep behavior (file dropped, image kept and reshaped to image_url, mixed history, plain-string content).
2026-06-18 11:44:18 +00:00 · 2026-06-17 21:57:09 -04:00
parent 22c0a18bea
commit 75e5af26d0
5 changed files with 217 additions and 41 deletions
@@ -312,12 +312,18 @@ class WebSocketAdapter(abstract_platform_adapter.AbstractMessagePlatformAdapter)

    async def _process_image_components(self, message_chain_obj: list):
        """
-        处理消息链中的图片和文件组件，将path转换为base64
+        处理消息链中的图片、语音和文件组件，将 path 转换为 base64
+
+        Image / Voice / File components uploaded from the web client carry a
+        storage key in ``path``. Resolve it to a base64 data URI so downstream
+        stages (multimodal LLM input and the Box sandbox inbox) have a usable
+        payload, then drop the now-consumed storage object.

        Args:
            message_chain_obj: 消息链对象列表
        """
        import base64
+        import mimetypes

        storage_mgr = self.ap.storage_mgr

@@ -325,31 +331,33 @@ class WebSocketAdapter(abstract_platform_adapter.AbstractMessagePlatformAdapter)
            comp_type = component.get('type', '')
            comp_path = component.get('path', '')

-            if not comp_path:
+            if not comp_path or comp_type not in ('Image', 'Voice', 'File'):
                continue

-            if comp_type == 'Image':
-                try:
-                    file_content = await storage_mgr.storage_provider.load(comp_path)
-                    base64_str = base64.b64encode(file_content).decode('utf-8')
+            try:
+                file_content = await storage_mgr.storage_provider.load(comp_path)
+                base64_str = base64.b64encode(file_content).decode('utf-8')

-                    file_key = comp_path
-                    if file_key.lower().endswith(('.jpg', '.jpeg')):
+                lowered = comp_path.lower()
+                if comp_type == 'Image':
+                    if lowered.endswith(('.jpg', '.jpeg')):
                        mime_type = 'image/jpeg'
-                    elif file_key.lower().endswith('.png'):
-                        mime_type = 'image/png'
-                    elif file_key.lower().endswith('.gif'):
+                    elif lowered.endswith('.gif'):
                        mime_type = 'image/gif'
-                    elif file_key.lower().endswith('.webp'):
+                    elif lowered.endswith('.webp'):
                        mime_type = 'image/webp'
                    else:
                        mime_type = 'image/png'
+                elif comp_type == 'Voice':
+                    mime_type = mimetypes.guess_type(comp_path)[0] or 'audio/wav'
+                else:  # File
+                    mime_type = mimetypes.guess_type(comp_path)[0] or 'application/octet-stream'

-                    component['base64'] = f'data:{mime_type};base64,{base64_str}'
-                    await storage_mgr.storage_provider.delete(comp_path)
-                    component['path'] = ''
-                except Exception as e:
-                    await self.logger.error(f'Failed to load image file {comp_path}: {e}')
+                component['base64'] = f'data:{mime_type};base64,{base64_str}'
+                await storage_mgr.storage_provider.delete(comp_path)
+                component['path'] = ''
+            except Exception as e:
+                await self.logger.error(f'Failed to load {comp_type} file {comp_path}: {e}')

    async def handle_websocket_message(
        self,
@@ -216,11 +216,22 @@ class LiteLLMRequester(requester.ProviderAPIRequester):
            content = msg_dict.get('content')

            if isinstance(content, list):
+                converted_parts = []
                for part in content:
                    if isinstance(part, dict) and part.get('type') == 'image_base64':
                        part['image_url'] = {'url': part['image_base64']}
                        part['type'] = 'image_url'
                        del part['image_base64']
+                    # OpenAI-compatible chat models reject non-image file parts
+                    # (audio/document base64 or url). These originate from Voice /
+                    # File attachments — including ones replayed from conversation
+                    # history — and the agent already accesses their bytes via the
+                    # sandbox. Drop them from the model payload to avoid
+                    # "Invalid user message ... invalid content type=file_base64".
+                    if isinstance(part, dict) and part.get('type') in ('file_base64', 'file_url'):
+                        continue
+                    converted_parts.append(part)
+                msg_dict['content'] = converted_parts

            req_messages.append(msg_dict)

@@ -143,6 +143,21 @@ class LocalAgentRunner(runner.RequestRunner):
        )
        note = '\n'.join(lines)

+        # Voice/File attachments are now available to the agent via the sandbox
+        # (exec/read/write tools). Their raw bytes must NOT be forwarded to the
+        # chat model as multimodal content: providers reject non-image file
+        # parts ("Invalid user message ... ensure all user messages are valid
+        # OpenAI chat completion messages"). Strip those content elements and
+        # rely on the sandbox-path note instead. Images are kept so vision
+        # models can still see them.
+        _model_unsafe_types = {'file_base64', 'file_url'}
+        if isinstance(user_message.content, list):
+            user_message.content = [
+                ce
+                for ce in user_message.content
+                if getattr(ce, 'type', None) not in _model_unsafe_types
+            ]
+
        if isinstance(user_message.content, str):
            user_message.content = [
                provider_message.ContentElement.from_text(user_message.content),
@@ -0,0 +1,93 @@
+"""Unit tests for LiteLLMRequester._convert_messages.
+
+Focus: the content-part normalization that (a) converts image_base64 parts to
+the OpenAI image_url shape and (b) drops non-image file parts (file_base64 /
+file_url) which OpenAI-compatible chat models reject. The latter is essential
+for Voice/File attachments — including ones replayed from conversation history —
+since the agent consumes their bytes via the sandbox, not the model payload.
+"""
+
+import langbot_plugin.api.entities.builtin.provider.message as provider_message
+
+from langbot.pkg.provider.modelmgr.requesters.litellmchat import LiteLLMRequester
+
+
+def _make_requester() -> LiteLLMRequester:
+    # _convert_messages does not touch instance config, so bypass __init__.
+    return LiteLLMRequester.__new__(LiteLLMRequester)
+
+
+def test_convert_messages_drops_file_base64_part():
+    req = _make_requester()
+    msg = provider_message.Message(
+        role='user',
+        content=[
+            provider_message.ContentElement.from_text('analyze this audio'),
+            provider_message.ContentElement.from_file_base64('data:audio/wav;base64,AAAA', 'voice.wav'),
+        ],
+    )
+    out = req._convert_messages([msg])
+    parts = out[0]['content']
+    types = [p.get('type') for p in parts]
+    assert 'file_base64' not in types
+    assert types == ['text']
+    assert parts[0]['text'] == 'analyze this audio'
+
+
+def test_convert_messages_drops_file_url_part():
+    req = _make_requester()
+    msg = provider_message.Message(
+        role='user',
+        content=[
+            provider_message.ContentElement.from_text('here is a doc'),
+            provider_message.ContentElement.from_file_url('http://example.com/report.xlsx', 'report.xlsx'),
+        ],
+    )
+    out = req._convert_messages([msg])
+    types = [p.get('type') for p in out[0]['content']]
+    assert types == ['text']
+
+
+def test_convert_messages_keeps_image_and_converts_to_image_url():
+    req = _make_requester()
+    msg = provider_message.Message(
+        role='user',
+        content=[
+            provider_message.ContentElement.from_text('look'),
+            provider_message.ContentElement.from_image_base64('data:image/png;base64,AAAA'),
+        ],
+    )
+    out = req._convert_messages([msg])
+    parts = out[0]['content']
+    types = [p.get('type') for p in parts]
+    # image is preserved and reshaped to the OpenAI image_url form
+    assert types == ['text', 'image_url']
+    img_part = parts[1]
+    assert img_part['image_url'] == {'url': 'data:image/png;base64,AAAA'}
+    assert 'image_base64' not in img_part
+
+
+def test_convert_messages_mixed_history_strips_only_files():
+    req = _make_requester()
+    # Simulate replayed history: an old voice turn + a current text turn.
+    history_voice = provider_message.Message(
+        role='user',
+        content=[
+            provider_message.ContentElement.from_text('old audio turn'),
+            provider_message.ContentElement.from_file_base64('data:audio/wav;base64,BBBB', 'voice.wav'),
+        ],
+    )
+    current = provider_message.Message(
+        role='user',
+        content=[provider_message.ContentElement.from_text('now do the csv')],
+    )
+    out = req._convert_messages([history_voice, current])
+    assert [p.get('type') for p in out[0]['content']] == ['text']
+    assert [p.get('type') for p in out[1]['content']] == ['text']
+
+
+def test_convert_messages_plain_string_content_untouched():
+    req = _make_requester()
+    msg = provider_message.Message(role='user', content='just text')
+    out = req._convert_messages([msg])
+    assert out[0]['content'] == 'just text'
@@ -65,7 +65,12 @@ export default function DebugDialog({
  const [isHovering, setIsHovering] = useState(false);
  const [isConnected, setIsConnected] = useState(false);
  const [selectedImages, setSelectedImages] = useState<
-    Array<{ file: File; preview: string; fileKey?: string }>
+    Array<{
+      file: File;
+      preview: string;
+      fileKey?: string;
+      kind: 'image' | 'voice' | 'file';
+    }>
  >([]);
  const [isUploading, setIsUploading] = useState(false);
  const [previewImageUrl, setPreviewImageUrl] = useState<string>('');
@@ -293,23 +298,38 @@ export default function DebugDialog({
    const files = e.target.files;
    if (!files || files.length === 0) return;

-    const newImages: Array<{ file: File; preview: string }> = [];
+    const newImages: Array<{
+      file: File;
+      preview: string;
+      kind: 'image' | 'voice' | 'file';
+    }> = [];

    for (let i = 0; i < files.length; i++) {
      const file = files[i];
      if (file.type.startsWith('image/')) {
-        const preview = URL.createObjectURL(file);
-        newImages.push({ file, preview });
+        newImages.push({
+          file,
+          preview: URL.createObjectURL(file),
+          kind: 'image',
+        });
+      } else if (file.type.startsWith('audio/')) {
+        newImages.push({ file, preview: '', kind: 'voice' });
+      } else {
+        newImages.push({ file, preview: '', kind: 'file' });
      }
    }

    setSelectedImages((prev) => [...prev, ...newImages]);
+    // reset the input so selecting the same file again re-triggers onChange
+    e.target.value = '';
  };

  const handleRemoveImage = (index: number) => {
    setSelectedImages((prev) => {
      const newImages = [...prev];
-      URL.revokeObjectURL(newImages[index].preview);
+      if (newImages[index].preview) {
+        URL.revokeObjectURL(newImages[index].preview);
+      }
      newImages.splice(index, 1);
      return newImages;
    });
@@ -373,19 +393,33 @@ export default function DebugDialog({
        });
      }

-      // Upload images and add to message chain
-      for (const image of selectedImages) {
+      // Upload attachments and add to message chain
+      for (const attachment of selectedImages) {
        try {
-          const result = await httpClient.uploadWebSocketImage(
-            selectedPipelineId,
-            image.file,
-          );
-          messageChain.push({
-            type: 'Image',
-            path: result.file_key,
-          });
+          if (attachment.kind === 'image') {
+            const result = await httpClient.uploadWebSocketImage(
+              selectedPipelineId,
+              attachment.file,
+            );
+            messageChain.push({
+              type: 'Image',
+              path: result.file_key,
+            });
+          } else {
+            // Voice / File go through the generic document upload endpoint,
+            // which returns a storage key the backend resolves into the
+            // sandbox inbox just like images.
+            const result = await httpClient.uploadDocumentFile(attachment.file);
+            messageChain.push({
+              type: attachment.kind === 'voice' ? 'Voice' : 'File',
+              path: result.file_id,
+              ...(attachment.kind === 'file'
+                ? { name: attachment.file.name }
+                : {}),
+            });
+          }
        } catch (error) {
-          console.error('Image upload failed:', error);
+          console.error('Attachment upload failed:', error);
          toast.error(t('pipelines.debugDialog.imageUploadFailed'));
        }
      }
@@ -394,7 +428,9 @@ export default function DebugDialog({
      setInputValue('');
      setHasAt(false);
      setQuotedMessage(null);
-      selectedImages.forEach((img) => URL.revokeObjectURL(img.preview));
+      selectedImages.forEach((img) => {
+        if (img.preview) URL.revokeObjectURL(img.preview);
+      });
      setSelectedImages([]);

      // Send message via WebSocket
@@ -861,17 +897,30 @@ export default function DebugDialog({
          </div>
        )}

-        {/* Image preview area */}
+        {/* Attachment preview area */}
        {selectedImages.length > 0 && (
          <div className="px-4 pb-2">
            <div className="flex gap-2 flex-wrap">
              {selectedImages.map((image, index) => (
                <div key={index} className="relative group">
-                  <img
-                    src={image.preview}
-                    alt={`preview-${index}`}
-                    className="w-20 h-20 object-cover rounded-lg border"
-                  />
+                  {image.kind === 'image' ? (
+                    <img
+                      src={image.preview}
+                      alt={`preview-${index}`}
+                      className="w-20 h-20 object-cover rounded-lg border"
+                    />
+                  ) : (
+                    <div className="w-36 h-20 px-2 rounded-lg border bg-muted/40 flex items-center gap-2 overflow-hidden">
+                      {image.kind === 'voice' ? (
+                        <Music className="size-5 shrink-0 text-muted-foreground" />
+                      ) : (
+                        <Paperclip className="size-5 shrink-0 text-muted-foreground" />
+                      )}
+                      <span className="text-xs text-muted-foreground truncate">
+                        {image.file.name}
+                      </span>
+                    </div>
+                  )}
                  <button
                    type="button"
                    onClick={() => handleRemoveImage(index)}
@@ -900,7 +949,7 @@ export default function DebugDialog({
            <input
              ref={fileInputRef}
              type="file"
-              accept="image/*"
+              accept="image/*,audio/*,*/*"
              multiple
              onChange={handleImageSelect}
              className="hidden"