From 75e5af26d0214aafc33ccf65cd34467b69241fdf Mon Sep 17 00:00:00 2001 From: RockChinQ Date: Wed, 17 Jun 2026 21:57:09 -0400 Subject: [PATCH] feat(box): support voice/file attachment round-trip end-to-end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the bidirectional attachment transfer to audio and arbitrary files through the real webchat UI, and fixes the model-payload errors that non-image attachments triggered. - platform(websocket_adapter): resolve Voice/File component storage keys to base64 (previously only Image), so audio/documents reach the sandbox inbox. - web(debug-dialog): accept audio/* and any file in the uploader (was image-only), classify by mimetype, upload Voice/File via the documents endpoint, and render non-image staged attachments as a chip. - provider(litellmchat): drop non-image file parts (file_base64 / file_url) when building the OpenAI/LiteLLM payload. These come from Voice/File attachments — including ones replayed from conversation history — and the agent reads their bytes from the sandbox, not the model. Without this the provider rejects the request: 'invalid content type=file_base64'. - provider(localagent): also strip those parts from the current user message alongside the sandbox-path note (model-facing clarity; the requester is the real safety net for history). - tests: cover the requester strip/keep behavior (file dropped, image kept and reshaped to image_url, mixed history, plain-string content). --- .../pkg/platform/sources/websocket_adapter.py | 42 ++++---- .../modelmgr/requesters/litellmchat.py | 11 +++ .../pkg/provider/runners/localagent.py | 15 +++ .../provider/test_litellm_convert_messages.py | 93 ++++++++++++++++++ .../components/debug-dialog/DebugDialog.tsx | 97 ++++++++++++++----- 5 files changed, 217 insertions(+), 41 deletions(-) create mode 100644 tests/unit_tests/provider/test_litellm_convert_messages.py diff --git a/src/langbot/pkg/platform/sources/websocket_adapter.py b/src/langbot/pkg/platform/sources/websocket_adapter.py index 9ffcf04ac..0574292f3 100644 --- a/src/langbot/pkg/platform/sources/websocket_adapter.py +++ b/src/langbot/pkg/platform/sources/websocket_adapter.py @@ -312,12 +312,18 @@ class WebSocketAdapter(abstract_platform_adapter.AbstractMessagePlatformAdapter) async def _process_image_components(self, message_chain_obj: list): """ - 处理消息链中的图片和文件组件,将path转换为base64 + 处理消息链中的图片、语音和文件组件,将 path 转换为 base64 + + Image / Voice / File components uploaded from the web client carry a + storage key in ``path``. Resolve it to a base64 data URI so downstream + stages (multimodal LLM input and the Box sandbox inbox) have a usable + payload, then drop the now-consumed storage object. Args: message_chain_obj: 消息链对象列表 """ import base64 + import mimetypes storage_mgr = self.ap.storage_mgr @@ -325,31 +331,33 @@ class WebSocketAdapter(abstract_platform_adapter.AbstractMessagePlatformAdapter) comp_type = component.get('type', '') comp_path = component.get('path', '') - if not comp_path: + if not comp_path or comp_type not in ('Image', 'Voice', 'File'): continue - if comp_type == 'Image': - try: - file_content = await storage_mgr.storage_provider.load(comp_path) - base64_str = base64.b64encode(file_content).decode('utf-8') + try: + file_content = await storage_mgr.storage_provider.load(comp_path) + base64_str = base64.b64encode(file_content).decode('utf-8') - file_key = comp_path - if file_key.lower().endswith(('.jpg', '.jpeg')): + lowered = comp_path.lower() + if comp_type == 'Image': + if lowered.endswith(('.jpg', '.jpeg')): mime_type = 'image/jpeg' - elif file_key.lower().endswith('.png'): - mime_type = 'image/png' - elif file_key.lower().endswith('.gif'): + elif lowered.endswith('.gif'): mime_type = 'image/gif' - elif file_key.lower().endswith('.webp'): + elif lowered.endswith('.webp'): mime_type = 'image/webp' else: mime_type = 'image/png' + elif comp_type == 'Voice': + mime_type = mimetypes.guess_type(comp_path)[0] or 'audio/wav' + else: # File + mime_type = mimetypes.guess_type(comp_path)[0] or 'application/octet-stream' - component['base64'] = f'data:{mime_type};base64,{base64_str}' - await storage_mgr.storage_provider.delete(comp_path) - component['path'] = '' - except Exception as e: - await self.logger.error(f'Failed to load image file {comp_path}: {e}') + component['base64'] = f'data:{mime_type};base64,{base64_str}' + await storage_mgr.storage_provider.delete(comp_path) + component['path'] = '' + except Exception as e: + await self.logger.error(f'Failed to load {comp_type} file {comp_path}: {e}') async def handle_websocket_message( self, diff --git a/src/langbot/pkg/provider/modelmgr/requesters/litellmchat.py b/src/langbot/pkg/provider/modelmgr/requesters/litellmchat.py index 8c750bd7d..d58dd2c5f 100644 --- a/src/langbot/pkg/provider/modelmgr/requesters/litellmchat.py +++ b/src/langbot/pkg/provider/modelmgr/requesters/litellmchat.py @@ -216,11 +216,22 @@ class LiteLLMRequester(requester.ProviderAPIRequester): content = msg_dict.get('content') if isinstance(content, list): + converted_parts = [] for part in content: if isinstance(part, dict) and part.get('type') == 'image_base64': part['image_url'] = {'url': part['image_base64']} part['type'] = 'image_url' del part['image_base64'] + # OpenAI-compatible chat models reject non-image file parts + # (audio/document base64 or url). These originate from Voice / + # File attachments — including ones replayed from conversation + # history — and the agent already accesses their bytes via the + # sandbox. Drop them from the model payload to avoid + # "Invalid user message ... invalid content type=file_base64". + if isinstance(part, dict) and part.get('type') in ('file_base64', 'file_url'): + continue + converted_parts.append(part) + msg_dict['content'] = converted_parts req_messages.append(msg_dict) diff --git a/src/langbot/pkg/provider/runners/localagent.py b/src/langbot/pkg/provider/runners/localagent.py index b724c5a4c..5143e5b08 100644 --- a/src/langbot/pkg/provider/runners/localagent.py +++ b/src/langbot/pkg/provider/runners/localagent.py @@ -143,6 +143,21 @@ class LocalAgentRunner(runner.RequestRunner): ) note = '\n'.join(lines) + # Voice/File attachments are now available to the agent via the sandbox + # (exec/read/write tools). Their raw bytes must NOT be forwarded to the + # chat model as multimodal content: providers reject non-image file + # parts ("Invalid user message ... ensure all user messages are valid + # OpenAI chat completion messages"). Strip those content elements and + # rely on the sandbox-path note instead. Images are kept so vision + # models can still see them. + _model_unsafe_types = {'file_base64', 'file_url'} + if isinstance(user_message.content, list): + user_message.content = [ + ce + for ce in user_message.content + if getattr(ce, 'type', None) not in _model_unsafe_types + ] + if isinstance(user_message.content, str): user_message.content = [ provider_message.ContentElement.from_text(user_message.content), diff --git a/tests/unit_tests/provider/test_litellm_convert_messages.py b/tests/unit_tests/provider/test_litellm_convert_messages.py new file mode 100644 index 000000000..87ad2e027 --- /dev/null +++ b/tests/unit_tests/provider/test_litellm_convert_messages.py @@ -0,0 +1,93 @@ +"""Unit tests for LiteLLMRequester._convert_messages. + +Focus: the content-part normalization that (a) converts image_base64 parts to +the OpenAI image_url shape and (b) drops non-image file parts (file_base64 / +file_url) which OpenAI-compatible chat models reject. The latter is essential +for Voice/File attachments — including ones replayed from conversation history — +since the agent consumes their bytes via the sandbox, not the model payload. +""" + +import langbot_plugin.api.entities.builtin.provider.message as provider_message + +from langbot.pkg.provider.modelmgr.requesters.litellmchat import LiteLLMRequester + + +def _make_requester() -> LiteLLMRequester: + # _convert_messages does not touch instance config, so bypass __init__. + return LiteLLMRequester.__new__(LiteLLMRequester) + + +def test_convert_messages_drops_file_base64_part(): + req = _make_requester() + msg = provider_message.Message( + role='user', + content=[ + provider_message.ContentElement.from_text('analyze this audio'), + provider_message.ContentElement.from_file_base64('data:audio/wav;base64,AAAA', 'voice.wav'), + ], + ) + out = req._convert_messages([msg]) + parts = out[0]['content'] + types = [p.get('type') for p in parts] + assert 'file_base64' not in types + assert types == ['text'] + assert parts[0]['text'] == 'analyze this audio' + + +def test_convert_messages_drops_file_url_part(): + req = _make_requester() + msg = provider_message.Message( + role='user', + content=[ + provider_message.ContentElement.from_text('here is a doc'), + provider_message.ContentElement.from_file_url('http://example.com/report.xlsx', 'report.xlsx'), + ], + ) + out = req._convert_messages([msg]) + types = [p.get('type') for p in out[0]['content']] + assert types == ['text'] + + +def test_convert_messages_keeps_image_and_converts_to_image_url(): + req = _make_requester() + msg = provider_message.Message( + role='user', + content=[ + provider_message.ContentElement.from_text('look'), + provider_message.ContentElement.from_image_base64('data:image/png;base64,AAAA'), + ], + ) + out = req._convert_messages([msg]) + parts = out[0]['content'] + types = [p.get('type') for p in parts] + # image is preserved and reshaped to the OpenAI image_url form + assert types == ['text', 'image_url'] + img_part = parts[1] + assert img_part['image_url'] == {'url': 'data:image/png;base64,AAAA'} + assert 'image_base64' not in img_part + + +def test_convert_messages_mixed_history_strips_only_files(): + req = _make_requester() + # Simulate replayed history: an old voice turn + a current text turn. + history_voice = provider_message.Message( + role='user', + content=[ + provider_message.ContentElement.from_text('old audio turn'), + provider_message.ContentElement.from_file_base64('data:audio/wav;base64,BBBB', 'voice.wav'), + ], + ) + current = provider_message.Message( + role='user', + content=[provider_message.ContentElement.from_text('now do the csv')], + ) + out = req._convert_messages([history_voice, current]) + assert [p.get('type') for p in out[0]['content']] == ['text'] + assert [p.get('type') for p in out[1]['content']] == ['text'] + + +def test_convert_messages_plain_string_content_untouched(): + req = _make_requester() + msg = provider_message.Message(role='user', content='just text') + out = req._convert_messages([msg]) + assert out[0]['content'] == 'just text' diff --git a/web/src/app/home/pipelines/components/debug-dialog/DebugDialog.tsx b/web/src/app/home/pipelines/components/debug-dialog/DebugDialog.tsx index 77dce2777..b45e87dd1 100644 --- a/web/src/app/home/pipelines/components/debug-dialog/DebugDialog.tsx +++ b/web/src/app/home/pipelines/components/debug-dialog/DebugDialog.tsx @@ -65,7 +65,12 @@ export default function DebugDialog({ const [isHovering, setIsHovering] = useState(false); const [isConnected, setIsConnected] = useState(false); const [selectedImages, setSelectedImages] = useState< - Array<{ file: File; preview: string; fileKey?: string }> + Array<{ + file: File; + preview: string; + fileKey?: string; + kind: 'image' | 'voice' | 'file'; + }> >([]); const [isUploading, setIsUploading] = useState(false); const [previewImageUrl, setPreviewImageUrl] = useState(''); @@ -293,23 +298,38 @@ export default function DebugDialog({ const files = e.target.files; if (!files || files.length === 0) return; - const newImages: Array<{ file: File; preview: string }> = []; + const newImages: Array<{ + file: File; + preview: string; + kind: 'image' | 'voice' | 'file'; + }> = []; for (let i = 0; i < files.length; i++) { const file = files[i]; if (file.type.startsWith('image/')) { - const preview = URL.createObjectURL(file); - newImages.push({ file, preview }); + newImages.push({ + file, + preview: URL.createObjectURL(file), + kind: 'image', + }); + } else if (file.type.startsWith('audio/')) { + newImages.push({ file, preview: '', kind: 'voice' }); + } else { + newImages.push({ file, preview: '', kind: 'file' }); } } setSelectedImages((prev) => [...prev, ...newImages]); + // reset the input so selecting the same file again re-triggers onChange + e.target.value = ''; }; const handleRemoveImage = (index: number) => { setSelectedImages((prev) => { const newImages = [...prev]; - URL.revokeObjectURL(newImages[index].preview); + if (newImages[index].preview) { + URL.revokeObjectURL(newImages[index].preview); + } newImages.splice(index, 1); return newImages; }); @@ -373,19 +393,33 @@ export default function DebugDialog({ }); } - // Upload images and add to message chain - for (const image of selectedImages) { + // Upload attachments and add to message chain + for (const attachment of selectedImages) { try { - const result = await httpClient.uploadWebSocketImage( - selectedPipelineId, - image.file, - ); - messageChain.push({ - type: 'Image', - path: result.file_key, - }); + if (attachment.kind === 'image') { + const result = await httpClient.uploadWebSocketImage( + selectedPipelineId, + attachment.file, + ); + messageChain.push({ + type: 'Image', + path: result.file_key, + }); + } else { + // Voice / File go through the generic document upload endpoint, + // which returns a storage key the backend resolves into the + // sandbox inbox just like images. + const result = await httpClient.uploadDocumentFile(attachment.file); + messageChain.push({ + type: attachment.kind === 'voice' ? 'Voice' : 'File', + path: result.file_id, + ...(attachment.kind === 'file' + ? { name: attachment.file.name } + : {}), + }); + } } catch (error) { - console.error('Image upload failed:', error); + console.error('Attachment upload failed:', error); toast.error(t('pipelines.debugDialog.imageUploadFailed')); } } @@ -394,7 +428,9 @@ export default function DebugDialog({ setInputValue(''); setHasAt(false); setQuotedMessage(null); - selectedImages.forEach((img) => URL.revokeObjectURL(img.preview)); + selectedImages.forEach((img) => { + if (img.preview) URL.revokeObjectURL(img.preview); + }); setSelectedImages([]); // Send message via WebSocket @@ -861,17 +897,30 @@ export default function DebugDialog({ )} - {/* Image preview area */} + {/* Attachment preview area */} {selectedImages.length > 0 && (
{selectedImages.map((image, index) => (
- {`preview-${index}`} + {image.kind === 'image' ? ( + {`preview-${index}`} + ) : ( +
+ {image.kind === 'voice' ? ( + + ) : ( + + )} + + {image.file.name} + +
+ )}