mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-18 11:44:18 +00:00
feat(box): support voice/file attachment round-trip end-to-end
Extends the bidirectional attachment transfer to audio and arbitrary files through the real webchat UI, and fixes the model-payload errors that non-image attachments triggered. - platform(websocket_adapter): resolve Voice/File component storage keys to base64 (previously only Image), so audio/documents reach the sandbox inbox. - web(debug-dialog): accept audio/* and any file in the uploader (was image-only), classify by mimetype, upload Voice/File via the documents endpoint, and render non-image staged attachments as a chip. - provider(litellmchat): drop non-image file parts (file_base64 / file_url) when building the OpenAI/LiteLLM payload. These come from Voice/File attachments — including ones replayed from conversation history — and the agent reads their bytes from the sandbox, not the model. Without this the provider rejects the request: 'invalid content type=file_base64'. - provider(localagent): also strip those parts from the current user message alongside the sandbox-path note (model-facing clarity; the requester is the real safety net for history). - tests: cover the requester strip/keep behavior (file dropped, image kept and reshaped to image_url, mixed history, plain-string content).
This commit is contained in:
@@ -312,12 +312,18 @@ class WebSocketAdapter(abstract_platform_adapter.AbstractMessagePlatformAdapter)
|
||||
|
||||
async def _process_image_components(self, message_chain_obj: list):
|
||||
"""
|
||||
处理消息链中的图片和文件组件,将path转换为base64
|
||||
处理消息链中的图片、语音和文件组件,将 path 转换为 base64
|
||||
|
||||
Image / Voice / File components uploaded from the web client carry a
|
||||
storage key in ``path``. Resolve it to a base64 data URI so downstream
|
||||
stages (multimodal LLM input and the Box sandbox inbox) have a usable
|
||||
payload, then drop the now-consumed storage object.
|
||||
|
||||
Args:
|
||||
message_chain_obj: 消息链对象列表
|
||||
"""
|
||||
import base64
|
||||
import mimetypes
|
||||
|
||||
storage_mgr = self.ap.storage_mgr
|
||||
|
||||
@@ -325,31 +331,33 @@ class WebSocketAdapter(abstract_platform_adapter.AbstractMessagePlatformAdapter)
|
||||
comp_type = component.get('type', '')
|
||||
comp_path = component.get('path', '')
|
||||
|
||||
if not comp_path:
|
||||
if not comp_path or comp_type not in ('Image', 'Voice', 'File'):
|
||||
continue
|
||||
|
||||
if comp_type == 'Image':
|
||||
try:
|
||||
file_content = await storage_mgr.storage_provider.load(comp_path)
|
||||
base64_str = base64.b64encode(file_content).decode('utf-8')
|
||||
try:
|
||||
file_content = await storage_mgr.storage_provider.load(comp_path)
|
||||
base64_str = base64.b64encode(file_content).decode('utf-8')
|
||||
|
||||
file_key = comp_path
|
||||
if file_key.lower().endswith(('.jpg', '.jpeg')):
|
||||
lowered = comp_path.lower()
|
||||
if comp_type == 'Image':
|
||||
if lowered.endswith(('.jpg', '.jpeg')):
|
||||
mime_type = 'image/jpeg'
|
||||
elif file_key.lower().endswith('.png'):
|
||||
mime_type = 'image/png'
|
||||
elif file_key.lower().endswith('.gif'):
|
||||
elif lowered.endswith('.gif'):
|
||||
mime_type = 'image/gif'
|
||||
elif file_key.lower().endswith('.webp'):
|
||||
elif lowered.endswith('.webp'):
|
||||
mime_type = 'image/webp'
|
||||
else:
|
||||
mime_type = 'image/png'
|
||||
elif comp_type == 'Voice':
|
||||
mime_type = mimetypes.guess_type(comp_path)[0] or 'audio/wav'
|
||||
else: # File
|
||||
mime_type = mimetypes.guess_type(comp_path)[0] or 'application/octet-stream'
|
||||
|
||||
component['base64'] = f'data:{mime_type};base64,{base64_str}'
|
||||
await storage_mgr.storage_provider.delete(comp_path)
|
||||
component['path'] = ''
|
||||
except Exception as e:
|
||||
await self.logger.error(f'Failed to load image file {comp_path}: {e}')
|
||||
component['base64'] = f'data:{mime_type};base64,{base64_str}'
|
||||
await storage_mgr.storage_provider.delete(comp_path)
|
||||
component['path'] = ''
|
||||
except Exception as e:
|
||||
await self.logger.error(f'Failed to load {comp_type} file {comp_path}: {e}')
|
||||
|
||||
async def handle_websocket_message(
|
||||
self,
|
||||
|
||||
@@ -216,11 +216,22 @@ class LiteLLMRequester(requester.ProviderAPIRequester):
|
||||
content = msg_dict.get('content')
|
||||
|
||||
if isinstance(content, list):
|
||||
converted_parts = []
|
||||
for part in content:
|
||||
if isinstance(part, dict) and part.get('type') == 'image_base64':
|
||||
part['image_url'] = {'url': part['image_base64']}
|
||||
part['type'] = 'image_url'
|
||||
del part['image_base64']
|
||||
# OpenAI-compatible chat models reject non-image file parts
|
||||
# (audio/document base64 or url). These originate from Voice /
|
||||
# File attachments — including ones replayed from conversation
|
||||
# history — and the agent already accesses their bytes via the
|
||||
# sandbox. Drop them from the model payload to avoid
|
||||
# "Invalid user message ... invalid content type=file_base64".
|
||||
if isinstance(part, dict) and part.get('type') in ('file_base64', 'file_url'):
|
||||
continue
|
||||
converted_parts.append(part)
|
||||
msg_dict['content'] = converted_parts
|
||||
|
||||
req_messages.append(msg_dict)
|
||||
|
||||
|
||||
@@ -143,6 +143,21 @@ class LocalAgentRunner(runner.RequestRunner):
|
||||
)
|
||||
note = '\n'.join(lines)
|
||||
|
||||
# Voice/File attachments are now available to the agent via the sandbox
|
||||
# (exec/read/write tools). Their raw bytes must NOT be forwarded to the
|
||||
# chat model as multimodal content: providers reject non-image file
|
||||
# parts ("Invalid user message ... ensure all user messages are valid
|
||||
# OpenAI chat completion messages"). Strip those content elements and
|
||||
# rely on the sandbox-path note instead. Images are kept so vision
|
||||
# models can still see them.
|
||||
_model_unsafe_types = {'file_base64', 'file_url'}
|
||||
if isinstance(user_message.content, list):
|
||||
user_message.content = [
|
||||
ce
|
||||
for ce in user_message.content
|
||||
if getattr(ce, 'type', None) not in _model_unsafe_types
|
||||
]
|
||||
|
||||
if isinstance(user_message.content, str):
|
||||
user_message.content = [
|
||||
provider_message.ContentElement.from_text(user_message.content),
|
||||
|
||||
@@ -0,0 +1,93 @@
|
||||
"""Unit tests for LiteLLMRequester._convert_messages.
|
||||
|
||||
Focus: the content-part normalization that (a) converts image_base64 parts to
|
||||
the OpenAI image_url shape and (b) drops non-image file parts (file_base64 /
|
||||
file_url) which OpenAI-compatible chat models reject. The latter is essential
|
||||
for Voice/File attachments — including ones replayed from conversation history —
|
||||
since the agent consumes their bytes via the sandbox, not the model payload.
|
||||
"""
|
||||
|
||||
import langbot_plugin.api.entities.builtin.provider.message as provider_message
|
||||
|
||||
from langbot.pkg.provider.modelmgr.requesters.litellmchat import LiteLLMRequester
|
||||
|
||||
|
||||
def _make_requester() -> LiteLLMRequester:
|
||||
# _convert_messages does not touch instance config, so bypass __init__.
|
||||
return LiteLLMRequester.__new__(LiteLLMRequester)
|
||||
|
||||
|
||||
def test_convert_messages_drops_file_base64_part():
|
||||
req = _make_requester()
|
||||
msg = provider_message.Message(
|
||||
role='user',
|
||||
content=[
|
||||
provider_message.ContentElement.from_text('analyze this audio'),
|
||||
provider_message.ContentElement.from_file_base64('data:audio/wav;base64,AAAA', 'voice.wav'),
|
||||
],
|
||||
)
|
||||
out = req._convert_messages([msg])
|
||||
parts = out[0]['content']
|
||||
types = [p.get('type') for p in parts]
|
||||
assert 'file_base64' not in types
|
||||
assert types == ['text']
|
||||
assert parts[0]['text'] == 'analyze this audio'
|
||||
|
||||
|
||||
def test_convert_messages_drops_file_url_part():
|
||||
req = _make_requester()
|
||||
msg = provider_message.Message(
|
||||
role='user',
|
||||
content=[
|
||||
provider_message.ContentElement.from_text('here is a doc'),
|
||||
provider_message.ContentElement.from_file_url('http://example.com/report.xlsx', 'report.xlsx'),
|
||||
],
|
||||
)
|
||||
out = req._convert_messages([msg])
|
||||
types = [p.get('type') for p in out[0]['content']]
|
||||
assert types == ['text']
|
||||
|
||||
|
||||
def test_convert_messages_keeps_image_and_converts_to_image_url():
|
||||
req = _make_requester()
|
||||
msg = provider_message.Message(
|
||||
role='user',
|
||||
content=[
|
||||
provider_message.ContentElement.from_text('look'),
|
||||
provider_message.ContentElement.from_image_base64('data:image/png;base64,AAAA'),
|
||||
],
|
||||
)
|
||||
out = req._convert_messages([msg])
|
||||
parts = out[0]['content']
|
||||
types = [p.get('type') for p in parts]
|
||||
# image is preserved and reshaped to the OpenAI image_url form
|
||||
assert types == ['text', 'image_url']
|
||||
img_part = parts[1]
|
||||
assert img_part['image_url'] == {'url': 'data:image/png;base64,AAAA'}
|
||||
assert 'image_base64' not in img_part
|
||||
|
||||
|
||||
def test_convert_messages_mixed_history_strips_only_files():
|
||||
req = _make_requester()
|
||||
# Simulate replayed history: an old voice turn + a current text turn.
|
||||
history_voice = provider_message.Message(
|
||||
role='user',
|
||||
content=[
|
||||
provider_message.ContentElement.from_text('old audio turn'),
|
||||
provider_message.ContentElement.from_file_base64('data:audio/wav;base64,BBBB', 'voice.wav'),
|
||||
],
|
||||
)
|
||||
current = provider_message.Message(
|
||||
role='user',
|
||||
content=[provider_message.ContentElement.from_text('now do the csv')],
|
||||
)
|
||||
out = req._convert_messages([history_voice, current])
|
||||
assert [p.get('type') for p in out[0]['content']] == ['text']
|
||||
assert [p.get('type') for p in out[1]['content']] == ['text']
|
||||
|
||||
|
||||
def test_convert_messages_plain_string_content_untouched():
|
||||
req = _make_requester()
|
||||
msg = provider_message.Message(role='user', content='just text')
|
||||
out = req._convert_messages([msg])
|
||||
assert out[0]['content'] == 'just text'
|
||||
@@ -65,7 +65,12 @@ export default function DebugDialog({
|
||||
const [isHovering, setIsHovering] = useState(false);
|
||||
const [isConnected, setIsConnected] = useState(false);
|
||||
const [selectedImages, setSelectedImages] = useState<
|
||||
Array<{ file: File; preview: string; fileKey?: string }>
|
||||
Array<{
|
||||
file: File;
|
||||
preview: string;
|
||||
fileKey?: string;
|
||||
kind: 'image' | 'voice' | 'file';
|
||||
}>
|
||||
>([]);
|
||||
const [isUploading, setIsUploading] = useState(false);
|
||||
const [previewImageUrl, setPreviewImageUrl] = useState<string>('');
|
||||
@@ -293,23 +298,38 @@ export default function DebugDialog({
|
||||
const files = e.target.files;
|
||||
if (!files || files.length === 0) return;
|
||||
|
||||
const newImages: Array<{ file: File; preview: string }> = [];
|
||||
const newImages: Array<{
|
||||
file: File;
|
||||
preview: string;
|
||||
kind: 'image' | 'voice' | 'file';
|
||||
}> = [];
|
||||
|
||||
for (let i = 0; i < files.length; i++) {
|
||||
const file = files[i];
|
||||
if (file.type.startsWith('image/')) {
|
||||
const preview = URL.createObjectURL(file);
|
||||
newImages.push({ file, preview });
|
||||
newImages.push({
|
||||
file,
|
||||
preview: URL.createObjectURL(file),
|
||||
kind: 'image',
|
||||
});
|
||||
} else if (file.type.startsWith('audio/')) {
|
||||
newImages.push({ file, preview: '', kind: 'voice' });
|
||||
} else {
|
||||
newImages.push({ file, preview: '', kind: 'file' });
|
||||
}
|
||||
}
|
||||
|
||||
setSelectedImages((prev) => [...prev, ...newImages]);
|
||||
// reset the input so selecting the same file again re-triggers onChange
|
||||
e.target.value = '';
|
||||
};
|
||||
|
||||
const handleRemoveImage = (index: number) => {
|
||||
setSelectedImages((prev) => {
|
||||
const newImages = [...prev];
|
||||
URL.revokeObjectURL(newImages[index].preview);
|
||||
if (newImages[index].preview) {
|
||||
URL.revokeObjectURL(newImages[index].preview);
|
||||
}
|
||||
newImages.splice(index, 1);
|
||||
return newImages;
|
||||
});
|
||||
@@ -373,19 +393,33 @@ export default function DebugDialog({
|
||||
});
|
||||
}
|
||||
|
||||
// Upload images and add to message chain
|
||||
for (const image of selectedImages) {
|
||||
// Upload attachments and add to message chain
|
||||
for (const attachment of selectedImages) {
|
||||
try {
|
||||
const result = await httpClient.uploadWebSocketImage(
|
||||
selectedPipelineId,
|
||||
image.file,
|
||||
);
|
||||
messageChain.push({
|
||||
type: 'Image',
|
||||
path: result.file_key,
|
||||
});
|
||||
if (attachment.kind === 'image') {
|
||||
const result = await httpClient.uploadWebSocketImage(
|
||||
selectedPipelineId,
|
||||
attachment.file,
|
||||
);
|
||||
messageChain.push({
|
||||
type: 'Image',
|
||||
path: result.file_key,
|
||||
});
|
||||
} else {
|
||||
// Voice / File go through the generic document upload endpoint,
|
||||
// which returns a storage key the backend resolves into the
|
||||
// sandbox inbox just like images.
|
||||
const result = await httpClient.uploadDocumentFile(attachment.file);
|
||||
messageChain.push({
|
||||
type: attachment.kind === 'voice' ? 'Voice' : 'File',
|
||||
path: result.file_id,
|
||||
...(attachment.kind === 'file'
|
||||
? { name: attachment.file.name }
|
||||
: {}),
|
||||
});
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Image upload failed:', error);
|
||||
console.error('Attachment upload failed:', error);
|
||||
toast.error(t('pipelines.debugDialog.imageUploadFailed'));
|
||||
}
|
||||
}
|
||||
@@ -394,7 +428,9 @@ export default function DebugDialog({
|
||||
setInputValue('');
|
||||
setHasAt(false);
|
||||
setQuotedMessage(null);
|
||||
selectedImages.forEach((img) => URL.revokeObjectURL(img.preview));
|
||||
selectedImages.forEach((img) => {
|
||||
if (img.preview) URL.revokeObjectURL(img.preview);
|
||||
});
|
||||
setSelectedImages([]);
|
||||
|
||||
// Send message via WebSocket
|
||||
@@ -861,17 +897,30 @@ export default function DebugDialog({
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Image preview area */}
|
||||
{/* Attachment preview area */}
|
||||
{selectedImages.length > 0 && (
|
||||
<div className="px-4 pb-2">
|
||||
<div className="flex gap-2 flex-wrap">
|
||||
{selectedImages.map((image, index) => (
|
||||
<div key={index} className="relative group">
|
||||
<img
|
||||
src={image.preview}
|
||||
alt={`preview-${index}`}
|
||||
className="w-20 h-20 object-cover rounded-lg border"
|
||||
/>
|
||||
{image.kind === 'image' ? (
|
||||
<img
|
||||
src={image.preview}
|
||||
alt={`preview-${index}`}
|
||||
className="w-20 h-20 object-cover rounded-lg border"
|
||||
/>
|
||||
) : (
|
||||
<div className="w-36 h-20 px-2 rounded-lg border bg-muted/40 flex items-center gap-2 overflow-hidden">
|
||||
{image.kind === 'voice' ? (
|
||||
<Music className="size-5 shrink-0 text-muted-foreground" />
|
||||
) : (
|
||||
<Paperclip className="size-5 shrink-0 text-muted-foreground" />
|
||||
)}
|
||||
<span className="text-xs text-muted-foreground truncate">
|
||||
{image.file.name}
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
<button
|
||||
type="button"
|
||||
onClick={() => handleRemoveImage(index)}
|
||||
@@ -900,7 +949,7 @@ export default function DebugDialog({
|
||||
<input
|
||||
ref={fileInputRef}
|
||||
type="file"
|
||||
accept="image/*"
|
||||
accept="image/*,audio/*,*/*"
|
||||
multiple
|
||||
onChange={handleImageSelect}
|
||||
className="hidden"
|
||||
|
||||
Reference in New Issue
Block a user