feat(box): bidirectional attachment transfer for sandbox (#2257)

* feat(box): bidirectional attachment transfer for sandbox

Materialize inbound attachments into the sandbox workspace so agents can
process user-sent files, and collect agent-produced files from the outbox
to attach them back to the reply.

- box(service): add materialize_inbound_attachments / collect_outbound
  attachments. Prefer direct host-filesystem read/write on the bind-mounted
  workspace (no size limit), falling back to chunked exec only for
  non-shared backends (e2b/remote). Clear per-query inbox/outbox dirs at
  turn start to avoid query_id-reuse collisions.
- provider(localagent): inject inbound attachment descriptors into the
  sandbox and append a system note telling the agent the inbox/outbox paths.
- pipeline(wrapper): collect outbox files on the final stream chunk and
  append them as attachment components to the response chain.
- web(debug-dialog): render File components with a download link when
  base64/url is present; add base64/path fields to the File entity.
- tests: cover inbound/outbound, large-file transfer without truncation,
  and stale-dir clearing (86 passing).

* feat(box): support voice/file attachment round-trip end-to-end

Extends the bidirectional attachment transfer to audio and arbitrary files
through the real webchat UI, and fixes the model-payload errors that
non-image attachments triggered.

- platform(websocket_adapter): resolve Voice/File component storage keys to
  base64 (previously only Image), so audio/documents reach the sandbox inbox.
- web(debug-dialog): accept audio/* and any file in the uploader (was
  image-only), classify by mimetype, upload Voice/File via the documents
  endpoint, and render non-image staged attachments as a chip.
- provider(litellmchat): drop non-image file parts (file_base64 / file_url)
  when building the OpenAI/LiteLLM payload. These come from Voice/File
  attachments — including ones replayed from conversation history — and the
  agent reads their bytes from the sandbox, not the model. Without this the
  provider rejects the request: 'invalid content type=file_base64'.
- provider(localagent): also strip those parts from the current user message
  alongside the sandbox-path note (model-facing clarity; the requester is the
  real safety net for history).
- tests: cover the requester strip/keep behavior (file dropped, image kept and
  reshaped to image_url, mixed history, plain-string content).

* test(box): cover inbound/outbound attachment helpers; fix ruff format

- ruff format localagent.py (CI ruff format --check was failing)
- add unit tests for ResponseWrapper outbound-attachment helpers (wrapper.py 78%->98%)
- add unit tests for LocalAgentRunner._inject_inbound_attachments
- add unit tests for WebSocketAdapter._process_image_components (0%->covered)

Lifts PR patch coverage from 68.97% to ~88% (>75% target).
This commit is contained in:
Junyan Chin
2026-06-18 21:40:31 +08:00
committed by GitHub
parent b3c6de2072
commit a1e6eccdeb
12 changed files with 1405 additions and 48 deletions
@@ -15,6 +15,7 @@ import {
At,
Quote,
Voice,
File as FileComponent,
Source,
} from '@/app/infra/entities/message';
import { toast } from 'sonner';
@@ -64,7 +65,12 @@ export default function DebugDialog({
const [isHovering, setIsHovering] = useState(false);
const [isConnected, setIsConnected] = useState(false);
const [selectedImages, setSelectedImages] = useState<
Array<{ file: File; preview: string; fileKey?: string }>
Array<{
file: File;
preview: string;
fileKey?: string;
kind: 'image' | 'voice' | 'file';
}>
>([]);
const [isUploading, setIsUploading] = useState(false);
const [previewImageUrl, setPreviewImageUrl] = useState<string>('');
@@ -292,23 +298,38 @@ export default function DebugDialog({
const files = e.target.files;
if (!files || files.length === 0) return;
const newImages: Array<{ file: File; preview: string }> = [];
const newImages: Array<{
file: File;
preview: string;
kind: 'image' | 'voice' | 'file';
}> = [];
for (let i = 0; i < files.length; i++) {
const file = files[i];
if (file.type.startsWith('image/')) {
const preview = URL.createObjectURL(file);
newImages.push({ file, preview });
newImages.push({
file,
preview: URL.createObjectURL(file),
kind: 'image',
});
} else if (file.type.startsWith('audio/')) {
newImages.push({ file, preview: '', kind: 'voice' });
} else {
newImages.push({ file, preview: '', kind: 'file' });
}
}
setSelectedImages((prev) => [...prev, ...newImages]);
// reset the input so selecting the same file again re-triggers onChange
e.target.value = '';
};
const handleRemoveImage = (index: number) => {
setSelectedImages((prev) => {
const newImages = [...prev];
URL.revokeObjectURL(newImages[index].preview);
if (newImages[index].preview) {
URL.revokeObjectURL(newImages[index].preview);
}
newImages.splice(index, 1);
return newImages;
});
@@ -372,19 +393,33 @@ export default function DebugDialog({
});
}
// Upload images and add to message chain
for (const image of selectedImages) {
// Upload attachments and add to message chain
for (const attachment of selectedImages) {
try {
const result = await httpClient.uploadWebSocketImage(
selectedPipelineId,
image.file,
);
messageChain.push({
type: 'Image',
path: result.file_key,
});
if (attachment.kind === 'image') {
const result = await httpClient.uploadWebSocketImage(
selectedPipelineId,
attachment.file,
);
messageChain.push({
type: 'Image',
path: result.file_key,
});
} else {
// Voice / File go through the generic document upload endpoint,
// which returns a storage key the backend resolves into the
// sandbox inbox just like images.
const result = await httpClient.uploadDocumentFile(attachment.file);
messageChain.push({
type: attachment.kind === 'voice' ? 'Voice' : 'File',
path: result.file_id,
...(attachment.kind === 'file'
? { name: attachment.file.name }
: {}),
});
}
} catch (error) {
console.error('Image upload failed:', error);
console.error('Attachment upload failed:', error);
toast.error(t('pipelines.debugDialog.imageUploadFailed'));
}
}
@@ -393,7 +428,9 @@ export default function DebugDialog({
setInputValue('');
setHasAt(false);
setQuotedMessage(null);
selectedImages.forEach((img) => URL.revokeObjectURL(img.preview));
selectedImages.forEach((img) => {
if (img.preview) URL.revokeObjectURL(img.preview);
});
setSelectedImages([]);
// Send message via WebSocket
@@ -460,13 +497,29 @@ export default function DebugDialog({
}
case 'File': {
const file = component as MessageChainComponent & { name?: string };
const file = component as FileComponent;
const downloadHref = file.base64
? file.base64.startsWith('data:')
? file.base64
: `data:application/octet-stream;base64,${file.base64}`
: file.url || '';
const fileName = file.name || 'Unknown';
return (
<div key={index} className="my-2 flex items-center gap-2 text-sm">
<Paperclip className="size-4" />
<span>
[{t('pipelines.debugDialog.file')}] {file.name || 'Unknown'}
</span>
{downloadHref ? (
<a
href={downloadHref}
download={fileName}
className="text-primary underline hover:opacity-80"
>
[{t('pipelines.debugDialog.file')}] {fileName}
</a>
) : (
<span>
[{t('pipelines.debugDialog.file')}] {fileName}
</span>
)}
</div>
);
}
@@ -844,17 +897,30 @@ export default function DebugDialog({
</div>
)}
{/* Image preview area */}
{/* Attachment preview area */}
{selectedImages.length > 0 && (
<div className="px-4 pb-2">
<div className="flex gap-2 flex-wrap">
{selectedImages.map((image, index) => (
<div key={index} className="relative group">
<img
src={image.preview}
alt={`preview-${index}`}
className="w-20 h-20 object-cover rounded-lg border"
/>
{image.kind === 'image' ? (
<img
src={image.preview}
alt={`preview-${index}`}
className="w-20 h-20 object-cover rounded-lg border"
/>
) : (
<div className="w-36 h-20 px-2 rounded-lg border bg-muted/40 flex items-center gap-2 overflow-hidden">
{image.kind === 'voice' ? (
<Music className="size-5 shrink-0 text-muted-foreground" />
) : (
<Paperclip className="size-5 shrink-0 text-muted-foreground" />
)}
<span className="text-xs text-muted-foreground truncate">
{image.file.name}
</span>
</div>
)}
<button
type="button"
onClick={() => handleRemoveImage(index)}
@@ -883,7 +949,7 @@ export default function DebugDialog({
<input
ref={fileInputRef}
type="file"
accept="image/*"
accept="image/*,audio/*,*/*"
multiple
onChange={handleImageSelect}
className="hidden"
@@ -64,6 +64,8 @@ export interface File extends MessageComponent {
name?: string;
size?: number;
url?: string;
path?: string;
base64?: string;
}
// Unknown component