feat: add ZIP file upload support for knowledge base (#1626)

* feat: add ZIP file upload support for knowledge base

- Add _parse_zip method to FileParser class using zipfile library
- Support extraction and processing of TXT, PDF, DOCX, MD, HTML files from ZIP
- Update FileUploadZone to accept .zip files
- Add ZIP format to supported formats in internationalization files
- Implement error handling for invalid ZIP files and unsupported content
- Follow existing async parsing patterns and error handling conventions

Co-Authored-By: Rock <rockchinq@gmail.com>

* refactor: modify ZIP processing to store each document as separate file

- Remove _parse_zip method from FileParser as ZIP handling now occurs at knowledge base level
- Add _store_zip_file method to RuntimeKnowledgeBase to extract and store each document separately
- Each document in ZIP is now stored as individual file entry in knowledge base
- Process ZIP files in memory using io.BytesIO to avoid filesystem writes
- Generate unique file names for extracted documents to prevent conflicts

Co-Authored-By: Rock <rockchinq@gmail.com>

* perf: delete temp files

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Rock <rockchinq@gmail.com>
This commit is contained in:
devin-ai-integration[bot]
2025-08-23 21:18:13 +08:00
committed by GitHub
parent 87ecb4e519
commit 83ff64698b
4 changed files with 69 additions and 5 deletions

View File

@@ -1,6 +1,8 @@
from __future__ import annotations
import traceback
import uuid
import zipfile
import io
from .services import parser, chunker
from pkg.core import app
from pkg.rag.knowledge.services.embedder import Embedder
@@ -89,16 +91,23 @@ class RuntimeKnowledgeBase:
)
raise
finally:
# delete file from storage
await self.ap.storage_mgr.storage_provider.delete(file.file_name)
async def store_file(self, file_id: str) -> str:
# pre checking
if not await self.ap.storage_mgr.storage_provider.exists(file_id):
raise Exception(f'File {file_id} not found')
file_name = file_id
extension = file_name.split('.')[-1].lower()
if extension == 'zip':
return await self._store_zip_file(file_id)
file_uuid = str(uuid.uuid4())
kb_id = self.knowledge_base_entity.uuid
file_name = file_id
extension = file_name.split('.')[-1]
file_obj_data = {
'uuid': file_uuid,
@@ -123,6 +132,61 @@ class RuntimeKnowledgeBase:
)
return wrapper.id
async def _store_zip_file(self, zip_file_id: str) -> str:
"""Handle ZIP file by extracting each document and storing them separately."""
self.ap.logger.info(f'Processing ZIP file: {zip_file_id}')
zip_bytes = await self.ap.storage_mgr.storage_provider.load(zip_file_id)
supported_extensions = {'txt', 'pdf', 'docx', 'md', 'html'}
stored_file_tasks = []
# use utf-8 encoding
with zipfile.ZipFile(io.BytesIO(zip_bytes), 'r', metadata_encoding='utf-8') as zip_ref:
for file_info in zip_ref.filelist:
# skip directories and hidden files
if file_info.is_dir() or file_info.filename.startswith('.'):
continue
file_extension = file_info.filename.split('.')[-1].lower()
if file_extension not in supported_extensions:
self.ap.logger.debug(f'Skipping unsupported file in ZIP: {file_info.filename}')
continue
try:
file_content = zip_ref.read(file_info.filename)
base_name = file_info.filename.replace('/', '_').replace('\\', '_')
extension = base_name.split('.')[-1]
file_name = base_name.split('.')[0]
if file_name.startswith('__MACOSX'):
continue
extracted_file_id = file_name + '_' + str(uuid.uuid4())[:8] + '.' + extension
# save file to storage
await self.ap.storage_mgr.storage_provider.save(extracted_file_id, file_content)
task_id = await self.store_file(extracted_file_id)
stored_file_tasks.append(task_id)
self.ap.logger.info(
f'Extracted and stored file from ZIP: {file_info.filename} -> {extracted_file_id}'
)
except Exception as e:
self.ap.logger.warning(f'Failed to extract file {file_info.filename} from ZIP: {e}')
continue
if not stored_file_tasks:
raise Exception('No supported files found in ZIP archive')
self.ap.logger.info(f'Successfully processed ZIP file {zip_file_id}, extracted {len(stored_file_tasks)} files')
await self.ap.storage_mgr.storage_provider.delete(zip_file_id)
return stored_file_tasks[0] if stored_file_tasks else ''
async def retrieve(self, query: str, top_k: int) -> list[retriever_entities.RetrieveResultEntry]:
embedding_model = await self.ap.model_mgr.get_embedding_model_by_uuid(
self.knowledge_base_entity.embedding_model_uuid

View File

@@ -104,7 +104,7 @@ export default function FileUploadZone({
id="file-upload"
className="hidden"
onChange={handleFileSelect}
accept=".pdf,.doc,.docx,.txt,.md,.html"
accept=".pdf,.doc,.docx,.txt,.md,.html,.zip"
disabled={isUploading}
/>

View File

@@ -292,7 +292,7 @@ const enUS = {
dragAndDrop: 'Drag and drop files here or click to upload',
uploading: 'Uploading...',
supportedFormats:
'Supports PDF, Word, TXT, Markdown and other document formats',
'Supports PDF, Word, TXT, Markdown, HTML, ZIP and other document formats',
uploadSuccess: 'File uploaded successfully!',
uploadError: 'File upload failed, please try again',
uploadingFile: 'Uploading file...',

View File

@@ -282,7 +282,7 @@ const zhHans = {
noResults: '暂无文档',
dragAndDrop: '拖拽文件到此处或点击上传',
uploading: '上传中...',
supportedFormats: '支持 PDF、Word、TXT、Markdown 等文档格式',
supportedFormats: '支持 PDF、Word、TXT、Markdown、HTML、ZIP 等文档格式',
uploadSuccess: '文件上传成功!',
uploadError: '文件上传失败,请重试',
uploadingFile: '上传文件中...',