mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-02 03:55:55 +00:00
feat: add ZIP file upload support for knowledge base (#1626)
* feat: add ZIP file upload support for knowledge base
  - Add _parse_zip method to FileParser class using zipfile library
  - Support extraction and processing of TXT, PDF, DOCX, MD, HTML files from ZIP
  - Update FileUploadZone to accept .zip files
  - Add ZIP format to supported formats in internationalization files
  - Implement error handling for invalid ZIP files and unsupported content
  - Follow existing async parsing patterns and error handling conventions
  Co-Authored-By: Rock <rockchinq@gmail.com>
* refactor: modify ZIP processing to store each document as separate file
  - Remove _parse_zip method from FileParser as ZIP handling now occurs at knowledge base level
  - Add _store_zip_file method to RuntimeKnowledgeBase to extract and store each document separately
  - Each document in ZIP is now stored as individual file entry in knowledge base
  - Process ZIP files in memory using io.BytesIO to avoid filesystem writes
  - Generate unique file names for extracted documents to prevent conflicts
  Co-Authored-By: Rock <rockchinq@gmail.com>
* perf: delete temp files
---------
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Rock <rockchinq@gmail.com>
This commit is contained in:
committed by
GitHub
parent
87ecb4e519
commit
83ff64698b
@@ -1,6 +1,8 @@
|
||||
from __future__ import annotations
|
||||
import traceback
|
||||
import uuid
|
||||
import zipfile
|
||||
import io
|
||||
from .services import parser, chunker
|
||||
from pkg.core import app
|
||||
from pkg.rag.knowledge.services.embedder import Embedder
|
||||
@@ -89,16 +91,23 @@ class RuntimeKnowledgeBase:
|
||||
)
|
||||
|
||||
raise
|
||||
finally:
|
||||
# delete file from storage
|
||||
await self.ap.storage_mgr.storage_provider.delete(file.file_name)
|
||||
|
||||
async def store_file(self, file_id: str) -> str:
|
||||
# pre checking
|
||||
if not await self.ap.storage_mgr.storage_provider.exists(file_id):
|
||||
raise Exception(f'File {file_id} not found')
|
||||
|
||||
file_name = file_id
|
||||
extension = file_name.split('.')[-1].lower()
|
||||
|
||||
if extension == 'zip':
|
||||
return await self._store_zip_file(file_id)
|
||||
|
||||
file_uuid = str(uuid.uuid4())
|
||||
kb_id = self.knowledge_base_entity.uuid
|
||||
file_name = file_id
|
||||
extension = file_name.split('.')[-1]
|
||||
|
||||
file_obj_data = {
|
||||
'uuid': file_uuid,
|
||||
@@ -123,6 +132,61 @@ class RuntimeKnowledgeBase:
|
||||
)
|
||||
return wrapper.id
|
||||
|
||||
async def _store_zip_file(self, zip_file_id: str) -> str:
|
||||
"""Handle ZIP file by extracting each document and storing them separately."""
|
||||
self.ap.logger.info(f'Processing ZIP file: {zip_file_id}')
|
||||
|
||||
zip_bytes = await self.ap.storage_mgr.storage_provider.load(zip_file_id)
|
||||
|
||||
supported_extensions = {'txt', 'pdf', 'docx', 'md', 'html'}
|
||||
stored_file_tasks = []
|
||||
|
||||
# use utf-8 encoding
|
||||
with zipfile.ZipFile(io.BytesIO(zip_bytes), 'r', metadata_encoding='utf-8') as zip_ref:
|
||||
for file_info in zip_ref.filelist:
|
||||
# skip directories and hidden files
|
||||
if file_info.is_dir() or file_info.filename.startswith('.'):
|
||||
continue
|
||||
|
||||
file_extension = file_info.filename.split('.')[-1].lower()
|
||||
if file_extension not in supported_extensions:
|
||||
self.ap.logger.debug(f'Skipping unsupported file in ZIP: {file_info.filename}')
|
||||
continue
|
||||
|
||||
try:
|
||||
file_content = zip_ref.read(file_info.filename)
|
||||
|
||||
base_name = file_info.filename.replace('/', '_').replace('\\', '_')
|
||||
extension = base_name.split('.')[-1]
|
||||
file_name = base_name.split('.')[0]
|
||||
|
||||
if file_name.startswith('__MACOSX'):
|
||||
continue
|
||||
|
||||
extracted_file_id = file_name + '_' + str(uuid.uuid4())[:8] + '.' + extension
|
||||
# save file to storage
|
||||
|
||||
await self.ap.storage_mgr.storage_provider.save(extracted_file_id, file_content)
|
||||
|
||||
task_id = await self.store_file(extracted_file_id)
|
||||
stored_file_tasks.append(task_id)
|
||||
|
||||
self.ap.logger.info(
|
||||
f'Extracted and stored file from ZIP: {file_info.filename} -> {extracted_file_id}'
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.ap.logger.warning(f'Failed to extract file {file_info.filename} from ZIP: {e}')
|
||||
continue
|
||||
|
||||
if not stored_file_tasks:
|
||||
raise Exception('No supported files found in ZIP archive')
|
||||
|
||||
self.ap.logger.info(f'Successfully processed ZIP file {zip_file_id}, extracted {len(stored_file_tasks)} files')
|
||||
await self.ap.storage_mgr.storage_provider.delete(zip_file_id)
|
||||
|
||||
return stored_file_tasks[0] if stored_file_tasks else ''
|
||||
|
||||
async def retrieve(self, query: str, top_k: int) -> list[retriever_entities.RetrieveResultEntry]:
|
||||
embedding_model = await self.ap.model_mgr.get_embedding_model_by_uuid(
|
||||
self.knowledge_base_entity.embedding_model_uuid
|
||||
|
||||
@@ -104,7 +104,7 @@ export default function FileUploadZone({
|
||||
id="file-upload"
|
||||
className="hidden"
|
||||
onChange={handleFileSelect}
|
||||
accept=".pdf,.doc,.docx,.txt,.md,.html"
|
||||
accept=".pdf,.doc,.docx,.txt,.md,.html,.zip"
|
||||
disabled={isUploading}
|
||||
/>
|
||||
|
||||
|
||||
@@ -292,7 +292,7 @@ const enUS = {
|
||||
dragAndDrop: 'Drag and drop files here or click to upload',
|
||||
uploading: 'Uploading...',
|
||||
supportedFormats:
|
||||
'Supports PDF, Word, TXT, Markdown and other document formats',
|
||||
'Supports PDF, Word, TXT, Markdown, HTML, ZIP and other document formats',
|
||||
uploadSuccess: 'File uploaded successfully!',
|
||||
uploadError: 'File upload failed, please try again',
|
||||
uploadingFile: 'Uploading file...',
|
||||
|
||||
@@ -282,7 +282,7 @@ const zhHans = {
|
||||
noResults: '暂无文档',
|
||||
dragAndDrop: '拖拽文件到此处或点击上传',
|
||||
uploading: '上传中...',
|
||||
supportedFormats: '支持 PDF、Word、TXT、Markdown 等文档格式',
|
||||
supportedFormats: '支持 PDF、Word、TXT、Markdown、HTML、ZIP 等文档格式',
|
||||
uploadSuccess: '文件上传成功!',
|
||||
uploadError: '文件上传失败,请重试',
|
||||
uploadingFile: '上传文件中...',
|
||||
|
||||
Reference in New Issue
Block a user