mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-05 05:16:03 +00:00
Fix/storage retention cleanup (#2159)
* fix: add storage retention cleanup * fix: prune completed tasks on completion * fix: complete storage analysis i18n
This commit is contained in:
@@ -136,6 +136,10 @@ class SystemRouterGroup(group.RouterGroup):
|
||||
|
||||
return self.success(data=task.to_dict())
|
||||
|
||||
@self.route('/storage-analysis', methods=['GET'], auth_type=group.AuthType.USER_TOKEN)
|
||||
async def _() -> str:
|
||||
return self.success(data=await self.ap.maintenance_service.get_storage_analysis())
|
||||
|
||||
@self.route('/debug/exec', methods=['POST'], auth_type=group.AuthType.USER_TOKEN)
|
||||
async def _() -> str:
|
||||
if not constants.debug_mode:
|
||||
|
||||
309
src/langbot/pkg/api/http/service/maintenance.py
Normal file
309
src/langbot/pkg/api/http/service/maintenance.py
Normal file
@@ -0,0 +1,309 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import sqlalchemy
|
||||
|
||||
from ....core import app
|
||||
from ....entity.persistence import bstorage as persistence_bstorage
|
||||
from ....entity.persistence import monitoring as persistence_monitoring
|
||||
|
||||
|
||||
LOG_FILE_PATTERN = re.compile(r'^langbot-(\d{4}-\d{2}-\d{2})\.log(?:\.\d+)?$')
|
||||
DEFAULT_UPLOAD_FILE_RETENTION_DAYS = 7
|
||||
DEFAULT_LOG_RETENTION_DAYS = 3
|
||||
|
||||
|
||||
class MaintenanceService:
|
||||
"""Storage maintenance and diagnostics."""
|
||||
|
||||
ap: app.Application
|
||||
|
||||
def __init__(self, ap: app.Application) -> None:
|
||||
self.ap = ap
|
||||
|
||||
async def cleanup_expired_files(self) -> dict[str, int]:
|
||||
cleanup_cfg = self.ap.instance_config.data.get('storage', {}).get('cleanup', {})
|
||||
upload_retention_days = self._positive_int(
|
||||
cleanup_cfg.get('uploaded_file_retention_days'),
|
||||
DEFAULT_UPLOAD_FILE_RETENTION_DAYS,
|
||||
'storage.cleanup.uploaded_file_retention_days',
|
||||
)
|
||||
log_retention_days = self._positive_int(
|
||||
cleanup_cfg.get('log_retention_days'),
|
||||
DEFAULT_LOG_RETENTION_DAYS,
|
||||
'storage.cleanup.log_retention_days',
|
||||
)
|
||||
|
||||
return {
|
||||
'uploaded_files': await self._cleanup_expired_uploaded_files(upload_retention_days),
|
||||
'log_files': self._cleanup_expired_log_files(log_retention_days),
|
||||
}
|
||||
|
||||
async def get_storage_analysis(self) -> dict[str, Any]:
|
||||
cleanup_cfg = self.ap.instance_config.data.get('storage', {}).get('cleanup', {})
|
||||
upload_retention_days = self._positive_int(
|
||||
cleanup_cfg.get('uploaded_file_retention_days'),
|
||||
DEFAULT_UPLOAD_FILE_RETENTION_DAYS,
|
||||
'storage.cleanup.uploaded_file_retention_days',
|
||||
)
|
||||
log_retention_days = self._positive_int(
|
||||
cleanup_cfg.get('log_retention_days'),
|
||||
DEFAULT_LOG_RETENTION_DAYS,
|
||||
'storage.cleanup.log_retention_days',
|
||||
)
|
||||
|
||||
database_cfg = self.ap.instance_config.data.get('database', {})
|
||||
database_type = database_cfg.get('use', 'sqlite')
|
||||
database_path = (
|
||||
Path(database_cfg.get('sqlite', {}).get('path', 'data/langbot.db')) if database_type == 'sqlite' else None
|
||||
)
|
||||
roots: list[tuple[str, Path | None]] = [
|
||||
('database', database_path),
|
||||
('logs', Path('data/logs')),
|
||||
('storage', Path('data/storage')),
|
||||
('vector_store', Path('data/chroma')),
|
||||
('plugins', Path('data/plugins')),
|
||||
('mcp', Path('data/mcp')),
|
||||
('temp', Path('data/temp')),
|
||||
]
|
||||
|
||||
sections = []
|
||||
for key, path in roots:
|
||||
sections.append(
|
||||
{
|
||||
'key': key,
|
||||
'path': str(path) if path else '',
|
||||
'exists': path.exists() if path else False,
|
||||
'size_bytes': self._path_size(path) if path else 0,
|
||||
'file_count': self._file_count(path) if path else 0,
|
||||
}
|
||||
)
|
||||
|
||||
monitoring_counts = await self._monitoring_counts()
|
||||
binary_storage = await self._binary_storage_stats()
|
||||
upload_candidates = await self._expired_uploaded_candidates(upload_retention_days)
|
||||
log_candidates = self._expired_log_candidates(log_retention_days)
|
||||
|
||||
return {
|
||||
'generated_at': datetime.datetime.now(datetime.timezone.utc).isoformat(),
|
||||
'cleanup_policy': {
|
||||
'uploaded_file_retention_days': upload_retention_days,
|
||||
'log_retention_days': log_retention_days,
|
||||
},
|
||||
'sections': sections,
|
||||
'database': {
|
||||
'type': database_type,
|
||||
'monitoring_counts': monitoring_counts,
|
||||
'binary_storage': binary_storage,
|
||||
},
|
||||
'cleanup_candidates': {
|
||||
'uploaded_files': upload_candidates,
|
||||
'log_files': log_candidates,
|
||||
},
|
||||
'tasks': self.ap.task_mgr.get_stats() if self.ap.task_mgr else {},
|
||||
}
|
||||
|
||||
async def _cleanup_expired_uploaded_files(self, retention_days: int) -> int:
|
||||
provider = self.ap.storage_mgr.storage_provider
|
||||
provider_name = provider.__class__.__name__
|
||||
if provider_name == 'LocalStorageProvider':
|
||||
candidates = self._expired_local_upload_candidates(retention_days, include_paths=True)
|
||||
deleted = 0
|
||||
for item in candidates:
|
||||
try:
|
||||
os.remove(item['path'])
|
||||
deleted += 1
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
except Exception as e:
|
||||
self.ap.logger.warning(f'Failed to delete expired uploaded file {item["key"]}: {e}')
|
||||
return deleted
|
||||
|
||||
if provider_name == 'S3StorageProvider':
|
||||
return await self._cleanup_expired_s3_uploaded_files(retention_days)
|
||||
|
||||
return 0
|
||||
|
||||
async def _expired_uploaded_candidates(self, retention_days: int) -> list[dict[str, Any]]:
|
||||
provider_name = self.ap.storage_mgr.storage_provider.__class__.__name__
|
||||
if provider_name == 'LocalStorageProvider':
|
||||
return self._expired_local_upload_candidates(retention_days)
|
||||
if provider_name == 'S3StorageProvider':
|
||||
return await self._expired_s3_upload_candidates(retention_days)
|
||||
return []
|
||||
|
||||
async def _cleanup_expired_s3_uploaded_files(self, retention_days: int) -> int:
|
||||
provider = self.ap.storage_mgr.storage_provider
|
||||
candidates = await self._expired_s3_upload_candidates(retention_days)
|
||||
deleted = 0
|
||||
for item in candidates:
|
||||
await provider.delete(item['key'])
|
||||
deleted += 1
|
||||
return deleted
|
||||
|
||||
async def _expired_s3_upload_candidates(self, retention_days: int) -> list[dict[str, Any]]:
|
||||
provider = self.ap.storage_mgr.storage_provider
|
||||
cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=retention_days)
|
||||
candidates = []
|
||||
paginator = provider.s3_client.get_paginator('list_objects_v2')
|
||||
|
||||
for page in paginator.paginate(Bucket=provider.bucket_name):
|
||||
for obj in page.get('Contents', []):
|
||||
key = obj.get('Key', '')
|
||||
last_modified = obj.get('LastModified')
|
||||
if not self._is_uploaded_file_key(key):
|
||||
continue
|
||||
if last_modified and last_modified < cutoff:
|
||||
candidates.append(
|
||||
{
|
||||
'key': key,
|
||||
'size_bytes': obj.get('Size', 0),
|
||||
'modified_at': last_modified.isoformat(),
|
||||
}
|
||||
)
|
||||
|
||||
return candidates
|
||||
|
||||
def _cleanup_expired_log_files(self, retention_days: int) -> int:
|
||||
deleted = 0
|
||||
for item in self._expired_log_candidates(retention_days, include_paths=True):
|
||||
try:
|
||||
os.remove(item['path'])
|
||||
deleted += 1
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
except Exception as e:
|
||||
self.ap.logger.warning(f'Failed to delete expired log file {item["name"]}: {e}')
|
||||
return deleted
|
||||
|
||||
def _expired_local_upload_candidates(
|
||||
self, retention_days: int, include_paths: bool = False
|
||||
) -> list[dict[str, Any]]:
|
||||
storage_root = Path('data/storage')
|
||||
if not storage_root.exists():
|
||||
return []
|
||||
|
||||
cutoff = datetime.datetime.now().timestamp() - retention_days * 86400
|
||||
candidates = []
|
||||
for entry in storage_root.iterdir():
|
||||
if not entry.is_file() or not self._is_uploaded_file_key(entry.name):
|
||||
continue
|
||||
stat = entry.stat()
|
||||
if stat.st_mtime >= cutoff:
|
||||
continue
|
||||
item = {
|
||||
'key': entry.name,
|
||||
'size_bytes': stat.st_size,
|
||||
'modified_at': datetime.datetime.fromtimestamp(stat.st_mtime, datetime.timezone.utc).isoformat(),
|
||||
}
|
||||
if include_paths:
|
||||
item['path'] = str(entry)
|
||||
candidates.append(item)
|
||||
return candidates
|
||||
|
||||
def _expired_log_candidates(self, retention_days: int, include_paths: bool = False) -> list[dict[str, Any]]:
|
||||
log_root = Path('data/logs')
|
||||
if not log_root.exists():
|
||||
return []
|
||||
|
||||
cutoff_date = datetime.date.today() - datetime.timedelta(days=retention_days - 1)
|
||||
candidates = []
|
||||
for entry in log_root.iterdir():
|
||||
if not entry.is_file():
|
||||
continue
|
||||
match = LOG_FILE_PATTERN.match(entry.name)
|
||||
if not match:
|
||||
continue
|
||||
try:
|
||||
file_date = datetime.date.fromisoformat(match.group(1))
|
||||
except ValueError:
|
||||
continue
|
||||
if file_date >= cutoff_date:
|
||||
continue
|
||||
stat = entry.stat()
|
||||
item = {
|
||||
'name': entry.name,
|
||||
'date': file_date.isoformat(),
|
||||
'size_bytes': stat.st_size,
|
||||
}
|
||||
if include_paths:
|
||||
item['path'] = str(entry)
|
||||
candidates.append(item)
|
||||
return candidates
|
||||
|
||||
def _is_uploaded_file_key(self, key: str) -> bool:
|
||||
return '/' not in key and not key.startswith('plugin_config_')
|
||||
|
||||
async def _monitoring_counts(self) -> dict[str, int]:
|
||||
tables = {
|
||||
'messages': persistence_monitoring.MonitoringMessage.id,
|
||||
'llm_calls': persistence_monitoring.MonitoringLLMCall.id,
|
||||
'embedding_calls': persistence_monitoring.MonitoringEmbeddingCall.id,
|
||||
'errors': persistence_monitoring.MonitoringError.id,
|
||||
'sessions': persistence_monitoring.MonitoringSession.session_id,
|
||||
'feedback': persistence_monitoring.MonitoringFeedback.id,
|
||||
}
|
||||
counts: dict[str, int] = {}
|
||||
for key, column in tables.items():
|
||||
result = await self.ap.persistence_mgr.execute_async(sqlalchemy.select(sqlalchemy.func.count(column)))
|
||||
counts[key] = result.scalar() or 0
|
||||
return counts
|
||||
|
||||
async def _binary_storage_stats(self) -> dict[str, Any]:
|
||||
count_result = await self.ap.persistence_mgr.execute_async(
|
||||
sqlalchemy.select(sqlalchemy.func.count(persistence_bstorage.BinaryStorage.unique_key))
|
||||
)
|
||||
size_bytes = None
|
||||
try:
|
||||
size_result = await self.ap.persistence_mgr.execute_async(
|
||||
sqlalchemy.select(sqlalchemy.func.sum(sqlalchemy.func.length(persistence_bstorage.BinaryStorage.value)))
|
||||
)
|
||||
size_bytes = size_result.scalar() or 0
|
||||
except Exception as e:
|
||||
self.ap.logger.warning(f'Failed to estimate binary storage size: {e}')
|
||||
|
||||
return {
|
||||
'count': count_result.scalar() or 0,
|
||||
'size_bytes': size_bytes,
|
||||
}
|
||||
|
||||
def _path_size(self, path: Path) -> int:
|
||||
if not path.exists():
|
||||
return 0
|
||||
if path.is_file():
|
||||
return path.stat().st_size
|
||||
total = 0
|
||||
for root, _, files in os.walk(path):
|
||||
for file_name in files:
|
||||
file_path = Path(root) / file_name
|
||||
try:
|
||||
total += file_path.stat().st_size
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
return total
|
||||
|
||||
def _file_count(self, path: Path) -> int:
|
||||
if not path.exists():
|
||||
return 0
|
||||
if path.is_file():
|
||||
return 1
|
||||
count = 0
|
||||
for _, _, files in os.walk(path):
|
||||
count += len(files)
|
||||
return count
|
||||
|
||||
def _positive_int(self, value: Any, default: int, name: str) -> int:
|
||||
try:
|
||||
parsed = int(value)
|
||||
except (TypeError, ValueError):
|
||||
self.ap.logger.warning(f'Invalid {name}: {value!r}, using {default}')
|
||||
return default
|
||||
if parsed < 1:
|
||||
self.ap.logger.warning(f'Invalid {name}: {value!r}, using {default}')
|
||||
return default
|
||||
return parsed
|
||||
@@ -18,55 +18,119 @@ class MonitoringService:
|
||||
|
||||
# ========== Cleanup Methods ==========
|
||||
|
||||
async def cleanup_expired_records(self, retention_days: int) -> dict[str, int]:
|
||||
async def cleanup_expired_records(self, retention_days: int, batch_size: int = 1000) -> dict[str, int]:
|
||||
"""Delete monitoring records older than the specified retention period.
|
||||
|
||||
Args:
|
||||
retention_days: Number of days to retain records.
|
||||
batch_size: Maximum rows to delete per table batch.
|
||||
|
||||
Returns:
|
||||
A dict mapping table name to the number of deleted rows.
|
||||
"""
|
||||
if retention_days < 1:
|
||||
raise ValueError('retention_days must be >= 1')
|
||||
if batch_size < 1:
|
||||
raise ValueError('batch_size must be >= 1')
|
||||
|
||||
cutoff = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) - datetime.timedelta(
|
||||
days=retention_days
|
||||
)
|
||||
|
||||
tables_and_columns: list[tuple[str, type, sqlalchemy.Column]] = [
|
||||
tables_and_columns: list[tuple[str, type, sqlalchemy.Column, sqlalchemy.Column]] = [
|
||||
(
|
||||
'monitoring_messages',
|
||||
persistence_monitoring.MonitoringMessage,
|
||||
persistence_monitoring.MonitoringMessage.timestamp,
|
||||
persistence_monitoring.MonitoringMessage.id,
|
||||
),
|
||||
(
|
||||
'monitoring_llm_calls',
|
||||
persistence_monitoring.MonitoringLLMCall,
|
||||
persistence_monitoring.MonitoringLLMCall.timestamp,
|
||||
persistence_monitoring.MonitoringLLMCall.id,
|
||||
),
|
||||
(
|
||||
'monitoring_embedding_calls',
|
||||
persistence_monitoring.MonitoringEmbeddingCall,
|
||||
persistence_monitoring.MonitoringEmbeddingCall.timestamp,
|
||||
persistence_monitoring.MonitoringEmbeddingCall.id,
|
||||
),
|
||||
(
|
||||
'monitoring_errors',
|
||||
persistence_monitoring.MonitoringError,
|
||||
persistence_monitoring.MonitoringError.timestamp,
|
||||
persistence_monitoring.MonitoringError.id,
|
||||
),
|
||||
(
|
||||
'monitoring_sessions',
|
||||
persistence_monitoring.MonitoringSession,
|
||||
persistence_monitoring.MonitoringSession.last_activity,
|
||||
persistence_monitoring.MonitoringSession.session_id,
|
||||
),
|
||||
(
|
||||
'monitoring_feedback',
|
||||
persistence_monitoring.MonitoringFeedback,
|
||||
persistence_monitoring.MonitoringFeedback.timestamp,
|
||||
persistence_monitoring.MonitoringFeedback.id,
|
||||
),
|
||||
]
|
||||
|
||||
deleted_counts: dict[str, int] = {}
|
||||
|
||||
for table_name, model_cls, ts_column in tables_and_columns:
|
||||
result = await self.ap.persistence_mgr.execute_async(sqlalchemy.delete(model_cls).where(ts_column < cutoff))
|
||||
deleted_counts[table_name] = result.rowcount
|
||||
for table_name, model_cls, ts_column, pk_column in tables_and_columns:
|
||||
deleted_counts[table_name] = await self._delete_expired_in_batches(
|
||||
model_cls=model_cls,
|
||||
ts_column=ts_column,
|
||||
pk_column=pk_column,
|
||||
cutoff=cutoff,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
if sum(deleted_counts.values()) > 0:
|
||||
await self._release_sqlite_space()
|
||||
|
||||
return deleted_counts
|
||||
|
||||
async def _delete_expired_in_batches(
|
||||
self,
|
||||
model_cls: type,
|
||||
ts_column: sqlalchemy.Column,
|
||||
pk_column: sqlalchemy.Column,
|
||||
cutoff: datetime.datetime,
|
||||
batch_size: int,
|
||||
) -> int:
|
||||
deleted_total = 0
|
||||
|
||||
while True:
|
||||
select_result = await self.ap.persistence_mgr.execute_async(
|
||||
sqlalchemy.select(pk_column).where(ts_column < cutoff).limit(batch_size)
|
||||
)
|
||||
pk_values = list(select_result.scalars().all())
|
||||
if not pk_values:
|
||||
break
|
||||
|
||||
delete_result = await self.ap.persistence_mgr.execute_async(
|
||||
sqlalchemy.delete(model_cls).where(pk_column.in_(pk_values))
|
||||
)
|
||||
deleted = delete_result.rowcount or 0
|
||||
deleted_total += deleted
|
||||
|
||||
if len(pk_values) < batch_size:
|
||||
break
|
||||
|
||||
return deleted_total
|
||||
|
||||
async def _release_sqlite_space(self) -> None:
|
||||
database_type = self.ap.instance_config.data.get('database', {}).get('use', 'sqlite')
|
||||
if database_type != 'sqlite':
|
||||
return
|
||||
|
||||
async with self.ap.persistence_mgr.get_db_engine().connect() as conn:
|
||||
autocommit_conn = await conn.execution_options(isolation_level='AUTOCOMMIT')
|
||||
await autocommit_conn.execute(sqlalchemy.text('PRAGMA wal_checkpoint(TRUNCATE)'))
|
||||
await autocommit_conn.execute(sqlalchemy.text('VACUUM'))
|
||||
|
||||
# ========== Recording Methods ==========
|
||||
|
||||
async def record_message(
|
||||
|
||||
Reference in New Issue
Block a user