"""LLM Call Node - invoke large language model with Agent capabilities. Supports: - Primary model with fallback models - Knowledge base retrieval with reranking - Max round context control - Streaming output """ from __future__ import annotations import json import logging import re import time from typing import Any, AsyncGenerator import langbot_plugin.api.entities.builtin.provider.message as provider_message import langbot_plugin.api.entities.builtin.rag.context as rag_context from langbot_plugin.api.entities.builtin.workflow.entities import ExecutionContext from ..node import WorkflowNode, workflow_node from .. import monitoring_helper logger = logging.getLogger(__name__) # Pre-compiled regex patterns for CoT content removal (performance optimization) _THINK_PATTERNS = [ re.compile(r'.*?', re.DOTALL | re.IGNORECASE), re.compile(r'.*?', re.DOTALL | re.IGNORECASE), re.compile(r'.*?', re.DOTALL | re.IGNORECASE), re.compile(r'<\u601d\u8003>.*?', re.DOTALL | re.IGNORECASE), re.compile(r'<\u63a8\u7406>.*?', re.DOTALL | re.IGNORECASE), ] # Template variable regex _TEMPLATE_VAR_RE = re.compile(r'\{\{([^}]+)\}\}') @workflow_node('llm_call') class LLMCallNode(WorkflowNode): """LLM call node - invoke large language model""" category = 'process' def _resolve_template(self, template: str, inputs: dict[str, Any], context: ExecutionContext) -> str: """Resolve {{variable}} placeholders in a template string.""" if not template: return '' unresolved_vars = [] def replacer(match: re.Match) -> str: expr = match.group(1).strip() # Try inputs first if expr in inputs: return str(inputs[expr]) # Try context variables if expr.startswith('variables.'): var_name = expr[len('variables.'):] return str(context.variables.get(var_name, '')) # Try message context if expr.startswith('message.') and context.message_context: attr = expr[len('message.'):] return str(getattr(context.message_context, attr, '')) unresolved_vars.append(expr) return match.group(0) # leave unresolved result = _TEMPLATE_VAR_RE.sub(replacer, template) # Log warning for unresolved variables if unresolved_vars: logger.warning( f'LLM call node {self.node_id}: unresolved template variables: {unresolved_vars}' ) return result def _remove_think_content(self, text: str) -> str: """Remove CoT (Chain of Thought) thinking content from response.""" if not text: return text result = text for pattern in _THINK_PATTERNS: result = pattern.sub('', result) return result.strip() def _apply_content_filter(self, text: str) -> tuple[str, bool, str]: """Apply content safety filter to text. Returns: (filtered_text, is_blocked, user_notice) """ if not text or not self.ap: return text, False, '' # Check if content filter is enabled safety_config = getattr(self.ap, 'pipeline_cfg', None) if not safety_config: return text, False, '' # Check sensitive words sensitive_words = [] try: if hasattr(self.ap, 'sensitive_meta') and hasattr(self.ap.sensitive_meta, 'data'): sensitive_words = self.ap.sensitive_meta.data.get('words', []) except Exception as e: logger.warning("Failed to load sensitive words from sensitive_meta: %s", e) sensitive_words = [] if not sensitive_words: return text, False, '' found = False filtered_text = text for word in sensitive_words: try: matches = re.findall(word, filtered_text, re.IGNORECASE) if matches: found = True mask_word = '' mask = '*' try: if hasattr(self.ap, 'sensitive_meta') and hasattr(self.ap.sensitive_meta, 'data'): mask_word = self.ap.sensitive_meta.data.get('mask_word', '') mask = self.ap.sensitive_meta.data.get('mask', '*') except Exception as e: # Keep default mask settings when sensitive metadata is unavailable or malformed. logger.debug( f'LLM call node {self.node_id}: failed to read sensitive mask config, using defaults: {e}' ) for m in matches: if mask_word: filtered_text = filtered_text.replace(m, mask_word) else: filtered_text = filtered_text.replace(m, mask * len(m)) except re.error: # Invalid regex pattern, skip continue if found: return filtered_text, False, '消息中存在不合适的内容, 请修改' return text, False, '' # RAG combined prompt template (same as localagent.py) RAG_COMBINED_PROMPT_TEMPLATE = """ The following are relevant context entries retrieved from the knowledge base. Please use them to answer the user's message. Respond in the same language as the user's input. {rag_context} {user_message} """ def _build_system_prompt_with_format(self, base_prompt: str, output_format: str, json_schema: str) -> str: """Build system prompt with output format instructions.""" prompt = base_prompt if output_format == 'json': prompt += '\n\nPlease respond in valid JSON format.' if json_schema: prompt += f'\nFollow this JSON schema:\n{json_schema}' elif output_format == 'markdown': prompt += '\n\nPlease respond in Markdown format.' return prompt def _build_messages_from_prompt_array( self, prompt_array: list[dict], inputs: dict[str, Any], context: ExecutionContext, output_format: str, json_schema: str, ) -> list[provider_message.Message]: """Build messages list from prompt array (same format as pipeline). Each item in prompt_array is {role: str, content: str}. Resolves template variables in content. """ messages: list[provider_message.Message] = [] for item in prompt_array: role = item.get('role', 'user') content = item.get('content', '') # Resolve template variables in content resolved_content = self._resolve_template(content, inputs, context) # Apply format instructions to system prompt if role == 'system': resolved_content = self._build_system_prompt_with_format( resolved_content, output_format, json_schema ) messages.append(provider_message.Message(role=role, content=resolved_content)) return messages async def _get_model_candidates(self, model_uuid: str, fallback_models: list) -> list: """Build ordered list of models to try: primary model + fallback models.""" candidates = [] # Primary model if model_uuid: try: primary = await self.ap.model_mgr.get_model_by_uuid(model_uuid) candidates.append(primary) except ValueError: logger.warning(f'[LLM:{self.node_id}] Primary model {model_uuid} not found') # Fallback models for fb_uuid in fallback_models: try: fb_model = await self.ap.model_mgr.get_model_by_uuid(fb_uuid) candidates.append(fb_model) except ValueError: logger.warning(f'[LLM:{self.node_id}] Fallback model {fb_uuid} not found, skipping') return candidates async def _invoke_with_fallback( self, candidates: list, messages: list, funcs: list | None, extra_args: dict, ) -> tuple[Any, Any, dict]: """Try non-streaming invocation with sequential fallback. Returns (message, model_used, usage_info).""" last_error = None for model in candidates: try: result = await model.provider.invoke_llm( query=None, model=model, messages=messages, funcs=funcs if model.model_entity.abilities.__contains__('func_call') else [], extra_args=extra_args, ) # invoke_llm returns (message, usage_info) tuple if isinstance(result, tuple) and len(result) == 2: msg, usage_info = result else: msg = result usage_info = {} return msg, model, usage_info except Exception as e: last_error = e logger.warning(f'[LLM:{self.node_id}] Model {model.model_entity.name} failed: {e}, trying next...') raise last_error or RuntimeError('No model candidates available') async def _retrieve_knowledge( self, user_message_text: str, knowledge_bases: list[str], rerank_model_uuid: str, rerank_top_k: int, ) -> str: """Retrieve from knowledge bases and optionally rerank results. Returns the enhanced user message text with RAG context, or original text if no results. """ if not knowledge_bases or not user_message_text: return user_message_text all_results: list[rag_context.RetrievalResultEntry] = [] # Retrieve from each knowledge base for kb_uuid in knowledge_bases: try: kb = await self.ap.rag_mgr.get_knowledge_base_by_uuid(kb_uuid) if not kb: logger.warning(f'[LLM:{self.node_id}] Knowledge base {kb_uuid} not found, skipping') continue result = await kb.retrieve(user_message_text, settings={}) if result: all_results.extend(result) except Exception as e: logger.warning(f'[LLM:{self.node_id}] Failed to retrieve from KB {kb_uuid}: {e}') # Rerank step: re-score results using a rerank model if configured if all_results and rerank_model_uuid: try: rerank_model = await self.ap.model_mgr.get_rerank_model_by_uuid(rerank_model_uuid) doc_texts = [] for entry in all_results: text = ' '.join(c.text for c in entry.content if c.type == 'text' and c.text) doc_texts.append(text) doc_texts_capped = doc_texts[:64] # Cap for reranker input scores = await rerank_model.provider.invoke_rerank( model=rerank_model, query=user_message_text, documents=doc_texts_capped, ) scored = sorted(scores, key=lambda x: x.get('relevance_score', 0), reverse=True) top_indices = [s['index'] for s in scored[:rerank_top_k] if s['index'] < len(all_results)] all_results = [all_results[i] for i in top_indices] logger.info( f'[LLM:{self.node_id}] Rerank complete: {len(doc_texts)} docs -> top {len(all_results)} kept (top_k={rerank_top_k})' ) except ValueError: logger.warning(f'[LLM:{self.node_id}] Rerank model {rerank_model_uuid} not found, skipping rerank') except Exception as e: logger.warning(f'[LLM:{self.node_id}] Rerank failed, using original order: {e}') # Build RAG context text if all_results: texts = [] idx = 1 for entry in all_results: for content in entry.content: if content.type == 'text' and content.text is not None: texts.append(f'[{idx}] {content.text}') idx += 1 rag_context_text = '\n\n'.join(texts) return self.RAG_COMBINED_PROMPT_TEMPLATE.format( rag_context=rag_context_text, user_message=user_message_text, ) return user_message_text def _build_messages_with_history( self, system_prompt: str, user_message_text: str, context: ExecutionContext, max_round: int, ) -> list[provider_message.Message]: """Build messages list with conversation history up to max_round.""" messages: list[provider_message.Message] = [] # Add system prompt if system_prompt: messages.append(provider_message.Message(role='system', content=system_prompt)) # Get conversation history from context conversation_history = context.variables.get('_conversation_history', []) # Apply max_round limit (each round = 1 user + 1 assistant message) if max_round > 0 and conversation_history: # Keep only the last max_round * 2 messages (user + assistant pairs) max_messages = max_round * 2 if len(conversation_history) > max_messages: conversation_history = conversation_history[-max_messages:] # Add conversation history for msg in conversation_history: if isinstance(msg, dict): role = msg.get('role', 'user') content = msg.get('content', '') messages.append(provider_message.Message(role=role, content=content)) elif hasattr(msg, 'role') and hasattr(msg, 'content'): messages.append(provider_message.Message(role=msg.role, content=msg.content)) # Add current user message messages.append(provider_message.Message(role='user', content=user_message_text)) return messages def _save_to_conversation_history( self, context: ExecutionContext, user_message_text: str, response_text: str, max_round: int, ) -> None: """Save the exchange to conversation history.""" if max_round <= 0: return history = context.variables.get('_conversation_history', []) history.append({'role': 'user', 'content': user_message_text}) history.append({'role': 'assistant', 'content': response_text}) # Enforce max_round limit max_messages = max_round * 2 if len(history) > max_messages: history = history[-max_messages:] context.variables['_conversation_history'] = history async def execute(self, inputs: dict[str, Any], context: ExecutionContext) -> dict[str, Any]: # Support both new model_config format and legacy model + fallback_models format model_config = self.get_config('model_config', None) if model_config and isinstance(model_config, dict): # New format: {primary: uuid, fallbacks: [uuid1, uuid2, ...]} model_uuid = model_config.get('primary', '') fallback_models = model_config.get('fallbacks', []) else: # Legacy format: separate model and fallback_models model_uuid = self.get_config('model', '') fallback_models = self.get_config('fallback_models', []) if not model_uuid: raise ValueError('No model configured for LLM call node') if not self.ap: raise RuntimeError('Application instance not available - cannot call LLM') # Get error handling config exception_handling = self.get_config('exception_handling', 'show-error') failure_hint = self.get_config('failure_hint', 'Request failed.') track_function_calls = self.get_config('track_function_calls', False) # Get output format and json_schema config output_format = self.get_config('output_format', 'text') json_schema = self.get_config('json_schema', '') # Agent config: knowledge bases, rerank, max_round # (fallback_models already resolved above from model_config or fallback_models) knowledge_bases = self.get_config('knowledge_bases', []) rerank_model = self.get_config('rerank_model', '') rerank_top_k = self.get_config('rerank_top_k', 5) max_round = self.get_config('max_round', 10) # Resolve prompts - support both new prompt array format and legacy format prompt_array = self.get_config('prompt') user_prompt = '' # Initialize for later use in _save_to_conversation_history if prompt_array and isinstance(prompt_array, list): # New format: prompt array like pipeline messages = self._build_messages_from_prompt_array( prompt_array, inputs, context, output_format, json_schema ) # Get user input text for knowledge retrieval user_input = inputs.get('input', '') # Knowledge retrieval: enhance user input with RAG context user_input = await self._retrieve_knowledge( user_message_text=user_input, knowledge_bases=knowledge_bases, rerank_model_uuid=rerank_model, rerank_top_k=rerank_top_k, ) # Track user_prompt for conversation history user_prompt = user_input # Add user input as last message if user_input: messages.append(provider_message.Message(role='user', content=user_input)) # Apply max_round to conversation history conversation_history = context.variables.get('_conversation_history', []) if max_round > 0 and conversation_history: max_messages = max_round * 2 if len(conversation_history) > max_messages: conversation_history = conversation_history[-max_messages:] # Insert conversation history before user input history_messages = [] for msg in conversation_history: if isinstance(msg, dict): role = msg.get('role', 'user') content = msg.get('content', '') history_messages.append(provider_message.Message(role=role, content=content)) elif hasattr(msg, 'role') and hasattr(msg, 'content'): history_messages.append(provider_message.Message(role=msg.role, content=msg.content)) # Insert history before user message if history_messages and len(messages) > 0: messages = messages[:-1] + history_messages + [messages[-1]] else: # Legacy format: separate system_prompt and user_prompt_template system_prompt = self._resolve_template(self.get_config('system_prompt') or '', inputs, context) user_prompt_template = self.get_config('user_prompt_template') if user_prompt_template is None: user_prompt_template = '{{input}}' user_prompt = self._resolve_template(user_prompt_template, inputs, context) # Build system prompt with format instructions system_prompt = self._build_system_prompt_with_format(system_prompt, output_format, json_schema) # Knowledge retrieval: enhance user prompt with RAG context user_prompt = await self._retrieve_knowledge( user_message_text=user_prompt, knowledge_bases=knowledge_bases, rerank_model_uuid=rerank_model, rerank_top_k=rerank_top_k, ) # Build messages with conversation history messages = self._build_messages_with_history( system_prompt=system_prompt, user_message_text=user_prompt, context=context, max_round=max_round, ) # Get model candidates (primary + fallbacks) candidates = await self._get_model_candidates(model_uuid, fallback_models) if not candidates: raise ValueError('No valid model candidates available') # Build extra args from config extra_args: dict[str, Any] = {} temperature = self.get_config('temperature') if temperature is not None: extra_args['temperature'] = float(temperature) max_tokens = self.get_config('max_tokens', 0) if max_tokens and int(max_tokens) > 0: extra_args['max_tokens'] = int(max_tokens) # Track start time for duration calculation self._llm_start_time = time.time() # Invoke LLM with fallback try: result_message, used_model, llm_usage = await self._invoke_with_fallback( candidates=candidates, messages=messages, funcs=None, extra_args=extra_args, ) except Exception as e: logger.warning(f'[LLM:{self.node_id}] LLM call failed: {e}') # Handle based on exception handling strategy if exception_handling == 'show-error': raise elif exception_handling == 'show-hint': return { 'response': failure_hint, 'usage': { 'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0, }, 'error': str(e), 'error_hint_shown': True, } else: # hide return { 'response': '', 'usage': { 'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0, }, 'error': str(e), } # Extract response text response_text = '' if isinstance(result_message.content, str): response_text = result_message.content elif isinstance(result_message.content, list): for elem in result_message.content: if hasattr(elem, 'text') and elem.text: response_text += elem.text elif isinstance(elem, str): response_text += elem # Remove CoT content (always remove to avoid leaking internal reasoning) response_text = self._remove_think_content(response_text) # Initialize usage default usage = { 'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0, } # Apply content safety filter response_text, is_blocked, filter_notice = self._apply_content_filter(response_text) if is_blocked: logger.warning(f'[LLM:{self.node_id}] Response blocked by content filter: {filter_notice}') return { 'response': filter_notice, 'usage': usage, 'blocked_by_filter': True, } # Extract usage info from LLM call result # Priority: llm_usage (from _invoke_with_fallback) > result_message.usage > result_message.token_usage if llm_usage: usage = { 'prompt_tokens': llm_usage.get('input_tokens', 0) or llm_usage.get('prompt_tokens', 0), 'completion_tokens': llm_usage.get('output_tokens', 0) or llm_usage.get('completion_tokens', 0), 'total_tokens': llm_usage.get('total_tokens', 0), } # Check result_message.usage (set by RuntimeProvider.invoke_llm) elif hasattr(result_message, 'usage') and result_message.usage: u = result_message.usage if isinstance(u, dict): usage = { 'prompt_tokens': u.get('input_tokens', 0) or u.get('prompt_tokens', 0), 'completion_tokens': u.get('output_tokens', 0) or u.get('completion_tokens', 0), 'total_tokens': u.get('total_tokens', 0), } else: usage = { 'prompt_tokens': getattr(u, 'input_tokens', 0) or getattr(u, 'prompt_tokens', 0), 'completion_tokens': getattr(u, 'output_tokens', 0) or getattr(u, 'completion_tokens', 0), 'total_tokens': getattr(u, 'total_tokens', 0), } elif hasattr(result_message, 'token_usage') and result_message.token_usage: u = result_message.token_usage if isinstance(u, dict): usage = { 'prompt_tokens': u.get('prompt_tokens', 0) or 0, 'completion_tokens': u.get('completion_tokens', 0) or 0, 'total_tokens': u.get('total_tokens', 0) or 0, } else: usage = { 'prompt_tokens': getattr(u, 'prompt_tokens', 0) or 0, 'completion_tokens': getattr(u, 'completion_tokens', 0) or 0, 'total_tokens': getattr(u, 'total_tokens', 0) or 0, } # Log successful response (matching Pipeline's cut_str behavior) def _cut_str(s: str) -> str: s0 = s.split('\n')[0] if len(s0) > 20 or '\n' in s: s0 = s0[:20] + '...' return s0 logger.info(f'[LLM:{self.node_id}] Response: {_cut_str(response_text)}') # Record LLM call log only (response log is redundant) try: if self.ap and context.query: workflow_id = context.workflow_id or '' workflow_name = context.variables.get('_workflow_name', 'Workflow') bot_name = context.variables.get('_bot_name', 'Workflow') node_name = self.get_config('name', self.node_id) model_name = used_model.model_entity.name if used_model else 'unknown' # Calculate duration duration_ms = 0 if hasattr(self, '_llm_start_time'): duration_ms = int((time.time() - self._llm_start_time) * 1000) # Get message_id for LLM call association message_id = context.variables.get('_monitoring_message_id') # Record LLM call log with message_id association await monitoring_helper.WorkflowMonitoringHelper.record_llm_call_log( ap=self.ap, query=context.query, workflow_id=workflow_id, workflow_name=workflow_name, node_name=node_name, model_name=model_name, input_tokens=usage.get('prompt_tokens', 0), output_tokens=usage.get('completion_tokens', 0), duration_ms=duration_ms, status='success', bot_name=bot_name, context_vars=context.variables, message_id=message_id, ) except Exception as e: logger.warning(f'[LLM:{self.node_id}] Failed to record LLM logs: {e}') # Save to conversation history self._save_to_conversation_history( context=context, user_message_text=user_prompt, response_text=response_text, max_round=max_round, ) # Build result result: dict[str, Any] = { 'response': response_text, 'usage': usage, 'model_used': used_model.model_entity.name if used_model else None, 'model_uuid': used_model.model_entity.uuid if used_model else None, } # Parse JSON output if format is json if output_format == 'json' and response_text: try: result['parsed'] = json.loads(response_text) except json.JSONDecodeError as e: logger.warning(f'[LLM:{self.node_id}] Failed to parse JSON: {e}') result['parsed'] = None result['parse_error'] = str(e) # Add function call tracking info if configured if track_function_calls: result['function_calls'] = [] return result async def execute_stream( self, inputs: dict[str, Any], context: ExecutionContext ) -> AsyncGenerator[str, None]: """Execute the LLM call with streaming output. Yields chunks of response text as they arrive. Falls back to non-streaming if streaming is not available. """ # Support both new model_config format and legacy model + fallback_models format model_config = self.get_config('model_config', None) if model_config and isinstance(model_config, dict): model_uuid = model_config.get('primary', '') else: model_uuid = self.get_config('model', '') if not model_uuid: raise ValueError('No model configured for LLM call node') if not self.ap: raise RuntimeError('Application instance not available - cannot call LLM') exception_handling = self.get_config('exception_handling', 'show-error') failure_hint = self.get_config('failure_hint', 'Request failed.') # Resolve prompts - support both new prompt array format and legacy format prompt_array = self.get_config('prompt') if prompt_array and isinstance(prompt_array, list): # New format: prompt array like pipeline messages = self._build_messages_from_prompt_array( prompt_array, inputs, context, 'text', '' # No format instructions for streaming ) # Add user input user_input = inputs.get('input', '') if user_input: messages.append(provider_message.Message(role='user', content=user_input)) else: # Legacy format system_prompt = self._resolve_template(self.get_config('system_prompt') or '', inputs, context) user_prompt_template = self.get_config('user_prompt_template') if user_prompt_template is None: user_prompt_template = '{{input}}' user_prompt = self._resolve_template(user_prompt_template, inputs, context) # Build messages messages = [] if system_prompt: messages.append(provider_message.Message(role='system', content=system_prompt)) messages.append(provider_message.Message(role='user', content=user_prompt)) # Get model runtime_model = await self.ap.model_mgr.get_model_by_uuid(model_uuid) # Build extra args extra_args: dict[str, Any] = {} temperature = self.get_config('temperature') if temperature is not None: extra_args['temperature'] = float(temperature) max_tokens = self.get_config('max_tokens', 0) if max_tokens and int(max_tokens) > 0: extra_args['max_tokens'] = int(max_tokens) logger.info(f'[LLM:{self.node_id}] Streaming model {model_uuid}') try: # Try streaming first stream = runtime_model.provider.invoke_llm_stream( query=None, model=runtime_model, messages=messages, funcs=None, extra_args=extra_args, ) full_response = '' in_think_block = False async for chunk in stream: chunk_text = '' if hasattr(chunk, 'content'): if isinstance(chunk.content, str): chunk_text = chunk.content elif isinstance(chunk.content, list): for elem in chunk.content: if hasattr(elem, 'text') and elem.text: chunk_text += elem.text elif isinstance(elem, str): chunk_text += elem if chunk_text: # Filter blocks in streaming mode if '' in chunk_text or '' in chunk_text: in_think_block = True if in_think_block: if '' in chunk_text or '' in chunk_text: in_think_block = False chunk_text = chunk_text.split('')[-1].split('')[-1] else: chunk_text = '' if chunk_text: full_response += chunk_text yield chunk_text # Store in context for downstream nodes context.variables['_last_llm_response'] = full_response except Exception as e: logger.warning(f'[LLM:{self.node_id}] Streaming failed, falling back - {e}') # Fallback to non-streaming try: result_message = await runtime_model.provider.invoke_llm( query=None, model=runtime_model, messages=messages, funcs=None, extra_args=extra_args, ) response_text = self._extract_response_text(result_message) # Always remove content in fallback response_text = self._remove_think_content(response_text) yield response_text context.variables['_last_llm_response'] = response_text except Exception as e2: logger.error(f'[LLM:{self.node_id}] Fallback also failed - {e2}') if exception_handling == 'show-hint': yield failure_hint elif exception_handling != 'hide': raise def _extract_response_text(self, result_message: provider_message.Message) -> str: """Extract response text from LLM result message.""" response_text = '' if isinstance(result_message.content, str): response_text = result_message.content elif isinstance(result_message.content, list): for elem in result_message.content: if hasattr(elem, 'text') and elem.text: response_text += elem.text elif isinstance(elem, str): response_text += elem return response_text