fix(longtext): avoid split interfering with multi-chain agent responses

Use query variable '_longtext_split_extra_chains' to pass extra split segments instead of appending to resp_message_chain directly. This prevents agent tool-call multi-round responses from being misidentified as split results and sent repeatedly. respback.py reverts to original single-chain logic and appends split extra chains after the main response.
feat(longtext): implement long text splitting strategy with Markdown awareness
2026-06-02 12:05:54 +00:00 · 2026-03-12 09:52:59 -04:00 · 2026-03-09 01:39:25 +08:00
4 changed files with 260 additions and 3 deletions
--- a/src/langbot/pkg/pipeline/longtext/longtext.py
+++ b/src/langbot/pkg/pipeline/longtext/longtext.py
@@ -22,10 +22,13 @@ class LongTextProcessStage(stage.PipelineStage):
    """
    strategy_impl: strategy.LongTextStrategy | None
    is_split: bool
    async def initialize(self, pipeline_config: dict):
        config = pipeline_config['output']['long-text-processing']
        self.is_split = config['strategy'] == 'split'
        if config['strategy'] == 'none':
            self.strategy_impl = None
            return
@@ -90,8 +93,23 @@ class LongTextProcessStage(stage.PipelineStage):
            len(str(query.resp_message_chain[-1]))
            > query.pipeline_config['output']['long-text-processing']['threshold']
        ):
-            query.resp_message_chain[-1] = platform_message.MessageChain(
+            if self.is_split:
-                await self.strategy_impl.process(str(query.resp_message_chain[-1]), query)
+                original_text = str(query.resp_message_chain[-1])
-            )
+                threshold = query.pipeline_config['output']['long-text-processing']['threshold']
                segments = self.strategy_impl.split_text(original_text, threshold)
                # Replace the last chain with the first segment, store extra segments separately
                # to avoid interfering with existing multi-chain scenarios (e.g. agent tool calls)
                query.resp_message_chain[-1] = platform_message.MessageChain(
                    [platform_message.Plain(text=segments[0])]
                )
                if len(segments) > 1:
                    query.set_variable(
                        '_longtext_split_extra_chains',
                        [platform_message.MessageChain([platform_message.Plain(text=seg)]) for seg in segments[1:]],
                    )
            else:
                query.resp_message_chain[-1] = platform_message.MessageChain(
                    await self.strategy_impl.process(str(query.resp_message_chain[-1]), query)
                )
        return entities.StageProcessResult(result_type=entities.ResultType.CONTINUE, new_query=query)
--- a/src/langbot/pkg/pipeline/longtext/strategies/split.py
+++ b/src/langbot/pkg/pipeline/longtext/strategies/split.py
@@ -0,0 +1,224 @@
 from __future__ import annotations
 import re
 from .. import strategy as strategy_model
 import langbot_plugin.api.entities.builtin.pipeline.query as pipeline_query
 import langbot_plugin.api.entities.builtin.platform.message as platform_message
@strategy_model.strategy_class('split')
 class SplitStrategy(strategy_model.LongTextStrategy):
    """Split long text into multiple message segments with Markdown awareness."""
    async def process(self, message: str, query: pipeline_query.Query) -> list[platform_message.MessageComponent]:
        """Return the first split segment of ``message`` as message components.

        The length limit is read from
        ``query.pipeline_config['output']['long-text-processing']['threshold']``.
        Only the first segment is returned; an empty list is returned when
        splitting produces no segments.
        """
        limit = query.pipeline_config['output']['long-text-processing']['threshold']
        pieces = self.split_text(message, limit)
        if not pieces:
            return []
        return [platform_message.Plain(text=pieces[0])]
    def split_text(self, text: str, max_length: int) -> list[str]:
        """Split text into segments respecting Markdown structure.
        Priority:
            1. Markdown structural boundaries (headings, code blocks, horizontal rules)
            2. Paragraph breaks (blank lines)
            3. List item boundaries
            4. Line breaks
            5. Hard cut (fallback)
        """
        if len(text) <= max_length:
            return [text]
        blocks = self._parse_markdown_blocks(text)
        return self._merge_blocks(blocks, max_length)
    def _parse_markdown_blocks(self, text: str) -> list[str]:
        """Parse text into Markdown-aware blocks.
        Keeps code blocks intact and splits the rest by structural elements.
        """
        blocks: list[str] = []
        lines = text.split('\n')
        current_block: list[str] = []
        in_code_block = False
        for line in lines:
            stripped = line.strip()
            # Toggle fenced code block state
            if stripped.startswith('```'):
                if in_code_block:
                    # End of code block - close it as one block
                    current_block.append(line)
                    blocks.append('\n'.join(current_block))
                    current_block = []
                    in_code_block = False
                    continue
                else:
                    # Start of code block - flush current block first
                    if current_block:
                        blocks.append('\n'.join(current_block))
                        current_block = []
                    current_block.append(line)
                    in_code_block = True
                    continue
            if in_code_block:
                current_block.append(line)
                continue
            # Heading (# ...) - start a new block
            if re.match(r'^#{1,6}\s', stripped):
                if current_block:
                    blocks.append('\n'.join(current_block))
                    current_block = []
                current_block.append(line)
                continue
            # Horizontal rule (---, ***, ___) - start a new block
            if re.match(r'^(-{3,}|\*{3,}|_{3,})\s*$', stripped):
                if current_block:
                    blocks.append('\n'.join(current_block))
                    current_block = []
                blocks.append(line)
                continue
            # Blank line - paragraph boundary
            if stripped == '':
                if current_block:
                    current_block.append(line)
                    blocks.append('\n'.join(current_block))
                    current_block = []
                continue
            current_block.append(line)
        # Flush remaining (including unclosed code blocks)
        if current_block:
            blocks.append('\n'.join(current_block))
        return [b for b in blocks if b.strip()]
    def _merge_blocks(self, blocks: list[str], max_length: int) -> list[str]:
        """Merge small blocks greedily until approaching max_length.
        If a single block exceeds max_length, split it by lines as fallback.
        """
        segments: list[str] = []
        current = ''
        for block in blocks:
            candidate = (current + '\n\n' + block) if current else block
            if len(candidate) <= max_length:
                current = candidate
            else:
                # Flush current segment
                if current:
                    segments.append(current)
                # Check if this single block fits
                if len(block) <= max_length:
                    current = block
                else:
                    # Block too large - split it by lines
                    for part in self._split_large_block(block, max_length):
                        segments.append(part)
                    current = ''
        if current:
            segments.append(current)
        return [s for s in segments if s.strip()]
    def _split_large_block(self, block: str, max_length: int) -> list[str]:
        """Split an oversized block by lines, preserving code block fences.
        For single-line plain text (no newlines), falls back to splitting at
        natural language boundaries (spaces, punctuation).
        """
        lines = block.split('\n')
        # Single long line with no newlines - use plain text splitting
        if len(lines) == 1:
            return self._split_plain_text(block, max_length)
        is_code_block = lines[0].strip().startswith('```')
        segments: list[str] = []
        current_lines: list[str] = []
        current_len = 0
        # For code blocks, track the opening fence to re-apply on continuations
        code_fence = lines[0] if is_code_block else ''
        for i, line in enumerate(lines):
            line_len = len(line) + 1  # +1 for newline
            # Single line exceeds limit on its own - split it first
            if line_len > max_length:
                if current_lines:
                    seg = '\n'.join(current_lines)
                    if is_code_block and not seg.rstrip().endswith('```'):
                        seg += '\n```'
                    segments.append(seg)
                    current_lines = []
                    current_len = 0
                for part in self._split_plain_text(line, max_length):
                    segments.append(part)
                continue
            if current_len + line_len > max_length and current_lines:
                segment = '\n'.join(current_lines)
                # Close code block fence if splitting mid-code-block
                if is_code_block and not segment.rstrip().endswith('```'):
                    segment += '\n```'
                segments.append(segment)
                current_lines = []
                current_len = 0
                # Re-open code block fence for continuation
                if is_code_block and i < len(lines) - 1 and not line.strip().startswith('```'):
                    current_lines.append(code_fence)
                    current_len = len(code_fence) + 1
            current_lines.append(line)
            current_len += line_len
        if current_lines:
            segments.append('\n'.join(current_lines))
        return segments
    def _split_plain_text(self, text: str, max_length: int) -> list[str]:
        """Split a long plain text string (no newlines) at word/space boundaries."""
        if len(text) <= max_length:
            return [text]
        segments: list[str] = []
        remaining = text
        while remaining:
            if len(remaining) <= max_length:
                segments.append(remaining)
                break
            chunk = remaining[:max_length]
            min_pos = int(max_length * 0.3)
            # Try to find a space to split at
            pos = chunk.rfind(' ')
            if pos >= min_pos:
                split_pos = pos
            else:
                # Hard cut as last resort
                split_pos = max_length
            segments.append(remaining[:split_pos].rstrip())
            remaining = remaining[split_pos:].lstrip()
        return [s for s in segments if s]
--- a/src/langbot/pkg/pipeline/respback/respback.py
+++ b/src/langbot/pkg/pipeline/respback/respback.py
@@ -55,4 +55,15 @@ class SendResponseBackStage(stage.PipelineStage):
                quote_origin=quote_origin,
            )
        # Send extra chains produced by long text split strategy
        extra_chains = query.get_variable('_longtext_split_extra_chains')
        if extra_chains:
            for chain in extra_chains:
                await query.adapter.reply_message(
                    message_source=query.message_event,
                    message=chain,
                    quote_origin=False,
                )
            query.set_variable('_longtext_split_extra_chains', None)
        return entities.StageProcessResult(result_type=entities.ResultType.CONTINUE, new_query=query)
--- a/src/langbot/templates/metadata/pipeline/output.yaml
+++ b/src/langbot/templates/metadata/pipeline/output.yaml
@@ -37,6 +37,10 @@ stages:
            label:
              en_US: Convert to Image
              zh_Hans: 转换为图片
          - name: split
            label:
              en_US: Split into Multiple Messages
              zh_Hans: 分割为多条消息发送
          - name: none
            label:
              en_US: None