feat(longtext): implement long text splitting strategy with Markdown awareness

2026-07-21 20:06:06 +00:00 · 2026-03-09 01:39:25 +08:00
parent 40c7b0f731
commit c92d3d7ad7
4 changed files with 284 additions and 24 deletions
@@ -22,10 +22,13 @@ class LongTextProcessStage(stage.PipelineStage):
    """

    strategy_impl: strategy.LongTextStrategy | None
+    is_split: bool

    async def initialize(self, pipeline_config: dict):
        config = pipeline_config['output']['long-text-processing']

+        self.is_split = config['strategy'] == 'split'
+
        if config['strategy'] == 'none':
            self.strategy_impl = None
            return
@@ -90,8 +93,18 @@ class LongTextProcessStage(stage.PipelineStage):
            len(str(query.resp_message_chain[-1]))
            > query.pipeline_config['output']['long-text-processing']['threshold']
        ):
-            query.resp_message_chain[-1] = platform_message.MessageChain(
-                await self.strategy_impl.process(str(query.resp_message_chain[-1]), query)
-            )
+            if self.is_split:
+                original_text = str(query.resp_message_chain[-1])
+                threshold = query.pipeline_config['output']['long-text-processing']['threshold']
+                segments = self.strategy_impl.split_text(original_text, threshold)
+                query.resp_message_chain.pop()
+                for segment in segments:
+                    query.resp_message_chain.append(
+                        platform_message.MessageChain([platform_message.Plain(text=segment)])
+                    )
+            else:
+                query.resp_message_chain[-1] = platform_message.MessageChain(
+                    await self.strategy_impl.process(str(query.resp_message_chain[-1]), query)
+                )

        return entities.StageProcessResult(result_type=entities.ResultType.CONTINUE, new_query=query)
@@ -0,0 +1,224 @@
+from __future__ import annotations
+
+import re
+
+from .. import strategy as strategy_model
+
+import langbot_plugin.api.entities.builtin.pipeline.query as pipeline_query
+import langbot_plugin.api.entities.builtin.platform.message as platform_message
+
+
+@strategy_model.strategy_class('split')
+class SplitStrategy(strategy_model.LongTextStrategy):
+    """Split long text into multiple message segments with Markdown awareness.
+
+    ``split_text`` is the real entry point: it returns every segment, each at
+    most ``max_length`` characters long. ``process`` only exists to satisfy
+    the generic strategy interface and returns the first segment alone.
+    """
+
+    # Compiled once at class creation instead of being looked up per line.
+    _HEADING_RE = re.compile(r'^#{1,6}\s')
+    _RULE_RE = re.compile(r'^(-{3,}|\*{3,}|_{3,})\s*$')
+    _FENCE = '```'
+    # Punctuation accepted as a cut point when a long line has no usable
+    # space (typical for CJK text). The mark stays with the text before it.
+    _BREAK_MARKS = '。！？；，、.!?;,'
+    # A cut point is ignored when it would leave a segment shorter than this
+    # fraction of max_length; a hard cut is used instead.
+    _MIN_CUT_RATIO = 0.3
+
+    async def process(self, message: str, query: pipeline_query.Query) -> list[platform_message.MessageComponent]:
+        """Return the first segment of *message* as a single ``Plain`` component.
+
+        The limit is read from
+        ``query.pipeline_config['output']['long-text-processing']['threshold']``.
+
+        NOTE: every segment after the first is dropped here. Callers that need
+        the whole text as separate messages must call ``split_text`` directly.
+        """
+        segments = self.split_text(
+            message,
+            query.pipeline_config['output']['long-text-processing']['threshold'],
+        )
+        return [platform_message.Plain(text=segments[0])] if segments else []
+
+    def split_text(self, text: str, max_length: int) -> list[str]:
+        """Split text into segments respecting Markdown structure.
+
+        Priority:
+            1. Markdown structural boundaries (headings, code blocks, horizontal rules)
+            2. Paragraph breaks (blank lines)
+            3. Line breaks
+            4. Spaces, then sentence punctuation, inside a single long line
+            5. Hard cut (fallback)
+
+        Returns:
+            A non-empty list of segments. Text that already fits is returned
+            unchanged as the only element.
+
+        Raises:
+            ValueError: if the text needs splitting and ``max_length`` is
+                smaller than 1 (no cut could ever make progress).
+        """
+        if len(text) <= max_length:
+            return [text]
+        if max_length < 1:
+            raise ValueError(f'max_length must be a positive integer, got {max_length!r}')
+
+        blocks = self._parse_markdown_blocks(text)
+        segments = self._merge_blocks(blocks, max_length)
+        # Whitespace-only text produces no segments. Hand it back untouched so
+        # callers that replace a message with the result never end up with
+        # nothing to send.
+        return segments or [text]
+
+    def _parse_markdown_blocks(self, text: str) -> list[str]:
+        """Parse text into Markdown-aware blocks.
+
+        Fenced code blocks are kept intact (an unclosed fence runs to the end
+        of the text). Outside code, a heading starts a new block, a horizontal
+        rule is a block of its own, and a blank line ends the current block.
+        Blank separator lines are not stored in the blocks; ``_merge_blocks``
+        re-inserts one blank line between merged blocks.
+        """
+        blocks: list[str] = []
+        pending: list[str] = []
+        in_code_block = False
+
+        def flush() -> None:
+            # Emit the lines collected so far as one block, if there are any.
+            if pending:
+                blocks.append('\n'.join(pending))
+                pending.clear()
+
+        for line in text.split('\n'):
+            stripped = line.strip()
+
+            if stripped.startswith(self._FENCE):
+                if in_code_block:
+                    # Closing fence: the code block ends with this line.
+                    pending.append(line)
+                    flush()
+                else:
+                    # Opening fence: whatever came before is its own block.
+                    flush()
+                    pending.append(line)
+                in_code_block = not in_code_block
+                continue
+
+            if in_code_block:
+                # Everything inside a fence is kept verbatim, blank lines too.
+                pending.append(line)
+                continue
+
+            if self._HEADING_RE.match(stripped):
+                # A heading opens a new block and stays attached to what follows.
+                flush()
+                pending.append(line)
+            elif self._RULE_RE.match(stripped):
+                # A horizontal rule is emitted as a standalone block.
+                flush()
+                blocks.append(line)
+            elif not stripped:
+                # Paragraph boundary. The blank line itself is not kept, so
+                # merging does not double it up or leave a trailing newline.
+                flush()
+            else:
+                pending.append(line)
+
+        # Flush remaining (including unclosed code blocks)
+        flush()
+
+        return [b for b in blocks if b.strip()]
+
+    def _merge_blocks(self, blocks: list[str], max_length: int) -> list[str]:
+        """Merge small blocks greedily until approaching max_length.
+
+        Adjacent blocks are joined with one blank line. A block that cannot
+        fit in a segment on its own is handed to ``_split_large_block``.
+        Whitespace-only segments are discarded.
+        """
+        segments: list[str] = []
+        current = ''
+
+        for block in blocks:
+            candidate = f'{current}\n\n{block}' if current else block
+            if len(candidate) <= max_length:
+                current = candidate
+                continue
+
+            # The block does not fit next to what we have: close that segment.
+            if current:
+                segments.append(current)
+
+            if len(block) <= max_length:
+                current = block
+            else:
+                segments.extend(self._split_large_block(block, max_length))
+                current = ''
+
+        if current:
+            segments.append(current)
+
+        return [s for s in segments if s.strip()]
+
+    def _split_large_block(self, block: str, max_length: int) -> list[str]:
+        """Split an oversized block by lines, preserving code block fences.
+
+        A block without newlines is split as plain text. A fenced code block
+        is split so that every segment is wrapped in its own opening and
+        closing fence, with the fence lengths counted against ``max_length``.
+        If the original block has no closing fence, the last segment is left
+        unclosed as well. Any other block is packed line by line.
+        """
+        lines = block.split('\n')
+
+        # Single long line with no newlines - use plain text splitting
+        if len(lines) == 1:
+            return self._split_plain_text(block, max_length)
+
+        opener = lines[0]
+        if not opener.strip().startswith(self._FENCE):
+            return self._pack_lines(lines, max_length)
+
+        closed = lines[-1].strip().startswith(self._FENCE)
+        closer = lines[-1] if closed else self._FENCE
+        body = lines[1:-1] if closed else lines[1:]
+
+        # Room left for code once both fences and their newlines are paid for.
+        budget = max_length - len(opener) - len(closer) - 2
+        if budget < 1 or not body:
+            # Fences cannot be afforded (or there is nothing between them):
+            # fall back to plain line packing rather than exceed the limit.
+            return self._pack_lines(lines, max_length)
+
+        chunks = self._pack_lines(body, budget)
+        segments = [f'{opener}\n{chunk}\n{closer}' for chunk in chunks]
+        if segments and not closed:
+            # Keep the tail of an unclosed block unclosed, as in the input.
+            segments[-1] = f'{opener}\n{chunks[-1]}'
+        return segments
+
+    def _pack_lines(self, lines: list[str], budget: int) -> list[str]:
+        """Greedily join consecutive lines into chunks of at most *budget* characters.
+
+        A line longer than *budget* is split with ``_split_plain_text`` and
+        its pieces become chunks of their own.
+        """
+        chunks: list[str] = []
+        current: list[str] = []
+        current_len = 0  # length of '\n'.join(current)
+
+        for line in lines:
+            if len(line) > budget:
+                if current:
+                    chunks.append('\n'.join(current))
+                    current = []
+                    current_len = 0
+                chunks.extend(self._split_plain_text(line, budget))
+                continue
+
+            # Every line after the first costs one extra character (the newline).
+            added = len(line) + (1 if current else 0)
+            if current and current_len + added > budget:
+                chunks.append('\n'.join(current))
+                current = [line]
+                current_len = len(line)
+            else:
+                current.append(line)
+                current_len += added
+
+        if current:
+            chunks.append('\n'.join(current))
+
+        return chunks
+
+    def _split_plain_text(self, text: str, max_length: int) -> list[str]:
+        """Split a long plain text string (no newlines) at natural boundaries.
+
+        Within each window of ``max_length`` characters the cut is made at the
+        last space; failing that, right after the last sentence punctuation
+        mark; failing that, at ``max_length`` exactly. A boundary in the first
+        30% of the window is ignored to avoid very short segments. Whitespace
+        around each cut is removed and empty segments are dropped.
+
+        Raises:
+            ValueError: if the text needs splitting and ``max_length`` is
+                smaller than 1.
+        """
+        if len(text) <= max_length:
+            return [text]
+        if max_length < 1:
+            # A zero or negative limit would never consume any input.
+            raise ValueError(f'max_length must be a positive integer, got {max_length!r}')
+
+        min_pos = int(max_length * self._MIN_CUT_RATIO)
+        segments: list[str] = []
+        remaining = text
+
+        while len(remaining) > max_length:
+            window = remaining[:max_length]
+
+            cut = window.rfind(' ')
+            if cut < min_pos:
+                # No usable space: try punctuation, cutting just after the mark.
+                mark = max(window.rfind(ch) for ch in self._BREAK_MARKS)
+                # Hard cut as last resort
+                cut = mark + 1 if mark >= min_pos else max_length
+
+            segments.append(remaining[:cut].rstrip())
+            remaining = remaining[cut:].lstrip()
+
+        if remaining:
+            segments.append(remaining)
+
+        return [s for s in segments if s]
@@ -30,29 +30,48 @@ class SendResponseBackStage(stage.PipelineStage):

        await asyncio.sleep(random_delay)

-        if query.pipeline_config['output']['misc']['at-sender'] and isinstance(
-            query.message_event, platform_events.GroupMessage
-        ):
-            query.resp_message_chain[-1].insert(0, platform_message.At(target=query.message_event.sender.id))
-
        quote_origin = query.pipeline_config['output']['misc']['quote-origin']

-        has_chunks = any(isinstance(msg, provider_message.MessageChunk) for msg in query.resp_messages)
-        # TODO 命令与流式的兼容性问题
-        if await query.adapter.is_stream_output_supported() and has_chunks:
-            is_final = [msg.is_final for msg in query.resp_messages][0]
-            await query.adapter.reply_message_chunk(
-                message_source=query.message_event,
-                bot_message=query.resp_messages[-1],
-                message=query.resp_message_chain[-1],
-                quote_origin=quote_origin,
-                is_final=is_final,
-            )
+        if len(query.resp_message_chain) > 1:
+            # Multiple chains (split strategy): send each sequentially
+            for i, chain in enumerate(query.resp_message_chain):
+                is_first = i == 0
+
+                if (
+                    is_first
+                    and query.pipeline_config['output']['misc']['at-sender']
+                    and isinstance(query.message_event, platform_events.GroupMessage)
+                ):
+                    chain.insert(0, platform_message.At(target=query.message_event.sender.id))
+
+                await query.adapter.reply_message(
+                    message_source=query.message_event,
+                    message=chain,
+                    quote_origin=quote_origin if is_first else False,
+                )
+
        else:
-            await query.adapter.reply_message(
-                message_source=query.message_event,
-                message=query.resp_message_chain[-1],
-                quote_origin=quote_origin,
-            )
+            if query.pipeline_config['output']['misc']['at-sender'] and isinstance(
+                query.message_event, platform_events.GroupMessage
+            ):
+                query.resp_message_chain[-1].insert(0, platform_message.At(target=query.message_event.sender.id))
+
+            has_chunks = any(isinstance(msg, provider_message.MessageChunk) for msg in query.resp_messages)
+            # TODO 命令与流式的兼容性问题
+            if await query.adapter.is_stream_output_supported() and has_chunks:
+                is_final = [msg.is_final for msg in query.resp_messages][0]
+                await query.adapter.reply_message_chunk(
+                    message_source=query.message_event,
+                    bot_message=query.resp_messages[-1],
+                    message=query.resp_message_chain[-1],
+                    quote_origin=quote_origin,
+                    is_final=is_final,
+                )
+            else:
+                await query.adapter.reply_message(
+                    message_source=query.message_event,
+                    message=query.resp_message_chain[-1],
+                    quote_origin=quote_origin,
+                )

        return entities.StageProcessResult(result_type=entities.ResultType.CONTINUE, new_query=query)
@@ -37,6 +37,10 @@ stages:
            label:
              en_US: Convert to Image
              zh_Hans: 转换为图片
+          - name: split
+            label:
+              en_US: Split into Multiple Messages
+              zh_Hans: 分割为多条消息发送
          - name: none
            label:
              en_US: None