mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-02 03:55:55 +00:00
Compare commits
2 Commits
v4.9.1
...
feat/long-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
662e6a4815 | ||
|
|
c92d3d7ad7 |
@@ -22,10 +22,13 @@ class LongTextProcessStage(stage.PipelineStage):
|
||||
"""
|
||||
|
||||
strategy_impl: strategy.LongTextStrategy | None
|
||||
is_split: bool
|
||||
|
||||
async def initialize(self, pipeline_config: dict):
|
||||
config = pipeline_config['output']['long-text-processing']
|
||||
|
||||
self.is_split = config['strategy'] == 'split'
|
||||
|
||||
if config['strategy'] == 'none':
|
||||
self.strategy_impl = None
|
||||
return
|
||||
@@ -90,8 +93,23 @@ class LongTextProcessStage(stage.PipelineStage):
|
||||
len(str(query.resp_message_chain[-1]))
|
||||
> query.pipeline_config['output']['long-text-processing']['threshold']
|
||||
):
|
||||
query.resp_message_chain[-1] = platform_message.MessageChain(
|
||||
await self.strategy_impl.process(str(query.resp_message_chain[-1]), query)
|
||||
)
|
||||
if self.is_split:
|
||||
original_text = str(query.resp_message_chain[-1])
|
||||
threshold = query.pipeline_config['output']['long-text-processing']['threshold']
|
||||
segments = self.strategy_impl.split_text(original_text, threshold)
|
||||
# Replace the last chain with the first segment, store extra segments separately
|
||||
# to avoid interfering with existing multi-chain scenarios (e.g. agent tool calls)
|
||||
query.resp_message_chain[-1] = platform_message.MessageChain(
|
||||
[platform_message.Plain(text=segments[0])]
|
||||
)
|
||||
if len(segments) > 1:
|
||||
query.set_variable(
|
||||
'_longtext_split_extra_chains',
|
||||
[platform_message.MessageChain([platform_message.Plain(text=seg)]) for seg in segments[1:]],
|
||||
)
|
||||
else:
|
||||
query.resp_message_chain[-1] = platform_message.MessageChain(
|
||||
await self.strategy_impl.process(str(query.resp_message_chain[-1]), query)
|
||||
)
|
||||
|
||||
return entities.StageProcessResult(result_type=entities.ResultType.CONTINUE, new_query=query)
|
||||
|
||||
224
src/langbot/pkg/pipeline/longtext/strategies/split.py
Normal file
224
src/langbot/pkg/pipeline/longtext/strategies/split.py
Normal file
@@ -0,0 +1,224 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
from .. import strategy as strategy_model
|
||||
|
||||
import langbot_plugin.api.entities.builtin.pipeline.query as pipeline_query
|
||||
import langbot_plugin.api.entities.builtin.platform.message as platform_message
|
||||
|
||||
|
||||
@strategy_model.strategy_class('split')
|
||||
class SplitStrategy(strategy_model.LongTextStrategy):
|
||||
"""Split long text into multiple message segments with Markdown awareness."""
|
||||
|
||||
async def process(self, message: str, query: pipeline_query.Query) -> list[platform_message.MessageComponent]:
|
||||
segments = self.split_text(
|
||||
message,
|
||||
query.pipeline_config['output']['long-text-processing']['threshold'],
|
||||
)
|
||||
return [platform_message.Plain(text=segments[0])] if segments else []
|
||||
|
||||
def split_text(self, text: str, max_length: int) -> list[str]:
|
||||
"""Split text into segments respecting Markdown structure.
|
||||
|
||||
Priority:
|
||||
1. Markdown structural boundaries (headings, code blocks, horizontal rules)
|
||||
2. Paragraph breaks (blank lines)
|
||||
3. List item boundaries
|
||||
4. Line breaks
|
||||
5. Hard cut (fallback)
|
||||
"""
|
||||
if len(text) <= max_length:
|
||||
return [text]
|
||||
|
||||
blocks = self._parse_markdown_blocks(text)
|
||||
return self._merge_blocks(blocks, max_length)
|
||||
|
||||
def _parse_markdown_blocks(self, text: str) -> list[str]:
|
||||
"""Parse text into Markdown-aware blocks.
|
||||
|
||||
Keeps code blocks intact and splits the rest by structural elements.
|
||||
"""
|
||||
blocks: list[str] = []
|
||||
lines = text.split('\n')
|
||||
current_block: list[str] = []
|
||||
in_code_block = False
|
||||
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
|
||||
# Toggle fenced code block state
|
||||
if stripped.startswith('```'):
|
||||
if in_code_block:
|
||||
# End of code block - close it as one block
|
||||
current_block.append(line)
|
||||
blocks.append('\n'.join(current_block))
|
||||
current_block = []
|
||||
in_code_block = False
|
||||
continue
|
||||
else:
|
||||
# Start of code block - flush current block first
|
||||
if current_block:
|
||||
blocks.append('\n'.join(current_block))
|
||||
current_block = []
|
||||
current_block.append(line)
|
||||
in_code_block = True
|
||||
continue
|
||||
|
||||
if in_code_block:
|
||||
current_block.append(line)
|
||||
continue
|
||||
|
||||
# Heading (# ...) - start a new block
|
||||
if re.match(r'^#{1,6}\s', stripped):
|
||||
if current_block:
|
||||
blocks.append('\n'.join(current_block))
|
||||
current_block = []
|
||||
current_block.append(line)
|
||||
continue
|
||||
|
||||
# Horizontal rule (---, ***, ___) - start a new block
|
||||
if re.match(r'^(-{3,}|\*{3,}|_{3,})\s*$', stripped):
|
||||
if current_block:
|
||||
blocks.append('\n'.join(current_block))
|
||||
current_block = []
|
||||
blocks.append(line)
|
||||
continue
|
||||
|
||||
# Blank line - paragraph boundary
|
||||
if stripped == '':
|
||||
if current_block:
|
||||
current_block.append(line)
|
||||
blocks.append('\n'.join(current_block))
|
||||
current_block = []
|
||||
continue
|
||||
|
||||
current_block.append(line)
|
||||
|
||||
# Flush remaining (including unclosed code blocks)
|
||||
if current_block:
|
||||
blocks.append('\n'.join(current_block))
|
||||
|
||||
return [b for b in blocks if b.strip()]
|
||||
|
||||
def _merge_blocks(self, blocks: list[str], max_length: int) -> list[str]:
|
||||
"""Merge small blocks greedily until approaching max_length.
|
||||
|
||||
If a single block exceeds max_length, split it by lines as fallback.
|
||||
"""
|
||||
segments: list[str] = []
|
||||
current = ''
|
||||
|
||||
for block in blocks:
|
||||
candidate = (current + '\n\n' + block) if current else block
|
||||
|
||||
if len(candidate) <= max_length:
|
||||
current = candidate
|
||||
else:
|
||||
# Flush current segment
|
||||
if current:
|
||||
segments.append(current)
|
||||
|
||||
# Check if this single block fits
|
||||
if len(block) <= max_length:
|
||||
current = block
|
||||
else:
|
||||
# Block too large - split it by lines
|
||||
for part in self._split_large_block(block, max_length):
|
||||
segments.append(part)
|
||||
current = ''
|
||||
|
||||
if current:
|
||||
segments.append(current)
|
||||
|
||||
return [s for s in segments if s.strip()]
|
||||
|
||||
def _split_large_block(self, block: str, max_length: int) -> list[str]:
|
||||
"""Split an oversized block by lines, preserving code block fences.
|
||||
|
||||
For single-line plain text (no newlines), falls back to splitting at
|
||||
natural language boundaries (spaces, punctuation).
|
||||
"""
|
||||
lines = block.split('\n')
|
||||
|
||||
# Single long line with no newlines - use plain text splitting
|
||||
if len(lines) == 1:
|
||||
return self._split_plain_text(block, max_length)
|
||||
|
||||
is_code_block = lines[0].strip().startswith('```')
|
||||
|
||||
segments: list[str] = []
|
||||
current_lines: list[str] = []
|
||||
current_len = 0
|
||||
|
||||
# For code blocks, track the opening fence to re-apply on continuations
|
||||
code_fence = lines[0] if is_code_block else ''
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
line_len = len(line) + 1 # +1 for newline
|
||||
|
||||
# Single line exceeds limit on its own - split it first
|
||||
if line_len > max_length:
|
||||
if current_lines:
|
||||
seg = '\n'.join(current_lines)
|
||||
if is_code_block and not seg.rstrip().endswith('```'):
|
||||
seg += '\n```'
|
||||
segments.append(seg)
|
||||
current_lines = []
|
||||
current_len = 0
|
||||
|
||||
for part in self._split_plain_text(line, max_length):
|
||||
segments.append(part)
|
||||
continue
|
||||
|
||||
if current_len + line_len > max_length and current_lines:
|
||||
segment = '\n'.join(current_lines)
|
||||
# Close code block fence if splitting mid-code-block
|
||||
if is_code_block and not segment.rstrip().endswith('```'):
|
||||
segment += '\n```'
|
||||
segments.append(segment)
|
||||
|
||||
current_lines = []
|
||||
current_len = 0
|
||||
# Re-open code block fence for continuation
|
||||
if is_code_block and i < len(lines) - 1 and not line.strip().startswith('```'):
|
||||
current_lines.append(code_fence)
|
||||
current_len = len(code_fence) + 1
|
||||
|
||||
current_lines.append(line)
|
||||
current_len += line_len
|
||||
|
||||
if current_lines:
|
||||
segments.append('\n'.join(current_lines))
|
||||
|
||||
return segments
|
||||
|
||||
def _split_plain_text(self, text: str, max_length: int) -> list[str]:
|
||||
"""Split a long plain text string (no newlines) at word/space boundaries."""
|
||||
if len(text) <= max_length:
|
||||
return [text]
|
||||
|
||||
segments: list[str] = []
|
||||
remaining = text
|
||||
|
||||
while remaining:
|
||||
if len(remaining) <= max_length:
|
||||
segments.append(remaining)
|
||||
break
|
||||
|
||||
chunk = remaining[:max_length]
|
||||
min_pos = int(max_length * 0.3)
|
||||
|
||||
# Try to find a space to split at
|
||||
pos = chunk.rfind(' ')
|
||||
if pos >= min_pos:
|
||||
split_pos = pos
|
||||
else:
|
||||
# Hard cut as last resort
|
||||
split_pos = max_length
|
||||
|
||||
segments.append(remaining[:split_pos].rstrip())
|
||||
remaining = remaining[split_pos:].lstrip()
|
||||
|
||||
return [s for s in segments if s]
|
||||
@@ -55,4 +55,15 @@ class SendResponseBackStage(stage.PipelineStage):
|
||||
quote_origin=quote_origin,
|
||||
)
|
||||
|
||||
# Send extra chains produced by long text split strategy
|
||||
extra_chains = query.get_variable('_longtext_split_extra_chains')
|
||||
if extra_chains:
|
||||
for chain in extra_chains:
|
||||
await query.adapter.reply_message(
|
||||
message_source=query.message_event,
|
||||
message=chain,
|
||||
quote_origin=False,
|
||||
)
|
||||
query.set_variable('_longtext_split_extra_chains', None)
|
||||
|
||||
return entities.StageProcessResult(result_type=entities.ResultType.CONTINUE, new_query=query)
|
||||
|
||||
@@ -37,6 +37,10 @@ stages:
|
||||
label:
|
||||
en_US: Convert to Image
|
||||
zh_Hans: 转换为图片
|
||||
- name: split
|
||||
label:
|
||||
en_US: Split into Multiple Messages
|
||||
zh_Hans: 分割为多条消息发送
|
||||
- name: none
|
||||
label:
|
||||
en_US: None
|
||||
|
||||
Reference in New Issue
Block a user