mirror of
https://github.com/langbot-app/LangBot.git
synced 2026-06-02 03:55:55 +00:00
fix(dingtalk): use voice recognition text instead of raw audio binary
When DingTalk sends a voice message to the bot, the callback JSON contains a 'recognition' field with the speech-to-text result (powered by Qwen). Previously, LangBot only extracted the 'downloadCode' to download the raw audio binary and passed it as 'file_base64' to LLM APIs, which caused 400 errors since most models don't support this content type. This patch: - Extracts the 'recognition' field from DingTalk audio message content - Uses it as plain text input to the LLM instead of raw audio - Falls back to audio binary only when no recognition text is available - Fixes duplicate text issue for audio messages with recognition Fixes voice messages returning 'Request failed' on all LLM models.
This commit is contained in:
@@ -268,7 +268,25 @@ class DingTalkClient:
|
|||||||
|
|
||||||
message_data['Type'] = 'image'
|
message_data['Type'] = 'image'
|
||||||
elif incoming_message.message_type == 'audio':
|
elif incoming_message.message_type == 'audio':
|
||||||
message_data['Audio'] = await self.get_audio_url(incoming_message.to_dict()['content']['downloadCode'])
|
raw_content = incoming_message.to_dict().get('content', {})
|
||||||
|
# 兼容处理:如果 content 仍为 JSON 字符串则进行解析
|
||||||
|
if isinstance(raw_content, str):
|
||||||
|
try:
|
||||||
|
raw_content = json.loads(raw_content)
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
raw_content = {}
|
||||||
|
|
||||||
|
if self.logger:
|
||||||
|
await self.logger.info(f'DingTalk audio raw content: {json.dumps(raw_content, ensure_ascii=False)}')
|
||||||
|
|
||||||
|
# 提取钉钉自带的语音转写文字(Powered by Qwen)
|
||||||
|
recognition = raw_content.get('recognition', '')
|
||||||
|
if recognition:
|
||||||
|
message_data['Content'] = recognition
|
||||||
|
|
||||||
|
download_code = raw_content.get('downloadCode')
|
||||||
|
if download_code:
|
||||||
|
message_data['Audio'] = await self.get_audio_url(download_code)
|
||||||
|
|
||||||
message_data['Type'] = 'audio'
|
message_data['Type'] = 'audio'
|
||||||
elif incoming_message.message_type == 'file':
|
elif incoming_message.message_type == 'file':
|
||||||
|
|||||||
@@ -71,7 +71,8 @@ class DingTalkMessageConverter(abstract_platform_adapter.AbstractMessageConverte
|
|||||||
yiri_msg_list.append(platform_message.Image(base64=element['Picture']))
|
yiri_msg_list.append(platform_message.Image(base64=element['Picture']))
|
||||||
else:
|
else:
|
||||||
# 回退到原有简单逻辑
|
# 回退到原有简单逻辑
|
||||||
if event.content:
|
# 对于音频消息,content 来自 recognition 转写文字,在下方音频处理块中统一处理
|
||||||
|
if event.content and event.type != 'audio':
|
||||||
text_content = event.content.replace('@' + bot_name, '')
|
text_content = event.content.replace('@' + bot_name, '')
|
||||||
yiri_msg_list.append(platform_message.Plain(text=text_content))
|
yiri_msg_list.append(platform_message.Plain(text=text_content))
|
||||||
if event.picture:
|
if event.picture:
|
||||||
@@ -81,7 +82,11 @@ class DingTalkMessageConverter(abstract_platform_adapter.AbstractMessageConverte
|
|||||||
if event.file:
|
if event.file:
|
||||||
yiri_msg_list.append(platform_message.File(url=event.file, name=event.name))
|
yiri_msg_list.append(platform_message.File(url=event.file, name=event.name))
|
||||||
if event.audio:
|
if event.audio:
|
||||||
yiri_msg_list.append(platform_message.Voice(base64=event.audio))
|
# 优先使用钉钉自带的语音转写文字(recognition字段)
|
||||||
|
if event.content and event.type == 'audio':
|
||||||
|
yiri_msg_list.append(platform_message.Plain(text=event.content))
|
||||||
|
else:
|
||||||
|
yiri_msg_list.append(platform_message.Voice(base64=event.audio))
|
||||||
|
|
||||||
chain = platform_message.MessageChain(yiri_msg_list)
|
chain = platform_message.MessageChain(yiri_msg_list)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user