From 814ce7a43b949fa3849319d83a986fa86a8b6cad Mon Sep 17 00:00:00 2001 From: thzjy Date: Sun, 18 May 2025 17:32:17 +0800 Subject: [PATCH 1/2] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E7=99=BE=E5=BA=A6?= =?UTF-8?q?=E8=AF=AD=E9=9F=B3=E5=90=88=E6=88=90=E9=95=BF=E6=96=87=E5=A4=84?= =?UTF-8?q?=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- voice/baidu/baidu_voice.py | 182 ++++++++++++++++++++++++++----------- 1 file changed, 130 insertions(+), 52 deletions(-) diff --git a/voice/baidu/baidu_voice.py b/voice/baidu/baidu_voice.py index fbf53ce..4fa6a56 100644 --- a/voice/baidu/baidu_voice.py +++ b/voice/baidu/baidu_voice.py @@ -1,9 +1,11 @@ """ -baidu voice service +baidu voice service with thread-safe token caching """ import json import os import time +import threading +import requests from aip import AipSpeech @@ -14,28 +16,13 @@ from config import conf from voice.audio_convert import get_pcm_from_wav from voice.voice import Voice -""" - 百度的语音识别API. - dev_pid: - - 1936: 普通话远场 - - 1536:普通话(支持简单的英文识别) - - 1537:普通话(纯中文识别) - - 1737:英语 - - 1637:粤语 - - 1837:四川话 - 要使用本模块, 首先到 yuyin.baidu.com 注册一个开发者账号, - 之后创建一个新应用, 然后在应用管理的"查看key"中获得 API Key 和 Secret Key - 然后在 config.json 中填入这两个值, 以及 app_id, dev_pid - """ - - class BaiduVoice(Voice): def __init__(self): try: + # 读取本地 TTS 参数配置 curdir = os.path.dirname(__file__) config_path = os.path.join(curdir, "config.json") - bconf = None - if not os.path.exists(config_path): # 如果没有配置文件,创建本地配置文件 + if not os.path.exists(config_path): bconf = {"lang": "zh", "ctp": 1, "spd": 5, "pit": 5, "vol": 5, "per": 0} with open(config_path, "w") as fw: json.dump(bconf, fw, indent=4) @@ -47,48 +34,139 @@ class BaiduVoice(Voice): self.api_key = str(conf().get("baidu_api_key")) self.secret_key = str(conf().get("baidu_secret_key")) self.dev_id = conf().get("baidu_dev_pid") - self.lang = bconf["lang"] - self.ctp = bconf["ctp"] - self.spd = bconf["spd"] - self.pit = bconf["pit"] - self.vol = bconf["vol"] - self.per = bconf["per"] + self.lang = bconf["lang"] + self.ctp = bconf["ctp"] + self.spd = bconf["spd"] + self.pit = bconf["pit"] + self.vol = bconf["vol"] + self.per = bconf["per"] + + # 百度 SDK 客户端(短文本合成 & 语音识别) self.client = AipSpeech(self.app_id, self.api_key, self.secret_key) + + # access_token 缓存与锁 + self._access_token = None + self._token_expire_ts = 0 + self._token_lock = threading.Lock() except Exception as e: - logger.warn("BaiduVoice init failed: %s, ignore " % e) + logger.warn("BaiduVoice init failed: %s, ignore" % e) + + def _get_access_token(self): + # 多线程安全获取 token + with self._token_lock: + now = time.time() + if self._access_token and now < self._token_expire_ts: + return self._access_token + url = "https://aip.baidubce.com/oauth/2.0/token" + params = { + "grant_type": "client_credentials", + "client_id": self.api_key, + "client_secret": self.secret_key, + } + resp = requests.post(url, params=params).json() + token = resp.get("access_token") + expires_in = resp.get("expires_in", 2592000) + if token: + self._access_token = token + self._token_expire_ts = now + expires_in - 60 # 提前 1 分钟过期 + return token + else: + logger.error("BaiduVoice _get_access_token failed: %s", resp) + return None def voiceToText(self, voice_file): - # 识别本地文件 - logger.debug("[Baidu] voice file name={}".format(voice_file)) + logger.debug("[Baidu] recognize voice file=%s", voice_file) pcm = get_pcm_from_wav(voice_file) res = self.client.asr(pcm, "pcm", 16000, {"dev_pid": self.dev_id}) - if res["err_no"] == 0: - logger.info("百度语音识别到了:{}".format(res["result"])) + if res.get("err_no") == 0: text = "".join(res["result"]) - reply = Reply(ReplyType.TEXT, text) + logger.info("[Baidu] ASR result: %s", text) + return Reply(ReplyType.TEXT, text) else: - logger.info("百度语音识别出错了: {}".format(res["err_msg"])) - if res["err_msg"] == "request pv too much": - logger.info(" 出现这个原因很可能是你的百度语音服务调用量超出限制,或未开通付费") - reply = Reply(ReplyType.ERROR, "百度语音识别出错了;{0}".format(res["err_msg"])) - return reply + err = res.get("err_msg", "") + logger.error("[Baidu] ASR error: %s", err) + return Reply(ReplyType.ERROR, f"语音识别失败:{err}") + + def _long_text_synthesis(self, text): + token = self._get_access_token() + if not token: + return Reply(ReplyType.ERROR, "获取百度 access_token 失败") + + # 创建合成任务 + create_url = f"https://aip.baidubce.com/rpc/2.0/tts/v1/create?access_token={token}" + payload = { + "text": text, + "format": "mp3-16k", + "voice": 0, + "lang": self.lang, + "speed": self.spd, + "pitch": self.pit, + "volume": self.vol, + "enable_subtitle": 0, + } + headers = {"Content-Type": "application/json"} + create_resp = requests.post(create_url, headers=headers, json=payload).json() + task_id = create_resp.get("task_id") + if not task_id: + logger.error("[Baidu] 长文本合成创建任务失败: %s", create_resp) + return Reply(ReplyType.ERROR, "长文本合成任务提交失败") + logger.info("[Baidu] 长文本合成任务已提交 task_id=%s", task_id) + + # 轮询查询任务状态 + query_url = f"https://aip.baidubce.com/rpc/2.0/tts/v1/query?access_token={token}" + for _ in range(30): + time.sleep(1.5) + resp = requests.post(query_url, headers=headers, json={"task_ids":[task_id]}) + result = resp.json() + infos = result.get("tasks_info") or result.get("tasks") or [] + if not infos: + continue + info = infos[0] + status = info.get("task_status") + if status == "Success": + task_res = info.get("task_result", {}) + audio_url = task_res.get("audio_address") or task_res.get("speech_url") + break + elif status == "Running": + continue + else: + logger.error("[Baidu] 长文本合成失败: %s", info) + return Reply(ReplyType.ERROR, "长文本合成执行失败") + else: + return Reply(ReplyType.ERROR, "长文本合成超时,请稍后重试") + + # 下载并保存音频 + audio_data = requests.get(audio_url).content + fn = TmpDir().path() + f"reply-long-{int(time.time())}-{hash(text)&0x7FFFFFFF}.mp3" + with open(fn, "wb") as f: + f.write(audio_data) + logger.info("[Baidu] 长文本合成 success: %s", fn) + return Reply(ReplyType.VOICE, fn) def textToVoice(self, text): - result = self.client.synthesis( - text, - self.lang, - self.ctp, - {"spd": self.spd, "pit": self.pit, "vol": self.vol, "per": self.per}, - ) - if not isinstance(result, dict): - # Avoid the same filename under multithreading - fileName = TmpDir().path() + "reply-" + str(int(time.time())) + "-" + str(hash(text) & 0x7FFFFFFF) + ".mp3" - with open(fileName, "wb") as f: - f.write(result) - logger.info("[Baidu] textToVoice text={} voice file name={}".format(text, fileName)) - reply = Reply(ReplyType.VOICE, fileName) - else: - logger.error("[Baidu] textToVoice error={}".format(result)) - reply = Reply(ReplyType.ERROR, "抱歉,语音合成失败") - return reply + try: + # GBK 编码字节长度 + gbk_len = len(text.encode("gbk", errors="ignore")) + if gbk_len <= 120: + # 短文本走 SDK 合成 + result = self.client.synthesis( + text, self.lang, self.ctp, + {"spd":self.spd, "pit":self.pit, "vol":self.vol, "per":self.per} + ) + if not isinstance(result, dict): + fn = TmpDir().path() + f"reply-{int(time.time())}-{hash(text)&0x7FFFFFFF}.mp3" + with open(fn, "wb") as f: + f.write(result) + logger.info("[Baidu] 短文本合成 success: %s", fn) + return Reply(ReplyType.VOICE, fn) + else: + logger.error("[Baidu] 短文本合成 error: %s", result) + return Reply(ReplyType.ERROR, "短文本语音合成失败") + else: + # 长文本 + return self._long_text_synthesis(text) + except Exception as e: + logger.error("BaiduVoice textToVoice exception: %s", e) + return Reply(ReplyType.ERROR, f"合成异常:{e}") + From e0dd21406dfe4d27a2d8bf3137e7d74a711fb664 Mon Sep 17 00:00:00 2001 From: vision Date: Fri, 23 May 2025 15:13:28 +0800 Subject: [PATCH 2/2] Update baidu_voice.py --- voice/baidu/baidu_voice.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/voice/baidu/baidu_voice.py b/voice/baidu/baidu_voice.py index 4fa6a56..2c4bc4b 100644 --- a/voice/baidu/baidu_voice.py +++ b/voice/baidu/baidu_voice.py @@ -115,8 +115,8 @@ class BaiduVoice(Voice): # 轮询查询任务状态 query_url = f"https://aip.baidubce.com/rpc/2.0/tts/v1/query?access_token={token}" - for _ in range(30): - time.sleep(1.5) + for _ in range(100): + time.sleep(3) resp = requests.post(query_url, headers=headers, json={"task_ids":[task_id]}) result = resp.json() infos = result.get("tasks_info") or result.get("tasks") or [] @@ -148,7 +148,7 @@ class BaiduVoice(Voice): try: # GBK 编码字节长度 gbk_len = len(text.encode("gbk", errors="ignore")) - if gbk_len <= 120: + if gbk_len <= 1024: # 短文本走 SDK 合成 result = self.client.synthesis( text, self.lang, self.ctp,