增加了使用阿里云进行语音识别的引擎

2026-03-06 16:42:09 +08:00 · 2024-07-15 22:03:31 +08:00
parent f7a2c97943
commit f0e416455f
4 changed files with 91 additions and 6 deletions
--- a/config.py
+++ b/config.py
@@ -95,7 +95,7 @@ available_setting = {
    "group_speech_recognition": False,  # 是否开启群组语音识别
    "voice_reply_voice": False,  # 是否使用语音回复语音，需要设置对应语音合成引擎的api key
    "always_reply_voice": False,  # 是否一直使用语音回复
-    "voice_to_text": "openai",  # 语音识别引擎，支持openai,baidu,google,azure
+    "voice_to_text": "openai",  # 语音识别引擎，支持openai,baidu,google,ali,azure
    "text_to_voice": "openai",  # 语音合成引擎，支持openai,baidu,google,pytts(offline),ali,azure,elevenlabs,edge(online)
    "text_to_voice_model": "tts-1",
    "tts_voice_id": "alloy",
--- a/voice/ali/ali_api.py
+++ b/voice/ali/ali_api.py
@@ -8,6 +8,7 @@ Description:

 """

+import http.client
 import json
 import time
 import requests
@@ -61,6 +62,69 @@ def text_to_speech_aliyun(url, text, appkey, token):

    return output_file

+def speech_to_text_aliyun(url, audioContent, appkey, token):
+    """
+    Recognize the speech contained in PCM audio via the Aliyun speech recognition REST service.
+
+    Args:
+    - url (str): Endpoint URL of the Aliyun speech recognition service. The host in
+      this URL is the one connected to; the Shanghai gateway is the fallback.
+    - audioContent (bytes): PCM audio data, declared to the service as 16000 Hz 'pcm'.
+    - appkey (str): Your Aliyun appkey.
+    - token (str): Aliyun API access token, sent in the X-NLS-Token header.
+
+    Returns:
+    - str: The recognized text on success (may be empty), otherwise None.
+    """
+    from urllib.parse import urlencode, urlsplit
+
+    if not url:
+        logger.error("语音识别失败，未配置语音识别服务的URL")
+        return None
+
+    # RESTful request parameters: punctuation prediction and ITN on, voice detection off.
+    params = {
+        'appkey': appkey,
+        'format': 'pcm',
+        'sample_rate': 16000,
+        'enable_punctuation_prediction': 'true',
+        'enable_inverse_text_normalization': 'true',
+    }
+    endpoint = urlsplit(url)
+    # Connect to the host named in the configured URL rather than always Shanghai.
+    host = endpoint.netloc or 'nls-gateway-cn-shanghai.aliyuncs.com'
+    target = (endpoint.path or '/') + '?' + urlencode(params)
+
+    headers = {
+        'X-NLS-Token': token,
+        'Content-type': 'application/octet-stream',
+        'Content-Length': len(audioContent),
+    }
+    conn = http.client.HTTPSConnection(host)
+    try:
+        conn.request(method='POST', url=target, body=audioContent, headers=headers)
+        raw = conn.getresponse().read()
+    except (OSError, http.client.HTTPException) as e:
+        logger.error(f"语音识别失败，请求异常: {e}")
+        return None
+    finally:
+        # Always release the connection, including when the request fails.
+        conn.close()
+
+    try:
+        payload = json.loads(raw)
+        status = payload['status']
+    except (ValueError, KeyError, TypeError):
+        logger.error(f"语音识别失败，收到非JSON格式的数据: {raw}")
+        return None
+    if status != 20000000:
+        logger.error(f"语音识别失败，状态码: {status}")
+        return None
+    result = payload.get('result')
+    if result:
+        logger.info(f"阿里云语音识别到了：{result}")
+    return result
+

 class AliyunTokenGenerator:
    """
--- a/voice/ali/ali_voice.py
+++ b/voice/ali/ali_voice.py
@@ -15,9 +15,9 @@ import time

 from bridge.reply import Reply, ReplyType
 from common.log import logger
+from voice.audio_convert import get_pcm_from_wav
 from voice.voice import Voice
-from voice.ali.ali_api import AliyunTokenGenerator
-from voice.ali.ali_api import text_to_speech_aliyun
+from voice.ali.ali_api import AliyunTokenGenerator, speech_to_text_aliyun, text_to_speech_aliyun
 from config import conf


@@ -34,7 +34,8 @@ class AliVoice(Voice):
            self.token = None
            self.token_expire_time = 0
            # 默认复用阿里云千问的 access_key 和 access_secret
-            self.api_url = config.get("api_url")
+            self.api_url_voice_to_text = config.get("api_url_voice_to_text")
+            self.api_url_text_to_voice = config.get("api_url_text_to_voice")
            self.app_key = config.get("app_key")
            self.access_key_id = conf().get("qwen_access_key_id") or config.get("access_key_id")
            self.access_key_secret = conf().get("qwen_access_key_secret") or config.get("access_key_secret")
@@ -53,7 +54,7 @@ class AliVoice(Voice):
                      r'äöüÄÖÜáéíóúÁÉÍÓÚàèìòùÀÈÌÒÙâêîôûÂÊÎÔÛçÇñÑ，。！？,.]', '', text)
        # 提取有效的token
        token_id = self.get_valid_token()
-        fileName = text_to_speech_aliyun(self.api_url, text, self.app_key, token_id)
+        fileName = text_to_speech_aliyun(self.api_url_text_to_voice, text, self.app_key, token_id)
        if fileName:
            logger.info("[Ali] textToVoice text={} voice file name={}".format(text, fileName))
            reply = Reply(ReplyType.VOICE, fileName)
@@ -61,6 +62,25 @@ class AliVoice(Voice):
            reply = Reply(ReplyType.ERROR, "抱歉，语音合成失败")
        return reply

+    def voiceToText(self, voice_file):
+        """
+        Convert a voice file to text using the Aliyun speech recognition API.
+
+        :param voice_file: The voice file to convert (handed to get_pcm_from_wav).
+        :return: A Reply of type TEXT carrying the recognized text, or a Reply
+                 of type ERROR when no text was recognized.
+        """
+        # Fetch a valid token first; the recognition request needs it.
+        access_token = self.get_valid_token()
+        logger.debug("[Ali] voice file name={}".format(voice_file))
+        pcm_data = get_pcm_from_wav(voice_file)
+        recognized = speech_to_text_aliyun(self.api_url_voice_to_text, pcm_data, self.app_key, access_token)
+        if not recognized:
+            # None and an empty transcript are both reported as a failure.
+            return Reply(ReplyType.ERROR, "抱歉，语音识别失败")
+        logger.info("[Ali] VoicetoText = {}".format(recognized))
+        return Reply(ReplyType.TEXT, recognized)
+
    def get_valid_token(self):
        """
        获取有效的阿里云token。
--- a/voice/ali/config.json.template
+++ b/voice/ali/config.json.template
@@ -1,5 +1,6 @@
 {
-    "api_url": "https://nls-gateway-cn-shanghai.aliyuncs.com/stream/v1/tts",
+    "api_url_text_to_voice": "https://nls-gateway-cn-shanghai.aliyuncs.com/stream/v1/tts",
+    "api_url_voice_to_text": "https://nls-gateway.cn-shanghai.aliyuncs.com/stream/v1/asr",
    "app_key": "",
    "access_key_id": "",
    "access_key_secret": ""