1. 基本思路
基于whisper.cpp的examples/command,代码修改如下:
- 第13行,增加python依赖:
#include </miniconda3/include/python3.11/Python.h>
- 第559行,修改唤醒词:
std::string k_prompt = "自定义唤醒词";
- 第607行,增加唤醒后的处理代码:
std::system("python read.py 我在");
- 第664行开始,自定义待机/关机/活跃状态的代码:
// Dispatch on the recognized command text: standby / quit / free-form chat.
if (command == "待机") {
    fprintf(stdout, "好的!");
    std::system("python read.py 好的");
    ask_prompt = true;          // drop back to waiting for the wake word
}
else if (command == "退出") {
    fprintf(stdout, "下次再见!");
    std::system("python read.py 下次再见");
    is_running = false;         // terminate the main loop
}
else {
    // Fix: the original built the command with a C variable-length array
    // (non-standard in C++) and sprintf into a strlen()+30 buffer — easy
    // to get wrong and impossible to bounds-check. Use std::string.
    // NOTE(review): `command` comes from speech recognition and is pasted
    // into a shell line unescaped — quotes or shell metacharacters in the
    // recognized text would break out of the argument; sanitize if this
    // input is ever untrusted.
    std::string cmd = "python chat.py \"" + command + "\"";
    std::system(cmd.c_str());
}
2. 语音部分
接下来是tts部分的python脚本,包括
read.py:读wav文件
write.py:写文本到wav文件
chat.py:生成聊天对话并read
## chat.py:需要先启动一个本地的 LLM 服务(监听 localhost:8080)
import read
import requests, sys, edge_tts,os,asyncio
from pydub import AudioSegment,playback
url = 'http://localhost:8080/v1/chat/completions'
def send_message(message):
    """Send *message* to the local LLM chat endpoint and return the reply text.

    Returns the fixed sentinel "我没有听清" on any failure (non-200 status,
    connection error, or timeout); the caller uses that sentinel to skip
    TTS playback, so the error path must never raise.
    """
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "MiniCPM",
        "messages": [
            {"role": "system", "content": "你是一个助理,名字叫小特,正在和用户对话。尽量简短回复,不超过50字。"},
            {"role": "user", "content": f"{message}"}
        ]
    }
    # Fix: the original call had no timeout, so a hung LLM server would
    # block the voice assistant forever, and a refused connection raised
    # an unhandled exception instead of the documented fallback.
    try:
        response = requests.post(url, headers=headers, json=data,
                                 verify=False, timeout=60)
    except requests.RequestException:
        return "我没有听清"
    if response.status_code == 200:
        return response.json()["choices"][0]["message"]['content']
    else:
        return "我没有听清"
# Script entry point: argv[1] is the recognized voice command.
command = sys.argv[1]
if "播放" in command:
    # Music branch: stop anything currently playing, then launch the
    # requested local mp3 in the background via ffplay.
    os.system("pkill -9 ffplay")
    file = command.split("播放")[-1]
    if file + ".mp3" in os.listdir("music/"):
        # NOTE(review): `file` comes straight from speech recognition and
        # is pasted into a shell line unescaped — sanitize before trusting.
        os.popen("ffplay -autoexit -i music/" + file + ".mp3&")
    else:
        # Fix: `read` is a module (imported with `import read`); calling it
        # directly raised TypeError. Call the read() function inside it.
        # read() synthesizes to temp.wav and plays it itself, so the
        # original follow-up playback of a nonexistent temp.mp3 is dropped.
        read.read("本地没有找到音乐" + file)
else:
    if len(command) > 1:
        resp = send_message(command)
        print(resp)
        if resp != "我没有听清":
            os.system("pkill -9 ffplay")
            # Same fix as above: invoke read.read(), which also handles
            # playback — no extra (and previously broken) mp3 replay.
            read.read(resp)
接下来是read.py和write.py,这里使用sherpa
## write.py
import soundfile as sf
import sherpa_onnx,sys
def write(text, output_filename, sid=10, provider='cpu'):
    """Synthesize `text` with the sherpa-onnx VITS model (aishell3) and
    save the result to `output_filename` as 16-bit PCM.

    `sid` selects the speaker id; `provider` picks the onnxruntime
    backend (e.g. 'cpu').
    """
    # Build the configuration in named stages rather than one nested literal.
    vits_cfg = sherpa_onnx.OfflineTtsVitsModelConfig(
        model='tts/vits-aishell3.onnx',
        lexicon='tts/lexicon.txt',
        tokens='tts/tokens.txt',
    )
    model_cfg = sherpa_onnx.OfflineTtsModelConfig(vits=vits_cfg, provider=provider)
    tts_cfg = sherpa_onnx.OfflineTtsConfig(
        model=model_cfg,
        rule_fsts='tts/number.fst',   # number-normalization FST
        max_num_sentences=2,
    )
    engine = sherpa_onnx.OfflineTts(tts_cfg)
    audio = engine.generate(text, sid=sid)
    sf.write(output_filename, audio.samples,
             samplerate=audio.sample_rate, subtype="PCM_16")
# Fix: this call previously ran at import time, so `import write` (done by
# read.py) triggered a synthesis using whatever sys.argv happened to hold —
# or crashed when no argument was given. Guard it so it only runs when
# write.py is executed as a script: synthesize argv[1] into "<text>.wav".
if __name__ == "__main__":
    write(sys.argv[1], sys.argv[1] + '.wav')
## read.py
import write,sys,os
from pydub import AudioSegment,playback
def read(text):
    """Synthesize *text* to temp.wav and play it through pydub."""
    # Fix: os.remove replaces the shell round-trip `os.system("rm temp.wav")`,
    # and os.path.exists replaces scanning the whole directory listing.
    if os.path.exists('temp.wav'):
        os.remove('temp.wav')
    # Fix two bugs in the original line `write(sys.argv[1],'temp.wav')`:
    # 1) `write` is a module (imported with `import write`) — calling it
    #    directly raised TypeError; call the write() function inside it.
    # 2) It synthesized sys.argv[1] instead of the `text` parameter, so
    #    imported callers (chat.py) never spoke the intended text.
    write.write(text, 'temp.wav')
    playback.play(AudioSegment.from_wav('temp.wav'))
使用write.py,提前将“下次再见,好的,我在”这三个词保存在本地。
3. 编译
记得编译时带上cuda:
WHISPER_CUDA=1 make -j command
然后执行:
./command -m ../models/ggml-medium.bin -l zh