# Jetson 音频/语音处理

Whisper 语音识别与 TTS

## 1. Jetson 音频硬件

```bash
# Check audio devices
arecord -l   # capture devices
aplay -l     # playback devices

# USB microphone (recommended)
arecord -D plughw:1,0 -f S16_LE -r 16000 -c 1 test.wav -d 5
aplay test.wav

# Install audio tools
sudo apt install -y alsa-utils pulseaudio portaudio19-dev
pip3 install pyaudio sounddevice
```

## 2. Whisper 语音识别

### 2.1 安装

```bash
# Install whisper.cpp (C/C++ implementation, better performance)
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
make -j$(nproc)

# Download a model
bash ./models/download-ggml-model.sh base.en

# Install the Python bindings
pip3 install openai-whisper
# or faster-whisper (recommended)
pip3 install faster-whisper
```

### 2.2 实时语音识别

```python
#!/usr/bin/env python3
"""whisper_realtime.py - real-time speech recognition."""

import queue
import threading
import time

import numpy as np
import pyaudio
from faster_whisper import WhisperModel

# Number of frames requested from PyAudio per read.
FRAMES_PER_BUFFER = 1024


class RealtimeWhisper:
    """Microphone speech recognition backed by faster-whisper.

    ``start()`` launches one thread that records fixed-length chunks and one
    thread that transcribes them and prints the text; ``recognize_once()``
    records a single utterance and returns its transcript.
    """

    def __init__(self, model_size="base", device="cuda"):
        """Load the Whisper model.

        Args:
            model_size: Model size name passed to ``WhisperModel``.
            device: Inference device passed to ``WhisperModel``.
        """
        self.model = WhisperModel(model_size, device=device, compute_type="float16")
        self.audio_queue = queue.Queue()
        self.sample_rate = 16000
        self.chunk_duration = 3  # transcribe once every 3 seconds
        self.running = False
        # Created by start(); kept as None so stop() is safe before start().
        self.record_thread = None
        self.transcribe_thread = None

    def start(self):
        """Start the background recording and transcription threads."""
        self.running = True

        # Recording thread
        self.record_thread = threading.Thread(target=self._record_loop, daemon=True)
        self.record_thread.start()

        # Transcription thread
        self.transcribe_thread = threading.Thread(
            target=self._transcribe_loop, daemon=True
        )
        self.transcribe_thread.start()

    def _open_input_stream(self, pa):
        """Open a mono float32 input stream at ``self.sample_rate``."""
        return pa.open(
            format=pyaudio.paFloat32,
            channels=1,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=FRAMES_PER_BUFFER,
        )

    def _read_frames(self, stream):
        """Read one buffer from ``stream`` as a float32 NumPy array."""
        # Drop samples on input overflow instead of raising, so a slow
        # consumer does not kill the recording loop.
        raw = stream.read(FRAMES_PER_BUFFER, exception_on_overflow=False)
        return np.frombuffer(raw, dtype=np.float32)

    def _transcribe(self, audio):
        """Run Whisper on ``audio`` and return the segment iterator."""
        segments, _info = self.model.transcribe(
            audio,
            beam_size=5,
            language="zh",
            vad_filter=True,
        )
        return segments

    def _record_loop(self):
        """Record audio and queue one array per ``chunk_duration`` seconds."""
        pa = pyaudio.PyAudio()
        stream = self._open_input_stream(pa)
        chunk_size = int(self.sample_rate * self.chunk_duration)
        frames = []
        buffered = 0

        try:
            while self.running:
                data = self._read_frames(stream)
                frames.append(data)
                buffered += len(data)

                if buffered >= chunk_size:
                    self.audio_queue.put(np.concatenate(frames))
                    frames = []
                    buffered = 0
        finally:
            # Always release the audio device, even if reading fails.
            stream.stop_stream()
            stream.close()
            pa.terminate()

    def _transcribe_loop(self):
        """Transcribe queued chunks and print every non-empty segment."""
        while self.running:
            try:
                # The timeout lets the loop re-check self.running regularly.
                audio = self.audio_queue.get(timeout=0.5)
            except queue.Empty:
                continue

            # Whisper recognition
            for segment in self._transcribe(audio):
                text = segment.text.strip()
                if text:
                    print(f"[{segment.start:.1f}s-{segment.end:.1f}s]{text}")

    def recognize_once(self, duration=5.0):
        """Record one utterance from the microphone and return its text.

        Args:
            duration: Recording length in seconds.

        Returns:
            The stripped segment texts joined into one string; an empty
            string when nothing was recorded or recognized.
        """
        pa = pyaudio.PyAudio()
        stream = self._open_input_stream(pa)
        wanted = int(self.sample_rate * duration)
        frames = []
        captured = 0

        try:
            while captured < wanted:
                data = self._read_frames(stream)
                frames.append(data)
                captured += len(data)
        finally:
            stream.stop_stream()
            stream.close()
            pa.terminate()

        if not frames:
            return ""
        segments = self._transcribe(np.concatenate(frames))
        return "".join(segment.text.strip() for segment in segments)

    def stop(self):
        """Signal both worker threads to exit and wait briefly for them."""
        self.running = False
        for worker in (self.record_thread, self.transcribe_thread):
            if worker is not None and worker.is_alive():
                worker.join(timeout=2.0)


if __name__ == "__main__":
    whisper = RealtimeWhisper(model_size="base", device="cuda")
    whisper.start()

    try:
        while True:
            # Sleep instead of spinning so the main thread does not burn a core.
            time.sleep(0.5)
    except KeyboardInterrupt:
        whisper.stop()
```

### 2.3 Whisper TensorRT 加速

```python
#!/usr/bin/env python3
"""whisper_trt.py - Whisper TensorRT acceleration."""

import tensorrt as trt
import numpy as np


def convert_whisper_to_trt(whisper_model_path, trt_engine_path):
    """Return a CUDA FP16 faster-whisper model.

    NOTE: despite the name, no TensorRT engine is built here. Both
    ``whisper_model_path`` and ``trt_engine_path`` are currently unused and
    the "base" model is always loaded through faster-whisper.
    """
    # whisper.cpp already supports CUDA acceleration
    # Use the CTranslate2 backend of faster-whisper
    from faster_whisper import WhisperModel

    model = WhisperModel(
        "base",
        device="cuda",
        compute_type="float16",  # FP16 inference
        cpu_threads=4,
    )
    return model


# Performance comparison (Orin NX 16GB)
# ┌─────────────┬──────────┬──────────┐
# │ Model       │ FP32     │ FP16     │
# ├─────────────┼──────────┼──────────┤
# │ tiny        │ 15x      │ 25x      │
# │ base        │ 8x       │ 15x      │
# │ small       │ 3x       │ 6x       │
# │ medium      │ 1x       │ 2.5x     │
# └─────────────┴──────────┴──────────┘
# * Values are real-time factors; >1x means faster than real time.
```

## 3. TTS 语音合成

```python
#!/usr/bin/env python3
"""tts_jetson.py - speech synthesis."""

import subprocess

from TTS.api import TTS


class JetsonTTS:
    """Speech synthesis on Jetson using the TTS package."""

    def __init__(self, model_name="tts_models/zh-CN/baker/tacotron2-DDC-GST"):
        """Load ``model_name`` and move it to the GPU."""
        self.tts = TTS(model_name).to("cuda")

    def synthesize(self, text, output_path="output.wav"):
        """Synthesize ``text`` into a WAV file at ``output_path``."""
        self.tts.tts_to_file(text=text, file_path=output_path)
        print(f"语音已保存:{output_path}")

    def speak(self, text):
        """Synthesize ``text`` to a temporary WAV file and play it with aplay."""
        self.synthesize(text, "/tmp/tts_output.wav")
        subprocess.run(["aplay", "/tmp/tts_output.wav"])


if __name__ == "__main__":
    tts = JetsonTTS()
    tts.speak("你好我是 Jetson 语音助手")
```

## 4. 语音唤醒词检测

```python
#!/usr/bin/env python3
"""wake_word.py - wake-word detection."""

import struct

import pvporcupine
import pyaudio


class WakeWordDetector:
    """Wake-word detection (Porcupine)."""

    def __init__(self, keyword_paths=None, sensitivities=None, keywords=None):
        """Create the Porcupine engine and open the microphone stream.

        Args:
            keyword_paths: Paths to custom keyword files.
            sensitivities: One sensitivity per keyword; defaults to 0.5 each.
            keywords: Built-in keyword names, used when ``keyword_paths`` is
                not given. Falls back to the built-in "porcupine" keyword so
                that ``WakeWordDetector()`` works without arguments.
        """
        # Porcupine requires either keyword_paths or keywords to be set.
        if not keyword_paths and not keywords:
            keywords = ["porcupine"]
        keyword_count = len(keyword_paths or keywords)

        self.porcupine = pvporcupine.create(
            access_key="YOUR_ACCESS_KEY",
            keyword_paths=keyword_paths or None,
            keywords=None if keyword_paths else keywords,
            # The sensitivities list must have one entry per keyword.
            sensitivities=sensitivities or [0.5] * keyword_count,
        )

        self.pa = pyaudio.PyAudio()
        self.stream = self.pa.open(
            rate=self.porcupine.sample_rate,
            channels=1,
            format=pyaudio.paInt16,
            input=True,
            frames_per_buffer=self.porcupine.frame_length,
        )

    def listen(self):
        """Block until a wake word is detected and return its keyword index."""
        print("等待唤醒词...")
        frame_length = self.porcupine.frame_length

        while True:
            raw = self.stream.read(frame_length, exception_on_overflow=False)
            # Unpack the raw bytes into frame_length signed 16-bit samples.
            pcm = struct.unpack_from("h" * frame_length, raw)

            keyword_index = self.porcupine.process(pcm)
            if keyword_index >= 0:
                print(f"唤醒词检测到索引:{keyword_index}")
                return keyword_index

    def cleanup(self):
        """Close the audio stream and release the Porcupine engine."""
        self.stream.close()
        self.pa.terminate()
        self.porcupine.delete()
```

## 5. 完整语音助手

```python
#!/usr/bin/env python3
"""voice_assistant.py - Jetson voice assistant."""

import queue
import threading
from datetime import datetime

# The scripts above must be saved under the file names given in their
# docstrings so that these imports resolve.
from tts_jetson import JetsonTTS
from wake_word import WakeWordDetector
from whisper_realtime import RealtimeWhisper


class VoiceAssistant:
    """Wake word -> speech recognition -> command handling -> spoken reply."""

    def __init__(self):
        self.wake_detector = WakeWordDetector()
        self.whisper = RealtimeWhisper(model_size="base")
        self.tts = JetsonTTS()
        self.command_queue = queue.Queue()

    def run(self):
        """Run the assistant loop until interrupted."""
        print("语音助手已启动等待唤醒词...")

        try:
            while True:
                # Wait for the wake word
                self.wake_detector.listen()
                print("已唤醒请说话...")

                # Speech recognition
                text = self.whisper.recognize_once()
                print(f"识别结果:{text}")

                # Handle the command
                response = self.process_command(text)
                print(f"回复:{response}")

                # Speak the reply
                self.tts.speak(response)
        finally:
            # Release the microphone and the Porcupine engine on exit.
            self.wake_detector.cleanup()

    def process_command(self, text):
        """Map a recognized utterance to a reply string by keyword match."""
        text = text.lower()

        if "天气" in text:
            return "今天天气晴朗温度 25 度"
        if "时间" in text:
            return f"现在时间是{datetime.now().strftime('%H:%M')}"
        if "拍照" in text:
            return "已拍照保存"
        return "抱歉我没有听懂"


if __name__ == "__main__":
    assistant = VoiceAssistant()
    assistant.run()
```

## 总结

| 功能 | 方案 | 延迟 |
| --- | --- | --- |
| 语音识别 | faster-whisper (base) | ~1s |
| 语音合成 | TTS (tacotron2) | ~2s |
| 唤醒词 | Porcupine | ~100ms |
| 实时转写 | whisper.cpp + VAD | ~3s |

### 核心要点

- **faster-whisper**：比 OpenAI Whisper 快 4x，支持 FP16
- **GPU 加速**：Whisper 和 TTS 都可以用 GPU 推理
- **VAD 过滤**：语音活动检测，减少无效推理
- **Porcupine**：低功耗唤醒词检测，适合常开场景