Hướng dẫn tích hợp từ khóa đánh thức giọng nói trên Linux với FunASR

Tích hợp hệ thống nhận diện từ khóa giọng nói ngoại tuyến trên bo mạch Ubuntu RK3588 sử dụng FunASR

RK3588 là một vi xử lý hiệu suất cao, khi kết hợp với công cụ FunASR có thể thực hiện chức năng nhận diện từ khóa giọng nói một cách hiệu quả mà không cần kết nối internet. Dưới đây là hướng dẫn triển khai chi tiết:

1. Chuẩn bị phần cứng

Lựa chọn thiết bị thu âm

Vì sơ đồ chân của bo mạch không hiển thị cổng âm thanh chuyên dụng, bạn cần bổ sung một trong các thiết bị sau:

Micrô USB (micro thu âm cắm cổng USB) hoặc card âm thanh USB (khuyến nghị)
- Giải pháp đơn giản nhất
- Cắm vào là dùng, không cần cấu hình thêm
- Đề xuất: Micro USB đa hướng hoặc card âm thanh USB có hỗ trợ micro
Module micro I2S
- Như INMP441 hoặc MSM261S4030H0
- Cần cấu hình chân GPIO để hỗ trợ tính năng I2S
- Phải thêm cấu hình I2S vào device tree
Module codec âm thanh như WM8960
- Cung cấp đầu vào micro và đầu ra âm thanh
- Điều khiển qua I2C, yêu cầu driver tương ứng

2. Cấu hình môi trường

Cài đặt thư viện phụ trợ

# Cập nhật hệ thống
sudo apt update && sudo apt upgrade -y

# Cài đặt công cụ cơ bản
sudo apt install -y build-essential git cmake python3-pip python3-dev

# Cài đặt thư viện âm thanh
sudo apt install -y alsa-utils pulseaudio portaudio19-dev

# Cài đặt thư viện tính toán khoa học và AI
sudo apt install -y python3-numpy python3-scipy

# Kiểm tra thiết bị âm thanh
arecord -l

Cấu hình âm thanh ALSA

# Tạo hoặc chỉnh sửa tệp .asoundrc
nano ~/.asoundrc

# Thêm nội dung sau (giả định micro USB là card 1)
pcm.!default {
  type asym
  playback.pcm "hw:0,0"
  capture.pcm "hw:1,0"
}

ctl.!default {
  type hw
  card 0
}

3. Cài đặt FunASR

FunASR là công cụ nhận diện giọng nói mã nguồn mở do Alibaba phát triển, hỗ trợ nhận diện từ khóa ngoại tuyến.

Cài đặt gói Python FunASR

# Tạo môi trường ảo (khuyến nghị)
python3 -m pip install virtualenv
virtualenv asr_environment
source asr_environment/bin/activate

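# Cài đặt FunASR
pip install -U funasr
pip install funasr-onnx  # Nếu muốn sử dụng mô hình ONNX tăng tốc

# Cài đặt PyAudio (keyword_detector.py có "import pyaudio"; cần portaudio19-dev đã cài ở bước 2)
pip install pyaudio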

Tải về mô hình từ khóa

mkdir -p model_storage
cd model_storage

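# Tải mô hình từ khóa (ví dụ: "xin chào trợ lý"; lưu ý: từ khóa thực tế do mô hình được tải về quy định, hãy kiểm tra tài liệu của mô hình và dùng đúng chuỗi đó trong mã nguồn)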
wget https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/models/wekws_wenetspeech_xiaoyu.tar.gz
tar -xzf wekws_wenetspeech_xiaoyu.tar.gz

4. Triển khai chương trình nhận diện từ khóa

Tạo chương trình phát hiện từ khóa

# keyword_detector.py
import os
import time
import wave
import pyaudio
import funasr
import threading
import signal
import sys

# Audio capture and detection parameters
SAMPLE_CHUNK = 1024  # frames read from the input stream per call
AUDIO_FORMAT = pyaudio.paInt16  # 16-bit signed PCM samples
AUDIO_CHANNELS = 1  # mono capture
SAMPLE_RATE = 16000  # sampling rate in Hz
PROCESS_DURATION = 0.5  # seconds of audio analysed per detection pass
ACTIVATION_THRESHOLD = 0.6  # minimum score for a detection to count

# Load the FunASR keyword-spotting model once, at import time.
model_path = "model_storage/wekws_wenetspeech_xiaoyu"
detector_model = funasr.AutoModel(
    model=model_path,
    model_type="wekws",
    # Run inference on the CPU. AutoModel selects the device through the
    # ``device`` argument ("cpu", or e.g. "cuda:0" for a CUDA GPU).
    device="cpu",
)

# Shared state
execution_flag = True  # cleared by handle_signal to stop the main loop
activation_status = False  # True while the system is in the activated state

def handle_signal(sig, frame):
    """Request a graceful shutdown when a signal (e.g. Ctrl+C) arrives.

    The handler only clears ``execution_flag``. The main loop in
    ``run_detection`` polls that flag, leaves the loop and then releases the
    audio stream and removes the temporary WAV file. It deliberately does not
    call ``sys.exit()``: raising ``SystemExit`` from the handler would abort
    the loop before that cleanup code is reached.

    Args:
        sig: Signal number passed in by the ``signal`` module (unused).
        frame: Interrupted stack frame (unused).
    """
    global execution_flag
    print("Đang dừng chương trình...")
    execution_flag = False

# Register the handler so that SIGINT (Ctrl+C) is routed to handle_signal.
signal.signal(signal.SIGINT, handle_signal)

# Callback invoked once the wake word has been detected.
def trigger_activation():
    """Announce the activation, play the cue sound and enter the activated state.

    Prints a status line, plays ``activation_sound.wav`` through ``aplay``
    (the file has to be provided by the user), sets ``activation_status`` to
    True and starts a daemon thread that switches the flag back to False
    after 10 seconds if it is still set.
    """
    global activation_status

    def _expire_after_timeout():
        # Runs in the background thread: wait out the activation window,
        # then fall back to listening mode unless that already happened.
        global activation_status
        time.sleep(10)
        if not activation_status:
            return
        print("Hết thời gian chờ, trở về chế độ lắng nghe...")
        activation_status = False

    print("\nHệ thống đã được kích hoạt, chờ lệnh...")
    # Further post-activation actions (command recognition, ...) can go here.
    os.system("aplay -q activation_sound.wav")

    # Switch to interactive mode and arm the timeout.
    activation_status = True
    watchdog = threading.Thread(target=_expire_after_timeout, daemon=True)
    watchdog.start()

def run_detection():
    """Capture microphone audio and call ``trigger_activation`` on the wake word.

    Audio is read from the default input device in ``SAMPLE_CHUNK``-frame
    pieces and kept in a sliding window of roughly ``PROCESS_DURATION``
    seconds. While the system is not activated, every full window is written
    to ``temp_audio.wav`` and passed to ``detector_model.generate``; a result
    whose ``keyword`` equals the wake phrase and whose ``score`` exceeds
    ``ACTIVATION_THRESHOLD`` triggers the activation callback.

    The loop runs until ``execution_flag`` becomes False. The audio stream,
    the PyAudio instance and the temporary file are released in a ``finally``
    block, so they are cleaned up even if the loop is left through an
    exception or ``SystemExit``.
    """
    from collections import deque

    global execution_flag, activation_status

    wake_phrase = "xin chào trợ lý"
    temp_wav_path = "temp_audio.wav"

    # Number of chunks in one analysis window; computed once because it never
    # changes. Clamped to 1 so a very small PROCESS_DURATION cannot produce an
    # empty window.
    window_chunks = max(1, int(SAMPLE_RATE / SAMPLE_CHUNK * PROCESS_DURATION))
    # deque(maxlen=...) drops the oldest chunk automatically in O(1).
    audio_buffer = deque(maxlen=window_chunks)

    audio_engine = pyaudio.PyAudio()
    audio_stream = None
    try:
        audio_stream = audio_engine.open(
            format=AUDIO_FORMAT,
            channels=AUDIO_CHANNELS,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=SAMPLE_CHUNK,
        )
        sample_width = audio_engine.get_sample_size(AUDIO_FORMAT)

        print("Bắt đầu lắng nghe từ khóa 'xin chào trợ lý'...")
        print("Nhấn Ctrl+C để thoát")

        while execution_flag:
            # Overflows are tolerated: inference can take longer than one
            # chunk and a dropped chunk is preferable to an exception here.
            audio_data = audio_stream.read(SAMPLE_CHUNK, exception_on_overflow=False)
            audio_buffer.append(audio_data)

            # Only look for the wake word while idle and once the window is full.
            # When activated, command handling could be added in an else branch.
            if not activation_status and len(audio_buffer) == window_chunks:
                # The model is fed through a temporary WAV file.
                with wave.open(temp_wav_path, 'wb') as temp_file:
                    temp_file.setnchannels(AUDIO_CHANNELS)
                    temp_file.setsampwidth(sample_width)
                    temp_file.setframerate(SAMPLE_RATE)
                    temp_file.writeframes(b''.join(audio_buffer))

                detection_result = detector_model.generate(temp_wav_path)

                # NOTE(review): FunASR's generate() is documented to return a
                # list with one dict per input; accept that shape as well as a
                # plain dict. The "keyword"/"score" keys are taken from the
                # original code -- confirm against the model's actual output.
                if isinstance(detection_result, (list, tuple)):
                    detection_result = detection_result[0] if detection_result else {}

                if (isinstance(detection_result, dict)
                        and detection_result.get("keyword") == wake_phrase
                        and detection_result.get("score", 0.0) > ACTIVATION_THRESHOLD):
                    print(f"Phát hiện từ khóa! Điểm số: {detection_result['score']:.2f}")
                    trigger_activation()

            # Short pause to reduce CPU load.
            time.sleep(0.01)
    finally:
        # Release the audio device and remove the scratch file on every exit path.
        if audio_stream is not None:
            audio_stream.stop_stream()
            audio_stream.close()
        audio_engine.terminate()
        try:
            os.remove(temp_wav_path)
        except FileNotFoundError:
            pass

# Start the detector only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run_detection()

Tạo dịch vụ khởi động

# Tạo tệp dịch vụ systemd
sudo nano /etc/systemd/system/keyword-service.service

[Unit]
Description=Keyword Detection Service
After=network.target

[Service]
ExecStart=/bin/bash -c 'source /home/user_name/asr_environment/bin/activate && python3 /home/user_name/keyword_detector.py'
WorkingDirectory=/home/user_name
StandardOutput=inherit
StandardError=inherit
Restart=always
User=user_name

[Install]
WantedBy=multi-user.target

Kích hoạt dịch vụ:

sudo systemctl daemon-reload
sudo systemctl enable keyword-service
sudo systemctl start keyword-service

5. Tối ưu hóa và mở rộng

Tối ưu hiệu suất

RK3588 có khả năng tăng tốc NPU, có thể tận dụng để cải thiện tốc độ nhận diện:

# Cài đặt librga — thư viện tăng tốc đồ họa 2D (RGA) của Rockchip; để suy luận trên NPU cần thêm bộ công cụ RKNN của Rockchip (cần điều chỉnh theo bo mạch RK3588 cụ thể)
sudo apt install librga-dev

# Sử dụng TNN hoặc ONNX runtime để tối ưu suy luận mô hình
pip install funasr-onnx onnxruntime

Thêm chức năng nhận diện lệnh

Sau khi kích hoạt có thể thêm chức năng nhận diện lệnh:

# Variant of trigger_activation that also records and recognizes a spoken command.
_command_model = None  # command-recognition model, loaded lazily on first use


def _get_command_model():
    """Return the command-recognition model, loading it on the first call only."""
    global _command_model
    if _command_model is None:
        _command_model = funasr.AutoModel(
            model="model_storage/speech_paraformer-large_asr_nat-zh-cn",
            model_type="paraformer",
            # AutoModel selects the device through ``device``; run on the CPU.
            device="cpu",
        )
    return _command_model


def _extract_text(recognition_result):
    """Return the transcript from a ``generate()`` result (list of dicts or dict)."""
    if isinstance(recognition_result, (list, tuple)):
        recognition_result = recognition_result[0] if recognition_result else {}
    if isinstance(recognition_result, dict):
        return recognition_result.get("text", "")
    return ""


def trigger_activation():
    """Play the activation cue, record a 5-second command and dispatch it.

    Records about five seconds of audio from the default input device, stores
    it in ``user_command.wav``, transcribes it with the command-recognition
    model and passes the recognized text to ``process_user_command``.

    The PyAudio instance and its stream are always released, and the model is
    loaded once and reused, so repeated activations neither leak audio
    handles nor reload the model.
    """
    print("Hệ thống đã kích hoạt, chờ lệnh...")
    os.system("aplay activation_sound.wav")

    # NOTE(review): if the detection loop still holds the capture device open,
    # opening a second input stream may fail on exclusive ALSA "hw" devices --
    # confirm on the target hardware.
    cmd_recorder = pyaudio.PyAudio()
    cmd_stream = None
    try:
        cmd_stream = cmd_recorder.open(
            format=AUDIO_FORMAT,
            channels=AUDIO_CHANNELS,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=SAMPLE_CHUNK,
        )
        sample_width = cmd_recorder.get_sample_size(AUDIO_FORMAT)

        print("Vui lòng nói lệnh của bạn...")
        # Number of chunks that add up to roughly five seconds of audio.
        chunk_count = int(SAMPLE_RATE / SAMPLE_CHUNK * 5)
        # Tolerate input overflows, matching the read in the detection loop.
        recorded_frames = [
            cmd_stream.read(SAMPLE_CHUNK, exception_on_overflow=False)
            for _ in range(chunk_count)
        ]
    finally:
        if cmd_stream is not None:
            cmd_stream.stop_stream()
            cmd_stream.close()
        cmd_recorder.terminate()

    # Save the recorded command.
    with wave.open("user_command.wav", 'wb') as cmd_file:
        cmd_file.setnchannels(AUDIO_CHANNELS)
        cmd_file.setsampwidth(sample_width)
        cmd_file.setframerate(SAMPLE_RATE)
        cmd_file.writeframes(b''.join(recorded_frames))

    # Recognize the command. FunASR's generate() is documented to return a
    # list of result dicts, so the text is extracted from the first entry.
    recognition_result = _get_command_model().generate("user_command.wav")
    command_text = _extract_text(recognition_result)
    print(f"Lệnh được nhận diện: {command_text}")

    # Execute the command logic.
    process_user_command(command_text)

Việc triển khai hệ thống nhận diện từ khóa giọng nói ngoại tuyến trên hệ thống Ubuntu RK3588 sử dụng FunASR là hoàn toàn khả thi. Giải pháp này mang lại nhiều lợi ích như hoạt động độc lập không cần mạng, phản hồi nhanh chóng, tùy chỉnh linh hoạt và hiệu quả tài nguyên nhờ sức mạnh xử lý của vi xử lý RK3588.

Thẻ: funasr Speech-Recognition keyword-spotting rk3588 offline-asr

Đăng vào ngày 24 tháng 5 lúc 17:42

Thành phố Cuồng loạn