import numpy as np
import soundfile as sf
# Load the audio file. soundfile returns a 1-D array for mono files and a
# (frames, channels) array for multi-channel files.
audio, sr = sf.read('example.wav')
# Total energy of the clean signal (summed over every sample and channel).
signal_energy = np.sum(audio ** 2)
# Target signal-to-noise ratio in dB.
SNR = 10
# Noise energy that yields the requested SNR: E_noise = E_signal / 10^(SNR/10).
noise_energy = signal_energy / (10 ** (SNR / 10))
# Draw white Gaussian noise with the same shape as the audio. A 1-D noise
# vector of length len(audio) cannot be added to a (frames, channels) array,
# so using audio.shape keeps the script working for stereo input as well.
noise = np.random.randn(*audio.shape)
# Rescale the noise so that its total energy equals noise_energy exactly.
noise *= np.sqrt(noise_energy / np.sum(noise ** 2))
# Mix the signal and the noise.
noisy_audio = audio + noise
# Write the noisy signal to disk at the original sample rate.
sf.write('noisy_example.wav', noisy_audio, sr)
使用 awgn() 函数,往语音中添加不同信噪比的白噪声,观察端点检测效果。
好的,这里提供一个 MATLAB 的代码示例,可以往语音中添加不同信噪比的白噪声,并观察端点检测效果。
% Read the speech file
[x, fs] = audioread('example.wav');
% Set the signal-to-noise ratio (SNR) and derive the matching noise power
SNR = 10; % dB
Pn = var(x) / (10^(SNR/10));
% Generate white Gaussian noise with that power
n = sqrt(Pn) * randn(size(x));
% Add the noise to the speech
y = x + n;
% Play the original speech, then the noisy speech.
% sound() returns without waiting for playback to finish, so pause for the
% duration of the first clip (plus a short gap); otherwise both clips would
% be played on top of each other.
sound(x, fs);
pause(size(x, 1) / fs + 0.5);
sound(y, fs);
% Endpoint detection
% NOTE(review): vad is not defined in this file; the argument list and the
% meaning of its second output are assumed from this call - confirm against
% the implementation that is actually on the MATLAB path.
[~, endpoint] = vad(y, fs, 'energy', 'plot');
% Plot the clean speech, the noisy speech and the detected endpoints
t = (0:length(x)-1)/fs;
plot(t, x, 'b-', t, y, 'r-', endpoint/fs, zeros(size(endpoint)), 'g*');
legend('原始语音', '添加噪声的语音', '端点检测');
这里使用了 `vad` 函数进行端点检测,其中 `'energy'` 表示使用短时能量法,`'plot'` 表示绘制端点检测结果。注意:`vad` 并不是 MATLAB 自带的函数,需要自行实现,或使用提供该函数的第三方语音处理工具箱,并确保它在 MATLAB 路径中。
你可以将代码保存为 `add_noise.m` 文件,然后在 MATLAB 命令行中运行 `add_noise` 命令即可。注意需要将 `example.wav` 替换成实际的语音文件。
1.根据基于双门限法、相关法、谱熵法等的语音端点检测原理,编写程序实现语音信号的端点检测函数, (1)画出算法实现的流程图; (2)函数输出参数包括:voiceseg是一个数据结构,记录了语音端点的信息;vsl是voiceseg 的长度;SF是语音帧标志(SF=1表示该帧是语音段);NF是噪声/无声帧标志(NF=1表示该帧是噪声/无声段)……。 (3)基于一段语音文件(自己的学号的录音)验证所写的端点检测算法,图例如2-1所示。 图2-1 双门限法端点检测例图 2.【扩展部分】往语音中添加不同信噪比的白噪声,观察端点检测的效果,并分析改进算法的思路。
1. 端点检测算法实现流程图:
2. 端点检测函数代码实现:
import numpy as np
def endpoint_detection(signal, sr, frame_size=0.025, frame_stride=0.01, energy_threshold_ratio=1.5, zcr_threshold_ratio=0.5):
    """Detect speech segments from short-time energy and zero-crossing rate.

    The signal is pre-emphasised, cut into overlapping Hamming-windowed
    frames, and each frame is labelled by comparing its short-time energy
    and zero-crossing rate (ZCR) against thresholds that are multiples of
    the respective means over all frames.

    Args:
        signal: 1-D sequence of audio samples.
        sr: Sample rate in Hz.
        frame_size: Frame length in seconds.
        frame_stride: Hop between consecutive frames in seconds.
        energy_threshold_ratio: Energy threshold as a multiple of the mean
            frame energy.
        zcr_threshold_ratio: ZCR threshold as a multiple of the mean frame
            ZCR.

    Returns:
        A tuple ``(voiceseg, vsl, sf, nf)``:

        * ``voiceseg`` - integer array of shape ``(vsl, 2)``; each row is
          ``[start_sample, end_sample]`` of a segment longer than 0.1 s.
        * ``vsl`` - number of rows in ``voiceseg``.
        * ``sf`` - float array of shape ``(num_frames, 1)``; 1 where the
          frame energy exceeds the energy threshold.
        * ``nf`` - float array of shape ``(num_frames, 1)``; 1 where the
          frame ZCR exceeds the ZCR threshold. It is computed independently
          of ``sf`` and is not its complement.

    Raises:
        ValueError: If the frame length or hop rounds to less than one
            sample.
    """
    samples = np.asarray(signal, dtype=float)

    frame_length = int(round(frame_size * sr))
    frame_step = int(round(frame_stride * sr))
    if frame_length < 1 or frame_step < 1:
        raise ValueError("frame_size and frame_stride must span at least one sample")

    signal_length = len(samples)
    if signal_length == 0:
        # Nothing to analyse: report no frames and no segments.
        return np.zeros((0, 2), dtype=int), 0, np.zeros((0, 1)), np.zeros((0, 1))

    # 1. Pre-emphasis: first-order high-pass, y[n] = x[n] - 0.97 * x[n-1].
    pre_emphasis = 0.97
    emphasized_signal = np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])

    # 2. Frame blocking and windowing. Use enough frames to cover every
    #    sample; the last frame is zero-padded when it overruns the signal.
    if signal_length <= frame_length:
        num_frames = 1
    else:
        num_frames = 1 + int(np.ceil((signal_length - frame_length) / frame_step))
    pad_signal_length = (num_frames - 1) * frame_step + frame_length
    pad_signal = np.append(emphasized_signal, np.zeros(pad_signal_length - signal_length))
    frame_offsets = np.arange(num_frames)[:, None] * frame_step
    indices = frame_offsets + np.arange(frame_length)[None, :]
    frames = pad_signal[indices] * np.hamming(frame_length)

    # 3. Feature extraction: short-time energy and zero-crossing count.
    st_energy = np.sum(frames ** 2, axis=1)
    st_zcr = np.sum(np.abs(np.diff(np.sign(frames), axis=1)), axis=1) / 2

    # 4. Thresholds relative to the mean of each feature.
    energy_threshold = energy_threshold_ratio * np.mean(st_energy)
    zcr_threshold = zcr_threshold_ratio * np.mean(st_zcr)

    # 5. Per-frame flags, kept as (num_frames, 1) float columns.
    sf = (st_energy > energy_threshold).astype(float).reshape(-1, 1)
    nf = (st_zcr > zcr_threshold).astype(float).reshape(-1, 1)

    # 6. Turn runs of speech frames into [start, end] sample ranges.
    #    Padding the flag sequence with a 0 on both sides guarantees that
    #    every rising edge has a matching falling edge, including when the
    #    speech starts at frame 0 or lasts until the final frame.
    edges = np.diff(sf.ravel(), prepend=0, append=0)
    start = np.flatnonzero(edges == 1) * frame_step
    end = np.minimum(np.flatnonzero(edges == -1) * frame_step, signal_length)

    # Discard segments that are not longer than the minimum duration.
    min_duration = 0.1
    segments = np.column_stack((start, end))
    voiceseg = segments[(end - start) > min_duration * sr]
    vsl = len(voiceseg)
    return voiceseg, vsl, sf, nf
3. 基于一段语音文件验证端点检测算法代码实现:
import librosa
import librosa.display
import matplotlib.pyplot as plt
# Load the recording at its native sample rate, mixed down to mono.
filename = 'your_student_id.wav'
signal, sr = librosa.load(filename, sr=None, mono=True)

# Perform endpoint detection.
voiceseg, vsl, sf, nf = endpoint_detection(signal, sr)

# librosa.display.waveplot was removed in librosa 0.10 in favour of waveshow;
# use whichever one the installed version provides.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

# Top panel: waveform with the start of each detected speech segment.
plt.figure(figsize=(14, 5))
plt.subplot(2, 1, 1)
plot_waveform(signal, sr=sr, alpha=0.5)
# voiceseg holds sample indices while the x-axis is in seconds, so convert
# before drawing; otherwise the markers fall outside the visible range.
plt.vlines(voiceseg[:, 0] / sr, -1, 1, color='r', linestyle='--', label='Speech Segments')
plt.legend(loc='upper right')
plt.title('Speech Segments Detected using Endpoint Detection')
plt.xlabel('Time (s)')
plt.xlim(0, len(signal) / sr)

# Bottom panel: per-frame flags returned by the detector.
plt.subplot(2, 1, 2)
plt.plot(sf, color='b', label='Speech Frame')
plt.plot(nf, color='g', label='Non-Speech Frame')
plt.legend(loc='upper right')
plt.xlabel('Frame Index')
plt.ylabel('Frame Label')
plt.xlim(0, len(sf))
plt.ylim(-0.1, 1.1)

# Without an explicit show() nothing is displayed when run as a script.
plt.tight_layout()
plt.show()
4. 扩展部分:往语音中添加不同信噪比的白噪声,观察端点检测的效果,并分析改进算法的思路。
import librosa
import librosa.display
import matplotlib.pyplot as plt
from scipy.io import wavfile
from scipy.signal import lfilter, firwin
# Load the recording at its native sample rate, mixed down to mono.
filename = "your_student_id.wav"
signal, sr = librosa.load(filename, sr=None, mono=True)

# Add white Gaussian noise at several SNRs (in dB).
snrs = [-5, 0, 5, 10, 15]
# The clean-signal power does not depend on the SNR, so compute it once.
signal_power = np.sum(signal ** 2) / len(signal)
noisy_signals = []
for snr in snrs:
    noise = np.random.randn(len(signal))
    noise_power = np.sum(noise ** 2) / len(noise)
    # Scale the noise so that signal_power / noise_power == 10^(snr/10).
    noise = np.sqrt(signal_power / (noise_power * 10 ** (snr / 10))) * noise
    # Store every noisy version; the later loops index into this list.
    noisy_signals.append(signal + noise)

# Perform endpoint detection on each noisy signal and keep its segments.
voicesegs = []
for noisy_signal in noisy_signals:
    voiceseg, vsl, sf, nf = endpoint_detection(noisy_signal, sr)
    voicesegs.append(voiceseg)

# librosa.display.waveplot was removed in librosa 0.10 in favour of waveshow;
# use whichever one the installed version provides.
plot_waveform = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot

# One subplot per SNR: waveform plus the start of each detected segment.
plt.figure(figsize=(14, 10))
for i, snr in enumerate(snrs):
    plt.subplot(len(snrs), 1, i + 1)
    plot_waveform(noisy_signals[i], sr=sr, alpha=0.5)
    # Segment boundaries are sample indices; the x-axis is in seconds.
    plt.vlines(voicesegs[i][:, 0] / sr, -1, 1, color='r', linestyle='--', label='Speech Segments')
    plt.legend(loc='upper right')
    plt.title(f'Speech Segments Detected using Endpoint Detection with SNR={snr}dB')
    plt.xlabel('Time (s)')
    plt.xlim(0, len(noisy_signals[i]) / sr)

# Without an explicit show() nothing is displayed when run as a script.
plt.tight_layout()
plt.show()