用python对一个wav文件进行logmmse算法降噪
时间: 2023-06-14 13:04:55 浏览: 169
首先,需要导入以下依赖库:
```python
import numpy as np
import scipy.signal as signal
import scipy.io.wavfile as wavfile
```
接着,我们可以定义一个logmmse函数:
```python
def logmmse(sig, rate, noise_frames=6, SSub=1, alpha=0.98, over_sub=3, noise_mu=0.1):
    """Suppress stationary background noise with the log-MMSE spectral estimator.

    The signal is DC-removed, peak-normalised, cut into 512-sample Hann frames
    with 50% overlap and processed frame by frame in the frequency domain.
    Each bin is scaled by the log-spectral-amplitude (Ephraim-Malah) gain
    computed from a decision-directed a-priori SNR; frames that a
    likelihood-ratio test classifies as noise-only refresh the noise estimate.
    The frames are overlap-added and the result is rescaled so that its peak
    equals the peak of the DC-removed input.

    Args:
        sig: 1-D array of samples (any real dtype). It is not modified.
        rate: Sample rate in Hz. Accepted for interface compatibility; the
            frame size is fixed at 512 samples, so the value is not read.
        noise_frames: Number of leading frames assumed to contain only noise
            and averaged into the initial noise spectrum. Values larger than
            the number of available frames are clamped.
        SSub: Positive factor applied to the noise power whenever an SNR is
            computed; values above 1 make the suppression more aggressive.
        alpha: Decision-directed smoothing factor for the a-priori SNR.
        over_sub: Ignored. Kept so that existing calls which pass it continue
            to work; the estimator has no over-subtraction stage.
        noise_mu: Fallback noise power (mean square of the peak-normalised
            signal) used only when ``noise_frames`` is 0 and no noise
            spectrum can be measured from the signal itself.

    Returns:
        A float64 array with the same length as ``sig`` holding the denoised
        signal. Empty input yields an empty array; an input that is constant
        (silent after DC removal) yields zeros.

    Raises:
        ValueError: If ``sig`` is not one-dimensional or ``SSub`` is not
            positive.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from scipy.special import exp1

    frame_len = 512
    hop = frame_len // 2
    gamma_cap = 40.0                   # upper bound on the a-posteriori SNR
    xi_floor = 10.0 ** (-25.0 / 10.0)  # a-priori SNR floor (-25 dB)
    vad_threshold = 0.15               # mean log-likelihood ratio below this => noise frame
    noise_smoothing = 0.98             # recursive averaging factor for noise updates
    tiny = 1e-12                       # keeps divisions and exp1() finite

    if SSub <= 0:
        raise ValueError("SSub must be positive")

    samples = np.asarray(sig, dtype=np.float64)
    if samples.ndim != 1:
        raise ValueError("logmmse expects a 1-D (mono) signal")
    n_samples = samples.size
    if n_samples == 0:
        return np.zeros(0, dtype=np.float64)

    # Pre-processing: remove the DC offset, then normalise to unit peak.
    samples = samples - np.mean(samples)
    in_peak = np.max(np.abs(samples))
    if in_peak == 0:
        return np.zeros(n_samples, dtype=np.float64)
    samples = samples / in_peak

    # Framing. Half a frame of reflected signal is added on both sides so that
    # every real sample is covered by exactly two windows; periodic Hann
    # windows at 50% overlap then sum to one and plain overlap-add
    # reconstructs the signal without edge fading.
    n_hops = -(-n_samples // hop)  # ceil(n_samples / hop)
    num_frames = n_hops + 1
    padded = np.pad(samples, (hop, (n_hops + 1) * hop - n_samples), mode="reflect")
    window = signal.get_window("hann", frame_len)
    frame_index = hop * np.arange(num_frames)[:, None] + np.arange(frame_len)[None, :]
    spectra = np.fft.rfft(padded[frame_index] * window, axis=1)
    power = np.abs(spectra) ** 2

    # Initial noise spectrum: mean power of the leading frames, or a flat
    # spectrum derived from noise_mu when no leading frames are requested.
    lead = min(max(int(noise_frames), 0), num_frames)
    if lead > 0:
        noise_psd = power[:lead].mean(axis=0)
    else:
        noise_psd = np.full(power.shape[1], float(noise_mu) * np.sum(window ** 2))
    noise_psd = np.maximum(noise_psd, tiny)

    # Seeding the previous clean power with the noise level makes the first
    # frame start from an a-priori SNR of `alpha`.
    prev_clean_power = SSub * noise_psd
    output = np.zeros(padded.size)

    for i in range(num_frames):
        scaled_noise = SSub * noise_psd
        gamma = np.minimum(power[i] / scaled_noise, gamma_cap)

        # Decision-directed a-priori SNR estimate.
        xi = alpha * prev_clean_power / scaled_noise \
            + (1.0 - alpha) * np.maximum(gamma - 1.0, 0.0)
        xi = np.maximum(xi, xi_floor)
        wiener = xi / (1.0 + xi)

        # Likelihood-ratio voice activity test; noise-only frames update the
        # noise spectrum. The gain below still uses this frame's gamma/xi.
        log_likelihood = gamma * wiener - np.log1p(xi)
        if np.mean(log_likelihood) < vad_threshold:
            noise_psd = noise_smoothing * noise_psd \
                + (1.0 - noise_smoothing) * power[i]
            noise_psd = np.maximum(noise_psd, tiny)

        # Log-MMSE gain: G = xi/(1+xi) * exp(0.5 * E1(v)), v = gamma*xi/(1+xi).
        # v is floored because E1(0) is infinite; the gain is capped at one so
        # no bin is ever amplified.
        v = np.maximum(wiener * gamma, 1e-10)
        gain = np.minimum(wiener * np.exp(0.5 * exp1(v)), 1.0)

        clean = spectra[i] * gain
        prev_clean_power = np.abs(clean) ** 2
        begin = i * hop
        output[begin:begin + frame_len] += np.fft.irfft(clean, n=frame_len)

    # Drop the reflected padding so the result lines up with the input.
    denoised = output[hop:hop + n_samples]

    # Restore the amplitude: match the output peak to the peak the input had
    # before normalisation.
    out_peak = np.max(np.abs(denoised))
    if out_peak == 0:
        return np.zeros(n_samples, dtype=np.float64)
    return denoised * (in_peak / out_peak)
```
最后,我们可以读取一个wav文件,调用logmmse函数进行降噪,然后保存降噪后的文件:
```python
# Read the input WAV file: sample rate in Hz plus the raw samples.
rate, sig = wavfile.read('test.wav')
# Multi-channel files are read as a (samples, channels) array, while logmmse
# treats its input as a single stream of samples, so mix down to mono first.
if sig.ndim > 1:
    sig = sig.mean(axis=1)
# Denoise.
sig_out = logmmse(sig, rate)
# Save the result as 16-bit PCM. Round and clip before converting: a bare
# float -> int16 cast truncates toward zero and gives undefined results for
# samples outside the int16 range.
pcm_range = np.iinfo(np.int16)
sig_out = np.clip(np.rint(sig_out), pcm_range.min, pcm_range.max)
wavfile.write('test_out.wav', rate, sig_out.astype(np.int16))
```
注意,logmmse函数中的参数可以根据实际情况进行调整,例如:
- noise_frames:用于估计噪声能量的帧数
- SSub:信噪比修正因子,计算帧信噪比时相当于把噪声能量乘以 SSub(即从信噪比的 dB 值中减去 10·log10(SSub)),取 1 表示不作修正
- alpha:平滑系数
- over_sub:连续多少帧使用xi_w_last+1进行谱增益调整
- noise_mu:初始噪声能量估计值;只要用前 noise_frames 帧估计了噪声,该传入值就会被估计结果取代
阅读全文