python实现mfcc
时间: 2023-09-27 07:08:48 浏览: 135
MFCC(Mel频率倒谱系数)是一种常用的语音信号特征提取方法,下面是Python实现MFCC的示例代码:
```python
import numpy as np
from scipy.fftpack import dct
def pre_emphasis(signal, pre_emphasis_coeff=0.97):
    """Apply a first-order pre-emphasis (high-pass) filter to a signal.

    The first sample is passed through unchanged; every later sample has
    ``pre_emphasis_coeff`` times its predecessor subtracted from it:
    ``y[0] = x[0]`` and ``y[t] = x[t] - pre_emphasis_coeff * x[t - 1]``.

    Args:
        signal: NumPy array of samples (must support slicing and
            element-wise arithmetic, and contain at least one sample).
        pre_emphasis_coeff: Weight applied to the previous sample.

    Returns:
        A flat NumPy array holding the filtered samples.
    """
    current = signal[1:]
    previous = signal[:-1]
    filtered_tail = current - pre_emphasis_coeff * previous
    # Flatten both pieces before joining them, so the untouched first sample
    # and the filtered remainder end up in a single 1-D array.
    return np.concatenate((np.ravel(signal[0]), np.ravel(filtered_tail)))
def framing(signal, frame_length, frame_step, window_func=None, sampling_rate=None):
    """Slice a 1-D signal into overlapping frames, zero-padding the tail.

    Args:
        signal: 1-D sequence of samples.
        frame_length: Length of one frame. Interpreted as seconds when
            ``sampling_rate`` is given, otherwise as a fraction of the total
            signal length.
        frame_step: Hop between the starts of consecutive frames, in the same
            unit as ``frame_length``.
        window_func: Optional callable that takes the frame size in samples
            and returns a window of that length; every frame is multiplied
            by it.
        sampling_rate: Optional number of samples per second. When ``None``
            (the default) the sizes are fractions of the signal length.

    Returns:
        A 2-D array of shape ``(num_frames, frame_size)``. The frames cover
        every input sample; the last frame is zero-padded where it runs past
        the end of the signal.

    Raises:
        ValueError: If the frame length or the frame step works out to less
            than one sample.
    """
    signal_length = len(signal)
    if sampling_rate is None:
        frame_size = int(frame_length * signal_length)
        step_size = int(frame_step * signal_length)
    else:
        # round() instead of truncation: products such as 0.029 * 1000 land
        # just below the intended integer in floating point.
        frame_size = int(round(frame_length * sampling_rate))
        step_size = int(round(frame_step * sampling_rate))
    if frame_size < 1 or step_size < 1:
        raise ValueError(
            "frame_length and frame_step must each span at least one sample "
            "(got frame_size=%d, step_size=%d)" % (frame_size, step_size)
        )

    if signal_length <= frame_size:
        num_frames = 1
    else:
        # The leading "1 +" accounts for the frame that starts at sample 0;
        # without it the final frame (and the samples only it covers) is lost.
        num_frames = 1 + int(np.ceil((signal_length - frame_size) / step_size))

    padded_length = (num_frames - 1) * step_size + frame_size
    padded_signal = np.append(signal, np.zeros(padded_length - signal_length))

    # Row i holds the sample indices of frame i: start offset + 0..frame_size-1.
    frame_starts = np.arange(num_frames) * step_size
    indices = frame_starts[:, np.newaxis] + np.arange(frame_size)
    frames = padded_signal[indices]  # fancy indexing returns a fresh copy

    if window_func is not None:
        frames *= window_func(frame_size)
    return frames
def power_spectrum(frames, nfft):
    """Compute the power spectrum of each frame.

    Args:
        frames: Array whose last axis holds the samples of one frame.
        nfft: FFT length passed to ``np.fft.rfft``.

    Returns:
        Array with the same leading shape as ``frames`` and the real-FFT
        output bins on the last axis, holding ``|FFT|^2 / nfft``.
    """
    scale = 1.0 / nfft
    magnitude = np.absolute(np.fft.rfft(frames, n=nfft))
    return scale * (magnitude * magnitude)
def mel_filter_bank(num_filters, nfft, sampling_rate, low_freq, high_freq):
    """Build a bank of triangular filters spaced evenly on the mel scale.

    Args:
        num_filters: Number of triangular filters (rows of the result).
        nfft: FFT length the filters will be applied to.
        sampling_rate: Sampling rate used to map frequencies to FFT bins.
        low_freq: Lower edge of the first filter, same unit as
            ``sampling_rate``.
        high_freq: Upper edge of the last filter, same unit as
            ``sampling_rate``.

    Returns:
        Array of shape ``(num_filters, floor(nfft / 2 + 1))``; row ``j`` rises
        linearly from 0 at its left edge bin to 1 at its centre bin, then
        falls linearly towards its right edge bin.
    """
    # num_filters triangles need num_filters + 2 edge points, evenly spaced
    # in mel and then mapped back to frequency.
    mel_low = 1127 * np.log(1 + low_freq / 700)
    mel_high = 1127 * np.log(1 + high_freq / 700)
    mel_edges = np.linspace(mel_low, mel_high, num_filters + 2)
    freq_edges = 700 * (np.exp(mel_edges / 1127) - 1)
    edge_bins = np.floor((nfft + 1) * freq_edges / sampling_rate).astype(int)

    num_bins = int(np.floor(nfft / 2 + 1))
    bank = np.zeros((num_filters, num_bins))

    triangles = zip(edge_bins[:-2], edge_bins[1:-1], edge_bins[2:])
    for row, (left, center, right) in enumerate(triangles):
        # Skip zero-width slopes (adjacent edges that floor to the same bin).
        if center > left:
            rising = np.arange(left, center)
            bank[row, rising] = (rising - left) / (center - left)
        if right > center:
            falling = np.arange(center, right)
            bank[row, falling] = (right - falling) / (right - center)
    return bank
def mfcc(
    signal,
    sampling_rate,
    num_filters=26,
    ncoeff=13,
    nfft=512,
    frame_length=0.025,
    frame_step=0.01,
    pre_emphasis_coeff=0.97,
    window_func=np.hamming,
):
    """Compute Mel-frequency cepstral coefficients (MFCC) of a signal.

    Pipeline: pre-emphasis -> framing and windowing -> power spectrum ->
    mel filter bank energies -> log -> DCT-II.

    Args:
        signal: 1-D NumPy array of samples.
        sampling_rate: Samples per second.
        num_filters: Number of mel filters.
        ncoeff: Number of cepstral coefficients to keep per frame.
        nfft: FFT length. Frames longer than ``nfft`` samples are cropped by
            ``np.fft.rfft``, so keep ``nfft >= frame_length * sampling_rate``.
        frame_length: Frame length in seconds.
        frame_step: Hop between consecutive frames in seconds.
        pre_emphasis_coeff: Pre-emphasis filter coefficient.
        window_func: Callable returning a window for a given frame size, or
            ``None`` for no windowing.

    Returns:
        2-D array with one row per frame. The columns are DCT coefficients
        ``1 .. ncoeff`` (coefficient 0 is dropped), so there are at most
        ``ncoeff`` of them.
    """
    signal = pre_emphasis(signal, pre_emphasis_coeff)

    # Convert the durations to whole sample counts using the sampling rate,
    # so the frame size does not depend on how long the recording is.
    num_samples = len(signal)
    frame_size = int(round(frame_length * sampling_rate))
    step_size = int(round(frame_step * sampling_rate))

    # framing() takes its sizes as fractions of the signal length and
    # truncates `fraction * num_samples` to an int. Aiming at the middle of
    # the target integer (+0.5) makes that truncation land on exactly
    # frame_size / step_size despite floating-point rounding.
    frames = framing(
        signal,
        (frame_size + 0.5) / num_samples,
        (step_size + 0.5) / num_samples,
        window_func,
    )

    pspec = power_spectrum(frames, nfft)
    fbank = mel_filter_bank(num_filters, nfft, sampling_rate, 0, sampling_rate / 2)
    feat = np.dot(pspec, fbank.T)
    # Replace exact zeros so the logarithm stays finite.
    feat = np.where(feat == 0, np.finfo(float).eps, feat)
    feat = np.log(feat)
    # Keep coefficients 1..ncoeff; coefficient 0 is discarded.
    feat = dct(feat, type=2, axis=1, norm='ortho')[:, 1 : (ncoeff + 1)]
    return feat
```
这里的`signal`是输入的语音信号,`sampling_rate`是采样率,`num_filters`是Mel滤波器组的数量,`ncoeff`是MFCC系数的数量,`nfft`是FFT的点数,`frame_length`是帧长(以秒为单位),`frame_step`是帧移(以秒为单位),`pre_emphasis_coeff`是预加重系数,`window_func`是窗函数(默认为Hamming窗)。
这个函数返回一个二维矩阵,每一行对应一帧,每一列对应一个MFCC系数(最多`ncoeff`列,不包含第0个倒谱系数)。
阅读全文