流程圖:
從 人聲的模擬信號 獲得 MFCC的梅爾倒譜
ios
%% % r = audiorecorder(16000, 16, 1); % record(r); % servel seconds % stop(r); % mySpeech = getaudiodata(r); % figure;plot(mySpeech);title('mySpeech'); %% mySpeech = wavread('mySpeech.wav', 'native'); figure;plot(mySpeech);title('mySpeech'); SizeOfmySpeech = size(mySpeech, 1); for i = 2 : SizeOfmySpeech mySpeech(i) = mySpeech(i) - 0.95 * mySpeech(i-1); end figure;plot(mySpeech);title('mySpeech_fix');
錄音的要求是採用率爲16000Hz,量化爲16bit數據結構
ret_value temp; short waveData2[60000]; int main() { load_wave_file("mySpeech.wav", &temp, waveData2); return 0; }
總共有60000個採樣點函數
void setHammingWindow(float* frameWindow){ for(int i = 0; i < FRAMES_PER_BUFFER; i++){ frameWindow[i] = 0.54 - 0.46*cos(2 * PI * i / (FRAMES_PER_BUFFER - 1)); } } void setHanningWindow(float* frameWindow){ for(int i = 0; i < FRAMES_PER_BUFFER; i++){ frameWindow[i] = 0.5 - 0.5*cos(2 * PI * i / (FRAMES_PER_BUFFER - 1)); } } void setBlackManWindow(float* frameWindow){ for(int i = 0; i < FRAMES_PER_BUFFER; i++){ frameWindow[i] = 0.42 - 0.5*cos(2 * PI * i / (FRAMES_PER_BUFFER - 1)) + 0.08*cos(4 * PI*i / (FRAMES_PER_BUFFER - 1)); } }
這次選取的是海明窗spa
// 加窗操做 int seg_shift = (i - 1) * NOT_OVERLAP; for(j = 0; j < FRAMES_PER_BUFFER && (seg_shift + j) < numSamples; j++){ afterWin[j] = spreemp[seg_shift + j] * frameWindow[j]; }
每次分幀,數據點變爲400個點3d
// 知足FFT爲2^n個點,補零操做 for(int k = j - 1; k < LEN_SPECTRUM; k++){ afterWin[k] = 0; }
知足fft操做須要,補零至512個點code
void FFT_Power(float* in, float* energySpectrum){ fftwf_complex* out = (fftwf_complex*)fftwf_malloc(sizeof(fftwf_complex)*LEN_SPECTRUM); fftwf_plan p = fftwf_plan_dft_r2c_1d(LEN_SPECTRUM, in, out, FFTW_ESTIMATE); fftwf_execute(p); for(int i = 0; i < LEN_SPECTRUM; i++){ energySpectrum[i] = out[i][0] * out[i][0] + out[i][1] * out[i][1]; } fftwf_destroy_plan(p); fftwf_free(out); }
這裏用到了MIT大學的開源FFT變換庫fftw3.h,快速計算能量譜(能夠搜索下載根據本身的IDE配置)orm
void computeMel(float* mel, int sampleRate, const float* energySpectrum){ int fmax = sampleRate / 2; float maxMelFreq = 1125 * log(1 + fmax / 700);
// 計算頻譜到梅爾譜的映射關係 for(int i = 0; i < NUM_FILTER + 2; i++){ m[i] = i*delta; h[i] = 700 * (exp(m[i] / 1125) - 1); f[i] = floor((256 + 1)*h[i] / sampleRate); }
// 梅爾濾波 for(int i = 0; i < NUM_FILTER; i++){ for(int j = 0; j < 256; j++){ if(j >= melFilters[i][0] && j <= melFilters[i][1]){ mel[i] += ((j - melFilters[i][0]) / (melFilters[i][1] - melFilters[i][0]))*energySpectrum[j]; } else if(j > melFilters[i][1] && j <= melFilters[i][2]){ mel[i] += ((melFilters[i][2] - j) / (melFilters[i][2] - melFilters[i][1]))*energySpectrum[j]; } } }
一共選擇了40個三角濾波器,最後的梅爾譜也是40個點blog
void DCT(const float* mel, float* melRec){ for(int i = 0; i < LEN_MELREC; i++){ for(int j = 0; j < NUM_FILTER; j++){ if(mel[j] <= -0.0001 || mel[j] >= 0.0001){ melRec[i] += log(mel[j])*cos(PI*i / (2 * NUM_FILTER)*(2 * j + 1)); } } } }
把40個點的梅爾譜映射到13維的倒譜上。取對數作離散餘弦變換內存
// 歸一化處理 for(int i = 0; i < LEN_MELREC; i++){ sumMelRec[i] = sqrt(sumMelRec[i] / numFrames); } fstream fout("All_MelRec.txt", ios::out); fstream fout2("All_MelRec_Bef.txt", ios::out); for(int i = 0; i < numFrames; i++){ for(int j = 0; j < LEN_MELREC; j++){ fout2 << mulMelRec[i][j] << " "; mulMelRec[i][j] /= sumMelRec[j]; fout << mulMelRec[i][j] << " "; } fout << endl; fout2 << endl; }
使得最終的結果數據聚攏,易於觀察ci
%% 讀取原始音頻文件 fidin = fopen('wavData.txt', 'r'); len_waveData = fscanf(fidin, '%d', 1); waveData = zeros(len_waveData, 1); for i = 1 : 1 : len_waveData waveData(i) = fscanf(fidin, '%d', 1); end fclose(fidin); subplot(2, 3, 1); plot(1:len_waveData, waveData); axis([0 400 -2 2]); title('原始音頻文件');
%% 梅爾倒譜的色域 A = load('All_MelRec_Bef.txt'); figure; imagesc(A'); hold on colorbar; title('梅爾倒譜的色域'); %% 梅爾倒譜的色域(歸一化) B = load('All_MelRec.txt'); figure; imagesc(B'); hold on colorbar; title('梅爾倒譜的色域(歸一化)');
其他輸出操做是相同的,操做見最後的完整代碼
錄音後的原始音頻信號
總共有6000個採樣點,量化爲16bit,所以數據量級能達到10^4
MFCC操做中,第五幀的結果流程
原始音頻分幀後,每一幀是400的點,從結果來看,在一幀的時間長度裏面,數據變化不大,幅值維持在 [-1 1] 之間浮動。(如選取其餘幀能夠看到變化比較明顯,看看原始音頻就知道了)
加窗操做後,端點值被明顯收斂到0,所以不會對能量譜的計算產生突變的狀況。
能量譜和梅爾譜能夠看出,與咱們已知的人聲特色相關。
歸一化以前的梅爾倒譜
高頻能量集中在較低的維度,和能量譜的顯示吻合
歸一化的梅爾倒譜
歸一化以後,相比未歸一化的圖,較高維度的能量可以較好地被分辨出來,易於分析
至此,梅爾倒譜工做完成。
matlab錄音文件 main.m
clear all close all clc %% % r = audiorecorder(16000, 16, 1); % record(r); % servel seconds % stop(r); % mySpeech = getaudiodata(r); % figure;plot(mySpeech);title('mySpeech'); %% mySpeech = wavread('mySpeech.wav', 'native'); figure;plot(mySpeech);title('mySpeech'); SizeOfmySpeech = size(mySpeech, 1); for i = 2 : SizeOfmySpeech mySpeech(i) = mySpeech(i) - 0.95 * mySpeech(i-1); end figure;plot(mySpeech);title('mySpeech_fix');
C++主函數文件 main.cpp
#include<iostream> #include "fftw3.h" #include"MFCC.h" #include"wav.h" using namespace std; int wavLen; double waveData[60000]; ret_value temp; short waveData2[60000]; int main() { /*wavLen = wavread("mySpeech.txt", waveData); if(wavLen == -1) exit(0);*/ load_wave_file("mySpeech.wav", &temp, waveData2); MFCC(waveData2, 60000, 16000); system("pause"); return 0; }
C++音頻定義頭文件 wav.h
#ifndef _WAV_H #define _WAV_H #define MAXDATA (512*400) //通常採樣數據大小,語音文件的數據不能大於該數據 #define SFREMQ (16000) //採樣數據的採樣頻率8khz #define NBIT 16 typedef struct WaveStruck{//wav數據結構 //data head struct HEAD{ char cRiffFlag[4]; int nFileLen; char cWaveFlag[4];//WAV文件標誌 char cFmtFlag[4]; int cTransition; short nFormatTag; short nChannels; int nSamplesPerSec;//採樣頻率,mfcc爲8khz int nAvgBytesperSec; short nBlockAlign; short nBitNumPerSample;//樣本數據位數,mfcc爲12bit } head; //data block struct BLOCK{ char cDataFlag[4];//數據標誌符(data) int nAudioLength;//採樣數據總數 } block; } WAVE; int wavread(char* filename, double* destination); struct ret_value { char *data; unsigned long size; ret_value() { data = 0; size = 0; } }; void load_wave_file(char *fname, struct ret_value *ret, short* waveData2); #endif
C++音頻實現文件 wav.cpp
#include"wav.h" #include<cstdio> #include<cstring> #include<malloc.h> int wavread(char* filename, double* destination){ WAVE wave[1]; FILE * f; f = fopen(filename, "rb"); if(!f) { printf("Cannot open %s for reading\n", filename); return -1; } //讀取wav文件頭而且分析 fread(wave, 1, sizeof(wave), f); if(wave[0].head.cWaveFlag[0] == 'W'&&wave[0].head.cWaveFlag[1] == 'A' &&wave[0].head.cWaveFlag[2] == 'V'&&wave[0].head.cWaveFlag[3] == 'E')//判斷是不是wav文件 { printf("It's not .wav file\n"); return -1; } if(wave[0].head.nSamplesPerSec != SFREMQ || wave[1].head.nBitNumPerSample != NBIT)//判斷是否採樣頻率是16khz,16bit量化 { printf("It's not 16khz and 16 bit\n"); return -1; } if(wave[0].block.nAudioLength>MAXDATA / 2)//wav文件不能太大,爲sample長度的一半 { printf("wav file is to long\n"); return -1; } //讀取採樣數據 fread(destination, sizeof(char), wave[0].block.nAudioLength, f); fclose(f); return wave[0].block.nAudioLength; } void load_wave_file(char *fname, struct ret_value *ret, short* waveData2) { FILE *fp; fp = fopen(fname, "rb"); if(fp) { char id[5]; // 5個字節存儲空間存儲'RIFF'和'\0',這個是爲方便利用strcmp unsigned long size; // 存儲文件大小 short format_tag, channels, block_align, bits_per_sample; // 16位數據 unsigned long format_length, sample_rate, avg_bytes_sec, data_size; // 32位數據 fread(id, sizeof(char), 4, fp); // 讀取'RIFF' id[4] = '\0'; if(!strcmp(id, "RIFF")) { fread(&size, sizeof(unsigned long), 1, fp); // 讀取文件大小 fread(id, sizeof(char), 4, fp); // 讀取'WAVE' id[4] = '\0'; if(!strcmp(id, "WAVE")) { fread(id, sizeof(char), 4, fp); // 讀取4字節 "fmt "; fread(&format_length, sizeof(unsigned long), 1, fp); fread(&format_tag, sizeof(short), 1, fp); // 讀取文件tag fread(&channels, sizeof(short), 1, fp); // 讀取通道數目 fread(&sample_rate, sizeof(unsigned long), 1, fp); // 讀取採樣率大小 fread(&avg_bytes_sec, sizeof(unsigned long), 1, fp); // 讀取每秒數據量 fread(&block_align, sizeof(short), 1, fp); // 讀取塊對齊 fread(&bits_per_sample, sizeof(short), 1, fp); // 讀取每同樣本大小 fread(id, sizeof(char), 4, fp); // 讀入'data' fread(&data_size, sizeof(unsigned long), 1, fp); // 讀取數據大小 ret->size = data_size; ret->data = (char*)malloc(sizeof(char)*data_size); // 申請內存空間 //fread(ret->data, sizeof(char), data_size, fp); // 讀取數據 fread(waveData2, sizeof(short), data_size, fp); // my fix } else { printf("Error: RIFF file but not a wave file\n"); } } else { printf("Error: not a RIFF file\n"); } } }
C++實現MFCC.h
#ifndef _MFCC_H #define _MFCC_H #define FRAMES_PER_BUFFER 400 #define NOT_OVERLAP 200 #define NUM_FILTER 40 #define PI 3.1415926 #define LEN_SPECTRUM 512 #define LEN_MELREC 13 void MFCC(const short* waveData, int numSamples, int sampleRate); void preEmphasizing(const short* waveData, float* spreemp, int numSamples, float heavyFactor); void setHammingWindow(float* frameWindow); void setHanningWindow(float* frameWindow); void setBlackManWindow(float* frameWindow); void FFT_Power(float* in, float* energySpectrum); void computeMel(float* mel, int sampleRate, const float* energySpectrum); void DCT(const float* mel, float* melRec); #endif
C++實現MFCC.cpp
#include"MFCC.h" #include"fftw3.h" #include<cmath> #include<cstring> #include<fstream> #include<string> using namespace std; template<class T> void print_Array(T* arr, int len, string filename); #define TORPINT true #define PRINT_FRAME 100 float mulMelRec[500][LEN_MELREC]; void MFCC(const short* waveData, int numSamples, int sampleRate){ if(TORPINT) print_Array(waveData, 60000, "wavDataAll.txt"); // 預加劇 float* spreemp = new float[numSamples]; preEmphasizing(waveData, spreemp, numSamples, -0.95); if(TORPINT) print_Array(waveData, 60000, "spreempAll.txt"); // 計算幀的數量 int numFrames = ceil((numSamples - FRAMES_PER_BUFFER) / NOT_OVERLAP) + 1; // 申請內存 float* frameWindow = new float[FRAMES_PER_BUFFER]; float* afterWin = new float[LEN_SPECTRUM]; float* energySpectrum = new float[LEN_SPECTRUM]; float* mel = new float[NUM_FILTER]; float* melRec = new float[LEN_MELREC]; /*float** mulMelRec = new float*[numFrames + 200]; for(int i = 0; i < numFrames; i++){ mulMelRec[i] = new float[LEN_MELREC]; }*/ float* sumMelRec = new float[LEN_MELREC]; memset(sumMelRec, 0, sizeof(float)*LEN_MELREC); memset(mulMelRec, 0, sizeof(float)*numFrames*LEN_MELREC); // 設置窗參數 setHammingWindow(frameWindow); //setHanningWindow(frameWindow); //setBlackManWindow(frameWindow); // 幀操做 for(int i = 0; i < numFrames; i++){ if(TORPINT && i == PRINT_FRAME) print_Array(waveData, FRAMES_PER_BUFFER, "wavData.txt"); if(TORPINT && i == PRINT_FRAME) print_Array(waveData, FRAMES_PER_BUFFER, "spreemp.txt"); int j; // 加窗操做 int seg_shift = (i - 1) * NOT_OVERLAP; for(j = 0; j < FRAMES_PER_BUFFER && (seg_shift + j) < numSamples; j++){ afterWin[j] = spreemp[seg_shift + j] * frameWindow[j]; } // 知足FFT爲2^n個點,補零操做 for(int k = j - 1; k < LEN_SPECTRUM; k++){ afterWin[k] = 0; } if(TORPINT && i == PRINT_FRAME) print_Array(afterWin, LEN_SPECTRUM, "After.txt"); // 計算能量譜 FFT_Power(afterWin, energySpectrum); if(TORPINT && i == PRINT_FRAME) print_Array(energySpectrum, LEN_SPECTRUM, "energySpectrum.txt"); // 計算梅爾譜 memset(mel, 0, sizeof(float)*NUM_FILTER); computeMel(mel, sampleRate, energySpectrum); if(TORPINT && i == PRINT_FRAME) print_Array(mel, NUM_FILTER, "mel.txt"); // 計算離散餘弦變換 memset(melRec, 0, sizeof(float)*LEN_MELREC); DCT(mel, melRec); if(TORPINT && i == PRINT_FRAME) print_Array(melRec, LEN_MELREC, "melRec.txt"); // 累計總值 for(int p = 0; p < LEN_MELREC; p++){ mulMelRec[i][p] = melRec[p]; sumMelRec[p] += melRec[p] * melRec[p]; } } // 歸一化處理 for(int i = 0; i < LEN_MELREC; i++){ sumMelRec[i] = sqrt(sumMelRec[i] / numFrames); } fstream fout("All_MelRec.txt", ios::out); fstream fout2("All_MelRec_Bef.txt", ios::out); for(int i = 0; i < numFrames; i++){ for(int j = 0; j < LEN_MELREC; j++){ fout2 << mulMelRec[i][j] << " "; mulMelRec[i][j] /= sumMelRec[j]; fout << mulMelRec[i][j] << " "; } fout << endl; fout2 << endl; } fout.close(); fout2.close(); // 釋放內存 delete[] spreemp; delete[] frameWindow; delete[] afterWin; delete[] energySpectrum; delete[] mel; delete[] melRec; delete[] sumMelRec; /*for(int i = 0; i < LEN_MELREC; i++){ delete[] mulMelRec[i]; } delete[] mulMelRec;*/ } void preEmphasizing(const short* waveData, float* spreemp, int numSamples, float heavyFactor){ spreemp[0] = (float)waveData[0]; for(int i = 1; i < numSamples; i++){ spreemp[i] = waveData[i] + heavyFactor * waveData[i - 1]; } } void setHammingWindow(float* frameWindow){ for(int i = 0; i < FRAMES_PER_BUFFER; i++){ frameWindow[i] = 0.54 - 0.46*cos(2 * PI * i / (FRAMES_PER_BUFFER - 1)); } } void setHanningWindow(float* frameWindow){ for(int i = 0; i < FRAMES_PER_BUFFER; i++){ frameWindow[i] = 0.5 - 0.5*cos(2 * PI * i / (FRAMES_PER_BUFFER - 1)); } } void setBlackManWindow(float* frameWindow){ for(int i = 0; i < FRAMES_PER_BUFFER; i++){ frameWindow[i] = 0.42 - 0.5*cos(2 * PI * i / (FRAMES_PER_BUFFER - 1)) + 0.08*cos(4 * PI*i / (FRAMES_PER_BUFFER - 1)); } } void FFT_Power(float* in, float* energySpectrum){ fftwf_complex* out = (fftwf_complex*)fftwf_malloc(sizeof(fftwf_complex)*LEN_SPECTRUM); fftwf_plan p = fftwf_plan_dft_r2c_1d(LEN_SPECTRUM, in, out, FFTW_ESTIMATE); fftwf_execute(p); for(int i = 0; i < LEN_SPECTRUM; i++){ energySpectrum[i] = out[i][0] * out[i][0] + out[i][1] * out[i][1]; } fftwf_destroy_plan(p); fftwf_free(out); } void computeMel(float* mel, int sampleRate, const float* energySpectrum){ int fmax = sampleRate / 2; float maxMelFreq = 1125 * log(1 + fmax / 700); int delta = (int)(maxMelFreq / (NUM_FILTER + 1)); // 申請空間 float** melFilters = new float*[NUM_FILTER]; for(int i = 0; i < NUM_FILTER; i++){ melFilters[i] = new float[3]; } float* m = new float[NUM_FILTER + 2]; float* h = new float[NUM_FILTER + 2]; float* f = new float[NUM_FILTER + 2]; // 計算頻譜到梅爾譜的映射關係 for(int i = 0; i < NUM_FILTER + 2; i++){ m[i] = i*delta; h[i] = 700 * (exp(m[i] / 1125) - 1); f[i] = floor((256 + 1)*h[i] / sampleRate); } // 計算梅爾濾波參數 for(int i = 0; i < NUM_FILTER; i++){ for(int j = 0; j < 3; j++){ melFilters[i][j] = f[i + j]; } } // 梅爾濾波 for(int i = 0; i < NUM_FILTER; i++){ for(int j = 0; j < 256; j++){ if(j >= melFilters[i][0] && j <= melFilters[i][1]){ mel[i] += ((j - melFilters[i][0]) / (melFilters[i][1] - melFilters[i][0]))*energySpectrum[j]; } else if(j > melFilters[i][1] && j <= melFilters[i][2]){ mel[i] += ((melFilters[i][2] - j) / (melFilters[i][2] - melFilters[i][1]))*energySpectrum[j]; } } } // 釋放內存 for(int i = 0; i < 3; i++){ delete[] melFilters[i]; } delete[] melFilters; delete[] m; delete[] h; delete[] f; } void DCT(const float* mel, float* melRec){ for(int i = 0; i < LEN_MELREC; i++){ for(int j = 0; j < NUM_FILTER; j++){ if(mel[j] <= -0.0001 || mel[j] >= 0.0001){ melRec[i] += log(mel[j])*cos(PI*i / (2 * NUM_FILTER)*(2 * j + 1)); } } } } template<class T> void print_Array(T* arr, int len, string filename){ fstream fout(filename, ios::out); fout << len << endl; for(int i = 0; i < len; i++){ fout << arr[i] << " "; } fout << endl; fout.close(); return; }
Matlab實現輸出觀察文件 Matlab_print.m
clear all close all clc %% 原始音頻全部 fidin = fopen('wavDataAll.txt', 'r'); len_waveData = fscanf(fidin, '%d', 1); waveData = zeros(len_waveData, 1); for i = 1 : 1 : len_waveData waveData(i) = fscanf(fidin, '%d', 1); end fclose(fidin); subplot(2, 3, 1); plot(1:len_waveData, waveData); title('原始音頻文件'); fidin = fopen('spreempAll.txt', 'r'); len_spreemp = fscanf(fidin, '%d', 1); spreemp = zeros(len_spreemp, 1); for i = 1 : 1 : len_spreemp spreemp(i) = fscanf(fidin, '%d', 1); end fclose(fidin); subplot(2, 3, 2); plot(1:len_spreemp, waveData); title('預加劇音頻文件'); figure; %% 讀取原始音頻文件 fidin = fopen('wavData.txt', 'r'); len_waveData = fscanf(fidin, '%d', 1); waveData = zeros(len_waveData, 1); for i = 1 : 1 : len_waveData waveData(i) = fscanf(fidin, '%d', 1); end fclose(fidin); subplot(2, 3, 1); plot(1:len_waveData, waveData); axis([0 400 -2 2]); title('原始音頻文件'); %% 讀取預加劇的音頻 fidin = fopen('spreemp.txt', 'r'); len_spreemp = fscanf(fidin, '%d', 1); spreemp = zeros(len_spreemp, 1); for i = 1 : 1 : len_spreemp spreemp(i) = fscanf(fidin, '%d', 1); end fclose(fidin); subplot(2, 3, 2); plot(1:len_spreemp, waveData); axis([0 400 -2 2]); title('預加劇音頻文件'); %% 加窗操做 fidin = fopen('After.txt', 'r'); len_AfterWin = fscanf(fidin, '%d', 1); AfterWin = zeros(len_AfterWin, 1); for i = 1 : 1 : len_AfterWin AfterWin(i) = fscanf(fidin, '%f', 1); end fclose(fidin); subplot(2, 3, 3); plot(1:len_AfterWin, AfterWin); grid on title('加窗操做'); %% 能量譜 fidin = fopen('energySpectrum.txt', 'r'); len_energySpectrum = fscanf(fidin, '%d', 1); energySpectrum = zeros(len_energySpectrum, 1); for i = 1 : 1 : len_energySpectrum energySpectrum(i) = fscanf(fidin, '%f', 1); end fclose(fidin); subplot(2, 3, 4); plot(1:len_energySpectrum, energySpectrum); grid on title('能量譜'); %% 梅爾譜 fidin = fopen('mel.txt', 'r'); len_mel = fscanf(fidin, '%d', 1); mel = zeros(len_mel, 1); for i = 1 : 1 : len_mel mel(i) = fscanf(fidin, '%f', 1); end fclose(fidin); subplot(2, 3, 5); plot(1:len_mel, mel); grid on title('梅爾譜'); %% 梅爾倒譜 fidin = fopen('melRec.txt', 'r'); len_melRec = fscanf(fidin, '%d', 1); melRec = zeros(len_melRec, 1); for i = 1 : 1 : len_melRec melRec(i) = fscanf(fidin, '%f', 1); end fclose(fidin); subplot(2, 3, 6); stem(1:len_melRec, melRec); grid on title('梅爾倒譜'); %% 梅爾倒譜的色域 A = load('All_MelRec_Bef.txt'); figure; imagesc(A'); hold on colorbar; title('梅爾倒譜的色域'); %% 梅爾倒譜的色域(歸一化) B = load('All_MelRec.txt'); figure; imagesc(B'); hold on colorbar; title('梅爾倒譜的色域(歸一化)');