最近下載了一批類似百家講壇的音頻文件。這些文件的前面部分是演講類的音頻,主要講歷史,後面一部分是音樂,每一個文件的音樂都不同。
因此希望把文件切割,把音樂部分切走,只留下演講部分。
這裏一個技術難點就是怎麼識別哪些音頻是演講,哪些音頻是音樂。
通過KNN算法,對1s音頻片段的預測正確率是92%。
同時,連續3s都判斷爲音樂才進行分割,整個文件的分割正確率是98%。
音頻文件和源碼可以在這裏下載:GitHub
# encoding=gbk
"""Decode an audio file into a normalised mono waveform and plot it.

WAV header fields read by ``pre_deal``:
    nchannels  number of channels
    sampwidth  sample width
    framerate  frames per second
    nframes    total number of frames in the file
"""
import random
import wave
try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: decoding and conversion still work without it.
    plt = None
import numpy as np
import os


def pre_deal(file_path, rate=20.0):
    """Parse a WAV file and return its down-sampled, normalised waveform.

    Args:
        file_path: path of the WAV file to read.
        rate: down-sampling factor; one frame out of every ``int(rate)`` is
            kept. The default matches the previously hard-coded value.

    Returns:
        A tuple ``(waveData, framerate, nframes)``: the first channel as a
        float array scaled by its peak absolute value, and the frame rate and
        frame count from the header, each divided by the down-sampling step.
        A silent or empty file yields an unscaled (all-zero or empty) array
        instead of a division by zero.
    """
    wav_file = wave.open(file_path, 'rb')
    try:
        nchannels, sampwidth, framerate, nframes = wav_file.getparams()[:4]
        raw_bytes = wav_file.readframes(nframes)
    finally:
        # Release the handle even if reading fails.
        wav_file.close()

    step = int(rate)
    # NOTE(review): samples are decoded as 16-bit integers regardless of
    # sampwidth -- presumably every input is 16-bit PCM; verify.
    samples = np.frombuffer(raw_bytes, dtype=np.dtype('<i2'))
    samples = samples[::nchannels]  # keep the first channel only
    samples = samples[::step]       # lower the frame rate
    framerate = framerate / float(step)
    nframes = nframes / float(step)

    # Convert before abs(): abs(-32768) overflows in int16 and would
    # understate the peak.
    samples = samples.astype(np.float64)
    peak = float(np.abs(samples).max()) if samples.size else 0.0
    if peak > 0:
        samples = samples / peak
    return samples, framerate, nframes


def plpot(waveData):
    """Plot the waveform against its sample index and show the figure.

    Raises:
        ImportError: if matplotlib could not be imported.
    """
    if plt is None:
        raise ImportError('matplotlib is required to plot the waveform')
    plt.plot(np.arange(len(waveData)), waveData)
    plt.xlabel("Time")
    plt.ylabel("Amplitude")
    plt.title("Single channel wavedata")
    plt.grid(True)  # show the grid
    plt.show()


def mp3towav(file_path, to_file_path):
    """Convert an mp3 file to WAV unless the target already exists.

    Args:
        file_path: source mp3 path.
        to_file_path: destination WAV path.

    Returns:
        ``to_file_path``.
    """
    if os.path.exists(to_file_path):
        return to_file_path
    # Imported lazily so pydub is only needed when a conversion happens.
    from pydub import AudioSegment
    print(file_path)
    song = AudioSegment.from_mp3(file_path)
    song.export(to_file_path, 'wav')
    return to_file_path


if __name__ == '__main__':
    mp3_path = r'D:\BaiduNetdiskDownload\a.mp3'
    wav_path = mp3towav(mp3_path, mp3_path.replace('mp3', 'wav'))
    data, _, _ = pre_deal(wav_path)
    plpot(data)
wave 庫可以識別音頻文件的聲道數、樣本寬度、幀率、幀數等信息。
使用音頻處理軟件 GoldWave,採用人工聽的方法把音頻文件的音樂部分剪掉,剪好的文件放在chg目錄裏面,剪之前的文件放在raw目錄下面。一共剪了18個文件。
class LeaningTest(object):
    """Build training samples from hand-trimmed recordings and split them.

    ``chg_path`` holds the trimmed (speech-only) files and ``raw_path`` the
    untrimmed originals; both are expected to contain ``<n>.mp3`` files.
    The class relies on ``pre_deal``, ``get_index``, ``mp3towav`` and
    ``pickle_utils`` being available in the module namespace.
    """

    chg_path = r'D:\BaiduNetdiskDownload\test\chg'
    raw_path = r'D:\BaiduNetdiskDownload\test\raw'
    model = None

    @classmethod
    def load_model(cls):
        """Load the pickled model into ``cls.model``."""
        cls.model = pickle_utils.load('knn.model.pkl')

    @classmethod
    def chg(cls):
        """Copy every file in both folders to a numbered name (0.mp3, 1.mp3, ...).

        The numbering follows the order of ``os.listdir(cls.chg_path)``; the
        same file name is expected to exist in ``cls.raw_path``.
        """
        for i, f in enumerate(os.listdir(cls.chg_path)):
            numbered = '%s.mp3' % i
            shutil.copy(cls.chg_path + '\\' + f, cls.chg_path + '\\' + numbered)
            shutil.copy(cls.raw_path + '\\' + f, cls.raw_path + '\\' + numbered)

    @classmethod
    def get_path(cls, i, t):
        """Return the path of file number ``i`` in the 'chg' or raw folder."""
        p = cls.chg_path if t == 'chg' else cls.raw_path
        return p + '\\' + '%s.mp3' % i

    @classmethod
    def sample_cnt(cls, sample):
        """Convert sample values into per-interval counts.

        With a step of 0.5, ``[0.1, 0.1, 0.8]`` would become ``[2, 1]``:
        2 values in [0, 0.5) and 1 value in [0.5, 1). The actual step is
        0.025. Values that fall in no interval (for example negative ones)
        are not counted.
        """
        from bisect import bisect_right

        step = 0.025
        # Edges are accumulated by repeated addition on purpose: the exact
        # float boundaries (and therefore the feature length) must stay the
        # same as before so previously saved samples and models remain valid.
        edges = [0]
        while edges[-1] < 1:
            edges.append(edges[-1] + step)

        counts = [0] * (len(edges) - 1)
        upper = edges[-1]
        for value in sample:
            if 0 <= value < upper:
                # Index of the interval with edges[k] <= value < edges[k + 1].
                counts[bisect_right(edges, value) - 1] += 1
        return counts

    @classmethod
    def get_sample(cls, i):
        """Build the labelled samples of file number ``i``.

        Each one-second window of the raw file, starting at second 60, is
        turned into interval counts. Windows before the end of the trimmed
        file are labelled 0, later ones 1. Windows within 5 seconds of that
        boundary are skipped.

        Returns:
            A list of ``(counts, label)`` tuples, e.g. ``[([100, 200], 0)]``.
        """
        chg = cls.to_wav(cls.get_path(i, 'chg'))
        raw = cls.to_wav(cls.get_path(i, 'raw'))
        _, framerate_chg, n_frames_chg = pre_deal(chg)
        total_sec_chg = int(n_frames_chg / framerate_chg)
        data_raw, framerate_raw, n_frames_raw = pre_deal(raw)
        total_sec_raw = int(n_frames_raw / framerate_raw)
        length = 1
        margin = 5
        samples = []
        for sec in range(60, total_sec_raw, length):
            # The previous bounds were both "+ 5", so this guard could never
            # fire; the region around the manual cut is the part to discard.
            if total_sec_chg - margin < sec < total_sec_chg + margin:
                continue
            flag = 0 if sec < total_sec_chg else 1
            # presumably get_index(framerate, minute, second) maps a time to
            # an array index -- verify against its definition
            begin = get_index(framerate_raw, 0, sec)
            end = get_index(framerate_raw, 0, sec + length)
            samples.append((cls.sample_cnt(data_raw[begin:end]), flag))
        return samples

    @classmethod
    def to_wav(cls, file_path):
        """Convert an mp3 path to WAV and return the WAV path.

        Paths whose extension is not ``.mp3`` are returned unchanged. Only
        the extension is rewritten, so an "mp3" inside a directory or base
        name no longer alters the result.
        """
        root, ext = os.path.splitext(file_path)
        if ext.lower() != '.mp3':
            return file_path
        to_file_path = root + '.wav'
        mp3towav(file_path, to_file_path)
        return to_file_path

    @classmethod
    def get_all_sample(cls, file_count=1):
        """Return all samples, using ``sample4.json`` as a cache.

        Args:
            file_count: how many numbered files (0 .. file_count - 1) to read
                when the cache file does not exist. Ignored when it does.

        Returns:
            A list of ``(counts, label)`` pairs; pairs read back from the
            cache are lists rather than tuples.
        """
        file_name = 'sample4.json'
        if os.path.exists(file_name):
            with open(file_name, 'r') as f:
                return json.loads(f.read())
        samples = []
        for i in range(file_count):
            print('get sample %s' % i)
            samples.extend(cls.get_sample(i))
        with open(file_name, 'w') as f:
            f.write(json.dumps(samples))
        return samples

    @classmethod
    def train_wrapper(cls):
        """Split the samples 70/30 per label and report the training size.

        Each label is shuffled and split separately so both sets keep the
        same label ratio. No model is fitted here; pass the returned lists to
        ``cls.train_knn`` to train one.

        Returns:
            ``(train_datas_sets, train_labels_set, test_datas_set,
            test_labels_set)``.
        """
        samples = cls.get_all_sample()
        label0 = [s for s in samples if s[1] == 0]
        label1 = [s for s in samples if s[1] == 1]
        random.shuffle(label0)
        random.shuffle(label1)
        cut0 = int(len(label0) * 0.7)
        cut1 = int(len(label1) * 0.7)
        train_part = label0[:cut0] + label1[:cut1]
        test_part = label0[cut0:] + label1[cut1:]
        train_datas_sets = [s[0] for s in train_part]
        train_labels_set = [s[1] for s in train_part]
        test_datas_set = [s[0] for s in test_part]
        test_labels_set = [s[1] for s in test_part]
        print(len(train_datas_sets))
        return train_datas_sets, train_labels_set, test_datas_set, test_labels_set


if __name__ == '__main__':
    LeaningTest.train_wrapper()
# LeaningTest classmethod; indent one level when placing it in the class body.
@classmethod
def train(cls, train_datas_sets, train_labels_set, test_datas_set, test_labels_set):
    """Fit several scikit-learn estimators and print each one's accuracy.

    Args:
        train_datas_sets: training feature vectors.
        train_labels_set: training labels (0 or 1).
        test_datas_set: held-out feature vectors.
        test_labels_set: held-out labels (0 or 1).

    Classifiers are scored with their own ``score`` method. A regressor's
    ``score`` is R^2 rather than accuracy, so its predictions are thresholded
    at 0.5 and compared with the labels to keep the printed numbers
    comparable.
    """
    from sklearn.base import is_classifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.linear_model import LogisticRegression
    from sklearn.linear_model import LinearRegression
    from sklearn import tree
    from sklearn import svm
    from sklearn.neural_network import MLPClassifier
    from sklearn import neighbors

    candidates = [svm.SVC, LogisticRegression, LinearRegression,
                  tree.DecisionTreeClassifier, neighbors.KNeighborsClassifier,
                  MLPClassifier, GaussianNB]
    for estimator_cls in candidates:
        clf = estimator_cls()
        clf.fit(train_datas_sets, train_labels_set)  # train
        if is_classifier(clf):
            # fraction of correctly predicted test samples
            score = clf.score(test_datas_set, test_labels_set)
        else:
            predicted = clf.predict(test_datas_set)
            hits = sum(1 for guess, label in zip(predicted, test_labels_set)
                       if (guess >= 0.5) == (label == 1))
            score = hits / float(len(test_labels_set))
        print('score %s %s' % (estimator_cls, score))
訓練結果:
score <class 'sklearn.svm.classes.SVC'> 0.7203252032520325 score <class 'sklearn.linear_model.logistic.LogisticRegression'> 0.8886178861788618 score <class 'sklearn.linear_model.base.LinearRegression'> 0.40864632529611417 score <class 'sklearn.tree.tree.DecisionTreeClassifier'> 0.8888888888888888 score <class 'sklearn.neighbors.classification.KNeighborsClassifier'> 0.9224932249322493 score <class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'> 0.835230352303523 score <class 'sklearn.naive_bayes.GaussianNB'> 0.8035230352303523
KNN 的正確率最高(約92%),所以訓練knn模型,並保存爲pickle文件。
# LeaningTest classmethod; indent one level when placing it in the class body.
@classmethod
def train_knn(cls, train_datas_sets, train_labels_set, test_datas_set, test_labels_set):
    """Fit a default k-nearest-neighbours classifier and pickle it.

    The classifier is trained on the training lists, its score on the test
    lists is printed, and the fitted object is written to ``knn.model.pkl``
    through ``pickle_utils.dump``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    knn = KNeighborsClassifier()
    knn.fit(train_datas_sets, train_labels_set)
    # score on the held-out set
    accuracy = knn.score(test_datas_set, test_labels_set)
    print('score %s %s' % (KNeighborsClassifier, accuracy))
    pickle_utils.dump(knn, 'knn.model.pkl')
# LeaningTest classmethods (continued); indent one level when placing them in
# the class body. The trailing __main__ guard stays at module level.
@classmethod
def get_cut_sce(cls, file_path, model):
    """Return the second at which to cut the file, or None if none is found.

    One-second windows are classified from second 60 onwards. The first
    second that completes a run of three consecutive windows predicted as 1
    is returned.

    Args:
        file_path: mp3 or WAV path; mp3 input is converted first.
        model: fitted estimator whose ``predict`` takes a list of count
            vectors.
    """
    file_path = cls.to_wav(file_path)
    data_raw, framerate, n_frames = pre_deal(file_path)
    total_sec = int(n_frames / framerate)
    length = 1
    needed = 3
    streak = 0  # consecutive windows predicted as 1 so far
    for sec in range(60, total_sec, length):
        begin = get_index(framerate, 0, sec)
        end = get_index(framerate, 0, sec + length)
        label = model.predict([cls.sample_cnt(data_raw[begin:end])])[0]
        streak = streak + 1 if label == 1 else 0
        if streak >= needed:
            return sec
    return None


@classmethod
def get_min(cls, sec):
    """Return ``sec`` seconds formatted as ``minutes:seconds``."""
    return '%s:%s' % (int(sec / 60), int(sec % 60))


@classmethod
def predict(cls, ):
    """Print the cut position found for one hard-coded file."""
    file_path = r'D:\BaiduNetdiskDownload\c.mp3'
    model = pickle_utils.load('knn.model.pkl')
    sec = cls.get_cut_sce(file_path, model)
    if sec is None:
        # No cut point was found, so there is nothing to format.
        print('sec %s' % sec)
        return
    print('sec %s %s' % (sec, cls.get_min(sec)))


@classmethod
def cut_song(cls, file_path, to_file_path, file_name):
    """Trim one file at the detected cut point.

    Args:
        file_path: mp3 to trim.
        to_file_path: where the trimmed mp3 is written.
        file_name: original GBK-encoded file name, used for messages only.

    Returns:
        1 if a trimmed file was exported, 0 if no cut point was found.
    """
    print('cut_song %s %s' % (file_name.decode('gbk'), file_path))
    sec = cls.get_cut_sce(file_path, cls.model)
    if sec is None:
        print('error can not find sec %s %s' % (file_path, file_name.decode('gbk')))
        return 0
    song = AudioSegment.from_mp3(file_path)
    song = song[:sec * 1000]  # slice positions are in milliseconds
    song.export(to_file_path, 'mp3', bitrate='64k')
    return 1


@classmethod
def cut_songs(cls, ):
    """Trim every not-yet-trimmed mp3 in one hard-coded folder.

    Each source file is copied to a temporary ASCII-only path because pydub
    does not handle Chinese paths on Windows. The work is done there, the
    result is copied back next to the source as ``<name>.cut.mp3``, and all
    temporary files are moved into ``del_path``.
    """
    root_path = r'D:\BaiduNetdiskDownload\聽世界-戰國5(156集)64kbps'
    del_path = r'D:\BaiduNetdiskDownload\to_del'
    tmp_file_path = r'D:\BaiduNetdiskDownload\test.mp3'
    tmp_wav_path = tmp_file_path.replace('mp3', 'wav')
    tmp_to_file_path = tmp_file_path + '.cut.mp3'
    for f in os.listdir(root_path):
        if 'mp3' not in f or 'cut' in f:
            continue
        file_path = root_path + '\\' + f
        if os.path.exists(file_path + '.cut.mp3'):
            print('exist %s' % (file_path.decode('gbk') + '.cut.mp3'))
            continue
        shutil.copy(file_path, tmp_file_path)
        ret = cls.cut_song(tmp_file_path, tmp_to_file_path, f)
        shutil.move(tmp_file_path, del_path + '\\del1_' + f)
        shutil.move(tmp_wav_path, del_path + '\\del3_' + f)
        if not ret:
            # No cut point was found, so no trimmed file exists to copy back.
            continue
        try:
            shutil.copy(tmp_to_file_path, file_path + '.cut.mp3')
            shutil.move(tmp_to_file_path, del_path + '\\del2_' + f)
        except (IOError, OSError, shutil.Error):
            # Best effort: report the failure and carry on with the next file.
            import traceback
            print(traceback.format_exc())


@classmethod
def test(cls):
    """Try to open an mp3 that lives under a Chinese directory name."""
    AudioSegment.from_mp3(u'D:\BaiduNetdiskDownload\測試\\a.mp3'.encode('gbk'))


if __name__ == '__main__':
    LeaningTest.load_model()
    LeaningTest.cut_songs()