I've spent the past year at a small company that specializes in AI audio, and I'm currently on my way out the door. I recently put together an Android audio demo for a client; it took me a solid day and touched most of the bases, so I decided to push a little further and write up a summary. The company is small, but it collaborates with the Institute of Acoustics of the Chinese Academy of Sciences and, like iFLYTEK, has its own suite of audio engines for speech recognition, speech transcription, and so on. Small as a sparrow, but with all the organs, as the saying goes. Android audio isn't actually that mysterious; the genuinely mysterious parts are handled for us by dedicated C++/algorithm engineers. You know how it is: I just lay the bricks.
- SpeechToText (STT): record audio with AudioRecord and upload it in two ways, as a local file and over a WebSocket.
- TextToSpeech (TTS): fetch the audio stream from an API and play it with AudioTrack.
- Speex encoding (compression)
I won't cover the underlying theory of TTS/STT here. Even after all this time I understand only a little, just a little, of how it's implemented; it involves psychoacoustics, sound waves, Fourier analysis, and a pile of heavy math, so I won't pretend to be an expert. If you're interested, Google is your friend.
AudioRecord involves an IPC round trip: the Java layer calls through JNI into the native AudioRecord, which talks across processes via the IAudioRecord interface to AudioFlinger. AudioFlinger starts the recording thread and fills a shared-memory buffer with audio data captured from the input source, and the application side then copies that data out into its own buffer.
public AudioRecord(int audioSource,   // audio source, here MediaRecorder.AudioSource.MIC
        int sampleRateInHz,           // sample rate, here 8000
        int channelConfig,            // channel configuration, mono here
        int audioFormat,              // 8- or 16-bit PCM; 16-bit here (the quantization width when digitizing the analog signal)
        int bufferSizeInBytes)        // buffer size, derived from sample rate, channels, and sample format
Step two will compare this against the WebSocket upload. First, parameter initialization (audio input = microphone):
public final static int AUDIO_INPUT = MediaRecorder.AudioSource.MIC;
public final static int AUDIO_SAMPLE_RATE = 8000; // 8 kHz; 44.1 kHz is the usual rate for music, but 8 kHz suffices for speech
public final static int CHANNEL_CONFIG = AudioFormat.CHANNEL_IN_MONO;
public final static int AUDIO_FORMAT = AudioFormat.ENCODING_PCM_16BIT;
private int bufferSizeInBytes = 0; // buffer size in bytes
private AudioRecord audioRecord;
private volatile boolean isRecord = false; // volatile for visibility; tracks whether we are recording
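With these parameters the data rate is easy to reason about; a quick back-of-the-envelope check (a sketch, not part of the demo):

// Sizing sketch for 8 kHz / 16-bit / mono PCM:
// 8000 samples/s * 2 bytes/sample * 1 channel = 16,000 bytes per second,
// so a one-minute utterance is just under 1 MB of raw audio.
int bytesPerSecond = AUDIO_SAMPLE_RATE * 2 * 1; // 16,000
int bytesPerMinute = bytesPerSecond * 60;       // 960,000 (~0.92 MiB)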
// Create the AudioRecord
private void createAudioRecord() {
    // Query the minimum buffer size for these parameters
    bufferSizeInBytes = AudioRecord.getMinBufferSize(AudioFileUtils.AUDIO_SAMPLE_RATE,
            AudioFileUtils.CHANNEL_CONFIG, AudioFileUtils.AUDIO_FORMAT);
    // Mono
    audioRecord = new AudioRecord(AudioFileUtils.AUDIO_INPUT, AudioFileUtils.AUDIO_SAMPLE_RATE,
            AudioFileUtils.CHANNEL_CONFIG, AudioFileUtils.AUDIO_FORMAT, bufferSizeInBytes);
}
// Press to talk: start recording on ACTION_DOWN, stop and transcribe on ACTION_UP
@Override
public boolean onTouch(View v, MotionEvent event) {
    AudioRecordUtils utils = AudioRecordUtils.getInstance();
    switch (event.getAction()) {
        case MotionEvent.ACTION_DOWN:
            utils.startRecordAndFile();
            break;
        case MotionEvent.ACTION_UP:
            utils.stopRecordAndFile();
            Log.d(TAG, "stopRecordAndFile");
            stt();
            break;
    }
    return false;
}
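One thing the snippet above takes for granted: the RECORD_AUDIO permission. On Android 6.0+ it must be requested at runtime, otherwise read() returns silence or errors. A minimal pre-flight check might look like this (the request-code constant and method name are mine, not the demo's):

// Hypothetical check to run before utils.startRecordAndFile():
private static final int REQ_RECORD_AUDIO = 1; // arbitrary request code

private void ensureAudioPermission() {
    if (ContextCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
            != PackageManager.PERMISSION_GRANTED) {
        ActivityCompat.requestPermissions(this,
                new String[]{Manifest.permission.RECORD_AUDIO}, REQ_RECORD_AUDIO);
    }
}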
// Start recording
public int startRecordAndFile() {
    Log.d("NLPService", "startRecordAndFile");
    // Check that external storage (the sdcard) is available
    if (AudioFileUtils.isSdcardExit()) {
        if (isRecord) {
            return ErrorCode.E_STATE_RECODING;
        } else {
            if (audioRecord == null) {
                createAudioRecord();
            }
            audioRecord.startRecording();
            // Mark that we are recording
            isRecord = true;
            // Spin up the thread that writes the audio to file
            new Thread(new AudioRecordThread()).start();
            return ErrorCode.SUCCESS;
        }
    } else {
        return ErrorCode.E_NOSDCARD;
    }
}
// Recording thread
class AudioRecordThread implements Runnable {
    @Override
    public void run() {
        writeDataToFile(); // write the raw PCM to file
        AudioFileUtils.raw2Wav(mAudioRaw, mAudioWav, bufferSizeInBytes); // prepend a WAV header to the raw data
    }
}
// Write raw PCM data to file
private void writeDataToFile() {
    Log.d("NLPService", "writeDataToFile");
    // Byte array holding one buffer's worth of audio
    byte[] audiodata = new byte[bufferSizeInBytes];
    FileOutputStream fos = null;
    int readsize = 0;
    try {
        File file = new File(mAudioRaw);
        if (file.exists()) {
            file.delete();
        }
        fos = new FileOutputStream(file); // open a byte output stream to the file
    } catch (Exception e) {
        e.printStackTrace();
    }
    while (isRecord) {
        readsize = audioRecord.read(audiodata, 0, bufferSizeInBytes);
        if (AudioRecord.ERROR_INVALID_OPERATION != readsize && fos != null) {
            try {
                // Write only the bytes actually read, not the whole buffer
                fos.write(audiodata, 0, readsize);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    try {
        if (fos != null)
            fos.close(); // close the output stream
    } catch (IOException e) {
        e.printStackTrace();
    }
}
// Add a WAV header to the raw PCM
public static void raw2Wav(String inFilename, String outFilename, int bufferSizeInBytes) {
    Log.d("NLPService", "raw2Wav");
    FileInputStream in = null;
    RandomAccessFile out = null;
    byte[] data = new byte[bufferSizeInBytes];
    try {
        in = new FileInputStream(inFilename);
        out = new RandomAccessFile(outFilename, "rw");
        out.setLength(0); // truncate any previous recording ("rw" does not truncate)
        // Reserve the 44-byte header up front so the PCM lands at offset 44
        fixWavHeader(out, AUDIO_SAMPLE_RATE, 1, AudioFormat.ENCODING_PCM_16BIT);
        int n;
        while ((n = in.read(data)) != -1) {
            out.write(data, 0, n); // write only the bytes actually read
        }
        // Patch the header again now that the final file length is known
        fixWavHeader(out, AUDIO_SAMPLE_RATE, 1, AudioFormat.ENCODING_PCM_16BIT);
        in.close();
        out.close();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
private static void fixWavHeader(RandomAccessFile file, int rate, int channels, int format) {
    try {
        int blockAlign;
        if (format == AudioFormat.ENCODING_PCM_16BIT)
            blockAlign = channels * 2;
        else
            blockAlign = channels;
        int bitsPerSample;
        if (format == AudioFormat.ENCODING_PCM_16BIT)
            bitsPerSample = 16;
        else
            bitsPerSample = 8;
        long dataLen = file.length() - 44;
        // Hard-coded canonical 44-byte WAV header
        byte[] header = new byte[44];
        header[0] = 'R'; // RIFF/WAVE header
        header[1] = 'I';
        header[2] = 'F';
        header[3] = 'F';
        header[4] = (byte) ((dataLen + 36) & 0xff); // RIFF chunk size = data + 36
        header[5] = (byte) (((dataLen + 36) >> 8) & 0xff);
        header[6] = (byte) (((dataLen + 36) >> 16) & 0xff);
        header[7] = (byte) (((dataLen + 36) >> 24) & 0xff);
        header[8] = 'W';
        header[9] = 'A';
        header[10] = 'V';
        header[11] = 'E';
        header[12] = 'f'; // 'fmt ' chunk
        header[13] = 'm';
        header[14] = 't';
        header[15] = ' ';
        header[16] = 16; // 4 bytes: size of the 'fmt ' chunk
        header[17] = 0;
        header[18] = 0;
        header[19] = 0;
        header[20] = 1; // audio format = 1 (PCM)
        header[21] = 0;
        header[22] = (byte) channels;
        header[23] = 0;
        header[24] = (byte) (rate & 0xff);
        header[25] = (byte) ((rate >> 8) & 0xff);
        header[26] = (byte) ((rate >> 16) & 0xff);
        header[27] = (byte) ((rate >> 24) & 0xff);
        header[28] = (byte) ((rate * blockAlign) & 0xff); // byte rate
        header[29] = (byte) (((rate * blockAlign) >> 8) & 0xff);
        header[30] = (byte) (((rate * blockAlign) >> 16) & 0xff);
        header[31] = (byte) (((rate * blockAlign) >> 24) & 0xff);
        header[32] = (byte) (blockAlign); // block align
        header[33] = 0;
        header[34] = (byte) bitsPerSample; // bits per sample
        header[35] = 0;
        header[36] = 'd';
        header[37] = 'a';
        header[38] = 't';
        header[39] = 'a';
        header[40] = (byte) (dataLen & 0xff); // data chunk size
        header[41] = (byte) ((dataLen >> 8) & 0xff);
        header[42] = (byte) ((dataLen >> 16) & 0xff);
        header[43] = (byte) ((dataLen >> 24) & 0xff);
        file.seek(0);
        file.write(header, 0, 44);
    } catch (Exception e) {
        e.printStackTrace(); // don't swallow I/O errors silently
    }
}
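To sanity-check the header the code just wrote, it helps to read the 44 bytes back and decode the little-endian fields. This throwaway helper (the name is mine, not part of the demo) prints the three that matter; for this demo it should log channels=1, rate=8000, byteRate=16000:

// Hypothetical helper: dump the key fields of a freshly written WAV header.
static void dumpWavHeader(String path) throws IOException {
    byte[] h = new byte[44];
    FileInputStream in = new FileInputStream(path);
    try {
        if (in.read(h) != 44) throw new IOException("short header");
    } finally {
        in.close();
    }
    int channels = (h[22] & 0xff) | ((h[23] & 0xff) << 8);
    int rate     = (h[24] & 0xff) | ((h[25] & 0xff) << 8)
                 | ((h[26] & 0xff) << 16) | ((h[27] & 0xff) << 24);
    int byteRate = (h[28] & 0xff) | ((h[29] & 0xff) << 8)
                 | ((h[30] & 0xff) << 16) | ((h[31] & 0xff) << 24);
    // byteRate should equal rate * blockAlign, i.e. 8000 * 2 = 16000 here.
    Log.d("NLPService", "channels=" + channels + " rate=" + rate + " byteRate=" + byteRate);
}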
// Upload the file; the transcription result arrives in the callback
public void stt() {
    File voiceFile = new File(AudioFileUtils.getWavFilePath());
    if (!voiceFile.exists()) {
        return;
    }
    RequestBody requestBody = RequestBody.create(MediaType.parse("multipart/form-data"), voiceFile);
    MultipartBody.Part file =
            MultipartBody.Part.createFormData("file", voiceFile.getName(), requestBody);
    NetRequest.sAPIClient.stt(RequestBodyUtil.getParams(), file)
            .observeOn(AndroidSchedulers.mainThread())
            .subscribe(new Action1<STT>() {
                @Override
                public void call(STT result) {
                    if (result != null && result.getCount() > 0) {
                        sttTv.setText("Result: " + result.getSegments().get(0).getContent());
                    }
                }
            });
}
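For context, the NetRequest.sAPIClient.stt call implies a Retrofit service along these lines. The annotations and parameter types below are my guess from the call site (RxJava 1 adapter, judging by the Action1 subscriber), not the demo's actual definition:

// Hypothetical Retrofit interface matching the stt() call above.
public interface SttApi {
    @Multipart
    @POST("api/stt")
    Observable<STT> stt(@PartMap Map<String, RequestBody> params,
                        @Part MultipartBody.Part file);
}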
// Remember to shut down the AudioRecord
private void stopRecordAndFile() {
    if (audioRecord != null) {
        isRecord = false; // stop the file-writing loop
        audioRecord.stop();
        audioRecord.release(); // release the native resources
        audioRecord = null;
    }
}
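One caveat with this teardown: the writer thread may still be inside audioRecord.read() when release() runs. A slightly safer variant (the recordThread field and method name are my additions) joins the thread before releasing:

// Safer teardown sketch: let the writer loop observe isRecord == false and
// finish its current read() before the AudioRecord is released.
private Thread recordThread; // assumption: assigned where the thread is started

private void stopRecordAndFileSafely() {
    isRecord = false; // the writer loop exits after its current read()
    if (recordThread != null) {
        try {
            recordThread.join(); // wait until the file is fully written
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
        recordThread = null;
    }
    if (audioRecord != null) {
        audioRecord.stop();
        audioRecord.release();
        audioRecord = null;
    }
}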
WebSocket介紹: 我只記住一點點:它是應用層協議 ,就像http 也是,不過它是一種全雙工通訊, socket 只是TCP/IP 的封裝,不算協議。websocket 第一次須要以http 接口創建長鏈接,就這麼點了。後端
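Since the handshake itself is OkHttp's job, the one client-side knob worth mentioning here is a ping interval, which keeps the long-lived connection alive and surfaces dead peers; a sketch (the interval value is an arbitrary choice of mine):

// Keep-alive sketch: OkHttp performs the HTTP Upgrade handshake internally;
// pingInterval() adds periodic WebSocket pings on top of the connection.
OkHttpClient client = new OkHttpClient.Builder()
        .pingInterval(15, TimeUnit.SECONDS) // assumption: tune for your network
        .build();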
// MyWebSocketListener: the WebSocket callback API
class MyWebSocketListener extends WebSocketListener {
    @Override
    public void onOpen(WebSocket webSocket, Response response) {
        output("onOpen: " + "webSocket connect success");
        STTWebSocketActivity.this.webSocket = webSocket;
        startRecordAndFile();
        // Note: recording is deliberately started here, inside the callback.
        // Splitting the two apart makes the control flow messy, and on a second
        // recording the server-side WebSocket may already be closed, so the
        // audio would not get through and the connection must be re-established.
    }

    @Override
    public void onMessage(WebSocket webSocket, final String text) {
        runOnUiThread(new Runnable() {
            @Override
            public void run() {
                sttTv.setText("Stt result:" + text);
            }
        });
        output("onMessage1: " + text);
    }

    @Override
    public void onMessage(WebSocket webSocket, ByteString bytes) {
        output("onMessage2 byteString: " + bytes);
    }

    @Override
    public void onClosing(WebSocket webSocket, int code, String reason) {
        output("onClosing: " + code + "/" + reason);
    }

    @Override
    public void onClosed(WebSocket webSocket, int code, String reason) {
        output("onClosed: " + code + "/" + reason);
    }

    @Override
    public void onFailure(WebSocket webSocket, Throwable t, Response response) {
        output("onFailure: " + t.getMessage());
    }

    private void output(String s) {
        Log.d("NLPService", s);
    }
}
Note: the AudioRecord setup is the same as before.
// Create the WebSocket with OkHttp and attach the listener
private void createWebSocket() {
    Request request = new Request.Builder().url(sttApi).build();
    NetRequest.getOkHttpClient().newWebSocket(request, socketListener);
}
class AudioRecordThread implements Runnable {
    @Override
    public void run() {
        // Direct ByteBuffer: a contiguous block of memory laid out like a
        // primitive array; little-endian to match the PCM byte order
        ByteBuffer audioBuffer = ByteBuffer.allocateDirect(bufferSizeInBytes).order(ByteOrder.LITTLE_ENDIAN);
        int readSize = 0;
        Log.d(TAG, "isRecord=" + isRecord);
        while (isRecord) {
            readSize = audioRecord.read(audioBuffer, audioBuffer.capacity());
            if (readSize == AudioRecord.ERROR_INVALID_OPERATION || readSize == AudioRecord.ERROR_BAD_VALUE) {
                Log.d("NLPService", "Could not read audio data.");
                break;
            }
            // Only readSize bytes are valid; cap the limit so stale bytes aren't sent
            audioBuffer.limit(readSize);
            boolean send = webSocket.send(ByteString.of(audioBuffer)); // that simple
            Log.d("NLPService", "send=" + send);
            audioBuffer.clear(); // reset position/limit for the next read
        }
        webSocket.send("close"); // agreed-upon sentinel telling the server to close
    }
}
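Worth knowing: webSocket.send() only enqueues the frame and returns immediately (false once the socket is closing), so on a slow link the outgoing queue can grow unbounded. A crude backpressure sketch using OkHttp's queueSize() (the threshold is my own number):

// Cap the bytes buffered inside OkHttp before reading more PCM from the mic.
private static final long MAX_QUEUED_BYTES = 64 * 1024; // assumption: tune as needed

private void awaitDrain(WebSocket ws) throws InterruptedException {
    while (ws.queueSize() > MAX_QUEUED_BYTES) {
        Thread.sleep(10); // a real implementation might drop or batch frames instead
    }
}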
...And now the old hands will pipe up: there's no encryption here, and it's inefficient! Let me state one thing: this is a transcription engine, each request is a single utterance, and the amount of data transferred is small, so the backend gurus said encryption wasn't necessary, and I did as I was told... Of course, you could also encode and transmit at the same time.
TTS is comparatively simple: call the API with OkHttp, pass the text, take the response bytes, and play them with AudioTrack. The response here is a raw audio stream; MediaPlayer would be overkill (I haven't tried it), and note that MediaPlayer playback is also an IPC affair that ultimately drives AudioTrack under the hood anyway. Straight to the code:
public boolean request() {
    OkHttpClient client = NetRequest.getOkHttpClient();
    Request request = new Request.Builder().url(NetRequest.BASE_URL + "api/tts?text=今天是星期三").build();
    client.newCall(request).enqueue(new Callback() {
        @Override
        public void onFailure(Call call, IOException e) {
            Log.e(TAG, "tts request failed", e); // don't swallow the failure
        }

        @Override
        public void onResponse(Call call, Response response) throws IOException {
            play(response.body().bytes());
        }
    });
    return true;
}

public void play(byte[] data) {
    try {
        Log.d(TAG, "audioTrack start ");
        AudioTrack audioTrack = new AudioTrack(mOutput, mSamplingRate,
                AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT,
                data.length, AudioTrack.MODE_STATIC);
        audioTrack.write(data, 0, data.length);
        audioTrack.play();
        // data.length / 2 converts bytes to frames for 16-bit mono PCM
        while (audioTrack.getPlaybackHeadPosition() < (data.length / 2)) {
            Thread.yield(); // crude wait for playback to finish
        }
        audioTrack.stop();
        audioTrack.release();
    } catch (IllegalArgumentException e) {
        e.printStackTrace();
    } catch (IllegalStateException e) {
        e.printStackTrace();
    }
}
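MODE_STATIC needs the whole clip in memory before play(), which is fine for one short sentence. For longer responses, a MODE_STREAM variant writes chunks after playback starts; here's a minimal sketch under the same 8 kHz / mono / 16-bit assumptions (the constants are mine, whereas the demo uses mOutput/mSamplingRate):

// Streaming playback sketch: in MODE_STREAM, write() blocks until AudioTrack
// accepts the data, so the loop naturally paces itself.
public void playStreaming(InputStream pcm) throws IOException {
    int minBuf = AudioTrack.getMinBufferSize(8000,
            AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT);
    AudioTrack track = new AudioTrack(AudioManager.STREAM_MUSIC, 8000,
            AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT,
            minBuf, AudioTrack.MODE_STREAM);
    track.play(); // in MODE_STREAM, play() may precede the first write()
    byte[] chunk = new byte[minBuf];
    int n;
    while ((n = pcm.read(chunk)) != -1) {
        track.write(chunk, 0, n);
    }
    track.stop();
    track.release();
}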
Speex is a free, open-source audio codec written in C (what this post calls "encryption" is really compression/encoding). The demo ships a prebuilt .so; I personally spent ages trying to compile it myself, hit every pothole, and never succeeded, so I borrowed one. -_-|| The speexDemo project below is included in the repo; encoding and decoding both work, personally tested. I picked this part up from CSDN back when I was learning it, and it's carried over here to round things out.
public static void raw2spx(String inFileName, String outFileName) {
    FileInputStream rawFileInputStream = null;
    FileOutputStream fileOutputStream = null;
    try {
        rawFileInputStream = new FileInputStream(inFileName);
        fileOutputStream = new FileOutputStream(outFileName);
        byte[] rawbyte = new byte[320];
        byte[] encoded = new byte[160];
        // Compress the raw data to spx. Speex encodes one 160-sample frame
        // (320 bytes of 16-bit PCM) at a time, hence the loop.
        int readedtotal = 0;
        int size = 0;
        int encodedtotal = 0;
        while ((size = rawFileInputStream.read(rawbyte, 0, 320)) != -1) {
            readedtotal = readedtotal + size;
            short[] rawdata = ShortByteUtil.byteArray2ShortArray(rawbyte);
            int encodesize = SpeexUtil.getInstance().encode(rawdata, 0, encoded, rawdata.length);
            fileOutputStream.write(encoded, 0, encodesize);
            encodedtotal = encodedtotal + encodesize;
        }
        fileOutputStream.close();
        rawFileInputStream.close();
    } catch (Exception e) {
        e.printStackTrace(); // log instead of swallowing
    }
}
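Each 320-byte frame is 20 ms of 8 kHz audio, and Speex narrowband squeezes it to a few tens of bytes depending on the quality setting. Comparing the input and output sizes after a run shows the ratio; a trivial check (the file-path variables are placeholders, not demo names):

// Rough compression check: compare the raw PCM size with the .spx output.
long rawBytes = new File(rawPath).length();
long spxBytes = new File(spxPath).length();
Log.d("NLPService", "speex ratio = " + String.format("%.1f : 1", (double) rawBytes / spxBytes));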
Date:2018/10/17 Author:weimin