年前,微信開源了Matrix項目,提供了Android、iOS的APM實現方案。對於Android端實現,主要包括APK Checker、Resource Canary、Trace Canary、SQLite Lint、IO Canary五部分。本文主要介紹IO Canary的源碼實現,其餘部分的源碼分析將在後續推出。
IOCanary大致上從Java Hook、Native Hook兩個角度來檢測應用的IO行爲,並根據不同的策略細化了IO Issue的種類。
Java Hook的hook點是系統類CloseGuard,hook的方式是使用動態代理。
/**
 * Replaces the static {@code REPORTER} of {@code dalvik.system.CloseGuard} with a dynamic
 * proxy backed by {@link IOCloseLeakDetector}, and switches CloseGuard (and MatrixCloseGuard) on.
 *
 * <p>All steps that can fail (class/field lookup, class-loader check, proxy creation) run
 * before any global flag is modified, so a failed attempt does not leave CloseGuard enabled
 * without the detector installed.
 *
 * @return {@code true} if the proxy reporter was installed; {@code false} if the reporter
 *         interface has no class loader or any reflective step threw.
 */
private boolean tryHook() {
    try {
        Class<?> closeGuardCls = Class.forName("dalvik.system.CloseGuard");
        Class<?> closeGuardReporterCls = Class.forName("dalvik.system.CloseGuard$Reporter");

        Field fieldREPORTER = closeGuardCls.getDeclaredField("REPORTER");
        Field fieldENABLED = closeGuardCls.getDeclaredField("ENABLED");
        fieldREPORTER.setAccessible(true);
        fieldENABLED.setAccessible(true);

        // Remember the reporter currently installed; the detector receives it as a delegate.
        sOriginalReporter = fieldREPORTER.get(null);

        // A proxy cannot be created without a class loader for the Reporter interface.
        ClassLoader classLoader = closeGuardReporterCls.getClassLoader();
        if (classLoader == null) {
            return false;
        }

        // Build the proxy before touching ENABLED so that a failure here has no side effects.
        Object proxyReporter = Proxy.newProxyInstance(classLoader,
                new Class<?>[]{closeGuardReporterCls},
                new IOCloseLeakDetector(issueListener, sOriginalReporter));

        fieldENABLED.set(null, true);
        // open matrix close guard also
        MatrixCloseGuard.setEnabled(true);

        fieldREPORTER.set(null, proxyReporter);
        fieldREPORTER.setAccessible(false);
        return true;
    } catch (Throwable e) {
        MatrixLog.e(TAG, "tryHook exp=%s", e);
    }
    return false;
}
複製代碼
系統CloseGuard的實現原理是在一些資源類中預埋一些代碼,從而使CloseGuard感知到資源是否被正常關閉。例如系統類FileOutputStream中有如下代碼:
// Excerpt from the platform FileOutputStream class; each "..." line marks code omitted by the article.
// One CloseGuard per stream instance, obtained through CloseGuard.get().
private final CloseGuard guard = CloseGuard.get();
...
public FileOutputStream(File file, boolean append) throws FileNotFoundException {
...
// Marks the guard as open at construction time.
// NOTE(review): the "close" argument is presumably the name of the method that releases
// the resource — confirm against the CloseGuard documentation.
guard.open("close");
}
...
public void close() throws IOException {
...
// Marks the guard as closed once close() is invoked.
guard.close();
...
}
...
// Finalizer: asks the guard to warn if the stream is still open, then flushes or closes the descriptor.
protected void finalize() throws IOException {
// Android-added: CloseGuard support.
if (guard != null) {
// This is the call that ends up in the CloseGuard reporter when close() was never invoked.
guard.warnIfOpen();
}
if (fd != null) {
// The standard out/err descriptors are only flushed, never closed, by the finalizer.
if (fd == FileDescriptor.out || fd == FileDescriptor.err) {
flush();
} else {
// Android-removed: Obsoleted comment about shared FileDescriptor handling.
close();
}
}
}
複製代碼
可以看到,如果在finalize被調用之前未調用close方法,就會走到CloseGuard的warnIfOpen方法,從而檢測到此次資源未正常關閉的行爲。
當然,應用也有一些自定義的資源類,對於這種情況Matrix建議使用MatrixCloseGuard這個類模擬系統埋點的方式,達到資源監控的目的。
Native Hook是採用PLT(GOT) Hook的方式hook了系統so中IO相關的open、read、write、close方法。在代理了這些系統方法後,Matrix做了一些邏輯上的細分,從而檢測出不同的IO Issue。
/**
 * JNI entry point that installs the IO proxies into every module listed in TARGET_MODULES.
 *
 * For each module that can be opened, `open`, `open64` and `close` are replaced on a
 * best-effort basis (their results are not checked). For libjavacore.so, `read` and `write`
 * are additionally replaced, falling back to `__read_chk` / `__write_chk` when the plain
 * symbol cannot be hooked; if the fallback also fails the whole call fails.
 *
 * The soinfo handle is released on every exit path, including the failure returns.
 *
 * NOTE(review): bionic's `__read_chk` / `__write_chk` take an extra buffer-size argument
 * compared to `read` / `write`; confirm that routing them through the three-argument
 * proxies is safe.
 *
 * @return true when all modules were processed; false when hooking read/write in
 *         libjavacore.so failed.
 */
JNIEXPORT jboolean JNICALL Java_com_tencent_matrix_iocanary_core_IOCanaryJniBridge_doHook(JNIEnv *env, jclass type) {
    __android_log_print(ANDROID_LOG_INFO, kTag, "doHook");

    for (int i = 0; i < TARGET_MODULE_COUNT; ++i) {
        const char* so_name = TARGET_MODULES[i];
        __android_log_print(ANDROID_LOG_INFO, kTag, "try to hook function in %s.", so_name);

        loaded_soinfo* soinfo = elfhook_open(so_name);
        if (!soinfo) {
            __android_log_print(ANDROID_LOG_WARN, kTag, "Failure to open %s, try next.", so_name);
            continue;
        }

        elfhook_replace(soinfo, "open", (void*)ProxyOpen, (void**)&original_open);
        elfhook_replace(soinfo, "open64", (void*)ProxyOpen64, (void**)&original_open64);

        bool is_libjavacore = (strstr(so_name, "libjavacore.so") != nullptr);
        if (is_libjavacore) {
            if (!elfhook_replace(soinfo, "read", (void*)ProxyRead, (void**)&original_read)) {
                __android_log_print(ANDROID_LOG_WARN, kTag, "doHook hook read failed, try __read_chk");
                if (!elfhook_replace(soinfo, "__read_chk", (void*)ProxyRead, (void**)&original_read)) {
                    __android_log_print(ANDROID_LOG_WARN, kTag, "doHook hook failed: __read_chk");
                    // Release the handle before bailing out; previously it was leaked here.
                    elfhook_close(soinfo);
                    return false;
                }
            }

            if (!elfhook_replace(soinfo, "write", (void*)ProxyWrite, (void**)&original_write)) {
                __android_log_print(ANDROID_LOG_WARN, kTag, "doHook hook write failed, try __write_chk");
                if (!elfhook_replace(soinfo, "__write_chk", (void*)ProxyWrite, (void**)&original_write)) {
                    __android_log_print(ANDROID_LOG_WARN, kTag, "doHook hook failed: __write_chk");
                    // Release the handle before bailing out; previously it was leaked here.
                    elfhook_close(soinfo);
                    return false;
                }
            }
        }

        elfhook_replace(soinfo, "close", (void*)ProxyClose, (void**)&original_close);

        elfhook_close(soinfo);
    }

    return true;
}
複製代碼
hook住系統調用之後,接下來再看看代理方法的實現:
int ProxyOpen64(const char *pathname, int flags, mode_t mode) {
if(!IsMainThread()) {
return original_open64(pathname, flags, mode);
}
int ret = original_open64(pathname, flags, mode);
if (ret != -1) {
DoProxyOpenLogic(pathname, flags, mode, ret);
}
return ret;
}
/**
 * Proxy for read: callback to the java layer.
 *
 * Off the main thread the call is forwarded untouched. On the main thread the original
 * read is timed with GetTickCountMicros() and the outcome is reported to
 * IOCanary::OnRead together with the elapsed time in microseconds.
 *
 * The result is kept as ssize_t so that an error return of -1 is reported as -1 rather
 * than being converted to a huge unsigned value.
 *
 * @return whatever the original read returned.
 */
ssize_t ProxyRead(int fd, void *buf, size_t size) {
    if (!IsMainThread()) {
        return original_read(fd, buf, size);
    }

    const int64_t start = GetTickCountMicros();
    const ssize_t ret = original_read(fd, buf, size);
    const long read_cost_us = static_cast<long>(GetTickCountMicros() - start);

    iocanary::IOCanary::Get().OnRead(fd, buf, size, ret, read_cost_us);
    return ret;
}
/**
 * Proxy for write: callback to the java layer.
 *
 * Off the main thread the call is forwarded untouched. On the main thread the original
 * write is timed with GetTickCountMicros() and the outcome is reported to
 * IOCanary::OnWrite together with the elapsed time in microseconds.
 *
 * The result is kept as ssize_t so that an error return of -1 is reported as -1 rather
 * than being converted to a huge unsigned value.
 *
 * @return whatever the original write returned.
 */
ssize_t ProxyWrite(int fd, const void *buf, size_t size) {
    if (!IsMainThread()) {
        return original_write(fd, buf, size);
    }

    const int64_t start = GetTickCountMicros();
    const ssize_t ret = original_write(fd, buf, size);
    const long write_cost_us = static_cast<long>(GetTickCountMicros() - start);

    iocanary::IOCanary::Get().OnWrite(fd, buf, size, ret, write_cost_us);
    return ret;
}
/** * Proxy for close: callback to the java layer */
int ProxyClose(int fd) {
if(!IsMainThread()) {
return original_close(fd);
}
int ret = original_close(fd);
//__android_log_print(ANDROID_LOG_DEBUG, kTag, "ProxyClose fd:%d ret:%d", fd, ret);
iocanary::IOCanary::Get().OnClose(fd, ret);
return ret;
}
複製代碼
仔細閱讀代理方法的代碼,發現所有的代理方法在非主線程都是直接執行原方法(沒有添加IO檢測的相關邏輯)。這部分Matrix官方承認,由於多線程併發的問題,暫時只支持單線程模型。由於限制了只對主線程進行檢測,整體IO檢測方案的實際應用場景變得很受限,希望Matrix後續可以優化。
/**
 * Flags file IO performed on the main thread whose longest continual read/write time
 * exceeds one or both thresholds.
 *
 * The severity is encoded as bit flags and stored in the issue's repeat_read_cnt_ field
 * (that field is reused to carry the type):
 *   bit 0 (1): cost > IOCanaryEnv::kPossibleNegativeThreshold
 *   bit 1 (2): cost > env.GetMainThreadThreshold()
 * Nothing is published for IO on other threads or when neither threshold is exceeded.
 */
void FileIOMainThreadDetector::Detect(const IOCanaryEnv &env, const IOInfo &file_io_info,
                                      std::vector<Issue>& issues) {
    if (GetMainThreadId() != file_io_info.java_context_.thread_id_) {
        return;
    }

    const auto& continual_cost = file_io_info.max_continual_rw_cost_time_μs_;
    int severity_flags = 0;

    // Main-thread IO that may cause jank (default 13ms according to the original note).
    if (continual_cost > IOCanaryEnv::kPossibleNegativeThreshold) {
        severity_flags |= 1;
    }

    // Main-thread IO causing a serious performance problem (default 500ms according to the original note).
    if (continual_cost > env.GetMainThreadThreshold()) {
        severity_flags |= 2;
    }

    if (severity_flags == 0) {
        return;
    }

    Issue issue(kType, file_io_info);
    issue.repeat_read_cnt_ = severity_flags;  // use repeat to record type
    PublishIssue(issue, issues);
}
複製代碼
/**
 * Publishes an issue when a file was accessed with many small operations.
 *
 * All three conditions must hold:
 *   - the operation count exceeds env.kSmallBufferOpTimesThreshold;
 *   - the average bytes per operation (op_size_ / op_cnt_) is below env.GetSmallBufferThreshold();
 *   - the longest continual read/write time is at least env.kPossibleNegativeThreshold.
 */
void FileIOSmallBufferDetector::Detect(const IOCanaryEnv &env, const IOInfo &file_io_info,
                                       std::vector<Issue>& issues) {
    // Too few operations to be worth reporting; this check also keeps the division below
    // from running unless op_cnt_ is above the threshold.
    if (file_io_info.op_cnt_ <= env.kSmallBufferOpTimesThreshold) {
        return;
    }

    // Average bytes per operation is not below the small-buffer threshold.
    if ((file_io_info.op_size_ / file_io_info.op_cnt_) >= env.GetSmallBufferThreshold()) {
        return;
    }

    // The IO was too quick to matter.
    if (file_io_info.max_continual_rw_cost_time_μs_ < env.kPossibleNegativeThreshold) {
        return;
    }

    PublishIssue(Issue(kType, file_io_info), issues);
}
複製代碼
void FileIORepeatReadDetector::Detect(const IOCanaryEnv &env,
const IOInfo &file_io_info,
std::vector<Issue>& issues) {
const std::string& path = file_io_info.path_;
if (observing_map_.find(path) == observing_map_.end()) {
if (file_io_info.max_continual_rw_cost_time_μs_ < env.kPossibleNegativeThreshold) {
return;
}
observing_map_.insert(std::make_pair(path, std::vector<RepeatReadInfo>()));
}
std::vector<RepeatReadInfo>& repeat_infos = observing_map_[path];
//有write行爲,清空repeat_info
if (file_io_info.op_type_ == FileOpType::kWrite) {
repeat_infos.clear();
return;
}
RepeatReadInfo repeat_read_info(file_io_info.path_, file_io_info.java_context_.stack_, file_io_info.java_context_.thread_id_, file_io_info.op_size_, file_io_info.file_size_);
if (repeat_infos.size() == 0) {
repeat_infos.push_back(repeat_read_info);
return;
}
//read操做間隔17ms,清空repeat_info
if((GetTickCount() - repeat_infos[repeat_infos.size() - 1].op_timems) > 17) { //17ms todo astrozhou add to params
repeat_infos.clear();
}
bool found = false;
int repeatCnt;
for (auto& info : repeat_infos) {
if (info == repeat_read_info) {
found = true;
info.IncRepeatReadCount();
repeatCnt = info.GetRepeatReadCount();
break;
}
}
if (!found) {
repeat_infos.push_back(repeat_read_info);
return;
}
//重複read次數達到閾值,上報IO Issue
if (repeatCnt >= env.GetRepeatReadThreshold()) {
Issue issue(kType, file_io_info);
issue.repeat_read_cnt_ = repeatCnt;
issue.stack = repeat_read_info.GetStack();
PublishIssue(issue, issues);
}
}
複製代碼
Native Hook大致上可以分爲PLT(GOT) Hook、ART Hook(基於ART虛擬機)、Dalvik Hook(基於Dalvik虛擬機)、inline Hook這幾類Hook手段。相關文章可以詳見《Android Native Hook技術路線概述》。PLT(GOT) Hook是基於so(實際是一個ELF格式的文件)的GOT跳轉表實現的。ELF文件格式的詳細說明可以參見文章。對於PLT(GOT) Hook,需要關注的是ELF文件鏈接視圖下名爲.plt和.got的Section。
plt Section說明:
got Section說明:
PLT(GOT) HOOK的原理從Android Native Hook技術路線概述摘錄以下:
先來介紹一下Android PLT Hook的基本原理。Linux在執行動態鏈接的ELF的時候,爲了優化性能使用了一個叫延時綁定的策略。相關資料有很多,這邊簡述一下:這個策略是爲了解決本來靜態編譯時要把各種系統API的具體實現代碼都編譯進當前ELF文件裏導致文件巨大臃腫的問題。所以當在動態鏈接的ELF程序裏調用共享庫的函數時,第一次調用時先去查找PLT表中相應的項目,而PLT表中再跳躍到GOT表中希望得到該函數的實際地址,但這時GOT表中指向的是PLT中那條跳躍指令下面的代碼,最終會執行_dl_runtime_resolve()並執行目標函數。第二次調用時也是PLT跳轉到GOT表,但是GOT中對應項目已經在第一次_dl_runtime_resolve()中被修改爲函數實際地址,因此第二次及以後的調用直接就去執行目標函數,不用再去執行_dl_runtime_resolve()了。因此,PLT Hook通過直接修改GOT表,使得在調用該共享庫的函數時跳轉到的是用戶自定義的Hook功能代碼。
解析須要hook的so文件,封裝一個loaded_soinfo對象。
查找GOT表中是否有對應的方法聲明。
locate_symbol內部調用locate_symbol_hash。
備選方案,locate_symbol_hash失敗後會走到這個方法。
實際替換對應的函數地址。
源碼地址位於android / platform / bionic / froyo / . / linker / linker.c
連接so文件(elf文件格式)
so文件(ELF文件)中的Section包括三種狀態:
Android加載so的過程,暫時未完全弄懂,待後續完善~~
技術特色:
應用場景:
Matrix IO檢測的代碼邏輯相對簡單。難點在於so(ELF文件)文件格式的理解,以及PLT(GOT) Hook的實現原理。