趣探 Mach-O:FishHook 解析

這是 Mach-O 系列的第三篇

閱讀 FishHook 源碼之前,你可能需要對以下知識有個簡單的瞭解

本文的闡述順序按照函數調用過程來進行

Fishhook 能夠做什麼

在此借用阿里百川的一張分析圖,能夠比較清晰地瞭解 FishHook 發揮了哪些作用

阿里百川
阿里百川

FishHook 在這裏是對動態鏈接庫起作用,修改對應的函數實現

對於動態鏈接庫裏面的 C 函數,第一次調用的時候,我們會得到函數和實現地址的對應關係,函數的實現地址存放在一個叫 la_symbol_ptr 的地方;第二次調用的時候,直接通過 la_symbol_ptr 找到函數地址就可以,不再需要繁瑣的獲取函數地址的過程。(具體通過哪些過程,可以參考剛才的鏈接:Mach-O 的動態鏈接過程)

那麼,上圖的含義就很明瞭了

在程序運行時,動態鏈接的 C 函數 dynamic(...) 的地址記錄在 DATA segment 下的 __la_symbol_ptr 中;初始時,程序只知道 dynamic 函數的符號名而不知道函數的實現地址;首次調用時,程序通過 TEXT segment 中的 stub_helper 取得綁定信息,通過 dyld_stub_binder 來更新 __la_symbol_ptr 中的符號實現地址;這樣,再次調用時,就可以通過 __la_symbol_ptr 直接找到 dynamic 函數的實現;如果我們需要替換 dynamic 函數的實現,只需要修改 __la_symbol_ptr 即可,也就是我們要談的 Fishhook

Fishhook 的實現

通過 fishhook 的官方文檔可以知道,Fishhook 的使用方法大致如下:

/* Receives the address of the original open() implementation. main() passes
 * this variable's address as the third field of the rebinding. */
static int (*original_open)(const char *, int, ...);

/*
 * Replacement for open(): logs the call and forwards it to original_open.
 *
 * path  - file path, forwarded unchanged.
 * oflag - open flags, forwarded unchanged.
 * ...   - optional mode; only read when O_CREAT is set in oflag.
 *
 * Returns whatever original_open returns.
 */
int new_open(const char *path, int oflag, ...) {
    mode_t mode = 0;

    if ((oflag & O_CREAT) != 0) {
        /* mode only applies to O_CREAT, so the variadic list is touched only
         * here; va_list is left uninitialized until va_start because a brace
         * initializer is not valid for every va_list representation. */
        va_list ap;
        va_start(ap, oflag);
        mode = (mode_t)va_arg(ap, int);
        va_end(ap);
        /* Cast so the argument type matches %d regardless of mode_t's width. */
        printf("Calling real open('%s', %d, %d)\n", path, oflag, (int)mode);
    } else {
        printf("Calling real open('%s', %d)\n", path, oflag);
    }

    /* mode is 0 when O_CREAT is absent, exactly as before. */
    return original_open(path, oflag, mode);
}

// Demo entry point: rebinds "open" to new_open, then calls open() once so the
// replacement runs.
int main(int argc, const char * argv[]) {
    @autoreleasepool {
        // Fields: symbol name, replacement function, slot that receives the
        // original implementation.
        struct rebinding open_rebinding = { "open", new_open, (void *)&original_open };
        // rebind_symbols copies the array it is given, so a pointer to the
        // single local entry is sufficient.
        rebind_symbols(&open_rebinding, 1);
        // The descriptor is intentionally unused; the call only exists to
        // trigger the hook.
        __unused int fd = open(argv[0], O_RDONLY);
    }
    return 0;
}

先從函數的入口 rebind_symbols 開始談起吧。rebind_symbols 主要是使用 _dyld_register_func_for_add_image 來註冊回調函數,在加載動態庫的時候執行一些操作

/*
 * Registers a batch of rebindings and applies it to the loaded images.
 *
 * rebindings     - array of rebindings to register (copied by prepend_rebindings).
 * rebindings_nel - number of elements in `rebindings`.
 *
 * Returns the value of prepend_rebindings(): 0 on success, negative on failure.
 */
int rebind_symbols(struct rebinding rebindings[], size_t rebindings_nel) {
  // Push the whole batch onto the front of the private _rebindings_head list.
  int retval = prepend_rebindings(&_rebindings_head, rebindings, rebindings_nel);
  if (retval < 0) {
    return retval;
  }

  // A head node with no successor means this is the first call.
  if (!_rebindings_head->next) {
    // First call: register the callback for image additions (which is also
    // invoked for existing images).
    _dyld_register_func_for_add_image(_rebind_symbols_for_image);
    return retval;
  }

  // Later calls: the callback is already registered, so just run over the
  // images that are currently loaded.
  uint32_t image_count = _dyld_image_count();
  for (uint32_t idx = 0; idx < image_count; idx++) {
    _rebind_symbols_for_image(_dyld_get_image_header(idx), _dyld_get_image_vmaddr_slide(idx));
  }
  return retval;
}

對於prepend_rebindings的代碼以下

// Node of the singly linked list that stores registered rebinding batches.
struct rebindings_entry {
  struct rebinding *rebindings;   // heap copy of one batch (allocated in prepend_rebindings)
  size_t rebindings_nel;          // number of elements in `rebindings`
  struct rebindings_entry *next;  // previously registered node, or NULL for the oldest one
};

// Head of the list; rebind_symbols() passes its address to prepend_rebindings().
static struct rebindings_entry *_rebindings_head;

/*
 * Copies `nel` rebindings into a newly allocated node and pushes that node
 * onto the front of the list at *rebindings_head.
 *
 * rebindings_head - address of the list head pointer; updated on success.
 * rebindings      - array to copy; the caller keeps ownership of it.
 * nel             - number of elements in `rebindings`.
 *
 * Returns 0 on success, -1 if the byte size would overflow or an allocation
 * fails (the list is left untouched in that case).
 */
static int prepend_rebindings(struct rebindings_entry **rebindings_head,
                              struct rebinding rebindings[],
                              size_t nel) {
  // Refuse element counts whose total byte size does not fit in size_t;
  // otherwise the allocation below would be too small for the memcpy.
  if (nel > (size_t)-1 / sizeof(struct rebinding)) {
    return -1;
  }
  size_t copy_bytes = sizeof(struct rebinding) * nel;

  struct rebindings_entry *new_entry = malloc(sizeof *new_entry);
  if (!new_entry) {
    return -1;
  }
  new_entry->rebindings = malloc(copy_bytes);
  if (!new_entry->rebindings) {
    free(new_entry);
    return -1;
  }

  // Fill the node, then link it in as the new head.
  memcpy(new_entry->rebindings, rebindings, copy_bytes);
  new_entry->rebindings_nel = nel;
  new_entry->next = *rebindings_head;
  *rebindings_head = new_entry;
  return 0;
}

基礎結構解釋

Dl_info

/*
 * Structure filled in by dladdr().
 */
typedef struct dl_info {
    const char *dli_fname;  /* Pathname of shared object */
    void       *dli_fbase;  /* Base address of shared object */
    const char *dli_sname;  /* Name of nearest symbol */
    void       *dli_saddr;  /* Address of nearest symbol */
} Dl_info;

咱們一會通過 dladdr()處理後的有效信息都會放進這個結構體中

  • fname:路徑名,例如
/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneSimulator.platform/Developer/SDKs/iPhoneSimulator.sdk/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation複製代碼
  • dli_fbase:鏡像的起始地址(Base address of shared object,比如上面的 CoreFoundation)
  • dli_saddr :符號的地址
  • dli_sname:符號的名字,即下面的第四列的函數信息
Thread 0:
0     libsystem_kernel.dylib          0x11135810a __semwait_signal + 94474
1     libsystem_c.dylib               0x1110dab0b sleep + 518923
2     QYPerformanceMonitor            0x10dda4f1b -[ViewController tableView:cellForRowAtIndexPath:] + 7963
3     UIKit                           0x10ed4d4f4 -[UITableView _createPreparedCellForGlobalRow:withIndexPath:willDisplay:] + 1586420複製代碼

LC_SYMTAB

/* LC_SYMTAB load command: locates the symbol table and the string table. */
struct symtab_command {
    uint32_t cmd;      /* LC_SYMTAB */
    uint32_t cmdsize;  /* sizeof(struct symtab_command) */
    uint32_t symoff;   /* symbol table offset */
    uint32_t nsyms;    /* number of symbol table entries */
    uint32_t stroff;   /* string table offset */
    uint32_t strsize;  /* string table size in bytes */
};

主要是提供符號表的偏移量,以及元素個數,還有字符串表的偏移和其長度。符號表在 Mach-O目標文件中的地址能夠經過LC_SYMTAB加載命令指定的 symoff找到,對應的符號名稱在stroff,總共有nsyms條符號信息

LC_DYSYMTAB

這個數據結構有些複雜,有興趣的可以閱讀 loader.h 文件,內部標示了動態符號表的偏移量和符號個數

struct dysymtab_command {
    uint32_t cmd;    /* LC_DYSYMTAB */
    uint32_t cmdsize;    /* sizeof(struct dysymtab_command) */
    uint32_t indirectsymoff; /* file offset to the indirect symbol table */
    uint32_t nindirectsyms;  /* number of indirect symbol table entries */
    .......複製代碼

_rebind_symbols_for_image

對於關鍵的代碼 _rebind_symbols_for_image 以下

// First part of rebind_symbols_for_image (the body continues in later
// snippets of this article). This part walks the load commands of the image
// at `header` and records the __LINKEDIT segment command plus the LC_SYMTAB
// and LC_DYSYMTAB commands.
static void rebind_symbols_for_image(struct rebindings_entry *rebindings,
                                     const struct mach_header *header,
                                     intptr_t slide) {
  Dl_info info;
  // Give up when dladdr() reports failure (0) for the header address.
  if (dladdr(header, &info) == 0) {
    return;
  }

  // Segment load commands (segment_command_t is defined outside this excerpt;
  // the original note says segment_command_64).
  segment_command_t *cur_seg_cmd;
  segment_command_t *linkedit_segment = NULL;
  // LC_SYMTAB
  struct symtab_command* symtab_cmd = NULL;
  // LC_DYSYMTAB
  struct dysymtab_command* dysymtab_cmd = NULL;

  // The load commands are what we are looking for, so start just past
  // mach_header_t and advance by each command's cmdsize.
  uintptr_t cur = (uintptr_t)header + sizeof(mach_header_t);
  for (uint i = 0; i < header->ncmds; i++, cur += cur_seg_cmd->cmdsize) {
    cur_seg_cmd = (segment_command_t *)cur;
    if (cur_seg_cmd->cmd == LC_SEGMENT_ARCH_DEPENDENT) {
      if (strcmp(cur_seg_cmd->segname, SEG_LINKEDIT) == 0) {
        // Found the __LINKEDIT segment.
        linkedit_segment = cur_seg_cmd;
      }
    } else if (cur_seg_cmd->cmd == LC_SYMTAB) {
      // Found LC_SYMTAB.
      symtab_cmd = (struct symtab_command*)cur_seg_cmd;
    } else if (cur_seg_cmd->cmd == LC_DYSYMTAB) {
      // Found LC_DYSYMTAB.
      dysymtab_cmd = (struct dysymtab_command*)cur_seg_cmd;
    }
  }複製代碼

爲什麼要尋找這幾個 LoadCommand 的信息呢?就如上面介紹的,__LINKEDIT、LC_DYSYMTAB、LC_SYMTAB 都提供了重要的信息。

__LINKEDIT段 含有爲動態連接庫使用的原始數據,好比符號,字符串,重定位表條目等等

閱讀下面的代碼以前,先來看一個計算公式

鏈接時程序的基址 = __LINKEDIT.VM_Address - __LINKEDIT.File_Offset + slide 的改變值

這裏出現了一個 slide,那麼slide是啥呢?先看一下ASLR

ASLR:Address space layout randomization,將可執行程序隨機裝載到內存中,這裏的隨機只是偏移,而不是打亂,具體作法就是經過內核將 Mach-O的段「平移」某個隨機係數。slide 正是ASLR引入的偏移

也就是說程序的基址等於__LINKEDIT的地址減去偏移量,而後再加上ASLR形成的偏移

// Base address used to turn the file offsets below into in-memory addresses:
// slide + __LINKEDIT vmaddr - __LINKEDIT fileoff.
// NOTE(review): linkedit_segment, symtab_cmd and dysymtab_cmd are initialised
// to NULL earlier and are dereferenced here without a check in this excerpt —
// confirm the full function guards against commands that were not found.
  uintptr_t linkedit_base = (uintptr_t)slide + linkedit_segment->vmaddr - linkedit_segment->fileoff;

  // Symbol table address = base + symbol table offset.
  nlist_t *symtab = (nlist_t *)(linkedit_base + symtab_cmd->symoff);
  // String table address = base + string table offset.
  char *strtab = (char *)(linkedit_base + symtab_cmd->stroff);

  // Indirect symbol table address = base + indirect symbol table offset.
  uint32_t *indirect_symtab = (uint32_t *)(linkedit_base + dysymtab_cmd->indirectsymoff);複製代碼

符號表中的元素都是nlist_t結構體,nlist_t中有不少學問,這裏先看一下他的基礎結構

/*
 * This is the symbol table entry structure for 32-bit architectures.
 */
struct nlist {
    union {
        uint32_t n_strx;  /* index into the string table */
    } n_un;
    uint8_t  n_type;      /* type flag, see below */
    uint8_t  n_sect;      /* section number or NO_SECT */
    int16_t  n_desc;      /* see <mach-o/stab.h> */
    uint32_t n_value;     /* value of this symbol (or stab offset) */
};

然後再次遍歷 load commands,尋找 __DATA 和 __DATA_CONST 的 section,並對 __nl_symbol_ptr 以及 __la_symbol_ptr 進行 rebind

// Second pass over the load commands (continuation of
// rebind_symbols_for_image): restart just past the Mach-O header.
cur = (uintptr_t)header + sizeof(mach_header_t);
  for (uint i = 0; i < header->ncmds; i++, cur += cur_seg_cmd->cmdsize) {
    cur_seg_cmd = (segment_command_t *)cur;
    if (cur_seg_cmd->cmd == LC_SEGMENT_ARCH_DEPENDENT) {
      // Only segments named SEG_DATA or SEG_DATA_CONST are processed.
      if (strcmp(cur_seg_cmd->segname, SEG_DATA) != 0 &&
          strcmp(cur_seg_cmd->segname, SEG_DATA_CONST) != 0) {
        continue;
      }

      // Section headers follow the segment command; rebind every lazy and
      // non-lazy symbol pointer section found in this segment.
      for (uint j = 0; j < cur_seg_cmd->nsects; j++) {
        section_t *sect =
          (section_t *)(cur + sizeof(segment_command_t)) + j;
        if ((sect->flags & SECTION_TYPE) == S_LAZY_SYMBOL_POINTERS) {
          // sect: the section, symtab: symbol table, strtab: string table,
          // indirect_symtab: indirect symbol table.
          perform_rebinding_with_section(rebindings, sect, slide, symtab, strtab, indirect_symtab);
        }
        if ((sect->flags & SECTION_TYPE) == S_NON_LAZY_SYMBOL_POINTERS) {
          perform_rebinding_with_section(rebindings, sect, slide, symtab, strtab, indirect_symtab);
        }
      }
    }複製代碼

## perform_rebinding_with_section

__nl_symbol_ptr 和 __la_symbol_ptr 這兩個 section 中的 reserved1 字段指明對應的 indirect symbol table 起始的 index

For the two relevant sections, the section headers (`struct section`s from `<mach-o/loader.h>`) provide an offset (in the reserved1 field) into what is known as the indirect symbol table. The indirect symbol table, which is located in the __LINKEDIT segment of the binary, is just an array of indexes into the symbol table (also in __LINKEDIT) whose order is identical to that of the pointers in the non-lazy and lazy symbol sections.

So, given `struct section nl_symbol_ptr`, the corresponding index in the symbol table of the first address in that section is `indirect_symbol_table[nl_symbol_ptr->reserved1]`. The symbol table itself is an array of `struct nlist`s (see `<mach-o/nlist.h>`), and each nlist contains an index into the string table in __LINKEDIT, where the actual symbol names are stored. So, for each pointer in __nl_symbol_ptr and __la_symbol_ptr, we are able to find the corresponding symbol and then the corresponding string to compare against the requested symbol names, and if there is a match, we replace the pointer in the section with the replacement.

結合英文,看下面的代碼就很容易理解

// First part of perform_rebinding_with_section (the body continues in the
// next snippet of this article). For every pointer slot of `section`, this
// part looks up the name of the symbol the slot refers to.
//
// rebindings      - list of registered rebinding batches.
// section         - a lazy or non-lazy symbol pointer section.
// slide           - added to section->addr to get the slots' address.
// symtab          - symbol table.
// strtab          - string table.
// indirect_symtab - indirect symbol table.
static void perform_rebinding_with_section(struct rebindings_entry *rebindings,
                                           section_t *section,
                                           intptr_t slide,
                                           nlist_t *symtab,
                                           char *strtab,
                                           uint32_t *indirect_symtab) {
  // reserved1 is the index in the indirect symbol table where this section's
  // entries begin, so this points at the first entry for the section.
  uint32_t *indirect_symbol_indices = indirect_symtab + section->reserved1;

  // The section's pointer slots: section address plus slide.
  void **indirect_symbol_bindings = (void **)((uintptr_t)slide + section->addr);

  // One iteration per pointer-sized slot in the section.
  for (uint i = 0; i < section->size / sizeof(void *); i++) {
    // Index of this slot's symbol in the symbol table.
    uint32_t symtab_index = indirect_symbol_indices[i];
    // Entries flagged absolute and/or local are skipped.
    if (symtab_index == INDIRECT_SYMBOL_ABS || symtab_index == INDIRECT_SYMBOL_LOCAL ||
        symtab_index == (INDIRECT_SYMBOL_LOCAL   | INDIRECT_SYMBOL_ABS)) {
      continue;
    }
    // Offset of the symbol's name in the string table, read from its
    // symbol table entry.
    uint32_t strtab_offset = symtab[symtab_index].n_un.n_strx;

    // The symbol's name: string table base plus that offset.
    char *symbol_name = strtab + strtab_offset;複製代碼

上面的代碼其實就能夠用官方的一個圖片很直觀的表示

走到這裏是找到了字符串表對應的符號(字符串)

如何替換實現

遍歷 rebindings 數組,符號進行比較,相同的符號就進行實現替換,這裏的代碼比較清晰,直接貼出

// Continuation of the per-slot loop in perform_rebinding_with_section: walk
// every registered batch and replace the slot when the names match.
struct rebindings_entry *cur = rebindings;
    while (cur) {
        for (uint j = 0; j < cur->rebindings_nel; j++) {
            // Compare from the second character of the symbol name.
            // NOTE(review): presumably this skips a leading '_' in the symbol
            // name; for an empty symbol_name it would read past the
            // terminator — confirm the name is never empty.
            if (strcmp(&symbol_name[1], cur->rebindings[j].name) == 0) {
                // Save the current slot value for the caller, unless the slot
                // already holds the replacement.
                if (cur->rebindings[j].replaced != NULL &&
                    indirect_symbol_bindings[i] != cur->rebindings[j].replacement) {
                    *(cur->rebindings[j].replaced) = indirect_symbol_bindings[i];
                }
                // Point the slot at the replacement implementation.
                indirect_symbol_bindings[i] = cur->rebindings[j].replacement;
                // First match wins: stop searching for this slot.
                goto symbol_loop;
            }
        }
        cur = cur->next;
    }
// Jump target for a match; the empty statement lets the label sit at the end
// of the block.
symbol_loop:;
}複製代碼

參考連接

相關文章
相關標籤/搜索