深刻理解計算機系統 (CS:APP) - 高速緩存實驗 Cache Lab 解析

時間 2020-08-09

標籤深刻理解計算機系統 app 高速緩存實驗 cache lab 解析简体版

原文鏈接

原文地址：https://billc.io/2019/05/csapp-cachelab/

這個實驗是這學期的第四個實驗。作爲緩存這一章的配套實驗，設計得非常精妙。難度上來說，相比之前修改現成文件的實驗，直接寫一個完整程序的難度也更高了一些。需要注意的是檢查程序在編譯時開啓了 -Werror，需要保證沒有警告才能成功編譯。

從官方文檔得知需要完善 csim.c 和 trans.c 兩個文件：第一個是模擬高速緩存的程序，從由 valgrind 生成的 trace 文件中統計 hit、miss 和 eviction 的數量；第二個文件需要優化矩陣轉置程序，降低程序的不命中數。

PART A

這一部分的核心是使用了一個結構體來模擬一個緩存行：

/* One simulated cache line; only bookkeeping fields are kept, no data bytes. */
typedef struct {
    int valid;    /* 1 when the line holds a block, 0 when it is empty */
    ulong tag;    /* tag of the cached address */
    clock_t time; /* recency stamp; the smallest stamp in a set is evicted first */
} CacheLine;

再通過把緩存行在內存中動態分配成一個二維數組，實現模擬緩存的功能。並且使用了 typedef CacheLine *CacheSet; 和 typedef CacheSet *CacheHead; 來讓程序更整齊。輸入來源於文件和命令行參數，可以用 getopt() 函數來解析參數。

各個函數的作用如下：

CacheHead CacheInit(int S, int E) 爲緩存動態分配內存；
int CacheJudge(CacheHead cache, ulong index, ulong tag) 判斷緩存狀態，是否有效，標記匹配；
void CacheEvict(CacheHead cache, ulong index, ulong tag) 執行 eviction 操做；
void CacheTouch(CacheHead cache, ulong index, ulong tag) 執行讀取操做，只更新時間戳；
void CacheInsert(CacheHead cache, ulong index, ulong tag) 執行緩存寫入操做；
void Adder(int type, int num) 計數器，增長 hit, miss 和 eviction 的數量，並根據配置選擇打印信息；
void printByte(bytept h, int len) 逐字節以 16 進制打印內存數據；
void Execute(CacheHead cache, char type, ulong address, int len) 主要的執行函數；
int main(int argc, char *args[]) main 函數，讀取參數，打開文件；

完整的程序代碼如下：

// Written By @BillChen
// 2019.5.20
#include "cachelab.h"
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

/* Address width in bits assumed when an address is split into tag, set index and block offset. */
#define MACHINE_BITS 64
/* Status codes returned by CacheJudge(). */
#define NEED_EVICT -1
#define NO_MATCH -2
#define CACHED 1
/* Counter selectors accepted by Adder(). */
#define ADD_HIT 1
#define ADD_MISS 2
#define ADD_EVICT 3

/* Running totals, updated by Adder() and handed to printSummary() at the end of main(). */
int totalMissCount = 0;
int totalHitCount = 0;
int totalEvictCount = 0;

/* Unsigned type used for addresses, tags and set indices. */
typedef unsigned long ulong;
/* Byte-wise view of an object, used by printByte(). */
typedef unsigned char *bytept;
/* getopt() option string: s, E, b and t take an argument; h, V and v are flags. */
const char *optString = "s:E:b:t:hVv";

/* Run-time configuration, filled in by main() from the command line. */
struct globalOptions {
    int setIndexBits;     /* -s: number of set index bits */
    int associativity;    /* -E: number of lines per set */
    int blockBits;        /* -b: number of block offset bits */
    int verboseFlag;      /* set to 1 by -v and by -V */
    int tagBits;          /* computed in main(): MACHINE_BITS - blockBits - setIndexBits */
    int superVerboseFlag; /* set to 1 by -V */
    char *traceDir;       /* -t: path of the trace file passed to fopen() */
} globalOptions;
/*
 * Hit / miss / eviction triple.
 * NOTE(review): nothing in the visible code refers to this struct; the totals
 * are kept in the global counters instead. Confirm whether it is still needed.
 */
struct result {
    int hit;
    int miss;
    int evict;
};
/* One simulated cache line; only bookkeeping fields are kept, no data bytes. */
typedef struct {
    int valid;    /* 1 when the line holds a block, 0 when it is empty */
    ulong tag;    /* tag of the cached address */
    clock_t time; /* recency stamp written on insert and touch; smallest stamp is evicted */
} CacheLine;

/* A set is an array of lines (associativity entries). */
typedef CacheLine *CacheSet;
/* The cache is an array of sets, indexed by set index. */
typedef CacheSet *CacheHead;

/* Print the command-line help text of the simulator to standard output. */
void usage() {
    static const char *const helpLines[] = {
        "Usage: ./csim [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n",
        "-h get help info\n",
        "-v Optional verbose flag that displays trace info\n",
        "-V Optional super verbose flag that displays very detailed trace info\n",
        "-s <s> Number of set index bits\n",
        "-E <E> Associativity (number of lines per set)\n",
        "-b <b> Number of block bits\n",
        "-t <tracefile>: Name of the valgrind trace to replay\n",
    };
    size_t lineCount = sizeof helpLines / sizeof helpLines[0];
    size_t n;

    for (n = 0; n < lineCount; n++) {
        fputs(helpLines[n], stdout);
    }
}

CacheHead CacheInit(int S, int E) {
    CacheHead cache;
    cache = calloc(1 << S, sizeof(CacheSet));
    if (cache == NULL) {
        printf("Fail to allocate memory for cache.\n");
        exit(EXIT_FAILURE);
    }
    int i = 0;
    for (i = 0; i < 1 << S; i++) {
        if ((cache[i] = calloc(E, sizeof(CacheLine))) == NULL) {
            printf("Fail to allocate memory for cache.\n");
            exit(EXIT_FAILURE);
        }
    }
    for (i = 0; i < 1 << S; i++) {
        int j;
        for (j = 0; j < E; j++) {
            cache[i][j].valid = 0;
        }
    }
    return cache;
}

/*
 * Classify set `index` of `cache` with respect to `tag`.
 *
 * Returns CACHED when a valid line of the set carries `tag`; otherwise
 * NEED_EVICT when every line of the set is valid, or NO_MATCH when at least
 * one line is still free. The cache is not modified.
 */
int CacheJudge(CacheHead cache, ulong index, ulong tag) {
    int freeLines = 0;
    int way;

    for (way = 0; way < globalOptions.associativity; way++) {
        const CacheLine *line = &cache[index][way];

        if (line->valid == 1 && line->tag == tag) {
            return CACHED;
        }
        if (line->valid == 0) {
            freeLines++;
        }
    }
    return freeLines == 0 ? NEED_EVICT : NO_MATCH;
}

void CacheInsert(CacheHead cache, ulong index, ulong tag) {
    int freeLine = 0, i;
    for (i = 0; i < globalOptions.associativity; i++) {
        if (cache[index][i].valid == 0)
            break;
        freeLine++;
    }
    CacheLine *target = cache[index] + freeLine;
    target->tag = tag;
    target->valid = 1;
    target->time = clock();
}

void CacheEvict(CacheHead cache, ulong index, ulong tag) {
    int firstLine = 0, i = 0;
    clock_t firstCachedTime = cache[index][i].time;
    for (i = 0; i < globalOptions.associativity; i++) {
        if (cache[index][i].time < firstCachedTime) {
            firstCachedTime = cache[index][i].time;
            firstLine = i;
        }
    }
    CacheLine *target = cache[index] + firstLine;
    target->tag = 0;
    target->time = 0;
    target->valid = 0;
}

void CacheTouch(CacheHead cache, ulong index, ulong tag) {
    int touchLine = 0;
    while (cache[index][touchLine].tag != tag)
        touchLine++;
    cache[index][touchLine].time = clock();
}

/*
 * Add `num` to the counter selected by `type` (ADD_HIT, ADD_MISS or
 * ADD_EVICT). When verbose output is enabled and `num` is non-zero, the
 * matching word is printed as well. Any other `type` is ignored.
 */
void Adder(int type, int num) {
    int *counter;
    const char *label;

    if (type == ADD_EVICT) {
        counter = &totalEvictCount;
        label = "eviction ";
    } else if (type == ADD_HIT) {
        counter = &totalHitCount;
        label = "hit ";
    } else if (type == ADD_MISS) {
        counter = &totalMissCount;
        label = "miss ";
    } else {
        return;
    }

    *counter += num;
    if (globalOptions.verboseFlag && num != 0) {
        printf("%s", label);
    }
}

/*
 * Print `len` bytes starting at `h` as two-digit hexadecimal values separated
 * by spaces, in memory order, followed by a newline. A non-positive `len`
 * prints only the newline.
 */
void printByte(bytept h, int len) {
    bytept cursor = h;
    int remaining = len;

    while (remaining > 0) {
        printf("%.2x ", *cursor);
        cursor++;
        remaining--;
    }
    printf("\n");
}

/*
 * Replay one memory access against the simulated cache and update the
 * hit / miss / eviction counters.
 *
 * cache    cache returned by CacheInit()
 * type     access kind from the trace; 'M' (modify) is a load followed by a
 *          store, so it scores one extra hit after the first access
 * address  accessed address
 * len      access size from the trace; not used by the simulation
 *
 * Exits with EXIT_FAILURE if CacheJudge() returns an unknown status.
 */
void Execute(CacheHead cache, char type, ulong address, int len) {
    int tagShift = globalOptions.blockBits + globalOptions.setIndexBits;
    ulong index = 0;
    ulong tag = 0;
    int status;

    (void)len;

    /* Extract the fields with shift-and-mask. Shifting left by tagBits and
     * right by MACHINE_BITS - setIndexBits would shift by the full word width
     * (undefined behaviour) when there are no set index bits, i.e. for a
     * fully associative cache. */
    if (globalOptions.setIndexBits > 0 && globalOptions.setIndexBits < MACHINE_BITS &&
        globalOptions.blockBits < MACHINE_BITS) {
        ulong setMask = ((ulong)1 << globalOptions.setIndexBits) - 1;
        index = (address >> globalOptions.blockBits) & setMask;
    }
    if (tagShift < MACHINE_BITS) {
        tag = address >> tagShift;
    }

    status = CacheJudge(cache, index, tag);
    if (globalOptions.verboseFlag == 1) {
        if (globalOptions.superVerboseFlag == 1) {
            printf("\n[address:] ");
            printByte((bytept)&address, sizeof(long));
            printf("[index:] ");
            printByte((bytept)&index, sizeof(long));
            printf("[tag:] ");
            printByte((bytept)&tag, sizeof(long));
            /* index and tag are unsigned long, so they are printed with %lu */
            printf("(Decimal)[index: %lu, tag: %lu]\n------------------------------------------- ", index, tag);
        } else {
            printf("(Decimal)[index: %lu, tag: %lu] ------ ", index, tag);
        }
    }

    switch (status) {
    case CACHED:
        CacheTouch(cache, index, tag);
        Adder(ADD_HIT, 1);
        break;
    case NO_MATCH:
        CacheInsert(cache, index, tag);
        Adder(ADD_MISS, 1);
        break;
    case NEED_EVICT:
        CacheEvict(cache, index, tag);
        CacheInsert(cache, index, tag);
        Adder(ADD_MISS, 1);
        Adder(ADD_EVICT, 1);
        break;
    default:
        printf("Unknown error.\n");
        exit(EXIT_FAILURE);
    }

    /* The store half of a modify always hits the block the load just brought in. */
    if (type == 'M') {
        Adder(ADD_HIT, 1);
    }

    if (globalOptions.verboseFlag == 1) {
        printf("\n");
    }
}

int main(int argc, char *args[]) {
    char ch;
    while ((ch = getopt(argc, args, optString)) != -1) {
        switch (ch) {
        case 's':
            if (atoi(optarg) < 0) {
                printf("Unvalid input for <s>. Try Again.\n");
                exit(EXIT_FAILURE);
            }
            globalOptions.setIndexBits = atoi(optarg);
            break;
        case 'E':
            if (atoi(optarg) < 0) {
                printf("Unvalid input for <E>. Try Again.\n");
                exit(EXIT_FAILURE);
            }
            globalOptions.associativity = atoi(optarg);
            break;
        case 'b':
            if (atoi(optarg) < 0) {
                printf("Unvalid input for <b>. Try Again.\n");
                exit(EXIT_FAILURE);
            }
            globalOptions.blockBits = atoi(optarg);
            break;
        case 't':
            globalOptions.traceDir = optarg;
            break;
        case 'v':
            globalOptions.verboseFlag = 1;
            break;
        case 'h':
            usage();
            exit(EXIT_FAILURE);
        case 'V':
            globalOptions.verboseFlag = 1;
            globalOptions.superVerboseFlag = 1;
            break;
        default:
            usage();
            exit(EXIT_FAILURE);
            break;
        }
    }
    globalOptions.tagBits = MACHINE_BITS - globalOptions.blockBits - globalOptions.setIndexBits;

    FILE *traceFile = fopen(globalOptions.traceDir, "r");
    if (traceFile == NULL) {
        printf("Fail to open file: %s\n", globalOptions.traceDir);
        exit(EXIT_FAILURE);
    }
    CacheHead cache = CacheInit(globalOptions.setIndexBits, globalOptions.associativity);
    char traceLine[32];
    while (fgets(traceLine, 32, traceFile) != NULL) {
        char mode;
        ulong address;
        int len;
        sscanf(traceLine, " %c %lx,%d", &mode, &address, &len);
        if (mode == 'I')
            continue;
        if (globalOptions.verboseFlag == 1) {
            printf("%c %lx,%d ", mode, address, len);
        }
        Execute(cache, mode, address, len);
    }
    printSummary(totalHitCount, totalMissCount, totalEvictCount);
    free(cache);
    return 0;
}

最終在 ./driver.py 的測試下，該程序和 csim-ref 的運行結果一致。

PART B

按照官方文檔的說明，需要在 trans.c 中寫入一個優化的矩陣轉置函數，儘可能地降低不命中數。使用命令 ./test-trans -M <col> -N <row> 可以查看這一轉置函數的不命中數（M 爲矩陣 A 的列數，N 爲行數）。生成的 trace.fi 文件還可以利用 PART A 寫的緩存模擬器檢查命中情況。

從官方文檔得知，要在 PART B 中得到分數，需要完成三個測試並滿足對應的不命中數條件。

Test I: 32 * 32

由於程序使用的緩存塊偏移位數 b 爲 5，也就是塊大小爲 2^5 = 32 字節。sizeof(int) = 4，所以一個緩存行可以存儲 8 個整數。

先研究原始的一個簡單的矩陣轉置函數：

/* Baseline transpose: copy A (N rows, M columns) into B one element at a time. */
int i, j, tmp;
for (i = 0; i < N; i++) {
    for (j = 0; j < M; j++) {
        tmp = A[i][j];
        /* consecutive iterations of j store into different rows of B */
        B[j][i] = tmp;
    }
}

這一函數的運行結果出現了 1000 多個 miss。提取一小部分原始的 trace 文件，利用 csim 查看詳細的 miss 和 eviction 信息，可以發現在讀取的時候發生了嚴重的抖動，導致了大量 miss 的出現。

所以可以利用矩陣分塊的思想。矩陣的每一行（32 個 int，共 128 字節）可以被存入 4 個緩存行中，一共有 32 個緩存行，所以每過 8 行就會出現一次和前面相同的組索引，發生 miss 和 eviction。所以考慮將 32 * 32 的矩陣分成 16 個 8 * 8 的矩陣，每一次都將一行的 8 個 int 分別存儲進 t1 – t8，再寫入 B。

即，將矩陣劃分紅以下結構：

1	2	3	4
5	6	7	8
9	10	11	12
13	14	15	16

其中每個小塊都是 8 * 8，每一行可以完整存儲到緩存行中的矩陣。這種狀況在 transpose_submit() 中的代碼以下：

/* 32 x 32 case: transpose the matrix tile by tile, using 8 x 8 tiles. */
if(N == 32 && M == 32){
    int i, j, k;
    int t1, t2, t3, t4, t5, t6, t7, t8;
    /* (i, j) is the top-left corner of the current tile of A */
    for (i = 0; i < 32; i += 8) {
        for (j = 0; j < 32; j += 8) {
            /* k selects one row of the tile */
            for (k = 0; k < 8; k++) {
                /* Load the whole 8-element row of A into locals first, so no
                 * store to B happens between the loads. */
                t1 = A[i + k][j];
                t2 = A[i + k][j + 1];
                t3 = A[i + k][j + 2];
                t4 = A[i + k][j + 3];
                t5 = A[i + k][j + 4];
                t6 = A[i + k][j + 5];
                t7 = A[i + k][j + 6];
                t8 = A[i + k][j + 7];
                /* Write the row out as column i + k of the matching tile of B. */
                B[j][i + k] = t1;
                B[j + 1][i + k] = t2;
                B[j + 2][i + k] = t3;
                B[j + 3][i + k] = t4;
                B[j + 4][i + k] = t5;
                B[j + 5][i + k] = t6;
                B[j + 6][i + k] = t7;
                B[j + 7][i + k] = t8;
            }
        }
    }
}

結果以下圖所示：

Test II: 64 * 64

和第一種情況的測試類似。但是由於大小變成了 64 * 64，每過 4 行就會出現一次衝突的情況。所以可以先分成 8 * 8 的塊，然後再把 8 * 8 的塊分成 4 個 4 * 4 的塊。讀取一行，但存儲進的位置如圖所示。逆序存儲之後再逐行處理 C’ 和 B’ 處的數據。

因爲以前是逆序存儲的，因此在 C’ 會把 0 加載進緩存，而在 B’ 會把 24 加載進緩存，再利用 t1, t2, t3, t4 四個變量做臨時變量存儲，交換 0 行和 24 行的位置。

這一部分比較複雜，這裏參考了歐陽鬆的博客（https://www.ouyangsong.com/posts/55291/#fn4），大概的邏輯以下圖所示：

具體的代碼實現以下：

/*
 * 64 x 64 case: work on 8 x 8 tiles, each handled as four 4 x 4 quadrants.
 * Step 1 transposes the top half of the tile of A; the right quadrant is
 * parked, in reversed row order, in the top-right quadrant of the tile of B.
 * Step 2 moves the parked values to their final rows and transposes the
 * bottom half of the tile of A.
 */
else if (N == 64 && M == 64) {
    int t0, t1, t2, t3, t4, t5, t6, t7;
    for (int i = 0; i < N; i += 8) {
        for (int j = 0; j < M; j += 8) {
            /* Step 1: rows i .. i+3 of the tile of A */
            for (int k = i; k < i + 4; k++) {
                t0 = A[k][j];
                t1 = A[k][j + 1];
                t2 = A[k][j + 2];
                t3 = A[k][j + 3];
                t4 = A[k][j + 4];
                t5 = A[k][j + 5];
                t6 = A[k][j + 6];
                t7 = A[k][j + 7];
                /* left quadrant: already in its final, transposed position */
                B[j][k] = t0;
                B[j + 1][k] = t1;
                B[j + 2][k] = t2;
                B[j + 3][k] = t3;
                /* right quadrant: parked in rows j .. j+3 of B in reverse
                 * order, so row j + r holds A[k][j + 7 - r] */
                B[j + 0][k + 4] = t7;
                B[j + 1][k + 4] = t6;
                B[j + 2][k + 4] = t5;
                B[j + 3][k + 4] = t4;
            }
            /* Step 2: h pairs row j + 3 - h of B (holding parked values) with
             * row j + 4 + h of B (where those values belong) */
            for (int h = 0; h < 4; h++) {
                /* column j + 3 - h of the bottom-left quadrant of A */
                t0 = A[i + 4][j + 3 - h];
                t1 = A[i + 5][j + 3 - h];
                t2 = A[i + 6][j + 3 - h];
                t3 = A[i + 7][j + 3 - h];
                /* column j + 4 + h of the bottom-right quadrant of A */
                t4 = A[i + 4][j + 4 + h];
                t5 = A[i + 5][j + 4 + h];
                t6 = A[i + 6][j + 4 + h];
                t7 = A[i + 7][j + 4 + h];
                /* move the parked values A[i .. i+3][j + 4 + h] to their final row */
                B[j + 4 + h][i + 0] = B[j + 3 - h][i + 4];
                B[j + 4 + h][i + 1] = B[j + 3 - h][i + 5];
                B[j + 4 + h][i + 2] = B[j + 3 - h][i + 6];
                B[j + 4 + h][i + 3] = B[j + 3 - h][i + 7];
                /* overwrite the freed slots with the bottom-left column of A */
                B[j + 3 - h][i + 4] = t0;
                B[j + 3 - h][i + 5] = t1;
                B[j + 3 - h][i + 6] = t2;
                B[j + 3 - h][i + 7] = t3;
                /* finish the row with the bottom-right column of A */
                B[j + 4 + h][i + 4] = t4;
                B[j + 4 + h][i + 5] = t5;
                B[j + 4 + h][i + 6] = t6;
                B[j + 4 + h][i + 7] = t7;
            }
        }
    }
}

獲得以下結果：

Test III: 61 * 67

這一測試中由於矩陣不規則，而且也不是 8 的倍數，所以在行與行之間沒有特別明顯的衝突不命中的關係。可以嘗試用分塊矩陣的方式優化。經過嘗試 8 * 8 的分塊和 16 * 16 的分塊後，發現使用 16 * 16 的分塊方式可以將 miss 數降低到 2000 以下。

這一部分的代碼以下：

/* Any other size: transpose in 16 x 16 tiles, clipping tiles at the matrix edges. */
else {
    int i, j, k, h;
    /* (i, j) is the top-left corner of the current tile of A */
    for (i = 0; i < N; i += 16) {
        for (j = 0; j < M; j += 16) {
            /* the extra k < N and h < M tests stop partial tiles at the last row / column */
            for (k = i; k < i + 16 && k < N; k++) {
                for (h = j; h < j + 16 && h < M; h++) {
                    B[h][k] = A[k][h];
                }
            }
        }
    }
}

能夠獲得 1992 的 miss 數。