part A

2021-09-23 22:18:42 星期四

#include <stdio.h>  //printf()
#include <time.h>  //clock_t
#include <stdlib.h>  //atoi()
#include <getopt.h>  //getopt()
#include "cachelab.h"

typedef struct 
{
    unsigned tag;    /* tag bits per line */
    _Bool valid;     /* 1 valid bit per line */
    clock_t stamp;   /* time stamp, to implement LRU algorithm */
}cache_line;

cache_line** init_cache(int s, int E) {
    /* cache size = S(=2^s sets per cache) * E(lines per set) * B(=2^b bytes per block)
     * we use cache_line_size here instead of B
     */
    int sets_cnt = 1<<s, cache_line_size = sizeof(cache_line);
    
    /* cache_line** cache is a two dimensional array, and cache[i] means
     * the ith set in the cache.
     */
    cache_line** cache = (cache_line**) malloc(cache_line_size * sets_cnt);
    
    for(int i = 0; i < sets_cnt; ++i) {
        int set_size = cache_line_size * E;
        //initial the ith set
        cache[i] = (cache_line*) malloc(set_size);
        //the for loop can be replaced by
        //memset(cache[i], 0, set_size)  //#include <string.h>
        for(int j = 0; j < E; ++j) {
            cache[i][j].tag = 0;
            cache[i][j].valid = 0;
            cache[i][j].stamp = 0;
        }
    }
    return cache;
}

void free_cache(cache_line** cache, int s) {
    int sets_cnt = 1<<s;
    for(int i = 0; i < sets_cnt; ++i) {
        free(cache[i]);
    }
    free(cache);
}

_Bool set_insert(cache_line* set, unsigned long tag, int E) {
    int old_stamp_index = 0;
    for(int i = 0; i < E; ++i) {
        if(set[i].stamp < set[old_stamp_index].stamp) old_stamp_index = i;
        if(!set[i].valid) {
            set[i].valid = 1;
            set[i].tag = tag;
            set[i].stamp = clock();
            return 0;
        }
    }
    //all lines valid, evict the Least-Recent-Used
    set[old_stamp_index].tag = tag;
    set[old_stamp_index].stamp = clock();
    return 1;
}

//hit=1, miss=0
_Bool set_inspect(cache_line* set, unsigned tag, int E) {
    for(int i = 0; i < E; ++i) {
        if(set[i].valid && set[i].tag == tag) {
            //hit, renew time stamp
            set[i].stamp = clock();
            return 1;
        }
    }
    //miss
    return 0;
}

void cache_simulate(cache_line** cache, char type, unsigned address,
                    int bytes, int s, int E, int b, int v, 
                    int* hits, int* misses, int* evictions) {
    //address = tag + set index + block offset
    //unsigned int has 32 bits
    unsigned tag = address>>s>>b, set_index = address<<(32-b-s)>>(32-s);
    cache_line* set = cache[set_index];
    _Bool hit_or_not = set_inspect(set, tag, E), eviction_or_not = 0;
    if(!hit_or_not) {
        eviction_or_not = set_insert(set, tag, E);
    }

    //print detailed info
    if(v) {
        printf("%c %x,%d", type, address, bytes);
        if(hit_or_not) {
            printf(" hit");
        } else {
            if(eviction_or_not) printf(" eviction");
            printf(" miss");
        }
        if(type == 'M') printf(" hit");
        printf("\n");
    }

    //summary
    if(type == 'M') ++*hits;
    if(hit_or_not) ++*hits;
    else ++*misses;
    if(eviction_or_not) ++*evictions;
}

void print_usage(){
    printf(
        "Usage: ./csim [-hv] -s <num> -E <num> -b <num> -t <file>\n"
        "Options\n"
        "  -h        Print this help message.\n"
        "  -v        Optional verbose flag.\n"
        "  -s <num>: Number of set index bits.\n"
        "  -E <num>: Number of lines per set.\n"
        "  -b <num>: Number of block offset bits.\n"
        "  -t <file>: Trace file.\n"
        "\n"
        "Exampes:\n"
        "  linux> ./csim -s 4 -E 1 -b 4 -t traces/yi.trace\n"
        "  linux> ./csim -v -s 8 -E 2 -b 4 -t traces/yi.trace\n"
    );
}

int main(int argc, char** argv) {
    //no arg
    if(argc == 1) {
        printf("./csim-ref: Missing required command line argument\n");
        print_usage();
        return 0;
    }

    /* access command line */
    int s, E, b, v = 0;
    char ch;
    char* file_path;
    while((ch = getopt(argc, argv, "s:E:b:t:vh")) != -1) {
        switch(ch) {
            case 's':
                s = atoi(optarg);
                break;
            case 'E':
                E = atoi(optarg);
                break;
            case 'b':
                b = atoi(optarg);
                break;
            case 't':
                file_path = optarg;
                break;
            case 'v':
                v = 1;
                break;
            case 'h':
                print_usage();
                return 0;
            default:
                break;
        }
    }

    //initialize cache
    cache_line** cache = init_cache(s, E);

    //open file
    FILE* fp = fopen(file_path, "r");//r: read only
    if(!fp) {
        printf("%s: No such file or directory\n", file_path);
        return 1;
    }

    //file format: S 7ff000398,8

    /* read file */
    char type;
    unsigned address;
    int bytes, hits = 0, misses = 0, evictions = 0;
    while(fscanf(fp, " %c %x,%d", &type, &address, &bytes) != EOF) {
        if(type == 'I') continue;
        //enter core function
        cache_simulate(cache, type, address, bytes, s, E, b, v,
                        &hits, &misses, &evictions);
    }
    fclose(fp);
    printSummary(hits, misses, evictions);

    free_cache(cache, s);
    return 0;
}

运行结果：

root@bb0b19885c0d:/cmu15-213/cachelab-handout# ./test-csim
                        Your simulator     Reference simulator
Points (s,E,b)    Hits  Misses  Evicts    Hits  Misses  Evicts
     3 (1,1,1)       9       8       6       9       8       6  traces/yi2.trace
     3 (4,2,4)       6       5       2       6       5       2  traces/yi.trace
     3 (2,1,4)       2       3       1       2       3       1  traces/dave.trace
     3 (2,1,3)     167      71      67     167      71      67  traces/trans.trace
     3 (2,2,3)     201      37      29     201      37      29  traces/trans.trace
     3 (2,4,3)     212      26      10     212      26      10  traces/trans.trace
     3 (5,1,5)     231       7       0     231       7       0  traces/trans.trace
     6 (5,1,5)  265189   21775   21743  265189   21775   21743  traces/long.trace
    27

TEST_CSIM_RESULTS=27

历时21.5h，做完Part A: Writing a Cache Simulator。我知道这速度真的超慢，更何况我基本都是抄的zero4drift。但是，不看答案我是真的没头绪，里面用到的巨多c库函数我都一无所知，Linux基础知识我也没有（比如说main()里面的argc和argv给了命令行参数处理的基本途径，你让我自己去想真的是无从下手）。我是先看书，想明白这个题目大概要我干什么之后，去别人代码里看架构，然后尽量自己写，写到不会的地方就参考。结果就是架构基本和别人一模一样，不过这也没办法，他真的写得太好了，好几个地方我自己想办法结果代码又臭又长，只好又采用回他的架构。最终我的代码和参考代码区别几乎就只在于我的变量名、变量引入的地方不一样、以及加了很多注释。
没办法，他的代码真的写得太好了。本来如果只要求test-sim得满分的话，输入输出那里可以省很多步（参考周小伦），但zero4drift方方面面都照顾到了，真的和csim-ref一模一样的功能，我真的好佩服好佩服。
这部分主要学习了c库函数等构件基本小工程的技巧。非常有收获。

2021-09-28 21:24:00 星期二
草，在看part B要求的时候才看到part A那里文档里说了推荐用getopt.h等头文件，再次吃了不看文档的亏。唉，都多少次了。

part B

2021-10-04 14:30:30 星期一

为什么A和B对角线上元素会冲突？

A[0][0]地址为0x0030b120,B[0][0]地址为0x0034b120。对于A[n][n]和B[n][n]而言，由于起始地址的后10位（二进制）一样，对应到cache里面就是set index和offset一样，只有tag不一。因此，对角线上的元素进行替换时，会在同一个set里面更新tag，即发生miss eviction。对非对角线上元素而言，在A中的offset与在B中的offset肯定是不一样的，就不会冲突。
对第一个问题，32x32矩阵，采取8x8分块可使misses大幅下降，但由于对角线冲突问题还拿不到满分。此时每一个对角线分块有37个miss，非对角线上分块有16个miss（两个8行），计算应该有37x4+16x12=340次miss，实际的343次miss多出的3次大概源自循环变量。

char trans_my[] = "my test funcion";
void trans_my_function(int M, int N, int A[N][M], int B[M][N])
{
    //set chunk
    for (int ii = 0; ii < N; ii += 8) {
        for (int jj = 0; jj < M; jj += 8) {
            //inside chunk
            for (int i = ii; i < ii + 8; ++i) {
                for (int j = jj; j < jj + 8; ++j) {
                    A[i][j] = B[j][i];
                }
            }
        }
    }
}

Function 4 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 4 (my test funcion): hits:1710, misses:343, evictions:311

以下是为什么对角线分块上有37个miss：

缓存中的内容 :
+-----------------------+-------------------+
| opt                   |  cache            |
+-----------------------+-------------------+
|before B[0][0]=tmp     | A[0]              |---+
+-----------------------+-------------------+   |
|after B[0][0]=tmp      | B[0]              |   |    
+-----------------------+-------------------+   |    A 的第一行复制到 B 的第一列 .
|after tmp=A[0][1]      | A[0]              |   |    最终缓存中剩下 A[0], B[1]...B[7].
+-----------------------+-------------------+   +--> A[0]被两次加载进入内存 , 
|after B[1][0]=tmp      | A[0] B[1]         |   |    总的 miss 次数是 10.               
+-----------------------+-------------------+   |    
|...                    |                   |   |    
+-----------------------+-------------------+   |
|after B[7][0]=tmp      | A[0] B[1..7]      |---+
+-----------------------+-------------------+
|after B[0][1]=tmp      | A[1] B[0] B[2..7] |---+
+-----------------------+-------------------+   |    A 的第二行复制到 B 的的二列 .
|after B[1][1]=tmp      | B[0..7]           |   |    其中发生的 miss 有 : 
+-----------------------+-------------------+   +--> A[0], B[0], A[1]与 B[1]的相互取代 . 
|after B[2][1]=tmp      | A[1] B[0] B[2..7] |   |    总的 miss 次数为 4.
+-----------------------+-------------------+   |
|...                    |                   |   |    
+-----------------------+-------------------+   |
|after B[7][1]=tmp      | A[1] B[0] B[2..]  |---+
+-----------------------+-------------------+        之后的三至七行与第二行类似 ,
|...                    |                   |------> miss 的次数都是 4.
+-----------------------+-------------------+
|after tmp=A[7][7]      | A[7] B[0..6]      |---+    最后一行 A[7]被 A[8]取代后 ,
+-----------------------+-------------------+   +--> 不需要重新加载 .
|after B[7][7]=tmp      | B[0..7]           |---+    总的 miss 数为 3. 
+-----------------------+-------------------+

所以对角块上的总的 miss 次数是 10+4*6+3=37.

解决32x32矩阵对角线冲突问题，可以使用变量，一次存好一行，再一次读取一列。这样可以在对角线上的分块中miss次数降至23。

char trans_32_32[] = "32-32 transpose";
void trans_32_32_function(int M, int N, int A[N][M], int B[M][N])
{
    int a[8];
    //set chunk
    for (int ii = 0; ii < N; ii += 8) {
        for (int jj = 0; jj < M; jj += 8) {
            //inside chunk
            for (int j = jj; j < jj + 8; ++j) {
                for (int i = ii; i < ii + 8; ++i) {
                    a[i-ii] = A[i][j]; 
                }
                for (int i = ii; i < ii + 8; ++i) {
                    B[j][i] = a[i-ii]; 
                }
            }
        }
    }
}

Function 1 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (32-32 transpose): hits:1766, misses:289, evictions:257

这样part A miss次数低于300，得到了满分。

为什么64x64矩阵进行8x8分块时比32x32矩阵多了那么多miss？

由b=5，2的5次方为32，意味着cache set的data block里可以装32个Byte，即8个int（1个int需要4个Byte）。对于32x32矩阵，如果进行8行8列的分块，对单独一行的8列的int数据全部可以装进一个set中。一行共32列，装4个set，共32行，综合起来就是4x8=32个set，cache刚好能装下（s=5，2的五次方为32）。
而对于64x64矩阵，共64列，一行装8个set。如果依然进行8x8分块，则前4行已经将cache填满，需要对后4行进行存取操作的时候，势必会将前面已经装好的某些数据弹出，造成非常多的miss。事实上，64x64的8x8分块和完全不进行分块的内存效率是一样的：

char trans_my[] = "my test funcion";
void trans_my_function(int M, int N, int A[N][M], int B[M][N])
{
    //set chunk
    for (int ii = 0; ii < N; ii += 4) {
        for (int jj = 0; jj < M; jj += 4) {
            //inside chunk
            for (int i = ii; i < ii + 4; ++i) {
                for (int j = jj; j < jj + 4; ++j) {
                    A[i][j] = B[j][i];
                }
            }
        }
    }
}

char trans_desc[] = "Simple row-wise scan transpose";
void trans(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, tmp;

    for (i = 0; i < N; i++) {
        for (j = 0; j < M; j++) {
            tmp = A[i][j];
            B[j][i] = tmp;
        }
    }    

}

Function 4 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 4 (my test funcion): hits:3474, misses:4723, evictions:4691

Function 5 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 5 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691

解决方法：将分块改为4x4。这样set是没装满的，非常浪费，但足以拿到不是0分的分数了，miss=1891（miss小于2000）

char trans_my[] = "my test funcion";
void trans_my_function(int M, int N, int A[N][M], int B[M][N])
{
    //set chunk
    for (int ii = 0; ii < N; ii += 4) {
        for (int jj = 0; jj < M; jj += 4) {
            //inside chunk
            for (int i = ii; i < ii + 4; ++i) {
                for (int j = jj; j < jj + 4; ++j) {
                    A[i][j] = B[j][i];
                }
            }
        }
    }
}

Function 4 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 4 (my test funcion): hits:6306, misses:1891, evictions:1859

如果用上32x32相同的策略来消除对角线冲突（即使用变量一次读好一行，再一次存好一列），可以进一步降低misses至1653

char trans_64_64[] = "64-64 transpose";
void trans_64_64_function(int M, int N, int A[N][M], int B[M][N])
{
    int a[8];
    //set chunk
    for (int ii = 0; ii < N; ii += 4) {
        for (int jj = 0; jj < M; jj += 4) {
            //inside chunk
            for (int j = jj; j < jj + 4; ++j) {
                for (int i = ii; i < ii + 4; ++i) {
                    a[i-ii] = A[i][j]; 
                }
                for (int i = ii; i < ii + 4; ++i) {
                    B[j][i] = a[i-ii]; 
                }
            }
        }
    }
}

Function 2 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 2 (64-64 transpose): hits:6546, misses:1653, evictions:1621

3

最后61x64矩阵，跟32x32的思路一样的，多试试分块策略就完事了。

char trans_61_67[] = "61-67 transpose";
void trans_61_67_function(int M, int N, int A[N][M], int B[M][N])
{
    for (int ii = 0; ii < N; ii += 24) {
        for (int jj = 0; jj < M; jj += 8) {
            for (int i = ii; i < ii + 24 && i < N; ++i) {
                for (int j = jj; j < jj + 8 && j < M; ++j) {
                    B[j][i] = A[i][j]; 
                }
            }
        }
    }
}

Function 3 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 3 (61-67 transpose): hits:6282, misses:1897, evictions:1865

成绩

python2.7 ./driver.py

Part A: Testing cache simulator
Running ./test-csim
                        Your simulator     Reference simulator
Points (s,E,b)    Hits  Misses  Evicts    Hits  Misses  Evicts
     3 (1,1,1)       9       8       6       9       8       6  traces/yi2.trace
     3 (4,2,4)       6       5       2       6       5       2  traces/yi.trace
     3 (2,1,4)       2       3       1       2       3       1  traces/dave.trace
     3 (2,1,3)     167      71      67     167      71      67  traces/trans.trace
     3 (2,2,3)     201      37      29     201      37      29  traces/trans.trace
     3 (2,4,3)     212      26      10     212      26      10  traces/trans.trace
     3 (5,1,5)     231       7       0     231       7       0  traces/trans.trace
     6 (5,1,5)  265189   21775   21743  265189   21775   21743  traces/long.trace
    27


Part B: Testing transpose function
Running ./test-trans -M 32 -N 32
Running ./test-trans -M 64 -N 64
Running ./test-trans -M 61 -N 67

Cache Lab summary:
                        Points   Max pts      Misses
Csim correctness          27.0        27
Trans perf 32x32           8.0         8         289
Trans perf 64x64           4.0         8        1653
Trans perf 61x67          10.0        10        1897
          Total points    49.0        53

最后64x64矩阵转置部分还没能拿到满分，这里有个非常好的文章（因为它带图，而且步骤分解清晰，讲得比很多人都清楚）https://zhuanlan.zhihu.com/p/387662272 ，但对于我来说还是太难懂了。思路是大概明白了，可实在不想自己把代码实现一遍了。

总结

cachelab算上看书和做lab，一共花了38个小时。
一路上的每一个问题，我自己独立做出来的意愿都是非常强的，但实在顶不住水平太次，徘徊半天才摸到门道，还是只能看别人的博客才能明白，明白以后还是写不出来代码，又变成抄抄乐。到最后，还留着看了博客也不能解决的地方……
part B非常困难，我在其中意识到我对地址的某些概念还不清晰，常常绕着绕着就绕不出来了。
由于中途对cache的构成有一丝不明晰：offset由多个bit组成，指向data block里面的确切地址的starting Byte，这一点没有搞明白，浪费了好几个小时。
巩固了Linux和docker的操作，初步认识valgrind内存管理工具。