part A
2021-09-23 22:18:42 星期四
#include <stdio.h> //printf()
#include <time.h> //clock_t
#include <stdlib.h> //atoi()
#include <getopt.h> //getopt()
#include "cachelab.h"
typedef struct
{
unsigned tag; /* tag bits per line */
_Bool valid; /* 1 valid bit per line */
clock_t stamp; /* time stamp, to implement LRU algorithm */
}cache_line;
cache_line** init_cache(int s, int E) {
/* cache size = S(=2^s sets per cache) * E(lines per set) * B(=2^b bytes per block)
* we use cache_line_size here instead of B
*/
int sets_cnt = 1<<s, cache_line_size = sizeof(cache_line);
/* cache_line** cache is a two dimensional array, and cache[i] means
* the ith set in the cache.
*/
cache_line** cache = (cache_line**) malloc(cache_line_size * sets_cnt);
for(int i = 0; i < sets_cnt; ++i) {
int set_size = cache_line_size * E;
//initial the ith set
cache[i] = (cache_line*) malloc(set_size);
//the for loop can be replaced by
//memset(cache[i], 0, set_size) //#include <string.h>
for(int j = 0; j < E; ++j) {
cache[i][j].tag = 0;
cache[i][j].valid = 0;
cache[i][j].stamp = 0;
}
}
return cache;
}
void free_cache(cache_line** cache, int s) {
int sets_cnt = 1<<s;
for(int i = 0; i < sets_cnt; ++i) {
free(cache[i]);
}
free(cache);
}
_Bool set_insert(cache_line* set, unsigned long tag, int E) {
int old_stamp_index = 0;
for(int i = 0; i < E; ++i) {
if(set[i].stamp < set[old_stamp_index].stamp) old_stamp_index = i;
if(!set[i].valid) {
set[i].valid = 1;
set[i].tag = tag;
set[i].stamp = clock();
return 0;
}
}
//all lines valid, evict the Least-Recent-Used
set[old_stamp_index].tag = tag;
set[old_stamp_index].stamp = clock();
return 1;
}
//hit=1, miss=0
_Bool set_inspect(cache_line* set, unsigned tag, int E) {
for(int i = 0; i < E; ++i) {
if(set[i].valid && set[i].tag == tag) {
//hit, renew time stamp
set[i].stamp = clock();
return 1;
}
}
//miss
return 0;
}
void cache_simulate(cache_line** cache, char type, unsigned address,
int bytes, int s, int E, int b, int v,
int* hits, int* misses, int* evictions) {
//address = tag + set index + block offset
//unsigned int has 32 bits
unsigned tag = address>>s>>b, set_index = address<<(32-b-s)>>(32-s);
cache_line* set = cache[set_index];
_Bool hit_or_not = set_inspect(set, tag, E), eviction_or_not = 0;
if(!hit_or_not) {
eviction_or_not = set_insert(set, tag, E);
}
//print detailed info
if(v) {
printf("%c %x,%d", type, address, bytes);
if(hit_or_not) {
printf(" hit");
} else {
if(eviction_or_not) printf(" eviction");
printf(" miss");
}
if(type == 'M') printf(" hit");
printf("\n");
}
//summary
if(type == 'M') ++*hits;
if(hit_or_not) ++*hits;
else ++*misses;
if(eviction_or_not) ++*evictions;
}
void print_usage(){
printf(
"Usage: ./csim [-hv] -s <num> -E <num> -b <num> -t <file>\n"
"Options\n"
" -h Print this help message.\n"
" -v Optional verbose flag.\n"
" -s <num>: Number of set index bits.\n"
" -E <num>: Number of lines per set.\n"
" -b <num>: Number of block offset bits.\n"
" -t <file>: Trace file.\n"
"\n"
"Exampes:\n"
" linux> ./csim -s 4 -E 1 -b 4 -t traces/yi.trace\n"
" linux> ./csim -v -s 8 -E 2 -b 4 -t traces/yi.trace\n"
);
}
int main(int argc, char** argv) {
//no arg
if(argc == 1) {
printf("./csim-ref: Missing required command line argument\n");
print_usage();
return 0;
}
/* access command line */
int s, E, b, v = 0;
char ch;
char* file_path;
while((ch = getopt(argc, argv, "s:E:b:t:vh")) != -1) {
switch(ch) {
case 's':
s = atoi(optarg);
break;
case 'E':
E = atoi(optarg);
break;
case 'b':
b = atoi(optarg);
break;
case 't':
file_path = optarg;
break;
case 'v':
v = 1;
break;
case 'h':
print_usage();
return 0;
default:
break;
}
}
//initialize cache
cache_line** cache = init_cache(s, E);
//open file
FILE* fp = fopen(file_path, "r");//r: read only
if(!fp) {
printf("%s: No such file or directory\n", file_path);
return 1;
}
//file format: S 7ff000398,8
/* read file */
char type;
unsigned address;
int bytes, hits = 0, misses = 0, evictions = 0;
while(fscanf(fp, " %c %x,%d", &type, &address, &bytes) != EOF) {
if(type == 'I') continue;
//enter core function
cache_simulate(cache, type, address, bytes, s, E, b, v,
&hits, &misses, &evictions);
}
fclose(fp);
printSummary(hits, misses, evictions);
free_cache(cache, s);
return 0;
}
运行结果:
root@bb0b19885c0d:/cmu15-213/cachelab-handout# ./test-csim
Your simulator Reference simulator
Points (s,E,b) Hits Misses Evicts Hits Misses Evicts
3 (1,1,1) 9 8 6 9 8 6 traces/yi2.trace
3 (4,2,4) 6 5 2 6 5 2 traces/yi.trace
3 (2,1,4) 2 3 1 2 3 1 traces/dave.trace
3 (2,1,3) 167 71 67 167 71 67 traces/trans.trace
3 (2,2,3) 201 37 29 201 37 29 traces/trans.trace
3 (2,4,3) 212 26 10 212 26 10 traces/trans.trace
3 (5,1,5) 231 7 0 231 7 0 traces/trans.trace
6 (5,1,5) 265189 21775 21743 265189 21775 21743 traces/long.trace
27
TEST_CSIM_RESULTS=27
历时21.5h,做完Part A: Writing a Cache Simulator。我知道这速度真的超慢,更何况我基本都是抄的zero4drift。但是,不看答案我是真的没头绪,里面用到的巨多c库函数我都一无所知,Linux基础知识我也没有(比如说main()里面的argc和argv给了命令行参数处理的基本途径,你让我自己去想真的是无从下手)。我是先看书,想明白这个题目大概要我干什么之后,去别人代码里看架构,然后尽量自己写,写到不会的地方就参考。结果就是架构基本和别人一模一样,不过这也没办法,他真的写得太好了,好几个地方我自己想办法结果代码又臭又长,只好又采用回他的架构。最终我的代码和参考代码区别几乎就只在于我的变量名、变量引入的地方不一样、以及加了很多注释。
没办法,他的代码真的写得太好了。本来如果只要求test-sim得满分的话,输入输出那里可以省很多步(参考周小伦),但zero4drift方方面面都照顾到了,真的和csim-ref一模一样的功能,我真的好佩服好佩服。
这部分主要学习了c库函数等构件基本小工程的技巧。非常有收获。
2021-09-28 21:24:00 星期二
草,在看part B要求的时候才看到part A那里文档里说了推荐用getopt.h等头文件,再次吃了不看文档的亏。唉,都多少次了。
part B
2021-10-04 14:30:30 星期一
为什么A和B对角线上元素会冲突?
A[0][0]
地址为0x0030b120
,B[0][0]
地址为0x0034b120
。对于A[n][n]
和B[n][n]
而言,由于起始地址的后10位(二进制)一样,对应到cache里面就是set index和offset一样,只有tag不一。因此,对角线上的元素进行替换时,会在同一个set里面更新tag,即发生miss eviction。对非对角线上元素而言,在A中的offset与在B中的offset肯定是不一样的,就不会冲突。
对第一个问题,32x32矩阵,采取8x8分块可使misses大幅下降,但由于对角线冲突问题还拿不到满分。此时每一个对角线分块有37个miss,非对角线上分块有16个miss(两个8行),计算应该有37x4+16x12=340次miss,实际的343次miss多出的3次大概源自循环变量。
char trans_my[] = "my test funcion";
void trans_my_function(int M, int N, int A[N][M], int B[M][N])
{
//set chunk
for (int ii = 0; ii < N; ii += 8) {
for (int jj = 0; jj < M; jj += 8) {
//inside chunk
for (int i = ii; i < ii + 8; ++i) {
for (int j = jj; j < jj + 8; ++j) {
A[i][j] = B[j][i];
}
}
}
}
}
Function 4 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 4 (my test funcion): hits:1710, misses:343, evictions:311
以下是为什么对角线分块上有37个miss:
缓存中的内容 :
+-----------------------+-------------------+
| opt | cache |
+-----------------------+-------------------+
|before B[0][0]=tmp | A[0] |---+
+-----------------------+-------------------+ |
|after B[0][0]=tmp | B[0] | |
+-----------------------+-------------------+ | A 的第一行复制到 B 的第一列 .
|after tmp=A[0][1] | A[0] | | 最终缓存中剩下 A[0], B[1]...B[7].
+-----------------------+-------------------+ +--> A[0]被两次加载进入内存 ,
|after B[1][0]=tmp | A[0] B[1] | | 总的 miss 次数是 10.
+-----------------------+-------------------+ |
|... | | |
+-----------------------+-------------------+ |
|after B[7][0]=tmp | A[0] B[1..7] |---+
+-----------------------+-------------------+
|after B[0][1]=tmp | A[1] B[0] B[2..7] |---+
+-----------------------+-------------------+ | A 的第二行复制到 B 的的二列 .
|after B[1][1]=tmp | B[0..7] | | 其中发生的 miss 有 :
+-----------------------+-------------------+ +--> A[0], B[0], A[1]与 B[1]的相互取代 .
|after B[2][1]=tmp | A[1] B[0] B[2..7] | | 总的 miss 次数为 4.
+-----------------------+-------------------+ |
|... | | |
+-----------------------+-------------------+ |
|after B[7][1]=tmp | A[1] B[0] B[2..] |---+
+-----------------------+-------------------+ 之后的三至七行与第二行类似 ,
|... | |------> miss 的次数都是 4.
+-----------------------+-------------------+
|after tmp=A[7][7] | A[7] B[0..6] |---+ 最后一行 A[7]被 A[8]取代后 ,
+-----------------------+-------------------+ +--> 不需要重新加载 .
|after B[7][7]=tmp | B[0..7] |---+ 总的 miss 数为 3.
+-----------------------+-------------------+
所以对角块上的总的 miss 次数是 10+4*6+3=37.
解决32x32矩阵对角线冲突问题,可以使用变量,一次存好一行,再一次读取一列。这样可以在对角线上的分块中miss次数降至23。
char trans_32_32[] = "32-32 transpose";
void trans_32_32_function(int M, int N, int A[N][M], int B[M][N])
{
int a[8];
//set chunk
for (int ii = 0; ii < N; ii += 8) {
for (int jj = 0; jj < M; jj += 8) {
//inside chunk
for (int j = jj; j < jj + 8; ++j) {
for (int i = ii; i < ii + 8; ++i) {
a[i-ii] = A[i][j];
}
for (int i = ii; i < ii + 8; ++i) {
B[j][i] = a[i-ii];
}
}
}
}
}
Function 1 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 1 (32-32 transpose): hits:1766, misses:289, evictions:257
这样part A miss次数低于300,得到了满分。
为什么64x64矩阵进行8x8分块时比32x32矩阵多了那么多miss?
由b=5,2的5次方为32,意味着cache set的data block里可以装32个Byte,即8个int(1个int需要4个Byte)。对于32x32矩阵,如果进行8行8列的分块,对单独一行的8列的int数据全部可以装进一个set中。一行共32列,装4个set,共32行,综合起来就是4x8=32个set,cache刚好能装下(s=5,2的五次方为32)。
而对于64x64矩阵,共64列,一行装8个set。如果依然进行8x8分块,则前4行已经将cache填满,需要对后4行进行存取操作的时候,势必会将前面已经装好的某些数据弹出,造成非常多的miss。事实上,64x64的8x8分块和完全不进行分块的内存效率是一样的:
char trans_my[] = "my test funcion";
void trans_my_function(int M, int N, int A[N][M], int B[M][N])
{
//set chunk
for (int ii = 0; ii < N; ii += 4) {
for (int jj = 0; jj < M; jj += 4) {
//inside chunk
for (int i = ii; i < ii + 4; ++i) {
for (int j = jj; j < jj + 4; ++j) {
A[i][j] = B[j][i];
}
}
}
}
}
char trans_desc[] = "Simple row-wise scan transpose";
void trans(int M, int N, int A[N][M], int B[M][N])
{
int i, j, tmp;
for (i = 0; i < N; i++) {
for (j = 0; j < M; j++) {
tmp = A[i][j];
B[j][i] = tmp;
}
}
}
Function 4 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 4 (my test funcion): hits:3474, misses:4723, evictions:4691
Function 5 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 5 (Simple row-wise scan transpose): hits:3474, misses:4723, evictions:4691
解决方法:将分块改为4x4。这样set是没装满的,非常浪费,但足以拿到不是0分的分数了,miss=1891(miss小于2000)
char trans_my[] = "my test funcion";
void trans_my_function(int M, int N, int A[N][M], int B[M][N])
{
//set chunk
for (int ii = 0; ii < N; ii += 4) {
for (int jj = 0; jj < M; jj += 4) {
//inside chunk
for (int i = ii; i < ii + 4; ++i) {
for (int j = jj; j < jj + 4; ++j) {
A[i][j] = B[j][i];
}
}
}
}
}
Function 4 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 4 (my test funcion): hits:6306, misses:1891, evictions:1859
如果用上32x32相同的策略来消除对角线冲突(即使用变量一次读好一行,再一次存好一列),可以进一步降低misses至1653
char trans_64_64[] = "64-64 transpose";
void trans_64_64_function(int M, int N, int A[N][M], int B[M][N])
{
int a[8];
//set chunk
for (int ii = 0; ii < N; ii += 4) {
for (int jj = 0; jj < M; jj += 4) {
//inside chunk
for (int j = jj; j < jj + 4; ++j) {
for (int i = ii; i < ii + 4; ++i) {
a[i-ii] = A[i][j];
}
for (int i = ii; i < ii + 4; ++i) {
B[j][i] = a[i-ii];
}
}
}
}
}
Function 2 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 2 (64-64 transpose): hits:6546, misses:1653, evictions:1621
3
最后61x64矩阵,跟32x32的思路一样的,多试试分块策略就完事了。
char trans_61_67[] = "61-67 transpose";
void trans_61_67_function(int M, int N, int A[N][M], int B[M][N])
{
for (int ii = 0; ii < N; ii += 24) {
for (int jj = 0; jj < M; jj += 8) {
for (int i = ii; i < ii + 24 && i < N; ++i) {
for (int j = jj; j < jj + 8 && j < M; ++j) {
B[j][i] = A[i][j];
}
}
}
}
}
Function 3 (6 total)
Step 1: Validating and generating memory traces
Step 2: Evaluating performance (s=5, E=1, b=5)
func 3 (61-67 transpose): hits:6282, misses:1897, evictions:1865
成绩
python2.7 ./driver.py
Part A: Testing cache simulator
Running ./test-csim
Your simulator Reference simulator
Points (s,E,b) Hits Misses Evicts Hits Misses Evicts
3 (1,1,1) 9 8 6 9 8 6 traces/yi2.trace
3 (4,2,4) 6 5 2 6 5 2 traces/yi.trace
3 (2,1,4) 2 3 1 2 3 1 traces/dave.trace
3 (2,1,3) 167 71 67 167 71 67 traces/trans.trace
3 (2,2,3) 201 37 29 201 37 29 traces/trans.trace
3 (2,4,3) 212 26 10 212 26 10 traces/trans.trace
3 (5,1,5) 231 7 0 231 7 0 traces/trans.trace
6 (5,1,5) 265189 21775 21743 265189 21775 21743 traces/long.trace
27
Part B: Testing transpose function
Running ./test-trans -M 32 -N 32
Running ./test-trans -M 64 -N 64
Running ./test-trans -M 61 -N 67
Cache Lab summary:
Points Max pts Misses
Csim correctness 27.0 27
Trans perf 32x32 8.0 8 289
Trans perf 64x64 4.0 8 1653
Trans perf 61x67 10.0 10 1897
Total points 49.0 53
最后64x64矩阵转置部分还没能拿到满分,这里有个非常好的文章(因为它带图,而且步骤分解清晰,讲得比很多人都清楚)https://zhuanlan.zhihu.com/p/387662272 ,但对于我来说还是太难懂了。思路是大概明白了,可实在不想自己把代码实现一遍了。
总结
- cachelab算上看书和做lab,一共花了38个小时。
- 一路上的每一个问题,我自己独立做出来的意愿都是非常强的,但实在顶不住水平太次,徘徊半天才摸到门道,还是只能看别人的博客才能明白,明白以后还是写不出来代码,又变成抄抄乐。到最后,还留着看了博客也不能解决的地方……
- part B非常困难,我在其中意识到我对地址的某些概念还不清晰,常常绕着绕着就绕不出来了。
- 由于中途对cache的构成有一丝不明晰:offset由多个bit组成,指向data block里面的确切地址的starting Byte,这一点没有搞明白,浪费了好几个小时。
- 巩固了Linux和docker的操作,初步认识valgrind内存管理工具。