不同级别Cache的处理延迟
Processing latency in different levels of Cache
关于缓存级别(L1、L2、L3)和 RAM 的访问时间,
我遇到了我还没有找到答案的奇怪行为,
如果你能帮助我,我将不胜感激:)
我开始按照以下方式填充内存块,
我有不同的块大小作为输入,例如 16 字节,32 字节,.... 256 KB,
对于每个特定的块,我读取内存,对其进行计数并将其写回。例如,对于 1 KB,我有 256 个不同的计数器数组(因为我的计数器是 int32 并且 32 位 = 4 字节),
我从零开始,以这 256 个计数器(我们称之为计数器数组)为起点进行计数并写回:每次测量把每个计数器累加 10,000 次(0~10000),这样的测量重复 100 次,记录这 100 个结果,取平均值并计算处理时间
(时间计算如下代码)
COUNTERS_MAX = 10000;
ITERATION_MAX = 100;
/*
 * Worker routine run on each measuring lcore (one counter block per core).
 *
 * While canContinue_ is true it takes ITERATION_MAX timing samples on the
 * counter block p->valueMem: each sample increments every counter
 * COUNTERS_MAX times and then checks that every counter equals
 * (q + 1) * COUNTERS_MAX.  Results are stored in outputTable (EXCEL_OUTPUT)
 * or written to p->fp.  After the samples it increments processedCount,
 * signals readWaitHandle and blocks on newIterWaitHandle.
 *
 * p: per-core parameters (counter block, output file, core index, block index).
 * Returns 0 once canContinue_ has been cleared.
 */
static int
lcore_recv(struct lcore_params *p)
{
unsigned lcore_id = rte_lcore_id();
printf("Starting core %u\n", lcore_id);
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
// Local result buffer, flushed to fp after all samples of a block are taken.
struct tableEntry outputTable[ITERATION_MAX];
#endif
#endif
while(canContinue_)
{
//printf("Starting core %u\n", lcore_id);
//int index=((lcore_id-p->baseIndex)-1+CORE_MAX)%CORE_MAX;
// NOTE(review): vp is never used.
void * vp;
struct data * d = p->valueMem;
FILE* fp = p->fp;
//fprintf(fp, "Iteration %d ----------------------\n", p->iteration);
//int index = p->index;
struct timespec t1, t2;
for(int q = 0; q < ITERATION_MAX; q++)
{
double processTime = 0;
// Timed region starts here; clock id 1 is CLOCK_MONOTONIC on Linux.
clock_gettime(1, &t1);
// NOTE: this loop variable shadows the parameter p inside the loop only;
// the p->index / p->iteration accesses further down use the parameter again.
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
{
for (int i = 0; i < d->count; i++)
{
d->value[i]++;
}
}
clock_gettime(1, &t2);
// Elapsed time of the COUNTERS_MAX passes over the whole block.
processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
// The counters are never reset, so after sample q each one must hold (q + 1) * COUNTERS_MAX.
int expectedVal = (q + 1) * COUNTERS_MAX;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
for (int i = 0; i < d->count; i++)
{
if(d->value[i]!=expectedVal)
{
if(allOk)
{
// First mismatch: remember the failure and start the list of bad indices.
allOk = false;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Failed : ");
#endif
#endif
}
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
fprintf(fp,"%d ", i);
#endif
#endif
}
}
#ifdef EXCEL_OUTPUT
// Global table indexed by [core][block][sample]; written to disk by saveToExcelFile().
struct tableEntry* entry= &outputTable[p->index][p->iteration][q];
entry->allOk=allOk;
entry->expectedVal=expectedVal;
entry->processTime=processTime;
#else
#ifdef DIRECT_FILE_WRITE
if(allOk)
{
fprintf(fp,"All counters are ok \n");
}
else
{
fprintf(fp,"\n");
}
fprintf(fp, "*** Time = %f ns \n", processTime);
#else
struct tableEntry* entry= &outputTable[q];
entry->allOk=allOk;
entry->expectedVal=expectedVal;
entry->processTime=processTime;
#endif
#endif
}
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
// Deferred output: dump the buffered samples of this block to the file.
for(int q = 0; q < ITERATION_MAX; q++)
{
struct tableEntry* entry= &outputTable[q];
fprintf(fp," Expected : %d\n", entry->expectedVal);
if(entry->allOk)
{
fprintf(fp,"All counters are ok \n");
}
else
{
fprintf(fp,"Failed \n");
}
fprintf(fp, "*** Time = %f ns \n", entry->processTime);
}
#endif
#endif
// Report completion of this block, then sleep until newIterWaitHandle is signalled.
// NOTE(review): the wait has no predicate loop, so a spurious wakeup would
// start the next pass before a new block has been set up.
pthread_mutex_lock(&mutexLock_);
processedCount++;
pthread_cond_signal(&readWaitHandle);
pthread_cond_wait(&newIterWaitHandle, &mutexLock_);
pthread_mutex_unlock(&mutexLock_);
}
return 0;
}
所以对于每个块我都做了相同的测试。例如,如果我有 20 个不同的测试点(块存储器,如 16 B、32 B,....),我将在 'ns' 中得到 100 行和 20 列时间的矩阵。
所以每一列显示不同的块大小,每一行显示不同的 100 测试。
最后我得到了每列的平均值并计算了每列的处理时间,奇怪的行为如下所示,
the block size based on Byte and the Y axis is the latency for each process in 'ns', here you could see 3 different cores which run at the same time with the same more or less behaviour
每当我从 16 B 这样的小块开始,大约 50 字节到 600 字节的间隔时,我总是看到这种疯狂的行为,我不知道为什么? (我的第一个问题)
因此,如果继续超过 2.93 MB(大约 8 MB(LLC 大小)/ 3(同时 运行 的不同内核),我们将进行如下跳转)
3 different core run simultaneously
我的第二个问题是,如果这个跳跃有意义,我的意思是 LLC 延迟和 RAM 延迟之间的差异大约 2.5 或 3 倍是可以的,或者应该更多)
PS.My 系统是 Core i7,3.4 Ghz,L1:32 KB,L2:256 KB 和 L3:8 MB,16 GB RAM
提前感谢您的帮助和考虑
您的测试方法并不能测量缓存延迟(而且您开启了 Turbo Boost,因此 CPU 频率并不恒定)。
缓存的延迟是已知的,并且以 cpu 周期而不是 ns 为单位进行测量(缓存以 cpu 核心频率运行);内存延迟以周期 + ns 为单位,因为数据在从内存中读取后必须通过缓存层次结构(周期)(ns,内存有自己的时钟)。
例如 i7-4xxx (Haswell):
http://7-cpu.com/cpu/Haswell.html
Intel Haswell
Intel i7-4770 (Haswell), 3.4 GHz (Turbo Boost off), 22 nm. RAM: 32 GB (PC3-12800 cl11 cr2).
- L1 Data Cache Latency = 4 cycles for simple access via pointer
- L1 Data Cache Latency = 5 cycles for access with complex address calculation (size_t n, *p; n = p[n]).
- L2 Cache Latency = 12 cycles
- L3 Cache Latency = 36 cycles
- RAM Latency = 36 cycles + 57 ns
您现在拥有的是:向多个计数器添加一些 "count" 常量(是的,编译器很可能能够将内部循环优化为 d->value[i] += d->count
)。
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
{
for (int i = 0; i < d->count; i++)
{
d->value[i]++;
}
}
你应该用什么来衡量缓存延迟:
Widely used classic test for cache latency is iterating over the linked list. ... This method is used by open-source lmbench - in the test lat_mem_rd ... There are sources of lat_mem_rd test from lmbench: https://github.com/foss-for-synopsys-dwc-arc-processors/lmbench/blob/master/src/lat_mem_rd.c
the main test is
/* One dependent load: replace p with the pointer stored at *p. */
#define ONE p = (char **)*p;
/* The following macros repeat ONE 5, 10, 50 and 100 times. */
#define FIVE ONE ONE ONE ONE ONE
#define TEN FIVE FIVE
#define FIFTY TEN TEN TEN TEN TEN
#define HUNDRED FIFTY FIFTY
/*
 * Pointer-chasing kernel: starting from state->p[0], follows the chain of
 * pointers 100 * count times per iteration, where
 * count = state->len / (state->line * 100) + 1.
 * Each load's address comes from the previous load, so the loads form one
 * dependency chain.
 *
 * iterations: number of times the whole walk is repeated.
 * cookie:     pointer to the struct mem_state describing the buffer.
 */
void
benchmark_loads(iter_t iterations, void *cookie)
{
struct mem_state* state = (struct mem_state*)cookie;
register char **p = (char**)state->p[0];
register size_t i;
register size_t count = state->len / (state->line * 100) + 1;
while (iterations-- > 0) {
for (i = 0; i < count; ++i) {
HUNDRED;
}
}
/* presumably keeps the compiler from discarding the loads - confirm in lmbench */
use_pointer((void *)p);
/* Remember where the walk stopped so that the next call continues from there. */
state->p[0] = (char*)p;
}
所以,在破译宏之后,我们做了很多线性操作,比如:
p = (char**) *p; // (in intel syntax) == mov eax, [eax]
p = (char**) *p;
p = (char**) *p;
.... // 100 times total
p = (char**) *p;
如手册页所述http://www.bitmover.com/lmbench/lat_mem_rd.8.html
The benchmark runs as two nested loops. The outer loop is the stride size. The inner loop is the array size. For each array size, the benchmark creates a ring of pointers that point forward one stride. Traversing the array is done by
p = (char **)*p;
Ubuntu 14.04.4 LTS,
我使用 GCC 编译,
我的 makefile 如下
# Build file for the cache benchmark, using the DPDK external-application
# makefile framework (rte.vars.mk / rte.extapp.mk).
ifeq ($(RTE_SDK),)
$(error "Please define RTE_SDK environment variable")
endif
# Default target, can be overridden by command line or environment
RTE_TARGET ?= x86_64-native-linuxapp-gcc
include $(RTE_SDK)/mk/rte.vars.mk
# binary name
APP = Mahdi_test
INC += $(wildcard include/*.h)
# all source are stored in SRCS-y
SRCS-y := main.c
# Header search paths.  The directory must follow -I directly: with the
# previous "-I -S$(SRCDIR)/include" gcc used "-S$(SRCDIR)/include" as the
# directory name, so the project's include/ directory was never searched.
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)/include -I/usr/local/include
# Most optimizations are only enabled if an -O level is set on the command line,
# otherwise they are disabled, even if individual optimization flags are specified.
# With -O, the compiler tries to reduce code size and execution time,
# without performing any optimizations that take a great deal of compilation time.
# -O3 Optimize yet more. -O3 turns on all optimizations specified by -O2
# EXTRA_CFLAGS += -O3 -S -Wno-error -std=c99
# After following line do make, go to ./build and run : objdump -d -M intel -S main.o >a.txt
EXTRA_CFLAGS += -O3 -g -Wno-error -std=c99
# rte.extapp.mk : External application
include $(RTE_SDK)/mk/rte.extapp.mk
CPU :
架构:x86_64
CPU 操作模式:32 位、64 位
字节顺序:小字节序
CPU(s): 8
在线 CPU(s) 名单:0-7
每核心线程数:2
每个插槽的核心数:4
插座:1
NUMA 节点:1
供应商 ID:GenuineIntel
CPU 家庭:6
型号:42
步进:7
CPU 兆赫:1600.000
BogoMIPS:6784.24
虚拟化:VT-x
一级数据缓存(L1d):32K
一级指令缓存(L1i):32K
二级缓存:256K
三级缓存:8192K
NUMA 节点 0 CPU(s): 0-7
所有代码都在单个文件中(我使用 dpdk 是为了利用这个库的好处),
#if __STDC_VERSION__ >= 199901L
#define _XOPEN_SOURCE 600
#else
#define _XOPEN_SOURCE 500
#endif /* __STDC_VERSION__ */
#include <math.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <time.h>
#include <unistd.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_ring.h>
/*
 * Output mode.  With EXCEL_OUTPUT defined, all timings are collected in the
 * global outputTable and written as one spreadsheet XML file at the end.
 * Without it, DIRECT_FILE_WRITE is defined and every sample is written to a
 * per-core text file as soon as it is taken.
 */
#define EXCEL_OUTPUT
#ifndef EXCEL_OUTPUT
#define DIRECT_FILE_WRITE
#endif
/* Number of worker lcores that run the benchmark. */
#define CORE_MAX 3
/*
 * Number of block sizes to test.  Block k (1-based) is 2^BKMG * Factor^k
 * bytes, so with BKMG = 4 and Factor = 1.5 twenty blocks reach about 52 KB;
 * the 168.72 MB figure quoted previously corresponds to 40 blocks.
 */
#define BLOCK_MAX 20
/* Increments applied to every counter within one timed sample. */
#define COUNTERS_MAX 10000
/* Timed samples taken per block size. */
#define ITERATION_MAX 100
/* Growth factor between consecutive block sizes. */
#define Factor 1.5
/* log2 of the base block size in bytes (2^4 = 16 bytes). */
#define BKMG 4
/* Test run identifier; becomes part of the output file and directory names. */
char* testNumber = "23";
/*
uint32_t sizes[BLOCK_MAX] = {
1*Factor*pow(2, 10)/4, 2*Factor*pow(2, 10)/4, 4*Factor*pow(2, 10)/4, 8*Factor*pow(2, 10)/4, 16*Factor*pow(2, 10)/4, 32*Factor*pow(2, 10)/4, 64*Factor*pow(2, 10)/4, 128*Factor*pow(2, 10)/4, 256*Factor*pow(2, 10)/4, 512*Factor*pow(2, 10)/4,
1*Factor*pow(2, 20)/4, 2*Factor*pow(2, 20)/4, 4*Factor*pow(2, 20)/4, 8*Factor*pow(2, 20)/4, 16*Factor*pow(2, 20)/4, 32*Factor*pow(2, 20)/4, 64*Factor*pow(2, 20)/4, 128*Factor*pow(2, 20)/4, 256*Factor*pow(2, 20)/4, 512*Factor*pow(2, 20)/4,
1*Factor*pow(2, 30)/4, 2*Factor*pow(2, 30)/4
};
*/
/*
 * Number of 32-bit counters in each test block.  Block k (1-based) is
 * 2^BKMG * Factor^k bytes, i.e. that many bytes divided by 4 counters.
 *
 * The table always holds 50 steps, so the array size is taken from the
 * initializer list: declaring it as [BLOCK_MAX] with BLOCK_MAX < 50 supplies
 * more initializers than elements, which is a constraint violation.
 * BLOCK_MAX must not exceed 50.
 *
 * NOTE(review): pow() in a file-scope initializer relies on the compiler
 * folding it at compile time (GCC does); it is not a standard constant
 * expression.
 */
#define BLOCK_COUNTERS(k) (pow(Factor, (k))*pow(2, BKMG)/4)
uint32_t sizes[] = {
    BLOCK_COUNTERS( 1), BLOCK_COUNTERS( 2), BLOCK_COUNTERS( 3), BLOCK_COUNTERS( 4), BLOCK_COUNTERS( 5),
    BLOCK_COUNTERS( 6), BLOCK_COUNTERS( 7), BLOCK_COUNTERS( 8), BLOCK_COUNTERS( 9), BLOCK_COUNTERS(10),
    BLOCK_COUNTERS(11), BLOCK_COUNTERS(12), BLOCK_COUNTERS(13), BLOCK_COUNTERS(14), BLOCK_COUNTERS(15),
    BLOCK_COUNTERS(16), BLOCK_COUNTERS(17), BLOCK_COUNTERS(18), BLOCK_COUNTERS(19), BLOCK_COUNTERS(20),
    BLOCK_COUNTERS(21), BLOCK_COUNTERS(22), BLOCK_COUNTERS(23), BLOCK_COUNTERS(24), BLOCK_COUNTERS(25),
    BLOCK_COUNTERS(26), BLOCK_COUNTERS(27), BLOCK_COUNTERS(28), BLOCK_COUNTERS(29), BLOCK_COUNTERS(30),
    BLOCK_COUNTERS(31), BLOCK_COUNTERS(32), BLOCK_COUNTERS(33), BLOCK_COUNTERS(34), BLOCK_COUNTERS(35),
    BLOCK_COUNTERS(36), BLOCK_COUNTERS(37), BLOCK_COUNTERS(38), BLOCK_COUNTERS(39), BLOCK_COUNTERS(40),
    BLOCK_COUNTERS(41), BLOCK_COUNTERS(42), BLOCK_COUNTERS(43), BLOCK_COUNTERS(44), BLOCK_COUNTERS(45),
    BLOCK_COUNTERS(46), BLOCK_COUNTERS(47), BLOCK_COUNTERS(48), BLOCK_COUNTERS(49), BLOCK_COUNTERS(50),
};
#undef BLOCK_COUNTERS
/*
char* names[BLOCK_MAX] = {
"1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K",
"1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M",
"1G", "2G"
};
*/
/*
 * Two-digit label of each test block, used in the per-block output file
 * names.  The table always holds 50 labels, so the array size is taken from
 * the initializer list: declaring it as [BLOCK_MAX] with BLOCK_MAX < 50
 * supplies more initializers than elements, which is a constraint violation.
 * BLOCK_MAX must not exceed 50.
 */
char* names[] = {
    "01", "02", "03", "04", "05", "06", "07", "08", "09", "10",
    "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
    "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
    "31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
    "41", "42", "43", "44", "45", "46", "47", "48", "49", "50",
};
// Input parameters of one worker core (with 3 cores there are 3 instances).
struct lcore_params
{
struct data* valueMem; // Counter block this core works on (address and element count).
int iteration; // Index of the block size currently being processed; second index into outputTable.
FILE* fp; // Output text file of this core; only opened when EXCEL_OUTPUT is not defined.
int index; // Zero-based number of this core; first index into outputTable when EXCEL_OUTPUT is defined.
};
// Describes the counter memory allocated to one core.
struct data
{
uint32_t* value; // Start of the counter array; allocated separately for every core by mem_alloc().
uint32_t count; // Number of 32-bit counters in the array.
};
// Result of one timed sample.
struct tableEntry
{
int expectedVal; // Value every counter should hold after the sample.
double processTime; // Elapsed time of the sample in nanoseconds.
bool allOk; // True when every counter matched expectedVal.
};
// Mutex taken around every update of processedCount and every wait on the two
// condition variables below; it coordinates the master with the worker cores.
pthread_mutex_t mutexLock_;
// readWaitHandle: signalled by a worker after it has incremented processedCount;
// the master waits on it until processedCount reaches CORE_MAX.
// newIterWaitHandle: workers wait on it after finishing a block; the master
// signals it once the next block has been set up.
pthread_cond_t readWaitHandle, newIterWaitHandle;
// Cleared by the master after the last block; workers leave their loop when it is false.
bool canContinue_ = true;
// Number of workers that have finished the current block.
int processedCount = 0;
#ifdef EXCEL_OUTPUT
// Holds all samples, indexed [core][block][sample]; written to disk at the end of the run.
struct tableEntry outputTable[CORE_MAX][BLOCK_MAX][ITERATION_MAX];
#endif
/*
 * Worker routine run on each measuring lcore (one counter block per core).
 *
 * While canContinue_ is true it takes ITERATION_MAX timing samples on the
 * counter block p->valueMem: each sample increments every counter
 * COUNTERS_MAX times and then checks that every counter equals
 * (q + 1) * COUNTERS_MAX.  Results are stored in outputTable (EXCEL_OUTPUT)
 * or written to p->fp.  After the samples it reports completion to the
 * master and sleeps until the master has published the next block or has
 * cleared canContinue_.
 *
 * p: per-core parameters (counter block, output file, core index, block index).
 * Returns 0 once canContinue_ has been cleared.
 */
static int
lcore_recv(struct lcore_params *p)
{
    unsigned lcore_id = rte_lcore_id();
    printf("Starting core %u\n", lcore_id);
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
    /* Local result buffer, flushed to fp after all samples of a block. */
    struct tableEntry outputTable[ITERATION_MAX];
#endif
#endif
    while(canContinue_)
    {
        struct data * d = p->valueMem;
        FILE* fp = p->fp;
        struct timespec t1, t2;
        (void)fp; /* not used when EXCEL_OUTPUT is defined */
        for(int q = 0; q < ITERATION_MAX; q++)
        {
            double processTime = 0;
            /*
             * The timed region between the two markers is deliberately left
             * exactly as measured and disassembled.  Clock id 1 is
             * CLOCK_MONOTONIC on Linux.  The loop variable p shadows the
             * parameter p inside the outer for-loop only.
             */
            // TEST TEST ON
            clock_gettime(1, &t1);
            for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
            {
                for (int i = 0; i < d->count; i++)
                {
                    d->value[i]++;
                }
            }
            clock_gettime(1, &t2);
            processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
            // TEST TEST OFF
            /* The counters are never reset, so after sample q each one must
             * hold (q + 1) * COUNTERS_MAX. */
            int expectedVal = (q + 1) * COUNTERS_MAX;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
            fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
            bool allOk = true;
            for (int i = 0; i < d->count; i++)
            {
                if(d->value[i]!=expectedVal)
                {
                    if(allOk)
                    {
                        /* First mismatch: start the list of bad indices. */
                        allOk = false;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
                        fprintf(fp," Failed : ");
#endif
#endif
                    }
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
                    fprintf(fp,"%d ", i);
#endif
#endif
                }
            }
#ifdef EXCEL_OUTPUT
            /* Global table indexed by [core][block][sample]. */
            struct tableEntry* entry= &outputTable[p->index][p->iteration][q];
            entry->allOk=allOk;
            entry->expectedVal=expectedVal;
            entry->processTime=processTime;
#else
#ifdef DIRECT_FILE_WRITE
            if(allOk)
            {
                fprintf(fp,"All counters are ok \n");
            }
            else
            {
                fprintf(fp,"\n");
            }
            fprintf(fp, "*** Time = %f ns \n", processTime);
#else
            struct tableEntry* entry= &outputTable[q];
            entry->allOk=allOk;
            entry->expectedVal=expectedVal;
            entry->processTime=processTime;
#endif
#endif
        }
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
        /* Deferred output: dump the buffered samples of this block. */
        for(int q = 0; q < ITERATION_MAX; q++)
        {
            struct tableEntry* entry= &outputTable[q];
            fprintf(fp," Expected : %d\n", entry->expectedVal);
            if(entry->allOk)
            {
                fprintf(fp,"All counters are ok \n");
            }
            else
            {
                fprintf(fp,"Failed \n");
            }
            fprintf(fp, "*** Time = %f ns \n", entry->processTime);
        }
#endif
#endif
        pthread_mutex_lock(&mutexLock_);
        processedCount++;
        pthread_cond_signal(&readWaitHandle);
        /*
         * pthread_cond_wait() may wake up spuriously, so wait on a predicate:
         * the master either stores a new block index in p->iteration or
         * clears canContinue_, both while holding mutexLock_, before it
         * signals newIterWaitHandle.
         */
        const int finishedBlock = p->iteration;
        while (canContinue_ && p->iteration == finishedBlock)
        {
            pthread_cond_wait(&newIterWaitHandle, &mutexLock_);
        }
        pthread_mutex_unlock(&mutexLock_);
    }
    return 0;
}
/*
 * Releases the counter array currently attached to valueMem and attaches a
 * new zero-filled array of newSize 32-bit counters.  Called once per core for
 * every block size.
 *
 * valueMem:  block descriptor; valueMem->value must be NULL or a pointer
 *            obtained from the rte_malloc family.
 * newSize:   number of uint32_t counters to allocate.
 * iteration: unused; kept so that existing callers keep compiling.
 *
 * On allocation failure "Memory Fail" is printed and the descriptor is left
 * empty (value == NULL, count == 0), so code that loops over count does not
 * dereference a NULL array.
 */
static void
mem_alloc(struct data* valueMem, uint32_t newSize, uint32_t iteration)
{
    (void)iteration;

    /* rte_free(NULL) is a no-op, so no guard is needed. */
    rte_free(valueMem->value);
    valueMem->value = NULL;
    valueMem->count = 0;

    valueMem->value = rte_zmalloc(NULL, sizeof *valueMem->value * newSize, 0);
    if(!valueMem->value)
    {
        printf("Memory Fail\n");
        return;
    }
    /* Publish the size only once the array really exists. */
    valueMem->count = newSize;
}
#ifdef EXCEL_OUTPUT
/*
 * Writes all collected timings to "output<testNumber>.xml" in the current
 * directory, in the Excel 2003 XML spreadsheet format.
 *
 * One worksheet is produced per core.  Its first row holds the block sizes
 * in bytes, followed by ITERATION_MAX rows of timings (one column per block
 * size) and three formula rows: mean, standard deviation and time per single
 * counter increment ("Add Latency").
 *
 * If the file cannot be created a diagnostic is printed and nothing is
 * written.
 */
void saveToExcelFile()
{
    char name[64];
    int len = snprintf(name, sizeof name, "output%s.xml", testNumber);
    if (len < 0 || (size_t)len >= sizeof name)
    {
        fprintf(stderr, "saveToExcelFile: output file name is too long\n");
        return;
    }

    FILE* fp = fopen(name, "w");
    if (fp == NULL)
    {
        perror(name);
        return;
    }

    // workbook header: document properties, window settings and cell styles
    fputs("<?xml version=\"1.0\"?>\n"
          "<?mso-application progid=\"Excel.Sheet\"?>\n"
          "<Workbook xmlns=\"urn:schemas-microsoft-com:office:spreadsheet\"\n"
          "xmlns:o=\"urn:schemas-microsoft-com:office:office\"\n"
          "xmlns:x=\"urn:schemas-microsoft-com:office:excel\"\n"
          "xmlns:ss=\"urn:schemas-microsoft-com:office:spreadsheet\"\n"
          "xmlns:html=\"http://www.w3.org/TR/REC-html40\">\n"
          "<DocumentProperties xmlns=\"urn:schemas-microsoft-com:office:office\">\n"
          "<Author>m</Author>\n"
          "<LastAuthor>m</LastAuthor>\n"
          "<Created>2016-06-11T13:00:49Z</Created>\n"
          "<LastSaved>2016-06-11T13:01:30Z</LastSaved>\n"
          "<Version>15.00</Version>\n"
          "</DocumentProperties>\n"
          "<OfficeDocumentSettings xmlns=\"urn:schemas-microsoft-com:office:office\">\n"
          "<AllowPNG/>\n"
          "</OfficeDocumentSettings>\n"
          "<ExcelWorkbook xmlns=\"urn:schemas-microsoft-com:office:excel\">\n"
          "<WindowHeight>7755</WindowHeight>\n"
          "<WindowWidth>20490</WindowWidth>\n"
          "<WindowTopX>0</WindowTopX>\n"
          "<WindowTopY>0</WindowTopY>\n"
          "<ActiveSheet>0</ActiveSheet>\n"
          "<ProtectStructure>False</ProtectStructure>\n"
          "<ProtectWindows>False</ProtectWindows>\n"
          "</ExcelWorkbook>\n"
          "<Styles>\n"
          "<Style ss:ID=\"Default\" ss:Name=\"Normal\">\n"
          "<Alignment ss:Vertical=\"Bottom\"/>\n"
          "<Borders/>\n"
          "<Font ss:FontName=\"Calibri\" x:Family=\"Swiss\" ss:Size=\"11\" ss:Color=\"#000000\"/>\n"
          "<Interior/>\n"
          "<NumberFormat/>\n"
          "<Protection/>\n"
          "</Style>\n"
          "<Style ss:ID=\"s62\">\n"
          "<Font ss:FontName=\"Calibri\" x:Family=\"Swiss\" ss:Size=\"11\" ss:Color=\"#FF0000\"\n"
          "ss:Bold=\"1\"/>\n"
          "</Style>\n"
          "</Styles>\n", fp);

    for(int core = 0; core < CORE_MAX; core++)
    {
        // starts a worksheet: 1 label column + BLOCK_MAX data columns,
        // 1 header row + ITERATION_MAX data rows + 3 formula rows
        fprintf(fp, "<Worksheet ss:Name=\"Sheet%d\">\n"
                    "<Table ss:ExpandedColumnCount=\"%d\" ss:ExpandedRowCount=\"%d\" x:FullColumns=\"1\"\n"
                    "x:FullRows=\"1\" ss:DefaultRowHeight=\"15\">\n",
                core + 1, BLOCK_MAX + 1, ITERATION_MAX + 4);
        fprintf(fp, "<Column ss:Width=\"95.25\"/>\n");

        // header row: block size in bytes; the first cell skips the label column.
        // The value is printed directly, so no fixed-size text buffer can overflow.
        fprintf(fp, "<Row ss:StyleID=\"s62\">\n");
        for(int blk = 0; blk < BLOCK_MAX; blk++)
        {
            float bytes = (float)(pow(Factor, blk + 1)*pow(2.0, BKMG));
            fprintf(fp, "%s<Data ss:Type=\"Number\">%0.3f</Data></Cell>\n",
                    blk == 0 ? "<Cell ss:Index=\"2\">" : "<Cell>", bytes);
        }
        fprintf(fp, "</Row>\n");

        // one row per timed sample
        for(int sample = 0; sample < ITERATION_MAX; sample++)
        {
            fprintf(fp, "<Row>\n");
            for(int blk = 0; blk < BLOCK_MAX; blk++)
            {
                fprintf(fp, "%s<Data ss:Type=\"Number\">%f</Data></Cell>\n",
                        blk == 0 ? "<Cell ss:Index=\"2\">" : "<Cell>",
                        outputTable[core][blk][sample].processTime);
            }
            fprintf(fp, "</Row>\n");
        }

        // mean over the ITERATION_MAX data rows directly above
        fprintf(fp, "<Row>\n");
        fprintf(fp, "<Cell ss:StyleID=\"s62\"><Data ss:Type=\"String\">Mean</Data></Cell>\n");
        for(int blk = 0; blk < BLOCK_MAX; blk++)
        {
            fprintf(fp, " <Cell ss:Formula=\"=AVERAGE(R[%d]C:R[-1]C)\"><Data ss:Type=\"Number\">0</Data></Cell>\n", -ITERATION_MAX);
        }
        fprintf(fp, "</Row>\n");

        // standard deviation: the row directly above is the Mean row, so the
        // data range ends two rows up; ending at R[-1] would count the mean
        // as an extra sample
        fprintf(fp, "<Row>\n");
        fprintf(fp, "<Cell ss:StyleID=\"s62\"><Data ss:Type=\"String\">Standard Deviation</Data></Cell>\n");
        for(int blk = 0; blk < BLOCK_MAX; blk++)
        {
            fprintf(fp, " <Cell ss:Formula=\"=STDEV(R[%d]C:R[-2]C)\"><Data ss:Type=\"Number\">0</Data></Cell>\n", -(ITERATION_MAX + 1));
        }
        fprintf(fp, "</Row>\n");

        // time per increment: mean / counters in the block / COUNTERS_MAX,
        // where the block holds (2^BKMG / 4) * Factor^(blk + 1) counters
        fprintf(fp, "<Row>\n");
        fprintf(fp, "<Cell ss:StyleID=\"s62\"><Data ss:Type=\"String\">Add Latency</Data></Cell>\n");
        for(int blk = 0; blk < BLOCK_MAX; blk++)
        {
            fprintf(fp, " <Cell ss:Formula=\"=R[-2]C/(2^%d/4)/%d/%f^%d\"><Data ss:Type=\"Number\">0</Data></Cell>\n", BKMG, COUNTERS_MAX, Factor, blk + 1);
        }
        fprintf(fp, "</Row>\n");

        //end of worksheet
        fprintf(fp, "</Table>\n</Worksheet>\n");
    }
    //end of file
    fprintf(fp, "</Workbook>");
    if (fclose(fp) != 0)
    {
        perror(name);
    }
}
#endif
int
main(int argc, char **argv)
{
mkdir("./Resaults", 0777);
int ret;
unsigned lcore_id;
pthread_attr_t attr;
pthread_mutex_init(&mutexLock_, NULL);
pthread_cond_init(&newIterWaitHandle, NULL);
pthread_cond_init(&readWaitHandle, NULL);
ret = rte_eal_init(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Cannot init EAL\n");
struct lcore_params params[CORE_MAX];
char numT[5];
sprintf(numT, "%d", CORE_MAX);
for(int i = 0; i < CORE_MAX; i++)
{
// Generates some structures to hold information of assinged job of each core
struct data* commonMem = (struct data*)rte_malloc(NULL, sizeof(struct data), 0);
#ifndef EXCEL_OUTPUT
char num[5];
sprintf(num, "%d", i);
char name3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
strcat(name3, "./Resaults/");
strcat(name3, testNumber);
mkdir(name3, 0777);
strcat(name3, "/R");
strcat(name3, num);
strcat(name3, "_");
strcat(name3, numT);
strcat(name3, "Core");
mkdir(name3, 0777);
char name2[] = {'/','R', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
strcat(name2, num);
strcat(name2, "_");
strcat(name2, names[0]);
strcat(name2, ".txt");
strcat(name3, name2);
params[i].fp = fopen(name3, "w");
#endif
mem_alloc(commonMem, sizes[0], 0);
params[i].valueMem = commonMem;
params[i].index = i;
params[i].iteration = 0;
commonMem->value[i] = NULL;
}
/*
printf("sleep ...\n");
for(int f=0;f<4; f++)
{
sleep(1);
}
*/
/*
double p=0;
for(double f=0;f<1e9; f+=0.3)
{
p+=0.1;
}*/
printf("Starting lcores ...\n");
printf("RTE_MAX_LCORE = %d\n", RTE_MAX_LCORE);
lcore_id = rte_get_next_lcore(-1, 1, 0);
processedCount = 0;
// Ask each core do the funtion lcore_recv
for(int i = 0; i < CORE_MAX; i++)
{
rte_eal_remote_launch((lcore_function_t*)lcore_recv, ¶ms[i], lcore_id);
lcore_id = rte_get_next_lcore(lcore_id, 0, 1);
}
// For each core do the function for "BLOCK_MAX" times
for(int j = 1; j <= BLOCK_MAX; j++)
{
printf("Iteration : %d\n", j);
pthread_mutex_lock(&mutexLock_);
while(processedCount < CORE_MAX)
{
pthread_cond_wait(&readWaitHandle, &mutexLock_);
}
for(int i = 0; i < CORE_MAX; i++)
{
#ifndef EXCEL_OUTPUT
fclose(params[i].fp);
if(j < BLOCK_MAX)
{
char num[5];
sprintf(num, "%d", i);
char name3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
strcat(name3, "./Resaults/");
strcat(name3, testNumber);
mkdir(name3, 0777);
strcat(name3, "/R");
strcat(name3, num);
strcat(name3, "_");
strcat(name3, numT);
strcat(name3, "Core");
mem_alloc( params[i].valueMem, sizes[j], j);
char name2[] = {'/','R', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
strcat(name2, num);
strcat(name2, "_");
strcat(name2, names[j]);
strcat(name2, ".txt");
strcat(name3, name2);
params[i].fp = fopen(name3,"w");
params[i].iteration = j;
}
#else
mem_alloc( params[i].valueMem, sizes[j], j);
params[i].iteration = j;
#endif
}
if(j < BLOCK_MAX)
{
printf("%d : New Data Added ----------\n", j);
}
else
{
canContinue_ = false;
}
//Signal cores in order to start new iteration
processedCount = 0;
for(int i = 0; i < CORE_MAX; i++)
{
pthread_cond_signal(&newIterWaitHandle);
}
pthread_mutex_unlock(&mutexLock_);
}
printf("Waiting for lcores to finish ...\n");
#ifdef EXCEL_OUTPUT
saveToExcelFile();
#endif
rte_eal_mp_wait_lcore();
return 0;
}
我使用下面的 run.sh 脚本来运行程序:
#!/bin/sh
# Run the benchmark on lcores 0, 2, 4 and 6 (core mask 0x55), with lcore 0 as
# the DPDK master lcore.
# NOTE(review): check with lscpu that these logical CPUs are not hyper-thread
# siblings of each other - confirm for the target machine.
./build/app/Mahdi_test -c 0x55 --master-lcore 0
下面是 TEST TEST ON 与 TEST TEST OFF 两个标记之间的代码(内层循环)对应的汇编:
// TEST TEST ON
clock_gettime(1, &t1);
47: 48 89 e6 mov rsi,rsp
4a: bf 01 00 00 00 mov edi,0x1
4f: e8 00 00 00 00 call 54 <lcore_recv+0x54>
54: 8b 4b 08 mov ecx,DWORD PTR [rbx+0x8]
57: be 10 27 00 00 mov esi,0x2710
5c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
{
for (int i = 0; i < d->count; i++)
60: 85 c9 test ecx,ecx
62: 74 1d je 81 <lcore_recv+0x81>
64: 48 8b 03 mov rax,QWORD PTR [rbx]
67: 31 d2 xor edx,edx
69: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
{
d->value[i]++;
70: 83 00 01 add DWORD PTR [rax],0x1
double processTime = 0;
// TEST TEST ON
clock_gettime(1, &t1);
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
{
for (int i = 0; i < d->count; i++)
73: 83 c2 01 add edx,0x1
76: 48 83 c0 04 add rax,0x4
7a: 8b 4b 08 mov ecx,DWORD PTR [rbx+0x8]
7d: 39 ca cmp edx,ecx
7f: 72 ef jb 70 <lcore_recv+0x70>
for(int q = 0; q < ITERATION_MAX; q++)
{
double processTime = 0;
// TEST TEST ON
clock_gettime(1, &t1);
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
81: 83 ee 01 sub esi,0x1
84: 75 da jne 60 <lcore_recv+0x60>
for (int i = 0; i < d->count; i++)
{
d->value[i]++;
}
}
clock_gettime(1, &t2);
86: 48 8d 74 24 10 lea rsi,[rsp+0x10]
8b: bf 01 00 00 00 mov edi,0x1
90: e8 00 00 00 00 call 95 <lcore_recv+0x95>
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
for (int i = 0; i < d->count; i++)
95: 8b 4b 08 mov ecx,DWORD PTR [rbx+0x8]
clock_gettime(1, &t2);
processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
// TEST TEST OFF
//Checks last value of each counter
int expectedVal = (q + 1) * COUNTERS_MAX;
98: 41 8d 7c 24 01 lea edi,[r12+0x1]
{
d->value[i]++;
}
}
clock_gettime(1, &t2);
processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
9d: c4 e1 f3 2a 4c 24 10 vcvtsi2sd xmm1,xmm1,QWORD PTR [rsp+0x10]
a4: c4 e1 eb 2a 14 24 vcvtsi2sd xmm2,xmm2,QWORD PTR [rsp]
aa: c4 e1 fb 2a 44 24 18 vcvtsi2sd xmm0,xmm0,QWORD PTR [rsp+0x18]
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
for (int i = 0; i < d->count; i++)
b1: 85 c9 test ecx,ecx
{
d->value[i]++;
}
}
clock_gettime(1, &t2);
processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
b3: c5 f3 59 0d 00 00 00 vmulsd xmm1,xmm1,QWORD PTR [rip+0x0] # bb <lcore_recv+0xbb>
ba: 00
bb: c5 eb 59 15 00 00 00 vmulsd xmm2,xmm2,QWORD PTR [rip+0x0] # c3 <lcore_recv+0xc3>
c2: 00
c3: c5 f3 58 d8 vaddsd xmm3,xmm1,xmm0
c7: c4 e1 f3 2a 4c 24 08 vcvtsi2sd xmm1,xmm1,QWORD PTR [rsp+0x8]
ce: c5 eb 58 c1 vaddsd xmm0,xmm2,xmm1
d2: c5 e3 5c c0 vsubsd xmm0,xmm3,xmm0
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
for (int i = 0; i < d->count; i++)
d6: 74 6a je 142 <lcore_recv+0x142>
d8: 48 8b 33 mov rsi,QWORD PTR [rbx]
db: 31 c0 xor eax,eax
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
dd: ba 01 00 00 00 mov edx,0x1
e2: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
e8: 44 39 2c 86 cmp DWORD PTR [rsi+rax*4],r13d
ec: 41 0f 45 d6 cmovne edx,r14d
f0: 48 83 c0 01 add rax,0x1
for (int i = 0; i < d->count; i++)
f4: 39 c1 cmp ecx,eax
f6: 77 f0 ja e8 <lcore_recv+0xe8>
#endif
}
}
关于缓存级别(L1、L2、L3)和 RAM 的访问时间, 我遇到了我还没有找到答案的奇怪行为, 如果你能帮助我,我将不胜感激:)
我开始按照以下方式填充内存块, 我有不同的块大小作为输入,例如 16 字节,32 字节,.... 256 KB, 对于每个特定的块,我读取内存,对其进行计数并将其写回。例如,对于 1 KB,我有 256 个不同的计数器数组(因为我的计数器是 int32 并且 32 位 = 4 字节), 我从零开始,将 256 个不同的计数器数组作为起点(我们称之为计数器数组)计数并写回,我做了 10,000 次计数(0~10000),并进行 10,000 次 100 次并记录这 100 个结果,取平均值并计算处理时间 (时间计算如下代码)
COUNTERS_MAX = 10000;
ITERATION_MAX = 100;
/*
 * Worker routine run on each measuring lcore (one counter block per core).
 *
 * While canContinue_ is true it takes ITERATION_MAX timing samples on the
 * counter block p->valueMem: each sample increments every counter
 * COUNTERS_MAX times and then checks that every counter equals
 * (q + 1) * COUNTERS_MAX.  Results are stored in outputTable (EXCEL_OUTPUT)
 * or written to p->fp.  After the samples it increments processedCount,
 * signals readWaitHandle and blocks on newIterWaitHandle.
 *
 * p: per-core parameters (counter block, output file, core index, block index).
 * Returns 0 once canContinue_ has been cleared.
 */
static int
lcore_recv(struct lcore_params *p)
{
unsigned lcore_id = rte_lcore_id();
printf("Starting core %u\n", lcore_id);
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
// Local result buffer, flushed to fp after all samples of a block are taken.
struct tableEntry outputTable[ITERATION_MAX];
#endif
#endif
while(canContinue_)
{
//printf("Starting core %u\n", lcore_id);
//int index=((lcore_id-p->baseIndex)-1+CORE_MAX)%CORE_MAX;
// NOTE(review): vp is never used.
void * vp;
struct data * d = p->valueMem;
FILE* fp = p->fp;
//fprintf(fp, "Iteration %d ----------------------\n", p->iteration);
//int index = p->index;
struct timespec t1, t2;
for(int q = 0; q < ITERATION_MAX; q++)
{
double processTime = 0;
// Timed region starts here; clock id 1 is CLOCK_MONOTONIC on Linux.
clock_gettime(1, &t1);
// NOTE: this loop variable shadows the parameter p inside the loop only;
// the p->index / p->iteration accesses further down use the parameter again.
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
{
for (int i = 0; i < d->count; i++)
{
d->value[i]++;
}
}
clock_gettime(1, &t2);
// Elapsed time of the COUNTERS_MAX passes over the whole block.
processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
// The counters are never reset, so after sample q each one must hold (q + 1) * COUNTERS_MAX.
int expectedVal = (q + 1) * COUNTERS_MAX;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
for (int i = 0; i < d->count; i++)
{
if(d->value[i]!=expectedVal)
{
if(allOk)
{
// First mismatch: remember the failure and start the list of bad indices.
allOk = false;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Failed : ");
#endif
#endif
}
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
fprintf(fp,"%d ", i);
#endif
#endif
}
}
#ifdef EXCEL_OUTPUT
// Global table indexed by [core][block][sample]; written to disk by saveToExcelFile().
struct tableEntry* entry= &outputTable[p->index][p->iteration][q];
entry->allOk=allOk;
entry->expectedVal=expectedVal;
entry->processTime=processTime;
#else
#ifdef DIRECT_FILE_WRITE
if(allOk)
{
fprintf(fp,"All counters are ok \n");
}
else
{
fprintf(fp,"\n");
}
fprintf(fp, "*** Time = %f ns \n", processTime);
#else
struct tableEntry* entry= &outputTable[q];
entry->allOk=allOk;
entry->expectedVal=expectedVal;
entry->processTime=processTime;
#endif
#endif
}
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
// Deferred output: dump the buffered samples of this block to the file.
for(int q = 0; q < ITERATION_MAX; q++)
{
struct tableEntry* entry= &outputTable[q];
fprintf(fp," Expected : %d\n", entry->expectedVal);
if(entry->allOk)
{
fprintf(fp,"All counters are ok \n");
}
else
{
fprintf(fp,"Failed \n");
}
fprintf(fp, "*** Time = %f ns \n", entry->processTime);
}
#endif
#endif
// Report completion of this block, then sleep until newIterWaitHandle is signalled.
// NOTE(review): the wait has no predicate loop, so a spurious wakeup would
// start the next pass before a new block has been set up.
pthread_mutex_lock(&mutexLock_);
processedCount++;
pthread_cond_signal(&readWaitHandle);
pthread_cond_wait(&newIterWaitHandle, &mutexLock_);
pthread_mutex_unlock(&mutexLock_);
}
return 0;
}
所以对于每个块我都做了相同的测试。例如,如果我有 20 个不同的测试点(块存储器,如 16 B、32 B,....),我将在 'ns' 中得到 100 行和 20 列时间的矩阵。 所以每一列显示不同的块大小,每一行显示不同的 100 测试。 最后我得到了每列的平均值并计算了每列的处理时间,奇怪的行为如下所示, the block size based on Byte and the Y axis is the latency for each process in 'ns', here you could see 3 different cores which run at the same time with the same more or less behaviour 每当我从 16 B 这样的小块开始,大约 50 字节到 600 字节的间隔时,我总是看到这种疯狂的行为,我不知道为什么? (我的第一个问题) 因此,如果继续超过 2.93 MB(大约 8 MB(LLC 大小)/ 3(同时 运行 的不同内核),我们将进行如下跳转) 3 different core run simultaneously 我的第二个问题是,如果这个跳跃有意义,我的意思是 LLC 延迟和 RAM 延迟之间的差异大约 2.5 或 3 倍是可以的,或者应该更多)
PS.My 系统是 Core i7,3.4 Ghz,L1:32 KB,L2:256 KB 和 L3:8 MB,16 GB RAM
提前感谢您的帮助和考虑
您的测试方法并不能测量缓存延迟(而且您开启了 Turbo Boost,因此 CPU 频率并不恒定)。
缓存的延迟是已知的,并且以 cpu 周期而不是 ns 为单位进行测量(缓存以 cpu 核心频率运行);内存延迟以周期 + ns 为单位,因为数据在从内存中读取后必须通过缓存层次结构(周期)(ns,内存有自己的时钟)。
例如 i7-4xxx (Haswell): http://7-cpu.com/cpu/Haswell.html
Intel Haswell
Intel i7-4770 (Haswell), 3.4 GHz (Turbo Boost off), 22 nm. RAM: 32 GB (PC3-12800 cl11 cr2).
- L1 Data Cache Latency = 4 cycles for simple access via pointer
- L1 Data Cache Latency = 5 cycles for access with complex address calculation (size_t n, *p; n = p[n]).
- L2 Cache Latency = 12 cycles
- L3 Cache Latency = 36 cycles
- RAM Latency = 36 cycles + 57 ns
您现在拥有的是:向多个计数器添加一些 "count" 常量(是的,编译器很可能能够将内部循环优化为 d->value[i] += d->count
)。
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
{
for (int i = 0; i < d->count; i++)
{
d->value[i]++;
}
}
你应该用什么来衡量缓存延迟:
Widely used classic test for cache latency is iterating over the linked list. ... This method is used by open-source lmbench - in the test lat_mem_rd ... There are sources of lat_mem_rd test from lmbench: https://github.com/foss-for-synopsys-dwc-arc-processors/lmbench/blob/master/src/lat_mem_rd.c
the main test is
/* One dependent load: replace p with the pointer stored at *p. */
#define ONE p = (char **)*p;
/* The following macros repeat ONE 5, 10, 50 and 100 times. */
#define FIVE ONE ONE ONE ONE ONE
#define TEN FIVE FIVE
#define FIFTY TEN TEN TEN TEN TEN
#define HUNDRED FIFTY FIFTY
/*
 * Pointer-chasing kernel: starting from state->p[0], follows the chain of
 * pointers 100 * count times per iteration, where
 * count = state->len / (state->line * 100) + 1.
 * Each load's address comes from the previous load, so the loads form one
 * dependency chain.
 *
 * iterations: number of times the whole walk is repeated.
 * cookie:     pointer to the struct mem_state describing the buffer.
 */
void
benchmark_loads(iter_t iterations, void *cookie)
{
struct mem_state* state = (struct mem_state*)cookie;
register char **p = (char**)state->p[0];
register size_t i;
register size_t count = state->len / (state->line * 100) + 1;
while (iterations-- > 0) {
for (i = 0; i < count; ++i) {
HUNDRED;
}
}
/* presumably keeps the compiler from discarding the loads - confirm in lmbench */
use_pointer((void *)p);
/* Remember where the walk stopped so that the next call continues from there. */
state->p[0] = (char*)p;
}
所以,在破译宏之后,我们做了很多线性操作,比如:
p = (char**) *p; // (in intel syntax) == mov eax, [eax]
p = (char**) *p;
p = (char**) *p;
.... // 100 times total
p = (char**) *p;
如手册页所述http://www.bitmover.com/lmbench/lat_mem_rd.8.html
The benchmark runs as two nested loops. The outer loop is the stride size. The inner loop is the array size. For each array size, the benchmark creates a ring of pointers that point forward one stride. Traversing the array is done by
p = (char **)*p;
Ubuntu 14.04.4 LTS,我使用 GCC 编译,我的 makefile 如下:
# Build file for the cache benchmark, using the DPDK external-application
# makefile framework (rte.vars.mk / rte.extapp.mk).
ifeq ($(RTE_SDK),)
$(error "Please define RTE_SDK environment variable")
endif
# Default target, can be overridden by command line or environment
RTE_TARGET ?= x86_64-native-linuxapp-gcc
include $(RTE_SDK)/mk/rte.vars.mk
# binary name
APP = Mahdi_test
INC += $(wildcard include/*.h)
# all source are stored in SRCS-y
SRCS-y := main.c
# Header search paths.  The directory must follow -I directly: with the
# previous "-I -S$(SRCDIR)/include" gcc used "-S$(SRCDIR)/include" as the
# directory name, so the project's include/ directory was never searched.
CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)/include -I/usr/local/include
# Most optimizations are only enabled if an -O level is set on the command line,
# otherwise they are disabled, even if individual optimization flags are specified.
# With -O, the compiler tries to reduce code size and execution time,
# without performing any optimizations that take a great deal of compilation time.
# -O3 Optimize yet more. -O3 turns on all optimizations specified by -O2
# EXTRA_CFLAGS += -O3 -S -Wno-error -std=c99
# After following line do make, go to ./build and run : objdump -d -M intel -S main.o >a.txt
EXTRA_CFLAGS += -O3 -g -Wno-error -std=c99
# rte.extapp.mk : External application
include $(RTE_SDK)/mk/rte.extapp.mk
CPU : 架构:x86_64 CPU 操作模式:32 位、64 位 字节顺序:小字节序 CPU(s): 8 在线 CPU(s) 名单:0-7 每核心线程数:2 每个插槽的核心数:4 插座:1 NUMA 节点:1 供应商 ID:GenuineIntel CPU 家庭:6 型号:42 步进:7 CPU 兆赫:1600.000 BogoMIPS:6784.24 虚拟化:VT-x 一级缓存:32K 一级缓存:32K 二级缓存:256K 三级缓存:8192K NUMA 节点 0 CPU(s): 0-7
所有代码都在单个文件中(我使用 dpdk 是为了利用这个库的好处),
#if __STDC_VERSION__ >= 199901L
#define _XOPEN_SOURCE 600
#else
#define _XOPEN_SOURCE 500
#endif /* __STDC_VERSION__ */
#include <math.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include <rte_memory.h>
#include <rte_malloc.h>
#include <rte_ring.h>
// Output mode switch: with EXCEL_OUTPUT defined, all timings are kept in memory
// and written as one spreadsheet at the end of the run. Without it,
// DIRECT_FILE_WRITE is enabled and every core writes text reports as it goes.
#define EXCEL_OUTPUT
#ifndef EXCEL_OUTPUT
#define DIRECT_FILE_WRITE
#endif
// Number of worker lcores that are launched (one lcore_params per worker).
#define CORE_MAX 3
// Number of block sizes that are benchmarked, taken from the start of sizes[].
#define BLOCK_MAX 20 // BKMG = 4, ~ 168.72 MB
// How many times every counter of a block is incremented inside one timed sample.
#define COUNTERS_MAX 10000
// Number of timed samples recorded per block size.
#define ITERATION_MAX 100
// Growth ratio between two consecutive block sizes (block n is Factor^n * 2^BKMG bytes).
#define Factor 1.5
// Exponent of the base block size: 2^BKMG bytes.
#define BKMG 4
// Tag of this test run; it becomes part of the output file and directory names.
char* testNumber = "23";
/*
uint32_t sizes[BLOCK_MAX] = {
1*Factor*pow(2, 10)/4, 2*Factor*pow(2, 10)/4, 4*Factor*pow(2, 10)/4, 8*Factor*pow(2, 10)/4, 16*Factor*pow(2, 10)/4, 32*Factor*pow(2, 10)/4, 64*Factor*pow(2, 10)/4, 128*Factor*pow(2, 10)/4, 256*Factor*pow(2, 10)/4, 512*Factor*pow(2, 10)/4,
1*Factor*pow(2, 20)/4, 2*Factor*pow(2, 20)/4, 4*Factor*pow(2, 20)/4, 8*Factor*pow(2, 20)/4, 16*Factor*pow(2, 20)/4, 32*Factor*pow(2, 20)/4, 64*Factor*pow(2, 20)/4, 128*Factor*pow(2, 20)/4, 256*Factor*pow(2, 20)/4, 512*Factor*pow(2, 20)/4,
1*Factor*pow(2, 30)/4, 2*Factor*pow(2, 30)/4
};
*/
// Block sizes under test, expressed as a number of 32-bit counters.
// Entry n (1-based) is Factor^n * 2^BKMG bytes divided by 4 bytes per counter;
// the conversion to uint32_t drops the fractional part.
// The table always lists 50 sizes and BLOCK_MAX selects how many of them are
// benchmarked. The array is therefore left unsized: declaring it with BLOCK_MAX
// elements turned every entry past BLOCK_MAX into an excess initializer.
// BLOCK_MAX must not exceed the 50 entries listed here.
#define BLOCK_COUNTERS(n) (pow(Factor, (n))*pow(2, BKMG)/4)
uint32_t sizes[] = {
    BLOCK_COUNTERS( 1), BLOCK_COUNTERS( 2), BLOCK_COUNTERS( 3), BLOCK_COUNTERS( 4), BLOCK_COUNTERS( 5),
    BLOCK_COUNTERS( 6), BLOCK_COUNTERS( 7), BLOCK_COUNTERS( 8), BLOCK_COUNTERS( 9), BLOCK_COUNTERS(10),
    BLOCK_COUNTERS(11), BLOCK_COUNTERS(12), BLOCK_COUNTERS(13), BLOCK_COUNTERS(14), BLOCK_COUNTERS(15),
    BLOCK_COUNTERS(16), BLOCK_COUNTERS(17), BLOCK_COUNTERS(18), BLOCK_COUNTERS(19), BLOCK_COUNTERS(20),
    BLOCK_COUNTERS(21), BLOCK_COUNTERS(22), BLOCK_COUNTERS(23), BLOCK_COUNTERS(24), BLOCK_COUNTERS(25),
    BLOCK_COUNTERS(26), BLOCK_COUNTERS(27), BLOCK_COUNTERS(28), BLOCK_COUNTERS(29), BLOCK_COUNTERS(30),
    BLOCK_COUNTERS(31), BLOCK_COUNTERS(32), BLOCK_COUNTERS(33), BLOCK_COUNTERS(34), BLOCK_COUNTERS(35),
    BLOCK_COUNTERS(36), BLOCK_COUNTERS(37), BLOCK_COUNTERS(38), BLOCK_COUNTERS(39), BLOCK_COUNTERS(40),
    BLOCK_COUNTERS(41), BLOCK_COUNTERS(42), BLOCK_COUNTERS(43), BLOCK_COUNTERS(44), BLOCK_COUNTERS(45),
    BLOCK_COUNTERS(46), BLOCK_COUNTERS(47), BLOCK_COUNTERS(48), BLOCK_COUNTERS(49), BLOCK_COUNTERS(50),
};
#undef BLOCK_COUNTERS
/*
char* names[BLOCK_MAX] = {
"1K", "2K", "4K", "8K", "16K", "32K", "64K", "128K", "256K", "512K",
"1M", "2M", "4M", "8M", "16M", "32M", "64M", "128M", "256M", "512M",
"1G", "2G"
};
*/
// Two-digit labels of the block sizes, parallel to sizes[]; they are used in the
// names of the per-block report files.
// Left unsized because the list always holds 50 labels while BLOCK_MAX may be
// smaller; sizing it with BLOCK_MAX made the surplus labels excess initializers.
char* names[] = {
    "01", "02", "03", "04", "05", "06", "07", "08", "09", "10",
    "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
    "21", "22", "23", "24", "25", "26", "27", "28", "29", "30",
    "31", "32", "33", "34", "35", "36", "37", "38", "39", "40",
    "41", "42", "43", "44", "45", "46", "47", "48", "49", "50",
};
/*
 * Job description handed to one worker core; the master keeps one instance
 * per launched core and updates it between blocks.
 */
struct lcore_params
{
    /* Counter buffer (address and length) this core works on. */
    struct data *valueMem;
    /* Index of the memory block currently being processed. */
    int iteration;
    /* Report file of this core; only opened when results are written as text. */
    FILE *fp;
    /* Slot number of the core; selects the core's part of the result table. */
    int index;
};
/* Describes the counter buffer that is allocated separately for every core. */
struct data
{
    /* Start of the buffer of 32-bit counters. */
    uint32_t *value;
    /* Number of 32-bit counters in the buffer. */
    uint32_t count;
};
/* Result of one timed sample. */
struct tableEntry
{
    /* Value every counter must hold once the sample is finished. */
    int expectedVal;
    /* Duration of the sample in nanoseconds. */
    double processTime;
    /* True when all counters matched expectedVal. */
    bool allOk;
};
/* Mutex under which processedCount is updated and both condition variables are waited on. */
pthread_mutex_t mutexLock_;
/* Signalled by a worker after it has finished a block; the master waits on it until all workers are done. */
pthread_cond_t readWaitHandle;
/* Signalled by the master once the next block is prepared; workers wait on it between blocks. */
pthread_cond_t newIterWaitHandle;
/* Cleared by the master after the last block so that the workers leave their loop. */
bool canContinue_ = true;
/* Number of workers that have finished the current block. */
int processedCount = 0;
#ifdef EXCEL_OUTPUT
/* All measurements, indexed [core][block][sample]; saved to the spreadsheet at the end of the run. */
struct tableEntry outputTable[CORE_MAX][BLOCK_MAX][ITERATION_MAX];
#endif
// Worker body executed on every launched lcore.
//
// For each memory block prepared by the master it takes ITERATION_MAX timed
// samples. One sample increments every counter of the block COUNTERS_MAX times,
// then checks that all counters hold the expected value. The result is stored
// in the global table (EXCEL_OUTPUT), written straight to the core's report
// file (DIRECT_FILE_WRITE), or buffered locally and written after the block.
// After a block the worker reports completion to the master and sleeps until
// the master has prepared the next block or has cleared canContinue_.
//
// p: job description of this core (buffer, block index, report file, slot).
// Returns 0 once the master has cleared canContinue_.
static int
lcore_recv(struct lcore_params *p)
{
    unsigned lcore_id = rte_lcore_id();
    printf("Starting core %u\n", lcore_id);
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
    struct tableEntry outputTable[ITERATION_MAX];
#endif
#endif
    while(canContinue_)
    {
        struct data * d = p->valueMem;
        // Block this round works on. It is remembered so that the wait at the
        // end of the round can tell a real hand-over from a spurious wakeup.
        const int block = p->iteration;
#ifndef EXCEL_OUTPUT
        // The report file only exists when results are written as text.
        FILE* fp = p->fp;
#endif
        struct timespec t1, t2;
        for(int q = 0; q < ITERATION_MAX; q++)
        {
            double processTime = 0;
            // The statements between the two markers are the measured code and
            // are deliberately kept exactly as benchmarked.
            // Clock id 1 is CLOCK_MONOTONIC in the Linux headers.
            // TEST TEST ON
            clock_gettime(1, &t1);
            for(uint32_t pass = 0; pass <= COUNTERS_MAX - 1; pass++)
            {
                for (int i = 0; i < d->count; i++)
                {
                    d->value[i]++;
                }
            }
            clock_gettime(1, &t2);
            // Subtract field by field before converting to double, so the
            // difference is not formed from two large, rounded absolute times.
            processTime = (t2.tv_sec - t1.tv_sec)*1e9 + (t2.tv_nsec - t1.tv_nsec);/* nanoseconds */
            // TEST TEST OFF
            // Every counter started at zero and gains COUNTERS_MAX per sample.
            int expectedVal = (q + 1) * COUNTERS_MAX;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
            fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
            bool allOk = true;
            for (uint32_t i = 0; i < d->count; i++)
            {
                if(d->value[i] != (uint32_t)expectedVal)
                {
                    if(allOk)
                    {
                        allOk = false;
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
                        fprintf(fp," Failed : ");
#endif
#endif
                    }
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
                    fprintf(fp,"%u ", i);
#endif
#endif
                }
            }
#ifdef EXCEL_OUTPUT
            struct tableEntry* entry= &outputTable[p->index][block][q];
            entry->allOk=allOk;
            entry->expectedVal=expectedVal;
            entry->processTime=processTime;
#else
#ifdef DIRECT_FILE_WRITE
            if(allOk)
            {
                fprintf(fp,"All counters are ok \n");
            }
            else
            {
                fprintf(fp,"\n");
            }
            fprintf(fp, "*** Time = %f ns \n", processTime);
#else
            struct tableEntry* entry= &outputTable[q];
            entry->allOk=allOk;
            entry->expectedVal=expectedVal;
            entry->processTime=processTime;
#endif
#endif
        }
#ifndef EXCEL_OUTPUT
#ifndef DIRECT_FILE_WRITE
        // Buffered mode: write the samples of this block after the measurement.
        for(int q = 0; q < ITERATION_MAX; q++)
        {
            struct tableEntry* entry= &outputTable[q];
            fprintf(fp," Expected : %d\n", entry->expectedVal);
            if(entry->allOk)
            {
                fprintf(fp,"All counters are ok \n");
            }
            else
            {
                fprintf(fp,"Failed \n");
            }
            fprintf(fp, "*** Time = %f ns \n", entry->processTime);
        }
#endif
#endif
        pthread_mutex_lock(&mutexLock_);
        processedCount++;
        pthread_cond_signal(&readWaitHandle);
        // pthread_cond_wait() may return without a signal. Keep sleeping until
        // the master has really moved on (new block index) or ended the run;
        // otherwise the next round could start on a buffer that is being replaced.
        while(canContinue_ && p->iteration == block)
        {
            pthread_cond_wait(&newIterWaitHandle, &mutexLock_);
        }
        pthread_mutex_unlock(&mutexLock_);
    }
    return 0;
}
// Replaces the counter buffer of one core with a freshly zeroed buffer of
// newSize 32-bit counters. Called once per core for every block size.
//
// valueMem:  buffer descriptor of the core; its old buffer (if any) is released.
// newSize:   number of counters to allocate.
// iteration: block index; currently unused, kept for the callers.
//
// On allocation failure "Memory Fail" is printed and the descriptor is left
// empty (value NULL, count 0), so that code looping over count never touches
// the missing buffer.
static void
mem_alloc(struct data* valueMem, uint32_t newSize, uint32_t iteration)
{
    (void)iteration;
    // rte_free() does nothing for NULL, so no guard is needed.
    rte_free(valueMem->value);
    valueMem->value = (uint32_t *)rte_zmalloc(NULL, sizeof(*valueMem->value) * newSize, 0);
    if(!valueMem->value)
    {
        printf("Memory Fail\n");
        valueMem->count = 0;
        return;
    }
    // Publish the new length only together with a valid buffer.
    valueMem->count = newSize;
}
#ifdef EXCEL_OUTPUT
// Writes all collected timings to "output<testNumber>.xml" in the Excel 2003
// XML spreadsheet format: one worksheet per core, one column per block size.
//
// Row layout of every worksheet:
//   row 1                      block sizes in bytes (header)
//   rows 2 .. ITERATION_MAX+1  one timing sample per row
//   next three rows            Mean, Standard Deviation, Add Latency formulas
//
// If the file cannot be created, the reason is reported on stderr and nothing
// is written.
void saveToExcelFile()
{
    char name[64];
    int nameLen = snprintf(name, sizeof name, "output%s.xml", testNumber);
    if(nameLen < 0 || (size_t)nameLen >= sizeof name)
    {
        fprintf(stderr, "Output file name is too long\n");
        return;
    }
    FILE* fp = fopen(name, "w");
    if(!fp)
    {
        perror(name);
        return;
    }
    // some setting of excel and xml file
    fputs(
        "<?xml version=\"1.0\"?>\n"
        "<?mso-application progid=\"Excel.Sheet\"?>\n"
        "<Workbook xmlns=\"urn:schemas-microsoft-com:office:spreadsheet\"\n"
        "xmlns:o=\"urn:schemas-microsoft-com:office:office\"\n"
        "xmlns:x=\"urn:schemas-microsoft-com:office:excel\"\n"
        "xmlns:ss=\"urn:schemas-microsoft-com:office:spreadsheet\"\n"
        "xmlns:html=\"http://www.w3.org/TR/REC-html40\">\n"
        "<DocumentProperties xmlns=\"urn:schemas-microsoft-com:office:office\">\n"
        "<Author>m</Author>\n"
        "<LastAuthor>m</LastAuthor>\n"
        "<Created>2016-06-11T13:00:49Z</Created>\n"
        "<LastSaved>2016-06-11T13:01:30Z</LastSaved>\n"
        "<Version>15.00</Version>\n"
        "</DocumentProperties>\n"
        "<OfficeDocumentSettings xmlns=\"urn:schemas-microsoft-com:office:office\">\n"
        "<AllowPNG/>\n"
        "</OfficeDocumentSettings>\n"
        "<ExcelWorkbook xmlns=\"urn:schemas-microsoft-com:office:excel\">\n"
        "<WindowHeight>7755</WindowHeight>\n"
        "<WindowWidth>20490</WindowWidth>\n"
        "<WindowTopX>0</WindowTopX>\n"
        "<WindowTopY>0</WindowTopY>\n"
        "<ActiveSheet>0</ActiveSheet>\n"
        "<ProtectStructure>False</ProtectStructure>\n"
        "<ProtectWindows>False</ProtectWindows>\n"
        "</ExcelWorkbook>\n"
        "<Styles>\n"
        "<Style ss:ID=\"Default\" ss:Name=\"Normal\">\n"
        "<Alignment ss:Vertical=\"Bottom\"/>\n"
        "<Borders/>\n"
        "<Font ss:FontName=\"Calibri\" x:Family=\"Swiss\" ss:Size=\"11\" ss:Color=\"#000000\"/>\n"
        "<Interior/>\n"
        "<NumberFormat/>\n"
        "<Protection/>\n"
        "</Style>\n"
        "<Style ss:ID=\"s62\">\n"
        "<Font ss:FontName=\"Calibri\" x:Family=\"Swiss\" ss:Size=\"11\" ss:Color=\"#FF0000\"\n"
        "ss:Bold=\"1\"/>\n"
        "</Style>\n"
        "</Styles>\n", fp);
    for(int i=0; i < CORE_MAX; i++)
    {
        // starts a worksheet
        fprintf(fp,"<Worksheet ss:Name=\"Sheet%d\">\n"
            "<Table ss:ExpandedColumnCount=\"%d\" ss:ExpandedRowCount=\"%d\" x:FullColumns=\"1\"\n"
            "x:FullRows=\"1\" ss:DefaultRowHeight=\"15\">\n", i + 1, BLOCK_MAX + 1, ITERATION_MAX + 4);
        fprintf(fp, "<Column ss:Width=\"95.25\"/>\n");
        // Header row: block size in bytes. The value is printed directly, so
        // no fixed-size text buffer can overflow for large sizes.
        fprintf(fp,"<Row ss:StyleID=\"s62\">\n");
        for(int q=0; q < BLOCK_MAX; q++)
        {
            float f = (float)(pow(Factor,q+1)*pow(2.0, BKMG));
            if(q == 0)
            {
                // Column 1 is reserved for the row labels, data starts in column 2.
                fprintf(fp,"<Cell ss:Index=\"2\"><Data ss:Type=\"Number\">%0.3f</Data></Cell>\n", f);
            }
            else
            {
                fprintf(fp,"<Cell><Data ss:Type=\"Number\">%0.3f</Data></Cell>\n", f);
            }
        }
        fprintf(fp,"</Row>\n");
        // One row per timing sample.
        for(int j = 0; j < ITERATION_MAX; j++)
        {
            fprintf(fp,"<Row>\n");
            for(int q = 0; q < BLOCK_MAX; q++)
            {
                if(q == 0)
                {
                    fprintf(fp,"<Cell ss:Index=\"2\"><Data ss:Type=\"Number\">%f</Data></Cell>\n", outputTable[i][q][j].processTime);
                }
                else
                {
                    fprintf(fp,"<Cell><Data ss:Type=\"Number\">%f</Data></Cell>\n", outputTable[i][q][j].processTime);
                }
            }
            fprintf(fp,"</Row>\n");
        }
        // Mean over the ITERATION_MAX sample rows directly above.
        fprintf(fp,"<Row>\n");
        fprintf(fp,"<Cell ss:StyleID=\"s62\"><Data ss:Type=\"String\">Mean</Data></Cell>\n");
        for(int q = 0; q < BLOCK_MAX; q++)
        {
            fprintf(fp," <Cell ss:Formula=\"=AVERAGE(R[%d]C:R[-1]C)\"><Data ss:Type=\"Number\">0</Data></Cell>\n", -ITERATION_MAX);
        }
        fprintf(fp,"</Row>\n");
        // Standard deviation over the same sample rows. The row directly above
        // is the Mean row, so the range has to end two rows up.
        fprintf(fp,"<Row>\n");
        fprintf(fp,"<Cell ss:StyleID=\"s62\"><Data ss:Type=\"String\">Standard Deviation</Data></Cell>\n");
        for(int q=0; q<BLOCK_MAX; q++)
        {
            fprintf(fp," <Cell ss:Formula=\"=STDEV(R[%d]C:R[-2]C)\"><Data ss:Type=\"Number\">0</Data></Cell>\n", -(ITERATION_MAX + 1));
        }
        fprintf(fp,"</Row>\n");
        // Time per single increment: mean / counters in the block / COUNTERS_MAX,
        // with counters in the block = 2^BKMG / 4 * Factor^(q+1).
        fprintf(fp,"<Row>\n");
        fprintf(fp,"<Cell ss:StyleID=\"s62\"><Data ss:Type=\"String\">Add Latency</Data></Cell>\n");
        for(int q=0; q<BLOCK_MAX; q++)
        {
            fprintf(fp," <Cell ss:Formula=\"=R[-2]C/(2^%d/4)/%d/%f^%d\"><Data ss:Type=\"Number\">0</Data></Cell>\n", BKMG, COUNTERS_MAX, Factor, q + 1);
        }
        fprintf(fp,"</Row>\n");
        //end of worksheet
        fprintf(fp,"</Table>\n</Worksheet>\n");
    }
    //end of file
    fprintf(fp,"</Workbook>");
    // fclose() flushes the buffered data, so a failure here means lost output.
    if(fclose(fp) != 0)
    {
        perror(name);
    }
}
#endif
int
main(int argc, char **argv)
{
mkdir("./Resaults", 0777);
int ret;
unsigned lcore_id;
pthread_attr_t attr;
pthread_mutex_init(&mutexLock_, NULL);
pthread_cond_init(&newIterWaitHandle, NULL);
pthread_cond_init(&readWaitHandle, NULL);
ret = rte_eal_init(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Cannot init EAL\n");
struct lcore_params params[CORE_MAX];
char numT[5];
sprintf(numT, "%d", CORE_MAX);
for(int i = 0; i < CORE_MAX; i++)
{
// Generates some structures to hold information of assinged job of each core
struct data* commonMem = (struct data*)rte_malloc(NULL, sizeof(struct data), 0);
#ifndef EXCEL_OUTPUT
char num[5];
sprintf(num, "%d", i);
char name3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
strcat(name3, "./Resaults/");
strcat(name3, testNumber);
mkdir(name3, 0777);
strcat(name3, "/R");
strcat(name3, num);
strcat(name3, "_");
strcat(name3, numT);
strcat(name3, "Core");
mkdir(name3, 0777);
char name2[] = {'/','R', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
strcat(name2, num);
strcat(name2, "_");
strcat(name2, names[0]);
strcat(name2, ".txt");
strcat(name3, name2);
params[i].fp = fopen(name3, "w");
#endif
mem_alloc(commonMem, sizes[0], 0);
params[i].valueMem = commonMem;
params[i].index = i;
params[i].iteration = 0;
commonMem->value[i] = NULL;
}
/*
printf("sleep ...\n");
for(int f=0;f<4; f++)
{
sleep(1);
}
*/
/*
double p=0;
for(double f=0;f<1e9; f+=0.3)
{
p+=0.1;
}*/
printf("Starting lcores ...\n");
printf("RTE_MAX_LCORE = %d\n", RTE_MAX_LCORE);
lcore_id = rte_get_next_lcore(-1, 1, 0);
processedCount = 0;
// Ask each core do the funtion lcore_recv
for(int i = 0; i < CORE_MAX; i++)
{
rte_eal_remote_launch((lcore_function_t*)lcore_recv, ¶ms[i], lcore_id);
lcore_id = rte_get_next_lcore(lcore_id, 0, 1);
}
// For each core do the function for "BLOCK_MAX" times
for(int j = 1; j <= BLOCK_MAX; j++)
{
printf("Iteration : %d\n", j);
pthread_mutex_lock(&mutexLock_);
while(processedCount < CORE_MAX)
{
pthread_cond_wait(&readWaitHandle, &mutexLock_);
}
for(int i = 0; i < CORE_MAX; i++)
{
#ifndef EXCEL_OUTPUT
fclose(params[i].fp);
if(j < BLOCK_MAX)
{
char num[5];
sprintf(num, "%d", i);
char name3[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
strcat(name3, "./Resaults/");
strcat(name3, testNumber);
mkdir(name3, 0777);
strcat(name3, "/R");
strcat(name3, num);
strcat(name3, "_");
strcat(name3, numT);
strcat(name3, "Core");
mem_alloc( params[i].valueMem, sizes[j], j);
char name2[] = {'/','R', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
strcat(name2, num);
strcat(name2, "_");
strcat(name2, names[j]);
strcat(name2, ".txt");
strcat(name3, name2);
params[i].fp = fopen(name3,"w");
params[i].iteration = j;
}
#else
mem_alloc( params[i].valueMem, sizes[j], j);
params[i].iteration = j;
#endif
}
if(j < BLOCK_MAX)
{
printf("%d : New Data Added ----------\n", j);
}
else
{
canContinue_ = false;
}
//Signal cores in order to start new iteration
processedCount = 0;
for(int i = 0; i < CORE_MAX; i++)
{
pthread_cond_signal(&newIterWaitHandle);
}
pthread_mutex_unlock(&mutexLock_);
}
printf("Waiting for lcores to finish ...\n");
#ifdef EXCEL_OUTPUT
saveToExcelFile();
#endif
rte_eal_mp_wait_lcore();
return 0;
}
我使用下面的 run.sh 脚本运行程序:
#!/bin/sh
# Core mask 0x55 enables lcores 0, 2, 4 and 6; lcore 0 is the master.
./build/app/Mahdi_test -c 0x55 --master-lcore 0
位于 TEST TEST ON 与 TEST TEST OFF 之间的代码(内层循环)对应的汇编如下:
// TEST TEST ON
clock_gettime(1, &t1);
47: 48 89 e6 mov rsi,rsp
4a: bf 01 00 00 00 mov edi,0x1
4f: e8 00 00 00 00 call 54 <lcore_recv+0x54>
54: 8b 4b 08 mov ecx,DWORD PTR [rbx+0x8]
57: be 10 27 00 00 mov esi,0x2710
5c: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
{
for (int i = 0; i < d->count; i++)
60: 85 c9 test ecx,ecx
62: 74 1d je 81 <lcore_recv+0x81>
64: 48 8b 03 mov rax,QWORD PTR [rbx]
67: 31 d2 xor edx,edx
69: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
{
d->value[i]++;
70: 83 00 01 add DWORD PTR [rax],0x1
double processTime = 0;
// TEST TEST ON
clock_gettime(1, &t1);
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
{
for (int i = 0; i < d->count; i++)
73: 83 c2 01 add edx,0x1
76: 48 83 c0 04 add rax,0x4
7a: 8b 4b 08 mov ecx,DWORD PTR [rbx+0x8]
7d: 39 ca cmp edx,ecx
7f: 72 ef jb 70 <lcore_recv+0x70>
for(int q = 0; q < ITERATION_MAX; q++)
{
double processTime = 0;
// TEST TEST ON
clock_gettime(1, &t1);
for(uint32_t p = 0; p <= COUNTERS_MAX - 1; p++)
81: 83 ee 01 sub esi,0x1
84: 75 da jne 60 <lcore_recv+0x60>
for (int i = 0; i < d->count; i++)
{
d->value[i]++;
}
}
clock_gettime(1, &t2);
86: 48 8d 74 24 10 lea rsi,[rsp+0x10]
8b: bf 01 00 00 00 mov edi,0x1
90: e8 00 00 00 00 call 95 <lcore_recv+0x95>
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
for (int i = 0; i < d->count; i++)
95: 8b 4b 08 mov ecx,DWORD PTR [rbx+0x8]
clock_gettime(1, &t2);
processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
// TEST TEST OFF
//Checks last value of each counter
int expectedVal = (q + 1) * COUNTERS_MAX;
98: 41 8d 7c 24 01 lea edi,[r12+0x1]
{
d->value[i]++;
}
}
clock_gettime(1, &t2);
processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
9d: c4 e1 f3 2a 4c 24 10 vcvtsi2sd xmm1,xmm1,QWORD PTR [rsp+0x10]
a4: c4 e1 eb 2a 14 24 vcvtsi2sd xmm2,xmm2,QWORD PTR [rsp]
aa: c4 e1 fb 2a 44 24 18 vcvtsi2sd xmm0,xmm0,QWORD PTR [rsp+0x18]
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
for (int i = 0; i < d->count; i++)
b1: 85 c9 test ecx,ecx
{
d->value[i]++;
}
}
clock_gettime(1, &t2);
processTime = (t2.tv_sec*1e9 + t2.tv_nsec) - (t1.tv_sec*1e9 + t1.tv_nsec);/* nanoseconds */
b3: c5 f3 59 0d 00 00 00 vmulsd xmm1,xmm1,QWORD PTR [rip+0x0] # bb <lcore_recv+0xbb>
ba: 00
bb: c5 eb 59 15 00 00 00 vmulsd xmm2,xmm2,QWORD PTR [rip+0x0] # c3 <lcore_recv+0xc3>
c2: 00
c3: c5 f3 58 d8 vaddsd xmm3,xmm1,xmm0
c7: c4 e1 f3 2a 4c 24 08 vcvtsi2sd xmm1,xmm1,QWORD PTR [rsp+0x8]
ce: c5 eb 58 c1 vaddsd xmm0,xmm2,xmm1
d2: c5 e3 5c c0 vsubsd xmm0,xmm3,xmm0
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
for (int i = 0; i < d->count; i++)
d6: 74 6a je 142 <lcore_recv+0x142>
d8: 48 8b 33 mov rsi,QWORD PTR [rbx]
db: 31 c0 xor eax,eax
#ifndef EXCEL_OUTPUT
#ifdef DIRECT_FILE_WRITE
fprintf(fp," Expected : %d\n", expectedVal);
#endif
#endif
bool allOk = true;
dd: ba 01 00 00 00 mov edx,0x1
e2: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
e8: 44 39 2c 86 cmp DWORD PTR [rsi+rax*4],r13d
ec: 41 0f 45 d6 cmovne edx,r14d
f0: 48 83 c0 01 add rax,0x1
for (int i = 0; i < d->count; i++)
f4: 39 c1 cmp ecx,eax
f6: 77 f0 ja e8 <lcore_recv+0xe8>
#endif
}
}