Linux下memcpy性能测试

789人浏览   2024-04-13 16:34:26

最近在使用x86平台(硬件环境:Intel Xeon E5V4,DDR4 32GB )时,发现性能存在一个瓶颈,和用户预期效果存在一点差距。于是排查用户代码,发现其中存在一个对大段内存拷贝的操作,也就是使用memcpy一次拷贝4MB以上数据,心想如果能减少拷贝时间应该就能解决问题。于是自己写了一个测试程序专门用来测试memcpy的时间。代码如下:

/*
 ============================================================================
 Name : test_mem.c
 Author : 111
 Version :
 Copyright : Your copyright notice
 Description : memcpy benchmark, one memcpy call per block (64KB to 8MB)
 ============================================================================
 */
 
 
#include <assert.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
 
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#define __USE_GNU
#include <sched.h>
#include <pthread.h>
 
#define COUNTNUMBER 1000
#define SENDCORE 5
 
/*
 * Subtract *t2 from *t1 in place (t1 = t1 - t2).
 *
 * Both arguments must be normalized on entry, i.e. 0 <= tv_nsec < 1e9;
 * this is enforced with assert().  The seconds and nanoseconds fields are
 * subtracted independently and a single carry/borrow between them is then
 * applied so that the result's tv_nsec is back in [0, 1e9).
 */
static void timespec_sub(struct timespec *t1, const struct timespec *t2)
{
	/* Preconditions: both operands are normalized. */
	assert(t1->tv_nsec >= 0);
	assert(t1->tv_nsec < 1000000000);
	assert(t2->tv_nsec >= 0);
	assert(t2->tv_nsec < 1000000000);

	/* Field-wise difference; the two fields do not depend on each other. */
	t1->tv_nsec -= t2->tv_nsec;
	t1->tv_sec -= t2->tv_sec;

	if (t1->tv_nsec < 0) {
		/* Borrow one second to make the nanoseconds non-negative. */
		t1->tv_nsec += 1000000000;
		t1->tv_sec -= 1;
	} else if (t1->tv_nsec >= 1000000000) {
		/* Carry a whole second out of the nanoseconds. */
		t1->tv_nsec -= 1000000000;
		t1->tv_sec += 1;
	}
}
 
/*
 * memcpy() benchmark, one call per copy.
 *
 * When SENDCORE is non-zero the process is first pinned to that CPU; a
 * pinning failure is reported with perror() but is not fatal.  Two
 * 0x60000000-byte buffers aligned to 4096 bytes are then allocated and
 * eight rounds are run.  Each round times COUNTNUMBER memcpy() calls of
 * `size` bytes with CLOCK_MONOTONIC and prints the mean time per call in
 * microseconds; `size` starts at 65536 and doubles after every round.
 *
 * Every call copies from the start of the source buffer to the start of
 * the destination buffer, so a round re-copies the same region
 * COUNTNUMBER times rather than walking through the buffers.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when a buffer cannot be allocated
 * or the clock cannot be read.  Both buffers are released before returning.
 */
int main1(void) {
	enum {
		BUF_BYTES = 0x60000000,	/* size of each buffer */
		BUF_ALIGN = 4096,	/* requested buffer alignment */
		ROUNDS = 8		/* number of block sizes measured */
	};
	char *src = NULL;
	char *dst = NULL;
	void *mem = NULL;
	struct timespec ts_start, ts_end;
	unsigned long long timec = 0;
	int size = 65536;
	int status = EXIT_FAILURE;
	int i;
	int j;

	if (SENDCORE) {
		cpu_set_t cpu_set;
		CPU_ZERO(&cpu_set);
		CPU_SET(SENDCORE, &cpu_set);
		if (-1 == sched_setaffinity(0, sizeof(cpu_set_t), &cpu_set)) {
			perror("send process band core error.");
		}
	}

	/* posix_memalign() reports failure through its return value; go
	 * through a void * so no pointer-type cast is needed. */
	if (posix_memalign(&mem, BUF_ALIGN, BUF_BYTES) != 0) {
		printf("posix_memalign fail.\n");
		return EXIT_FAILURE;
	}
	src = mem;

	if (posix_memalign(&mem, BUF_ALIGN, BUF_BYTES) != 0) {
		printf("posix_memalign fail.\n");
		goto out;
	}
	dst = mem;

	for (j = 0; j < ROUNDS; j++) {
		if (clock_gettime(CLOCK_MONOTONIC, &ts_start) != 0) {
			perror("clock_gettime");
			goto out;
		}

		for (i = 0; i < COUNTNUMBER; i++) {
			memcpy(dst, src, (size_t)size);
#if defined(__GNUC__)
			/* Compiler-only barrier (emits no instructions): the
			 * copied bytes are never read back, so without it an
			 * optimizing build may discard the copy as a dead
			 * store. */
			__asm__ __volatile__("" : : "r"(dst) : "memory");
#endif
		}

		if (clock_gettime(CLOCK_MONOTONIC, &ts_end) != 0) {
			perror("clock_gettime");
			goto out;
		}

		/* ts_end becomes the elapsed time of the whole round. */
		timespec_sub(&ts_end, &ts_start);
		/* Elapsed time in whole microseconds. */
		timec = ts_end.tv_sec * 1000000 + ts_end.tv_nsec / 1000;
		printf("CLOCK_MONOTONIC reports %.2fus for copy %dtimes %dB\n", (1.0 * timec) / COUNTNUMBER, COUNTNUMBER, size);
		size = size * 2;
	}

	status = EXIT_SUCCESS;

out:
	free(dst);
	free(src);
	return status;
}

上面代码进行了绑核,4MB的数据拷贝运行时间要450us以上。

查阅资料发现memcpy除了开多线程拷贝外,并没有什么好的办法,请教了搞Linux操作系统的专业人士,他们修改了我的测试代码,让memcpy一次拷贝4KB(正好为Linux的Page页大小),共拷贝1024次。如下:

/*
 ============================================================================
 Name : test_mem.c
 Author : 111
 Version :
 Copyright : Your copyright notice
 Description : memcpy benchmark, each block copied in 4096-byte chunks
 ============================================================================
 */
 
#include <assert.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
 
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#define __USE_GNU
#include <sched.h>
#include <pthread.h>
 
#define COUNTNUMBER 1000
#define SENDCORE 2
 
/*
 * In-place timespec subtraction: *t1 = *t1 - *t2.
 *
 * Requires both operands to be normalized (0 <= tv_nsec < 1e9), which is
 * checked with assert().  After the field-wise subtraction at most one
 * second is moved between tv_sec and tv_nsec to bring tv_nsec back into
 * [0, 1e9).
 */
static void timespec_sub(struct timespec *t1, const struct timespec *t2) {
	/* Reject denormalized operands up front. */
	assert(t1->tv_nsec >= 0);
	assert(t1->tv_nsec < 1000000000);
	assert(t2->tv_nsec >= 0);
	assert(t2->tv_nsec < 1000000000);

	/* Subtract each field on its own. */
	t1->tv_nsec -= t2->tv_nsec;
	t1->tv_sec -= t2->tv_sec;

	if (t1->tv_nsec < 0) {
		/* Negative nanoseconds: borrow from the seconds. */
		t1->tv_nsec += 1000000000;
		t1->tv_sec -= 1;
	} else if (t1->tv_nsec >= 1000000000) {
		/* A full second of nanoseconds: carry into the seconds. */
		t1->tv_nsec -= 1000000000;
		t1->tv_sec += 1;
	}
}
 
/*
 * memcpy() benchmark, each block copied in 4096-byte chunks.
 *
 * When SENDCORE is non-zero the process is first pinned to that CPU; a
 * pinning failure is reported with perror() but is not fatal.  Two
 * 0x60000000-byte buffers are allocated and eight rounds are run.  Each
 * round copies the first `size` bytes of the source buffer to the start of
 * the destination buffer COUNTNUMBER times, as size/4096 consecutive
 * memcpy() calls of 4096 bytes, timed with CLOCK_MONOTONIC.  It prints the
 * number of chunks per copy and the mean time per copy in microseconds;
 * `size` starts at 65536 and doubles after every round.
 *
 * The buffers are requested with 1024-byte alignment only, so the 4096-byte
 * chunks are not guaranteed to start on a 4096-byte boundary.
 *
 * Returns EXIT_SUCCESS, or -1 when a buffer cannot be allocated or the
 * clock cannot be read.  Both buffers are released before returning.
 */
int main(void) {
	enum {
		BUF_BYTES = 0x60000000,	/* size of each buffer */
		BUF_ALIGN = 1024,	/* requested buffer alignment */
		CHUNK_BYTES = 4096,	/* bytes moved per memcpy() call */
		ROUNDS = 8		/* number of block sizes measured */
	};
	char *src = NULL;
	char *dst = NULL;
	void *mem = NULL;
	struct timespec ts_start, ts_end;
	unsigned long long timec = 0;
	int size = 65536;
	int status = -1;
	int i;
	int j;
	int k;

	if (SENDCORE) {
		cpu_set_t cpu_set;
		CPU_ZERO(&cpu_set);
		CPU_SET(SENDCORE, &cpu_set);
		if (-1 == sched_setaffinity(0, sizeof(cpu_set_t), &cpu_set)) {
			perror("send process band core error.");
		}
	}

	/* posix_memalign() reports failure through its return value; go
	 * through a void * so no pointer-type cast is needed. */
	if (posix_memalign(&mem, BUF_ALIGN, BUF_BYTES) != 0) {
		printf("posix_memalign fail.\n");
		return -1;
	}
	src = mem;

	if (posix_memalign(&mem, BUF_ALIGN, BUF_BYTES) != 0) {
		printf("posix_memalign fail.\n");
		goto out;
	}
	dst = mem;

	for (k = 0; k < ROUNDS; k++) {
		/* Chunks per copy; constant for the round, so computed outside
		 * the timed region. */
		const int chunks = size / CHUNK_BYTES;
		int count = 0;

		if (clock_gettime(CLOCK_MONOTONIC, &ts_start) != 0) {
			perror("clock_gettime");
			goto out;
		}

		for (i = 0; i < COUNTNUMBER; i++) {
			char *psrc = src;
			char *pdst = dst;
			/* Change to "#if 1" to time one memcpy() of the whole
			 * block instead, for comparison. */
#if 0
			memcpy(pdst, psrc, size);
#else
			for (j = 0; j < chunks; j++) {
				memcpy(pdst, psrc, CHUNK_BYTES);
				pdst += CHUNK_BYTES;
				psrc += CHUNK_BYTES;
				count++;
			}
#endif
#if defined(__GNUC__)
			/* Compiler-only barrier (emits no instructions): the
			 * copied bytes are never read back, so without it an
			 * optimizing build may discard the copies as dead
			 * stores. */
			__asm__ __volatile__("" : : "r"(dst) : "memory");
#endif
		}

		if (clock_gettime(CLOCK_MONOTONIC, &ts_end) != 0) {
			perror("clock_gettime");
			goto out;
		}

		printf("copy data by 4kb total count is %d\n", count / COUNTNUMBER);
		/* ts_end becomes the elapsed time of the whole round. */
		timespec_sub(&ts_end, &ts_start);
		/* Elapsed time in whole microseconds. */
		timec = ts_end.tv_sec * 1000000 + ts_end.tv_nsec / 1000;
		printf("CLOCK_MONOTONIC reports %.2fus for copy %dtimes %dKB\n",
				(1.0 * timec) / COUNTNUMBER, COUNTNUMBER, size / 1024);
		size = size * 2;
	}

	status = EXIT_SUCCESS;

out:
	free(dst);
	free(src);
	return status;
}

这样测试时间就不到300us,具体对比如下:上面为修改过后的结果

为什么切分为按Page页大小拷贝相同的数据效率能提升这么多,很奇怪。翻墙看了一下,原来国外也有人遇到过这个问题:

https://stackoverflow.com/questions/21038965/why-does-the-speed-of-memcpy-drop-dramatically-every-4kb

解释贴在下面了

Memory is usually organized in 4k pages (although there's also support for larger sizes). The virtual address space your program sees may be contiguous, but it's not necessarily the case in physical memory. The OS, which maintains a mapping of virtual to physical addresses (in the page map) would usually try to keep the physical pages together as well but that's not always possible and they may be fractured (especially on long usage where they may be swapped occasionally).

When your memory stream crosses a 4k page boundary, the CPU needs to stop and go fetch a new translation - if it already saw the page, it may be cached in the TLB, and the access is optimized to be the fastest, but if this is the first access (or if you have too many pages for the TLBs to hold on to), the CPU will have to stall the memory access and start a page walk over the page map entries - that's relatively long as each level is in fact a memory read by itself (on virtual machines it's even longer as each level may need a full pagewalk on the host).

具体意思就是按4K页大小拷贝,不存在跨页和新的地址转换操作,节省了时间开销。

看来以后使用memcpy可以通过这种方式提升效率,又学到了一点新东西。

---------------------


相关推荐