Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752581Ab0HQTuL (ORCPT ); Tue, 17 Aug 2010 15:50:11 -0400 Received: from mga03.intel.com ([143.182.124.21]:11828 "EHLO mga03.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752334Ab0HQTuE (ORCPT ); Tue, 17 Aug 2010 15:50:04 -0400 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="4.56,223,1280732400"; d="scan'208";a="313633220" Date: Tue, 17 Aug 2010 15:50:01 -0400 From: Matthew Wilcox To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org Subject: Re: [TESTCASE] Clean pages clogging the VM Message-ID: <20100817195001.GA18817@linux.intel.com> References: <20100809133000.GB6981@wil.cx> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20100809133000.GB6981@wil.cx> User-Agent: Mutt/1.5.20 (2009-06-14) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 9715 Lines: 315 No comment on this? Was it just that I posted it during the VM summit? On Mon, Aug 09, 2010 at 09:30:00AM -0400, Matthew Wilcox wrote: > > This testcase shows some odd behaviour from the Linux VM. > > It creates a 1TB sparse file, mmaps it, and randomly reads locations > in it. Due to the file being entirely sparse, the VM allocates new pages > and zeroes them. Initially, it runs very fast, taking on the order of > 2.7 to 4us per page fault. Eventually, the VM runs out of free pages, > and starts doing huge amounts of work trying to figure out which of > these clean pages to throw away. In my testing with a 6GB machine > and 2.9GHz CPU, one in every 15,000 page faults takes over a second, > and one in every 40,000 page faults takes over seven seconds! > > This test-case demonstrates a problem that occurs with a read-mostly > mmap of a file on very fast media. I wouldn't like to see a solution > that special-cases zeroed pages. 
I think userspace has done its part > to tell the kernel what it's doing by calling madvise(MADV_RANDOM). > This ought to be enough to hint to the kernel that it should be eagerly > throwing away pages in this VMA. > > > /* > * Copyright (c) 2010, Intel Corporation > * All rights reserved. > * > * Redistribution and use in source and binary forms, with or without > * modification, are permitted provided that the following conditions are met: > * > * * Redistributions of source code must retain the above copyright notice, > * this list of conditions and the following disclaimer. > * * Redistributions in binary form must reproduce the above copyright notice, > * this list of conditions and the following disclaimer in the documentation > * and/or other materials provided with the distribution. > * * Neither the name of Intel Corporation nor the names of its contributors > * may be used to endorse or promote products derived from this software > * without specific prior written permission. > * > * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" > * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE > * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR > * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF > * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS > * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN > * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) > * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE > * POSSIBILITY OF SUCH DAMAGE. 
> */ > > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > #include > > #define rdtscll(val) do { \ > unsigned int __a,__d; \ > asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ > (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \ > } while(0) > > > #define MAX_FILE_SIZE ((off_t)1024 * 1024 * 1024 * 1024) > #define MAX_FILE_IOS 16384 > #define MAX_LATENCY 10000000 // usecs > > #define NUM_IOS 1024 > #define IO_SIZE 4096 > #define BUFFER_SIZE (1024 * 1024) > > pthread_t tid; > double cpu_clock; > long long unsigned cpu_start, cpu_stop; > > void *mmap_test(void *arg); > void die (); > > static const char usage_cmds[] = > "usage: %s [options]\n" > "cmd line options:\n" > " -f file_name Read from File named 'file_name'\n" > " -a file_size File of 'file_size' Bytes/thread\n" > " -b buffer_size Write/Read into/from buffer of 'buffer_size' Bytes/thread\n" > " -n num_file_ios Process 'num_file_ios' IOs\n" > " -s io_size IO Size = 'io_size' Bytes\n" > " -l max_latency Show latency stats based on usecs of max_latency\n" > ; > > void usage(const char *program) > { > fprintf(stderr, usage_cmds, program); > } > > off_t file_size = MAX_FILE_SIZE; // -a > long long unsigned int buffer_size = BUFFER_SIZE; // -b > char *filename = "sparse-file"; // -f > int num_file_ios = NUM_IOS; // -n > int max_latency = MAX_LATENCY; // -l > int io_size = IO_SIZE; // -s > long long unsigned int latency_limit; > > int main(int argc, char **argv) > { > pthread_attr_t attr; > cpu_set_t mask; > FILE *proc; > char buf[256]; > double mhz = 0.0; > > while (1) { > int option = getopt(argc, argv, "a:b:f:h:l:n:p:s:"); > if (option == -1) { > break; > } > switch (option) { > case 'a': > file_size = strtoul(optarg, NULL, 0); > printf("a: file_size:%ld Bytes :%ld MB\n", file_size, file_size/(1024*1024)); > break; > case 'b': > buffer_size = strtoul(optarg, NULL, 0); > printf("b: buffer_size:%lld 
Bytes\n", buffer_size); > break; > case 'f': > filename = optarg; > printf("f: filename:%s\n", filename); > break; > case 'h': > printf("h: options\n"); > goto help; > case 'l': > max_latency = strtoul(optarg, NULL, 0); > printf("l: latency stats based on max latency:%d\n", max_latency); > break; > case 'n': > num_file_ios = strtoul(optarg, NULL, 0); > printf("n: num_file_ios:%d\n", num_file_ios); > if (num_file_ios > MAX_FILE_IOS) { > printf("-n %d Entered > MAX_FILE_IOS:%d\n", num_file_ios, MAX_FILE_IOS); > exit(1); > } > break; > case 's': > io_size = strtoul(optarg, NULL, 0); > printf("s: io_size:%d Bytes\n", io_size); > break; > default: > help: > usage(argv[0]); > printf("default:\n"); > exit(1); > } > } > > proc = fopen("/proc/cpuinfo", "r"); > if (!proc) > return 0.0; > > while (fgets(buf, sizeof buf, proc)) { > double cpu; > > if (sscanf(buf, "cpu MHz : %lf", &cpu) != 1) > continue; > if (mhz == 0.0) { > mhz = cpu; > continue; > } > if (mhz != cpu) { > fprintf(stderr, > "Conflicting CPU frequency values: %lf != %lf\n", > mhz, cpu); > return 0.0; > } > } > fclose(proc); > printf("CPU Clock Freq from /proc/cpuinfo:%.4f\n", mhz); > // > // Measure CPU Core Frequnecy over 5 second period > // > printf("Measuring CPU Frequency......:"); > rdtscll(cpu_start); > usleep(5000000); > rdtscll(cpu_stop); > cpu_clock = (double)((double)(cpu_stop-cpu_start))/(double)5.0; > printf("%.3f\n", cpu_clock); > latency_limit = (long long unsigned int) (cpu_clock*max_latency/1000000); > printf("latency_limit:%llu cycles or %d usecs\n", latency_limit, max_latency); > > pthread_attr_init (&attr); > pthread_attr_setscope (&attr, PTHREAD_SCOPE_SYSTEM); > pthread_attr_setstacksize (&attr, (size_t) (1024*1024)); > > if (pthread_create(&tid, &attr, mmap_test, (void *)(long) 0) != 0) { > die("Thread create failed!"); > } > > CPU_ZERO(&mask); > CPU_SET(0, &mask); > if (pthread_setaffinity_np(tid, sizeof(mask), &mask) ) { > printf("WARNING: could not set CPU Affinity, exit...\n"); > 
exit(1); > } > > pthread_join(tid, NULL); > sleep(1); > > return 0; > } > > > void die(char *string) > { > fprintf(stderr, "\nmmap_test: %s\n", string); > exit(1); > } > > void *mmapfile(char *fname, off_t size, int *filed) > { > int fd; > void *file_addr; > struct stat statbuf; > > fd = open(fname, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); > *filed = fd; > if (fd < 0) { > fprintf(stderr, "unable to open %s to get an FD:%s\n", fname, strerror(errno)); > exit(1); > } > > fstat(fd, &statbuf); > if (statbuf.st_size < size) > ftruncate(fd, size); > > file_addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); > if (file_addr == MAP_FAILED) { > fprintf(stderr, "datafile mmap failed: %s\n", strerror(errno)); > exit(1); > } > > madvise(file_addr, size, MADV_RANDOM); > return file_addr; > } > > void create_offsets(off_t *offset_buf, int threadnum) > { > int i, curr_time; > > curr_time = time(NULL); > srandom(curr_time / (threadnum + 1)); > > for (i = 0; i < num_file_ios; i++) { > double random1 = ((double)(rand()%(RAND_MAX)) / RAND_MAX); > offset_buf[i] = file_size * random1; > offset_buf[i] = offset_buf[i] / io_size * io_size; > } > } > > void *mmap_test(void *arg) > { > int threadnum = (long) arg; > int fd; > char *file_ptr, *file_addr; > char *buf_ptr, *buf_addr = NULL; > int i, j, ios; > off_t offset_buf[MAX_FILE_IOS]; > unsigned long long latency_start, latency_stop; > > posix_memalign((void *)&buf_addr, 4096, buffer_size); > > file_addr = mmapfile(filename, file_size, &fd); > > ios = buffer_size/io_size; > > create_offsets(offset_buf, threadnum); > > for (j = 0; j < num_file_ios; j++) { > buf_ptr = buf_addr; > file_ptr = file_addr + offset_buf[j]; > > for (i = 0; i < ios; i++) { > rdtscll(latency_start); > *buf_ptr = *(char *)file_ptr; > rdtscll(latency_stop); > printf("%lld\n", latency_stop - latency_start); > buf_ptr += io_size; > file_ptr += io_size; > } > } > > close(fd); > munmap(file_addr, file_size); > free(buf_addr); > > pthread_exit(NULL); > return 
0; > } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/