/*
 * libpmem: IO engine that uses PMDK libpmem to read and write data
 *
 * Copyright (C) 2017 Nippon Telegraph and Telephone Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License,
 * version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

/*
 * libpmem engine
 *
 * IO engine that uses libpmem to read and write data
 *
 * To use:
 *   ioengine=libpmem
 *
 * Other relevant settings:
 *   iodepth=1
 *   direct=1
 *   directory=/mnt/pmem0/
 *   bs=4k
 *
 *   direct=1 means that pmem_drain() is executed for each write operation.
 *   In contrast, direct=0 means that pmem_drain() is not executed.
 *
 *   The pmem device must contain a DAX-capable filesystem and be mounted
 *   with DAX enabled. 'directory' must point to the mount point of that
 *   DAX filesystem.
 *
 *   Example:
 *     mkfs.xfs /dev/pmem0
 *     mkdir /mnt/pmem0
 *     mount -o dax /dev/pmem0 /mnt/pmem0
 *
 * See examples/libpmem.fio for more.
 *
 * libpmem.so
 *   By default, the libpmem engine will let the system find the libpmem.so
 *   that it uses. You can use an alternative libpmem by setting the
 *   FIO_PMEM_LIB environment variable to the full path to the desired
 *   libpmem.so.
 */
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <libpmem.h>

#include "../fio.h"
#include "../verify.h"

/*
 * Limits us to 1GiB of mapped files in total, modeled after the behavior
 * of the mmap engine this code was derived from.
 */
#define MMAP_TOTAL_SZ	(1 * 1024 * 1024 * 1024UL)

struct fio_libpmem_data {
	void *libpmem_ptr;
	size_t libpmem_sz;
	off_t libpmem_off;
};

#define MEGABYTE	((uintptr_t)1 << 20)
#define GIGABYTE	((uintptr_t)1 << 30)
#define PROCMAXLEN	2048	/* maximum expected line length in /proc files */
#define roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

static bool Mmap_no_random;
static void *Mmap_hint;
static unsigned long long Mmap_align;

/*
 * util_map_hint_align -- choose the desired mapping alignment
 *
 * Use 2MB/1GB page alignment only if the mapping length is at least
 * twice as big as the page size.
 */
static inline size_t util_map_hint_align(size_t len, size_t req_align)
{
	size_t align = Mmap_align;

	dprint(FD_IO, "DEBUG util_map_hint_align\n");

	if (req_align)
		align = req_align;
	else if (len >= 2 * GIGABYTE)
		align = GIGABYTE;
	else if (len >= 4 * MEGABYTE)
		align = 2 * MEGABYTE;

	dprint(FD_IO, "align=%zu\n", align);
	return align;
}

#ifdef __FreeBSD__
static const char *sscanf_os = "%p %p";
#define MAP_NORESERVE 0
#define OS_MAPFILE "/proc/curproc/map"
#else
static const char *sscanf_os = "%p-%p";
#define OS_MAPFILE "/proc/self/maps"
#endif
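/*
 * For illustration of the gap search done by util_map_hint_unused() below
 * (all addresses hypothetical): given minaddr = 0x7f0000000000, len = 2MB,
 * align = 2MB, and these /proc/self/maps entries
 *
 *   7f0000000000-7f0000200000 ...
 *   7f0000400000-7f0000600000 ...
 *
 * the first entry pushes the candidate address up to 0x7f0000200000, and
 * the hole below the second entry is exactly 2MB, so 0x7f0000200000 is
 * returned as the hint.
 */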
/*
 * util_map_hint_unused -- use /proc to determine a hint address for mmap()
 *
 * This is a helper function for util_map_hint().
 * It opens up /proc/self/maps and looks for the first unused address
 * in the process address space that is:
 * - greater than or equal to the 'minaddr' argument,
 * - large enough to hold a range of the given length,
 * - aligned to the specified unit.
 *
 * Asking for an aligned address like this will allow the DAX code to use
 * large mappings. It is not an error if mmap() ignores the hint and chooses
 * a different address.
 */
static char *util_map_hint_unused(void *minaddr, size_t len, size_t align)
{
	char *lo = NULL;	/* beginning of current range in maps file */
	char *hi = NULL;	/* end of current range in maps file */
	char *raddr = minaddr;	/* ignore regions below 'minaddr' */

#ifdef WIN32
	MEMORY_BASIC_INFORMATION mi;
#else
	FILE *fp;
	char line[PROCMAXLEN];	/* for fgets() */
#endif

	dprint(FD_IO, "DEBUG util_map_hint_unused\n");
	assert(align > 0);

	if (raddr == NULL)
		raddr += page_size;

	raddr = (char *)roundup((uintptr_t)raddr, align);

#ifdef WIN32
	while ((uintptr_t)raddr < UINTPTR_MAX - len) {
		size_t ret = VirtualQuery(raddr, &mi, sizeof(mi));
		if (ret == 0) {
			ERR("VirtualQuery %p", raddr);
			return MAP_FAILED;
		}
		dprint(FD_IO, "addr %p len %zu state %d",
			mi.BaseAddress, mi.RegionSize, mi.State);

		if ((mi.State != MEM_FREE) || (mi.RegionSize < len)) {
			raddr = (char *)mi.BaseAddress + mi.RegionSize;
			raddr = (char *)roundup((uintptr_t)raddr, align);
			dprint(FD_IO, "nearest aligned addr %p", raddr);
		} else {
			dprint(FD_IO, "unused region of size %zu found at %p",
				mi.RegionSize, mi.BaseAddress);
			return mi.BaseAddress;
		}
	}

	dprint(FD_IO, "end of address space reached");
	return MAP_FAILED;
#else
	fp = fopen(OS_MAPFILE, "r");
	if (!fp) {
		log_err("!%s\n", OS_MAPFILE);
		return MAP_FAILED;
	}

	while (fgets(line, PROCMAXLEN, fp) != NULL) {
		/* check for range line */
		if (sscanf(line, sscanf_os, &lo, &hi) == 2) {
			dprint(FD_IO, "%p-%p\n", lo, hi);
			if (lo > raddr) {
				if ((uintptr_t)(lo - raddr) >= len) {
					dprint(FD_IO, "unused region of size "
						"%zu found at %p\n",
						lo - raddr, raddr);
					break;
				} else {
					dprint(FD_IO, "region is too small: "
						"%zu < %zu\n",
						lo - raddr, len);
				}
			}

			if (hi > raddr) {
				raddr = (char *)roundup((uintptr_t)hi, align);
				dprint(FD_IO, "nearest aligned addr %p\n",
					raddr);
			}

			if (raddr == 0) {
				dprint(FD_IO, "end of address space reached\n");
				break;
			}
		}
	}

	/*
	 * Check for the case when this is the last unused range in the
	 * address space, but it is not large enough. (very unlikely)
	 */
	if ((raddr != NULL) && (UINTPTR_MAX - (uintptr_t)raddr < len)) {
		dprint(FD_IO, "end of address space reached");
		raddr = MAP_FAILED;
	}

	fclose(fp);

	dprint(FD_IO, "returning %p", raddr);
	return raddr;
#endif
}
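/*
 * util_map_hint() below honors the PMEM_MMAP_HINT environment variable,
 * parsed as a hexadecimal address. For example (a hypothetical invocation,
 * not part of this engine):
 *
 *   PMEM_MMAP_HINT=0x10000000000 fio job.fio
 *
 * disables mapping address randomization and makes the engine search for
 * the first free, properly aligned region at or above 0x10000000000.
 */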
/*
 * util_map_hint -- determine hint address for mmap()
 *
 * If the PMEM_MMAP_HINT environment variable is not set, we let the system
 * pick the randomized mapping address. Otherwise, a user-defined hint
 * address is used.
 *
 * Windows Environment:
 *   XXX - Windows doesn't support large DAX pages yet, so there is
 *   no point in aligning for them.
 *
 * Except for Windows Environment:
 *   ASLR in the 64-bit Linux kernel uses 28 bits of randomness for mmap
 *   (bit positions 12-39), which means the base mapping address is
 *   randomized within the [0..1024GB] range, with 4KB granularity.
 *   Assuming an additional 1GB alignment, this results in 1024 possible
 *   locations.
 *
 * Configuring the hint address via the PMEM_MMAP_HINT environment variable
 * disables address randomization. In such a case, the function will search
 * for the first unused, properly aligned region of the given size, above
 * the specified address.
 */
static char *util_map_hint(size_t len, size_t req_align)
{
	char *addr;
	size_t align = 0;
	char *e = NULL;

	dprint(FD_IO, "DEBUG util_map_hint\n");
	dprint(FD_IO, "len %zu req_align %zu\n", len, req_align);

	/* choose the desired alignment based on the requested length */
	align = util_map_hint_align(len, req_align);

	e = getenv("PMEM_MMAP_HINT");
	if (e) {
		char *endp;
		unsigned long long val = 0;

		errno = 0;

		val = strtoull(e, &endp, 16);
		if (errno || endp == e) {
			dprint(FD_IO, "Invalid PMEM_MMAP_HINT\n");
		} else {
			Mmap_hint = (void *)val;
			Mmap_no_random = true;
			dprint(FD_IO, "PMEM_MMAP_HINT set to %p\n", Mmap_hint);
		}
	}

	if (Mmap_no_random) {
		dprint(FD_IO, "user-defined hint %p\n", (void *)Mmap_hint);
		addr = util_map_hint_unused((void *)Mmap_hint, len, align);
	} else {
		/*
		 * Create a dummy mapping to find an unused region of the
		 * given size. Request an increased size to allow for later
		 * address alignment.
		 *
		 * Windows Environment:
		 *   Use the MAP_NORESERVE flag to only reserve the range of
		 *   pages rather than commit. We don't want the pages to be
		 *   actually backed by the operating system paging file, as
		 *   the swap file is usually too small to handle terabyte
		 *   pools.
		 *
		 * Except for Windows Environment:
		 *   Use MAP_PRIVATE with read-only access to simulate
		 *   zero cost for overcommit accounting. Note: the
		 *   MAP_NORESERVE flag is ignored if overcommit is disabled
		 *   (mode 2).
		 */
#ifndef WIN32
		addr = mmap(NULL, len + align, PROT_READ,
				MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
#else
		addr = mmap(NULL, len + align, PROT_READ,
				MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
				-1, 0);
#endif
		if (addr != MAP_FAILED) {
			dprint(FD_IO, "system choice %p\n", addr);
			munmap(addr, len + align);
			addr = (char *)roundup((uintptr_t)addr, align);
		}
	}

	dprint(FD_IO, "hint %p\n", addr);
	return addr;
}

/*
 * This is the mmap execution function
 */
static int fio_libpmem_file(struct thread_data *td, struct fio_file *f,
			    size_t length, off_t off)
{
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
	int flags = 0;
	void *addr = NULL;

	dprint(FD_IO, "DEBUG fio_libpmem_file\n");

	if (td_rw(td))
		flags = PROT_READ | PROT_WRITE;
	else if (td_write(td)) {
		flags = PROT_WRITE;

		if (td->o.verify != VERIFY_NONE)
			flags |= PROT_READ;
	} else
		flags = PROT_READ;

	dprint(FD_IO, "f->file_name = %s td->o.verify = %d\n", f->file_name,
			td->o.verify);
	dprint(FD_IO, "length = %zu flags = %d f->fd = %d off = %ld\n",
			length, flags, f->fd, (long)off);

	addr = util_map_hint(length, 0);

	fdd->libpmem_ptr = mmap(addr, length, flags, MAP_SHARED, f->fd, off);
	if (fdd->libpmem_ptr == MAP_FAILED) {
		fdd->libpmem_ptr = NULL;
		td_verror(td, errno, "mmap");
	}

	if (td->error && fdd->libpmem_ptr)
		munmap(fdd->libpmem_ptr, length);

	return td->error;
}

/*
 * XXX Just mmap an appropriate portion, we cannot mmap the full extent
 */
static int fio_libpmem_prep_limited(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);

	dprint(FD_IO, "DEBUG fio_libpmem_prep_limited\n");

	if (io_u->buflen > f->real_file_size) {
		log_err("libpmem: bs too big for libpmem engine\n");
		return EIO;
	}

	fdd->libpmem_sz = min(MMAP_TOTAL_SZ, f->real_file_size);
	if (fdd->libpmem_sz > f->io_size)
		fdd->libpmem_sz = f->io_size;

	fdd->libpmem_off = io_u->offset;

	return fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
}
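/*
 * A worked example of the limited path above (hypothetical sizes): with a
 * 4GiB file whose full extent cannot be mapped, an IO at offset 2GiB maps
 * a min(MMAP_TOTAL_SZ, real_file_size) = min(1GiB, 4GiB) = 1GiB window
 * starting at libpmem_off = 2GiB. Subsequent IOs falling inside
 * [2GiB, 3GiB) reuse that mapping; the first IO outside it triggers a
 * munmap and remap in fio_libpmem_prep().
 */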
/*
 * Attempt to mmap the entire file
 */
static int fio_libpmem_prep_full(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
	int ret;

	dprint(FD_IO, "DEBUG fio_libpmem_prep_full\n");

	if (fio_file_partial_mmap(f))
		return EINVAL;

	dprint(FD_IO, "f->io_size %ld : io_u->offset %llu\n",
			f->io_size, io_u->offset);

	if (io_u->offset != (size_t)io_u->offset ||
	    f->io_size != (size_t)f->io_size) {
		fio_file_set_partial_mmap(f);
		return EINVAL;
	}

	fdd->libpmem_sz = f->io_size;
	fdd->libpmem_off = 0;

	ret = fio_libpmem_file(td, f, fdd->libpmem_sz, fdd->libpmem_off);
	if (ret)
		fio_file_set_partial_mmap(f);

	return ret;
}

static int fio_libpmem_prep(struct thread_data *td, struct io_u *io_u)
{
	struct fio_file *f = io_u->file;
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);
	int ret;

	dprint(FD_IO, "DEBUG fio_libpmem_prep\n");
	/*
	 * If it fits within the existing mapping, use it
	 */
	dprint(FD_IO, "io_u->offset %llu : fdd->libpmem_off %llu : "
			"io_u->buflen %llu : fdd->libpmem_sz %llu\n",
			io_u->offset, (unsigned long long)fdd->libpmem_off,
			io_u->buflen, (unsigned long long)fdd->libpmem_sz);

	if (io_u->offset >= fdd->libpmem_off &&
	    io_u->offset + io_u->buflen <=
			fdd->libpmem_off + fdd->libpmem_sz)
		goto done;

	/*
	 * unmap any existing mapping
	 */
	if (fdd->libpmem_ptr) {
		dprint(FD_IO, "munmap\n");
		if (munmap(fdd->libpmem_ptr, fdd->libpmem_sz) < 0)
			return errno;
		fdd->libpmem_ptr = NULL;
	}

	if (fio_libpmem_prep_full(td, io_u)) {
		td_clear_error(td);
		ret = fio_libpmem_prep_limited(td, io_u);
		if (ret)
			return ret;
	}

done:
	io_u->mmap_data = fdd->libpmem_ptr + io_u->offset -
				fdd->libpmem_off - f->file_offset;
	return 0;
}

static enum fio_q_status fio_libpmem_queue(struct thread_data *td,
					   struct io_u *io_u)
{
	fio_ro_check(td, io_u);
	io_u->error = 0;

	dprint(FD_IO, "DEBUG fio_libpmem_queue\n");

	switch (io_u->ddir) {
	case DDIR_READ:
		memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen);
		break;
	case DDIR_WRITE:
		dprint(FD_IO, "DEBUG mmap_data=%p, xfer_buf=%p\n",
				io_u->mmap_data, io_u->xfer_buf);
		dprint(FD_IO, "td->o.odirect %d\n", td->o.odirect);
		if (td->o.odirect) {
			pmem_memcpy_persist(io_u->mmap_data,
						io_u->xfer_buf,
						io_u->xfer_buflen);
		} else {
			pmem_memcpy_nodrain(io_u->mmap_data,
						io_u->xfer_buf,
						io_u->xfer_buflen);
		}
		break;
	case DDIR_SYNC:
	case DDIR_DATASYNC:
	case DDIR_SYNC_FILE_RANGE:
		break;
	default:
		io_u->error = EINVAL;
		break;
	}

	return FIO_Q_COMPLETED;
}

static int fio_libpmem_init(struct thread_data *td)
{
	struct thread_options *o = &td->o;

	dprint(FD_IO, "DEBUG fio_libpmem_init\n");
	dprint(FD_IO, "o->rw_min_bs %llu\n o->fsync_blocks %u\n "
			"o->fdatasync_blocks %u\n",
			o->rw_min_bs, o->fsync_blocks, o->fdatasync_blocks);

	if ((o->rw_min_bs & page_mask) &&
	    (o->fsync_blocks || o->fdatasync_blocks)) {
		log_err("libpmem: mmap options dictate a minimum block size "
			"of %llu bytes\n", (unsigned long long)page_size);
		return 1;
	}

	return 0;
}

static int fio_libpmem_open_file(struct thread_data *td, struct fio_file *f)
{
	struct fio_libpmem_data *fdd;
	int ret;

	dprint(FD_IO, "DEBUG fio_libpmem_open_file\n");
	dprint(FD_IO, "f->io_size=%ld\n", f->io_size);
	dprint(FD_IO, "td->o.size=%lld\n", td->o.size);
	dprint(FD_IO, "td->o.iodepth=%d\n", td->o.iodepth);
	dprint(FD_IO, "td->o.iodepth_batch=%d\n", td->o.iodepth_batch);

	ret = generic_open_file(td, f);
	if (ret)
		return ret;

	fdd = calloc(1, sizeof(*fdd));
	if (!fdd) {
		int fio_unused __ret;
		__ret = generic_close_file(td, f);
		return 1;
	}

	FILE_SET_ENG_DATA(f, fdd);

	return 0;
}

static int fio_libpmem_close_file(struct thread_data *td, struct fio_file *f)
{
	struct fio_libpmem_data *fdd = FILE_ENG_DATA(f);

	dprint(FD_IO, "DEBUG fio_libpmem_close_file\n");
	dprint(FD_IO, "td->o.odirect %d\n", td->o.odirect);

	if (!td->o.odirect) {
		dprint(FD_IO, "pmem_drain\n");
		pmem_drain();
	}

	FILE_SET_ENG_DATA(f, NULL);
	free(fdd);
	fio_file_clear_partial_mmap(f);

	return generic_close_file(td, f);
}
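/*
 * Note on durability (a sketch of the libpmem semantics used above; see
 * the PMDK documentation for the authoritative description):
 *
 *   direct=1 -> pmem_memcpy_persist() per write: copy, flush the CPU
 *               caches for the target range, then drain (wait for the
 *               stores to reach the persistence domain).
 *   direct=0 -> pmem_memcpy_nodrain() per write, deferring the drain;
 *               the single pmem_drain() in fio_libpmem_close_file() then
 *               completes durability for all preceding writes at once.
 */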
static struct ioengine_ops ioengine = {
	.name		= "libpmem",
	.version	= FIO_IOOPS_VERSION,
	.init		= fio_libpmem_init,
	.prep		= fio_libpmem_prep,
	.queue		= fio_libpmem_queue,
	.open_file	= fio_libpmem_open_file,
	.close_file	= fio_libpmem_close_file,
	.get_file_size	= generic_get_file_size,
	.flags		= FIO_SYNCIO | FIO_NOEXTEND,
};

static void fio_init fio_libpmem_register(void)
{
#ifndef WIN32
	Mmap_align = page_size;
#else
	if (Mmap_align == 0) {
		SYSTEM_INFO si;

		GetSystemInfo(&si);
		Mmap_align = si.dwAllocationGranularity;
	}
#endif

	register_ioengine(&ioengine);
}

static void fio_exit fio_libpmem_unregister(void)
{
	unregister_ioengine(&ioengine);
}
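/*
 * A minimal job file sketch for this engine (illustrative only; the rw and
 * size values are hypothetical, and examples/libpmem.fio in the fio source
 * tree is the maintained example):
 *
 *   [global]
 *   ioengine=libpmem
 *   directory=/mnt/pmem0
 *   bs=4k
 *   iodepth=1
 *   direct=1
 *
 *   [libpmem-seqwrite]
 *   rw=write
 *   size=1g
 */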