From: Steven Swanson
Subject: [RFC 09/16] NOVA: DAX code
To: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org, linux-nvdimm@lists.01.org
Cc: Steven Swanson, dan.j.williams@intel.com
Date: Thu, 03 Aug 2017 00:49:12 -0700
Message-ID: <150174655259.104003.2975202455695549439.stgit@hn>
In-Reply-To: <150174646416.104003.14042713459553361884.stgit@hn>
References: <150174646416.104003.14042713459553361884.stgit@hn>
User-Agent: StGit/0.17.1-27-g0d46-dirty
MIME-Version: 1.0
Content-Type: text/plain; charset="utf-8"
Content-Transfer-Encoding: 7bit

NOVA leverages the kernel's DAX mechanisms for mmap and file data
access. NOVA also maintains a red-black tree in DRAM
(nova_inode_info_header.vma_tree) to track which portions of a file
have been mapped by writable, shared mappings, so that checksum and
parity state can be kept up to date for those regions.
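The vma_tree is keyed by the address of the vm_area_struct itself, as
nova_rbtree_compare_vma() below encodes. The following userspace
sketch (hypothetical names; a plain unbalanced binary search tree in
place of the kernel's rb_link_node()/rb_insert_color() API) shows the
insert-or-reject-duplicate walk that nova_insert_write_vma() performs:

	#include <stdio.h>
	#include <stdlib.h>

	/* Toy stand-ins for struct vm_area_struct and struct vma_item. */
	struct vma { unsigned long vm_start, vm_end; };

	struct vma_item {
		struct vma *vma;
		struct vma_item *left, *right;
	};

	/* Same ordering rule as nova_rbtree_compare_vma(): pointer value. */
	static int compare_vma(struct vma_item *curr, struct vma *vma)
	{
		if (vma < curr->vma)
			return -1;
		if (vma > curr->vma)
			return 1;
		return 0;
	}

	/*
	 * Walk to the insertion point and refuse duplicates, as the driver
	 * does. The kernel version then calls rb_link_node() and
	 * rb_insert_color() to rebalance; this sketch links the node as-is.
	 */
	static int insert_vma(struct vma_item **root, struct vma *vma)
	{
		struct vma_item **temp = root, *item;

		while (*temp) {
			int cmp = compare_vma(*temp, vma);

			if (cmp == -1)
				temp = &(*temp)->left;
			else if (cmp == 1)
				temp = &(*temp)->right;
			else
				return -1;	/* vma already tracked */
		}

		item = calloc(1, sizeof(*item));
		if (!item)
			return -1;
		item->vma = vma;
		*temp = item;
		return 0;
	}

	int main(void)
	{
		struct vma_item *root = NULL;
		struct vma a = { 0x1000, 0x3000 };

		printf("first insert:  %d\n", insert_vma(&root, &a)); /* 0 */
		printf("second insert: %d\n", insert_vma(&root, &a)); /* -1 */
		return 0;
	}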
Signed-off-by: Steven Swanson
---
 fs/nova/dax.c | 1346 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1346 insertions(+)
 create mode 100644 fs/nova/dax.c

diff --git a/fs/nova/dax.c b/fs/nova/dax.c
new file mode 100644
index 000000000000..871b10f1889c
--- /dev/null
+++ b/fs/nova/dax.c
@@ -0,0 +1,1346 @@
+/*
+ * BRIEF DESCRIPTION
+ *
+ * DAX file operations.
+ *
+ * Copyright 2015-2016 Regents of the University of California,
+ * UCSD Non-Volatile Systems Lab, Andiry Xu
+ * Copyright 2012-2013 Intel Corporation
+ * Copyright 2009-2011 Marco Stornelli
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include "nova.h"
+#include "inode.h"
+
+
+
+static inline int nova_copy_partial_block(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_file_write_entry *entry, unsigned long index,
+	size_t offset, size_t length, void *kmem)
+{
+	void *ptr;
+	int rc = 0;
+	unsigned long nvmm;
+
+	nvmm = get_nvmm(sb, sih, entry, index);
+	ptr = nova_get_block(sb, (nvmm << PAGE_SHIFT));
+
+	if (ptr != NULL) {
+		if (support_clwb)
+			rc = memcpy_mcsafe(kmem + offset, ptr + offset,
+						length);
+		else
+			memcpy_to_pmem_nocache(kmem + offset, ptr + offset,
+						length);
+	}
+
+	/* TODO: If rc < 0, go to MCE data recovery. */
+	return rc;
+}
+
+static inline int nova_handle_partial_block(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	struct nova_file_write_entry *entry, unsigned long index,
+	size_t offset, size_t length, void *kmem)
+{
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct nova_file_write_entry *entryc, entry_copy;
+
+	nova_memunlock_block(sb, kmem);
+	if (entry == NULL) {
+		/* Fill zero */
+		if (support_clwb)
+			memset(kmem + offset, 0, length);
+		else
+			memcpy_to_pmem_nocache(kmem + offset,
+					sbi->zeroed_page, length);
+	} else {
+		/* Copy from original block */
+		if (metadata_csum == 0)
+			entryc = entry;
+		else {
+			entryc = &entry_copy;
+			if (!nova_verify_entry_csum(sb, entry, entryc)) {
+				nova_memlock_block(sb, kmem);
+				return -EIO;
+			}
+		}
+
+		nova_copy_partial_block(sb, sih, entryc, index,
+					offset, length, kmem);
+
+	}
+	nova_memlock_block(sb, kmem);
+	if (support_clwb)
+		nova_flush_buffer(kmem + offset, length, 0);
+	return 0;
+}
+
+/*
+ * Fill the new start/end block from original blocks.
+ * Do nothing if fully covered; copy if original blocks present;
+ * Fill zero otherwise.
+ */
+int nova_handle_head_tail_blocks(struct super_block *sb,
+	struct inode *inode, loff_t pos, size_t count, void *kmem)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	size_t offset, eblk_offset;
+	unsigned long start_blk, end_blk, num_blocks;
+	struct nova_file_write_entry *entry;
+	timing_t partial_time;
+	int ret = 0;
+
+	NOVA_START_TIMING(partial_block_t, partial_time);
+	offset = pos & (sb->s_blocksize - 1);
+	num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1;
+	/* offset in the actual block size block */
+	offset = pos & (nova_inode_blk_size(sih) - 1);
+	start_blk = pos >> sb->s_blocksize_bits;
+	end_blk = start_blk + num_blocks - 1;
+
+	nova_dbg_verbose("%s: %lu blocks\n", __func__, num_blocks);
+	/* We avoid zeroing the alloc'd range, which is going to be overwritten
+	 * by this system call anyway
+	 */
+	nova_dbg_verbose("%s: start offset %lu start blk %lu %p\n", __func__,
+				offset, start_blk, kmem);
+	if (offset != 0) {
+		entry = nova_get_write_entry(sb, sih, start_blk);
+		ret = nova_handle_partial_block(sb, sih, entry,
+					start_blk, 0, offset, kmem);
+		if (ret < 0)
+			return ret;
+	}
+
+	kmem = (void *)((char *)kmem +
+			((num_blocks - 1) << sb->s_blocksize_bits));
+	eblk_offset = (pos + count) & (nova_inode_blk_size(sih) - 1);
+	nova_dbg_verbose("%s: end offset %lu, end blk %lu %p\n", __func__,
+				eblk_offset, end_blk, kmem);
+	if (eblk_offset != 0) {
+		entry = nova_get_write_entry(sb, sih, end_blk);
+
+		ret = nova_handle_partial_block(sb, sih, entry, end_blk,
+					eblk_offset,
+					sb->s_blocksize - eblk_offset,
+					kmem);
+		if (ret < 0)
+			return ret;
+	}
+	NOVA_END_TIMING(partial_block_t, partial_time);
+
+	return ret;
+}
+
+int nova_reassign_file_tree(struct super_block *sb,
+	struct nova_inode_info_header *sih, u64 begin_tail)
+{
+	void *addr;
+	struct nova_file_write_entry *entry = NULL;
+	struct nova_file_write_entry *entryc, entry_copy;
+	u64 curr_p = begin_tail;
+	size_t entry_size = sizeof(struct nova_file_write_entry);
+
+	entryc = (metadata_csum == 0) ? entry : &entry_copy;
+
+	while (curr_p && curr_p != sih->log_tail) {
+		if (is_last_entry(curr_p, entry_size))
+			curr_p = next_log_page(sb, curr_p);
+
+		if (curr_p == 0) {
+			nova_err(sb, "%s: File inode %lu log is NULL!\n",
+				__func__, sih->ino);
+			return -EINVAL;
+		}
+
+		addr = (void *) nova_get_block(sb, curr_p);
+		entry = (struct nova_file_write_entry *) addr;
+
+		if (metadata_csum == 0)
+			entryc = entry;
+		else if (!nova_verify_entry_csum(sb, entry, entryc))
+			return -EIO;
+
+		if (nova_get_entry_type(entryc) != FILE_WRITE) {
+			nova_dbg("%s: entry type is not write? %d\n",
+				__func__, nova_get_entry_type(entry));
+			curr_p += entry_size;
+			continue;
+		}
+
+		nova_assign_write_entry(sb, sih, entry, entryc, true);
+		curr_p += entry_size;
+	}
+
+	return 0;
+}
+
+int nova_cleanup_incomplete_write(struct super_block *sb,
+	struct nova_inode_info_header *sih, unsigned long blocknr,
+	int allocated, u64 begin_tail, u64 end_tail)
+{
+	void *addr;
+	struct nova_file_write_entry *entry = NULL;
+	struct nova_file_write_entry *entryc, entry_copy;
+	u64 curr_p = begin_tail;
+	size_t entry_size = sizeof(struct nova_file_write_entry);
+
+	if (blocknr > 0 && allocated > 0)
+		nova_free_data_blocks(sb, sih, blocknr, allocated);
+
+	if (begin_tail == 0 || end_tail == 0)
+		return 0;
+
+	entryc = (metadata_csum == 0) ? entry : &entry_copy;
+
+	while (curr_p != end_tail) {
+		if (is_last_entry(curr_p, entry_size))
+			curr_p = next_log_page(sb, curr_p);
+
+		if (curr_p == 0) {
+			nova_err(sb, "%s: File inode %lu log is NULL!\n",
+				__func__, sih->ino);
+			return -EINVAL;
+		}
+
+		addr = (void *) nova_get_block(sb, curr_p);
+		entry = (struct nova_file_write_entry *) addr;
+
+		if (metadata_csum == 0)
+			entryc = entry;
+		else {
+			/* skip entry check here as the entry checksum may not
+			 * be updated when this is called
+			 */
+			if (memcpy_mcsafe(entryc, entry,
+					sizeof(struct nova_file_write_entry)))
+				return -EIO;
+		}
+
+		if (nova_get_entry_type(entryc) != FILE_WRITE) {
+			nova_dbg("%s: entry type is not write? %d\n",
+				__func__, nova_get_entry_type(entry));
%d\n", + __func__, nova_get_entry_type(entry)); + curr_p += entry_size; + continue; + } + + blocknr = entryc->block >> PAGE_SHIFT; + nova_free_data_blocks(sb, sih, blocknr, entryc->num_pages); + curr_p += entry_size; + } + + return 0; +} + +void nova_init_file_write_entry(struct super_block *sb, + struct nova_inode_info_header *sih, struct nova_file_write_entry *entry, + u64 epoch_id, u64 pgoff, int num_pages, u64 blocknr, u32 time, + u64 file_size) +{ + memset(entry, 0, sizeof(struct nova_file_write_entry)); + entry->entry_type = FILE_WRITE; + entry->reassigned = 0; + entry->updating = 0; + entry->epoch_id = epoch_id; + entry->trans_id = sih->trans_id; + entry->pgoff = cpu_to_le64(pgoff); + entry->num_pages = cpu_to_le32(num_pages); + entry->invalid_pages = 0; + entry->block = cpu_to_le64(nova_get_block_off(sb, blocknr, + sih->i_blk_type)); + entry->mtime = cpu_to_le32(time); + + entry->size = file_size; +} + +int nova_protect_file_data(struct super_block *sb, struct inode *inode, + loff_t pos, size_t count, const char __user *buf, unsigned long blocknr, + bool inplace) +{ + struct nova_inode_info *si = NOVA_I(inode); + struct nova_inode_info_header *sih = &si->header; + size_t offset, eblk_offset, bytes, left; + unsigned long start_blk, end_blk, num_blocks, nvmm, nvmmoff; + unsigned long blocksize = sb->s_blocksize; + unsigned int blocksize_bits = sb->s_blocksize_bits; + u8 *blockbuf, *blockptr; + struct nova_file_write_entry *entry; + struct nova_file_write_entry *entryc, entry_copy; + bool mapped, nvmm_ok; + int ret = 0; + timing_t protect_file_data_time, memcpy_time; + + NOVA_START_TIMING(protect_file_data_t, protect_file_data_time); + + offset = pos & (blocksize - 1); + num_blocks = ((offset + count - 1) >> blocksize_bits) + 1; + start_blk = pos >> blocksize_bits; + end_blk = start_blk + num_blocks - 1; + + NOVA_START_TIMING(protect_memcpy_t, memcpy_time); + blockbuf = kmalloc(blocksize, GFP_KERNEL); + if (blockbuf == NULL) { + nova_err(sb, "%s: block buffer allocation error\n", __func__); + return -ENOMEM; + } + + bytes = blocksize - offset; + if (bytes > count) + bytes = count; + + left = copy_from_user(blockbuf + offset, buf, bytes); + NOVA_END_TIMING(protect_memcpy_t, memcpy_time); + if (unlikely(left != 0)) { + nova_err(sb, "%s: not all data is copied from user! expect to copy %zu bytes, actually copied %zu bytes\n", + __func__, bytes, bytes - left); + ret = -EFAULT; + goto out; + } + + entryc = (metadata_csum == 0) ? entry : &entry_copy; + + if (offset != 0) { + NOVA_STATS_ADD(protect_head, 1); + entry = nova_get_write_entry(sb, sih, start_blk); + if (entry != NULL) { + if (metadata_csum == 0) + entryc = entry; + else if (!nova_verify_entry_csum(sb, entry, entryc)) + return -EIO; + + /* make sure data in the partial block head is good */ + nvmm = get_nvmm(sb, sih, entryc, start_blk); + nvmmoff = nova_get_block_off(sb, nvmm, sih->i_blk_type); + blockptr = (u8 *) nova_get_block(sb, nvmmoff); + + mapped = nova_find_pgoff_in_vma(inode, start_blk); + if (data_csum > 0 && !mapped && !inplace) { + nvmm_ok = nova_verify_data_csum(sb, sih, nvmm, + 0, offset); + if (!nvmm_ok) { + ret = -EIO; + goto out; + } + } + + ret = memcpy_mcsafe(blockbuf, blockptr, offset); + if (ret < 0) + goto out; + } else { + memset(blockbuf, 0, offset); + } + + /* copying existing checksums from nvmm can be even slower than + * re-computing checksums of a whole block. 
+		if (data_csum > 0)
+			nova_copy_partial_block_csum(sb, sih, entry, start_blk,
+						offset, blocknr, false);
+		 */
+	}
+
+	if (num_blocks == 1)
+		goto eblk;
+
+	do {
+		if (inplace)
+			nova_update_block_csum_parity(sb, sih, blockbuf,
+						blocknr, offset, bytes);
+		else
+			nova_update_block_csum_parity(sb, sih, blockbuf,
+						blocknr, 0, blocksize);
+
+		blocknr++;
+		pos += bytes;
+		buf += bytes;
+		count -= bytes;
+		offset = pos & (blocksize - 1);
+
+		bytes = count < blocksize ? count : blocksize;
+		left = copy_from_user(blockbuf, buf, bytes);
+		if (unlikely(left != 0)) {
+			nova_err(sb, "%s: not all data is copied from user! expect to copy %zu bytes, actually copied %zu bytes\n",
+				__func__, bytes, bytes - left);
+			ret = -EFAULT;
+			goto out;
+		}
+	} while (count > blocksize);
+
+eblk:
+	eblk_offset = (pos + count) & (blocksize - 1);
+
+	if (eblk_offset != 0) {
+		NOVA_STATS_ADD(protect_tail, 1);
+		entry = nova_get_write_entry(sb, sih, end_blk);
+		if (entry != NULL) {
+			if (metadata_csum == 0)
+				entryc = entry;
+			else if (!nova_verify_entry_csum(sb, entry, entryc)) {
+				ret = -EIO;
+				goto out;
+			}
+
+			/* make sure data in the partial block tail is good */
+			nvmm = get_nvmm(sb, sih, entryc, end_blk);
+			nvmmoff = nova_get_block_off(sb, nvmm, sih->i_blk_type);
+			blockptr = (u8 *) nova_get_block(sb, nvmmoff);
+
+			mapped = nova_find_pgoff_in_vma(inode, end_blk);
+			if (data_csum > 0 && !mapped && !inplace) {
+				nvmm_ok = nova_verify_data_csum(sb, sih, nvmm,
+					eblk_offset, blocksize - eblk_offset);
+				if (!nvmm_ok) {
+					ret = -EIO;
+					goto out;
+				}
+			}
+
+			ret = memcpy_mcsafe(blockbuf + eblk_offset,
+						blockptr + eblk_offset,
+						blocksize - eblk_offset);
+			if (ret < 0)
+				goto out;
+		} else {
+			memset(blockbuf + eblk_offset, 0,
+				blocksize - eblk_offset);
+		}
+
+		/* copying existing checksums from nvmm can be even slower than
+		 * re-computing checksums of a whole block.
+		if (data_csum > 0)
+			nova_copy_partial_block_csum(sb, sih, entry, end_blk,
+						eblk_offset, blocknr, true);
+		 */
+	}
+
+	if (inplace)
+		nova_update_block_csum_parity(sb, sih, blockbuf, blocknr,
+						offset, bytes);
+	else
+		nova_update_block_csum_parity(sb, sih, blockbuf, blocknr,
+						0, blocksize);
+
+out:
+	kfree(blockbuf);
+
+	NOVA_END_TIMING(protect_file_data_t, protect_file_data_time);
+
+	return ret;
+}
+
+static bool nova_get_verify_entry(struct super_block *sb,
+	struct nova_file_write_entry *entry,
+	struct nova_file_write_entry *entryc,
+	int locked)
+{
+	int ret = 0;
+
+	if (metadata_csum == 0)
+		return true;
+
+	if (locked == 0) {
+		/* Someone else may be updating the entry. Skip check */
+		ret = memcpy_mcsafe(entryc, entry,
+					sizeof(struct nova_file_write_entry));
+		if (ret < 0)
+			return false;
+
+		return true;
+	}
+
+	return nova_verify_entry_csum(sb, entry, entryc);
+}
+
+/*
+ * Check if there is an existing entry for target page offset.
+ * Used for inplace write, direct IO, DAX-mmap and fallocate.
+ */
+unsigned long nova_check_existing_entry(struct super_block *sb,
+	struct inode *inode, unsigned long num_blocks, unsigned long start_blk,
+	struct nova_file_write_entry **ret_entry,
+	struct nova_file_write_entry *ret_entryc, int check_next, u64 epoch_id,
+	int *inplace, int locked)
+{
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *entry;
+	struct nova_file_write_entry *entryc;
+	unsigned long next_pgoff;
+	unsigned long ent_blks = 0;
+	timing_t check_time;
+
+	NOVA_START_TIMING(check_entry_t, check_time);
+
+	*ret_entry = NULL;
+	*inplace = 0;
+	entry = nova_get_write_entry(sb, sih, start_blk);
+
+	entryc = (metadata_csum == 0) ? entry : ret_entryc;
+
+	if (entry) {
+		if (metadata_csum == 0)
+			entryc = entry;
+		else if (!nova_get_verify_entry(sb, entry, entryc, locked))
+			goto out;
+
+		*ret_entry = entry;
+
+		/* We can do inplace write. Find contiguous blocks */
+		if (entryc->reassigned == 0)
+			ent_blks = entryc->num_pages -
+					(start_blk - entryc->pgoff);
+		else
+			ent_blks = 1;
+
+		if (ent_blks > num_blocks)
+			ent_blks = num_blocks;
+
+		if (entryc->epoch_id == epoch_id)
+			*inplace = 1;
+
+	} else if (check_next) {
+		/* Possible Hole */
+		entry = nova_find_next_entry(sb, sih, start_blk);
+		if (entry) {
+			if (metadata_csum == 0)
+				entryc = entry;
+			else if (!nova_get_verify_entry(sb, entry, entryc,
+							locked))
+				goto out;
+
+			next_pgoff = entryc->pgoff;
+			if (next_pgoff <= start_blk) {
+				nova_err(sb, "iblock %lu, entry pgoff %lu, num pages %lu\n",
+					start_blk, next_pgoff, entry->num_pages);
+				nova_print_inode_log(sb, inode);
+				BUG();
+				ent_blks = num_blocks;
+				goto out;
+			}
+			ent_blks = next_pgoff - start_blk;
+			if (ent_blks > num_blocks)
+				ent_blks = num_blocks;
+		} else {
+			/* File grow */
+			ent_blks = num_blocks;
+		}
+	}
+
+	if (entry && ent_blks == 0) {
+		nova_dbg("%s: %d\n", __func__, check_next);
+		dump_stack();
+	}
+
+out:
+	NOVA_END_TIMING(check_entry_t, check_time);
+	return ent_blks;
+}
+
+ssize_t nova_inplace_file_write(struct file *filp,
+	const char __user *buf, size_t len, loff_t *ppos)
+{
+	struct address_space *mapping = filp->f_mapping;
+	struct inode *inode = mapping->host;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi, inode_copy;
+	struct nova_file_write_entry *entry;
+	struct nova_file_write_entry *entryc, entry_copy;
+	struct nova_file_write_entry entry_data;
+	struct nova_inode_update update;
+	ssize_t written = 0;
+	loff_t pos;
+	size_t count, offset, copied;
+	unsigned long start_blk, num_blocks, ent_blks = 0;
+	unsigned long total_blocks;
+	unsigned long blocknr = 0;
+	unsigned int data_bits;
+	int allocated = 0;
+	int inplace = 0;
+	bool hole_fill = false;
+	bool update_log = false;
+	void *kmem;
+	u64 blk_off;
+	size_t bytes;
+	long status = 0;
+	timing_t inplace_write_time, memcpy_time;
+	unsigned long step = 0;
+	u64 begin_tail = 0;
+	u64 epoch_id;
+	u64 file_size;
+	u32 time;
+	ssize_t ret;
+
+
+	if (len == 0)
+		return 0;
+
+
+	NOVA_START_TIMING(inplace_write_t, inplace_write_time);
+
+	sb_start_write(inode->i_sb);
+	inode_lock(inode);
+
+	if (!access_ok(VERIFY_READ, buf, len)) {
+		ret = -EFAULT;
+		goto out;
+	}
+	pos = *ppos;
+
+	if (filp->f_flags & O_APPEND)
+		pos = i_size_read(inode);
+
+	count = len;
+
+	pi = nova_get_block(sb, sih->pi_addr);
+
+	/* nova_inode tail pointer will be updated and we make sure all other
+	 * inode fields are good before checksumming the whole structure
+	 */
+	if (nova_check_inode_integrity(sb, sih->ino, sih->pi_addr,
+			sih->alter_pi_addr, &inode_copy, 0) < 0) {
+		ret = -EIO;
+		goto out;
+	}
+
+	offset = pos & (sb->s_blocksize - 1);
+	num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1;
+	total_blocks = num_blocks;
+
+	/* offset in the actual block size block */
+
+	ret = file_remove_privs(filp);
+	if (ret)
+		goto out;
+
+	inode->i_ctime = inode->i_mtime = current_time(inode);
+	time = current_time(inode).tv_sec;
+
+	epoch_id = nova_get_epoch_id(sb);
+
+	nova_dbgv("%s: epoch_id %llu, inode %lu, offset %lld, count %lu\n",
+		__func__, epoch_id, inode->i_ino, pos, count);
+	update.tail = sih->log_tail;
+	update.alter_tail = sih->alter_log_tail;
+	while (num_blocks > 0) {
+		hole_fill = false;
+		offset = pos & (nova_inode_blk_size(sih) - 1);
+		start_blk = pos >> sb->s_blocksize_bits;
+
+		ent_blks = nova_check_existing_entry(sb, inode, num_blocks,
+						start_blk, &entry, &entry_copy,
+						1, epoch_id, &inplace, 1);
+
+		entryc = (metadata_csum == 0) ? entry : &entry_copy;
+
+		if (entry && inplace) {
+			/* We can do inplace write. Find contiguous blocks */
+			blocknr = get_nvmm(sb, sih, entryc, start_blk);
+			blk_off = blocknr << PAGE_SHIFT;
+			allocated = ent_blks;
+			if (data_csum || data_parity)
+				nova_set_write_entry_updating(sb, entry, 1);
+		} else {
+			/* Allocate blocks to fill hole */
+			allocated = nova_new_data_blocks(sb, sih, &blocknr,
+					start_blk, ent_blks, ALLOC_NO_INIT,
+					ANY_CPU, ALLOC_FROM_HEAD);
+
+			nova_dbg_verbose("%s: alloc %d blocks @ %lu\n",
+						__func__, allocated, blocknr);
+
+			if (allocated <= 0) {
+				nova_dbg("%s alloc blocks failed!, %d\n",
+					__func__, allocated);
+				ret = allocated;
+				goto out;
+			}
+
+			hole_fill = true;
+			blk_off = nova_get_block_off(sb, blocknr,
+							sih->i_blk_type);
+		}
+
+		step++;
+		bytes = sb->s_blocksize * allocated - offset;
+		if (bytes > count)
+			bytes = count;
+
+		kmem = nova_get_block(inode->i_sb, blk_off);
+
+		if (hole_fill &&
+		    (offset || ((offset + bytes) & (PAGE_SIZE - 1)) != 0)) {
+			ret = nova_handle_head_tail_blocks(sb, inode,
+							pos, bytes, kmem);
+			if (ret)
+				goto out;
+
+		}
+
+		/* Now copy from user buf */
+//		nova_dbg("Write: %p\n", kmem);
+		NOVA_START_TIMING(memcpy_w_nvmm_t, memcpy_time);
+		nova_memunlock_range(sb, kmem + offset, bytes);
+		copied = bytes - memcpy_to_pmem_nocache(kmem + offset,
+						buf, bytes);
+		nova_memlock_range(sb, kmem + offset, bytes);
+		NOVA_END_TIMING(memcpy_w_nvmm_t, memcpy_time);
+
+		if (data_csum > 0 || data_parity > 0) {
+			ret = nova_protect_file_data(sb, inode, pos, bytes,
+						buf, blocknr, !hole_fill);
+			if (ret)
+				goto out;
+		}
+
+		if (pos + copied > inode->i_size)
+			file_size = cpu_to_le64(pos + copied);
+		else
+			file_size = cpu_to_le64(inode->i_size);
+
+		/* Handle hole fill write */
+		if (hole_fill) {
+			nova_init_file_write_entry(sb, sih, &entry_data,
+						epoch_id, start_blk, allocated,
+						blocknr, time, file_size);
+
+			ret = nova_append_file_write_entry(sb, pi, inode,
+						&entry_data, &update);
+			if (ret) {
+				nova_dbg("%s: append inode entry failed\n",
+					__func__);
+				ret = -ENOSPC;
+				goto out;
+			}
+		} else {
+			/* Update existing entry */
+			struct nova_log_entry_info entry_info;
+
+			entry_info.type = FILE_WRITE;
+			entry_info.epoch_id = epoch_id;
+			entry_info.trans_id = sih->trans_id;
+			entry_info.time = time;
+			entry_info.file_size = file_size;
+			entry_info.inplace = 1;
+
+			nova_inplace_update_write_entry(sb, inode, entry,
+						&entry_info);
+		}
+
+		nova_dbgv("Write: %p, %lu\n", kmem, copied);
+		if (copied > 0) {
+			status = copied;
+			written += copied;
+			pos += copied;
+			buf += copied;
+			count -= copied;
+			num_blocks -= allocated;
+		}
+		if (unlikely(copied != bytes)) {
+			nova_dbg("%s ERROR!: %p, bytes %lu, copied %lu\n",
+				__func__, kmem, bytes, copied);
+			if (status >= 0)
+				status = -EFAULT;
+		}
+		if (status < 0)
+			break;
+
+		if (hole_fill) {
+			update_log = true;
+			if (begin_tail == 0)
+				begin_tail = update.curr_entry;
+		}
+	}
+
+	data_bits = blk_type_to_shift[sih->i_blk_type];
+	sih->i_blocks += (total_blocks << (data_bits - sb->s_blocksize_bits));
+
+	inode->i_blocks = sih->i_blocks;
+
+	if (update_log) {
+		nova_memunlock_inode(sb, pi);
+		nova_update_inode(sb, inode, pi, &update, 1);
+		nova_memlock_inode(sb, pi);
+		NOVA_STATS_ADD(inplace_new_blocks, 1);
+
+		/* Update file tree */
+		ret = nova_reassign_file_tree(sb, sih, begin_tail);
+		if (ret)
+			goto out;
+	}
+
+	ret = written;
+	NOVA_STATS_ADD(inplace_write_breaks, step);
+	nova_dbgv("blocks: %lu, %lu\n", inode->i_blocks, sih->i_blocks);
+
+	*ppos = pos;
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		sih->i_size = pos;
+	}
+
+	sih->trans_id++;
+out:
+	if (ret < 0)
+		nova_cleanup_incomplete_write(sb, sih, blocknr, allocated,
+						begin_tail, update.tail);
+
+	inode_unlock(inode);
+	sb_end_write(inode->i_sb);
+	NOVA_END_TIMING(inplace_write_t, inplace_write_time);
+	NOVA_STATS_ADD(inplace_write_bytes, written);
+	return ret;
+}
+
+/* Check if existing entries overlap with vma regions */
+int nova_check_overlap_vmas(struct super_block *sb,
+	struct nova_inode_info_header *sih,
+	unsigned long pgoff, unsigned long num_pages)
+{
+	unsigned long start_pgoff = 0;
+	unsigned long num = 0;
+	unsigned long i;
+	struct vma_item *item;
+	struct rb_node *temp;
+	int ret = 0;
+
+	if (sih->num_vmas == 0)
+		return 0;
+
+	temp = rb_first(&sih->vma_tree);
+	while (temp) {
+		item = container_of(temp, struct vma_item, node);
+		temp = rb_next(temp);
+		ret = nova_get_vma_overlap_range(sb, sih, item->vma, pgoff,
+						num_pages, &start_pgoff, &num);
+		if (ret) {
+			for (i = 0; i < num; i++) {
+				if (nova_get_write_entry(sb, sih,
+							start_pgoff + i))
+					return 1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+
+/*
+ * return > 0, # of blocks mapped or allocated.
+ * return = 0, if plain lookup failed.
+ * return < 0, error case.
+ */
+int nova_dax_get_blocks(struct inode *inode, sector_t iblock,
+	unsigned long max_blocks, u32 *bno, bool *new, bool *boundary,
+	int create, bool taking_lock)
+{
+	struct super_block *sb = inode->i_sb;
+	struct nova_inode *pi;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct nova_file_write_entry *entry = NULL;
+	struct nova_file_write_entry *entryc, entry_copy;
+	struct nova_file_write_entry entry_data;
+	struct nova_inode_update update;
+	u32 time;
+	unsigned int data_bits;
+	unsigned long nvmm = 0;
+	unsigned long blocknr = 0;
+	u64 epoch_id;
+	int num_blocks = 0;
+	int inplace = 0;
+	int allocated = 0;
+	int locked = 0;
+	int check_next = 1;
+	int ret = 0;
+	timing_t get_block_time;
+
+
+	if (max_blocks == 0)
+		return 0;
+
+	NOVA_START_TIMING(dax_get_block_t, get_block_time);
+
+	nova_dbgv("%s: pgoff %lu, num %lu, create %d\n",
+		__func__, iblock, max_blocks, create);
+
+	epoch_id = nova_get_epoch_id(sb);
+
+	if (taking_lock)
+		check_next = 0;
+
+again:
+	num_blocks = nova_check_existing_entry(sb, inode, max_blocks,
+					iblock, &entry, &entry_copy, check_next,
+					epoch_id, &inplace, locked);
+
+	entryc = (metadata_csum == 0) ? entry : &entry_copy;
+
+	if (entry) {
+		if (create == 0 || inplace) {
+			nvmm = get_nvmm(sb, sih, entryc, iblock);
+			nova_dbgv("%s: found pgoff %lu, block %lu\n",
+				__func__, iblock, nvmm);
+			goto out;
+		}
+	}
+
+	if (create == 0) {
+		num_blocks = 0;
+		goto out1;
+	}
+
+	if (taking_lock && locked == 0) {
+		inode_lock(inode);
+		locked = 1;
+		/* Check again in case someone has done it for us */
+		check_next = 1;
+		goto again;
+	}
+
+	pi = nova_get_inode(sb, inode);
+	inode->i_ctime = inode->i_mtime = current_time(inode);
+	time = current_time(inode).tv_sec;
+	update.tail = sih->log_tail;
+	update.alter_tail = sih->alter_log_tail;
+
+	/* Return initialized blocks to the user */
+	allocated = nova_new_data_blocks(sb, sih, &blocknr, iblock,
+				num_blocks, ALLOC_INIT_ZERO, ANY_CPU,
+				ALLOC_FROM_HEAD);
+	if (allocated <= 0) {
+		nova_dbgv("%s alloc blocks failed %d\n", __func__,
+				allocated);
+		ret = allocated;
+		goto out;
+	}
+
+	num_blocks = allocated;
+	/* Do not extend file size */
+	nova_init_file_write_entry(sb, sih, &entry_data,
+					epoch_id, iblock, num_blocks,
+					blocknr, time, inode->i_size);
+
+	ret = nova_append_file_write_entry(sb, pi, inode,
+				&entry_data, &update);
+	if (ret) {
+		nova_dbgv("%s: append inode entry failed\n", __func__);
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	nvmm = blocknr;
+	data_bits = blk_type_to_shift[sih->i_blk_type];
+	sih->i_blocks += (num_blocks << (data_bits - sb->s_blocksize_bits));
+
+	nova_memunlock_inode(sb, pi);
+	nova_update_inode(sb, inode, pi, &update, 1);
+	nova_memlock_inode(sb, pi);
+
+	ret = nova_reassign_file_tree(sb, sih, update.curr_entry);
+	if (ret) {
+		nova_dbgv("%s: nova_reassign_file_tree failed: %d\n",
+			__func__, ret);
+		goto out;
+	}
+	inode->i_blocks = sih->i_blocks;
+	sih->trans_id++;
+	NOVA_STATS_ADD(dax_new_blocks, 1);
+
+//	set_buffer_new(bh);
+out:
+	if (ret < 0) {
+		nova_cleanup_incomplete_write(sb, sih, blocknr, allocated,
+						0, update.tail);
+		num_blocks = ret;
+		goto out1;
+	}
+
+	*bno = nvmm;
+//	if (num_blocks > 1)
+//		bh->b_size = sb->s_blocksize * num_blocks;
+
+out1:
+	if (taking_lock && locked)
+		inode_unlock(inode);
+
+	NOVA_END_TIMING(dax_get_block_t, get_block_time);
+	return num_blocks;
+}
+
+int nova_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+	unsigned int flags, struct iomap *iomap, bool taking_lock)
+{
+	struct nova_sb_info *sbi = NOVA_SB(inode->i_sb);
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned long first_block = offset >> blkbits;
+	unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
+	bool new = false, boundary = false;
+	u32 bno;
+	int ret;
+
+	ret = nova_dax_get_blocks(inode, first_block, max_blocks, &bno, &new,
+				&boundary, flags & IOMAP_WRITE, taking_lock);
+	if (ret < 0) {
+		nova_dbgv("%s: nova_dax_get_blocks failed %d\n", __func__, ret);
+		return ret;
+	}
+
+	iomap->flags = 0;
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->dax_dev = sbi->s_dax_dev;
+	iomap->offset = (u64)first_block << blkbits;
+
+	if (ret == 0) {
+		iomap->type = IOMAP_HOLE;
+		iomap->blkno = IOMAP_NULL_BLOCK;
+		iomap->length = 1 << blkbits;
+	} else {
+		iomap->type = IOMAP_MAPPED;
+		iomap->blkno = (sector_t)bno << (blkbits - 9);
+		iomap->length = (u64)ret << blkbits;
+		iomap->flags |= IOMAP_F_MERGED;
+	}
+
+	if (new)
+		iomap->flags |= IOMAP_F_NEW;
+	return 0;
+}
+
+int nova_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+	ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+	if (iomap->type == IOMAP_MAPPED &&
+			written < length &&
+			(flags & IOMAP_WRITE))
+		truncate_pagecache(inode, inode->i_size);
+	return 0;
+}
+
+
+static int nova_iomap_begin_lock(struct inode *inode, loff_t offset,
+	loff_t length, unsigned int flags, struct iomap *iomap)
+{
+	return nova_iomap_begin(inode, offset, length, flags, iomap, true);
+}
+
+static struct iomap_ops nova_iomap_ops_lock = {
+	.iomap_begin = nova_iomap_begin_lock,
+	.iomap_end = nova_iomap_end,
+};
+
+
+static int nova_dax_huge_fault(struct vm_fault *vmf,
+	enum page_entry_size pe_size)
+{
+	int ret = 0;
+	timing_t fault_time;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+
+	NOVA_START_TIMING(pmd_fault_t, fault_time);
+
+	nova_dbgv("%s: inode %lu, pgoff %lu\n",
+		__func__, inode->i_ino, vmf->pgoff);
+
+	ret = dax_iomap_fault(vmf, pe_size, &nova_iomap_ops_lock);
+
+	NOVA_END_TIMING(pmd_fault_t, fault_time);
+	return ret;
+}
+
+static int nova_dax_fault(struct vm_fault *vmf)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+
+	nova_dbgv("%s: inode %lu, pgoff %lu\n",
+		__func__, inode->i_ino, vmf->pgoff);
+
+	return nova_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static int nova_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	loff_t size;
+	int ret = 0;
+	timing_t fault_time;
+
+	NOVA_START_TIMING(pfn_mkwrite_t, fault_time);
+
+	inode_lock(inode);
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		ret = VM_FAULT_SIGBUS;
+	else
+		ret = dax_pfn_mkwrite(vmf);
+	inode_unlock(inode);
+
+	NOVA_END_TIMING(pfn_mkwrite_t, fault_time);
+	return ret;
+}
+
+static inline int nova_rbtree_compare_vma(struct vma_item *curr,
+	struct vm_area_struct *vma)
+{
+	if (vma < curr->vma)
+		return -1;
+	if (vma > curr->vma)
+		return 1;
+
+	return 0;
+}
+
+static int nova_append_write_mmap_to_log(struct super_block *sb,
+	struct inode *inode, struct vma_item *item)
+{
+	struct vm_area_struct *vma = item->vma;
+	struct nova_inode *pi;
+	struct nova_mmap_entry data;
+	struct nova_inode_update update;
+	unsigned long num_pages;
+	u64 epoch_id;
+	int ret;
+
+	/* Only for csum and parity update */
+	if (data_csum == 0 && data_parity == 0)
+		return 0;
+
+	pi = nova_get_inode(sb, inode);
+	epoch_id = nova_get_epoch_id(sb);
+	update.tail = update.alter_tail = 0;
+
+	memset(&data, 0, sizeof(struct nova_mmap_entry));
+	data.entry_type = MMAP_WRITE;
+	data.epoch_id = epoch_id;
+	data.pgoff = cpu_to_le64(vma->vm_pgoff);
+	num_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	data.num_pages = cpu_to_le64(num_pages);
+	data.invalid = 0;
+
+	nova_dbgv("%s : Appending mmap log entry for inode %lu, pgoff %llu, %llu pages\n",
+			__func__, inode->i_ino,
+			data.pgoff, data.num_pages);
+
+	ret = nova_append_mmap_entry(sb, pi, inode, &data, &update, item);
+	if (ret) {
+		nova_dbg("%s: append write mmap entry failure\n", __func__);
+		goto out;
+	}
+
+	nova_memunlock_inode(sb, pi);
+	nova_update_inode(sb, inode, pi, &update, 1);
+	nova_memlock_inode(sb, pi);
+out:
+	return ret;
+}
+
+int nova_insert_write_vma(struct vm_area_struct *vma)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct super_block *sb = inode->i_sb;
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	unsigned long flags = VM_SHARED | VM_WRITE;
+	struct vma_item *item, *curr;
+	struct rb_node **temp, *parent;
+	int compVal;
+	int insert = 0;
+	int ret;
+	timing_t insert_vma_time;
+
+
+	if ((vma->vm_flags & flags) != flags)
+		return 0;
+
+	NOVA_START_TIMING(insert_vma_t, insert_vma_time);
+
+	item = nova_alloc_vma_item(sb);
+	if (!item) {
+		NOVA_END_TIMING(insert_vma_t, insert_vma_time);
+		return -ENOMEM;
+	}
+
+	item->vma = vma;
+
+	nova_dbgv("Inode %lu insert vma %p, start 0x%lx, end 0x%lx, pgoff %lu\n",
+			inode->i_ino, vma, vma->vm_start, vma->vm_end,
+			vma->vm_pgoff);
+
+	inode_lock(inode);
+
+	/* Append to log */
+	ret = nova_append_write_mmap_to_log(sb, inode, item);
+	if (ret)
+		goto out;
+
+	temp = &(sih->vma_tree.rb_node);
+	parent = NULL;
+
+	while (*temp) {
+		curr = container_of(*temp, struct vma_item, node);
+		compVal = nova_rbtree_compare_vma(curr, vma);
+		parent = *temp;
+
+		if (compVal == -1) {
+			temp = &((*temp)->rb_left);
+		} else if (compVal == 1) {
+			temp = &((*temp)->rb_right);
+		} else {
+			nova_dbg("%s: vma %p already exists\n",
+				__func__, vma);
+			nova_free_vma_item(sb, item);
+			goto out;
+		}
+	}
+
+	rb_link_node(&item->node, parent, temp);
+	rb_insert_color(&item->node, &sih->vma_tree);
+
+	sih->num_vmas++;
+	if (sih->num_vmas == 1)
+		insert = 1;
+
+	sih->trans_id++;
+out:
+	inode_unlock(inode);
+
+	if (insert) {
+		mutex_lock(&sbi->vma_mutex);
+		list_add_tail(&sih->list, &sbi->mmap_sih_list);
+		mutex_unlock(&sbi->vma_mutex);
+	}
+
+	NOVA_END_TIMING(insert_vma_t, insert_vma_time);
+	return ret;
+}
+
+static int nova_remove_write_vma(struct vm_area_struct *vma)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct nova_inode_info *si = NOVA_I(inode);
+	struct nova_inode_info_header *sih = &si->header;
+	struct super_block *sb = inode->i_sb;
+	struct nova_sb_info *sbi = NOVA_SB(sb);
+	struct vma_item *curr = NULL;
+	struct rb_node *temp;
+	int compVal;
+	int found = 0;
+	int remove = 0;
+	timing_t remove_vma_time;
+
+
+	NOVA_START_TIMING(remove_vma_t, remove_vma_time);
+	inode_lock(inode);
+
+	temp = sih->vma_tree.rb_node;
+	while (temp) {
+		curr = container_of(temp, struct vma_item, node);
+		compVal = nova_rbtree_compare_vma(curr, vma);
+
+		if (compVal == -1) {
+			temp = temp->rb_left;
+		} else if (compVal == 1) {
+			temp = temp->rb_right;
+		} else {
+			nova_reset_vma_csum_parity(sb, curr);
+			rb_erase(&curr->node, &sih->vma_tree);
+			found = 1;
+			break;
+		}
+	}
+
+	if (found) {
+		sih->num_vmas--;
+		if (sih->num_vmas == 0)
+			remove = 1;
+	}
+
+	inode_unlock(inode);
+
+	if (found) {
+		nova_dbgv("Inode %lu remove vma %p, start 0x%lx, end 0x%lx, pgoff %lu\n",
+				inode->i_ino, curr->vma, curr->vma->vm_start,
+				curr->vma->vm_end, curr->vma->vm_pgoff);
+		nova_free_vma_item(sb, curr);
+	}
+
+	if (remove) {
+		mutex_lock(&sbi->vma_mutex);
+		list_del(&sih->list);
+		mutex_unlock(&sbi->vma_mutex);
+	}
+
+	NOVA_END_TIMING(remove_vma_t, remove_vma_time);
+	return 0;
+}
+
+static int nova_restore_page_write(struct vm_area_struct *vma,
+	unsigned long address)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+
+	down_write(&mm->mmap_sem);
+
+	nova_dbgv("Restore vma %p write, start 0x%lx, end 0x%lx, address 0x%lx\n",
+			vma, vma->vm_start, vma->vm_end, address);
+
+	/* Restore single page write */
+	nova_mmap_to_new_blocks(vma, address);
+
+	up_write(&mm->mmap_sem);
+
+	return 0;
+}
+
+static void nova_vma_open(struct vm_area_struct *vma)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+
+	nova_dbg_mmap4k("[%s:%d] inode %lu, MMAP 4KPAGE vm_start(0x%lx), vm_end(0x%lx), vm pgoff %lu, %lu blocks, vm_flags(0x%lx), vm_page_prot(0x%lx)\n",
+			__func__, __LINE__,
+			inode->i_ino, vma->vm_start, vma->vm_end,
+			vma->vm_pgoff,
+			(vma->vm_end - vma->vm_start) >> PAGE_SHIFT,
+			vma->vm_flags,
+			pgprot_val(vma->vm_page_prot));
+
+	nova_insert_write_vma(vma);
+}
+
+static void nova_vma_close(struct vm_area_struct *vma)
+{
+	nova_dbgv("[%s:%d] MMAP 4KPAGE vm_start(0x%lx), vm_end(0x%lx), vm_flags(0x%lx), vm_page_prot(0x%lx)\n",
+		__func__, __LINE__, vma->vm_start, vma->vm_end,
+		vma->vm_flags, pgprot_val(vma->vm_page_prot));
+
+	vma->original_write = 0;
+	nova_remove_write_vma(vma);
+}
+
+const struct vm_operations_struct nova_dax_vm_ops = {
+	.fault = nova_dax_fault,
+	.huge_fault = nova_dax_huge_fault,
+	.page_mkwrite = nova_dax_fault,
+	.pfn_mkwrite = nova_dax_pfn_mkwrite,
+	.open = nova_vma_open,
+	.close = nova_vma_close,
+	.dax_cow = nova_restore_page_write,
+};
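
A closing note for reviewers tracing nova_handle_head_tail_blocks()
above: the head/tail bookkeeping is easier to check in isolation.
Below is a minimal userspace sketch of the same arithmetic
(head_tail() is a hypothetical name; a fixed 4 KiB block size stands
in for nova_inode_blk_size()). Whether a preserved range is then
copied from the old block or zero-filled is decided by
nova_handle_partial_block(), based on whether a write entry exists
for that block:

	#include <stdio.h>

	#define BLOCKSIZE 4096UL	/* stand-in for sb->s_blocksize */

	/*
	 * For a write covering [pos, pos + count), report how much of the
	 * first and last blocks must be pre-filled before the new bytes land.
	 */
	static void head_tail(unsigned long pos, unsigned long count)
	{
		unsigned long offset = pos & (BLOCKSIZE - 1);
		unsigned long num_blocks = ((count + offset - 1) / BLOCKSIZE) + 1;
		unsigned long start_blk = pos / BLOCKSIZE;
		unsigned long end_blk = start_blk + num_blocks - 1;
		unsigned long eblk_offset = (pos + count) & (BLOCKSIZE - 1);

		printf("write [%lu, %lu): blocks %lu..%lu\n",
			pos, pos + count, start_blk, end_blk);
		if (offset)
			printf("  head: preserve bytes [0, %lu) of block %lu\n",
				offset, start_blk);
		if (eblk_offset)
			printf("  tail: preserve bytes [%lu, %lu) of block %lu\n",
				eblk_offset, BLOCKSIZE, end_blk);
	}

	int main(void)
	{
		head_tail(100, 200);	/* head and tail in the same block */
		head_tail(4096, 4096);	/* aligned: nothing to preserve */
		head_tail(1000, 10000);	/* unaligned multi-block write */
		return 0;
	}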