}
/*
- * All metadata updates are logged, which means that we just have
- * to flush the log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to
+ * flush the log up to the latest LSN that touched the inode. If we have
+ * concurrent fsync/fdatasync() calls, we need them to all block on the
+ * log force before we clear the ili_fsync_fields field. This ensures
+ * that we don't get a racing sync operation that does not wait for the
+ * metadata to hit the journal before returning. If we race with
+ * clearing the ili_fsync_fields, then all that will happen is the log
+ * force will do nothing as the lsn will already be on disk. We can't
+ * race with setting ili_fsync_fields because that is done under
+ * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+ * until after the ili_fsync_fields is cleared.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
if (!datasync ||
- (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
lsn = ip->i_itemp->ili_last_lsn;
}
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (lsn)
+ if (lsn) {
error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+ ip->i_itemp->ili_fsync_fields = 0;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
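A minimal userspace analogue of the ordering above, assuming pthreads (an illustrative sketch with hypothetical names, not the kernel API): modifications record "fsync needed" state under the exclusive lock; sync samples that state under the shared lock, makes everything durable, and only then clears it while still holding the shared lock, so a concurrent sync either blocks on the flush or finds nothing left to do.

#include <pthread.h>
#include <stdint.h>

struct obj {
	pthread_rwlock_t lock;
	uint64_t	 last_lsn;	/* sequence of the last change */
	unsigned int	 fsync_fields;	/* what fsync still has to flush */
};

/* stand-in for _xfs_log_force_lsn(): make everything up to lsn durable */
static void force_log_to(uint64_t lsn)
{
	(void)lsn;
}

static void modify(struct obj *o, unsigned int fields, uint64_t lsn)
{
	pthread_rwlock_wrlock(&o->lock);	/* like XFS_ILOCK_EXCL */
	o->fsync_fields |= fields;
	o->last_lsn = lsn;
	pthread_rwlock_unlock(&o->lock);
}

static void sync_obj(struct obj *o)
{
	uint64_t lsn = 0;

	pthread_rwlock_rdlock(&o->lock);	/* like XFS_ILOCK_SHARED */
	if (o->fsync_fields)
		lsn = o->last_lsn;
	if (lsn) {
		force_log_to(lsn);
		/*
		 * Clearing under the shared lock is safe because setters
		 * need the exclusive lock. Two racing syncs may both store
		 * 0 here, which is benign in the kernel's memory model
		 * (portable C would want an atomic store).
		 */
		o->fsync_fields = 0;
	}
	pthread_rwlock_unlock(&o->lock);
}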
/*
* If we only have a single device, and the log force above was
return -EIO;
/*
- * Locking is a bit tricky here. If we take an exclusive lock
- * for direct IO, we effectively serialise all new concurrent
- * read IO to this file and block it behind IO that is currently in
- * progress because IO in progress holds the IO lock shared. We only
- * need to hold the lock exclusive to blow away the page cache, so
- * only take lock exclusively if the page cache needs invalidation.
- * This allows the normal direct IO case of no page cache pages to
- * proceeed concurrently without serialisation.
+ * Locking is a bit tricky here. If we take an exclusive lock for direct
+ * IO, we effectively serialise all new concurrent read IO to this file
+ * and block it behind IO that is currently in progress because IO in
+ * progress holds the IO lock shared. We only need to hold the lock
+ * exclusive to blow away the page cache, so only take lock exclusively
+ * if the page cache needs invalidation. This allows the normal direct
+ * IO case of no page cache pages to proceed concurrently without
+ * serialisation.
*/
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+ /*
+ * The generic dio code only flushes the range of the particular
+ * I/O. Because we take an exclusive lock here, this whole
+ * sequence is considerably more expensive for us. This has a
+ * noticeable performance impact for any file with cached pages,
+ * even when the cached pages lie outside the range of the particular
+ * I/O.
+ *
+ * Hence, amortize the cost of the lock against a full file
+ * flush and reduce the chances of repeated iolock cycles going
+ * forward.
+ */
if (inode->i_mapping->nrpages) {
- ret = filemap_write_and_wait_range(
- VFS_I(ip)->i_mapping,
- pos, pos + size - 1);
+ ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}

/*
 * Invalidate whole pages. This can return an error if
 * we fail to invalidate a page, but this should never
* happen on XFS. Warn if it does fail.
*/
- ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- (pos + size - 1) >> PAGE_CACHE_SHIFT);
+ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
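The same relock-and-recheck shape, sketched standalone with pthreads (the helper names are hypothetical, not the XFS API): start with the shared lock for maximum read concurrency, drop and retake it exclusively only when the page cache must be invalidated (rwlocks cannot be upgraded in place), and recheck the condition after relocking because another thread may have done the work while the lock was dropped.

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;
static bool cache_populated;	/* stand-in for mapping->nrpages */

/* hypothetical stand-in for the full-file flush plus invalidation above */
static int flush_and_invalidate(void)
{
	cache_populated = false;
	return 0;
}

static int direct_read(void)
{
	int ret = 0;

	pthread_rwlock_rdlock(&iolock);
	if (cache_populated) {
		/* no upgrade primitive: drop, then retake exclusively */
		pthread_rwlock_unlock(&iolock);
		pthread_rwlock_wrlock(&iolock);
		/* recheck: another thread may have emptied the cache */
		if (cache_populated) {
			ret = flush_and_invalidate();
			if (ret) {
				pthread_rwlock_unlock(&iolock);
				return ret;
			}
		}
		/*
		 * XFS demotes back to the shared lock at this point; the
		 * sketch stays exclusive for brevity.
		 */
	}
	/* ... issue the direct IO here ... */
	pthread_rwlock_unlock(&iolock);
	return ret;
}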
pos = iocb->ki_pos;
end = pos + count - 1;
+ /*
+ * See xfs_file_read_iter() for why we do a full-file flush here.
+ */
if (mapping->nrpages) {
- ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
- pos, end);
+ ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (ret)
goto out;
/*
- * Invalidate whole pages. This can return an error if
- * we fail to invalidate a page, but this should never
- * happen on XFS. Warn if it does fail.
+ * Invalidate whole pages. This can return an error if we fail
+ * to invalidate a page, but this should never happen on XFS.
+ * Warn if it does fail.
*/
- ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
- pos >> PAGE_CACHE_SHIFT,
- end >> PAGE_CACHE_SHIFT);
+ ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
WARN_ON_ONCE(ret);
ret = 0;
}
return ret;
}
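+/*
+ * PMD (huge page) fault handler for DAX files. Anything that is not
+ * DAX falls back to the normal PTE-sized fault path.
+ */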
+STATIC int
+xfs_filemap_pmd_fault(
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd,
+ unsigned int flags)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret;
+
+ if (!IS_DAX(inode))
+ return VM_FAULT_FALLBACK;
+
+ trace_xfs_filemap_pmd_fault(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
+ xfs_end_io_dax_write);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
+ .pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
};
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
- vma->vm_flags |= VM_MIXEDMAP;
+ vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
return 0;
}
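To exercise the new PMD fault path from userspace, something like the following should suffice on a DAX-capable setup (the mount point, file name, and the 2MiB PMD size are assumptions for illustration; unaligned or non-DAX mappings simply fall back to 4KiB faults):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define PMD_SIZE	(2UL << 20)	/* 2MiB on x86-64 */

int main(void)
{
	/* hypothetical file on an XFS filesystem mounted with -o dax */
	int fd = open("/mnt/dax/testfile", O_RDWR | O_CREAT, 0644);
	if (fd < 0 || ftruncate(fd, PMD_SIZE) < 0) {
		perror("setup");
		return 1;
	}
	/* file offset 0 is PMD-aligned; the kernel picks the address */
	char *p = mmap(NULL, PMD_SIZE, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, PMD_SIZE);	/* first store faults this range in */
	munmap(p, PMD_SIZE);
	close(fd);
	return 0;
}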