6 files changed, 407 insertions, 4 deletions
diff --git a/Documentation/filesystems/conf.py b/Documentation/filesystems/conf.py
new file mode 100644
index 000000000000..ea44172af5c4
--- /dev/null
+++ b/Documentation/filesystems/conf.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8; mode: python -*-
+
+project = "Linux Filesystems API"
+
+tags.add("subproject")
+
+latex_documents = [
+    ('index', 'filesystems.tex', project,
+     'The kernel development community', 'manual'),
+]
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index 4f6531a4701b..273ccb26885e 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -155,11 +155,15 @@ noinline_data          Disable the inline data feature, inline data feature is
                        enabled by default.
 data_flush             Enable data flushing before checkpoint in order to
                        persist data of regular and symlink.
+fault_injection=%d     Enable fault injection in all supported types with
+                       specified injection rate.
 mode=%s                Control block allocation mode which supports "adaptive"
                        and "lfs". In "lfs" mode, there should be no random
                        writes towards main area.
 io_bits=%u             Set the bit size of write IO requests. It should be set
                        with "mode=lfs".
+usrquota               Enable plain user disk quota accounting.
+grpquota               Enable plain group disk quota accounting.
 
 ================================================================================
 DEBUGFS ENTRIES
diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst
new file mode 100644
index 000000000000..256e10eedba4
--- /dev/null
+++ b/Documentation/filesystems/index.rst
@@ -0,0 +1,317 @@
+=====================
+Linux Filesystems API
+=====================
+
+The Linux VFS
+=============
+
+The Filesystem types
+--------------------
+
+.. kernel-doc:: include/linux/fs.h
+   :internal:
+
+The Directory Cache
+-------------------
+
+.. kernel-doc:: fs/dcache.c
+   :export:
+
+.. kernel-doc:: include/linux/dcache.h
+   :internal:
+
+Inode Handling
+--------------
+
+.. kernel-doc:: fs/inode.c
+   :export:
+
+.. kernel-doc:: fs/bad_inode.c
+   :export:
+
+Registration and Superblocks
+----------------------------
+
+.. kernel-doc:: fs/super.c
+   :export:
+
+File Locks
+----------
+
+.. kernel-doc:: fs/locks.c
+   :export:
+
+.. kernel-doc:: fs/locks.c
+   :internal:
+
+Other Functions
+---------------
+
+.. kernel-doc:: fs/mpage.c
+   :export:
+
+.. kernel-doc:: fs/namei.c
+   :export:
+
+.. kernel-doc:: fs/buffer.c
+   :export:
+
+.. kernel-doc:: block/bio.c
+   :export:
+
+.. kernel-doc:: fs/seq_file.c
+   :export:
+
+.. kernel-doc:: fs/filesystems.c
+   :export:
+
+.. kernel-doc:: fs/fs-writeback.c
+   :export:
+
+.. kernel-doc:: fs/block_dev.c
+   :export:
+
+The proc filesystem
+===================
+
+sysctl interface
+----------------
+
+.. kernel-doc:: kernel/sysctl.c
+   :export:
+
+proc filesystem interface
+-------------------------
+
+.. kernel-doc:: fs/proc/base.c
+   :internal:
+
+Events based on file descriptors
+================================
+
+.. kernel-doc:: fs/eventfd.c
+   :export:
+
+The Filesystem for Exporting Kernel Objects
+===========================================
+
+.. kernel-doc:: fs/sysfs/file.c
+   :export:
+
+.. kernel-doc:: fs/sysfs/symlink.c
+   :export:
+
+The debugfs filesystem
+======================
+
+debugfs interface
+-----------------
+
+.. kernel-doc:: fs/debugfs/inode.c
+   :export:
+
+.. kernel-doc:: fs/debugfs/file.c
+   :export:
+
+The Linux Journalling API
+=========================
+
+Overview
+--------
+
+Details
+~~~~~~~
+
+The journalling layer is easy to use. You need to first of all create a
+journal_t data structure. There are two calls to do this dependent on
+how you decide to allocate the physical media on which the journal
+resides. The :c:func:`jbd2_journal_init_inode` call is for journals stored in
+filesystem inodes, or the :c:func:`jbd2_journal_init_dev` call can be used
+for journal stored on a raw device (in a continuous range of blocks). A
+journal_t is a typedef for a struct pointer, so when you are finally
+finished make sure you call :c:func:`jbd2_journal_destroy` on it to free up
+any used kernel memory.
+
+Once you have got your journal_t object you need to 'mount' or load the
+journal file. The journalling layer expects the space for the journal
+was already allocated and initialized properly by the userspace tools.
+When loading the journal you must call :c:func:`jbd2_journal_load` to process
+journal contents. If the client file system detects the journal contents
+does not need to be processed (or even need not have valid contents), it
+may call :c:func:`jbd2_journal_wipe` to clear the journal contents before
+calling :c:func:`jbd2_journal_load`.
+
+Note that jbd2_journal_wipe(..,0) calls
+:c:func:`jbd2_journal_skip_recovery` for you if it detects any outstanding
+transactions in the journal and similarly :c:func:`jbd2_journal_load` will
+call :c:func:`jbd2_journal_recover` if necessary. I would advise reading
+:c:func:`ext4_load_journal` in fs/ext4/super.c for examples on this stage.
+
+Now you can go ahead and start modifying the underlying filesystem.
+Almost.
+
+You still need to actually journal your filesystem changes, this is done
+by wrapping them into transactions. Additionally you also need to wrap
+the modification of each of the buffers with calls to the journal layer,
+so it knows what the modifications you are actually making are. To do
+this use :c:func:`jbd2_journal_start` which returns a transaction handle.
+
+:c:func:`jbd2_journal_start` and its counterpart :c:func:`jbd2_journal_stop`,
+which indicates the end of a transaction are nestable calls, so you can
+reenter a transaction if necessary, but remember you must call
+:c:func:`jbd2_journal_stop` the same number of times as
+:c:func:`jbd2_journal_start` before the transaction is completed (or more
+accurately leaves the update phase). Ext4/VFS makes use of this feature to
+simplify handling of inode dirtying, quota support, etc.
+
+Inside each transaction you need to wrap the modifications to the
+individual buffers (blocks). Before you start to modify a buffer you
+need to call :c:func:`jbd2_journal_get_create_access()` /
+:c:func:`jbd2_journal_get_write_access()` /
+:c:func:`jbd2_journal_get_undo_access()` as appropriate, this allows the
+journalling layer to copy the unmodified
+data if it needs to. After all the buffer may be part of a previously
+uncommitted transaction. At this point you are at last ready to modify a
+buffer, and once you are have done so you need to call
+:c:func:`jbd2_journal_dirty_metadata`. Or if you've asked for access to a
+buffer you now know is now longer required to be pushed back on the
+device you can call :c:func:`jbd2_journal_forget` in much the same way as you
+might have used :c:func:`bforget` in the past.
+
+A :c:func:`jbd2_journal_flush` may be called at any time to commit and
+checkpoint all your transactions.
+
+Then at umount time , in your :c:func:`put_super` you can then call
+:c:func:`jbd2_journal_destroy` to clean up your in-core journal object.
+
+Unfortunately there a couple of ways the journal layer can cause a
+deadlock. The first thing to note is that each task can only have a
+single outstanding transaction at any one time, remember nothing commits
+until the outermost :c:func:`jbd2_journal_stop`. This means you must complete
+the transaction at the end of each file/inode/address etc. operation you
+perform, so that the journalling system isn't re-entered on another
+journal. Since transactions can't be nested/batched across differing
+journals, and another filesystem other than yours (say ext4) may be
+modified in a later syscall.
+
+The second case to bear in mind is that :c:func:`jbd2_journal_start` can block
+if there isn't enough space in the journal for your transaction (based
+on the passed nblocks param) - when it blocks it merely(!) needs to wait
+for transactions to complete and be committed from other tasks, so
+essentially we are waiting for :c:func:`jbd2_journal_stop`. So to avoid
+deadlocks you must treat :c:func:`jbd2_journal_start` /
+:c:func:`jbd2_journal_stop` as if they were semaphores and include them in
+your semaphore ordering rules to prevent
+deadlocks. Note that :c:func:`jbd2_journal_extend` has similar blocking
+behaviour to :c:func:`jbd2_journal_start` so you can deadlock here just as
+easily as on :c:func:`jbd2_journal_start`.
+
+Try to reserve the right number of blocks the first time. ;-). This will
+be the maximum number of blocks you are going to touch in this
+transaction. I advise having a look at at least ext4_jbd.h to see the
+basis on which ext4 uses to make these decisions.
+
+Another wriggle to watch out for is your on-disk block allocation
+strategy. Why? Because, if you do a delete, you need to ensure you
+haven't reused any of the freed blocks until the transaction freeing
+these blocks commits. If you reused these blocks and crash happens,
+there is no way to restore the contents of the reallocated blocks at the
+end of the last fully committed transaction. One simple way of doing
+this is to mark blocks as free in internal in-memory block allocation
+structures only after the transaction freeing them commits. Ext4 uses
+journal commit callback for this purpose.
+
+With journal commit callbacks you can ask the journalling layer to call
+a callback function when the transaction is finally committed to disk,
+so that you can do some of your own management. You ask the journalling
+layer for calling the callback by simply setting
+``journal->j_commit_callback`` function pointer and that function is
+called after each transaction commit. You can also use
+``transaction->t_private_list`` for attaching entries to a transaction
+that need processing when the transaction commits.
+
+JBD2 also provides a way to block all transaction updates via
+:c:func:`jbd2_journal_lock_updates()` /
+:c:func:`jbd2_journal_unlock_updates()`. Ext4 uses this when it wants a
+window with a clean and stable fs for a moment. E.g.
+
+::
+
+
+        jbd2_journal_lock_updates() //stop new stuff happening..
+        jbd2_journal_flush()        // checkpoint everything.
+        ..do stuff on stable fs
+        jbd2_journal_unlock_updates() // carry on with filesystem use.
+
+The opportunities for abuse and DOS attacks with this should be obvious,
+if you allow unprivileged userspace to trigger codepaths containing
+these calls.
+
+Summary
+~~~~~~~
+
+Using the journal is a matter of wrapping the different context changes,
+being each mount, each modification (transaction) and each changed
+buffer to tell the journalling layer about them.
+
+Data Types
+----------
+
+The journalling layer uses typedefs to 'hide' the concrete definitions
+of the structures used. As a client of the JBD2 layer you can just rely
+on the using the pointer as a magic cookie of some sort. Obviously the
+hiding is not enforced as this is 'C'.
+
+Structures
+~~~~~~~~~~
+
+.. kernel-doc:: include/linux/jbd2.h
+   :internal:
+
+Functions
+---------
+
+The functions here are split into two groups those that affect a journal
+as a whole, and those which are used to manage transactions
+
+Journal Level
+~~~~~~~~~~~~~
+
+.. kernel-doc:: fs/jbd2/journal.c
+   :export:
+
+.. kernel-doc:: fs/jbd2/recovery.c
+   :internal:
+
+Transasction Level
+~~~~~~~~~~~~~~~~~~
+
+.. kernel-doc:: fs/jbd2/transaction.c
+
+See also
+--------
+
+`Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen
+Tweedie <http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz>`__
+
+`Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen
+Tweedie <http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html>`__
+
+splice API
+==========
+
+splice is a method for moving blocks of data around inside the kernel,
+without continually transferring them between the kernel and user space.
+
+.. kernel-doc:: fs/splice.c
+
+pipes API
+=========
+
+Pipe interfaces are all for in-kernel (builtin image) use. They are not
+exported for use by modules.
+
+.. kernel-doc:: include/linux/pipe_fs_i.h
+   :internal:
+
+.. kernel-doc:: fs/pipe.c
diff --git a/Documentation/filesystems/nfs/idmapper.txt b/Documentation/filesystems/nfs/idmapper.txt
index fe03d10bb79a..b86831acd583 100644
--- a/Documentation/filesystems/nfs/idmapper.txt
+++ b/Documentation/filesystems/nfs/idmapper.txt
@@ -55,7 +55,7 @@ request-key will find the first matching line and corresponding program.  In
 this case, /some/other/program will handle all uid lookups and
 /usr/sbin/nfs.idmap will handle gid, user, and group lookups.
 
-See <file:Documentation/security/keys-request-key.txt> for more information
+See <file:Documentation/security/keys/request-key.rst> for more information
 about the request-key function.
 
 
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
index c9e884b52698..36f528a7fdd6 100644
--- a/Documentation/filesystems/overlayfs.txt
+++ b/Documentation/filesystems/overlayfs.txt
@@ -201,6 +201,40 @@ rightmost one and going left.  In the above example lower1 will be the
 top, lower2 the middle and lower3 the bottom layer.
 
 
+Sharing and copying layers
+--------------------------
+
+Lower layers may be shared among several overlay mounts and that is indeed
+a very common practice.  An overlay mount may use the same lower layer
+path as another overlay mount and it may use a lower layer path that is
+beneath or above the path of another overlay lower layer path.
+
+Using an upper layer path and/or a workdir path that are already used by
+another overlay mount is not allowed and will fail with EBUSY.  Using
+partially overlapping paths is not allowed but will not fail with EBUSY.
+
+Mounting an overlay using an upper layer path, where the upper layer path
+was previously used by another mounted overlay in combination with a
+different lower layer path, is allowed, unless the "inodes index" feature
+is enabled.
+
+With the "inodes index" feature, on the first time mount, an NFS file
+handle of the lower layer root directory, along with the UUID of the lower
+filesystem, are encoded and stored in the "trusted.overlay.origin" extended
+attribute on the upper layer root directory.  On subsequent mount attempts,
+the lower root directory file handle and lower filesystem UUID are compared
+to the stored origin in upper root directory.  On failure to verify the
+lower root origin, mount will fail with ESTALE.  An overlayfs mount with
+"inodes index" enabled will fail with EOPNOTSUPP if the lower filesystem
+does not support NFS export, lower filesystem does not have a valid UUID or
+if the upper filesystem does not support extended attributes.
+
+It is quite a common practice to copy overlay layers to a different
+directory tree on the same or different underlying filesystem, and even
+to a different machine.  With the "inodes index" feature, trying to mount
+the copied layers will fail the verification of the lower root file handle.
+
+
 Non-standard behavior
 ---------------------
 
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f42b90687d40..48c9faa73a76 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -576,7 +576,43 @@ should clear PG_Dirty and set PG_Writeback.  It can be actually
 written at any point after PG_Dirty is clear.  Once it is known to be
 safe, PG_Writeback is cleared.
 
-Writeback makes use of a writeback_control structure...
+Writeback makes use of a writeback_control structure to direct the
+operations.  This gives the the writepage and writepages operations some
+information about the nature of and reason for the writeback request,
+and the constraints under which it is being done.  It is also used to
+return information back to the caller about the result of a writepage or
+writepages request.
+
+Handling errors during writeback
+--------------------------------
+Most applications that do buffered I/O will periodically call a file
+synchronization call (fsync, fdatasync, msync or sync_file_range) to
+ensure that data written has made it to the backing store.  When there
+is an error during writeback, they expect that error to be reported when
+a file sync request is made.  After an error has been reported on one
+request, subsequent requests on the same file descriptor should return
+0, unless further writeback errors have occurred since the previous file
+syncronization.
+
+Ideally, the kernel would report errors only on file descriptions on
+which writes were done that subsequently failed to be written back.  The
+generic pagecache infrastructure does not track the file descriptions
+that have dirtied each individual page however, so determining which
+file descriptors should get back an error is not possible.
+
+Instead, the generic writeback error tracking infrastructure in the
+kernel settles for reporting errors to fsync on all file descriptions
+that were open at the time that the error occurred.  In a situation with
+multiple writers, all of them will get back an error on a subsequent fsync,
+even if all of the writes done through that particular file descriptor
+succeeded (or even if there were no writes on that file descriptor at all).
+
+Filesystems that wish to use this infrastructure should call
+mapping_set_error to record the error in the address_space when it
+occurs.  Then, after writing back data from the pagecache in their
+file->fsync operation, they should call file_check_and_advance_wb_err to
+ensure that the struct file's error cursor has advanced to the correct
+point in the stream of errors emitted by the backing device(s).
 
 struct address_space_operations
 -------------------------------
@@ -804,7 +840,8 @@ struct address_space_operations {
 The File Object
 ===============
 
-A file object represents a file opened by a process.
+A file object represents a file opened by a process. This is also known
+as an "open file description" in POSIX parlance.
 
 
 struct file_operations
@@ -887,7 +924,8 @@ otherwise noted.
 
   release: called when the last reference to an open file is closed
 
-  fsync: called by the fsync(2) system call
+  fsync: called by the fsync(2) system call. Also see the section above
+	 entitled "Handling errors during writeback".
 
   fasync: called by the fcntl(2) system call when asynchronous
 	(non-blocking) mode is enabled for a file