summaryrefslogtreecommitdiff
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/Locking52
-rw-r--r--Documentation/filesystems/fscrypt.rst10
-rw-r--r--Documentation/filesystems/fuse-io.txt38
-rw-r--r--Documentation/filesystems/nfs/nfsroot.txt70
-rw-r--r--Documentation/filesystems/proc.txt8
-rw-r--r--Documentation/filesystems/tmpfs.txt5
-rw-r--r--Documentation/filesystems/vfs.txt15
7 files changed, 161 insertions, 37 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 75d2d57e2c44..2c391338c675 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -69,31 +69,31 @@ prototypes:
locking rules:
all may block
- i_mutex(inode)
-lookup: yes
-create: yes
-link: yes (both)
-mknod: yes
-symlink: yes
-mkdir: yes
-unlink: yes (both)
-rmdir: yes (both) (see below)
-rename: yes (all) (see below)
+ i_rwsem(inode)
+lookup: shared
+create: exclusive
+link: exclusive (both)
+mknod: exclusive
+symlink: exclusive
+mkdir: exclusive
+unlink: exclusive (both)
+rmdir: exclusive (both)(see below)
+rename: exclusive (all) (see below)
readlink: no
get_link: no
-setattr: yes
+setattr: exclusive
permission: no (may not block if called in rcu-walk mode)
get_acl: no
getattr: no
listxattr: no
fiemap: no
update_time: no
-atomic_open: yes
+atomic_open: exclusive
tmpfile: no
- Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
-victim.
+ Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_rwsem
+ exclusive on victim.
cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
See Documentation/filesystems/directory-locking for more detailed discussion
@@ -111,10 +111,10 @@ prototypes:
locking rules:
all may block
- i_mutex(inode)
+ i_rwsem(inode)
list: no
get: no
-set: yes
+set: exclusive
--------------------------- super_operations ---------------------------
prototypes:
@@ -217,14 +217,14 @@ prototypes:
locking rules:
All except set_page_dirty and freepage may block
- PageLocked(page) i_mutex
+ PageLocked(page) i_rwsem
writepage: yes, unlocks (see below)
readpage: yes, unlocks
writepages:
set_page_dirty no
readpages:
-write_begin: locks the page yes
-write_end: yes, unlocks yes
+write_begin: locks the page exclusive
+write_end: yes, unlocks exclusive
bmap:
invalidatepage: yes
releasepage: yes
@@ -439,7 +439,10 @@ prototypes:
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
- unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ int (*iterate_shared) (struct file *, struct dir_context *);
+ __poll_t (*poll) (struct file *, struct poll_table_struct *);
+ struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+ __poll_t (*poll_mask) (struct file *, __poll_t);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
@@ -470,7 +473,7 @@ prototypes:
};
locking rules:
- All may block.
+ All except for ->poll_mask may block.
->llseek() locking has moved from llseek to the individual llseek
implementations. If your fs is not using generic_file_llseek, you
@@ -480,6 +483,10 @@ mutex or just to use i_size_read() instead.
Note: this does not protect the file->f_pos against concurrent modifications
since this is something the userspace has to take care about.
+->iterate() is called with i_rwsem exclusive.
+
+->iterate_shared() is called with i_rwsem at least shared.
+
->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
Most instances call fasync_helper(), which does that maintenance, so it's
not normally something one needs to worry about. Return values > 0 will be
@@ -498,6 +505,9 @@ in sys_read() and friends.
the lease within the individual filesystem to record the result of the
operation
+->poll_mask can be called with or without the waitqueue lock for the waitqueue
+returned from ->get_poll_head.
+
--------------------------- dquot_operations -------------------------------
prototypes:
int (*write_dquot) (struct dquot *);
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index cfbc18f0d9c9..48b424de85bb 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -191,11 +191,21 @@ Currently, the following pairs of encryption modes are supported:
- AES-256-XTS for contents and AES-256-CTS-CBC for filenames
- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
+- Speck128/256-XTS for contents and Speck128/256-CTS-CBC for filenames
It is strongly recommended to use AES-256-XTS for contents encryption.
AES-128-CBC was added only for low-powered embedded devices with
crypto accelerators such as CAAM or CESA that do not support XTS.
+Similarly, Speck128/256 support was only added for older or low-end
+CPUs which cannot do AES fast enough -- especially ARM CPUs which have
+NEON instructions but not the Cryptography Extensions -- and for which
+it would not otherwise be feasible to use encryption at all. It is
+not recommended to use Speck on CPUs that have AES instructions.
+Speck support is only available if it has been enabled in the crypto
+API via CONFIG_CRYPTO_SPECK. Also, on ARM platforms, to get
+acceptable performance CONFIG_CRYPTO_SPECK_NEON must be enabled.
+
New encryption modes can be added relatively easily, without changes
to individual filesystems. However, authenticated encryption (AE)
modes are not currently supported because of the difficulty of dealing
diff --git a/Documentation/filesystems/fuse-io.txt b/Documentation/filesystems/fuse-io.txt
new file mode 100644
index 000000000000..07b8f73f100f
--- /dev/null
+++ b/Documentation/filesystems/fuse-io.txt
@@ -0,0 +1,38 @@
+Fuse supports the following I/O modes:
+
+- direct-io
+- cached
+ + write-through
+ + writeback-cache
+
+The direct-io mode can be selected with the FOPEN_DIRECT_IO flag in the
+FUSE_OPEN reply.
+
+In direct-io mode the page cache is completely bypassed for reads and writes.
+No read-ahead takes place. Shared mmap is disabled.
+
+In cached mode reads may be satisfied from the page cache, and data may be
+read-ahead by the kernel to fill the cache. The cache is always kept consistent
+after any writes to the file. All mmap modes are supported.
+
+The cached mode has two sub modes controlling how writes are handled. The
+write-through mode is the default and is supported on all kernels. The
+writeback-cache mode may be selected by the FUSE_WRITEBACK_CACHE flag in the
+FUSE_INIT reply.
+
+In write-through mode each write is immediately sent to userspace as one or more
+WRITE requests, as well as updating any cached pages (and caching previously
+uncached, but fully written pages). No READ requests are ever sent for writes,
+so when an uncached page is partially written, the page is discarded.
+
+In writeback-cache mode (enabled by the FUSE_WRITEBACK_CACHE flag) writes go to
+the cache only, which means that the write(2) syscall can often complete very
+fast. Dirty pages are written back implicitly (background writeback or page
+reclaim on memory pressure) or explicitly (invoked by close(2), fsync(2) and
+when the last ref to the file is being released on munmap(2)). This mode
+assumes that all changes to the filesystem go through the FUSE kernel module
+(size and atime/ctime/mtime attributes are kept up-to-date by the kernel), so
+it's generally not suitable for network filesystems. If a partial page is
+written, then the page needs to be first read from userspace. This means, that
+even for files opened for O_WRONLY it is possible that READ requests will be
+generated by the kernel.
diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index 5efae00f6c7f..d2963123eb1c 100644
--- a/Documentation/filesystems/nfs/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
@@ -5,6 +5,7 @@ Written 1996 by Gero Kuhlmann <gero@gkminix.han.de>
Updated 1997 by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
Updated 2006 by Nico Schottelius <nico-kernel-nfsroot@schottelius.org>
Updated 2006 by Horms <horms@verge.net.au>
+Updated 2018 by Chris Novakovic <chris@chrisn.me.uk>
@@ -79,7 +80,7 @@ nfsroot=[<server-ip>:]<root-dir>[,<nfs-options>]
ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
- <dns0-ip>:<dns1-ip>
+ <dns0-ip>:<dns1-ip>:<ntp0-ip>
This parameter tells the kernel how to configure IP addresses of devices
and also how to set up the IP routing table. It was originally called
@@ -110,6 +111,9 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
will not be triggered if it is missing and NFS root is not
in operation.
+ Value is exported to /proc/net/pnp with the prefix "bootserver "
+ (see below).
+
Default: Determined using autoconfiguration.
The address of the autoconfiguration server is used.
@@ -123,10 +127,13 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
Default: Determined using autoconfiguration.
- <hostname> Name of the client. May be supplied by autoconfiguration,
- but its absence will not trigger autoconfiguration.
- If specified and DHCP is used, the user provided hostname will
- be carried in the DHCP request to hopefully update DNS record.
+ <hostname> Name of the client. If a '.' character is present, anything
+ before the first '.' is used as the client's hostname, and anything
+ after it is used as its NIS domain name. May be supplied by
+ autoconfiguration, but its absence will not trigger autoconfiguration.
+ If specified and DHCP is used, the user-provided hostname (and NIS
+ domain name, if present) will be carried in the DHCP request; this
+ may cause a DNS record to be created or updated for the client.
Default: Client IP address is used in ASCII notation.
@@ -162,12 +169,55 @@ ip=<client-ip>:<server-ip>:<gw-ip>:<netmask>:<hostname>:<device>:<autoconf>:
Default: any
- <dns0-ip> IP address of first nameserver.
- Value gets exported by /proc/net/pnp which is often linked
- on embedded systems by /etc/resolv.conf.
+ <dns0-ip> IP address of primary nameserver.
+ Value is exported to /proc/net/pnp with the prefix "nameserver "
+ (see below).
+
+ Default: None if not using autoconfiguration; determined
+ automatically if using autoconfiguration.
+
+ <dns1-ip> IP address of secondary nameserver.
+ See <dns0-ip>.
+
+ <ntp0-ip> IP address of a Network Time Protocol (NTP) server.
+ Value is exported to /proc/net/ipconfig/ntp_servers, but is
+ otherwise unused (see below).
+
+ Default: None if not using autoconfiguration; determined
+ automatically if using autoconfiguration.
+
+ After configuration (whether manual or automatic) is complete, two files
+ are created in the following format; lines are omitted if their respective
+ value is empty following configuration:
+
+ - /proc/net/pnp:
+
+ #PROTO: <DHCP|BOOTP|RARP|MANUAL> (depending on configuration method)
+ domain <dns-domain> (if autoconfigured, the DNS domain)
+ nameserver <dns0-ip> (primary name server IP)
+ nameserver <dns1-ip> (secondary name server IP)
+ nameserver <dns2-ip> (tertiary name server IP)
+ bootserver <server-ip> (NFS server IP)
+
+ - /proc/net/ipconfig/ntp_servers:
+
+ <ntp0-ip> (NTP server IP)
+ <ntp1-ip> (NTP server IP)
+ <ntp2-ip> (NTP server IP)
+
+ <dns-domain> and <dns2-ip> (in /proc/net/pnp) and <ntp1-ip> and <ntp2-ip>
+ (in /proc/net/ipconfig/ntp_servers) are requested during autoconfiguration;
+ they cannot be specified as part of the "ip=" kernel command line parameter.
+
+ Because the "domain" and "nameserver" options are recognised by DNS
+ resolvers, /etc/resolv.conf is often linked to /proc/net/pnp on systems
+ that use an NFS root filesystem.
- <dns1-ip> IP address of second nameserver.
- Same as above.
+ Note that the kernel will not synchronise the system time with any NTP
+ servers it discovers; this is the responsibility of a user space process
+ (e.g. an initrd/initramfs script that passes the IP addresses listed in
+ /proc/net/ipconfig/ntp_servers to an NTP client before mounting the real
+ root filesystem if it is on NFS).
nfsrootdebug
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2a84bb334894..520f6a84cf50 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -515,7 +515,8 @@ guarantees:
The /proc/PID/clear_refs is used to reset the PG_Referenced and ACCESSED/YOUNG
bits on both physical and virtual pages associated with a process, and the
-soft-dirty bit on pte (see Documentation/vm/soft-dirty.txt for details).
+soft-dirty bit on pte (see Documentation/admin-guide/mm/soft-dirty.rst
+for details).
To clear the bits for all the pages associated with the process
> echo 1 > /proc/PID/clear_refs
@@ -536,7 +537,8 @@ Any other value written to /proc/PID/clear_refs will have no effect.
The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags
using /proc/kpageflags and number of times a page is mapped using
-/proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt.
+/proc/kpagecount. For detailed explanation, see
+Documentation/admin-guide/mm/pagemap.rst.
The /proc/pid/numa_maps is an extension based on maps, showing the memory
locality and binding policy, as well as the memory usage (in pages) of
@@ -564,7 +566,7 @@ address policy mapping details
Where:
"address" is the starting address for the mapping;
-"policy" reports the NUMA memory policy set for the mapping (see vm/numa_memory_policy.txt);
+"policy" reports the NUMA memory policy set for the mapping (see Documentation/admin-guide/mm/numa_memory_policy.rst);
"mapping details" summarizes mapping data such as mapping type, page usage counters,
node locality page counters (N0 == node0, N1 == node1, ...) and the kernel page
size, in KB, that is backing the mapping up.
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index a85355cf85f4..d06e9a59a9f4 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -105,8 +105,9 @@ policy for the file will revert to "default" policy.
NUMA memory allocation policies have optional flags that can be used in
conjunction with their modes. These optional flags can be specified
when tmpfs is mounted by appending them to the mode before the NodeList.
-See Documentation/vm/numa_memory_policy.txt for a list of all available
-memory allocation policy mode flags and their effect on memory policy.
+See Documentation/admin-guide/mm/numa_memory_policy.rst for a list of
+all available memory allocation policy mode flags and their effect on
+memory policy.
=static is equivalent to MPOL_F_STATIC_NODES
=relative is equivalent to MPOL_F_RELATIVE_NODES
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5fd325df59e2..829a7b7857a4 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -856,7 +856,9 @@ struct file_operations {
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
- unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ __poll_t (*poll) (struct file *, struct poll_table_struct *);
+ struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+ __poll_t (*poll_mask) (struct file *, __poll_t);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
@@ -901,6 +903,17 @@ otherwise noted.
activity on this file and (optionally) go to sleep until there
is activity. Called by the select(2) and poll(2) system calls
+ get_poll_head: Returns the struct wait_queue_head that callers can
+ wait on. Callers need to check the returned events using ->poll_mask
+ once woken. Can return NULL to indicate polling is not supported,
+ or any error code using the ERR_PTR convention to indicate that a
+ grave error occured and ->poll_mask shall not be called.
+
+ poll_mask: return the mask of EPOLL* values describing the file descriptor
+ state. Called either before going to sleep on the waitqueue returned by
+ get_poll_head, or after it has been woken. If ->get_poll_head and
+ ->poll_mask are implemented ->poll does not need to be implement.
+
unlocked_ioctl: called by the ioctl(2) system call.
compat_ioctl: called by the ioctl(2) system call when 32 bit system calls