Diffstat (limited to 'fs/ext4/extents_status.c')
-rw-r--r--  fs/ext4/extents_status.c  537
1 file changed, 312 insertions(+), 225 deletions(-)
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 9b5b8951afb4..e04fbf10fe4f 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -120,9 +120,40 @@
* memory. Hence, we will reclaim written/unwritten/hole extents from
* the tree under a heavy memory pressure.
*
+ * ==========================================================================
+ * 3. Assurance of Ext4 extent status tree consistency
+ *
+ * When mapping blocks, Ext4 queries the extent status tree first and should
+ * always trust that the extent status tree is consistent and up to date.
+ * Therefore, it is important to adhere to the following rules when creating,
+ * modifying and removing extents.
+ *
+ * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings,
+ * the extent information should always be processed through the extent
+ * status tree instead of being organized manually through the on-disk
+ * extent tree.
+ *
+ * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
+ * exclusively and update the extent status tree atomically. If the extents
+ * to be modified are large enough to exceed the range that a single
+ * i_data_sem can process (as ext4_datasem_ensure_credits() may drop
+ * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole()
+ * does):
+ *
+ * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against page faults, as well as reads and writes that may
+ * concurrently modify the extent status tree.
+ * b) Evict all page cache in the affected range and recommend rebuilding
+ * or dropping the extent status tree after modifying the on-disk
+ * extent tree. This ensures exclusion against concurrent writebacks
+ * that do not hold those locks but only hold a folio lock.
+ *
+ * 3. Based on the rules above, when querying block mappings, Ext4 should at
+ * least hold the i_rwsem or invalidate_lock or folio lock(s) for the
+ * specified querying range.
*
* ==========================================================================
- * 3. Performance analysis
+ * 4. Performance analysis
*
* -- overhead
* 1. There is a cache extent for write access, so if writes are
@@ -134,7 +165,7 @@
*
*
* ==========================================================================
- * 4. TODO list
+ * 5. TODO list
*
* -- Refactor delayed space reservation
*
@@ -152,8 +183,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc);
int __init ext4_init_es(void)
{
@@ -203,6 +235,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
return es->es_lblk + es->es_len - 1;
}
+static inline void ext4_es_inc_seq(struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1);
+}
+
/*
* search through the tree for a delayed extent with a given offset. If
* it can't be found, try to find the next extent.
@@ -309,6 +348,8 @@ void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
+ es->es_lblk = es->es_len = es->es_pblk = 0;
+
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -448,6 +489,19 @@ static void ext4_es_list_del(struct inode *inode)
spin_unlock(&sbi->s_es_lock);
}
+static inline struct pending_reservation *__alloc_pending(bool nofail)
+{
+ if (!nofail)
+ return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
+
+ return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static inline void __free_pending(struct pending_reservation *pr)
+{
+ kmem_cache_free(ext4_pending_cachep, pr);
+}
+
/*
* Returns true if we cannot fail to allocate memory for this extent_status
* entry and cannot reclaim it until its status changes.
@@ -542,8 +596,8 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
if (ext4_es_is_hole(es1))
return 1;
- /* we need to check delayed extent is without unwritten status */
- if (ext4_es_is_delayed(es1) && !ext4_es_is_unwritten(es1))
+ /* we need to check delayed extent */
+ if (ext4_es_is_delayed(es1))
return 1;
return 0;
@@ -832,74 +886,113 @@ out:
*/
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status)
+ unsigned int status, bool delalloc_reserve_used)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
- int err1 = 0;
- int err2 = 0;
+ int err1 = 0, err2 = 0, err3 = 0;
+ int resv_used = 0, pending = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
+ struct pending_reservation *pr = NULL;
+ bool revise_pending = false;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
- lblk, len, pblk, status, inode->i_ino);
+ es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino);
if (!len)
return;
BUG_ON(end < lblk);
-
- if ((status & EXTENT_STATUS_DELAYED) &&
- (status & EXTENT_STATUS_WRITTEN)) {
- ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as "
- " delayed and written which can potentially "
- " cause data loss.", lblk, len);
- WARN_ON(1);
- }
+ WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);
newes.es_lblk = lblk;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, pblk, status);
- trace_ext4_es_insert_extent(inode, &newes);
ext4_es_insert_extent_check(inode, &newes);
+ revise_pending = sbi->s_cluster_ratio > 1 &&
+ test_opt(inode->i_sb, DELALLOC) &&
+ (status & (EXTENT_STATUS_WRITTEN |
+ EXTENT_STATUS_UNWRITTEN));
retry:
if (err1 && !es1)
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
+ if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
+ pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1);
if (err1 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es1) {
+ if (!es1->es_len)
+ __es_free_extent(es1);
+ es1 = NULL;
+ }
err2 = __es_insert_extent(inode, &newes, es2);
if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
err2 = 0;
if (err2 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es2) {
+ if (!es2->es_len)
+ __es_free_extent(es2);
+ es2 = NULL;
+ }
- if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
- (status & EXTENT_STATUS_WRITTEN ||
- status & EXTENT_STATUS_UNWRITTEN))
- __revise_pending(inode, lblk, len);
-
- /* es is pre-allocated but not used, free it. */
- if (es1 && !es1->es_len)
- __es_free_extent(es1);
- if (es2 && !es2->es_len)
- __es_free_extent(es2);
+ if (revise_pending) {
+ err3 = __revise_pending(inode, lblk, len, &pr);
+ if (err3 < 0)
+ goto error;
+ if (pr) {
+ __free_pending(pr);
+ pr = NULL;
+ }
+ pending = err3;
+ }
+ /*
+ * TODO: For extents that merely cache on-disk mappings, there is no
+ * need to increment the sequence counter; this is left as a future
+ * optimization.
+ */
+ ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2)
+ /*
+ * Reduce the reserved cluster count to reflect successful deferred
+ * allocation of delayed allocated clusters or direct allocation of
+ * clusters discovered to be delayed allocated. Once allocated, a
+ * cluster is not included in the reserved count.
+ *
+ * When direct allocating (from fallocate, filemap, DIO, or clusters
+ * allocated when delalloc has been disabled by ext4_nonda_switch())
+ * an extent either 1) contains delayed blocks but starts with
+ * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed
+ * allocated blocks which belong to delayed allocated clusters when
+ * bigalloc feature is enabled, quota has already been claimed by
+ * ext4_mb_new_blocks(), so release the quota reservations made for
+ * any previously delayed allocated clusters instead of claiming them
+ * again.
+ */
+ resv_used += pending;
+ if (resv_used)
+ ext4_da_update_reserve_space(inode, resv_used,
+ delalloc_reserve_used);
+
+ if (err1 || err2 || err3 < 0)
goto retry;
+ trace_ext4_es_insert_extent(inode, &newes);
ext4_es_print_tree(inode);
return;
}
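
The new delalloc_reserve_used argument lets a caller state whether the inserted blocks consumed a delalloc reservation, which the accounting above forwards to ext4_da_update_reserve_space(). A hedged sketch of a mapping-path caller follows; deriving the flag from EXT4_GET_BLOCKS_DELALLOC_RESERVE is an assumption about the callers and is not shown in this hunk:

	/* Sketch: record a freshly allocated mapping in the status tree. */
	ext4_es_insert_extent(inode, map->m_lblk, map->m_len, newblock,
			      EXTENT_STATUS_WRITTEN,
			      flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
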
@@ -946,8 +1039,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
* Return: 1 on found, 0 on not
*/
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t *next_lblk,
- struct extent_status *es)
+ ext4_lblk_t *next_lblk, struct extent_status *es,
+ u64 *pseq)
{
struct ext4_es_tree *tree;
struct ext4_es_stats *stats;
@@ -1006,6 +1099,8 @@ out:
} else
*next_lblk = 0;
}
+ if (pseq)
+ *pseq = EXT4_I(inode)->i_es_seq;
} else {
percpu_counter_inc(&stats->es_stats_cache_misses);
}
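
Together, ext4_es_inc_seq() and the new pseq out-parameter give lookups a cheap staleness check: sample the counter while the cached mapping is returned, drop i_es_lock, and re-read the counter later to detect a concurrent update of the tree. A hedged sketch of how a caller might use it; the retry policy around it is an assumption, not part of this patch:

	struct extent_status es;
	u64 seq;

	if (!ext4_es_lookup_extent(inode, lblk, NULL, &es, &seq))
		return -EAGAIN;		/* not cached, take the slow path */

	/* ... use the mapping without holding i_es_lock ... */

	if (READ_ONCE(EXT4_I(inode)->i_es_seq) != seq)
		return -EAGAIN;		/* tree changed underneath us, revalidate */
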
@@ -1017,7 +1112,7 @@ out:
}
struct rsvd_count {
- int ndelonly;
+ int ndelayed;
bool first_do_lblk_found;
ext4_lblk_t first_do_lblk;
ext4_lblk_t last_do_lblk;
@@ -1043,10 +1138,10 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct rb_node *node;
- rc->ndelonly = 0;
+ rc->ndelayed = 0;
/*
- * for bigalloc, note the first delonly block in the range has not
+ * for bigalloc, note the first delayed block in the range has not
* been found, record the extent containing the block to the left of
* the region to be removed, if any, and note that there's no partial
* cluster to track
@@ -1066,9 +1161,8 @@ static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
}
/*
- * count_rsvd - count the clusters containing delayed and not unwritten
- * (delonly) blocks in a range within an extent and add to
- * the running tally in rsvd_count
+ * count_rsvd - count the clusters containing delayed blocks in a range
+ * within an extent and add to the running tally in rsvd_count
*
* @inode - file containing extent
* @lblk - first block in range
@@ -1085,13 +1179,13 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t i, end, nclu;
- if (!ext4_es_is_delonly(es))
+ if (!ext4_es_is_delayed(es))
return;
WARN_ON(len <= 0);
if (sbi->s_cluster_ratio == 1) {
- rc->ndelonly += (int) len;
+ rc->ndelayed += (int) len;
return;
}
@@ -1101,7 +1195,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
end = lblk + (ext4_lblk_t) len - 1;
end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;
- /* record the first block of the first delonly extent seen */
+ /* record the first block of the first delayed extent seen */
if (!rc->first_do_lblk_found) {
rc->first_do_lblk = i;
rc->first_do_lblk_found = true;
@@ -1115,7 +1209,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
* doesn't start with it, count it and stop tracking
*/
if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
- rc->ndelonly++;
+ rc->ndelayed++;
rc->partial = false;
}
@@ -1125,7 +1219,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
*/
if (EXT4_LBLK_COFF(sbi, i) != 0) {
if (end >= EXT4_LBLK_CFILL(sbi, i)) {
- rc->ndelonly++;
+ rc->ndelayed++;
rc->partial = false;
i = EXT4_LBLK_CFILL(sbi, i) + 1;
}
@@ -1133,11 +1227,11 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
/*
* if the current cluster starts on a cluster boundary, count the
- * number of whole delonly clusters in the extent
+ * number of whole delayed clusters in the extent
*/
if ((i + sbi->s_cluster_ratio - 1) <= end) {
nclu = (end - i + 1) >> sbi->s_cluster_bits;
- rc->ndelonly += nclu;
+ rc->ndelayed += nclu;
i += nclu << sbi->s_cluster_bits;
}
@@ -1197,10 +1291,9 @@ static struct pending_reservation *__pr_tree_search(struct rb_root *root,
* @rc - pointer to reserved count data
*
* The number of reservations to be released is equal to the number of
- * clusters containing delayed and not unwritten (delonly) blocks within
- * the range, minus the number of clusters still containing delonly blocks
- * at the ends of the range, and minus the number of pending reservations
- * within the range.
+ * clusters containing delayed blocks within the range, minus the number of
+ * clusters still containing delayed blocks at the ends of the range, and
+ * minus the number of pending reservations within the range.
*/
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct extent_status *right_es,
@@ -1211,33 +1304,33 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
struct rb_node *node;
ext4_lblk_t first_lclu, last_lclu;
- bool left_delonly, right_delonly, count_pending;
+ bool left_delayed, right_delayed, count_pending;
struct extent_status *es;
if (sbi->s_cluster_ratio > 1) {
/* count any remaining partial cluster */
if (rc->partial)
- rc->ndelonly++;
+ rc->ndelayed++;
- if (rc->ndelonly == 0)
+ if (rc->ndelayed == 0)
return 0;
first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);
/*
- * decrease the delonly count by the number of clusters at the
- * ends of the range that still contain delonly blocks -
+ * decrease the delayed count by the number of clusters at the
+ * ends of the range that still contain delayed blocks -
* these clusters still need to be reserved
*/
- left_delonly = right_delonly = false;
+ left_delayed = right_delayed = false;
es = rc->left_es;
while (es && ext4_es_end(es) >=
EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
- left_delonly = true;
+ if (ext4_es_is_delayed(es)) {
+ rc->ndelayed--;
+ left_delayed = true;
break;
}
node = rb_prev(&es->rb_node);
@@ -1245,7 +1338,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
break;
es = rb_entry(node, struct extent_status, rb_node);
}
- if (right_es && (!left_delonly || first_lclu != last_lclu)) {
+ if (right_es && (!left_delayed || first_lclu != last_lclu)) {
if (end < ext4_es_end(right_es)) {
es = right_es;
} else {
@@ -1255,9 +1348,9 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
while (es && es->es_lblk <=
EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
- if (ext4_es_is_delonly(es)) {
- rc->ndelonly--;
- right_delonly = true;
+ if (ext4_es_is_delayed(es)) {
+ rc->ndelayed--;
+ right_delayed = true;
break;
}
node = rb_next(&es->rb_node);
@@ -1271,21 +1364,21 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* Determine the block range that should be searched for
* pending reservations, if any. Clusters on the ends of the
- * original removed range containing delonly blocks are
+ * original removed range containing delayed blocks are
* excluded. They've already been accounted for and it's not
* possible to determine if an associated pending reservation
* should be released with the information available in the
* extents status tree.
*/
if (first_lclu == last_lclu) {
- if (left_delonly | right_delonly)
+ if (left_delayed | right_delayed)
count_pending = false;
else
count_pending = true;
} else {
- if (left_delonly)
+ if (left_delayed)
first_lclu++;
- if (right_delonly)
+ if (right_delayed)
last_lclu--;
if (first_lclu <= last_lclu)
count_pending = true;
@@ -1296,16 +1389,16 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
/*
* a pending reservation found between first_lclu and last_lclu
* represents an allocated cluster that contained at least one
- * delonly block, so the delonly total must be reduced by one
+ * delayed block, so the delayed total must be reduced by one
* for each pending reservation found and released
*/
if (count_pending) {
pr = __pr_tree_search(&tree->root, first_lclu);
while (pr && pr->lclu <= last_lclu) {
- rc->ndelonly--;
+ rc->ndelayed--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
if (!node)
break;
pr = rb_entry(node, struct pending_reservation,
@@ -1313,7 +1406,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
}
}
}
- return rc->ndelonly;
+ return rc->ndelayed;
}
@@ -1399,8 +1492,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
}
}
if (count_reserved)
- count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
- &orig_es, &rc);
+ count_rsvd(inode, orig_es.es_lblk + len1,
+ orig_es.es_len - len1 - len2, &orig_es, &rc);
goto out_get_reserved;
}
@@ -1471,7 +1564,6 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
@@ -1491,15 +1583,23 @@ retry:
*/
write_lock(&EXT4_I(inode)->i_es_lock);
err = __es_remove_extent(inode, lblk, end, &reserved, es);
- if (es && !es->es_len)
- __es_free_extent(es);
+ if (err)
+ goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es) {
+ if (!es->es_len)
+ __es_free_extent(es);
+ es = NULL;
+ }
+ ext4_es_inc_seq(inode);
+error:
write_unlock(&EXT4_I(inode)->i_es_lock);
if (err)
goto retry;
+ trace_ext4_es_remove_extent(inode, lblk, len);
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
- return;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
@@ -1596,7 +1696,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
unsigned long nr;
struct ext4_sb_info *sbi;
- sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+ sbi = shrink->private_data;
nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt);
trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
@@ -1605,8 +1705,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
static unsigned long ext4_es_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
- struct ext4_sb_info *sbi = container_of(shrink,
- struct ext4_sb_info, s_es_shrinker);
+ struct ext4_sb_info *sbi = shrink->private_data;
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
@@ -1690,13 +1789,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err3;
- sbi->s_es_shrinker.scan_objects = ext4_es_scan;
- sbi->s_es_shrinker.count_objects = ext4_es_count;
- sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
- err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s",
- sbi->s_sb->s_id);
- if (err)
+ sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
+ if (!sbi->s_es_shrinker) {
+ err = -ENOMEM;
goto err4;
+ }
+
+ sbi->s_es_shrinker->scan_objects = ext4_es_scan;
+ sbi->s_es_shrinker->count_objects = ext4_es_count;
+ sbi->s_es_shrinker->private_data = sbi;
+
+ shrinker_register(sbi->s_es_shrinker);
return 0;
err4:
@@ -1716,7 +1819,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
- unregister_shrinker(&sbi->s_es_shrinker);
+ shrinker_free(sbi->s_es_shrinker);
}
/*
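
The shrinker hunks are the standard conversion to the dynamically allocated shrinker API (shrinker_alloc()/shrinker_register()/shrinker_free() replacing register_shrinker()/unregister_shrinker()), with private_data taking over from the container_of() lookup. The generic shape, shown only for orientation and not as ext4-specific code:

	struct shrinker *s;

	s = shrinker_alloc(0, "ext4-es:%s", sb->s_id);	/* seeks defaults to DEFAULT_SEEKS */
	if (!s)
		return -ENOMEM;

	s->count_objects = ext4_es_count;	/* report the reclaimable object count */
	s->scan_objects  = ext4_es_scan;	/* reclaim up to sc->nr_to_scan objects */
	s->private_data  = sbi;			/* later read back via shrink->private_data */
	shrinker_register(s);			/* shrinker becomes visible to reclaim */

	/* ... and on unmount/teardown ... */
	shrinker_free(s);
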
@@ -1897,11 +2000,13 @@ static struct pending_reservation *__get_pending(struct inode *inode,
*
* @inode - file containing the cluster
* @lblk - logical block in the cluster to be added
+ * @prealloc - preallocated pending entry
*
- * Returns 0 on successful insertion and -ENOMEM on failure. If the
+ * Returns 1 on successful insertion and -ENOMEM on failure. If the
* pending reservation is already in the set, returns successfully.
*/
-static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
+static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
@@ -1927,15 +2032,21 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
}
}
- pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
- if (pr == NULL) {
- ret = -ENOMEM;
- goto out;
+ if (likely(*prealloc == NULL)) {
+ pr = __alloc_pending(false);
+ if (!pr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else {
+ pr = *prealloc;
+ *prealloc = NULL;
}
pr->lclu = lclu;
rb_link_node(&pr->rb_node, parent, p);
rb_insert_color(&pr->rb_node, &tree->root);
+ ret = 1;
out:
return ret;
@@ -1960,7 +2071,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
if (pr != NULL) {
tree = &EXT4_I(inode)->i_pending_tree;
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
}
}
@@ -2006,34 +2117,47 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
}
/*
- * ext4_es_insert_delayed_block - adds a delayed block to the extents status
- * tree, adding a pending reservation where
- * needed
+ * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
+ * status tree, adding a pending reservation
+ * where needed
*
* @inode - file containing the newly added block
- * @lblk - logical block to be added
- * @allocated - indicates whether a physical cluster has been allocated for
- * the logical cluster that contains the block
+ * @lblk - start logical block to be added
+ * @len - length of blocks to be added
+ * @lclu_allocated/end_allocated - indicates whether a physical cluster has
+ * been allocated for the logical cluster
+ * that contains the start/end block. Note that
+ * end_allocated should always be set to false
+ * if the start and the end block are in the
+ * same cluster
*/
-void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated)
+void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, bool lclu_allocated,
+ bool end_allocated)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status newes;
- int err1 = 0;
- int err2 = 0;
+ ext4_lblk_t end = lblk + len - 1;
+ int err1 = 0, err2 = 0, err3 = 0;
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
+ struct pending_reservation *pr1 = NULL;
+ struct pending_reservation *pr2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
- lblk, inode->i_ino);
+ es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
+ lblk, len, inode->i_ino);
+ if (!len)
+ return;
+
+ WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
+ end_allocated);
newes.es_lblk = lblk;
- newes.es_len = 1;
+ newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
- trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
ext4_es_insert_extent_check(inode, &newes);
@@ -2042,123 +2166,66 @@ retry:
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
+ if (err1 || err2 || err3 < 0) {
+ if (lclu_allocated && !pr1)
+ pr1 = __alloc_pending(true);
+ if (end_allocated && !pr2)
+ pr2 = __alloc_pending(true);
+ }
write_lock(&EXT4_I(inode)->i_es_lock);
- err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
if (err1 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es1) {
+ if (!es1->es_len)
+ __es_free_extent(es1);
+ es1 = NULL;
+ }
err2 = __es_insert_extent(inode, &newes, es2);
if (err2 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es2) {
+ if (!es2->es_len)
+ __es_free_extent(es2);
+ es2 = NULL;
+ }
- if (allocated)
- __insert_pending(inode, lblk);
-
- /* es is pre-allocated but not used, free it. */
- if (es1 && !es1->es_len)
- __es_free_extent(es1);
- if (es2 && !es2->es_len)
- __es_free_extent(es2);
+ if (lclu_allocated) {
+ err3 = __insert_pending(inode, lblk, &pr1);
+ if (err3 < 0)
+ goto error;
+ if (pr1) {
+ __free_pending(pr1);
+ pr1 = NULL;
+ }
+ }
+ if (end_allocated) {
+ err3 = __insert_pending(inode, end, &pr2);
+ if (err3 < 0)
+ goto error;
+ if (pr2) {
+ __free_pending(pr2);
+ pr2 = NULL;
+ }
+ }
+ ext4_es_inc_seq(inode);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
- if (err1 || err2)
+ if (err1 || err2 || err3 < 0)
goto retry;
+ trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
+ end_allocated);
ext4_es_print_tree(inode);
ext4_print_pending_tree(inode);
return;
}
/*
- * __es_delayed_clu - count number of clusters containing blocks that
- * are delayed only
- *
- * @inode - file containing block range
- * @start - logical block defining start of range
- * @end - logical block defining end of range
- *
- * Returns the number of clusters containing only delayed (not delayed
- * and unwritten) blocks in the range specified by @start and @end. Any
- * cluster or part of a cluster within the range and containing a delayed
- * and not unwritten block within the range is counted as a whole cluster.
- */
-static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
- ext4_lblk_t end)
-{
- struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
- struct extent_status *es;
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- struct rb_node *node;
- ext4_lblk_t first_lclu, last_lclu;
- unsigned long long last_counted_lclu;
- unsigned int n = 0;
-
- /* guaranteed to be unequal to any ext4_lblk_t value */
- last_counted_lclu = ~0ULL;
-
- es = __es_tree_search(&tree->root, start);
-
- while (es && (es->es_lblk <= end)) {
- if (ext4_es_is_delonly(es)) {
- if (es->es_lblk <= start)
- first_lclu = EXT4_B2C(sbi, start);
- else
- first_lclu = EXT4_B2C(sbi, es->es_lblk);
-
- if (ext4_es_end(es) >= end)
- last_lclu = EXT4_B2C(sbi, end);
- else
- last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
-
- if (first_lclu == last_counted_lclu)
- n += last_lclu - first_lclu;
- else
- n += last_lclu - first_lclu + 1;
- last_counted_lclu = last_lclu;
- }
- node = rb_next(&es->rb_node);
- if (!node)
- break;
- es = rb_entry(node, struct extent_status, rb_node);
- }
-
- return n;
-}
-
-/*
- * ext4_es_delayed_clu - count number of clusters containing blocks that
- * are both delayed and unwritten
- *
- * @inode - file containing block range
- * @lblk - logical block defining start of range
- * @len - number of blocks in range
- *
- * Locking for external use of __es_delayed_clu().
- */
-unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
- ext4_lblk_t end;
- unsigned int n;
-
- if (len == 0)
- return 0;
-
- end = lblk + len - 1;
- WARN_ON(end < lblk);
-
- read_lock(&ei->i_es_lock);
-
- n = __es_delayed_clu(inode, lblk, end);
-
- read_unlock(&ei->i_es_lock);
-
- return n;
-}
-
-/*
* __revise_pending - makes, cancels, or leaves unchanged pending cluster
* reservations for a specified block range depending
* upon the presence or absence of delayed blocks
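
To make the lclu_allocated/end_allocated pair in the ext4_es_insert_delayed_extent() hunk above concrete, here is a worked bigalloc example with a hypothetical caller (the concrete values and the error-ignoring use of ext4_clu_mapped() are illustrative only): with s_cluster_ratio = 16, the delayed range [8, 40) touches clusters 0..2; pending reservations are only needed for the partially covered end clusters 0 and 2, and only if those clusters already contain allocated blocks.

	/* Hypothetical caller, bigalloc with 16 blocks per cluster. */
	ext4_lblk_t lblk = 8, len = 32;		/* blocks [8, 40) -> clusters 0..2 */
	ext4_lblk_t end = lblk + len - 1;
	bool lclu_allocated = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)) > 0;
	bool end_allocated = ext4_clu_mapped(inode, EXT4_B2C(sbi, end)) > 0;

	if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end))
		end_allocated = false;		/* same cluster: the start flag is enough */

	ext4_es_insert_delayed_extent(inode, lblk, len, lclu_allocated, end_allocated);
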
@@ -2168,21 +2235,27 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
* @inode - file containing the range
* @lblk - logical block defining the start of range
* @len - length of range in blocks
+ * @prealloc - preallocated pending entry
*
* Used after a newly allocated extent is added to the extents status tree.
* Requires that the extents in the range have either written or unwritten
- * status. Must be called while holding i_es_lock.
+ * status. Must be called while holding i_es_lock. Returns the number of
+ * pending cluster reservations that were newly inserted, 0 if reservations
+ * were only removed or left unchanged, and -ENOMEM on failure.
*/
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last;
bool f_del = false, l_del = false;
+ int pendings = 0;
+ int ret = 0;
if (len == 0)
- return;
+ return 0;
/*
* Two cases - block range within single cluster and block range
@@ -2200,39 +2273,53 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
if (f_del) {
- __insert_pending(inode, first);
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
+ pendings += ret;
} else {
last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1;
if (last != end)
l_del = __es_scan_range(inode,
- &ext4_es_is_delonly,
+ &ext4_es_is_delayed,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ pendings += ret;
+ } else
__remove_pending(inode, last);
}
} else {
first = EXT4_LBLK_CMASK(sbi, lblk);
if (first != lblk)
- f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ f_del = __es_scan_range(inode, &ext4_es_is_delayed,
first, lblk - 1);
- if (f_del)
- __insert_pending(inode, first);
- else
+ if (f_del) {
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
+ pendings += ret;
+ } else
__remove_pending(inode, first);
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
if (last != end)
- l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+ l_del = __es_scan_range(inode, &ext4_es_is_delayed,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ pendings += ret;
+ } else
__remove_pending(inode, last);
}
+out:
+ return (ret < 0) ? ret : pendings;
}