diff options
Diffstat (limited to 'fs/ocfs2/cluster')
| -rw-r--r-- | fs/ocfs2/cluster/Makefile | 3 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/heartbeat.c | 1088 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/heartbeat.h | 28 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/masklog.c | 73 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/masklog.h | 79 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/netdebug.c | 187 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/nodemanager.c | 430 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/nodemanager.h | 21 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/ocfs2_heartbeat.h | 20 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/ocfs2_nodemanager.h | 21 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/quorum.c | 73 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/quorum.h | 21 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/sys.c | 23 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/sys.h | 21 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/tcp.c | 431 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/tcp.h | 29 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/tcp_internal.h | 36 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/ver.c | 42 | ||||
| -rw-r--r-- | fs/ocfs2/cluster/ver.h | 31 |
19 files changed, 1050 insertions, 1607 deletions
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile index bc8c5e7d8608..0e5b73215819 100644 --- a/fs/ocfs2/cluster/Makefile +++ b/fs/ocfs2/cluster/Makefile @@ -1,4 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ - quorum.o tcp.o netdebug.o ver.o + quorum.o tcp.o netdebug.o diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 5c1c864e81cc..724350925aff 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1,24 +1,9 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ +#include "linux/kstrtox.h" #include <linux/kernel.h> #include <linux/sched.h> #include <linux/jiffies.h> @@ -35,7 +20,8 @@ #include <linux/time.h> #include <linux/debugfs.h> #include <linux/slab.h> - +#include <linux/bitmap.h> +#include <linux/ktime.h> #include "heartbeat.h" #include "tcp.h" #include "nodemanager.h" @@ -105,10 +91,6 @@ static struct o2hb_debug_buf *o2hb_db_failedregions; #define O2HB_DEBUG_REGION_PINNED "pinned" static struct dentry *o2hb_debug_dir; -static struct dentry *o2hb_debug_livenodes; -static struct dentry *o2hb_debug_liveregions; -static struct dentry *o2hb_debug_quorumregions; -static struct dentry *o2hb_debug_failedregions; static LIST_HEAD(o2hb_all_regions); @@ -118,21 +100,19 @@ static struct o2hb_callback { static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type); -#define O2HB_DEFAULT_BLOCK_BITS 9 - enum o2hb_heartbeat_modes { O2HB_HEARTBEAT_LOCAL = 0, O2HB_HEARTBEAT_GLOBAL, O2HB_HEARTBEAT_NUM_MODES, }; -char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { - "local", /* O2HB_HEARTBEAT_LOCAL */ - "global", /* O2HB_HEARTBEAT_GLOBAL */ +static const char *o2hb_heartbeat_mode_desc[O2HB_HEARTBEAT_NUM_MODES] = { + "local", /* O2HB_HEARTBEAT_LOCAL */ + "global", /* O2HB_HEARTBEAT_GLOBAL */ }; unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD; -unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; +static unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; /* * o2hb_dependent_users tracks the number of registered callbacks that depend @@ -140,7 +120,7 @@ unsigned int o2hb_heartbeat_mode = O2HB_HEARTBEAT_LOCAL; * However only o2dlm depends on the heartbeat. It does not want the heartbeat * to stop while a dlm domain is still active. */ -unsigned int o2hb_dependent_users; +static unsigned int o2hb_dependent_users; /* * In global heartbeat mode, all regions are pinned if there are one or more @@ -218,7 +198,8 @@ struct o2hb_region { unsigned hr_unclean_stop:1, hr_aborted_start:1, hr_item_pinned:1, - hr_item_dropped:1; + hr_item_dropped:1, + hr_node_deleted:1; /* protected by the hr_callback_sem */ struct task_struct *hr_task; @@ -233,7 +214,7 @@ struct o2hb_region { unsigned int hr_num_pages; struct page **hr_slot_data; - struct block_device *hr_bdev; + struct file *hr_bdev_file; struct o2hb_disk_slot *hr_slots; /* live node map of this region */ @@ -241,10 +222,6 @@ struct o2hb_region { unsigned int hr_region_num; struct dentry *hr_debug_dir; - struct dentry *hr_debug_livenodes; - struct dentry *hr_debug_regnum; - struct dentry *hr_debug_elapsed_time; - struct dentry *hr_debug_pinned; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; struct o2hb_debug_buf *hr_db_elapsed_time; @@ -259,8 +236,6 @@ struct o2hb_region { * (hr_steady_iterations == 0) within hr_unsteady_iterations */ atomic_t hr_unsteady_iterations; - char hr_dev_name[BDEVNAME_SIZE]; - unsigned int hr_timeout_ms; /* randomized as the region goes up and down so that a node @@ -270,48 +245,65 @@ struct o2hb_region { struct delayed_work hr_write_timeout_work; unsigned long hr_last_timeout_start; + /* negotiate timer, used to negotiate extending hb timeout. */ + struct delayed_work hr_nego_timeout_work; + unsigned long hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + /* Used during o2hb_check_slot to hold a copy of the block * being checked because we temporarily have to zero out the * crc field. */ struct o2hb_disk_heartbeat_block *hr_tmp_block; + + /* Message key for negotiate timeout message. */ + unsigned int hr_key; + struct list_head hr_handler_list; + + /* last hb status, 0 for success, other value for error. */ + int hr_last_hb_status; }; +static inline struct block_device *reg_bdev(struct o2hb_region *reg) +{ + return reg->hr_bdev_file ? file_bdev(reg->hr_bdev_file) : NULL; +} + struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; int wc_error; }; -static int o2hb_pop_count(void *map, int count) -{ - int i = -1, pop = 0; +#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2) - while ((i = find_next_bit(map, count, i + 1)) < count) - pop++; - return pop; -} +enum { + O2HB_NEGO_TIMEOUT_MSG = 1, + O2HB_NEGO_APPROVE_MSG = 2, +}; + +struct o2hb_nego_msg { + u8 node_num; +}; static void o2hb_write_timeout(struct work_struct *work) { int failed, quorum; - unsigned long flags; struct o2hb_region *reg = container_of(work, struct o2hb_region, hr_write_timeout_work.work); - mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " - "milliseconds\n", reg->hr_dev_name, + mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u " + "milliseconds\n", reg_bdev(reg), jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { - spin_lock_irqsave(&o2hb_live_lock, flags); + spin_lock(&o2hb_live_lock); if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); - failed = o2hb_pop_count(&o2hb_failed_region_bitmap, + failed = bitmap_weight(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); - quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, + quorum = bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); - spin_unlock_irqrestore(&o2hb_live_lock, flags); + spin_unlock(&o2hb_live_lock); mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", quorum, failed); @@ -327,7 +319,7 @@ static void o2hb_write_timeout(struct work_struct *work) o2quo_disk_timeout(); } -static void o2hb_arm_write_timeout(struct o2hb_region *reg) +static void o2hb_arm_timeout(struct o2hb_region *reg) { /* Arm writeout only after thread reaches steady state */ if (atomic_read(®->hr_steady_iterations) != 0) @@ -342,14 +334,134 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) spin_unlock(&o2hb_live_lock); } cancel_delayed_work(®->hr_write_timeout_work); - reg->hr_last_timeout_start = jiffies; schedule_delayed_work(®->hr_write_timeout_work, msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)); + + cancel_delayed_work(®->hr_nego_timeout_work); + /* negotiate timeout must be less than write timeout. */ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS)); + bitmap_zero(reg->hr_nego_node_bitmap, O2NM_MAX_NODES); } -static void o2hb_disarm_write_timeout(struct o2hb_region *reg) +static void o2hb_disarm_timeout(struct o2hb_region *reg) { cancel_delayed_work_sync(®->hr_write_timeout_work); + cancel_delayed_work_sync(®->hr_nego_timeout_work); +} + +static int o2hb_send_nego_msg(int key, int type, u8 target) +{ + struct o2hb_nego_msg msg; + int status, ret; + + msg.node_num = o2nm_this_node(); +again: + ret = o2net_send_message(type, key, &msg, sizeof(msg), + target, &status); + + if (ret == -EAGAIN || ret == -ENOMEM) { + msleep(100); + goto again; + } + + return ret; +} + +static void o2hb_nego_timeout(struct work_struct *work) +{ + unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; + int master_node, i, ret; + struct o2hb_region *reg; + + reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work); + /* don't negotiate timeout if last hb failed since it is very + * possible io failed. Should let write timeout fence self. + */ + if (reg->hr_last_hb_status) + return; + + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); + /* lowest node as master node to make negotiate decision. */ + master_node = find_first_bit(live_node_bitmap, O2NM_MAX_NODES); + + if (master_node == o2nm_this_node()) { + if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, + config_item_name(®->hr_item), reg_bdev(reg)); + set_bit(master_node, reg->hr_nego_node_bitmap); + } + if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, + O2NM_MAX_NODES)) { + /* check negotiate bitmap every second to do timeout + * approve decision. + */ + schedule_delayed_work(®->hr_nego_timeout_work, + msecs_to_jiffies(1000)); + + return; + } + + printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n", + config_item_name(®->hr_item), + reg_bdev(reg)); + /* approve negotiate timeout request. */ + o2hb_arm_timeout(reg); + + i = -1; + while ((i = find_next_bit(live_node_bitmap, + O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { + if (i == master_node) + continue; + + mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i); + ret = o2hb_send_nego_msg(reg->hr_key, + O2HB_NEGO_APPROVE_MSG, i); + if (ret) + mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n", + i, ret); + } + } else { + /* negotiate timeout with master node. */ + printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n", + o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), + reg_bdev(reg), master_node); + ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, + master_node); + if (ret) + mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n", + master_node, ret); + } +} + +static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + struct o2hb_nego_msg *nego_msg; + + nego_msg = (struct o2hb_nego_msg *)msg->buf; + printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n", + nego_msg->node_num, config_item_name(®->hr_item), + reg_bdev(reg)); + if (nego_msg->node_num < O2NM_MAX_NODES) + set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); + else + mlog(ML_ERROR, "got nego timeout message from bad node.\n"); + + return 0; +} + +static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, + void **ret_data) +{ + struct o2hb_region *reg = data; + + printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n", + config_item_name(®->hr_item), reg_bdev(reg)); + o2hb_arm_timeout(reg); + return 0; } static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc) @@ -373,21 +485,19 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, } } -static void o2hb_wait_on_io(struct o2hb_region *reg, - struct o2hb_bio_wait_ctxt *wc) +static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc) { o2hb_bio_wait_dec(wc, 1); wait_for_completion(&wc->wc_io_complete); } -static void o2hb_bio_end_io(struct bio *bio, - int error) +static void o2hb_bio_end_io(struct bio *bio) { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) { - mlog(ML_ERROR, "IO Error %d\n", error); - wc->wc_error = error; + if (bio->bi_status) { + mlog(ML_ERROR, "IO Error %d\n", bio->bi_status); + wc->wc_error = blk_status_to_errno(bio->bi_status); } o2hb_bio_wait_dec(wc, 1); @@ -399,7 +509,7 @@ static void o2hb_bio_end_io(struct bio *bio, static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, struct o2hb_bio_wait_ctxt *wc, unsigned int *current_slot, - unsigned int max_slots) + unsigned int max_slots, blk_opf_t opf) { int len, current_page; unsigned int vec_len, vec_start; @@ -413,7 +523,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(GFP_ATOMIC, 16); + bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -421,18 +531,17 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, } /* Must put everything in 512 byte sectors for the bio... */ - bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9); - bio->bi_bdev = reg->hr_bdev; + bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; - vec_start = (cs << bits) % PAGE_CACHE_SIZE; + vec_start = (cs << bits) % PAGE_SIZE; while(cs < max_slots) { current_page = cs / spp; page = reg->hr_slot_data[current_page]; - vec_len = min(PAGE_CACHE_SIZE - vec_start, - (max_slots-cs) * (PAGE_CACHE_SIZE/spp) ); + vec_len = min(PAGE_SIZE - vec_start, + (max_slots-cs) * (PAGE_SIZE/spp) ); mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n", current_page, vec_len, vec_start); @@ -440,7 +549,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, len = bio_add_page(bio, page, vec_len, vec_start); if (len != vec_len) break; - cs += vec_len / (PAGE_CACHE_SIZE/spp); + cs += vec_len / (PAGE_SIZE/spp); vec_start = 0; } @@ -450,9 +559,10 @@ bail: } static int o2hb_read_slots(struct o2hb_region *reg, + unsigned int begin_slot, unsigned int max_slots) { - unsigned int current_slot=0; + unsigned int current_slot = begin_slot; int status; struct o2hb_bio_wait_ctxt wc; struct bio *bio; @@ -460,7 +570,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, o2hb_bio_wait_init(&wc); while(current_slot < max_slots) { - bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots); + bio = o2hb_setup_one_bio(reg, &wc, ¤t_slot, max_slots, + REQ_OP_READ); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -468,13 +579,13 @@ static int o2hb_read_slots(struct o2hb_region *reg, } atomic_inc(&wc.wc_num_reqs); - submit_bio(READ, bio); + submit_bio(bio); } status = 0; bail_and_wait: - o2hb_wait_on_io(reg, &wc); + o2hb_wait_on_io(&wc); if (wc.wc_error && !status) status = wc.wc_error; @@ -492,7 +603,8 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); - bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1); + bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, + REQ_OP_WRITE | REQ_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -500,7 +612,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, } atomic_inc(&write_wc->wc_num_reqs); - submit_bio(WRITE_SYNC, bio); + submit_bio(bio); status = 0; bail: @@ -582,8 +694,8 @@ static int o2hb_check_own_slot(struct o2hb_region *reg) else errstr = ERRSTR3; - mlog(ML_ERROR, "%s (%s): expected(%u:0x%llx, 0x%llx), " - "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_dev_name, + mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), " + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg), slot->ds_node_num, (unsigned long long)slot->ds_last_generation, (unsigned long long)slot->ds_last_time, hb_block->hb_node, (unsigned long long)le64_to_cpu(hb_block->hb_generation), @@ -606,7 +718,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg, hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; memset(hb_block, 0, reg->hr_block_bytes); /* TODO: time stuff */ - cputime = CURRENT_TIME.tv_sec; + cputime = ktime_get_real_seconds(); if (!cputime) cputime = 1; @@ -628,11 +740,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, struct o2nm_node *node, int idx) { - struct list_head *iter; struct o2hb_callback_func *f; - list_for_each(iter, &hbcall->list) { - f = list_entry(iter, struct o2hb_callback_func, hc_item); + list_for_each_entry(f, &hbcall->list, hc_item) { mlog(ML_HEARTBEAT, "calling funcs %p\n", f); (f->hc_func)(node, idx, f->hc_data); } @@ -641,16 +751,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, /* Will run the list in order until we process the passed event */ static void o2hb_run_event_list(struct o2hb_node_event *queued_event) { - int empty; struct o2hb_callback *hbcall; struct o2hb_node_event *event; - spin_lock(&o2hb_live_lock); - empty = list_empty(&queued_event->hn_item); - spin_unlock(&o2hb_live_lock); - if (empty) - return; - /* Holding callback sem assures we don't alter the callback * lists when doing this, and serializes ourselves with other * processes wanting callbacks. */ @@ -709,6 +812,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) struct o2hb_node_event event = { .hn_item = LIST_HEAD_INIT(event.hn_item), }; struct o2nm_node *node; + int queued = 0; node = o2nm_get_node_by_num(slot->ds_node_num); if (!node) @@ -726,11 +830,13 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, slot->ds_node_num); + queued = 1; } } spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); o2nm_node_put(node); } @@ -758,12 +864,12 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * live nodes heartbeat on it. In other words, the region has been * added to all nodes. */ - if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, - sizeof(o2hb_live_node_bitmap))) + if (!bitmap_equal(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, + O2NM_MAX_NODES)) goto unlock; - printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", + config_item_name(®->hr_item), reg_bdev(reg)); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); @@ -771,7 +877,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) * If global heartbeat active, unpin all regions if the * region count > CUT_OFF */ - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + if (bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) o2hb_region_unpin(NULL); unlock: @@ -790,6 +896,7 @@ static int o2hb_check_slot(struct o2hb_region *reg, unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; unsigned int slot_dead_ms; int tmp; + int queued = 0; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); @@ -820,8 +927,8 @@ static int o2hb_check_slot(struct o2hb_region *reg, /* The node is live but pushed out a bad crc. We * consider it a transient miss but don't populate any * other values as they may be junk. */ - mlog(ML_ERROR, "Node %d has written a bad crc to %s\n", - slot->ds_node_num, reg->hr_dev_name); + mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n", + slot->ds_node_num, reg_bdev(reg)); o2hb_dump_slot(hb_block); slot->ds_equal_samples++; @@ -883,6 +990,7 @@ fire_callbacks: slot->ds_node_num); changed = 1; + queued = 1; } list_add_tail(&slot->ds_live_item, @@ -899,12 +1007,12 @@ fire_callbacks: slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); if (slot_dead_ms && slot_dead_ms != dead_ms) { /* TODO: Perhaps we can fail the region here. */ - mlog(ML_ERROR, "Node %d on device %s has a dead count " + mlog(ML_ERROR, "Node %d on device %pg has a dead count " "of %u ms, but our count is %u ms.\n" "Please double check your configuration values " "for 'O2CB_HEARTBEAT_THRESHOLD'\n", - slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, - dead_ms); + slot->ds_node_num, reg_bdev(reg), + slot_dead_ms, dead_ms); } goto out; } @@ -913,7 +1021,7 @@ fire_callbacks: if (list_empty(&slot->ds_live_item)) goto out; - /* live nodes only go dead after enough consequtive missed + /* live nodes only go dead after enough consecutive missed * samples.. reset the missed counter whenever we see * activity */ if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) { @@ -934,6 +1042,7 @@ fire_callbacks: node, slot->ds_node_num); changed = 1; + queued = 1; } /* We don't clear this because the node is still @@ -949,35 +1058,27 @@ fire_callbacks: out: spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); if (node) o2nm_node_put(node); return changed; } -/* This could be faster if we just implmented a find_last_bit, but I - * don't think the circumstances warrant it. */ -static int o2hb_highest_node(unsigned long *nodes, - int numbits) +static int o2hb_highest_node(unsigned long *nodes, int numbits) { - int highest, node; - - highest = numbits; - node = -1; - while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) { - if (node >= numbits) - break; - - highest = node; - } + return find_last_bit(nodes, numbits); +} - return highest; +static int o2hb_lowest_node(unsigned long *nodes, int numbits) +{ + return find_first_bit(nodes, numbits); } static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { - int i, ret, highest_node; + int i, ret, highest_node, lowest_node; int membership_change = 0, own_slot_ok = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -994,7 +1095,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * If a node is not configured but is in the livemap, we still need * to read the slot so as to be able to remove it from the livemap. */ - o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); + o2hb_fill_node_map(live_node_bitmap, O2NM_MAX_NODES); i = -1; while ((i = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { @@ -1002,7 +1103,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); - if (highest_node >= O2NM_MAX_NODES) { + lowest_node = o2hb_lowest_node(configured_nodes, O2NM_MAX_NODES); + if (highest_node >= O2NM_MAX_NODES || lowest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "o2hb: No configured nodes found!\n"); ret = -EINVAL; goto bail; @@ -1012,7 +1114,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * yet. Of course, if the node definitions have holes in them * then we're reading an empty slot anyway... Consider this * best-effort. */ - ret = o2hb_read_slots(reg, highest_node + 1); + ret = o2hb_read_slots(reg, lowest_node, highest_node + 1); if (ret < 0) { mlog_errno(ret); goto bail; @@ -1043,13 +1145,13 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * before we can go to steady state. This ensures that * people we find in our steady state have seen us. */ - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); if (write_wc.wc_error) { /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to * disk */ - mlog(ML_ERROR, "Write error %d on device \"%s\"\n", - write_wc.wc_error, reg->hr_dev_name); + mlog(ML_ERROR, "Write error %d on device \"%pg\"\n", + write_wc.wc_error, reg_bdev(reg)); ret = write_wc.wc_error; goto bail; } @@ -1057,7 +1159,8 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) /* Skip disarming the timeout if own slot has stale/bad data */ if (own_slot_ok) { o2hb_set_quorum_device(reg); - o2hb_arm_write_timeout(reg); + o2hb_arm_timeout(reg); + reg->hr_last_timeout_start = jiffies; } bail: @@ -1072,9 +1175,9 @@ bail: if (atomic_read(®->hr_steady_iterations) != 0) { if (atomic_dec_and_test(®->hr_unsteady_iterations)) { printk(KERN_NOTICE "o2hb: Unable to stabilize " - "heartbeart on region %s (%s)\n", + "heartbeat on region %s (%pg)\n", config_item_name(®->hr_item), - reg->hr_dev_name); + reg_bdev(reg)); atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; wake_up(&o2hb_steady_queue); @@ -1085,37 +1188,6 @@ bail: return ret; } -/* Subtract b from a, storing the result in a. a *must* have a larger - * value than b. */ -static void o2hb_tv_subtract(struct timeval *a, - struct timeval *b) -{ - /* just return 0 when a is after b */ - if (a->tv_sec < b->tv_sec || - (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) { - a->tv_sec = 0; - a->tv_usec = 0; - return; - } - - a->tv_sec -= b->tv_sec; - a->tv_usec -= b->tv_usec; - while ( a->tv_usec < 0 ) { - a->tv_sec--; - a->tv_usec += 1000000; - } -} - -static unsigned int o2hb_elapsed_msecs(struct timeval *start, - struct timeval *end) -{ - struct timeval res = *end; - - o2hb_tv_subtract(&res, start); - - return res.tv_sec * 1000 + res.tv_usec / 1000; -} - /* * we ride the region ref that the region dir holds. before the region * dir is removed and drops it ref it will wait to tear down this @@ -1126,15 +1198,21 @@ static int o2hb_thread(void *data) int i, ret; struct o2hb_region *reg = data; struct o2hb_bio_wait_ctxt write_wc; - struct timeval before_hb, after_hb; + ktime_t before_hb, after_hb; unsigned int elapsed_msec; mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n"); - set_user_nice(current, -20); + set_user_nice(current, MIN_NICE); /* Pin node */ - o2nm_depend_this_node(); + ret = o2nm_depend_this_node(); + if (ret) { + mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret); + reg->hr_node_deleted = 1; + wake_up(&o2hb_steady_queue); + return 0; + } while (!kthread_should_stop() && !reg->hr_unclean_stop && !reg->hr_aborted_start) { @@ -1143,18 +1221,19 @@ static int o2hb_thread(void *data) * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. */ - do_gettimeofday(&before_hb); + before_hb = ktime_get_real(); ret = o2hb_do_disk_heartbeat(reg); + reg->hr_last_hb_status = ret; + + after_hb = ktime_get_real(); - do_gettimeofday(&after_hb); - elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); + elapsed_msec = (unsigned int) + ktime_ms_delta(after_hb, before_hb); mlog(ML_HEARTBEAT, - "start = %lu.%lu, end = %lu.%lu, msec = %u\n", - before_hb.tv_sec, (unsigned long) before_hb.tv_usec, - after_hb.tv_sec, (unsigned long) after_hb.tv_usec, - elapsed_msec); + "start = %lld, end = %lld, msec = %u, ret = %d\n", + before_hb, after_hb, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { @@ -1164,7 +1243,7 @@ static int o2hb_thread(void *data) } } - o2hb_disarm_write_timeout(reg); + o2hb_disarm_timeout(reg); /* unclean stop is only used in very bad situation */ for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++) @@ -1179,7 +1258,7 @@ static int o2hb_thread(void *data) o2hb_prepare_block(reg, 0); ret = o2hb_issue_node_write(reg, &write_wc); if (ret == 0) - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); else mlog_errno(ret); } @@ -1229,7 +1308,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) case O2HB_DB_TYPE_REGION_NUMBER: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%d\n", + out += scnprintf(buf + out, PAGE_SIZE - out, "%d\n", reg->hr_region_num); goto done; @@ -1239,12 +1318,12 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) /* If 0, it has never been set before */ if (lts) lts = jiffies_to_msecs(jiffies - lts); - out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); + out += scnprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts); goto done; case O2HB_DB_TYPE_REGION_PINNED: reg = (struct o2hb_region *)db->db_data; - out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", + out += scnprintf(buf + out, PAGE_SIZE - out, "%u\n", !!reg->hr_item_pinned); goto done; @@ -1253,8 +1332,8 @@ static int o2hb_debug_open(struct inode *inode, struct file *file) } while ((i = find_next_bit(map, db->db_len, i + 1)) < db->db_len) - out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); - out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += scnprintf(buf + out, PAGE_SIZE - out, "\n"); done: i_size_write(inode, out); @@ -1303,107 +1382,60 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { + debugfs_remove_recursive(o2hb_debug_dir); kfree(o2hb_db_livenodes); kfree(o2hb_db_liveregions); kfree(o2hb_db_quorumregions); kfree(o2hb_db_failedregions); - debugfs_remove(o2hb_debug_failedregions); - debugfs_remove(o2hb_debug_quorumregions); - debugfs_remove(o2hb_debug_liveregions); - debugfs_remove(o2hb_debug_livenodes); - debugfs_remove(o2hb_debug_dir); } -static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, - struct o2hb_debug_buf **db, int db_len, - int type, int size, int len, void *data) +static void o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, int type, + int size, int len, void *data) { *db = kmalloc(db_len, GFP_KERNEL); if (!*db) - return NULL; + return; (*db)->db_type = type; (*db)->db_size = size; (*db)->db_len = len; (*db)->db_data = data; - return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, - &o2hb_debug_fops); + debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops); } -static int o2hb_debug_init(void) +static void o2hb_debug_init(void) { - int ret = -ENOMEM; - o2hb_debug_dir = debugfs_create_dir(O2HB_DEBUG_DIR, NULL); - if (!o2hb_debug_dir) { - mlog_errno(ret); - goto bail; - } - - o2hb_debug_livenodes = o2hb_debug_create(O2HB_DEBUG_LIVENODES, - o2hb_debug_dir, - &o2hb_db_livenodes, - sizeof(*o2hb_db_livenodes), - O2HB_DB_TYPE_LIVENODES, - sizeof(o2hb_live_node_bitmap), - O2NM_MAX_NODES, - o2hb_live_node_bitmap); - if (!o2hb_debug_livenodes) { - mlog_errno(ret); - goto bail; - } - o2hb_debug_liveregions = o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, - o2hb_debug_dir, - &o2hb_db_liveregions, - sizeof(*o2hb_db_liveregions), - O2HB_DB_TYPE_LIVEREGIONS, - sizeof(o2hb_live_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_live_region_bitmap); - if (!o2hb_debug_liveregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, o2hb_debug_dir, + &o2hb_db_livenodes, sizeof(*o2hb_db_livenodes), + O2HB_DB_TYPE_LIVENODES, sizeof(o2hb_live_node_bitmap), + O2NM_MAX_NODES, o2hb_live_node_bitmap); - o2hb_debug_quorumregions = - o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, - o2hb_debug_dir, - &o2hb_db_quorumregions, - sizeof(*o2hb_db_quorumregions), - O2HB_DB_TYPE_QUORUMREGIONS, - sizeof(o2hb_quorum_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_quorum_region_bitmap); - if (!o2hb_debug_quorumregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVEREGIONS, o2hb_debug_dir, + &o2hb_db_liveregions, sizeof(*o2hb_db_liveregions), + O2HB_DB_TYPE_LIVEREGIONS, + sizeof(o2hb_live_region_bitmap), O2NM_MAX_REGIONS, + o2hb_live_region_bitmap); - o2hb_debug_failedregions = - o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, - o2hb_debug_dir, - &o2hb_db_failedregions, - sizeof(*o2hb_db_failedregions), - O2HB_DB_TYPE_FAILEDREGIONS, - sizeof(o2hb_failed_region_bitmap), - O2NM_MAX_REGIONS, - o2hb_failed_region_bitmap); - if (!o2hb_debug_failedregions) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_QUORUMREGIONS, o2hb_debug_dir, + &o2hb_db_quorumregions, + sizeof(*o2hb_db_quorumregions), + O2HB_DB_TYPE_QUORUMREGIONS, + sizeof(o2hb_quorum_region_bitmap), O2NM_MAX_REGIONS, + o2hb_quorum_region_bitmap); - ret = 0; -bail: - if (ret) - o2hb_exit(); - - return ret; + o2hb_debug_create(O2HB_DEBUG_FAILEDREGIONS, o2hb_debug_dir, + &o2hb_db_failedregions, + sizeof(*o2hb_db_failedregions), + O2HB_DB_TYPE_FAILEDREGIONS, + sizeof(o2hb_failed_region_bitmap), O2NM_MAX_REGIONS, + o2hb_failed_region_bitmap); } -int o2hb_init(void) +void o2hb_init(void) { int i; @@ -1413,38 +1445,34 @@ int o2hb_init(void) for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++) INIT_LIST_HEAD(&o2hb_live_slots[i]); - INIT_LIST_HEAD(&o2hb_node_events); - - memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap)); - memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); - memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); - memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); - memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); + bitmap_zero(o2hb_live_node_bitmap, O2NM_MAX_NODES); + bitmap_zero(o2hb_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_live_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS); + bitmap_zero(o2hb_failed_region_bitmap, O2NM_MAX_REGIONS); o2hb_dependent_users = 0; - return o2hb_debug_init(); + o2hb_debug_init(); } /* if we're already in a callback then we're already serialized by the sem */ static void o2hb_fill_node_map_from_callback(unsigned long *map, - unsigned bytes) + unsigned int bits) { - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memcpy(map, &o2hb_live_node_bitmap, bytes); + bitmap_copy(map, o2hb_live_node_bitmap, bits); } /* * get a map of all nodes that are heartbeating in any regions */ -void o2hb_fill_node_map(unsigned long *map, unsigned bytes) +void o2hb_fill_node_map(unsigned long *map, unsigned int bits) { /* callers want to serialize this map and callbacks so that they * can trust that they don't miss nodes coming to the party */ down_read(&o2hb_callback_sem); spin_lock(&o2hb_live_lock); - o2hb_fill_node_map_from_callback(map, bytes); + o2hb_fill_node_map_from_callback(map, bits); spin_unlock(&o2hb_live_lock); up_read(&o2hb_callback_sem); } @@ -1469,7 +1497,7 @@ static void o2hb_region_release(struct config_item *item) struct page *page; struct o2hb_region *reg = to_o2hb_region(item); - mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); + mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg)); kfree(reg->hr_tmp_block); @@ -1482,38 +1510,37 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slot_data); } - if (reg->hr_bdev) - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); + if (reg->hr_bdev_file) + fput(reg->hr_bdev_file); kfree(reg->hr_slots); - kfree(reg->hr_db_regnum); + debugfs_remove_recursive(reg->hr_debug_dir); kfree(reg->hr_db_livenodes); - debugfs_remove(reg->hr_debug_livenodes); - debugfs_remove(reg->hr_debug_regnum); - debugfs_remove(reg->hr_debug_elapsed_time); - debugfs_remove(reg->hr_debug_pinned); - debugfs_remove(reg->hr_debug_dir); + kfree(reg->hr_db_regnum); + kfree(reg->hr_db_elapsed_time); + kfree(reg->hr_db_pinned); spin_lock(&o2hb_live_lock); list_del(®->hr_all_item); spin_unlock(&o2hb_live_lock); + o2net_unregister_handler_list(®->hr_handler_list); kfree(reg); } static int o2hb_read_block_input(struct o2hb_region *reg, const char *page, - size_t count, unsigned long *ret_bytes, unsigned int *ret_bits) { unsigned long bytes; char *p = (char *)page; + int ret; - bytes = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &bytes); + if (ret) + return ret; /* Heartbeat and fs min / max block sizes are the same. */ if (bytes > 4096 || bytes < 512) @@ -1529,25 +1556,26 @@ static int o2hb_read_block_input(struct o2hb_region *reg, return 0; } -static ssize_t o2hb_region_block_bytes_read(struct o2hb_region *reg, +static ssize_t o2hb_region_block_bytes_show(struct config_item *item, char *page) { - return sprintf(page, "%u\n", reg->hr_block_bytes); + return sprintf(page, "%u\n", to_o2hb_region(item)->hr_block_bytes); } -static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, +static ssize_t o2hb_region_block_bytes_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); int status; unsigned long block_bytes; unsigned int block_bits; - if (reg->hr_bdev) + if (reg->hr_bdev_file) return -EINVAL; - status = o2hb_read_block_input(reg, page, count, - &block_bytes, &block_bits); + status = o2hb_read_block_input(reg, page, &block_bytes, + &block_bits); if (status) return status; @@ -1557,24 +1585,26 @@ static ssize_t o2hb_region_block_bytes_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_start_block_read(struct o2hb_region *reg, +static ssize_t o2hb_region_start_block_show(struct config_item *item, char *page) { - return sprintf(page, "%llu\n", reg->hr_start_block); + return sprintf(page, "%llu\n", to_o2hb_region(item)->hr_start_block); } -static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, +static ssize_t o2hb_region_start_block_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); unsigned long long tmp; char *p = (char *)page; + ssize_t ret; - if (reg->hr_bdev) + if (reg->hr_bdev_file) return -EINVAL; - tmp = simple_strtoull(p, &p, 0); - if (!p || (*p && (*p != '\n'))) + ret = kstrtoull(p, 0, &tmp); + if (ret) return -EINVAL; reg->hr_start_block = tmp; @@ -1582,25 +1612,26 @@ static ssize_t o2hb_region_start_block_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_blocks_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_blocks_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", reg->hr_blocks); + return sprintf(page, "%d\n", to_o2hb_region(item)->hr_blocks); } -static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, +static ssize_t o2hb_region_blocks_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); unsigned long tmp; char *p = (char *)page; + int ret; - if (reg->hr_bdev) + if (reg->hr_bdev_file) return -EINVAL; - tmp = simple_strtoul(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 0, &tmp); + if (ret) + return ret; if (tmp > O2NM_MAX_NODES || tmp == 0) return -ERANGE; @@ -1610,20 +1641,19 @@ static ssize_t o2hb_region_blocks_write(struct o2hb_region *reg, return count; } -static ssize_t o2hb_region_dev_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (reg->hr_bdev) - ret = sprintf(page, "%s\n", reg->hr_dev_name); + if (to_o2hb_region(item)->hr_bdev_file) + ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item))); return ret; } static void o2hb_init_region_params(struct o2hb_region *reg) { - reg->hr_slots_per_page = PAGE_CACHE_SIZE >> reg->hr_block_bits; + reg->hr_slots_per_page = PAGE_SIZE >> reg->hr_block_bits; reg->hr_timeout_ms = O2HB_REGION_TIMEOUT_MS; mlog(ML_HEARTBEAT, "hr_start_block = %llu, hr_blocks = %u\n", @@ -1644,17 +1674,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; reg->hr_tmp_block = kmalloc(reg->hr_block_bytes, GFP_KERNEL); - if (reg->hr_tmp_block == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_tmp_block == NULL) return -ENOMEM; - } reg->hr_slots = kcalloc(reg->hr_blocks, sizeof(struct o2hb_disk_slot), GFP_KERNEL); - if (reg->hr_slots == NULL) { - mlog_errno(-ENOMEM); + if (reg->hr_slots == NULL) return -ENOMEM; - } for(i = 0; i < reg->hr_blocks; i++) { slot = ®->hr_slots[i]; @@ -1670,17 +1696,13 @@ static int o2hb_map_slot_data(struct o2hb_region *reg) reg->hr_slot_data = kcalloc(reg->hr_num_pages, sizeof(struct page *), GFP_KERNEL); - if (!reg->hr_slot_data) { - mlog_errno(-ENOMEM); + if (!reg->hr_slot_data) return -ENOMEM; - } for(i = 0; i < reg->hr_num_pages; i++) { page = alloc_page(GFP_KERNEL); - if (!page) { - mlog_errno(-ENOMEM); + if (!page) return -ENOMEM; - } reg->hr_slot_data[i] = page; @@ -1711,11 +1733,9 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg) struct o2hb_disk_slot *slot; struct o2hb_disk_heartbeat_block *hb_block; - ret = o2hb_read_slots(reg, reg->hr_blocks); - if (ret) { - mlog_errno(ret); + ret = o2hb_read_slots(reg, 0, reg->hr_blocks); + if (ret) goto out; - } /* We only want to get an idea of the values initially in each * slot, so we do no verification - o2hb_check_slot will @@ -1735,61 +1755,57 @@ out: return ret; } -/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ -static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, +/* + * this is acting as commit; we set up all of hr_bdev_file and hr_task or + * nothing + */ +static ssize_t o2hb_region_dev_store(struct config_item *item, const char *page, size_t count) { + struct o2hb_region *reg = to_o2hb_region(item); struct task_struct *hb_task; long fd; int sectsize; char *p = (char *)page; - struct fd f; - struct inode *inode; ssize_t ret = -EINVAL; int live_threshold; - if (reg->hr_bdev) - goto out; + if (reg->hr_bdev_file) + return -EINVAL; /* We can't heartbeat without having had our node number * configured yet. */ if (o2nm_this_node() == O2NM_MAX_NODES) - goto out; + return -EINVAL; - fd = simple_strtol(p, &p, 0); - if (!p || (*p && (*p != '\n'))) - goto out; + ret = kstrtol(p, 0, &fd); + if (ret < 0) + return -EINVAL; if (fd < 0 || fd >= INT_MAX) - goto out; + return -EINVAL; - f = fdget(fd); - if (f.file == NULL) - goto out; + CLASS(fd, f)(fd); + if (fd_empty(f)) + return -EINVAL; if (reg->hr_blocks == 0 || reg->hr_start_block == 0 || reg->hr_block_bytes == 0) - goto out2; - - inode = igrab(f.file->f_mapping->host); - if (inode == NULL) - goto out2; + return -EINVAL; - if (!S_ISBLK(inode->i_mode)) - goto out3; + if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode)) + return -EINVAL; - reg->hr_bdev = I_BDEV(f.file->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); - if (ret) { - reg->hr_bdev = NULL; - goto out3; + reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev, + BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(reg->hr_bdev_file)) { + ret = PTR_ERR(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; + return ret; } - inode = NULL; - - bdevname(reg->hr_bdev, reg->hr_dev_name); - sectsize = bdev_logical_block_size(reg->hr_bdev); + sectsize = bdev_logical_block_size(reg_bdev(reg)); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, "blocksize %u incorrect for device, expected %d", @@ -1819,6 +1835,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, } INIT_DELAYED_WORK(®->hr_write_timeout_work, o2hb_write_timeout); + INIT_DELAYED_WORK(®->hr_nego_timeout_work, o2hb_nego_timeout); /* * A node is considered live after it has beat LIVE_THRESHOLD @@ -1831,14 +1848,14 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, live_threshold = O2HB_LIVE_THRESHOLD; if (o2hb_global_heartbeat_active()) { spin_lock(&o2hb_live_lock); - if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) + if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1) live_threshold <<= 1; spin_unlock(&o2hb_live_lock); } ++live_threshold; atomic_set(®->hr_steady_iterations, live_threshold); - /* unsteady_iterations is double the steady_iterations */ - atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); + /* unsteady_iterations is triple the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold * 3)); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); @@ -1853,7 +1870,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, spin_unlock(&o2hb_live_lock); ret = wait_event_interruptible(o2hb_steady_queue, - atomic_read(®->hr_steady_iterations) == 0); + atomic_read(®->hr_steady_iterations) == 0 || + reg->hr_node_deleted); if (ret) { atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; @@ -1864,6 +1882,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out3; } + if (reg->hr_node_deleted) { + ret = -EINVAL; + goto out3; + } + /* Ok, we were woken. Make sure it wasn't by drop_item() */ spin_lock(&o2hb_live_lock); hb_task = reg->hr_task; @@ -1877,26 +1900,20 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, ret = -EIO; if (hb_task && o2hb_global_heartbeat_active()) - printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n", - config_item_name(®->hr_item), reg->hr_dev_name); + printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n", + config_item_name(®->hr_item), reg_bdev(reg)); out3: - iput(inode); -out2: - fdput(f); -out: if (ret < 0) { - if (reg->hr_bdev) { - blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); - reg->hr_bdev = NULL; - } + fput(reg->hr_bdev_file); + reg->hr_bdev_file = NULL; } return ret; } -static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, - char *page) +static ssize_t o2hb_region_pid_show(struct config_item *item, char *page) { + struct o2hb_region *reg = to_o2hb_region(item); pid_t pid = 0; spin_lock(&o2hb_live_lock); @@ -1910,95 +1927,26 @@ static ssize_t o2hb_region_pid_read(struct o2hb_region *reg, return sprintf(page, "%u\n", pid); } -struct o2hb_region_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2hb_region *, char *); - ssize_t (*store)(struct o2hb_region *, const char *, size_t); -}; - -static struct o2hb_region_attribute o2hb_region_attr_block_bytes = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "block_bytes", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_block_bytes_read, - .store = o2hb_region_block_bytes_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_start_block = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "start_block", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_start_block_read, - .store = o2hb_region_start_block_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_blocks = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "blocks", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_blocks_read, - .store = o2hb_region_blocks_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_dev = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dev", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_region_dev_read, - .store = o2hb_region_dev_write, -}; - -static struct o2hb_region_attribute o2hb_region_attr_pid = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "pid", - .ca_mode = S_IRUGO | S_IRUSR }, - .show = o2hb_region_pid_read, -}; +CONFIGFS_ATTR(o2hb_region_, block_bytes); +CONFIGFS_ATTR(o2hb_region_, start_block); +CONFIGFS_ATTR(o2hb_region_, blocks); +CONFIGFS_ATTR(o2hb_region_, dev); +CONFIGFS_ATTR_RO(o2hb_region_, pid); static struct configfs_attribute *o2hb_region_attrs[] = { - &o2hb_region_attr_block_bytes.attr, - &o2hb_region_attr_start_block.attr, - &o2hb_region_attr_blocks.attr, - &o2hb_region_attr_dev.attr, - &o2hb_region_attr_pid.attr, + &o2hb_region_attr_block_bytes, + &o2hb_region_attr_start_block, + &o2hb_region_attr_blocks, + &o2hb_region_attr_dev, + &o2hb_region_attr_pid, NULL, }; -static ssize_t o2hb_region_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2hb_region *reg = to_o2hb_region(item); - struct o2hb_region_attribute *o2hb_region_attr = - container_of(attr, struct o2hb_region_attribute, attr); - ssize_t ret = 0; - - if (o2hb_region_attr->show) - ret = o2hb_region_attr->show(reg, page); - return ret; -} - -static ssize_t o2hb_region_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2hb_region *reg = to_o2hb_region(item); - struct o2hb_region_attribute *o2hb_region_attr = - container_of(attr, struct o2hb_region_attribute, attr); - ssize_t ret = -EINVAL; - - if (o2hb_region_attr->store) - ret = o2hb_region_attr->store(reg, page, count); - return ret; -} - static struct configfs_item_operations o2hb_region_item_ops = { .release = o2hb_region_release, - .show_attribute = o2hb_region_show, - .store_attribute = o2hb_region_store, }; -static struct config_item_type o2hb_region_type = { +static const struct config_item_type o2hb_region_type = { .ct_item_ops = &o2hb_region_item_ops, .ct_attrs = o2hb_region_attrs, .ct_owner = THIS_MODULE, @@ -2018,69 +1966,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } -static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +static void o2hb_debug_region_init(struct o2hb_region *reg, + struct dentry *parent) { - int ret = -ENOMEM; + struct dentry *dir; - reg->hr_debug_dir = - debugfs_create_dir(config_item_name(®->hr_item), dir); - if (!reg->hr_debug_dir) { - mlog_errno(ret); - goto bail; - } + dir = debugfs_create_dir(config_item_name(®->hr_item), parent); + reg->hr_debug_dir = dir; - reg->hr_debug_livenodes = - o2hb_debug_create(O2HB_DEBUG_LIVENODES, - reg->hr_debug_dir, - &(reg->hr_db_livenodes), - sizeof(*(reg->hr_db_livenodes)), - O2HB_DB_TYPE_REGION_LIVENODES, - sizeof(reg->hr_live_node_bitmap), - O2NM_MAX_NODES, reg); - if (!reg->hr_debug_livenodes) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, + reg); - reg->hr_debug_regnum = - o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, - reg->hr_debug_dir, - &(reg->hr_db_regnum), - sizeof(*(reg->hr_db_regnum)), - O2HB_DB_TYPE_REGION_NUMBER, - 0, O2NM_MAX_NODES, reg); - if (!reg->hr_debug_regnum) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); - reg->hr_debug_elapsed_time = - o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, - reg->hr_debug_dir, - &(reg->hr_db_elapsed_time), - sizeof(*(reg->hr_db_elapsed_time)), - O2HB_DB_TYPE_REGION_ELAPSED_TIME, - 0, 0, reg); - if (!reg->hr_debug_elapsed_time) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); - reg->hr_debug_pinned = - o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, - reg->hr_debug_dir, - &(reg->hr_db_pinned), - sizeof(*(reg->hr_db_pinned)), - O2HB_DB_TYPE_REGION_PINNED, - 0, 0, reg); - if (!reg->hr_debug_pinned) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); - ret = 0; -bail: - return ret; } static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, @@ -2115,13 +2027,39 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g config_item_init_type_name(®->hr_item, name, &o2hb_region_type); - ret = o2hb_debug_region_init(reg, o2hb_debug_dir); - if (ret) { - config_item_put(®->hr_item); - goto free; - } + /* this is the same way to generate msg key as dlm, for local heartbeat, + * name is also the same, so make initial crc value different to avoid + * message key conflict. + */ + reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS, + name, strlen(name)); + INIT_LIST_HEAD(®->hr_handler_list); + ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_timeout_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto remove_item; + + ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key, + sizeof(struct o2hb_nego_msg), + o2hb_nego_approve_handler, + reg, NULL, ®->hr_handler_list); + if (ret) + goto unregister_handler; + + o2hb_debug_region_init(reg, o2hb_debug_dir); return ®->hr_item; + +unregister_handler: + o2net_unregister_handler_list(®->hr_handler_list); +remove_item: + spin_lock(&o2hb_live_lock); + list_del(®->hr_all_item); + if (o2hb_global_heartbeat_active()) + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + spin_unlock(&o2hb_live_lock); free: kfree(reg); return ERR_PTR(ret); @@ -2152,10 +2090,10 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, quorum_region = 1; clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); spin_unlock(&o2hb_live_lock); - printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n", + printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n", ((atomic_read(®->hr_steady_iterations) == 0) ? "stopped" : "start aborted"), config_item_name(item), - reg->hr_dev_name); + reg_bdev(reg)); } /* @@ -2182,7 +2120,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, if (!o2hb_dependent_users) goto unlock; - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + if (bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) o2hb_region_pin(NULL); @@ -2190,56 +2128,22 @@ unlock: spin_unlock(&o2hb_live_lock); } -struct o2hb_heartbeat_group_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2hb_heartbeat_group *, char *); - ssize_t (*store)(struct o2hb_heartbeat_group *, const char *, size_t); -}; - -static ssize_t o2hb_heartbeat_group_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); - struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = - container_of(attr, struct o2hb_heartbeat_group_attribute, attr); - ssize_t ret = 0; - - if (o2hb_heartbeat_group_attr->show) - ret = o2hb_heartbeat_group_attr->show(reg, page); - return ret; -} - -static ssize_t o2hb_heartbeat_group_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2hb_heartbeat_group *reg = to_o2hb_heartbeat_group(to_config_group(item)); - struct o2hb_heartbeat_group_attribute *o2hb_heartbeat_group_attr = - container_of(attr, struct o2hb_heartbeat_group_attribute, attr); - ssize_t ret = -EINVAL; - - if (o2hb_heartbeat_group_attr->store) - ret = o2hb_heartbeat_group_attr->store(reg, page, count); - return ret; -} - -static ssize_t o2hb_heartbeat_group_threshold_show(struct o2hb_heartbeat_group *group, - char *page) +static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, + char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group *group, - const char *page, - size_t count) +static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, + const char *page, size_t count) { unsigned long tmp; char *p = (char *)page; + int ret; - tmp = simple_strtoul(p, &p, 10); - if (!p || (*p && (*p != '\n'))) - return -EINVAL; + ret = kstrtoul(p, 10, &tmp); + if (ret) + return ret; /* this will validate ranges for us. */ o2hb_dead_threshold_set((unsigned int) tmp); @@ -2247,17 +2151,15 @@ static ssize_t o2hb_heartbeat_group_threshold_store(struct o2hb_heartbeat_group return count; } -static -ssize_t o2hb_heartbeat_group_mode_show(struct o2hb_heartbeat_group *group, - char *page) +static ssize_t o2hb_heartbeat_group_mode_show(struct config_item *item, + char *page) { return sprintf(page, "%s\n", o2hb_heartbeat_mode_desc[o2hb_heartbeat_mode]); } -static -ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, - const char *page, size_t count) +static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, + const char *page, size_t count) { unsigned int i; int ret; @@ -2268,7 +2170,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, return -EINVAL; for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { - if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) + if (strncasecmp(page, o2hb_heartbeat_mode_desc[i], len)) continue; ret = o2hb_global_heartbeat_mode_set(i); @@ -2282,41 +2184,22 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, } -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_threshold = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "dead_threshold", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_heartbeat_group_threshold_show, - .store = o2hb_heartbeat_group_threshold_store, -}; - -static struct o2hb_heartbeat_group_attribute o2hb_heartbeat_group_attr_mode = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "mode", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2hb_heartbeat_group_mode_show, - .store = o2hb_heartbeat_group_mode_store, -}; +CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold.attr, - &o2hb_heartbeat_group_attr_mode.attr, + &o2hb_heartbeat_group_attr_dead_threshold, + &o2hb_heartbeat_group_attr_mode, NULL, }; -static struct configfs_item_operations o2hb_heartbeat_group_item_ops = { - .show_attribute = o2hb_heartbeat_group_show, - .store_attribute = o2hb_heartbeat_group_store, -}; - static struct configfs_group_operations o2hb_heartbeat_group_group_ops = { .make_item = o2hb_heartbeat_group_make_item, .drop_item = o2hb_heartbeat_group_drop_item, }; -static struct config_item_type o2hb_heartbeat_group_type = { +static const struct config_item_type o2hb_heartbeat_group_type = { .ct_group_ops = &o2hb_heartbeat_group_group_ops, - .ct_item_ops = &o2hb_heartbeat_group_item_ops, .ct_attrs = o2hb_heartbeat_group_attrs, .ct_owner = THIS_MODULE, }; @@ -2482,7 +2365,7 @@ static int o2hb_region_inc_user(const char *region_uuid) if (o2hb_dependent_users > 1) goto unlock; - if (o2hb_pop_count(&o2hb_quorum_region_bitmap, + if (bitmap_weight(o2hb_quorum_region_bitmap, O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF) ret = o2hb_region_pin(NULL); @@ -2491,7 +2374,7 @@ unlock: return ret; } -void o2hb_region_dec_user(const char *region_uuid) +static void o2hb_region_dec_user(const char *region_uuid) { spin_lock(&o2hb_live_lock); @@ -2516,8 +2399,7 @@ unlock: int o2hb_register_callback(const char *region_uuid, struct o2hb_callback_func *hc) { - struct o2hb_callback_func *tmp; - struct list_head *iter; + struct o2hb_callback_func *f; struct o2hb_callback *hbcall; int ret; @@ -2540,10 +2422,9 @@ int o2hb_register_callback(const char *region_uuid, down_write(&o2hb_callback_sem); - list_for_each(iter, &hbcall->list) { - tmp = list_entry(iter, struct o2hb_callback_func, hc_item); - if (hc->hc_priority < tmp->hc_priority) { - list_add_tail(&hc->hc_item, iter); + list_for_each_entry(f, &hbcall->list, hc_item) { + if (hc->hc_priority < f->hc_priority) { + list_add_tail(&hc->hc_item, &f->hc_item); break; } } @@ -2582,11 +2463,13 @@ void o2hb_unregister_callback(const char *region_uuid, } EXPORT_SYMBOL_GPL(o2hb_unregister_callback); -int o2hb_check_node_heartbeating(u8 node_num) +int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - o2hb_fill_node_map(testing_map, sizeof(testing_map)); + spin_lock(&o2hb_live_lock); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); + spin_unlock(&o2hb_live_lock); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", @@ -2596,13 +2479,13 @@ int o2hb_check_node_heartbeating(u8 node_num) return 1; } -EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem); int o2hb_check_node_heartbeating_from_callback(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + o2hb_fill_node_map_from_callback(testing_map, O2NM_MAX_NODES); if (!test_bit(node_num, testing_map)) { mlog(ML_HEARTBEAT, "node (%u) does not have heartbeating enabled.\n", @@ -2614,23 +2497,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) } EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); -/* Makes sure our local node is configured with a node number, and is - * heartbeating. */ -int o2hb_check_local_node_heartbeating(void) -{ - u8 node_num; - - /* if this node was set then we have networking */ - node_num = o2nm_this_node(); - if (node_num == O2NM_MAX_NODES) { - mlog(ML_HEARTBEAT, "this node has not been configured.\n"); - return 0; - } - - return o2hb_check_node_heartbeating(node_num); -} -EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); - /* * this is just a hack until we get the plumbing which flips file systems * read only and drops the hb ref instead of killing the node dead. diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 00ad8e8fea51..8ef8c1b9eeb7 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * heartbeat.h * * Function prototypes * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_HEARTBEAT_H @@ -76,12 +59,11 @@ int o2hb_register_callback(const char *region_uuid, void o2hb_unregister_callback(const char *region_uuid, struct o2hb_callback_func *hc); void o2hb_fill_node_map(unsigned long *map, - unsigned bytes); + unsigned int bits); void o2hb_exit(void); -int o2hb_init(void); -int o2hb_check_node_heartbeating(u8 node_num); +void o2hb_init(void); +int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); -int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); int o2hb_get_all_regions(char *region_uuids, u8 numregions); int o2hb_global_heartbeat_active(void); diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 07ac24fd9252..563881ddbf00 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -1,22 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/module.h> @@ -24,7 +8,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/string.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "masklog.h" @@ -49,13 +33,13 @@ static ssize_t mlog_mask_show(u64 mask, char *buf) static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) { - if (!strnicmp(buf, "allow", 5)) { + if (!strncasecmp(buf, "allow", 5)) { __mlog_set_u64(mask, mlog_and_bits); __mlog_clear_u64(mask, mlog_not_bits); - } else if (!strnicmp(buf, "deny", 4)) { + } else if (!strncasecmp(buf, "deny", 4)) { __mlog_set_u64(mask, mlog_not_bits); __mlog_clear_u64(mask, mlog_and_bits); - } else if (!strnicmp(buf, "off", 3)) { + } else if (!strncasecmp(buf, "off", 3)) { __mlog_clear_u64(mask, mlog_not_bits); __mlog_clear_u64(mask, mlog_and_bits); } else @@ -64,6 +48,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) return count; } +void __mlog_printk(const u64 *mask, const char *func, int line, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + const char *level; + const char *prefix = ""; + + if (!__mlog_test_u64(*mask, mlog_and_bits) || + __mlog_test_u64(*mask, mlog_not_bits)) + return; + + if (*mask & ML_ERROR) { + level = KERN_ERR; + prefix = "ERROR: "; + } else if (*mask & ML_NOTICE) { + level = KERN_NOTICE; + } else { + level = KERN_INFO; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s(%s,%u,%u):%s:%d %s%pV", + level, current->comm, task_pid_nr(current), + raw_smp_processor_id(), func, line, prefix, &vaf); + + va_end(args); +} +EXPORT_SYMBOL_GPL(__mlog_printk); + struct mlog_attribute { struct attribute attr; u64 mask; @@ -102,7 +120,8 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(KTHREAD), }; -static struct attribute *mlog_attr_ptrs[MLOG_MAX_BITS] = {NULL, }; +static struct attribute *mlog_default_attrs[MLOG_MAX_BITS] = {NULL, }; +ATTRIBUTE_GROUPS(mlog_default); static ssize_t mlog_show(struct kobject *obj, struct attribute *attr, char *buf) @@ -126,8 +145,8 @@ static const struct sysfs_ops mlog_attr_ops = { }; static struct kobj_type mlog_ktype = { - .default_attrs = mlog_attr_ptrs, - .sysfs_ops = &mlog_attr_ops, + .default_groups = mlog_default_groups, + .sysfs_ops = &mlog_attr_ops, }; static struct kset mlog_kset = { @@ -139,10 +158,10 @@ int mlog_sys_init(struct kset *o2cb_kset) int i = 0; while (mlog_attrs[i].attr.mode) { - mlog_attr_ptrs[i] = &mlog_attrs[i].attr; + mlog_default_attrs[i] = &mlog_attrs[i].attr; i++; } - mlog_attr_ptrs[i] = NULL; + mlog_default_attrs[i] = NULL; kobject_set_name(&mlog_kset.kobj, "logmask"); mlog_kset.kobj.kset = o2cb_kset; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index baa2b9ef7eef..630bd5a3dd0d 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -1,22 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef O2CLUSTER_MASKLOG_H @@ -45,7 +29,7 @@ * just calling printk() so that this can eventually make its way through * relayfs along with the debugging messages. Everything else gets KERN_DEBUG. * The inline tests and macro dance give GCC the opportunity to quite cleverly - * only emit the appropriage printk() when the caller passes in a constant + * only emit the appropriate printk() when the caller passes in a constant * mask, as is almost always the case. * * All this bitmask nonsense is managed from the files under @@ -162,46 +146,39 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; #endif +__printf(4, 5) +void __mlog_printk(const u64 *m, const char *func, int line, + const char *fmt, ...); + /* - * smp_processor_id() "helpfully" screams when called outside preemptible - * regions in current kernels. sles doesn't have the variants that don't - * scream. just do this instead of trying to guess which we're building - * against.. *sigh*. + * Testing before the __mlog_printk call lets the compiler eliminate the + * call completely when (m & ML_ALLOWED_BITS) is 0. */ -#define __mlog_cpu_guess ({ \ - unsigned long _cpu = get_cpu(); \ - put_cpu(); \ - _cpu; \ -}) +#define mlog(mask, fmt, ...) \ +do { \ + u64 _m = MLOG_MASK_PREFIX | (mask); \ + if (_m & ML_ALLOWED_BITS) \ + __mlog_printk(&_m, __func__, __LINE__, fmt, \ + ##__VA_ARGS__); \ +} while (0) -/* In the following two macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. - */ -#define __mlog_printk(level, fmt, args...) \ - printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ - task_pid_nr(current), __mlog_cpu_guess, \ - __PRETTY_FUNCTION__, __LINE__ , ##args) - -#define mlog(mask, fmt, args...) do { \ - u64 __m = MLOG_MASK_PREFIX | (mask); \ - if ((__m & ML_ALLOWED_BITS) && \ - __mlog_test_u64(__m, mlog_and_bits) && \ - !__mlog_test_u64(__m, mlog_not_bits)) { \ - if (__m & ML_ERROR) \ - __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ - else if (__m & ML_NOTICE) \ - __mlog_printk(KERN_NOTICE, fmt , ##args); \ - else __mlog_printk(KERN_INFO, fmt , ##args); \ - } \ +#define mlog_ratelimited(mask, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + mlog(mask, fmt, ##__VA_ARGS__); \ } while (0) -#define mlog_errno(st) do { \ +#define mlog_errno(st) ({ \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ - _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \ + _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ + _st != -EDQUOT) \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ -} while (0) + _st; \ +}) #define mlog_bug_on_msg(cond, fmt, args...) do { \ if (cond) { \ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 73ba81928bce..bc27301eab6d 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * netdebug.c * * debug functionality for o2net * * Copyright (C) 2005, 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifdef CONFIG_DEBUG_FS @@ -53,10 +36,6 @@ #define SHOW_SOCK_STATS 1 static struct dentry *o2net_dentry; -static struct dentry *sc_dentry; -static struct dentry *nst_dentry; -static struct dentry *stats_dentry; -static struct dentry *nodes_dentry; static DEFINE_SPINLOCK(o2net_debug_lock); @@ -65,17 +44,17 @@ static LIST_HEAD(send_tracking); void o2net_debug_add_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&nst->st_net_debug_item, &send_tracking); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_nst(struct o2net_send_tracking *nst) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); if (!list_empty(&nst->st_net_debug_item)) list_del_init(&nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } static struct o2net_send_tracking @@ -105,9 +84,9 @@ static void *nst_seq_start(struct seq_file *seq, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; } @@ -116,13 +95,13 @@ static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct o2net_send_tracking *nst, *dummy_nst = seq->private; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); list_del_init(&dummy_nst->st_net_debug_item); if (nst) list_add(&dummy_nst->st_net_debug_item, &nst->st_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return nst; /* unused, just needs to be null when done */ } @@ -133,7 +112,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) ktime_t now; s64 sock, send, status; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); nst = next_nst(dummy_nst); if (!nst) goto out; @@ -166,7 +145,7 @@ static int nst_seq_show(struct seq_file *seq, void *v) (long long)status); out: - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return 0; } @@ -185,29 +164,13 @@ static const struct seq_operations nst_seq_ops = { static int nst_fop_open(struct inode *inode, struct file *file) { struct o2net_send_tracking *dummy_nst; - struct seq_file *seq; - int ret; - - dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); - if (dummy_nst == NULL) { - ret = -ENOMEM; - goto out; - } - dummy_nst->st_task = NULL; - - ret = seq_open(file, &nst_seq_ops); - if (ret) - goto out; - seq = file->private_data; - seq->private = dummy_nst; + dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst)); + if (!dummy_nst) + return -ENOMEM; o2net_debug_add_nst(dummy_nst); - dummy_nst = NULL; - -out: - kfree(dummy_nst); - return ret; + return 0; } static int nst_fop_release(struct inode *inode, struct file *file) @@ -228,16 +191,16 @@ static const struct file_operations nst_seq_fops = { void o2net_debug_add_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_add(&sc->sc_net_debug_item, &sock_containers); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } void o2net_debug_del_sc(struct o2net_sock_container *sc) { - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); list_del_init(&sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); } struct o2net_sock_debug { @@ -273,9 +236,9 @@ static void *sc_seq_start(struct seq_file *seq, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; } @@ -285,12 +248,12 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); list_del_init(&dummy_sc->sc_net_debug_item); if (sc) list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return sc; /* unused, just needs to be null when done */ } @@ -365,7 +328,7 @@ static void sc_show_sock_container(struct seq_file *seq, " func key: 0x%08x\n" " func type: %u\n", sc, - atomic_read(&sc->sc_kref.refcount), + kref_read(&sc->sc_kref), &saddr, inet ? ntohs(sport) : 0, &daddr, inet ? ntohs(dport) : 0, sc->sc_node->nd_name, @@ -386,7 +349,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) struct o2net_sock_debug *sd = seq->private; struct o2net_sock_container *sc, *dummy_sc = sd->dbg_sock; - spin_lock(&o2net_debug_lock); + spin_lock_bh(&o2net_debug_lock); sc = next_sc(dummy_sc); if (sc) { @@ -396,7 +359,7 @@ static int sc_seq_show(struct seq_file *seq, void *v) sc_show_sock_stats(seq, sc); } - spin_unlock(&o2net_debug_lock); + spin_unlock_bh(&o2net_debug_lock); return 0; } @@ -412,33 +375,27 @@ static const struct seq_operations sc_seq_ops = { .show = sc_seq_show, }; -static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) +static int sc_common_open(struct file *file, int ctxt) { + struct o2net_sock_debug *sd; struct o2net_sock_container *dummy_sc; - struct seq_file *seq; - int ret; - dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); - if (dummy_sc == NULL) { - ret = -ENOMEM; - goto out; - } - dummy_sc->sc_page = NULL; + dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL); + if (!dummy_sc) + return -ENOMEM; - ret = seq_open(file, &sc_seq_ops); - if (ret) - goto out; + sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd)); + if (!sd) { + kfree(dummy_sc); + return -ENOMEM; + } - seq = file->private_data; - seq->private = sd; + sd->dbg_ctxt = ctxt; sd->dbg_sock = dummy_sc; - o2net_debug_add_sc(dummy_sc); - dummy_sc = NULL; + o2net_debug_add_sc(dummy_sc); -out: - kfree(dummy_sc); - return ret; + return 0; } static int sc_fop_release(struct inode *inode, struct file *file) @@ -448,21 +405,13 @@ static int sc_fop_release(struct inode *inode, struct file *file) struct o2net_sock_container *dummy_sc = sd->dbg_sock; o2net_debug_del_sc(dummy_sc); + kfree(dummy_sc); return seq_release_private(inode, file); } static int stats_fop_open(struct inode *inode, struct file *file) { - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_STATS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); + return sc_common_open(file, SHOW_SOCK_STATS); } static const struct file_operations stats_seq_fops = { @@ -474,16 +423,7 @@ static const struct file_operations stats_seq_fops = { static int sc_fop_open(struct inode *inode, struct file *file) { - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); + return sc_common_open(file, SHOW_SOCK_CONTAINERS); } static const struct file_operations sc_seq_fops = { @@ -498,11 +438,11 @@ static int o2net_fill_bitmap(char *buf, int len) unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; int i = -1, out = 0; - o2net_fill_node_map(map, sizeof(map)); + o2net_fill_node_map(map, O2NM_MAX_NODES); while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) - out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i); - out += snprintf(buf + out, PAGE_SIZE - out, "\n"); + out += scnprintf(buf + out, PAGE_SIZE - out, "%d ", i); + out += scnprintf(buf + out, PAGE_SIZE - out, "\n"); return out; } @@ -544,36 +484,23 @@ static const struct file_operations nodes_fops = { void o2net_debugfs_exit(void) { - debugfs_remove(nodes_dentry); - debugfs_remove(stats_dentry); - debugfs_remove(sc_dentry); - debugfs_remove(nst_dentry); - debugfs_remove(o2net_dentry); + debugfs_remove_recursive(o2net_dentry); } -int o2net_debugfs_init(void) +void o2net_debugfs_init(void) { umode_t mode = S_IFREG|S_IRUSR; o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); - if (o2net_dentry) - nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode, - o2net_dentry, NULL, &nst_seq_fops); - if (nst_dentry) - sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode, - o2net_dentry, NULL, &sc_seq_fops); - if (sc_dentry) - stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode, - o2net_dentry, NULL, &stats_seq_fops); - if (stats_dentry) - nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode, - o2net_dentry, NULL, &nodes_fops); - if (nodes_dentry) - return 0; - - o2net_debugfs_exit(); - mlog_errno(-ENOMEM); - return -ENOMEM; + + debugfs_create_file(NST_DEBUG_NAME, mode, o2net_dentry, NULL, + &nst_seq_fops); + debugfs_create_file(SC_DEBUG_NAME, mode, o2net_dentry, NULL, + &sc_seq_fops); + debugfs_create_file(STATS_DEBUG_NAME, mode, o2net_dentry, NULL, + &stats_seq_fops); + debugfs_create_file(NODES_DEBUG_NAME, mode, o2net_dentry, NULL, + &nodes_fops); } #endif /* CONFIG_DEBUG_FS */ diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index bb240647ca5f..2f61d39e4e50 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -1,22 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-or-later +/* * Copyright (C) 2004, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #include <linux/slab.h> @@ -29,18 +13,20 @@ #include "heartbeat.h" #include "masklog.h" #include "sys.h" -#include "ver.h" /* for now we operate under the assertion that there can be only one * cluster active at a time. Changing this will require trickling * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; -char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { - "reset", /* O2NM_FENCE_RESET */ - "panic", /* O2NM_FENCE_PANIC */ +static const char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ }; +static inline void o2nm_lock_subsystem(void); +static inline void o2nm_unlock_subsystem(void); + struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { struct o2nm_node *node = NULL; @@ -68,7 +54,7 @@ int o2nm_configured_node_map(unsigned long *map, unsigned bytes) return -EINVAL; read_lock(&cluster->cl_nodes_lock); - memcpy(map, cluster->cl_nodes_bitmap, sizeof(cluster->cl_nodes_bitmap)); + bitmap_copy(map, cluster->cl_nodes_bitmap, O2NM_MAX_NODES); read_unlock(&cluster->cl_nodes_lock); return 0; @@ -173,31 +159,35 @@ static void o2nm_node_release(struct config_item *item) kfree(node); } -static ssize_t o2nm_node_num_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_num_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", node->nd_num); + return sprintf(page, "%d\n", to_o2nm_node(item)->nd_num); } static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) { /* through the first node_set .parent * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ - return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + if (node->nd_item.ci_parent) + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + else + return NULL; } enum { O2NM_NODE_ATTR_NUM = 0, O2NM_NODE_ATTR_PORT, O2NM_NODE_ATTR_ADDRESS, - O2NM_NODE_ATTR_LOCAL, }; -static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, +static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, size_t count) { - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; + int ret = 0; tmp = simple_strtoul(p, &p, 0); if (!p || (*p && (*p != '\n'))) @@ -214,28 +204,41 @@ static ssize_t o2nm_node_num_write(struct o2nm_node *node, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + write_lock(&cluster->cl_nodes_lock); if (cluster->cl_nodes[tmp]) - p = NULL; + ret = -EEXIST; + else if (test_and_set_bit(O2NM_NODE_ATTR_NUM, + &node->nd_set_attributes)) + ret = -EBUSY; else { cluster->cl_nodes[tmp] = node; node->nd_num = tmp; set_bit(tmp, cluster->cl_nodes_bitmap); } write_unlock(&cluster->cl_nodes_lock); - if (p == NULL) - return -EEXIST; + o2nm_unlock_subsystem(); + + if (ret) + return ret; return count; } -static ssize_t o2nm_node_ipv4_port_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_ipv4_port_show(struct config_item *item, char *page) { - return sprintf(page, "%u\n", ntohs(node->nd_ipv4_port)); + return sprintf(page, "%u\n", ntohs(to_o2nm_node(item)->nd_ipv4_port)); } -static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, +static ssize_t o2nm_node_ipv4_port_store(struct config_item *item, const char *page, size_t count) { + struct o2nm_node *node = to_o2nm_node(item); unsigned long tmp; char *p = (char *)page; @@ -248,21 +251,24 @@ static ssize_t o2nm_node_ipv4_port_write(struct o2nm_node *node, if (tmp >= (u16)-1) return -ERANGE; + if (test_and_set_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) + return -EBUSY; node->nd_ipv4_port = htons(tmp); return count; } -static ssize_t o2nm_node_ipv4_address_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_ipv4_address_show(struct config_item *item, char *page) { - return sprintf(page, "%pI4\n", &node->nd_ipv4_address); + return sprintf(page, "%pI4\n", &to_o2nm_node(item)->nd_ipv4_address); } -static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, +static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, const char *page, size_t count) { - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster; int ret, i; struct rb_node **p, *parent; unsigned int octets[4]; @@ -279,15 +285,27 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); } + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + ret = 0; write_lock(&cluster->cl_nodes_lock); if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) ret = -EEXIST; + else if (test_and_set_bit(O2NM_NODE_ATTR_ADDRESS, + &node->nd_set_attributes)) + ret = -EBUSY; else { rb_link_node(&node->nd_ip_node, parent, p); rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -296,15 +314,16 @@ static ssize_t o2nm_node_ipv4_address_write(struct o2nm_node *node, return count; } -static ssize_t o2nm_node_local_read(struct o2nm_node *node, char *page) +static ssize_t o2nm_node_local_show(struct config_item *item, char *page) { - return sprintf(page, "%d\n", node->nd_local); + return sprintf(page, "%d\n", to_o2nm_node(item)->nd_local); } -static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, +static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, size_t count) { - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_node *node = to_o2nm_node(item); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; ssize_t ret; @@ -322,17 +341,26 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + ret = -EINVAL; + goto out; + } + /* the only failure case is trying to set a new local node * when a different one is already set */ if (tmp && tmp == cluster->cl_has_local && - cluster->cl_local_node != node->nd_num) - return -EBUSY; + cluster->cl_local_node != node->nd_num) { + ret = -EBUSY; + goto out; + } /* bring up the rx thread if we're setting the new local node. */ if (tmp && !cluster->cl_has_local) { ret = o2net_start_listening(node); if (ret) - return ret; + goto out; } if (!tmp && cluster->cl_has_local && @@ -347,114 +375,31 @@ static ssize_t o2nm_node_local_write(struct o2nm_node *node, const char *page, cluster->cl_local_node = node->nd_num; } - return count; -} - -struct o2nm_node_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2nm_node *, char *); - ssize_t (*store)(struct o2nm_node *, const char *, size_t); -}; + ret = count; -static struct o2nm_node_attribute o2nm_node_attr_num = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "num", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_num_read, - .store = o2nm_node_num_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_ipv4_port = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_port", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_ipv4_port_read, - .store = o2nm_node_ipv4_port_write, -}; - -static struct o2nm_node_attribute o2nm_node_attr_ipv4_address = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "ipv4_address", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_ipv4_address_read, - .store = o2nm_node_ipv4_address_write, -}; +out: + o2nm_unlock_subsystem(); + return ret; +} -static struct o2nm_node_attribute o2nm_node_attr_local = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "local", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_node_local_read, - .store = o2nm_node_local_write, -}; +CONFIGFS_ATTR(o2nm_node_, num); +CONFIGFS_ATTR(o2nm_node_, ipv4_port); +CONFIGFS_ATTR(o2nm_node_, ipv4_address); +CONFIGFS_ATTR(o2nm_node_, local); static struct configfs_attribute *o2nm_node_attrs[] = { - [O2NM_NODE_ATTR_NUM] = &o2nm_node_attr_num.attr, - [O2NM_NODE_ATTR_PORT] = &o2nm_node_attr_ipv4_port.attr, - [O2NM_NODE_ATTR_ADDRESS] = &o2nm_node_attr_ipv4_address.attr, - [O2NM_NODE_ATTR_LOCAL] = &o2nm_node_attr_local.attr, + &o2nm_node_attr_num, + &o2nm_node_attr_ipv4_port, + &o2nm_node_attr_ipv4_address, + &o2nm_node_attr_local, NULL, }; -static int o2nm_attr_index(struct configfs_attribute *attr) -{ - int i; - for (i = 0; i < ARRAY_SIZE(o2nm_node_attrs); i++) { - if (attr == o2nm_node_attrs[i]) - return i; - } - BUG(); - return 0; -} - -static ssize_t o2nm_node_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_node_attribute *o2nm_node_attr = - container_of(attr, struct o2nm_node_attribute, attr); - ssize_t ret = 0; - - if (o2nm_node_attr->show) - ret = o2nm_node_attr->show(node, page); - return ret; -} - -static ssize_t o2nm_node_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_node_attribute *o2nm_node_attr = - container_of(attr, struct o2nm_node_attribute, attr); - ssize_t ret; - int attr_index = o2nm_attr_index(attr); - - if (o2nm_node_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - if (test_bit(attr_index, &node->nd_set_attributes)) - return -EBUSY; - - ret = o2nm_node_attr->store(node, page, count); - if (ret < count) - goto out; - - set_bit(attr_index, &node->nd_set_attributes); -out: - return ret; -} - static struct configfs_item_operations o2nm_node_item_ops = { .release = o2nm_node_release, - .show_attribute = o2nm_node_show, - .store_attribute = o2nm_node_store, }; -static struct config_item_type o2nm_node_type = { +static const struct config_item_type o2nm_node_type = { .ct_item_ops = &o2nm_node_item_ops, .ct_attrs = o2nm_node_attrs, .ct_owner = THIS_MODULE, @@ -476,12 +421,6 @@ static struct o2nm_node_group *to_o2nm_node_group(struct config_group *group) } #endif -struct o2nm_cluster_attribute { - struct configfs_attribute attr; - ssize_t (*show)(struct o2nm_cluster *, char *); - ssize_t (*store)(struct o2nm_cluster *, const char *, size_t); -}; - static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, unsigned int *val) { @@ -502,15 +441,16 @@ static ssize_t o2nm_cluster_attr_write(const char *page, ssize_t count, return count; } -static ssize_t o2nm_cluster_attr_idle_timeout_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_idle_timeout_ms_show(struct config_item *item, + char *page) { - return sprintf(page, "%u\n", cluster->cl_idle_timeout_ms); + return sprintf(page, "%u\n", to_o2nm_cluster(item)->cl_idle_timeout_ms); } -static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_idle_timeout_ms_store(struct config_item *item, + const char *page, size_t count) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret; unsigned int val; @@ -537,15 +477,17 @@ static ssize_t o2nm_cluster_attr_idle_timeout_ms_write( return ret; } -static ssize_t o2nm_cluster_attr_keepalive_delay_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_keepalive_delay_ms_show( + struct config_item *item, char *page) { - return sprintf(page, "%u\n", cluster->cl_keepalive_delay_ms); + return sprintf(page, "%u\n", + to_o2nm_cluster(item)->cl_keepalive_delay_ms); } -static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_keepalive_delay_ms_store( + struct config_item *item, const char *page, size_t count) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret; unsigned int val; @@ -572,22 +514,24 @@ static ssize_t o2nm_cluster_attr_keepalive_delay_ms_write( return ret; } -static ssize_t o2nm_cluster_attr_reconnect_delay_ms_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_reconnect_delay_ms_show( + struct config_item *item, char *page) { - return sprintf(page, "%u\n", cluster->cl_reconnect_delay_ms); + return sprintf(page, "%u\n", + to_o2nm_cluster(item)->cl_reconnect_delay_ms); } -static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_reconnect_delay_ms_store( + struct config_item *item, const char *page, size_t count) { return o2nm_cluster_attr_write(page, count, - &cluster->cl_reconnect_delay_ms); + &to_o2nm_cluster(item)->cl_reconnect_delay_ms); } -static ssize_t o2nm_cluster_attr_fence_method_read( - struct o2nm_cluster *cluster, char *page) +static ssize_t o2nm_cluster_fence_method_show( + struct config_item *item, char *page) { + struct o2nm_cluster *cluster = to_o2nm_cluster(item); ssize_t ret = 0; if (cluster) @@ -596,8 +540,8 @@ static ssize_t o2nm_cluster_attr_fence_method_read( return ret; } -static ssize_t o2nm_cluster_attr_fence_method_write( - struct o2nm_cluster *cluster, const char *page, size_t count) +static ssize_t o2nm_cluster_fence_method_store( + struct config_item *item, const char *page, size_t count) { unsigned int i; @@ -609,10 +553,10 @@ static ssize_t o2nm_cluster_attr_fence_method_write( continue; if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) continue; - if (cluster->cl_fence_method != i) { + if (to_o2nm_cluster(item)->cl_fence_method != i) { printk(KERN_INFO "ocfs2: Changing fence method to %s\n", o2nm_fence_method_desc[i]); - cluster->cl_fence_method = i; + to_o2nm_cluster(item)->cl_fence_method = i; } return count; } @@ -621,79 +565,18 @@ bail: return -EINVAL; } -static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "idle_timeout_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_idle_timeout_ms_read, - .store = o2nm_cluster_attr_idle_timeout_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_keepalive_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "keepalive_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_keepalive_delay_ms_read, - .store = o2nm_cluster_attr_keepalive_delay_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "reconnect_delay_ms", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_reconnect_delay_ms_read, - .store = o2nm_cluster_attr_reconnect_delay_ms_write, -}; - -static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { - .attr = { .ca_owner = THIS_MODULE, - .ca_name = "fence_method", - .ca_mode = S_IRUGO | S_IWUSR }, - .show = o2nm_cluster_attr_fence_method_read, - .store = o2nm_cluster_attr_fence_method_write, -}; +CONFIGFS_ATTR(o2nm_cluster_, idle_timeout_ms); +CONFIGFS_ATTR(o2nm_cluster_, keepalive_delay_ms); +CONFIGFS_ATTR(o2nm_cluster_, reconnect_delay_ms); +CONFIGFS_ATTR(o2nm_cluster_, fence_method); static struct configfs_attribute *o2nm_cluster_attrs[] = { - &o2nm_cluster_attr_idle_timeout_ms.attr, - &o2nm_cluster_attr_keepalive_delay_ms.attr, - &o2nm_cluster_attr_reconnect_delay_ms.attr, - &o2nm_cluster_attr_fence_method.attr, + &o2nm_cluster_attr_idle_timeout_ms, + &o2nm_cluster_attr_keepalive_delay_ms, + &o2nm_cluster_attr_reconnect_delay_ms, + &o2nm_cluster_attr_fence_method, NULL, }; -static ssize_t o2nm_cluster_show(struct config_item *item, - struct configfs_attribute *attr, - char *page) -{ - struct o2nm_cluster *cluster = to_o2nm_cluster(item); - struct o2nm_cluster_attribute *o2nm_cluster_attr = - container_of(attr, struct o2nm_cluster_attribute, attr); - ssize_t ret = 0; - - if (o2nm_cluster_attr->show) - ret = o2nm_cluster_attr->show(cluster, page); - return ret; -} - -static ssize_t o2nm_cluster_store(struct config_item *item, - struct configfs_attribute *attr, - const char *page, size_t count) -{ - struct o2nm_cluster *cluster = to_o2nm_cluster(item); - struct o2nm_cluster_attribute *o2nm_cluster_attr = - container_of(attr, struct o2nm_cluster_attribute, attr); - ssize_t ret; - - if (o2nm_cluster_attr->store == NULL) { - ret = -EINVAL; - goto out; - } - - ret = o2nm_cluster_attr->store(cluster, page, count); - if (ret < count) - goto out; -out: - return ret; -} static struct config_item *o2nm_node_group_make_item(struct config_group *group, const char *name) @@ -722,13 +605,15 @@ static void o2nm_node_group_drop_item(struct config_group *group, struct o2nm_node *node = to_o2nm_node(item); struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); - o2net_disconnect_node(node); + if (cluster->cl_nodes[node->nd_num] == node) { + o2net_disconnect_node(node); - if (cluster->cl_has_local && - (cluster->cl_local_node == node->nd_num)) { - cluster->cl_has_local = 0; - cluster->cl_local_node = O2NM_INVALID_NODE_NUM; - o2net_stop_listening(node); + if (cluster->cl_has_local && + (cluster->cl_local_node == node->nd_num)) { + cluster->cl_has_local = 0; + cluster->cl_local_node = O2NM_INVALID_NODE_NUM; + o2net_stop_listening(node); + } } /* XXX call into net to stop this node from trading messages */ @@ -757,7 +642,7 @@ static struct configfs_group_operations o2nm_node_group_group_ops = { .drop_item = o2nm_node_group_drop_item, }; -static struct config_item_type o2nm_node_group_type = { +static const struct config_item_type o2nm_node_group_type = { .ct_group_ops = &o2nm_node_group_group_ops, .ct_owner = THIS_MODULE, }; @@ -768,17 +653,14 @@ static void o2nm_cluster_release(struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - kfree(cluster->cl_group.default_groups); kfree(cluster); } static struct configfs_item_operations o2nm_cluster_item_ops = { .release = o2nm_cluster_release, - .show_attribute = o2nm_cluster_show, - .store_attribute = o2nm_cluster_store, }; -static struct config_item_type o2nm_cluster_type = { +static const struct config_item_type o2nm_cluster_type = { .ct_item_ops = &o2nm_cluster_item_ops, .ct_attrs = o2nm_cluster_attrs, .ct_owner = THIS_MODULE, @@ -806,29 +688,26 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g struct o2nm_cluster *cluster = NULL; struct o2nm_node_group *ns = NULL; struct config_group *o2hb_group = NULL, *ret = NULL; - void *defs = NULL; - /* this runs under the parent dir's i_mutex; there can be only + /* this runs under the parent dir's i_rwsem; there can be only * one caller in here at a time */ if (o2nm_single_cluster) return ERR_PTR(-ENOSPC); cluster = kzalloc(sizeof(struct o2nm_cluster), GFP_KERNEL); ns = kzalloc(sizeof(struct o2nm_node_group), GFP_KERNEL); - defs = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); o2hb_group = o2hb_alloc_hb_set(); - if (cluster == NULL || ns == NULL || o2hb_group == NULL || defs == NULL) + if (cluster == NULL || ns == NULL || o2hb_group == NULL) goto out; config_group_init_type_name(&cluster->cl_group, name, &o2nm_cluster_type); + configfs_add_default_group(&ns->ns_group, &cluster->cl_group); + config_group_init_type_name(&ns->ns_group, "node", &o2nm_node_group_type); + configfs_add_default_group(o2hb_group, &cluster->cl_group); - cluster->cl_group.default_groups = defs; - cluster->cl_group.default_groups[0] = &ns->ns_group; - cluster->cl_group.default_groups[1] = o2hb_group; - cluster->cl_group.default_groups[2] = NULL; rwlock_init(&cluster->cl_nodes_lock); cluster->cl_node_ip_tree = RB_ROOT; cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; @@ -844,7 +723,6 @@ out: kfree(cluster); kfree(ns); o2hb_free_hb_set(o2hb_group); - kfree(defs); ret = ERR_PTR(-ENOMEM); } @@ -854,18 +732,11 @@ out: static void o2nm_cluster_group_drop_item(struct config_group *group, struct config_item *item) { struct o2nm_cluster *cluster = to_o2nm_cluster(item); - int i; - struct config_item *killme; BUG_ON(o2nm_single_cluster != cluster); o2nm_single_cluster = NULL; - for (i = 0; cluster->cl_group.default_groups[i]; i++) { - killme = &cluster->cl_group.default_groups[i]->cg_item; - cluster->cl_group.default_groups[i] = NULL; - config_item_put(killme); - } - + configfs_remove_default_groups(&cluster->cl_group); config_item_put(item); } @@ -874,7 +745,7 @@ static struct configfs_group_operations o2nm_cluster_group_group_ops = { .drop_item = o2nm_cluster_group_drop_item, }; -static struct config_item_type o2nm_cluster_group_type = { +static const struct config_item_type o2nm_cluster_group_type = { .ct_group_ops = &o2nm_cluster_group_group_ops, .ct_owner = THIS_MODULE, }; @@ -890,6 +761,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = { }, }; +static inline void o2nm_lock_subsystem(void) +{ + mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + +static inline void o2nm_unlock_subsystem(void) +{ + mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + int o2nm_depend_item(struct config_item *item) { return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); @@ -897,7 +778,7 @@ int o2nm_depend_item(struct config_item *item) void o2nm_undepend_item(struct config_item *item) { - configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); + configfs_undepend_item(item); } int o2nm_depend_this_node(void) @@ -943,13 +824,9 @@ static void __exit exit_o2nm(void) static int __init init_o2nm(void) { - int ret = -1; + int ret; - cluster_print_version(); - - ret = o2hb_init(); - if (ret) - goto out; + o2hb_init(); ret = o2net_init(); if (ret) @@ -984,6 +861,7 @@ out: MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("OCFS2 cluster management"); module_init(init_o2nm) module_exit(exit_o2nm) diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index 09ea2d388bbb..3490e77a952d 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * nodemanager.h * * Function prototypes * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_NODEMANAGER_H diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h index 3f4151da9709..6088c9f974dd 100644 --- a/fs/ocfs2/cluster/ocfs2_heartbeat.h +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -1,26 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * ocfs2_heartbeat.h * * On-disk structures for ocfs2_heartbeat * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef _OCFS2_HEARTBEAT_H diff --git a/fs/ocfs2/cluster/ocfs2_nodemanager.h b/fs/ocfs2/cluster/ocfs2_nodemanager.h index 49b594325bec..c9a0b77443e7 100644 --- a/fs/ocfs2/cluster/ocfs2_nodemanager.h +++ b/fs/ocfs2/cluster/ocfs2_nodemanager.h @@ -1,28 +1,11 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * ocfs2_nodemanager.h * * Header describing the interface between userspace and the kernel * for the ocfs2_nodemanager module. * * Copyright (C) 2002, 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef _OCFS2_NODEMANAGER_H diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ec141e758d7..bfb8b456876c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -1,23 +1,7 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: +// SPDX-License-Identifier: GPL-2.0-or-later +/* * * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ /* This quorum hack is only here until we transition to some more rational @@ -39,7 +23,7 @@ * race between when we see a node start heartbeating and when we connect * to it. * - * So nodes that are in this transtion put a hold on the quorum decision + * So nodes that are in this transition put a hold on the quorum decision * with a counter. As they fall out of this transition they drop the count * and if they're the last, they fire off the decision. */ @@ -76,20 +60,21 @@ static void o2quo_fence_self(void) switch (o2nm_single_cluster->cl_fence_method) { case O2NM_FENCE_PANIC: panic("*** ocfs2 is very sorry to be fencing this system by " - "panicing ***\n"); + "panicking ***\n"); break; default: WARN_ON(o2nm_single_cluster->cl_fence_method >= O2NM_FENCE_METHODS); + fallthrough; case O2NM_FENCE_RESET: printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " "system by restarting ***\n"); emergency_restart(); break; - }; + } } -/* Indicate that a timeout occurred on a hearbeat region write. The +/* Indicate that a timeout occurred on a heartbeat region write. The * other nodes in the cluster may consider us dead at that time so we * want to "fence" ourselves so that we don't scribble on the disk * after they think they've recovered us. This can't solve all @@ -108,7 +93,7 @@ static void o2quo_make_decision(struct work_struct *work) int lowest_hb, lowest_reachable = 0, fence = 0; struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES); if (lowest_hb != O2NM_MAX_NODES) @@ -160,9 +145,18 @@ static void o2quo_make_decision(struct work_struct *work) } out: - spin_unlock(&qs->qs_lock); - if (fence) + if (fence) { + spin_unlock_bh(&qs->qs_lock); o2quo_fence_self(); + } else { + mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " + "connected: %d, lowest: %d (%sreachable)\n", + qs->qs_heartbeating, qs->qs_connected, lowest_hb, + lowest_reachable ? "" : "un"); + spin_unlock_bh(&qs->qs_lock); + + } + } static void o2quo_set_hold(struct o2quo_state *qs, u8 node) @@ -195,14 +189,14 @@ static void o2quo_clear_hold(struct o2quo_state *qs, u8 node) } /* as a node comes up we delay the quorum decision until we know the fate of - * the connection. the hold will be droped in conn_up or hb_down. it might be + * the connection. the hold will be dropped in conn_up or hb_down. it might be * perpetuated by con_err until hb_down. if we already have a conn, we might * be dropping a hold that conn_up got. */ void o2quo_hb_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating++; mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES, @@ -217,7 +211,7 @@ void o2quo_hb_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* hb going down releases any holds we might have had due to this node from @@ -226,7 +220,7 @@ void o2quo_hb_down(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_heartbeating--; mlog_bug_on_msg(qs->qs_heartbeating < 0, @@ -239,7 +233,7 @@ void o2quo_hb_down(u8 node) o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* this tells us that we've decided that the node is still heartbeating @@ -251,18 +245,18 @@ void o2quo_hb_still_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); mlog(0, "node %u\n", node); qs->qs_pending = 1; o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* This is analogous to hb_up. as a node's connection comes up we delay the - * quorum decision until we see it heartbeating. the hold will be droped in + * quorum decision until we see it heartbeating. the hold will be dropped in * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if * it's already heartbeating we might be dropping a hold that conn_up got. * */ @@ -270,7 +264,7 @@ void o2quo_conn_up(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); qs->qs_connected++; mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES, @@ -285,7 +279,7 @@ void o2quo_conn_up(u8 node) else o2quo_clear_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } /* we've decided that we won't ever be connecting to the node again. if it's @@ -296,7 +290,7 @@ void o2quo_conn_err(u8 node) { struct o2quo_state *qs = &o2quo_state; - spin_lock(&qs->qs_lock); + spin_lock_bh(&qs->qs_lock); if (test_bit(node, qs->qs_conn_bm)) { qs->qs_connected--; @@ -305,14 +299,15 @@ void o2quo_conn_err(u8 node) node, qs->qs_connected); clear_bit(node, qs->qs_conn_bm); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); } mlog(0, "node %u, %d total\n", node, qs->qs_connected); - if (test_bit(node, qs->qs_hb_bm)) - o2quo_set_hold(qs, node); - spin_unlock(&qs->qs_lock); + spin_unlock_bh(&qs->qs_lock); } void o2quo_init(void) diff --git a/fs/ocfs2/cluster/quorum.h b/fs/ocfs2/cluster/quorum.h index 6649cc6f67c9..d64bf4482a4a 100644 --- a/fs/ocfs2/cluster/quorum.h +++ b/fs/ocfs2/cluster/quorum.h @@ -1,23 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_QUORUM_H diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index a4b07730b2e1..022f716c74ff 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +// SPDX-License-Identifier: GPL-2.0-only +/* * sys.c * * OCFS2 cluster sysfs interface * * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation, - * version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #include <linux/kernel.h> @@ -41,7 +24,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION); } static struct kobj_attribute attr_version = - __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL); + __ATTR(interface_revision, S_IRUGO, version_show, NULL); static struct attribute *o2cb_attrs[] = { &attr_version.attr, diff --git a/fs/ocfs2/cluster/sys.h b/fs/ocfs2/cluster/sys.h index d66b8ab0045e..70aaba65317e 100644 --- a/fs/ocfs2/cluster/sys.h +++ b/fs/ocfs2/cluster/sys.h @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * sys.h * * Function prototypes for o2cb sysfs interface * * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation, - * version 2 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_SYS_H diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d644dc611425..79b281e32f4c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1,33 +1,17 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * - * vim: noexpandtab sw=8 ts=8 sts=0: +// SPDX-License-Identifier: GPL-2.0-or-later +/* * * Copyright (C) 2004 Oracle. All rights reserved. * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * * ---- * - * Callers for this were originally written against a very simple synchronus + * Callers for this were originally written against a very simple synchronous * API. This implementation reflects those simple callers. Some day I'm sure * we'll need to move to a more robust posting/callback mechanism. * * Transmit calls pass in kernel virtual addresses and block copying this into * the socket's tx buffers via a usual blocking sendmsg. They'll block waiting - * for a failed socket to timeout. TX callers can also pass in a poniter to an + * for a failed socket to timeout. TX callers can also pass in a pointer to an * 'int' which gets filled with an errno off the wire in response to the * message they send. * @@ -54,6 +38,7 @@ */ #include <linux/kernel.h> +#include <linux/sched/mm.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/idr.h> @@ -61,8 +46,9 @@ #include <linux/net.h> #include <linux/export.h> #include <net/tcp.h> +#include <trace/events/sock.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "heartbeat.h" #include "tcp.h" @@ -97,7 +83,7 @@ typeof(sc) __sc = (sc); \ mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p " \ "pg_off %zu] " fmt, __sc, \ - atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock, \ + kref_read(&__sc->sc_kref), __sc->sc_sock, \ __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off , \ ##args); \ } while (0) @@ -108,14 +94,14 @@ static struct rb_root o2net_handler_tree = RB_ROOT; static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; /* XXX someday we'll need better accounting */ -static struct socket *o2net_listen_sock = NULL; +static struct socket *o2net_listen_sock; /* * listen work is only queued by the listening socket callbacks on the * o2net_wq. teardown detaches the callbacks before destroying the workqueue. * quorum work is queued as sock containers are shutdown.. stop_listening * tears down all the node's sock containers, preventing future shutdowns - * and queued quroum work, before canceling delayed quorum work and + * and queued quorum work, before canceling delayed quorum work and * destroying the work queue. */ static struct workqueue_struct *o2net_wq; @@ -137,9 +123,9 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] = static void o2net_sc_connect_completed(struct work_struct *work); static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); -static void o2net_listen_data_ready(struct sock *sk, int bytes); +static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); -static void o2net_idle_timer(unsigned long data); +static void o2net_idle_timer(struct timer_list *t); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); @@ -262,17 +248,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc) #endif /* CONFIG_OCFS2_FS_STATS */ -static inline int o2net_reconnect_delay(void) +static inline unsigned int o2net_reconnect_delay(void) { return o2nm_single_cluster->cl_reconnect_delay_ms; } -static inline int o2net_keepalive_delay(void) +static inline unsigned int o2net_keepalive_delay(void) { return o2nm_single_cluster->cl_keepalive_delay_ms; } -static inline int o2net_idle_timeout(void) +static inline unsigned int o2net_idle_timeout(void) { return o2nm_single_cluster->cl_idle_timeout_ms; } @@ -449,9 +435,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = o2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; + timer_setup(&sc->sc_idle_timeout, o2net_idle_timer, 0); sclog(sc, "alloced\n"); @@ -536,15 +520,16 @@ static void o2net_set_nn_state(struct o2net_node *nn, if (nn->nn_persistent_error || nn->nn_sc_valid) wake_up(&nn->nn_sc_wq); - if (!was_err && nn->nn_persistent_error) { + if (was_valid && !was_err && nn->nn_persistent_error) { o2quo_conn_err(o2net_num_from_nn(nn)); queue_delayed_work(o2net_wq, &nn->nn_still_up, msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); } if (was_valid && !valid) { - printk(KERN_NOTICE "o2net: No longer connected to " - SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); + if (old_sc) + printk(KERN_NOTICE "o2net: No longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -596,13 +581,16 @@ static void o2net_set_nn_state(struct o2net_node *nn, } /* see o2net_register_callbacks() */ -static void o2net_data_ready(struct sock *sk, int bytes) +static void o2net_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); + struct o2net_sock_container *sc; - read_lock(&sk->sk_callback_lock); - if (sk->sk_user_data) { - struct o2net_sock_container *sc = sk->sk_user_data; + trace_sk_data_ready(sk); + + read_lock_bh(&sk->sk_callback_lock); + sc = sk->sk_user_data; + if (sc) { sclog(sc, "data_ready hit\n"); o2net_set_data_ready_time(sc); o2net_sc_queue_work(sc, &sc->sc_rx_work); @@ -610,9 +598,9 @@ static void o2net_data_ready(struct sock *sk, int bytes) } else { ready = sk->sk_data_ready; } - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); - ready(sk, bytes); + ready(sk); } /* see o2net_register_callbacks() */ @@ -621,7 +609,7 @@ static void o2net_state_change(struct sock *sk) void (*state_change)(struct sock *sk); struct o2net_sock_container *sc; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); sc = sk->sk_user_data; if (sc == NULL) { state_change = sk->sk_state_change; @@ -648,7 +636,7 @@ static void o2net_state_change(struct sock *sk) break; } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); state_change(sk); } @@ -736,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work) if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { /* we shouldn't flush as we're in the thread, the * races with pending sc work structs are harmless */ - del_timer_sync(&sc->sc_idle_timeout); + timer_delete_sync(&sc->sc_idle_timeout); o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); sc_put(sc); kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); @@ -765,32 +753,32 @@ static struct o2net_msg_handler * o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, struct rb_node **ret_parent) { - struct rb_node **p = &o2net_handler_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; struct o2net_msg_handler *nmh, *ret = NULL; int cmp; - while (*p) { - parent = *p; - nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); cmp = o2net_handler_cmp(nmh, msg_type, key); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else { + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { ret = nmh; - break; + break; } - } + } - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; - return ret; + return ret; } static void o2net_handler_kref_release(struct kref *kref) @@ -871,8 +859,6 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, "for type %u key %08x\n", msg_type, key); } write_unlock(&o2net_handler_lock); - if (ret) - goto out; out: if (ret) @@ -915,74 +901,51 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; + struct kvec vec = { .iov_len = len, .iov_base = data, }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, len); + return sock_recvmsg(sock, &msg, MSG_DONTWAIT); } static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, size_t veclen, size_t total) { int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; + struct msghdr msg = {.msg_flags = 0,}; if (sock == NULL) { ret = -EINVAL; goto out; } - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; + ret = kernel_sendmsg(sock, &msg, vec, veclen, total); + if (likely(ret == total)) + return 0; + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); + mlog(0, "returning error: %d\n", ret); return ret; } static void o2net_sendpage(struct o2net_sock_container *sc, - void *kmalloced_virt, - size_t size) + void *virt, size_t size) { struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + struct msghdr msg = {}; + struct bio_vec bv; ssize_t ret; + bvec_set_virt(&bv, virt, size); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, size); + while (1) { + msg.msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES; mutex_lock(&sc->sc_send_lock); - ret = sc->sc_sock->ops->sendpage(sc->sc_sock, - virt_to_page(kmalloced_virt), - (long)kmalloced_virt & ~PAGE_MASK, - size, MSG_DONTWAIT); + ret = sock_sendmsg(sc->sc_sock, &msg); mutex_unlock(&sc->sc_send_lock); + if (ret == size) break; if (ret == (ssize_t)-EAGAIN) { @@ -1033,16 +996,15 @@ static int o2net_tx_can_proceed(struct o2net_node *nn, } /* Get a map of all nodes to which this node is currently connected to */ -void o2net_fill_node_map(unsigned long *map, unsigned bytes) +void o2net_fill_node_map(unsigned long *map, unsigned int bits) { struct o2net_sock_container *sc; int node, ret; - BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long))); - - memset(map, 0, bytes); + bitmap_zero(map, bits); for (node = 0; node < O2NM_MAX_NODES; ++node) { - o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret); + if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret)) + continue; if (!ret) { set_bit(node, map); sc_put(sc); @@ -1102,7 +1064,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, o2net_set_nst_sock_container(&nst, sc); veclen = caller_veclen + 1; - vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); + vec = kmalloc_array(veclen, sizeof(struct kvec), GFP_ATOMIC); if (vec == NULL) { mlog(0, "failed to %zu element kvec!\n", veclen); ret = -ENOMEM; @@ -1238,7 +1200,6 @@ static int o2net_process_message(struct o2net_sock_container *sc, msglog(hdr, "bad magic\n"); ret = -EINVAL; goto out; - break; } /* find a handler for it */ @@ -1458,7 +1419,7 @@ out: return ret; } -/* this work func is triggerd by data ready. it reads until it can read no +/* this work func is triggered by data ready. it reads until it can read no * more. it interprets 0, eof, as fatal. if data_ready hits while we're doing * our work the work struct will be marked and we'll be called again. */ static void o2net_rx_until_empty(struct work_struct *work) @@ -1481,31 +1442,6 @@ static void o2net_rx_until_empty(struct work_struct *work) sc_put(sc); } -static int o2net_set_nodelay(struct socket *sock) -{ - int ret, val = 1; - mm_segment_t oldfs; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - - /* - * Dear unsuspecting programmer, - * - * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level - * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will - * silently turn into SO_DEBUG. - * - * Yours, - * Keeper of hilariously fragile interfaces. - */ - ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char __user *)&val, sizeof(val)); - - set_fs(oldfs); - return ret; -} - static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1547,12 +1483,13 @@ static void o2net_sc_send_keep_req(struct work_struct *work) sc_put(sc); } -/* socket shutdown does a del_timer_sync against this as it tears down. +/* socket shutdown does a timer_delete_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ -static void o2net_idle_timer(unsigned long data) +static void o2net_idle_timer(struct timer_list *t) { - struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct o2net_sock_container *sc = timer_container_of(sc, t, + sc_idle_timeout); struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); #ifdef CONFIG_DEBUG_FS unsigned long msecs = ktime_to_ms(ktime_get()) - @@ -1562,16 +1499,20 @@ static void o2net_idle_timer(unsigned long data) #endif printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " - "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), - msecs / 1000, msecs % 1000); + "idle for %lu.%lu secs.\n", + SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); - /* - * Initialize the nn_timeout so that the next connection attempt - * will continue in o2net_start_connect. + /* idle timerout happen, don't shutdown the connection, but + * make fence decision. Maybe the connection can recover before + * the decision is made. */ atomic_set(&nn->nn_timeout, 1); + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + + o2net_sc_reset_idle_timer(sc); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) @@ -1586,6 +1527,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + /* clear fence decision since the connection recover from timeout*/ + if (atomic_read(&nn->nn_timeout)) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_still_up); + atomic_set(&nn->nn_timeout, 0); + } + /* Only push out an existing timer */ if (timer_pending(&sc->sc_idle_timeout)) o2net_sc_reset_idle_timer(sc); @@ -1606,23 +1556,25 @@ static void o2net_start_connect(struct work_struct *work) struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; int ret = 0, stop; unsigned int timeout; + unsigned int nofs_flag; + /* + * sock_create allocates the sock with GFP_KERNEL. We must + * prevent the filesystem from being reentered by memory reclaim. + */ + nofs_flag = memalloc_nofs_save(); /* if we're greater we initiate tx, otherwise we accept */ if (o2nm_this_node() <= o2net_num_from_nn(nn)) goto out; /* watch for racing with tearing a node down */ node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); - if (node == NULL) { - ret = 0; + if (node == NULL) goto out; - } mynode = o2nm_get_node_by_num(o2nm_this_node()); - if (mynode == NULL) { - ret = 0; + if (mynode == NULL) goto out; - } spin_lock(&nn->nn_lock); /* @@ -1657,12 +1609,13 @@ static void o2net_start_connect(struct work_struct *work) sc->sc_sock = sock; /* freed by sc_kref_release */ sock->sk->sk_allocation = GFP_ATOMIC; + sock->sk->sk_use_task_frag = false; myaddr.sin_family = AF_INET; myaddr.sin_addr.s_addr = mynode->nd_ipv4_address; myaddr.sin_port = htons(0); /* any port */ - ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&myaddr, sizeof(myaddr)); if (ret) { mlog(ML_ERROR, "bind failed with %d at address %pI4\n", @@ -1670,11 +1623,8 @@ static void o2net_start_connect(struct work_struct *work) goto out; } - ret = o2net_set_nodelay(sc->sc_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(sc->sc_sock->sk); + tcp_sock_set_user_timeout(sock->sk, O2NET_TCP_USER_TIMEOUT); o2net_register_callbacks(sc->sc_sock->sk, sc); @@ -1688,20 +1638,19 @@ static void o2net_start_connect(struct work_struct *work) remoteaddr.sin_port = node->nd_ipv4_port; ret = sc->sc_sock->ops->connect(sc->sc_sock, - (struct sockaddr *)&remoteaddr, + (struct sockaddr_unsized *)&remoteaddr, sizeof(remoteaddr), O_NONBLOCK); if (ret == -EINPROGRESS) ret = 0; out: - if (ret) { + if (ret && sc) { printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); /* 0 err so that another will be queued and attempted * from set_nn_state */ - if (sc) - o2net_ensure_shutdown(nn, sc, 0); + o2net_ensure_shutdown(nn, sc, 0); } if (sc) sc_put(sc); @@ -1710,6 +1659,7 @@ out: if (mynode) o2nm_node_put(mynode); + memalloc_nofs_restore(nofs_flag); return; } @@ -1721,12 +1671,13 @@ static void o2net_connect_expired(struct work_struct *work) spin_lock(&nn->nn_lock); if (!nn->nn_sc_valid) { printk(KERN_NOTICE "o2net: No connection established with " - "node %u after %u.%u seconds, giving up.\n", + "node %u after %u.%u seconds, check network and" + " cluster configuration.\n", o2net_num_from_nn(nn), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); - o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); + o2net_set_nn_state(nn, NULL, 0, 0); } spin_unlock(&nn->nn_lock); } @@ -1787,7 +1738,7 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, (msecs_to_jiffies(o2net_reconnect_delay()) + 1); if (node_num != o2nm_this_node()) { - /* believe it or not, accept and node hearbeating testing + /* believe it or not, accept and node heartbeating testing * can succeed for this node before we got here.. so * only use set_nn_state to clear the persistent error * if that hasn't already happened */ @@ -1826,17 +1777,28 @@ int o2net_register_hb_callbacks(void) /* ------------------------------------------------------------ */ -static int o2net_accept_one(struct socket *sock) +static int o2net_accept_one(struct socket *sock, int *more) { - int ret, slen; + int ret; struct sockaddr_in sin; struct socket *new_sock = NULL; struct o2nm_node *node = NULL; struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; + struct proto_accept_arg arg = { + .flags = O_NONBLOCK, + }; struct o2net_node *nn; + unsigned int nofs_flag; + + /* + * sock_create_lite allocates the sock with GFP_KERNEL. We must + * prevent the filesystem from being reentered by memory reclaim. + */ + nofs_flag = memalloc_nofs_save(); BUG_ON(sock == NULL); + *more = 0; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); if (ret) @@ -1844,21 +1806,17 @@ static int o2net_accept_one(struct socket *sock) new_sock->type = sock->type; new_sock->ops = sock->ops; - ret = sock->ops->accept(sock, new_sock, O_NONBLOCK); + ret = sock->ops->accept(sock, new_sock, &arg); if (ret < 0) goto out; + *more = 1; new_sock->sk->sk_allocation = GFP_ATOMIC; - ret = o2net_set_nodelay(new_sock); - if (ret) { - mlog(ML_ERROR, "setting TCP_NODELAY failed with %d\n", ret); - goto out; - } + tcp_sock_set_nodelay(new_sock->sk); + tcp_sock_set_user_timeout(new_sock->sk, O2NET_TCP_USER_TIMEOUT); - slen = sizeof(sin); - ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, - &slen, 1); + ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) goto out; @@ -1873,12 +1831,16 @@ static int o2net_accept_one(struct socket *sock) if (o2nm_this_node() >= node->nd_num) { local_node = o2nm_get_node_by_num(o2nm_this_node()); - printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " - "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " - "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, - &(local_node->nd_ipv4_address), - ntohs(local_node->nd_ipv4_port), node->nd_name, - node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + if (local_node) + printk(KERN_NOTICE "o2net: Unexpected connect attempt " + "seen at node '%s' (%u, %pI4:%d) from " + "node '%s' (%u, %pI4:%d)\n", + local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), + node->nd_name, + node->nd_num, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); ret = -EINVAL; goto out; } @@ -1939,39 +1901,78 @@ out: o2nm_node_put(local_node); if (sc) sc_put(sc); + + memalloc_nofs_restore(nofs_flag); return ret; } +/* + * This function is invoked in response to one or more + * pending accepts at softIRQ level. We must drain the + * entire que before returning. + */ + static void o2net_accept_many(struct work_struct *work) { struct socket *sock = o2net_listen_sock; - while (o2net_accept_one(sock) == 0) + int more; + + /* + * It is critical to note that due to interrupt moderation + * at the network driver level, we can't assume to get a + * softIRQ for every single conn since tcp SYN packets + * can arrive back-to-back, and therefore many pending + * accepts may result in just 1 softIRQ. If we terminate + * the o2net_accept_one() loop upon seeing an err, what happens + * to the rest of the conns in the queue? If no new SYN + * arrives for hours, no softIRQ will be delivered, + * and the connections will just sit in the queue. + */ + + for (;;) { + o2net_accept_one(sock, &more); + if (!more) + break; cond_resched(); + } } -static void o2net_listen_data_ready(struct sock *sk, int bytes) +static void o2net_listen_data_ready(struct sock *sk) { - void (*ready)(struct sock *sk, int bytes); + void (*ready)(struct sock *sk); + + trace_sk_data_ready(sk); - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (ready == NULL) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } - /* ->sk_data_ready is also called for a newly established child socket - * before it has been accepted and the acceptor has set up their - * data_ready.. we only want to queue listen work for our listening - * socket */ + /* This callback may called twice when a new connection + * is being established as a child socket inherits everything + * from a parent LISTEN socket, including the data_ready cb of + * the parent. This leads to a hazard. In o2net_accept_one() + * we are still initializing the child socket but have not + * changed the inherited data_ready callback yet when + * data starts arriving. + * We avoid this hazard by checking the state. + * For the listening socket, the state will be TCP_LISTEN; for the new + * socket, will be TCP_ESTABLISHED. Also, in this case, + * sk->sk_user_data is not a valid function pointer. + */ + if (sk->sk_state == TCP_LISTEN) { - mlog(ML_TCP, "bytes: %d\n", bytes); queue_work(o2net_wq, &o2net_listen_work); + } else { + ready = NULL; } out: - read_unlock(&sk->sk_callback_lock); - ready(sk, bytes); + read_unlock_bh(&sk->sk_callback_lock); + if (ready != NULL) + ready(sk); } static int o2net_open_listening_sock(__be32 addr, __be16 port) @@ -2001,7 +2002,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port) INIT_WORK(&o2net_listen_work, o2net_accept_many); sock->sk->sk_reuse = SK_CAN_REUSE; - ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + ret = sock->ops->bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); if (ret < 0) { printk(KERN_ERR "o2net: Error %d while binding socket at " "%pI4:%u\n", ret, &addr, ntohs(port)); @@ -2037,7 +2038,7 @@ int o2net_start_listening(struct o2nm_node *node) BUG_ON(o2net_listen_sock != NULL); mlog(ML_KTHREAD, "starting o2net thread...\n"); - o2net_wq = create_singlethread_workqueue("o2net"); + o2net_wq = alloc_ordered_workqueue("o2net", WQ_MEM_RECLAIM); if (o2net_wq == NULL) { mlog(ML_ERROR, "unable to launch o2net thread\n"); return -ENOMEM; /* ? */ @@ -2093,22 +2094,23 @@ void o2net_stop_listening(struct o2nm_node *node) int o2net_init(void) { + struct folio *folio; + void *p; unsigned long i; o2quo_init(); + o2net_debugfs_init(); - if (o2net_debugfs_init()) - return -ENOMEM; + folio = folio_alloc(GFP_KERNEL | __GFP_ZERO, 0); + if (!folio) + goto out; - o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); - o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); - return -ENOMEM; - } + p = folio_address(folio); + o2net_hand = p; + p += sizeof(struct o2net_handshake); + o2net_keep_req = p; + p += sizeof(struct o2net_msg); + o2net_keep_resp = p; o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); o2net_hand->connector_id = cpu_to_be64(1); @@ -2133,13 +2135,16 @@ int o2net_init(void) } return 0; + +out: + o2net_debugfs_exit(); + o2quo_exit(); + return -ENOMEM; } void o2net_exit(void) { o2quo_exit(); - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); o2net_debugfs_exit(); + folio_put(virt_to_folio(o2net_hand)); } diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 5bada2a69b50..a75b551d31c7 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -1,27 +1,10 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * tcp.h * * Function prototypes * * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * */ #ifndef O2CLUSTER_TCP_H @@ -47,7 +30,7 @@ struct o2net_msg __be32 status; __be32 key; __be32 msg_num; - __u8 buf[0]; + __u8 buf[]; }; typedef int (o2net_msg_handler_func)(struct o2net_msg *msg, u32 len, void *data, @@ -63,6 +46,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 +#define O2NET_TCP_USER_TIMEOUT 0x7fffffff /* TODO: figure this out.... */ static inline int o2net_link_down(int err, struct socket *sock) @@ -123,16 +107,15 @@ struct o2net_send_tracking; struct o2net_sock_container; #ifdef CONFIG_DEBUG_FS -int o2net_debugfs_init(void); +void o2net_debugfs_init(void); void o2net_debugfs_exit(void); void o2net_debug_add_nst(struct o2net_send_tracking *nst); void o2net_debug_del_nst(struct o2net_send_tracking *nst); void o2net_debug_add_sc(struct o2net_sock_container *sc); void o2net_debug_del_sc(struct o2net_sock_container *sc); #else -static inline int o2net_debugfs_init(void) +static inline void o2net_debugfs_init(void) { - return 0; } static inline void o2net_debugfs_exit(void) { diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 4cbcb65784a3..601c99bd2611 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -1,22 +1,6 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. */ #ifndef O2CLUSTER_TCP_INTERNAL_H @@ -107,12 +91,12 @@ struct o2net_node { struct list_head nn_status_list; /* connects are attempted from when heartbeat comes up until either hb - * goes down, the node is unconfigured, no connect attempts succeed - * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work - * is queued from set_nn_state both from hb up and from itself if a - * connect attempt fails and so can be self-arming. shutdown is - * careful to first mark the nn such that no connects will be attempted - * before canceling delayed connect work and flushing the queue. */ + * goes down, the node is unconfigured, or a connect succeeds. + * connect_work is queued from set_nn_state both from hb up and from + * itself if a connect attempt fails and so can be self-arming. + * shutdown is careful to first mark the nn such that no connects will + * be attempted before canceling delayed connect work and flushing the + * queue. */ struct delayed_work nn_connect_work; unsigned long nn_last_connect_attempt; @@ -165,7 +149,7 @@ struct o2net_sock_container { /* original handlers for the sockets */ void (*sc_state_change)(struct sock *sk); - void (*sc_data_ready)(struct sock *sk, int bytes); + void (*sc_data_ready)(struct sock *sk); u32 sc_msg_key; u16 sc_msg_type; @@ -196,7 +180,7 @@ struct o2net_msg_handler { u32 nh_msg_type; u32 nh_key; o2net_msg_handler_func *nh_func; - o2net_msg_handler_func *nh_func_data; + void *nh_func_data; o2net_post_msg_handler_func *nh_post_func; struct kref nh_kref; diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c deleted file mode 100644 index a56eee6abad3..000000000000 --- a/fs/ocfs2/cluster/ver.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.c - * - * version string - * - * Copyright (C) 2002, 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include "ver.h" - -#define CLUSTER_BUILD_VERSION "1.5.0" - -#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION - -void cluster_print_version(void) -{ - printk(KERN_INFO "%s\n", VERSION_STR); -} - -MODULE_DESCRIPTION(VERSION_STR); - -MODULE_VERSION(CLUSTER_BUILD_VERSION); diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h deleted file mode 100644 index 32554c3382c2..000000000000 --- a/fs/ocfs2/cluster/ver.h +++ /dev/null @@ -1,31 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ver.h - * - * Function prototypes - * - * Copyright (C) 2005 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - */ - -#ifndef O2CLUSTER_VER_H -#define O2CLUSTER_VER_H - -void cluster_print_version(void); - -#endif /* O2CLUSTER_VER_H */ |
