diff options
Diffstat (limited to 'net')
71 files changed, 2166 insertions, 982 deletions
diff --git a/net/802/mrp.c b/net/802/mrp.c index e085bcc754f6..1eb05d80b07b 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -871,10 +871,10 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) */ del_timer_sync(&app->join_timer); - spin_lock(&app->lock); + spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_pdu_queue(app); - spin_unlock(&app->lock); + spin_unlock_bh(&app->lock); mrp_queue_xmit(app); diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 8e15d966d9b0..239992021b1d 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -837,6 +837,19 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst); if (dat_entry) { + /* If the ARP request is destined for a local client the local + * client will answer itself. DAT would only generate a + * duplicate packet. + * + * Moreover, if the soft-interface is enslaved into a bridge, an + * additional DAT answer may trigger kernel warnings about + * a packet coming from the wrong port. + */ + if (batadv_is_my_client(bat_priv, dat_entry->mac_addr)) { + ret = true; + goto out; + } + skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src, bat_priv->soft_iface, ip_dst, hw_src, dat_entry->mac_addr, hw_src); diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 3e30a0f1b908..51aafd669cbb 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -163,16 +163,25 @@ void batadv_mesh_free(struct net_device *soft_iface) batadv_vis_quit(bat_priv); batadv_gw_node_purge(bat_priv); - batadv_originator_free(bat_priv); batadv_nc_free(bat_priv); + batadv_dat_free(bat_priv); + batadv_bla_free(bat_priv); + /* Free the TT and the originator tables only after having terminated + * all the other depending components which may use these structures for + * their purposes. + */ batadv_tt_free(bat_priv); - batadv_bla_free(bat_priv); - - batadv_dat_free(bat_priv); + /* Since the originator table clean up routine is accessing the TT + * tables as well, it has to be invoked after the TT tables have been + * freed and marked as empty. This ensures that no cleanup RCU callbacks + * accessing the TT data are scheduled for later execution. + */ + batadv_originator_free(bat_priv); free_percpu(bat_priv->bat_counters); + bat_priv->bat_counters = NULL; atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); } @@ -475,7 +484,7 @@ static int batadv_param_set_ra(const char *val, const struct kernel_param *kp) char *algo_name = (char *)val; size_t name_len = strlen(algo_name); - if (algo_name[name_len - 1] == '\n') + if (name_len > 0 && algo_name[name_len - 1] == '\n') algo_name[name_len - 1] = '\0'; bat_algo_ops = batadv_algo_get(algo_name); diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index f7c54305a918..e84629ece9b7 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1514,6 +1514,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct ethhdr *ethhdr, ethhdr_tmp; uint8_t *orig_dest, ttl, ttvn; unsigned int coding_len; + int err; /* Save headers temporarily */ memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp)); @@ -1568,8 +1569,11 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, coding_len); /* Resize decoded skb if decoded with larger packet */ - if (nc_packet->skb->len > coding_len + h_size) - pskb_trim_rcsum(skb, coding_len + h_size); + if (nc_packet->skb->len > coding_len + h_size) { + err = pskb_trim_rcsum(skb, coding_len + h_size); + if (err) + return NULL; + } /* Create decoded unicast packet */ unicast_packet = (struct batadv_unicast_packet *)skb->data; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 2f3452546636..fad1a2093e15 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -156,12 +156,28 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) kfree(orig_node); } +/** + * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly + * schedule an rcu callback for freeing it + * @orig_node: the orig node to free + */ void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) { if (atomic_dec_and_test(&orig_node->refcount)) call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } +/** + * batadv_orig_node_free_ref_now - decrement the orig node refcounter and + * possibly free it (without rcu callback) + * @orig_node: the orig node to free + */ +void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node) +{ + if (atomic_dec_and_test(&orig_node->refcount)) + batadv_orig_node_free_rcu(&orig_node->rcu); +} + void batadv_originator_free(struct batadv_priv *bat_priv) { struct batadv_hashtable *hash = bat_priv->orig_hash; diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 7df48fa7669d..734e5a3d8a5b 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -26,6 +26,7 @@ int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); +void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_get_orig_node(struct batadv_priv *bat_priv, const uint8_t *addr); struct batadv_neigh_node * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 6f20d339e33a..819dfb006cdf 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -505,6 +505,7 @@ unreg_debugfs: batadv_debugfs_del_meshif(dev); free_bat_counters: free_percpu(bat_priv->bat_counters); + bat_priv->bat_counters = NULL; return ret; } diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 5e89deeb9542..9e8748575845 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -144,7 +144,12 @@ static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu) struct batadv_tt_orig_list_entry *orig_entry; orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); - batadv_orig_node_free_ref(orig_entry->orig_node); + + /* We are in an rcu callback here, therefore we cannot use + * batadv_orig_node_free_ref() and its call_rcu(): + * An rcu_barrier() wouldn't wait for that to finish + */ + batadv_orig_node_free_ref_now(orig_entry->orig_node); kfree(orig_entry); } diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 9878eb8204c5..19c37a4929bc 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -72,13 +72,12 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, int offset) } static void -ebt_log_packet(u_int8_t pf, unsigned int hooknum, - const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct nf_loginfo *loginfo, - const char *prefix) +ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *loginfo, + const char *prefix) { unsigned int bitmask; - struct net *net = dev_net(in ? in : out); /* FIXME: Disabled from containers until syslog ns is supported */ if (!net_eq(net, &init_net)) @@ -191,7 +190,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par) nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in, par->out, &li, "%s", info->prefix); else - ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in, + ebt_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in, par->out, &li, info->prefix); return EBT_CONTINUE; } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index fc1905c51417..df0364aa12d5 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -131,14 +131,16 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) return skb; } -static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - const struct ebt_ulog_info *uloginfo, const char *prefix) +static void ebt_ulog_packet(struct net *net, unsigned int hooknr, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ebt_ulog_info *uloginfo, + const char *prefix) { ebt_ulog_packet_msg_t *pm; size_t size, copy_len; struct nlmsghdr *nlh; - struct net *net = dev_net(in ? in : out); struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); unsigned int group = uloginfo->nlgroup; ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group]; @@ -233,7 +235,7 @@ unlock: } /* this function is registered with the netfilter core */ -static void ebt_log_packet(u_int8_t pf, unsigned int hooknum, +static void ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, const struct nf_loginfo *li, const char *prefix) @@ -252,13 +254,15 @@ static void ebt_log_packet(u_int8_t pf, unsigned int hooknum, strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); } - ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); + ebt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); } static unsigned int ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) { - ebt_ulog_packet(par->hooknum, skb, par->in, par->out, + struct net *net = dev_net(par->in ? par->in : par->out); + + ebt_ulog_packet(net, par->hooknum, skb, par->in, par->out, par->targinfo, NULL); return EBT_CONTINUE; } diff --git a/net/ceph/Makefile b/net/ceph/Makefile index e87ef435e11b..958d9856912c 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ crypto.o armor.o \ auth_x.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ - pagevec.o + pagevec.o snapshot.o diff --git a/net/ceph/auth.c b/net/ceph/auth.c index b4bf4ac090f1..6b923bcaa2a4 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -47,6 +47,7 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp if (!ac) goto out; + mutex_init(&ac->mutex); ac->negotiating = true; if (name) ac->name = name; @@ -73,10 +74,12 @@ void ceph_auth_destroy(struct ceph_auth_client *ac) */ void ceph_auth_reset(struct ceph_auth_client *ac) { + mutex_lock(&ac->mutex); dout("auth_reset %p\n", ac); if (ac->ops && !ac->negotiating) ac->ops->reset(ac); ac->negotiating = true; + mutex_unlock(&ac->mutex); } int ceph_entity_name_encode(const char *name, void **p, void *end) @@ -102,6 +105,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) int i, num; int ret; + mutex_lock(&ac->mutex); dout("auth_build_hello\n"); monhdr->have_version = 0; monhdr->session_mon = cpu_to_le16(-1); @@ -122,15 +126,19 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) ret = ceph_entity_name_encode(ac->name, &p, end); if (ret < 0) - return ret; + goto out; ceph_decode_need(&p, end, sizeof(u64), bad); ceph_encode_64(&p, ac->global_id); ceph_encode_32(&lenp, p - lenp - sizeof(u32)); - return p - buf; + ret = p - buf; +out: + mutex_unlock(&ac->mutex); + return ret; bad: - return -ERANGE; + ret = -ERANGE; + goto out; } static int ceph_build_auth_request(struct ceph_auth_client *ac, @@ -151,11 +159,13 @@ static int ceph_build_auth_request(struct ceph_auth_client *ac, if (ret < 0) { pr_err("error %d building auth method %s request\n", ret, ac->ops->name); - return ret; + goto out; } dout(" built request %d bytes\n", ret); ceph_encode_32(&p, ret); - return p + ret - msg_buf; + ret = p + ret - msg_buf; +out: + return ret; } /* @@ -176,6 +186,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, int result_msg_len; int ret = -EINVAL; + mutex_lock(&ac->mutex); dout("handle_auth_reply %p %p\n", p, end); ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); protocol = ceph_decode_32(&p); @@ -227,33 +238,103 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac, ret = ac->ops->handle_reply(ac, result, payload, payload_end); if (ret == -EAGAIN) { - return ceph_build_auth_request(ac, reply_buf, reply_len); + ret = ceph_build_auth_request(ac, reply_buf, reply_len); } else if (ret) { pr_err("auth method '%s' error %d\n", ac->ops->name, ret); - return ret; } - return 0; -bad: - pr_err("failed to decode auth msg\n"); out: + mutex_unlock(&ac->mutex); return ret; + +bad: + pr_err("failed to decode auth msg\n"); + ret = -EINVAL; + goto out; } int ceph_build_auth(struct ceph_auth_client *ac, void *msg_buf, size_t msg_len) { + int ret = 0; + + mutex_lock(&ac->mutex); if (!ac->protocol) - return ceph_auth_build_hello(ac, msg_buf, msg_len); - BUG_ON(!ac->ops); - if (ac->ops->should_authenticate(ac)) - return ceph_build_auth_request(ac, msg_buf, msg_len); - return 0; + ret = ceph_auth_build_hello(ac, msg_buf, msg_len); + else if (ac->ops->should_authenticate(ac)) + ret = ceph_build_auth_request(ac, msg_buf, msg_len); + mutex_unlock(&ac->mutex); + return ret; } int ceph_auth_is_authenticated(struct ceph_auth_client *ac) { - if (!ac->ops) - return 0; - return ac->ops->is_authenticated(ac); + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops) + ret = ac->ops->is_authenticated(ac); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_is_authenticated); + +int ceph_auth_create_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *auth) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->create_authorizer) + ret = ac->ops->create_authorizer(ac, peer_type, auth); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_create_authorizer); + +void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->destroy_authorizer) + ac->ops->destroy_authorizer(ac, a); + mutex_unlock(&ac->mutex); +} +EXPORT_SYMBOL(ceph_auth_destroy_authorizer); + +int ceph_auth_update_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *a) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, a); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_update_authorizer); + +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->verify_authorizer_reply) + ret = ac->ops->verify_authorizer_reply(ac, a, len); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply); + +void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) +{ + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + mutex_unlock(&ac->mutex); } +EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index a16bf14eb027..96238ba95f2b 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -298,6 +298,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, return -ENOMEM; } au->service = th->service; + au->secret_id = th->secret_id; msg_a = au->buf->vec.iov_base; msg_a->struct_v = 1; @@ -555,6 +556,26 @@ static int ceph_x_create_authorizer( return 0; } +static int ceph_x_update_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth) +{ + struct ceph_x_authorizer *au; + struct ceph_x_ticket_handler *th; + + th = get_ticket_handler(ac, peer_type); + if (IS_ERR(th)) + return PTR_ERR(th); + + au = (struct ceph_x_authorizer *)auth->authorizer; + if (au->secret_id < th->secret_id) { + dout("ceph_x_update_authorizer service %u secret %llu < %llu\n", + au->service, au->secret_id, th->secret_id); + return ceph_x_build_authorizer(ac, th, au); + } + return 0; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_authorizer *a, size_t len) { @@ -630,7 +651,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - remove_ticket_handler(ac, th); + memset(&th->validity, 0, sizeof(th->validity)); } @@ -641,6 +662,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = { .build_request = ceph_x_build_request, .handle_reply = ceph_x_handle_reply, .create_authorizer = ceph_x_create_authorizer, + .update_authorizer = ceph_x_update_authorizer, .verify_authorizer_reply = ceph_x_verify_authorizer_reply, .destroy_authorizer = ceph_x_destroy_authorizer, .invalidate_authorizer = ceph_x_invalidate_authorizer, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index f459e93b774f..c5a058da7ac8 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -29,6 +29,7 @@ struct ceph_x_authorizer { struct ceph_buffer *buf; unsigned int service; u64 nonce; + u64 secret_id; char reply_buf[128]; /* big enough for encrypted blob */ }; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index e65e6e4be38b..34b11ee8124e 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -606,11 +606,17 @@ static int __init init_ceph_lib(void) if (ret < 0) goto out_crypto; + ret = ceph_osdc_setup(); + if (ret < 0) + goto out_msgr; + pr_info("loaded (mon/osd proto %d/%d)\n", CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); return 0; +out_msgr: + ceph_msgr_exit(); out_crypto: ceph_crypto_shutdown(); out_debugfs: @@ -622,6 +628,7 @@ out: static void __exit exit_ceph_lib(void) { dout("exit_ceph_lib\n"); + ceph_osdc_cleanup(); ceph_msgr_exit(); ceph_crypto_shutdown(); ceph_debugfs_cleanup(); diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 00d051f4894e..83661cdc0766 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -123,8 +123,8 @@ static int osdc_show(struct seq_file *s, void *pp) mutex_lock(&osdc->request_mutex); for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { struct ceph_osd_request *req; + unsigned int i; int opcode; - int i; req = rb_entry(p, struct ceph_osd_request, r_node); @@ -142,7 +142,7 @@ static int osdc_show(struct seq_file *s, void *pp) seq_printf(s, "\t"); for (i = 0; i < req->r_num_ops; i++) { - opcode = le16_to_cpu(req->r_request_ops[i].op); + opcode = req->r_ops[i].op; seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); } diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 2c0669fb54e3..eb0a46a49bd4 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -21,6 +21,9 @@ #include <linux/ceph/pagelist.h> #include <linux/export.h> +#define list_entry_next(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable @@ -149,6 +152,11 @@ static bool con_flag_test_and_set(struct ceph_connection *con, return test_and_set_bit(con_flag, &con->flags); } +/* Slab caches for frequently-allocated structures */ + +static struct kmem_cache *ceph_msg_cache; +static struct kmem_cache *ceph_msg_data_cache; + /* static tag bytes (protocol control messages) */ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; @@ -223,6 +231,41 @@ static void encode_my_addr(struct ceph_messenger *msgr) */ static struct workqueue_struct *ceph_msgr_wq; +static int ceph_msgr_slab_init(void) +{ + BUG_ON(ceph_msg_cache); + ceph_msg_cache = kmem_cache_create("ceph_msg", + sizeof (struct ceph_msg), + __alignof__(struct ceph_msg), 0, NULL); + + if (!ceph_msg_cache) + return -ENOMEM; + + BUG_ON(ceph_msg_data_cache); + ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", + sizeof (struct ceph_msg_data), + __alignof__(struct ceph_msg_data), + 0, NULL); + if (ceph_msg_data_cache) + return 0; + + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; + + return -ENOMEM; +} + +static void ceph_msgr_slab_exit(void) +{ + BUG_ON(!ceph_msg_data_cache); + kmem_cache_destroy(ceph_msg_data_cache); + ceph_msg_data_cache = NULL; + + BUG_ON(!ceph_msg_cache); + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; +} + static void _ceph_msgr_exit(void) { if (ceph_msgr_wq) { @@ -230,6 +273,8 @@ static void _ceph_msgr_exit(void) ceph_msgr_wq = NULL; } + ceph_msgr_slab_exit(); + BUG_ON(zero_page == NULL); kunmap(zero_page); page_cache_release(zero_page); @@ -242,6 +287,9 @@ int ceph_msgr_init(void) zero_page = ZERO_PAGE(0); page_cache_get(zero_page); + if (ceph_msgr_slab_init()) + return -ENOMEM; + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (ceph_msgr_wq) return 0; @@ -471,6 +519,22 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) return r; } +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + void *kaddr; + int ret; + + BUG_ON(page_offset + length > PAGE_SIZE); + + kaddr = kmap(page); + BUG_ON(!kaddr); + ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length); + kunmap(page); + + return ret; +} + /* * write something. @more is true if caller will be sending more data * shortly. @@ -493,7 +557,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, } static int ceph_tcp_sendpage(struct socket *sock, struct page *page, - int offset, size_t size, int more) + int offset, size_t size, bool more) { int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); int ret; @@ -697,50 +761,397 @@ static void con_out_kvec_add(struct ceph_connection *con, } #ifdef CONFIG_BLOCK -static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) + +/* + * For a bio data item, a piece is whatever remains of the next + * entry in the current bio iovec, or the first entry in the next + * bio in the list. + */ +static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) { - if (!bio) { - *iter = NULL; - *seg = 0; - return; + struct ceph_msg_data *data = cursor->data; + struct bio *bio; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = data->bio; + BUG_ON(!bio); + BUG_ON(!bio->bi_vcnt); + + cursor->resid = min(length, data->bio_length); + cursor->bio = bio; + cursor->vector_index = 0; + cursor->vector_offset = 0; + cursor->last_piece = length <= bio->bi_io_vec[0].bv_len; +} + +static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, + size_t *length) +{ + struct ceph_msg_data *data = cursor->data; + struct bio *bio; + struct bio_vec *bio_vec; + unsigned int index; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + index = cursor->vector_index; + BUG_ON(index >= (unsigned int) bio->bi_vcnt); + + bio_vec = &bio->bi_io_vec[index]; + BUG_ON(cursor->vector_offset >= bio_vec->bv_len); + *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset); + BUG_ON(*page_offset >= PAGE_SIZE); + if (cursor->last_piece) /* pagelist offset is always 0 */ + *length = cursor->resid; + else + *length = (size_t) (bio_vec->bv_len - cursor->vector_offset); + BUG_ON(*length > cursor->resid); + BUG_ON(*page_offset + *length > PAGE_SIZE); + + return bio_vec->bv_page; +} + +static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct bio *bio; + struct bio_vec *bio_vec; + unsigned int index; + + BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + index = cursor->vector_index; + BUG_ON(index >= (unsigned int) bio->bi_vcnt); + bio_vec = &bio->bi_io_vec[index]; + + /* Advance the cursor offset */ + + BUG_ON(cursor->resid < bytes); + cursor->resid -= bytes; + cursor->vector_offset += bytes; + if (cursor->vector_offset < bio_vec->bv_len) + return false; /* more bytes to process in this segment */ + BUG_ON(cursor->vector_offset != bio_vec->bv_len); + + /* Move on to the next segment, and possibly the next bio */ + + if (++index == (unsigned int) bio->bi_vcnt) { + bio = bio->bi_next; + index = 0; } - *iter = bio; - *seg = bio->bi_idx; + cursor->bio = bio; + cursor->vector_index = index; + cursor->vector_offset = 0; + + if (!cursor->last_piece) { + BUG_ON(!cursor->resid); + BUG_ON(!bio); + /* A short read is OK, so use <= rather than == */ + if (cursor->resid <= bio->bi_io_vec[index].bv_len) + cursor->last_piece = true; + } + + return true; } +#endif /* CONFIG_BLOCK */ -static void iter_bio_next(struct bio **bio_iter, int *seg) +/* + * For a page array, a piece comes from the first page in the array + * that has not already been fully consumed. + */ +static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) { - if (*bio_iter == NULL) - return; + struct ceph_msg_data *data = cursor->data; + int page_count; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); - BUG_ON(*seg >= (*bio_iter)->bi_vcnt); + BUG_ON(!data->pages); + BUG_ON(!data->length); - (*seg)++; - if (*seg == (*bio_iter)->bi_vcnt) - init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); + cursor->resid = min(length, data->length); + page_count = calc_pages_for(data->alignment, (u64)data->length); + cursor->page_offset = data->alignment & ~PAGE_MASK; + cursor->page_index = 0; + BUG_ON(page_count > (int)USHRT_MAX); + cursor->page_count = (unsigned short)page_count; + BUG_ON(length > SIZE_MAX - cursor->page_offset); + cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE; } -#endif -static void prepare_write_message_data(struct ceph_connection *con) +static struct page * +ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) { - struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data *data = cursor->data; - BUG_ON(!msg); - BUG_ON(!msg->hdr.data_len); + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_index >= cursor->page_count); + BUG_ON(cursor->page_offset >= PAGE_SIZE); + + *page_offset = cursor->page_offset; + if (cursor->last_piece) + *length = cursor->resid; + else + *length = PAGE_SIZE - *page_offset; + + return data->pages[cursor->page_index]; +} + +static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); + + /* Advance the cursor page offset */ + + cursor->resid -= bytes; + cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; + if (!bytes || cursor->page_offset) + return false; /* more bytes to process in the current page */ + + /* Move on to the next page; offset is already at 0 */ + + BUG_ON(cursor->page_index >= cursor->page_count); + cursor->page_index++; + cursor->last_piece = cursor->resid <= PAGE_SIZE; + + return true; +} + +/* + * For a pagelist, a piece is whatever remains to be consumed in the + * first page in the list, or the front of the next page. + */ +static void +ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + struct page *page; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + if (!length) + return; /* pagelist can be assigned but empty */ + + BUG_ON(list_empty(&pagelist->head)); + page = list_first_entry(&pagelist->head, struct page, lru); + + cursor->resid = min(length, pagelist->length); + cursor->page = page; + cursor->offset = 0; + cursor->last_piece = cursor->resid <= PAGE_SIZE; +} + +static struct page * +ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); - /* initialize page iterator */ - con->out_msg_pos.page = 0; - if (msg->pages) - con->out_msg_pos.page_pos = msg->page_alignment; + pagelist = data->pagelist; + BUG_ON(!pagelist); + + BUG_ON(!cursor->page); + BUG_ON(cursor->offset + cursor->resid != pagelist->length); + + /* offset of first page in pagelist is always 0 */ + *page_offset = cursor->offset & ~PAGE_MASK; + if (cursor->last_piece) + *length = cursor->resid; else - con->out_msg_pos.page_pos = 0; + *length = PAGE_SIZE - *page_offset; + + return cursor->page; +} + +static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + BUG_ON(cursor->offset + cursor->resid != pagelist->length); + BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); + + /* Advance the cursor offset */ + + cursor->resid -= bytes; + cursor->offset += bytes; + /* offset of first page in pagelist is always 0 */ + if (!bytes || cursor->offset & ~PAGE_MASK) + return false; /* more bytes to process in the current page */ + + /* Move on to the next page */ + + BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); + cursor->page = list_entry_next(cursor->page, lru); + cursor->last_piece = cursor->resid <= PAGE_SIZE; + + return true; +} + +/* + * Message data is handled (sent or received) in pieces, where each + * piece resides on a single page. The network layer might not + * consume an entire piece at once. A data item's cursor keeps + * track of which piece is next to process and how much remains to + * be processed in that piece. It also tracks whether the current + * piece is the last one in the data item. + */ +static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) +{ + size_t length = cursor->total_resid; + + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + ceph_msg_data_pagelist_cursor_init(cursor, length); + break; + case CEPH_MSG_DATA_PAGES: + ceph_msg_data_pages_cursor_init(cursor, length); + break; #ifdef CONFIG_BLOCK - if (msg->bio) - init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); -#endif - con->out_msg_pos.data_pos = 0; - con->out_msg_pos.did_page_crc = false; - con->out_more = 1; /* data + footer will follow */ + case CEPH_MSG_DATA_BIO: + ceph_msg_data_bio_cursor_init(cursor, length); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + /* BUG(); */ + break; + } + cursor->need_crc = true; +} + +static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +{ + struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data *data; + + BUG_ON(!length); + BUG_ON(length > msg->data_length); + BUG_ON(list_empty(&msg->data)); + + cursor->data_head = &msg->data; + cursor->total_resid = length; + data = list_first_entry(&msg->data, struct ceph_msg_data, links); + cursor->data = data; + + __ceph_msg_data_cursor_init(cursor); +} + +/* + * Return the page containing the next piece to process for a given + * data item, and supply the page offset and length of that piece. + * Indicate whether this is the last piece in this data item. + */ +static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) +{ + struct page *page; + + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + page = ceph_msg_data_pagelist_next(cursor, page_offset, length); + break; + case CEPH_MSG_DATA_PAGES: + page = ceph_msg_data_pages_next(cursor, page_offset, length); + break; +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: + page = ceph_msg_data_bio_next(cursor, page_offset, length); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + page = NULL; + break; + } + BUG_ON(!page); + BUG_ON(*page_offset + *length > PAGE_SIZE); + BUG_ON(!*length); + if (last_piece) + *last_piece = cursor->last_piece; + + return page; +} + +/* + * Returns true if the result moves the cursor on to the next piece + * of the data item. + */ +static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + bool new_piece; + + BUG_ON(bytes > cursor->resid); + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); + break; + case CEPH_MSG_DATA_PAGES: + new_piece = ceph_msg_data_pages_advance(cursor, bytes); + break; +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: + new_piece = ceph_msg_data_bio_advance(cursor, bytes); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + BUG(); + break; + } + cursor->total_resid -= bytes; + + if (!cursor->resid && cursor->total_resid) { + WARN_ON(!cursor->last_piece); + BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); + cursor->data = list_entry_next(cursor->data, links); + __ceph_msg_data_cursor_init(cursor); + new_piece = true; + } + cursor->need_crc = new_piece; + + return new_piece; +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + BUG_ON(!msg); + BUG_ON(!data_len); + + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(msg, (size_t)data_len); } /* @@ -803,16 +1214,12 @@ static void prepare_write_message(struct ceph_connection *con) m->hdr.seq = cpu_to_le64(++con->out_seq); m->needs_out_seq = false; } -#ifdef CONFIG_BLOCK - else - m->bio_iter = NULL; -#endif + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); - dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), - le32_to_cpu(m->hdr.data_len), - m->nr_pages); + m->data_length); BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); /* tag + hdr + front + middle */ @@ -843,11 +1250,13 @@ static void prepare_write_message(struct ceph_connection *con) /* is there a data payload? */ con->out_msg->footer.data_crc = 0; - if (m->hdr.data_len) - prepare_write_message_data(con); - else + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->out_more = 1; /* data + footer will follow */ + } else { /* no, queue up footer too and be done */ prepare_write_message_footer(con); + } con_flag_set(con, CON_FLAG_WRITE_PENDING); } @@ -874,6 +1283,24 @@ static void prepare_write_ack(struct ceph_connection *con) } /* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + +/* * Prepare to write keepalive byte. */ static void prepare_write_keepalive(struct ceph_connection *con) @@ -1022,35 +1449,19 @@ out: return ret; /* done! */ } -static void out_msg_pos_next(struct ceph_connection *con, struct page *page, - size_t len, size_t sent, bool in_trail) +static u32 ceph_crc32c_page(u32 crc, struct page *page, + unsigned int page_offset, + unsigned int length) { - struct ceph_msg *msg = con->out_msg; + char *kaddr; - BUG_ON(!msg); - BUG_ON(!sent); - - con->out_msg_pos.data_pos += sent; - con->out_msg_pos.page_pos += sent; - if (sent < len) - return; + kaddr = kmap(page); + BUG_ON(kaddr == NULL); + crc = crc32c(crc, kaddr + page_offset, length); + kunmap(page); - BUG_ON(sent != len); - con->out_msg_pos.page_pos = 0; - con->out_msg_pos.page++; - con->out_msg_pos.did_page_crc = false; - if (in_trail) - list_move_tail(&page->lru, - &msg->trail->head); - else if (msg->pagelist) - list_move_tail(&page->lru, - &msg->pagelist->head); -#ifdef CONFIG_BLOCK - else if (msg->bio) - iter_bio_next(&msg->bio_iter, &msg->bio_seg); -#endif + return crc; } - /* * Write as much message data payload as we can. If we finish, queue * up the footer. @@ -1058,21 +1469,17 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page, * 0 -> socket full, but more to do * <0 -> error */ -static int write_partial_msg_pages(struct ceph_connection *con) +static int write_partial_message_data(struct ceph_connection *con) { struct ceph_msg *msg = con->out_msg; - unsigned int data_len = le32_to_cpu(msg->hdr.data_len); - size_t len; + struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !con->msgr->nocrc; - int ret; - int total_max_write; - bool in_trail = false; - const size_t trail_len = (msg->trail ? msg->trail->length : 0); - const size_t trail_off = data_len - trail_len; + u32 crc; - dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", - con, msg, con->out_msg_pos.page, msg->nr_pages, - con->out_msg_pos.page_pos); + dout("%s %p msg %p\n", __func__, con, msg); + + if (list_empty(&msg->data)) + return -EINVAL; /* * Iterate through each page that contains data to be @@ -1082,72 +1489,41 @@ static int write_partial_msg_pages(struct ceph_connection *con) * need to map the page. If we have no pages, they have * been revoked, so use the zero page. */ - while (data_len > con->out_msg_pos.data_pos) { - struct page *page = NULL; - int max_write = PAGE_SIZE; - int bio_offset = 0; - - in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; - if (!in_trail) - total_max_write = trail_off - con->out_msg_pos.data_pos; - - if (in_trail) { - total_max_write = data_len - con->out_msg_pos.data_pos; - - page = list_first_entry(&msg->trail->head, - struct page, lru); - } else if (msg->pages) { - page = msg->pages[con->out_msg_pos.page]; - } else if (msg->pagelist) { - page = list_first_entry(&msg->pagelist->head, - struct page, lru); -#ifdef CONFIG_BLOCK - } else if (msg->bio) { - struct bio_vec *bv; + crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->resid) { + struct page *page; + size_t page_offset; + size_t length; + bool last_piece; + bool need_crc; + int ret; - bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); - page = bv->bv_page; - bio_offset = bv->bv_offset; - max_write = bv->bv_len; -#endif - } else { - page = zero_page; - } - len = min_t(int, max_write - con->out_msg_pos.page_pos, - total_max_write); - - if (do_datacrc && !con->out_msg_pos.did_page_crc) { - void *base; - u32 crc = le32_to_cpu(msg->footer.data_crc); - char *kaddr; - - kaddr = kmap(page); - BUG_ON(kaddr == NULL); - base = kaddr + con->out_msg_pos.page_pos + bio_offset; - crc = crc32c(crc, base, len); - kunmap(page); - msg->footer.data_crc = cpu_to_le32(crc); - con->out_msg_pos.did_page_crc = true; - } - ret = ceph_tcp_sendpage(con->sock, page, - con->out_msg_pos.page_pos + bio_offset, - len, 1); - if (ret <= 0) - goto out; + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, + &last_piece); + ret = ceph_tcp_sendpage(con->sock, page, page_offset, + length, last_piece); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); - out_msg_pos_next(con, page, len, (size_t) ret, in_trail); + return ret; + } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret); } - dout("write_partial_msg_pages %p msg %p done\n", con, msg); + dout("%s %p msg %p done\n", __func__, con, msg); /* prepare and queue up footer, too */ - if (!do_datacrc) + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); prepare_write_message_footer(con); - ret = 1; -out: - return ret; + + return 1; /* must return > 0 to indicate success */ } /* @@ -1160,7 +1536,7 @@ static int write_partial_skip(struct ceph_connection *con) while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); - ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); + ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); if (ret <= 0) goto out; con->out_skip -= ret; @@ -1191,6 +1567,13 @@ static void prepare_read_ack(struct ceph_connection *con) con->in_base_pos = 0; } +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_SEQ; +} + static void prepare_read_tag(struct ceph_connection *con) { dout("prepare_read_tag %p\n", con); @@ -1597,7 +1980,6 @@ static int process_connect(struct ceph_connection *con) con->error_msg = "connect authorization failure"; return -1; } - con->auth_retry = 1; con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) @@ -1668,6 +2050,7 @@ static int process_connect(struct ceph_connection *con) prepare_read_connect(con); break; + case CEPH_MSGR_TAG_SEQ: case CEPH_MSGR_TAG_READY: if (req_feat & ~server_feat) { pr_err("%s%lld %s protocol feature mismatch," @@ -1682,7 +2065,7 @@ static int process_connect(struct ceph_connection *con) WARN_ON(con->state != CON_STATE_NEGOTIATING); con->state = CON_STATE_OPEN; - + con->auth_retry = 0; /* we authenticated; clear flag */ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); con->connect_seq++; con->peer_features = server_feat; @@ -1698,7 +2081,12 @@ static int process_connect(struct ceph_connection *con) con->delay = 0; /* reset backoff memory */ - prepare_read_tag(con); + if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } break; case CEPH_MSGR_TAG_WAIT: @@ -1732,7 +2120,6 @@ static int read_partial_ack(struct ceph_connection *con) return read_partial(con, end, size, &con->in_temp_ack); } - /* * We can finally discard anything that's been acked. */ @@ -1757,8 +2144,6 @@ static void process_ack(struct ceph_connection *con) } - - static int read_partial_message_section(struct ceph_connection *con, struct kvec *section, unsigned int sec_len, u32 *crc) @@ -1782,77 +2167,49 @@ static int read_partial_message_section(struct ceph_connection *con, return 1; } -static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); - -static int read_partial_message_pages(struct ceph_connection *con, - struct page **pages, - unsigned int data_len, bool do_datacrc) +static int read_partial_msg_data(struct ceph_connection *con) { - void *p; + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + const bool do_datacrc = !con->msgr->nocrc; + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; int ret; - int left; - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); - /* (page) data */ - BUG_ON(pages == NULL); - p = kmap(pages[con->in_msg_pos.page]); - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && do_datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(pages[con->in_msg_pos.page]); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == PAGE_SIZE) { - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.page++; - } - - return ret; -} - -#ifdef CONFIG_BLOCK -static int read_partial_message_bio(struct ceph_connection *con, - struct bio **bio_iter, int *bio_seg, - unsigned int data_len, bool do_datacrc) -{ - struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg); - void *p; - int ret, left; + BUG_ON(!msg); + if (list_empty(&msg->data)) + return -EIO; - left = min((int)(data_len - con->in_msg_pos.data_pos), - (int)(bv->bv_len - con->in_msg_pos.page_pos)); + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->resid) { + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, + NULL); + ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; - p = kmap(bv->bv_page) + bv->bv_offset; + return ret; + } - ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, - left); - if (ret > 0 && do_datacrc) - con->in_data_crc = - crc32c(con->in_data_crc, - p + con->in_msg_pos.page_pos, ret); - kunmap(bv->bv_page); - if (ret <= 0) - return ret; - con->in_msg_pos.data_pos += ret; - con->in_msg_pos.page_pos += ret; - if (con->in_msg_pos.page_pos == bv->bv_len) { - con->in_msg_pos.page_pos = 0; - iter_bio_next(bio_iter, bio_seg); + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret); } + if (do_datacrc) + con->in_data_crc = crc; - return ret; + return 1; /* must return > 0 to indicate success */ } -#endif /* * read (part of) a message. */ +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); + static int read_partial_message(struct ceph_connection *con) { struct ceph_msg *m = con->in_msg; @@ -1885,7 +2242,7 @@ static int read_partial_message(struct ceph_connection *con) if (front_len > CEPH_MSG_MAX_FRONT_LEN) return -EIO; middle_len = le32_to_cpu(con->in_hdr.middle_len); - if (middle_len > CEPH_MSG_MAX_DATA_LEN) + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) return -EIO; data_len = le32_to_cpu(con->in_hdr.data_len); if (data_len > CEPH_MSG_MAX_DATA_LEN) @@ -1914,14 +2271,22 @@ static int read_partial_message(struct ceph_connection *con) int skip = 0; dout("got hdr type %d front %d data %d\n", con->in_hdr.type, - con->in_hdr.front_len, con->in_hdr.data_len); + front_len, data_len); ret = ceph_con_in_msg_alloc(con, &skip); if (ret < 0) return ret; + + BUG_ON(!con->in_msg ^ skip); + if (con->in_msg && data_len > con->in_msg->data_length) { + pr_warning("%s skipping long message (%u > %zd)\n", + __func__, data_len, con->in_msg->data_length); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + skip = 1; + } if (skip) { /* skip this message */ dout("alloc_msg said skip message\n"); - BUG_ON(con->in_msg); con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; @@ -1936,17 +2301,10 @@ static int read_partial_message(struct ceph_connection *con) if (m->middle) m->middle->vec.iov_len = 0; - con->in_msg_pos.page = 0; - if (m->pages) - con->in_msg_pos.page_pos = m->page_alignment; - else - con->in_msg_pos.page_pos = 0; - con->in_msg_pos.data_pos = 0; + /* prepare for data payload, if any */ -#ifdef CONFIG_BLOCK - if (m->bio) - init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg); -#endif + if (data_len) + prepare_message_data(con->in_msg, data_len); } /* front */ @@ -1965,24 +2323,10 @@ static int read_partial_message(struct ceph_connection *con) } /* (page) data */ - while (con->in_msg_pos.data_pos < data_len) { - if (m->pages) { - ret = read_partial_message_pages(con, m->pages, - data_len, do_datacrc); - if (ret <= 0) - return ret; -#ifdef CONFIG_BLOCK - } else if (m->bio) { - BUG_ON(!m->bio_iter); - ret = read_partial_message_bio(con, - &m->bio_iter, &m->bio_seg, - data_len, do_datacrc); - if (ret <= 0) - return ret; -#endif - } else { - BUG_ON(1); - } + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; } /* footer */ @@ -2108,13 +2452,13 @@ more_kvec: goto do_next; } - ret = write_partial_msg_pages(con); + ret = write_partial_message_data(con); if (ret == 1) goto more_kvec; /* we need to send the footer, too! */ if (ret == 0) goto out; if (ret < 0) { - dout("try_write write_partial_msg_pages err %d\n", + dout("try_write write_partial_message_data err %d\n", ret); goto out; } @@ -2266,7 +2610,12 @@ more: prepare_read_tag(con); goto more; } - if (con->in_tag == CEPH_MSGR_TAG_ACK) { + if (con->in_tag == CEPH_MSGR_TAG_ACK || + con->in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ ret = read_partial_ack(con); if (ret <= 0) goto out; @@ -2672,6 +3021,88 @@ void ceph_con_keepalive(struct ceph_connection *con) } EXPORT_SYMBOL(ceph_con_keepalive); +static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) +{ + struct ceph_msg_data *data; + + if (WARN_ON(!ceph_msg_data_type_valid(type))) + return NULL; + + data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); + if (data) + data->type = type; + INIT_LIST_HEAD(&data->links); + + return data; +} + +static void ceph_msg_data_destroy(struct ceph_msg_data *data) +{ + if (!data) + return; + + WARN_ON(!list_empty(&data->links)); + if (data->type == CEPH_MSG_DATA_PAGELIST) { + ceph_pagelist_release(data->pagelist); + kfree(data->pagelist); + } + kmem_cache_free(ceph_msg_data_cache, data); +} + +void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, + size_t length, size_t alignment) +{ + struct ceph_msg_data *data; + + BUG_ON(!pages); + BUG_ON(!length); + + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); + BUG_ON(!data); + data->pages = pages; + data->length = length; + data->alignment = alignment & ~PAGE_MASK; + + list_add_tail(&data->links, &msg->data); + msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pages); + +void ceph_msg_data_add_pagelist(struct ceph_msg *msg, + struct ceph_pagelist *pagelist) +{ + struct ceph_msg_data *data; + + BUG_ON(!pagelist); + BUG_ON(!pagelist->length); + + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); + BUG_ON(!data); + data->pagelist = pagelist; + + list_add_tail(&data->links, &msg->data); + msg->data_length += pagelist->length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pagelist); + +#ifdef CONFIG_BLOCK +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, + size_t length) +{ + struct ceph_msg_data *data; + + BUG_ON(!bio); + + data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); + BUG_ON(!data); + data->bio = bio; + data->bio_length = length; + + list_add_tail(&data->links, &msg->data); + msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_bio); +#endif /* CONFIG_BLOCK */ /* * construct a new message with given type, size @@ -2682,49 +3113,20 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, { struct ceph_msg *m; - m = kmalloc(sizeof(*m), flags); + m = kmem_cache_zalloc(ceph_msg_cache, flags); if (m == NULL) goto out; - kref_init(&m->kref); - m->con = NULL; - INIT_LIST_HEAD(&m->list_head); - - m->hdr.tid = 0; m->hdr.type = cpu_to_le16(type); m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); - m->hdr.version = 0; m->hdr.front_len = cpu_to_le32(front_len); - m->hdr.middle_len = 0; - m->hdr.data_len = 0; - m->hdr.data_off = 0; - m->hdr.reserved = 0; - m->footer.front_crc = 0; - m->footer.middle_crc = 0; - m->footer.data_crc = 0; - m->footer.flags = 0; - m->front_max = front_len; - m->front_is_vmalloc = false; - m->more_to_follow = false; - m->ack_stamp = 0; - m->pool = NULL; - - /* middle */ - m->middle = NULL; - /* data */ - m->nr_pages = 0; - m->page_alignment = 0; - m->pages = NULL; - m->pagelist = NULL; -#ifdef CONFIG_BLOCK - m->bio = NULL; - m->bio_iter = NULL; - m->bio_seg = 0; -#endif /* CONFIG_BLOCK */ - m->trail = NULL; + INIT_LIST_HEAD(&m->list_head); + kref_init(&m->kref); + INIT_LIST_HEAD(&m->data); /* front */ + m->front_max = front_len; if (front_len) { if (front_len > PAGE_CACHE_SIZE) { m->front.iov_base = __vmalloc(front_len, flags, @@ -2802,49 +3204,37 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg) static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) { struct ceph_msg_header *hdr = &con->in_hdr; - int type = le16_to_cpu(hdr->type); - int front_len = le32_to_cpu(hdr->front_len); int middle_len = le32_to_cpu(hdr->middle_len); + struct ceph_msg *msg; int ret = 0; BUG_ON(con->in_msg != NULL); + BUG_ON(!con->ops->alloc_msg); - if (con->ops->alloc_msg) { - struct ceph_msg *msg; - - mutex_unlock(&con->mutex); - msg = con->ops->alloc_msg(con, hdr, skip); - mutex_lock(&con->mutex); - if (con->state != CON_STATE_OPEN) { - if (msg) - ceph_msg_put(msg); - return -EAGAIN; - } - con->in_msg = msg; - if (con->in_msg) { - con->in_msg->con = con->ops->get(con); - BUG_ON(con->in_msg->con == NULL); - } - if (*skip) { - con->in_msg = NULL; - return 0; - } - if (!con->in_msg) { - con->error_msg = - "error allocating memory for incoming message"; - return -ENOMEM; - } + mutex_unlock(&con->mutex); + msg = con->ops->alloc_msg(con, hdr, skip); + mutex_lock(&con->mutex); + if (con->state != CON_STATE_OPEN) { + if (msg) + ceph_msg_put(msg); + return -EAGAIN; } - if (!con->in_msg) { - con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); - if (!con->in_msg) { - pr_err("unable to allocate msg type %d len %d\n", - type, front_len); - return -ENOMEM; - } + if (msg) { + BUG_ON(*skip); + con->in_msg = msg; con->in_msg->con = con->ops->get(con); BUG_ON(con->in_msg->con == NULL); - con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); + } else { + /* + * Null message pointer means either we should skip + * this message or we couldn't allocate memory. The + * former is not an error. + */ + if (*skip) + return 0; + con->error_msg = "error allocating memory for incoming message"; + + return -ENOMEM; } memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); @@ -2870,7 +3260,7 @@ void ceph_msg_kfree(struct ceph_msg *m) vfree(m->front.iov_base); else kfree(m->front.iov_base); - kfree(m); + kmem_cache_free(ceph_msg_cache, m); } /* @@ -2879,6 +3269,9 @@ void ceph_msg_kfree(struct ceph_msg *m) void ceph_msg_last_put(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); + LIST_HEAD(data); + struct list_head *links; + struct list_head *next; dout("ceph_msg_put last one on %p\n", m); WARN_ON(!list_empty(&m->list_head)); @@ -2888,16 +3281,16 @@ void ceph_msg_last_put(struct kref *kref) ceph_buffer_put(m->middle); m->middle = NULL; } - m->nr_pages = 0; - m->pages = NULL; - if (m->pagelist) { - ceph_pagelist_release(m->pagelist); - kfree(m->pagelist); - m->pagelist = NULL; - } + list_splice_init(&m->data, &data); + list_for_each_safe(links, next, &data) { + struct ceph_msg_data *data; - m->trail = NULL; + data = list_entry(links, struct ceph_msg_data, links); + list_del_init(links); + ceph_msg_data_destroy(data); + } + m->data_length = 0; if (m->pool) ceph_msgpool_put(m->pool, m); @@ -2908,8 +3301,8 @@ EXPORT_SYMBOL(ceph_msg_last_put); void ceph_msg_dump(struct ceph_msg *msg) { - pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, - msg->front_max, msg->nr_pages); + pr_debug("msg_dump %p (front_max %d length %zd)\n", msg, + msg->front_max, msg->data_length); print_hex_dump(KERN_DEBUG, "header: ", DUMP_PREFIX_OFFSET, 16, 1, &msg->hdr, sizeof(msg->hdr), true); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index aef5b1062bee..1fe25cd29d0e 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -737,7 +737,7 @@ static void delayed_work(struct work_struct *work) __validate_auth(monc); - if (monc->auth->ops->is_authenticated(monc->auth)) + if (ceph_auth_is_authenticated(monc->auth)) __send_subscribe(monc); } __schedule_delayed(monc); @@ -892,8 +892,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, mutex_lock(&monc->mutex); had_debugfs_info = have_debugfs_info(monc); - if (monc->auth->ops) - was_auth = monc->auth->ops->is_authenticated(monc->auth); + was_auth = ceph_auth_is_authenticated(monc->auth); monc->pending_auth = 0; ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, msg->front.iov_len, @@ -904,7 +903,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc, wake_up_all(&monc->client->auth_wq); } else if (ret > 0) { __send_prepared_auth_request(monc, ret); - } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { + } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { dout("authenticated, starting session\n"); monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d730dd4d8eb2..d5953b87918c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1,3 +1,4 @@ + #include <linux/ceph/ceph_debug.h> #include <linux/module.h> @@ -21,6 +22,8 @@ #define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 +static struct kmem_cache *ceph_osd_request_cache; + static const struct ceph_connection_operations osd_con_ops; static void __send_queued(struct ceph_osd_client *osdc); @@ -32,12 +35,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc, static void __send_request(struct ceph_osd_client *osdc, struct ceph_osd_request *req); -static int op_has_extent(int op) -{ - return (op == CEPH_OSD_OP_READ || - op == CEPH_OSD_OP_WRITE); -} - /* * Implement client access to distributed object storage cluster. * @@ -63,53 +60,238 @@ static int op_has_extent(int op) * * fill osd op in request message. */ -static int calc_layout(struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) +static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, + u64 *objnum, u64 *objoff, u64 *objlen) { u64 orig_len = *plen; - u64 bno = 0; - u64 objoff = 0; - u64 objlen = 0; int r; /* object extent? */ - r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno, - &objoff, &objlen); + r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum, + objoff, objlen); if (r < 0) return r; - if (objlen < orig_len) { - *plen = objlen; + if (*objlen < orig_len) { + *plen = *objlen; dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); } - if (op_has_extent(op->op)) { - u32 osize = le32_to_cpu(layout->fl_object_size); - op->extent.offset = objoff; - op->extent.length = objlen; - if (op->extent.truncate_size <= off - objoff) { - op->extent.truncate_size = 0; - } else { - op->extent.truncate_size -= off - objoff; - if (op->extent.truncate_size > osize) - op->extent.truncate_size = osize; - } + dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); + + return 0; +} + +static void ceph_osd_data_init(struct ceph_osd_data *osd_data) +{ + memset(osd_data, 0, sizeof (*osd_data)); + osd_data->type = CEPH_OSD_DATA_TYPE_NONE; +} + +static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, + struct page **pages, u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->length = length; + osd_data->alignment = alignment; + osd_data->pages_from_pool = pages_from_pool; + osd_data->own_pages = own_pages; +} + +static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, + struct ceph_pagelist *pagelist) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; + osd_data->pagelist = pagelist; +} + +#ifdef CONFIG_BLOCK +static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, + struct bio *bio, size_t bio_length) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_BIO; + osd_data->bio = bio; + osd_data->bio_length = bio_length; +} +#endif /* CONFIG_BLOCK */ + +#define osd_req_op_data(oreq, whch, typ, fld) \ + ({ \ + BUG_ON(whch >= (oreq)->r_num_ops); \ + &(oreq)->r_ops[whch].typ.fld; \ + }) + +static struct ceph_osd_data * +osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + return &osd_req->r_ops[which].raw_data_in; +} + +struct ceph_osd_data * +osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + return osd_req_op_data(osd_req, which, extent, osd_data); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data); + +struct ceph_osd_data * +osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + return osd_req_op_data(osd_req, which, cls, response_data); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */ + +void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, + u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_raw_data_in(osd_req, which); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_raw_data_in_pages); + +void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, + u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); + +void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); + +#ifdef CONFIG_BLOCK +void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, + unsigned int which, struct bio *bio, size_t bio_length) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_bio_init(osd_data, bio, bio_length); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); +#endif /* CONFIG_BLOCK */ + +static void osd_req_op_cls_request_info_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_info); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} + +void osd_req_op_cls_request_data_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); + +void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); + +void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, response_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data_pages); + +static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) +{ + switch (osd_data->type) { + case CEPH_OSD_DATA_TYPE_NONE: + return 0; + case CEPH_OSD_DATA_TYPE_PAGES: + return osd_data->length; + case CEPH_OSD_DATA_TYPE_PAGELIST: + return (u64)osd_data->pagelist->length; +#ifdef CONFIG_BLOCK + case CEPH_OSD_DATA_TYPE_BIO: + return (u64)osd_data->bio_length; +#endif /* CONFIG_BLOCK */ + default: + WARN(true, "unrecognized data type %d\n", (int)osd_data->type); + return 0; } - req->r_num_pages = calc_pages_for(off, *plen); - req->r_page_alignment = off & ~PAGE_MASK; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = *plen; +} - dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", - bno, objoff, objlen, req->r_num_pages); +static void ceph_osd_data_release(struct ceph_osd_data *osd_data) +{ + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) { + int num_pages; - snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); - req->r_oid_len = strlen(req->r_oid); + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + ceph_release_page_vector(osd_data->pages, num_pages); + } + ceph_osd_data_init(osd_data); +} + +static void osd_req_op_data_release(struct ceph_osd_request *osd_req, + unsigned int which) +{ + struct ceph_osd_req_op *op; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; - return r; + switch (op->op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + ceph_osd_data_release(&op->extent.osd_data); + break; + case CEPH_OSD_OP_CALL: + ceph_osd_data_release(&op->cls.request_info); + ceph_osd_data_release(&op->cls.request_data); + ceph_osd_data_release(&op->cls.response_data); + break; + default: + break; + } } /* @@ -117,30 +299,26 @@ static int calc_layout(struct ceph_vino vino, */ void ceph_osdc_release_request(struct kref *kref) { - struct ceph_osd_request *req = container_of(kref, - struct ceph_osd_request, - r_kref); + struct ceph_osd_request *req; + unsigned int which; + req = container_of(kref, struct ceph_osd_request, r_kref); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_con_filling_msg) { - dout("%s revoking msg %p from con %p\n", __func__, - req->r_reply, req->r_con_filling_msg); + if (req->r_reply) { ceph_msg_revoke_incoming(req->r_reply); - req->r_con_filling_msg->ops->put(req->r_con_filling_msg); - req->r_con_filling_msg = NULL; - } - if (req->r_reply) ceph_msg_put(req->r_reply); - if (req->r_own_pages) - ceph_release_page_vector(req->r_pages, - req->r_num_pages); + } + + for (which = 0; which < req->r_num_ops; which++) + osd_req_op_data_release(req, which); + ceph_put_snap_context(req->r_snapc); - ceph_pagelist_release(&req->r_trail); if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else - kfree(req); + kmem_cache_free(ceph_osd_request_cache, req); + } EXPORT_SYMBOL(ceph_osdc_release_request); @@ -154,6 +332,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_msg *msg; size_t msg_size; + BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); + BUG_ON(num_ops > CEPH_OSD_MAX_OP); + msg_size = 4 + 4 + 8 + 8 + 4+8; msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pg_t */ @@ -168,13 +349,14 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, req = mempool_alloc(osdc->req_mempool, gfp_flags); memset(req, 0, sizeof(*req)); } else { - req = kzalloc(sizeof(*req), gfp_flags); + req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); } if (req == NULL) return NULL; req->r_osdc = osdc; req->r_mempool = use_mempool; + req->r_num_ops = num_ops; kref_init(&req->r_kref); init_completion(&req->r_completion); @@ -198,8 +380,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } req->r_reply = msg; - ceph_pagelist_init(&req->r_trail); - /* create request message; allow space for oid */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); @@ -218,60 +398,24 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_osdc_alloc_request); -static void osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, - struct ceph_osd_req_op *src) +static bool osd_req_opcode_valid(u16 opcode) { - dst->op = cpu_to_le16(src->op); - - switch (src->op) { - case CEPH_OSD_OP_STAT: - break; + switch (opcode) { case CEPH_OSD_OP_READ: - case CEPH_OSD_OP_WRITE: - dst->extent.offset = - cpu_to_le64(src->extent.offset); - dst->extent.length = - cpu_to_le64(src->extent.length); - dst->extent.truncate_size = - cpu_to_le64(src->extent.truncate_size); - dst->extent.truncate_seq = - cpu_to_le32(src->extent.truncate_seq); - break; - case CEPH_OSD_OP_CALL: - dst->cls.class_len = src->cls.class_len; - dst->cls.method_len = src->cls.method_len; - dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); - - ceph_pagelist_append(&req->r_trail, src->cls.class_name, - src->cls.class_len); - ceph_pagelist_append(&req->r_trail, src->cls.method_name, - src->cls.method_len); - ceph_pagelist_append(&req->r_trail, src->cls.indata, - src->cls.indata_len); - break; - case CEPH_OSD_OP_STARTSYNC: - break; - case CEPH_OSD_OP_NOTIFY_ACK: - case CEPH_OSD_OP_WATCH: - dst->watch.cookie = cpu_to_le64(src->watch.cookie); - dst->watch.ver = cpu_to_le64(src->watch.ver); - dst->watch.flag = src->watch.flag; - break; - default: - pr_err("unrecognized osd opcode %d\n", dst->op); - WARN_ON(1); - break; + case CEPH_OSD_OP_STAT: case CEPH_OSD_OP_MAPEXT: case CEPH_OSD_OP_MASKTRUNC: case CEPH_OSD_OP_SPARSE_READ: case CEPH_OSD_OP_NOTIFY: + case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_ASSERT_VER: + case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_TRUNCATE: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_DELETE: case CEPH_OSD_OP_APPEND: + case CEPH_OSD_OP_STARTSYNC: case CEPH_OSD_OP_SETTRUNC: case CEPH_OSD_OP_TRIMTRUNC: case CEPH_OSD_OP_TMAPUP: @@ -279,11 +423,11 @@ static void osd_req_encode_op(struct ceph_osd_request *req, case CEPH_OSD_OP_TMAPGET: case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_ROLLBACK: + case CEPH_OSD_OP_WATCH: case CEPH_OSD_OP_OMAPGETKEYS: case CEPH_OSD_OP_OMAPGETVALS: case CEPH_OSD_OP_OMAPGETHEADER: case CEPH_OSD_OP_OMAPGETVALSBYKEYS: - case CEPH_OSD_OP_MODE_RD: case CEPH_OSD_OP_OMAPSETVALS: case CEPH_OSD_OP_OMAPSETHEADER: case CEPH_OSD_OP_OMAPCLEAR: @@ -314,113 +458,233 @@ static void osd_req_encode_op(struct ceph_osd_request *req, case CEPH_OSD_OP_RDUNLOCK: case CEPH_OSD_OP_UPLOCK: case CEPH_OSD_OP_DNLOCK: + case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_PGLS: case CEPH_OSD_OP_PGLS_FILTER: - pr_err("unsupported osd opcode %s\n", - ceph_osd_op_name(dst->op)); - WARN_ON(1); - break; + return true; + default: + return false; } - dst->payload_len = cpu_to_le32(src->payload_len); } /* - * build new request AND message - * + * This is an osd op init function for opcodes that have no data or + * other information associated with them. It also serves as a + * common init routine for all the other init functions, below. */ -void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 len, unsigned int num_ops, - struct ceph_osd_req_op *src_ops, - struct ceph_snap_context *snapc, u64 snap_id, - struct timespec *mtime) +static struct ceph_osd_req_op * +_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode) { - struct ceph_msg *msg = req->r_request; - struct ceph_osd_req_op *src_op; - void *p; - size_t msg_size; - int flags = req->r_flags; - u64 data_len; - int i; + struct ceph_osd_req_op *op; - req->r_num_ops = num_ops; - req->r_snapid = snap_id; - req->r_snapc = ceph_get_snap_context(snapc); + BUG_ON(which >= osd_req->r_num_ops); + BUG_ON(!osd_req_opcode_valid(opcode)); - /* encode request */ - msg->hdr.version = cpu_to_le16(4); + op = &osd_req->r_ops[which]; + memset(op, 0, sizeof (*op)); + op->op = opcode; - p = msg->front.iov_base; - ceph_encode_32(&p, 1); /* client_inc is always 1 */ - req->r_request_osdmap_epoch = p; - p += 4; - req->r_request_flags = p; - p += 4; - if (req->r_flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(p, mtime); - p += sizeof(struct ceph_timespec); - req->r_request_reassert_version = p; - p += sizeof(struct ceph_eversion); /* will get filled in */ + return op; +} - /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); - req->r_request_pool = p; - p += 8; - ceph_encode_32(&p, -1); /* preferred */ - ceph_encode_32(&p, 0); /* key len */ +void osd_req_op_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode) +{ + (void)_osd_req_op_init(osd_req, which, opcode); +} +EXPORT_SYMBOL(osd_req_op_init); - ceph_encode_8(&p, 1); - req->r_request_pgid = p; - p += 8 + 4; - ceph_encode_32(&p, -1); /* preferred */ +void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + size_t payload_len = 0; - /* oid */ - ceph_encode_32(&p, req->r_oid_len); - memcpy(p, req->r_oid, req->r_oid_len); - dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); - p += req->r_oid_len; + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); - /* ops */ - ceph_encode_16(&p, num_ops); - src_op = src_ops; - req->r_request_ops = p; - for (i = 0; i < num_ops; i++, src_op++) { - osd_req_encode_op(req, p, src_op); - p += sizeof(struct ceph_osd_op); - } + op->extent.offset = offset; + op->extent.length = length; + op->extent.truncate_size = truncate_size; + op->extent.truncate_seq = truncate_seq; + if (opcode == CEPH_OSD_OP_WRITE) + payload_len += length; - /* snaps */ - ceph_encode_64(&p, req->r_snapid); - ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); - ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); - if (req->r_snapc) { - for (i = 0; i < snapc->num_snaps; i++) { - ceph_encode_64(&p, req->r_snapc->snaps[i]); - } + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_extent_init); + +void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length) +{ + struct ceph_osd_req_op *op; + u64 previous; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; + previous = op->extent.length; + + if (length == previous) + return; /* Nothing to do */ + BUG_ON(length > previous); + + op->extent.length = length; + op->payload_len -= previous - length; +} +EXPORT_SYMBOL(osd_req_op_extent_update); + +void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode, const char *class, const char *method) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_pagelist *pagelist; + size_t payload_len = 0; + size_t size; + + BUG_ON(opcode != CEPH_OSD_OP_CALL); + + pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + BUG_ON(!pagelist); + ceph_pagelist_init(pagelist); + + op->cls.class_name = class; + size = strlen(class); + BUG_ON(size > (size_t) U8_MAX); + op->cls.class_len = size; + ceph_pagelist_append(pagelist, class, size); + payload_len += size; + + op->cls.method_name = method; + size = strlen(method); + BUG_ON(size > (size_t) U8_MAX); + op->cls.method_len = size; + ceph_pagelist_append(pagelist, method, size); + payload_len += size; + + osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); + + op->cls.argc = 0; /* currently unused */ + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_cls_init); + +void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 cookie, u64 version, int flag) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + + BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + + op->watch.cookie = cookie; + op->watch.ver = version; + if (opcode == CEPH_OSD_OP_WATCH && flag) + op->watch.flag = (u8)1; +} +EXPORT_SYMBOL(osd_req_op_watch_init); + +static void ceph_osdc_msg_data_add(struct ceph_msg *msg, + struct ceph_osd_data *osd_data) +{ + u64 length = ceph_osd_data_length(osd_data); + + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + BUG_ON(length > (u64) SIZE_MAX); + if (length) + ceph_msg_data_add_pages(msg, osd_data->pages, + length, osd_data->alignment); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { + BUG_ON(!length); + ceph_msg_data_add_pagelist(msg, osd_data->pagelist); +#ifdef CONFIG_BLOCK + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { + ceph_msg_data_add_bio(msg, osd_data->bio, length); +#endif + } else { + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); } +} - req->r_request_attempts = p; - p += 4; +static u64 osd_req_encode_op(struct ceph_osd_request *req, + struct ceph_osd_op *dst, unsigned int which) +{ + struct ceph_osd_req_op *src; + struct ceph_osd_data *osd_data; + u64 request_data_len = 0; + u64 data_length; - data_len = req->r_trail.length; - if (flags & CEPH_OSD_FLAG_WRITE) { - req->r_request->hdr.data_off = cpu_to_le16(off); - data_len += len; + BUG_ON(which >= req->r_num_ops); + src = &req->r_ops[which]; + if (WARN_ON(!osd_req_opcode_valid(src->op))) { + pr_err("unrecognized osd opcode %d\n", src->op); + + return 0; } - req->r_request->hdr.data_len = cpu_to_le32(data_len); - req->r_request->page_alignment = req->r_page_alignment; - BUG_ON(p > msg->front.iov_base + msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); + switch (src->op) { + case CEPH_OSD_OP_STAT: + osd_data = &src->raw_data_in; + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + if (src->op == CEPH_OSD_OP_WRITE) + request_data_len = src->extent.length; + dst->extent.offset = cpu_to_le64(src->extent.offset); + dst->extent.length = cpu_to_le64(src->extent.length); + dst->extent.truncate_size = + cpu_to_le64(src->extent.truncate_size); + dst->extent.truncate_seq = + cpu_to_le32(src->extent.truncate_seq); + osd_data = &src->extent.osd_data; + if (src->op == CEPH_OSD_OP_WRITE) + ceph_osdc_msg_data_add(req->r_request, osd_data); + else + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; + case CEPH_OSD_OP_CALL: + dst->cls.class_len = src->cls.class_len; + dst->cls.method_len = src->cls.method_len; + osd_data = &src->cls.request_info; + ceph_osdc_msg_data_add(req->r_request, osd_data); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); + request_data_len = osd_data->pagelist->length; + + osd_data = &src->cls.request_data; + data_length = ceph_osd_data_length(osd_data); + if (data_length) { + BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); + dst->cls.indata_len = cpu_to_le32(data_length); + ceph_osdc_msg_data_add(req->r_request, osd_data); + src->payload_len += data_length; + request_data_len += data_length; + } + osd_data = &src->cls.response_data; + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; + case CEPH_OSD_OP_STARTSYNC: + break; + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + dst->watch.cookie = cpu_to_le64(src->watch.cookie); + dst->watch.ver = cpu_to_le64(src->watch.ver); + dst->watch.flag = src->watch.flag; + break; + default: + pr_err("unsupported osd opcode %s\n", + ceph_osd_op_name(src->op)); + WARN_ON(1); - dout("build_request msg_size was %d num_ops %d\n", (int)msg_size, - num_ops); - return; + return 0; + } + dst->op = cpu_to_le16(src->op); + dst->payload_len = cpu_to_le32(src->payload_len); + + return request_data_len; } -EXPORT_SYMBOL(ceph_osdc_build_request); /* * build new request AND message, calculate layout, and adjust file @@ -436,51 +700,63 @@ EXPORT_SYMBOL(ceph_osdc_build_request); struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, struct ceph_vino vino, - u64 off, u64 *plen, + u64 off, u64 *plen, int num_ops, int opcode, int flags, struct ceph_snap_context *snapc, - int do_sync, u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, - bool use_mempool, - int page_align) + bool use_mempool) { - struct ceph_osd_req_op ops[2]; struct ceph_osd_request *req; - unsigned int num_op = 1; + u64 objnum = 0; + u64 objoff = 0; + u64 objlen = 0; + u32 object_size; + u64 object_base; int r; - memset(&ops, 0, sizeof ops); - - ops[0].op = opcode; - ops[0].extent.truncate_seq = truncate_seq; - ops[0].extent.truncate_size = truncate_size; - - if (do_sync) { - ops[1].op = CEPH_OSD_OP_STARTSYNC; - num_op++; - } + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE); - req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, + req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); + req->r_flags = flags; /* calculate max write size */ - r = calc_layout(vino, layout, off, plen, req, ops); - if (r < 0) + r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); + if (r < 0) { + ceph_osdc_put_request(req); return ERR_PTR(r); - req->r_file_layout = *layout; /* keep a copy */ + } - /* in case it differs from natural (file) alignment that - calc_layout filled in for us */ - req->r_num_pages = calc_pages_for(page_align, *plen); - req->r_page_alignment = page_align; + object_size = le32_to_cpu(layout->fl_object_size); + object_base = off - objoff; + if (truncate_size <= object_base) { + truncate_size = 0; + } else { + truncate_size -= object_base; + if (truncate_size > object_size) + truncate_size = object_size; + } + + osd_req_op_extent_init(req, 0, opcode, objoff, objlen, + truncate_size, truncate_seq); + + /* + * A second op in the ops array means the caller wants to + * also issue a include a 'startsync' command so that the + * osd will flush data quickly. + */ + if (num_ops > 1) + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + + req->r_file_layout = *layout; /* keep a copy */ - ceph_osdc_build_request(req, off, *plen, num_op, ops, - snapc, vino.snap, mtime); + snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", + vino.ino, objnum); + req->r_oid_len = strlen(req->r_oid); return req; } @@ -558,21 +834,46 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd) { struct ceph_osd_request *req, *nreq; + LIST_HEAD(resend); int err; dout("__kick_osd_requests osd%d\n", osd->o_osd); err = __reset_osd(osdc, osd); if (err) return; - + /* + * Build up a list of requests to resend by traversing the + * osd's list of requests. Requests for a given object are + * sent in tid order, and that is also the order they're + * kept on this list. Therefore all requests that are in + * flight will be found first, followed by all requests that + * have not yet been sent. And to resend requests while + * preserving this order we will want to put any sent + * requests back on the front of the osd client's unsent + * list. + * + * So we build a separate ordered list of already-sent + * requests for the affected osd and splice it onto the + * front of the osd client's unsent list. Once we've seen a + * request that has not yet been sent we're done. Those + * requests are already sitting right where they belong. + */ list_for_each_entry(req, &osd->o_requests, r_osd_item) { - list_move(&req->r_req_lru_item, &osdc->req_unsent); - dout("requeued %p tid %llu osd%d\n", req, req->r_tid, + if (!req->r_sent) + break; + list_move_tail(&req->r_req_lru_item, &resend); + dout("requeueing %p tid %llu osd%d\n", req, req->r_tid, osd->o_osd); if (!req->r_linger) req->r_flags |= CEPH_OSD_FLAG_RETRY; } + list_splice(&resend, &osdc->req_unsent); + /* + * Linger requests are re-registered before sending, which + * sets up a new tid for each. We add them to the unsent + * list at the end to keep things in tid order. + */ list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, r_linger_osd) { /* @@ -581,8 +882,8 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc, */ BUG_ON(!list_empty(&req->r_req_lru_item)); __register_request(osdc, req); - list_add(&req->r_req_lru_item, &osdc->req_unsent); - list_add(&req->r_osd_item, &req->r_osd->o_requests); + list_add_tail(&req->r_req_lru_item, &osdc->req_unsent); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); __unregister_linger_request(osdc, req); dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, osd->o_osd); @@ -654,8 +955,7 @@ static void put_osd(struct ceph_osd *osd) if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); kfree(osd); } } @@ -820,14 +1120,6 @@ static void __register_request(struct ceph_osd_client *osdc, } } -static void register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) -{ - mutex_lock(&osdc->request_mutex); - __register_request(osdc, req); - mutex_unlock(&osdc->request_mutex); -} - /* * called under osdc->request_mutex */ @@ -912,6 +1204,7 @@ void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, mutex_lock(&osdc->request_mutex); if (req->r_linger) { __unregister_linger_request(osdc, req); + req->r_linger = 0; ceph_osdc_put_request(req); } mutex_unlock(&osdc->request_mutex); @@ -952,8 +1245,8 @@ static int __map_request(struct ceph_osd_client *osdc, int err; dout("map_request %p tid %lld\n", req, req->r_tid); - err = ceph_calc_object_layout(&pgid, req->r_oid, - &req->r_file_layout, osdc->osdmap); + err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, + ceph_file_layout_pg_pool(req->r_file_layout)); if (err) { list_move(&req->r_req_lru_item, &osdc->req_notarget); return err; @@ -1007,10 +1300,10 @@ static int __map_request(struct ceph_osd_client *osdc, if (req->r_osd) { __remove_osd_from_lru(req->r_osd); - list_add(&req->r_osd_item, &req->r_osd->o_requests); - list_move(&req->r_req_lru_item, &osdc->req_unsent); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); + list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); } else { - list_move(&req->r_req_lru_item, &osdc->req_notarget); + list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); } err = 1; /* osd or pg changed */ @@ -1045,8 +1338,14 @@ static void __send_request(struct ceph_osd_client *osdc, list_move_tail(&req->r_req_lru_item, &osdc->req_lru); ceph_msg_get(req->r_request); /* send consumes a ref */ - ceph_con_send(&req->r_osd->o_con, req->r_request); + + /* Mark the request unsafe if this is the first timet's being sent. */ + + if (!req->r_sent && req->r_unsafe_callback) + req->r_unsafe_callback(req, true); req->r_sent = req->r_osd->o_incarnation; + + ceph_con_send(&req->r_osd->o_con, req->r_request); } /* @@ -1134,31 +1433,11 @@ static void handle_osds_timeout(struct work_struct *work) static void complete_request(struct ceph_osd_request *req) { - if (req->r_safe_callback) - req->r_safe_callback(req, NULL); + if (req->r_unsafe_callback) + req->r_unsafe_callback(req, false); complete_all(&req->r_safe_completion); /* fsync waiter */ } -static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid) -{ - __u8 v; - - ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad); - v = ceph_decode_8(p); - if (v > 1) { - pr_warning("do not understand pg encoding %d > 1", v); - return -EINVAL; - } - pgid->pool = ceph_decode_64(p); - pgid->seed = ceph_decode_32(p); - *p += 4; - return 0; - -bad: - pr_warning("incomplete pg encoding"); - return -EINVAL; -} - /* * handle osd op reply. either call the callback if it is specified, * or do the completion to wake up the waiting thread. @@ -1170,7 +1449,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, struct ceph_osd_request *req; u64 tid; int object_len; - int numops, payload_len, flags; + unsigned int numops; + int payload_len, flags; s32 result; s32 retry_attempt; struct ceph_pg pg; @@ -1178,7 +1458,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, u32 reassert_epoch; u64 reassert_version; u32 osdmap_epoch; - int i; + int already_completed; + u32 bytes; + unsigned int i; tid = le64_to_cpu(msg->hdr.tid); dout("handle_reply %p tid %llu\n", msg, tid); @@ -1191,7 +1473,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, ceph_decode_need(&p, end, object_len, bad); p += object_len; - err = __decode_pgid(&p, end, &pg); + err = ceph_decode_pgid(&p, end, &pg); if (err) goto bad; @@ -1207,8 +1489,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, req = __lookup_request(osdc, tid); if (req == NULL) { dout("handle_reply tid %llu dne\n", tid); - mutex_unlock(&osdc->request_mutex); - return; + goto bad_mutex; } ceph_osdc_get_request(req); @@ -1233,9 +1514,10 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, payload_len += len; p += sizeof(*op); } - if (payload_len != le32_to_cpu(msg->hdr.data_len)) { + bytes = le32_to_cpu(msg->hdr.data_len); + if (payload_len != bytes) { pr_warning("sum of op payload lens %d != data_len %d", - payload_len, le32_to_cpu(msg->hdr.data_len)); + payload_len, bytes); goto bad_put; } @@ -1244,21 +1526,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, for (i = 0; i < numops; i++) req->r_reply_op_result[i] = ceph_decode_32(&p); - /* - * if this connection filled our message, drop our reference now, to - * avoid a (safe but slower) revoke later. - */ - if (req->r_con_filling_msg == con && req->r_reply == msg) { - dout(" dropping con_filling_msg ref %p\n", con); - req->r_con_filling_msg = NULL; - con->ops->put(con); - } - if (!req->r_got_reply) { - unsigned int bytes; req->r_result = result; - bytes = le32_to_cpu(msg->hdr.data_len); dout("handle_reply result %d bytes %d\n", req->r_result, bytes); if (req->r_result == 0) @@ -1286,7 +1556,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, ((flags & CEPH_OSD_FLAG_WRITE) == 0)) __unregister_request(osdc, req); + already_completed = req->r_completed; + req->r_completed = 1; mutex_unlock(&osdc->request_mutex); + if (already_completed) + goto done; if (req->r_callback) req->r_callback(req, msg); @@ -1303,6 +1577,8 @@ done: bad_put: ceph_osdc_put_request(req); +bad_mutex: + mutex_unlock(&osdc->request_mutex); bad: pr_err("corrupt osd_op_reply got %d %d\n", (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); @@ -1736,6 +2012,104 @@ bad: } /* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, + struct ceph_snap_context *snapc, u64 snap_id, + struct timespec *mtime) +{ + struct ceph_msg *msg = req->r_request; + void *p; + size_t msg_size; + int flags = req->r_flags; + u64 data_len; + unsigned int i; + + req->r_snapid = snap_id; + req->r_snapc = ceph_get_snap_context(snapc); + + /* encode request */ + msg->hdr.version = cpu_to_le16(4); + + p = msg->front.iov_base; + ceph_encode_32(&p, 1); /* client_inc is always 1 */ + req->r_request_osdmap_epoch = p; + p += 4; + req->r_request_flags = p; + p += 4; + if (req->r_flags & CEPH_OSD_FLAG_WRITE) + ceph_encode_timespec(p, mtime); + p += sizeof(struct ceph_timespec); + req->r_request_reassert_version = p; + p += sizeof(struct ceph_eversion); /* will get filled in */ + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + req->r_request_pool = p; + p += 8; + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + ceph_encode_8(&p, 1); + req->r_request_pgid = p; + p += 8 + 4; + ceph_encode_32(&p, -1); /* preferred */ + + /* oid */ + ceph_encode_32(&p, req->r_oid_len); + memcpy(p, req->r_oid, req->r_oid_len); + dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len); + p += req->r_oid_len; + + /* ops--can imply data */ + ceph_encode_16(&p, (u16)req->r_num_ops); + data_len = 0; + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(req, p, i); + p += sizeof(struct ceph_osd_op); + } + + /* snaps */ + ceph_encode_64(&p, req->r_snapid); + ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); + ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); + if (req->r_snapc) { + for (i = 0; i < snapc->num_snaps; i++) { + ceph_encode_64(&p, req->r_snapc->snaps[i]); + } + } + + req->r_request_attempts = p; + p += 4; + + /* data */ + if (flags & CEPH_OSD_FLAG_WRITE) { + u16 data_off; + + /* + * The header "data_off" is a hint to the receiver + * allowing it to align received data into its + * buffers such that there's no need to re-copy + * it before writing it to disk (direct I/O). + */ + data_off = (u16) (off & 0xffff); + req->r_request->hdr.data_off = cpu_to_le16(data_off); + } + req->r_request->hdr.data_len = cpu_to_le32(data_len); + + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); + msg_size = p - msg->front.iov_base; + msg->front.iov_len = msg_size; + msg->hdr.front_len = cpu_to_le32(msg_size); + + dout("build_request msg_size was %d\n", (int)msg_size); +} +EXPORT_SYMBOL(ceph_osdc_build_request); + +/* * Register request, send initial attempt. */ int ceph_osdc_start_request(struct ceph_osd_client *osdc, @@ -1744,41 +2118,28 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc, { int rc = 0; - req->r_request->pages = req->r_pages; - req->r_request->nr_pages = req->r_num_pages; -#ifdef CONFIG_BLOCK - req->r_request->bio = req->r_bio; -#endif - req->r_request->trail = &req->r_trail; - - register_request(osdc, req); - down_read(&osdc->map_sem); mutex_lock(&osdc->request_mutex); - /* - * a racing kick_requests() may have sent the message for us - * while we dropped request_mutex above, so only send now if - * the request still han't been touched yet. - */ - if (req->r_sent == 0) { - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } - goto out_unlock; - } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); - } else { - __send_request(osdc, req); + __register_request(osdc, req); + req->r_sent = 0; + req->r_got_reply = 0; + req->r_completed = 0; + rc = __map_request(osdc, req, 0); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; } - rc = 0; + goto out_unlock; } - + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + __send_queued(osdc); + } + rc = 0; out_unlock: mutex_unlock(&osdc->request_mutex); up_read(&osdc->map_sem); @@ -1940,18 +2301,22 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, 0, truncate_seq, truncate_size, NULL, - false, page_align); + NULL, truncate_seq, truncate_size, + false); if (IS_ERR(req)) return PTR_ERR(req); /* it may be a short read due to an object boundary */ - req->r_pages = pages; - dout("readpages final extent is %llu~%llu (%d pages align %d)\n", - off, *plen, req->r_num_pages, page_align); + osd_req_op_extent_osd_data_pages(req, 0, + pages, *plen, page_align, false, false); + + dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", + off, *plen, *plen, page_align); + + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); rc = ceph_osdc_start_request(osdc, req, false); if (!rc) @@ -1978,20 +2343,21 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, int rc = 0; int page_align = off & ~PAGE_MASK; - BUG_ON(vino.snap != CEPH_NOSNAP); - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, + BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - snapc, 0, - truncate_seq, truncate_size, mtime, - true, page_align); + snapc, truncate_seq, truncate_size, + true); if (IS_ERR(req)) return PTR_ERR(req); /* it may be a short write due to an object boundary */ - req->r_pages = pages; - dout("writepages %llu~%llu (%d pages)\n", off, len, - req->r_num_pages); + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, false); + dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); + + ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); rc = ceph_osdc_start_request(osdc, req, true); if (!rc) @@ -2005,6 +2371,26 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, } EXPORT_SYMBOL(ceph_osdc_writepages); +int ceph_osdc_setup(void) +{ + BUG_ON(ceph_osd_request_cache); + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", + sizeof (struct ceph_osd_request), + __alignof__(struct ceph_osd_request), + 0, NULL); + + return ceph_osd_request_cache ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(ceph_osdc_setup); + +void ceph_osdc_cleanup(void) +{ + BUG_ON(!ceph_osd_request_cache); + kmem_cache_destroy(ceph_osd_request_cache); + ceph_osd_request_cache = NULL; +} +EXPORT_SYMBOL(ceph_osdc_cleanup); + /* * handle incoming message */ @@ -2064,13 +2450,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, goto out; } - if (req->r_con_filling_msg) { + if (req->r_reply->con) dout("%s revoking msg %p from old con %p\n", __func__, - req->r_reply, req->r_con_filling_msg); - ceph_msg_revoke_incoming(req->r_reply); - req->r_con_filling_msg->ops->put(req->r_con_filling_msg); - req->r_con_filling_msg = NULL; - } + req->r_reply, req->r_reply->con); + ceph_msg_revoke_incoming(req->r_reply); if (front > req->r_reply->front.iov_len) { pr_warning("get_reply front %d > preallocated %d\n", @@ -2084,26 +2467,29 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_get(req->r_reply); if (data_len > 0) { - int want = calc_pages_for(req->r_page_alignment, data_len); - - if (req->r_pages && unlikely(req->r_num_pages < want)) { - pr_warning("tid %lld reply has %d bytes %d pages, we" - " had only %d pages ready\n", tid, data_len, - want, req->r_num_pages); - *skip = 1; - ceph_msg_put(m); - m = NULL; - goto out; + struct ceph_osd_data *osd_data; + + /* + * XXX This is assuming there is only one op containing + * XXX page data. Probably OK for reads, but this + * XXX ought to be done more generally. + */ + osd_data = osd_req_op_extent_osd_data(req, 0); + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + if (osd_data->pages && + unlikely(osd_data->length < data_len)) { + + pr_warning("tid %lld reply has %d bytes " + "we had only %llu bytes ready\n", + tid, data_len, osd_data->length); + *skip = 1; + ceph_msg_put(m); + m = NULL; + goto out; + } } - m->pages = req->r_pages; - m->nr_pages = req->r_num_pages; - m->page_alignment = req->r_page_alignment; -#ifdef CONFIG_BLOCK - m->bio = req->r_bio; -#endif } *skip = 0; - req->r_con_filling_msg = con->ops->get(con); dout("get_reply tid %lld %p\n", tid, m); out: @@ -2168,13 +2554,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, struct ceph_auth_handshake *auth = &o->o_auth; if (force_new && auth->authorizer) { - if (ac->ops && ac->ops->destroy_authorizer) - ac->ops->destroy_authorizer(ac, auth->authorizer); + ceph_auth_destroy_authorizer(ac, auth->authorizer); auth->authorizer = NULL; } - if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { - int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, - auth); + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); if (ret) return ERR_PTR(ret); } @@ -2190,11 +2580,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - /* - * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, - * XXX which do we do: succeed or fail? - */ - return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len); } static int invalidate_authorizer(struct ceph_connection *con) @@ -2203,9 +2589,7 @@ static int invalidate_authorizer(struct ceph_connection *con) struct ceph_osd_client *osdc = o->o_osdc; struct ceph_auth_client *ac = osdc->client->monc.auth; - if (ac->ops && ac->ops->invalidate_authorizer) - ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); - + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); return ceph_monc_validate_auth(&osdc->client->monc); } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 4543b9aba40c..603ddd92db19 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -654,24 +654,6 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) return 0; } -static int __decode_pgid(void **p, void *end, struct ceph_pg *pg) -{ - u8 v; - - ceph_decode_need(p, end, 1+8+4+4, bad); - v = ceph_decode_8(p); - if (v != 1) - goto bad; - pg->pool = ceph_decode_64(p); - pg->seed = ceph_decode_32(p); - *p += 4; /* skip preferred */ - return 0; - -bad: - dout("error decoding pgid\n"); - return -EINVAL; -} - /* * decode a full map. */ @@ -765,7 +747,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) struct ceph_pg pgid; struct ceph_pg_mapping *pg; - err = __decode_pgid(p, end, &pgid); + err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; ceph_decode_need(p, end, sizeof(u32), bad); @@ -983,7 +965,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, struct ceph_pg pgid; u32 pglen; - err = __decode_pgid(p, end, &pgid); + err = ceph_decode_pgid(p, end, &pgid); if (err) goto bad; ceph_decode_need(p, end, sizeof(u32), bad); @@ -1111,27 +1093,22 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); * calculate an object layout (i.e. pgid) from an oid, * file_layout, and osdmap */ -int ceph_calc_object_layout(struct ceph_pg *pg, - const char *oid, - struct ceph_file_layout *fl, - struct ceph_osdmap *osdmap) +int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid, + struct ceph_osdmap *osdmap, uint64_t pool) { - unsigned int num, num_mask; - struct ceph_pg_pool_info *pool; + struct ceph_pg_pool_info *pool_info; BUG_ON(!osdmap); - pg->pool = le32_to_cpu(fl->fl_pg_pool); - pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool); - if (!pool) + pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool); + if (!pool_info) return -EIO; - pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid)); - num = pool->pg_num; - num_mask = pool->pg_num_mask; + pg->pool = pool; + pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid)); - dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed); + dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed); return 0; } -EXPORT_SYMBOL(ceph_calc_object_layout); +EXPORT_SYMBOL(ceph_calc_ceph_pg); /* * Calculate raw osd vector for the given pgid. Return pointer to osd diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c new file mode 100644 index 000000000000..154683f5f14c --- /dev/null +++ b/net/ceph/snapshot.c @@ -0,0 +1,78 @@ +/* + * snapshot.c Ceph snapshot context utility routines (part of libceph) + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ + +#include <stddef.h> + +#include <linux/types.h> +#include <linux/export.h> +#include <linux/ceph/libceph.h> + +/* + * Ceph snapshot contexts are reference counted objects, and the + * returned structure holds a single reference. Acquire additional + * references with ceph_get_snap_context(), and release them with + * ceph_put_snap_context(). When the reference count reaches zero + * the entire structure is freed. + */ + +/* + * Create a new ceph snapshot context large enough to hold the + * indicated number of snapshot ids (which can be 0). Caller has + * to fill in snapc->seq and snapc->snaps[0..snap_count-1]. + * + * Returns a null pointer if an error occurs. + */ +struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, + gfp_t gfp_flags) +{ + struct ceph_snap_context *snapc; + size_t size; + + size = sizeof (struct ceph_snap_context); + size += snap_count * sizeof (snapc->snaps[0]); + snapc = kzalloc(size, gfp_flags); + if (!snapc) + return NULL; + + atomic_set(&snapc->nref, 1); + snapc->num_snaps = snap_count; + + return snapc; +} +EXPORT_SYMBOL(ceph_create_snap_context); + +struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) +{ + if (sc) + atomic_inc(&sc->nref); + return sc; +} +EXPORT_SYMBOL(ceph_get_snap_context); + +void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} +EXPORT_SYMBOL(ceph_put_snap_context); diff --git a/net/core/dev.c b/net/core/dev.c index 40b1fadaf637..fc1e289397f5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2213,6 +2213,17 @@ __be16 skb_network_protocol(struct sk_buff *skb) __be16 type = skb->protocol; int vlan_depth = ETH_HLEN; + /* Tunnel gso handlers can set protocol to ethernet. */ + if (type == htons(ETH_P_TEB)) { + struct ethhdr *eth; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) + return 0; + + eth = (struct ethhdr *)skb_mac_header(skb); + type = eth->h_proto; + } + while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { struct vlan_hdr *vh; diff --git a/net/core/iovec.c b/net/core/iovec.c index 7e7aeb01de45..de178e462682 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -75,31 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a /* * Copy kernel to iovec. Returns -EFAULT on error. - * - * Note: this modifies the original iovec. - */ - -int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) -{ - while (len > 0) { - if (iov->iov_len) { - int copy = min_t(unsigned int, iov->iov_len, len); - if (copy_to_user(iov->iov_base, kdata, copy)) - return -EFAULT; - kdata += copy; - len -= copy; - iov->iov_len -= copy; - iov->iov_base += copy; - } - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_toiovec); - -/* - * Copy kernel to iovec. Returns -EFAULT on error. */ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, @@ -125,31 +100,6 @@ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, EXPORT_SYMBOL(memcpy_toiovecend); /* - * Copy iovec to kernel. Returns -EFAULT on error. - * - * Note: this modifies the original iovec. - */ - -int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) -{ - while (len > 0) { - if (iov->iov_len) { - int copy = min_t(unsigned int, len, iov->iov_len); - if (copy_from_user(kdata, iov->iov_base, copy)) - return -EFAULT; - len -= copy; - kdata += copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - iov++; - } - - return 0; -} -EXPORT_SYMBOL(memcpy_fromiovec); - -/* * Copy iovec from kernel. Returns -EFAULT on error. */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7427ab5e27d8..981fed397d1d 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -606,21 +606,11 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return sprintf(buf, "%lu\n", val); } -static void rps_dev_flow_table_release_work(struct work_struct *work) -{ - struct rps_dev_flow_table *table = container_of(work, - struct rps_dev_flow_table, free_work); - - vfree(table); -} - static void rps_dev_flow_table_release(struct rcu_head *rcu) { struct rps_dev_flow_table *table = container_of(rcu, struct rps_dev_flow_table, rcu); - - INIT_WORK(&table->free_work, rps_dev_flow_table_release_work); - schedule_work(&table->free_work); + vfree(table); } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a5802a8b652f..cec074be8c43 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -206,7 +206,7 @@ static void netpoll_poll_dev(struct net_device *dev) * the dev_open/close paths use this to block netpoll activity * while changing device state */ - if (!down_trylock(&ni->dev_lock)) + if (down_trylock(&ni->dev_lock)) return; if (!netif_running(dev)) { diff --git a/net/core/sock.c b/net/core/sock.c index d4f4cea726e7..6ba327da79e1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1217,18 +1217,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) #endif } -/* - * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes - * un-modified. Special care is taken when initializing object to zero. - */ -static inline void sk_prot_clear_nulls(struct sock *sk, int size) -{ - if (offsetof(struct sock, sk_node.next) != 0) - memset(sk, 0, offsetof(struct sock, sk_node.next)); - memset(&sk->sk_node.pprev, 0, - size - offsetof(struct sock, sk_node.pprev)); -} - void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) { unsigned long nulls1, nulls2; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ff06b7543d9f..49616fed9340 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -125,7 +125,6 @@ struct tnode { unsigned int empty_children; /* KEYLENGTH bits needed */ union { struct rcu_head rcu; - struct work_struct work; struct tnode *tnode_free; }; struct rt_trie_node __rcu *child[0]; @@ -383,12 +382,6 @@ static struct tnode *tnode_alloc(size_t size) return vzalloc(size); } -static void __tnode_vfree(struct work_struct *arg) -{ - struct tnode *tn = container_of(arg, struct tnode, work); - vfree(tn); -} - static void __tnode_free_rcu(struct rcu_head *head) { struct tnode *tn = container_of(head, struct tnode, rcu); @@ -397,10 +390,8 @@ static void __tnode_free_rcu(struct rcu_head *head) if (size <= PAGE_SIZE) kfree(tn); - else { - INIT_WORK(&tn->work, __tnode_vfree); - schedule_work(&tn->work); - } + else + vfree(tn); } static inline void tnode_free(struct tnode *tn) diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index cc22363965d2..b2e805af9b87 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -150,13 +150,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, csum = false; /* setup inner skb. */ - if (greh->protocol == htons(ETH_P_TEB)) { - struct ethhdr *eth = (struct ethhdr *)skb_inner_mac_header(skb); - skb->protocol = eth->h_proto; - } else { - skb->protocol = greh->protocol; - } - + skb->protocol = greh->protocol; skb->encapsulation = 0; if (unlikely(!pskb_may_pull(skb, ghl))) diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index e97d66a1fdde..7e06641e36ae 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -305,6 +305,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, setup_timer(&q->timer, f->frag_expire, (unsigned long)q); spin_lock_init(&q->lock); atomic_set(&q->refcnt, 1); + INIT_LIST_HEAD(&q->lru_list); return q; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c625e4dad4b0..2a83591492dd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -235,7 +235,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) */ struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn; - const struct iphdr *iph = (const struct iphdr *)skb->data; + const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct ip_tunnel *t; @@ -281,6 +281,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) else itn = net_generic(net, ipgre_net_id); + iph = (const struct iphdr *)skb->data; t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags, iph->daddr, iph->saddr, tpi.key); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 147abf5275aa..4bcabf3ab4ca 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -84,7 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; EXPORT_SYMBOL(sysctl_ip_default_ttl); /* Generate a checksum for an outgoing IP datagram. */ -__inline__ void ip_send_check(struct iphdr *iph) +void ip_send_check(struct iphdr *iph) { iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index f8a222cb6448..cf08218ddbcf 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -162,7 +162,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) return skb; } -static void ipt_ulog_packet(unsigned int hooknum, +static void ipt_ulog_packet(struct net *net, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, @@ -174,7 +175,6 @@ static void ipt_ulog_packet(unsigned int hooknum, size_t size, copy_len; struct nlmsghdr *nlh; struct timeval tv; - struct net *net = dev_net(in ? in : out); struct ulog_net *ulog = ulog_pernet(net); /* ffs == find first bit set, necessary because userspace @@ -291,12 +291,15 @@ alloc_failure: static unsigned int ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) { - ipt_ulog_packet(par->hooknum, skb, par->in, par->out, + struct net *net = dev_net(par->in ? par->in : par->out); + + ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out, par->targinfo, NULL); return XT_CONTINUE; } -static void ipt_logfn(u_int8_t pf, +static void ipt_logfn(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -318,7 +321,7 @@ static void ipt_logfn(u_int8_t pf, strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); } - ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); + ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix); } static int ulog_tg_check(const struct xt_tgchk_param *par) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dcb116dde216..ab450c099aa4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2887,6 +2887,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, unsigned int mss; struct sk_buff *gso_skb = skb; __sum16 newcheck; + bool ooo_okay, copy_destructor; if (!pskb_may_pull(skb, sizeof(*th))) goto out; @@ -2927,10 +2928,18 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, goto out; } + copy_destructor = gso_skb->destructor == tcp_wfree; + ooo_okay = gso_skb->ooo_okay; + /* All segments but the first should have ooo_okay cleared */ + skb->ooo_okay = 0; + segs = skb_segment(skb, features); if (IS_ERR(segs)) goto out; + /* Only first segment might have ooo_okay set */ + segs->ooo_okay = ooo_okay; + delta = htonl(oldlen + (thlen + mss)); skb = segs; @@ -2950,6 +2959,17 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, thlen, skb->csum)); seq += mss; + if (copy_destructor) { + skb->destructor = gso_skb->destructor; + skb->sk = gso_skb->sk; + /* {tcp|sock}_wfree() use exact truesize accounting : + * sum(skb->truesize) MUST be exactly be gso_skb->truesize + * So we account mss bytes of 'true size' for each segment. + * The last segment will contain the remaining. + */ + skb->truesize = mss; + gso_skb->truesize -= mss; + } skb = skb->next; th = tcp_hdr(skb); @@ -2962,7 +2982,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, * is freed at TX completion, and not right now when gso_skb * is freed by GSO engine */ - if (gso_skb->destructor == tcp_wfree) { + if (copy_destructor) { swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); swap(gso_skb->truesize, skb->truesize); @@ -3269,8 +3289,11 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, for (i = 0; i < shi->nr_frags; ++i) { const struct skb_frag_struct *f = &shi->frags[i]; - struct page *page = skb_frag_page(f); - sg_set_page(&sg, page, skb_frag_size(f), f->page_offset); + unsigned int offset = f->page_offset; + struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT); + + sg_set_page(&sg, page, skb_frag_size(f), + offset_in_page(offset)); if (crypto_hash_update(desc, &sg, skb_frag_size(f))) return 1; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 08bbe6096528..9c6225780bd5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2743,8 +2743,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, - int prior_sacked, bool is_dupack, - int flag) + int prior_sacked, int prior_packets, + bool is_dupack, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2804,7 +2804,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tcp_add_reno_sack(sk); } else do_lost = tcp_try_undo_partial(sk, pkts_acked); - newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; + newly_acked_sacked = prior_packets - tp->packets_out + + tp->sacked_out - prior_sacked; break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); @@ -2818,7 +2819,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (is_dupack) tcp_add_reno_sack(sk); } - newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; + newly_acked_sacked = prior_packets - tp->packets_out + + tp->sacked_out - prior_sacked; if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); @@ -3330,9 +3332,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_dupack = false; u32 prior_in_flight; u32 prior_fackets; - int prior_packets; + int prior_packets = tp->packets_out; int prior_sacked = tp->sacked_out; int pkts_acked = 0; + int previous_packets_out = 0; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3403,14 +3406,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sk->sk_err_soft = 0; icsk->icsk_probes_out = 0; tp->rcv_tstamp = tcp_time_stamp; - prior_packets = tp->packets_out; if (!prior_packets) goto no_queue; /* See if we can take anything off of the retransmit queue. */ + previous_packets_out = tp->packets_out; flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); - pkts_acked = prior_packets - tp->packets_out; + pkts_acked = previous_packets_out - tp->packets_out; if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ @@ -3418,7 +3421,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, - is_dupack, flag); + prior_packets, is_dupack, flag); } else { if (flag & FLAG_DATA_ACKED) tcp_cong_avoid(sk, ack, prior_in_flight); @@ -3441,7 +3444,7 @@ no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, - is_dupack, flag); + prior_packets, is_dupack, flag); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3464,7 +3467,7 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, - is_dupack, flag); + prior_packets, is_dupack, flag); } SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index f696d7c2e9fa..f6a005c485a9 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -96,7 +96,8 @@ struct tcpm_hash_bucket { static DEFINE_SPINLOCK(tcp_metrics_lock); -static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) +static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, + bool fastopen_clear) { u32 val; @@ -122,9 +123,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); tm->tcpm_ts = 0; tm->tcpm_ts_stamp = 0; - tm->tcpm_fastopen.mss = 0; - tm->tcpm_fastopen.syn_loss = 0; - tm->tcpm_fastopen.cookie.len = 0; + if (fastopen_clear) { + tm->tcpm_fastopen.mss = 0; + tm->tcpm_fastopen.syn_loss = 0; + tm->tcpm_fastopen.cookie.len = 0; + } } static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, @@ -154,7 +157,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, } tm->tcpm_addr = *addr; - tcpm_suck_dst(tm, dst); + tcpm_suck_dst(tm, dst, true); if (likely(!reclaim)) { tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; @@ -171,7 +174,7 @@ out_unlock: static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) { if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) - tcpm_suck_dst(tm, dst); + tcpm_suck_dst(tm, dst, false); } #define TCP_METRICS_RECLAIM_DEPTH 5 diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 536d40929ba6..ec335fabd5cc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -874,11 +874,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) { + if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); - skb->ooo_okay = 1; - } else - skb->ooo_okay = 0; + + /* if no packet is in qdisc/device queue, then allow XPS to select + * another queue. + */ + skb->ooo_okay = sk_wmem_alloc_get(sk) == 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0ae038a4c7a8..0bf5d399a03c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2311,7 +2311,6 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); int mac_len = skb->mac_len; int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); - struct ethhdr *inner_eth = (struct ethhdr *)skb_inner_mac_header(skb); __be16 protocol = skb->protocol; netdev_features_t enc_features; int outer_hlen; @@ -2324,8 +2323,7 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); - inner_eth = (struct ethhdr *)skb_mac_header(skb); - skb->protocol = inner_eth->h_proto; + skb->protocol = htons(ETH_P_TEB); /* segment inner packet. */ enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index d3ddd8400354..ecd60733e5e2 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1081,6 +1081,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, } if (t == NULL) t = netdev_priv(dev); + memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; @@ -1128,6 +1129,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, if (t) { err = 0; + memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d2eedf192330..dae1949019d7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1147,7 +1147,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, if (WARN_ON(np->cork.opt)) return -EINVAL; - np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation); + np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation); if (unlikely(np->cork.opt == NULL)) return -ENOBUFS; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 71167069b394..0a17ed9eaf39 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1890,6 +1890,17 @@ void tcp6_proc_exit(struct net *net) } #endif +static void tcp_v6_clear_sk(struct sock *sk, int size) +{ + struct inet_sock *inet = inet_sk(sk); + + /* we do not want to clear pinet6 field, because of RCU lookups */ + sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6)); + + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); + memset(&inet->pinet6 + 1, 0, size); +} + struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, @@ -1933,6 +1944,7 @@ struct proto tcpv6_prot = { #ifdef CONFIG_MEMCG_KMEM .proto_cgroup = tcp_proto_cgroup, #endif + .clear_sk = tcp_v6_clear_sk, }; static const struct inet6_protocol tcpv6_protocol = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d4defdd44937..42923b14dfa6 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1432,6 +1432,17 @@ void udp6_proc_exit(struct net *net) { } #endif /* CONFIG_PROC_FS */ +void udp_v6_clear_sk(struct sock *sk, int size) +{ + struct inet_sock *inet = inet_sk(sk); + + /* we do not want to clear pinet6 field, because of RCU lookups */ + sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6)); + + size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6); + memset(&inet->pinet6 + 1, 0, size); +} + /* ------------------------------------------------------------------------ */ struct proto udpv6_prot = { @@ -1462,7 +1473,7 @@ struct proto udpv6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, + .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udpv6_protosw = { diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index d7571046bfc4..4691ed50a928 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -31,6 +31,8 @@ extern int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, extern int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); extern void udpv6_destroy_sock(struct sock *sk); +extern void udp_v6_clear_sk(struct sock *sk, int size); + #ifdef CONFIG_PROC_FS extern int udp6_seq_show(struct seq_file *seq, void *v); #endif diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 1d08e21d9f69..dfcc4be46898 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -56,7 +56,7 @@ struct proto udplitev6_prot = { .compat_setsockopt = compat_udpv6_setsockopt, .compat_getsockopt = compat_udpv6_getsockopt, #endif - .clear_sk = sk_prot_clear_portaddr_nulls, + .clear_sk = udp_v6_clear_sk, }; static struct inet_protosw udplite6_protosw = { diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 4ef7bdb65440..23ed03d786c8 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -103,8 +103,10 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, dev_hold(dev); xdst->u.rt6.rt6i_idev = in6_dev_get(dev); - if (!xdst->u.rt6.rt6i_idev) + if (!xdst->u.rt6.rt6i_idev) { + dev_put(dev); return -ENODEV; + } rt6_transfer_peer(&xdst->u.rt6, rt); diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c index 8c004161a843..9ea0c933b9ff 100644 --- a/net/irda/irlap_frame.c +++ b/net/irda/irlap_frame.c @@ -544,7 +544,7 @@ static void irlap_recv_discovery_xid_cmd(struct irlap_cb *self, /* * We now have some discovery info to deliver! */ - discovery = kmalloc(sizeof(discovery_t), GFP_ATOMIC); + discovery = kzalloc(sizeof(discovery_t), GFP_ATOMIC); if (!discovery) { IRDA_WARNING("%s: unable to malloc!\n", __func__); return; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 158e6eb188d3..44be28cfc6c4 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1267,6 +1267,7 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata); void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, __le16 fc, bool acked); +void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); /* IBSS code */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 29620bfc7a69..a46e490f20dd 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1015,7 +1015,8 @@ static void ieee80211_chswitch_timer(unsigned long data) static void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, - u64 timestamp, struct ieee802_11_elems *elems) + u64 timestamp, struct ieee802_11_elems *elems, + bool beacon) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -1032,6 +1033,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct cfg80211_chan_def new_vht_chandef = {}; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; + const struct ieee80211_ht_operation *ht_oper; int secondary_channel_offset = -1; ASSERT_MGD_MTX(ifmgd); @@ -1048,11 +1050,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sec_chan_offs = elems->sec_chan_offs; wide_bw_chansw_ie = elems->wide_bw_chansw_ie; + ht_oper = elems->ht_operation; if (ifmgd->flags & (IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_40MHZ)) { sec_chan_offs = NULL; wide_bw_chansw_ie = NULL; + /* only used for bandwidth here */ + ht_oper = NULL; } if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT) @@ -1094,10 +1099,20 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; } - if (sec_chan_offs) { + if (!beacon && sec_chan_offs) { secondary_channel_offset = sec_chan_offs->sec_chan_offs; + } else if (beacon && ht_oper) { + secondary_channel_offset = + ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET; } else if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - /* if HT is enabled and the IE not present, it's still HT */ + /* + * If it's not a beacon, HT is enabled and the IE not present, + * it's 20 MHz, 802.11-2012 8.5.2.6: + * This element [the Secondary Channel Offset Element] is + * present when switching to a 40 MHz channel. It may be + * present when switching to a 20 MHz channel (in which + * case the secondary channel offset is set to SCN). + */ secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE; } @@ -2796,7 +2811,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, mutex_unlock(&local->iflist_mtx); } - ieee80211_sta_process_chanswitch(sdata, rx_status->mactime, elems); + ieee80211_sta_process_chanswitch(sdata, rx_status->mactime, + elems, true); } @@ -3210,7 +3226,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, ieee80211_sta_process_chanswitch(sdata, rx_status->mactime, - &elems); + &elems, false); } else if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) { ies_len = skb->len - offsetof(struct ieee80211_mgmt, @@ -3232,7 +3248,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, ieee80211_sta_process_chanswitch(sdata, rx_status->mactime, - &elems); + &elems, false); } break; } @@ -3623,6 +3639,31 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata) } } +#ifdef CONFIG_PM +void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + + mutex_lock(&ifmgd->mtx); + if (!ifmgd->associated) { + mutex_unlock(&ifmgd->mtx); + return; + } + + if (sdata->flags & IEEE80211_SDATA_DISCONNECT_RESUME) { + sdata->flags &= ~IEEE80211_SDATA_DISCONNECT_RESUME; + mlme_dbg(sdata, "driver requested disconnect after resume\n"); + ieee80211_sta_connection_lost(sdata, + ifmgd->associated->bssid, + WLAN_REASON_UNSPECIFIED, + true); + mutex_unlock(&ifmgd->mtx); + return; + } + mutex_unlock(&ifmgd->mtx); +} +#endif + /* interface setup */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) { @@ -4329,7 +4370,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; bool tx = !req->local_state_change; - bool sent_frame = false; + bool report_frame = false; mutex_lock(&ifmgd->mtx); @@ -4346,7 +4387,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_destroy_auth_data(sdata, false); mutex_unlock(&ifmgd->mtx); - sent_frame = tx; + report_frame = true; goto out; } @@ -4354,12 +4395,12 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ether_addr_equal(ifmgd->associated->bssid, req->bssid)) { ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, req->reason_code, tx, frame_buf); - sent_frame = tx; + report_frame = true; } mutex_unlock(&ifmgd->mtx); out: - if (sent_frame) + if (report_frame) __cfg80211_send_deauth(sdata->dev, frame_buf, IEEE80211_DEAUTH_FRAME_LEN); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 0d51877efdb7..d3f414fe67e0 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -688,8 +688,15 @@ int rate_control_set_rates(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_sta_rates *rates) { - struct ieee80211_sta_rates *old = rcu_dereference(pubsta->rates); + struct ieee80211_sta_rates *old; + /* + * mac80211 guarantees that this function will not be called + * concurrently, so the following RCU access is safe, even without + * extra locking. This can not be checked easily, so we just set + * the condition to true. + */ + old = rcu_dereference_protected(pubsta->rates, true); rcu_assign_pointer(pubsta->rates, rates); if (old) kfree_rcu(old, rcu_head); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c8447af76ead..8e2952620256 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3036,6 +3036,9 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, * and location updates. Note that mac80211 * itself never looks at these frames. */ + if (!multicast && + !ether_addr_equal(sdata->vif.addr, hdr->addr1)) + return 0; if (ieee80211_is_public_action(hdr, skb->len)) return 1; if (!ieee80211_is_beacon(hdr->frame_control)) diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c index 3ed801d90f1e..124b1fdc20d0 100644 --- a/net/mac80211/tkip.c +++ b/net/mac80211/tkip.c @@ -208,10 +208,10 @@ void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, u32 iv32 = get_unaligned_le32(&data[4]); u16 iv16 = data[2] | (data[0] << 8); - spin_lock_bh(&key->u.tkip.txlock); + spin_lock(&key->u.tkip.txlock); ieee80211_compute_tkip_p1k(key, iv32); tkip_mixing_phase2(tk, ctx, iv16, p2k); - spin_unlock_bh(&key->u.tkip.txlock); + spin_unlock(&key->u.tkip.txlock); } EXPORT_SYMBOL(ieee80211_get_tkip_p2k); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 3f87fa468b1f..27e07150eb46 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1740,6 +1740,13 @@ int ieee80211_reconfig(struct ieee80211_local *local) mb(); local->resuming = false; + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; + if (sdata->vif.type == NL80211_IFTYPE_STATION) + ieee80211_sta_restart(sdata); + } + mod_timer(&local->sta_cleanup, jiffies + 1); #else WARN_ON(1); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 388656d5a9ec..3b18dd1be7d9 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -148,7 +148,7 @@ void nf_log_packet(struct net *net, va_start(args, fmt); vsnprintf(prefix, sizeof(prefix), fmt, args); va_end(args); - logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); + logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix); } rcu_read_unlock(); } @@ -368,17 +368,20 @@ static int __net_init nf_log_net_init(struct net *net) return 0; out_sysctl: +#ifdef CONFIG_PROC_FS /* For init_net: errors will trigger panic, don't unroll on error. */ if (!net_eq(net, &init_net)) remove_proc_entry("nf_log", net->nf.proc_netfilter); - +#endif return ret; } static void __net_exit nf_log_net_exit(struct net *net) { netfilter_log_sysctl_exit(net); +#ifdef CONFIG_PROC_FS remove_proc_entry("nf_log", net->nf.proc_netfilter); +#endif } static struct pernet_operations nf_log_net_ops = { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index faf1e9300d8a..962e9792e317 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -602,7 +602,8 @@ static struct nf_loginfo default_loginfo = { /* log handler for internal netfilter logging api */ void -nfulnl_log_packet(u_int8_t pf, +nfulnl_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -615,7 +616,6 @@ nfulnl_log_packet(u_int8_t pf, const struct nf_loginfo *li; unsigned int qthreshold; unsigned int plen; - struct net *net = dev_net(in ? in : out); struct nfnl_log_net *log = nfnl_log_pernet(net); if (li_user && li_user->type == NF_LOG_TYPE_ULOG) @@ -1045,7 +1045,9 @@ static int __net_init nfnl_log_net_init(struct net *net) static void __net_exit nfnl_log_net_exit(struct net *net) { +#ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); +#endif } static struct pernet_operations nfnl_log_net_ops = { diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 2e0e835baf72..4e27fa035814 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -1285,7 +1285,9 @@ static int __net_init nfnl_queue_net_init(struct net *net) static void __net_exit nfnl_queue_net_exit(struct net *net) { +#ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); +#endif } static struct pernet_operations nfnl_queue_net_ops = { diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index fe573f6c9e91..491c7d821a0b 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -466,7 +466,8 @@ log_packet_common(struct sbuff *m, static void -ipt_log_packet(u_int8_t pf, +ipt_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -475,7 +476,6 @@ ipt_log_packet(u_int8_t pf, const char *prefix) { struct sbuff *m; - struct net *net = dev_net(in ? in : out); /* FIXME: Disabled from containers until syslog ns is supported */ if (!net_eq(net, &init_net)) @@ -797,7 +797,8 @@ fallback: } static void -ip6t_log_packet(u_int8_t pf, +ip6t_log_packet(struct net *net, + u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, @@ -806,7 +807,6 @@ ip6t_log_packet(u_int8_t pf, const char *prefix) { struct sbuff *m; - struct net *net = dev_net(in ? in : out); /* FIXME: Disabled from containers until syslog ns is supported */ if (!net_eq(net, &init_net)) @@ -833,17 +833,18 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_log_info *loginfo = par->targinfo; struct nf_loginfo li; + struct net *net = dev_net(par->in ? par->in : par->out); li.type = NF_LOG_TYPE_LOG; li.u.log.level = loginfo->level; li.u.log.logflags = loginfo->logflags; if (par->family == NFPROTO_IPV4) - ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, + ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in, par->out, &li, loginfo->prefix); #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) else if (par->family == NFPROTO_IPV6) - ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, + ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in, par->out, &li, loginfo->prefix); #endif else diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index a17dd0f589b2..fb7497c928a0 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -26,13 +26,14 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_nflog_info *info = par->targinfo; struct nf_loginfo li; + struct net *net = dev_net(par->in ? par->in : par->out); li.type = NF_LOG_TYPE_ULOG; li.u.ulog.copy_len = info->len; li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; - nfulnl_log_packet(par->family, par->hooknum, skb, par->in, + nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in, par->out, &li, info->prefix); return XT_CONTINUE; } diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 25fd1c4e1eec..1eb1a44bfd3d 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -30,17 +30,28 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) static unsigned int tcpoptstrip_mangle_packet(struct sk_buff *skb, - const struct xt_tcpoptstrip_target_info *info, + const struct xt_action_param *par, unsigned int tcphoff, unsigned int minlen) { + const struct xt_tcpoptstrip_target_info *info = par->targinfo; unsigned int optl, i, j; struct tcphdr *tcph; u_int16_t n, o; u_int8_t *opt; + int len; + + /* This is a fragment, no TCP header is available */ + if (par->fragoff != 0) + return XT_CONTINUE; if (!skb_make_writable(skb, skb->len)) return NF_DROP; + len = skb->len - tcphoff; + if (len < (int)sizeof(struct tcphdr) || + tcp_hdr(skb)->doff * 4 > len) + return NF_DROP; + tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); opt = (u_int8_t *)tcph; @@ -76,7 +87,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, static unsigned int tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) { - return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb), + return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb), sizeof(struct iphdr) + sizeof(struct tcphdr)); } @@ -94,7 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (tcphoff < 0) return NF_DROP; - return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff, + return tcpoptstrip_mangle_packet(skb, par, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); } #endif diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c index d8d424337550..6bb1d42f0fac 100644 --- a/net/netlabel/netlabel_domainhash.c +++ b/net/netlabel/netlabel_domainhash.c @@ -245,6 +245,71 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry, } } +/** + * netlbl_domhsh_validate - Validate a new domain mapping entry + * @entry: the entry to validate + * + * This function validates the new domain mapping entry to ensure that it is + * a valid entry. Returns zero on success, negative values on failure. + * + */ +static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry) +{ + struct netlbl_af4list *iter4; + struct netlbl_domaddr4_map *map4; +#if IS_ENABLED(CONFIG_IPV6) + struct netlbl_af6list *iter6; + struct netlbl_domaddr6_map *map6; +#endif /* IPv6 */ + + if (entry == NULL) + return -EINVAL; + + switch (entry->type) { + case NETLBL_NLTYPE_UNLABELED: + if (entry->type_def.cipsov4 != NULL || + entry->type_def.addrsel != NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_CIPSOV4: + if (entry->type_def.cipsov4 == NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_ADDRSELECT: + netlbl_af4list_foreach(iter4, &entry->type_def.addrsel->list4) { + map4 = netlbl_domhsh_addr4_entry(iter4); + switch (map4->type) { + case NETLBL_NLTYPE_UNLABELED: + if (map4->type_def.cipsov4 != NULL) + return -EINVAL; + break; + case NETLBL_NLTYPE_CIPSOV4: + if (map4->type_def.cipsov4 == NULL) + return -EINVAL; + break; + default: + return -EINVAL; + } + } +#if IS_ENABLED(CONFIG_IPV6) + netlbl_af6list_foreach(iter6, &entry->type_def.addrsel->list6) { + map6 = netlbl_domhsh_addr6_entry(iter6); + switch (map6->type) { + case NETLBL_NLTYPE_UNLABELED: + break; + default: + return -EINVAL; + } + } +#endif /* IPv6 */ + break; + default: + return -EINVAL; + } + + return 0; +} + /* * Domain Hash Table Functions */ @@ -311,6 +376,10 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry, struct netlbl_af6list *tmp6; #endif /* IPv6 */ + ret_val = netlbl_domhsh_validate(entry); + if (ret_val != 0) + return ret_val; + /* XXX - we can remove this RCU read lock as the spinlock protects the * entire function, but before we do we need to fixup the * netlbl_af[4,6]list RCU functions to do "the right thing" with diff --git a/net/socket.c b/net/socket.c index b416093997da..6b94633ca61d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2412,7 +2412,7 @@ static const unsigned char nargs[21] = { SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) { - unsigned long a[6]; + unsigned long a[AUDITSC_ARGS]; unsigned long a0, a1; int err; unsigned int len; @@ -2428,7 +2428,9 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) if (copy_from_user(a, args, len)) return -EFAULT; - audit_socketcall(nargs[call] / sizeof(unsigned long), a); + err = audit_socketcall(nargs[call] / sizeof(unsigned long), a); + if (err) + return err; a0 = a[0]; a1 = a[1]; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index a764e227fdde..fc2f78d6a9b4 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -52,6 +52,8 @@ #include <linux/sunrpc/gss_api.h> #include <asm/uaccess.h> +#include "../netns.h" + static const struct rpc_authops authgss_ops; static const struct rpc_credops gss_credops; @@ -85,8 +87,6 @@ struct gss_auth { }; /* pipe_version >= 0 if and only if someone has a pipe open. */ -static int pipe_version = -1; -static atomic_t pipe_users = ATOMIC_INIT(0); static DEFINE_SPINLOCK(pipe_version_lock); static struct rpc_wait_queue pipe_version_rpc_waitqueue; static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); @@ -266,24 +266,27 @@ struct gss_upcall_msg { char databuf[UPCALL_BUF_LEN]; }; -static int get_pipe_version(void) +static int get_pipe_version(struct net *net) { + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret; spin_lock(&pipe_version_lock); - if (pipe_version >= 0) { - atomic_inc(&pipe_users); - ret = pipe_version; + if (sn->pipe_version >= 0) { + atomic_inc(&sn->pipe_users); + ret = sn->pipe_version; } else ret = -EAGAIN; spin_unlock(&pipe_version_lock); return ret; } -static void put_pipe_version(void) +static void put_pipe_version(struct net *net) { - if (atomic_dec_and_lock(&pipe_users, &pipe_version_lock)) { - pipe_version = -1; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + if (atomic_dec_and_lock(&sn->pipe_users, &pipe_version_lock)) { + sn->pipe_version = -1; spin_unlock(&pipe_version_lock); } } @@ -291,9 +294,10 @@ static void put_pipe_version(void) static void gss_release_msg(struct gss_upcall_msg *gss_msg) { + struct net *net = rpc_net_ns(gss_msg->auth->client); if (!atomic_dec_and_test(&gss_msg->count)) return; - put_pipe_version(); + put_pipe_version(net); BUG_ON(!list_empty(&gss_msg->list)); if (gss_msg->ctx != NULL) gss_put_ctx(gss_msg->ctx); @@ -439,7 +443,10 @@ static void gss_encode_msg(struct gss_upcall_msg *gss_msg, struct rpc_clnt *clnt, const char *service_name) { - if (pipe_version == 0) + struct net *net = rpc_net_ns(clnt); + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); + + if (sn->pipe_version == 0) gss_encode_v0_msg(gss_msg); else /* pipe_version == 1 */ gss_encode_v1_msg(gss_msg, clnt, service_name); @@ -455,7 +462,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, struct rpc_clnt *clnt, gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS); if (gss_msg == NULL) return ERR_PTR(-ENOMEM); - vers = get_pipe_version(); + vers = get_pipe_version(rpc_net_ns(clnt)); if (vers < 0) { kfree(gss_msg); return ERR_PTR(vers); @@ -559,24 +566,34 @@ out: static inline int gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) { + struct net *net = rpc_net_ns(gss_auth->client); + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; struct rpc_cred *cred = &gss_cred->gc_base; struct gss_upcall_msg *gss_msg; + unsigned long timeout; DEFINE_WAIT(wait); - int err = 0; + int err; dprintk("RPC: %s for uid %u\n", __func__, from_kuid(&init_user_ns, cred->cr_uid)); retry: + err = 0; + /* Default timeout is 15s unless we know that gssd is not running */ + timeout = 15 * HZ; + if (!sn->gssd_running) + timeout = HZ >> 2; gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { err = wait_event_interruptible_timeout(pipe_version_waitqueue, - pipe_version >= 0, 15*HZ); - if (pipe_version < 0) { + sn->pipe_version >= 0, timeout); + if (sn->pipe_version < 0) { + if (err == 0) + sn->gssd_running = 0; warn_gssd(); err = -EACCES; } - if (err) + if (err < 0) goto out; goto retry; } @@ -707,20 +724,22 @@ out: static int gss_pipe_open(struct inode *inode, int new_version) { + struct net *net = inode->i_sb->s_fs_info; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); int ret = 0; spin_lock(&pipe_version_lock); - if (pipe_version < 0) { + if (sn->pipe_version < 0) { /* First open of any gss pipe determines the version: */ - pipe_version = new_version; + sn->pipe_version = new_version; rpc_wake_up(&pipe_version_rpc_waitqueue); wake_up(&pipe_version_waitqueue); - } else if (pipe_version != new_version) { + } else if (sn->pipe_version != new_version) { /* Trying to open a pipe of a different version */ ret = -EBUSY; goto out; } - atomic_inc(&pipe_users); + atomic_inc(&sn->pipe_users); out: spin_unlock(&pipe_version_lock); return ret; @@ -740,6 +759,7 @@ static int gss_pipe_open_v1(struct inode *inode) static void gss_pipe_release(struct inode *inode) { + struct net *net = inode->i_sb->s_fs_info; struct rpc_pipe *pipe = RPC_I(inode)->pipe; struct gss_upcall_msg *gss_msg; @@ -758,7 +778,7 @@ restart: } spin_unlock(&pipe->lock); - put_pipe_version(); + put_pipe_version(net); } static void @@ -867,8 +887,7 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) err = -EINVAL; gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); if (!gss_auth->mech) { - printk(KERN_WARNING "%s: Pseudoflavor %d not found!\n", - __func__, flavor); + dprintk("RPC: Pseudoflavor %d not found!\n", flavor); goto err_free; } gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 5c4c61d527e2..357f613df7ff 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -21,16 +21,6 @@ #include <linux/sunrpc/svcauth.h> #include "gss_rpc_xdr.h" -static bool gssx_check_pointer(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, 4); - if (unlikely(p == NULL)) - return -ENOSPC; - return *p?true:false; -} - static int gssx_enc_bool(struct xdr_stream *xdr, int v) { __be32 *p; @@ -264,25 +254,27 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, if (unlikely(p == NULL)) return -ENOSPC; count = be32_to_cpup(p++); - if (count != 0) { - /* we recognize only 1 currently: CREDS_VALUE */ - oa->count = 1; + if (!count) + return 0; - oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL); - if (!oa->data) - return -ENOMEM; + /* we recognize only 1 currently: CREDS_VALUE */ + oa->count = 1; - creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL); - if (!creds) { - kfree(oa->data); - return -ENOMEM; - } + oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL); + if (!oa->data) + return -ENOMEM; - oa->data[0].option.data = CREDS_VALUE; - oa->data[0].option.len = sizeof(CREDS_VALUE); - oa->data[0].value.data = (void *)creds; - oa->data[0].value.len = 0; + creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL); + if (!creds) { + kfree(oa->data); + return -ENOMEM; } + + oa->data[0].option.data = CREDS_VALUE; + oa->data[0].option.len = sizeof(CREDS_VALUE); + oa->data[0].value.data = (void *)creds; + oa->data[0].value.len = 0; + for (i = 0; i < count; i++) { gssx_buffer dummy = { 0, NULL }; u32 length; @@ -800,6 +792,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct xdr_stream *xdr, struct gssx_res_accept_sec_context *res) { + u32 value_follows; int err; /* res->status */ @@ -808,7 +801,10 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, return err; /* res->context_handle */ - if (gssx_check_pointer(xdr)) { + err = gssx_dec_bool(xdr, &value_follows); + if (err) + return err; + if (value_follows) { err = gssx_dec_ctx(xdr, res->context_handle); if (err) return err; @@ -817,7 +813,10 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, } /* res->output_token */ - if (gssx_check_pointer(xdr)) { + err = gssx_dec_bool(xdr, &value_follows); + if (err) + return err; + if (value_follows) { err = gssx_dec_buffer(xdr, res->output_token); if (err) return err; @@ -826,7 +825,10 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, } /* res->delegated_cred_handle */ - if (gssx_check_pointer(xdr)) { + err = gssx_dec_bool(xdr, &value_follows); + if (err) + return err; + if (value_follows) { /* we do not support upcall servers sending this data. */ return -EINVAL; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 3f7930f938cc..5a750b9c3640 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -360,7 +360,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru auth = rpcauth_create(args->authflavor, clnt); if (IS_ERR(auth)) { - printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n", + dprintk("RPC: Couldn't create auth handle (flavor %u)\n", args->authflavor); err = PTR_ERR(auth); goto out_no_auth; diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h index 7111a4c9113b..74d948f5d5a1 100644 --- a/net/sunrpc/netns.h +++ b/net/sunrpc/netns.h @@ -28,7 +28,11 @@ struct sunrpc_net { wait_queue_head_t gssp_wq; struct rpc_clnt *gssp_clnt; int use_gss_proxy; + int pipe_version; + atomic_t pipe_users; struct proc_dir_entry *use_gssp_proc; + + unsigned int gssd_running; }; extern int sunrpc_net_id; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index a9129f8d7070..e7ce4b3eb0bd 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -216,11 +216,14 @@ rpc_destroy_inode(struct inode *inode) static int rpc_pipe_open(struct inode *inode, struct file *filp) { + struct net *net = inode->i_sb->s_fs_info; + struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); struct rpc_pipe *pipe; int first_open; int res = -ENXIO; mutex_lock(&inode->i_mutex); + sn->gssd_running = 1; pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -1069,6 +1072,8 @@ void rpc_pipefs_init_net(struct net *net) struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); mutex_init(&sn->pipefs_sb_lock); + sn->gssd_running = 1; + sn->pipe_version = -1; } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f8529fc8e542..5356b120dbf8 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -324,11 +324,17 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); * Note: If the task is ASYNC, and is being made runnable after sitting on an * rpc_wait_queue, this must be called with the queue spinlock held to protect * the wait queue operation. + * Note the ordering of rpc_test_and_set_running() and rpc_clear_queued(), + * which is needed to ensure that __rpc_execute() doesn't loop (due to the + * lockless RPC_IS_QUEUED() test) before we've had a chance to test + * the RPC_TASK_RUNNING flag. */ static void rpc_make_runnable(struct rpc_task *task) { + bool need_wakeup = !rpc_test_and_set_running(task); + rpc_clear_queued(task); - if (rpc_test_and_set_running(task)) + if (!need_wakeup) return; if (RPC_IS_ASYNC(task)) { INIT_WORK(&task->u.tk_work, rpc_async_schedule); diff --git a/net/tipc/link.c b/net/tipc/link.c index daa6080a2a0c..a80feee5197a 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -2306,8 +2306,11 @@ static int link_recv_changeover_msg(struct tipc_link **l_ptr, struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf); u32 msg_typ = msg_type(tunnel_msg); u32 msg_count = msg_msgcnt(tunnel_msg); + u32 bearer_id = msg_bearer_id(tunnel_msg); - dest_link = (*l_ptr)->owner->links[msg_bearer_id(tunnel_msg)]; + if (bearer_id >= MAX_BEARERS) + goto exit; + dest_link = (*l_ptr)->owner->links[bearer_id]; if (!dest_link) goto exit; if (dest_link == *l_ptr) { @@ -2521,14 +2524,16 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm); u32 msg_sz = msg_size(imsg); u32 fragm_sz = msg_data_sz(fragm); - u32 exp_fragm_cnt = msg_sz/fragm_sz + !!(msg_sz % fragm_sz); + u32 exp_fragm_cnt; u32 max = TIPC_MAX_USER_MSG_SIZE + NAMED_H_SIZE; + if (msg_type(imsg) == TIPC_MCAST_MSG) max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE; - if (msg_size(imsg) > max) { + if (fragm_sz == 0 || msg_size(imsg) > max) { kfree_skb(fbuf); return 0; } + exp_fragm_cnt = msg_sz / fragm_sz + !!(msg_sz % fragm_sz); pbuf = tipc_buf_acquire(msg_size(imsg)); if (pbuf != NULL) { pbuf->next = *pending; diff --git a/net/wireless/core.c b/net/wireless/core.c index 84c9ad7e1dca..73405e00c800 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -638,17 +638,21 @@ int wiphy_register(struct wiphy *wiphy) * cfg80211_mutex lock */ res = rfkill_register(rdev->rfkill); - if (res) - goto out_rm_dev; + if (res) { + device_del(&rdev->wiphy.dev); + + mutex_lock(&cfg80211_mutex); + debugfs_remove_recursive(rdev->wiphy.debugfsdir); + list_del_rcu(&rdev->list); + wiphy_regulatory_deregister(wiphy); + mutex_unlock(&cfg80211_mutex); + return res; + } rtnl_lock(); rdev->wiphy.registered = true; rtnl_unlock(); return 0; - -out_rm_dev: - device_del(&rdev->wiphy.dev); - return res; } EXPORT_SYMBOL(wiphy_register); @@ -866,7 +870,6 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, #endif __cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); - cfg80211_mlme_down(rdev, dev); wdev_unlock(wdev); break; case NL80211_IFTYPE_MESH_POINT: diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index afa283841e8c..dfdb5e643211 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7577,6 +7577,8 @@ static int nl80211_send_wowlan_tcp(struct sk_buff *msg, &tcp->payload_tok)) return -ENOBUFS; + nla_nest_end(msg, nl_tcp); + return 0; } @@ -9970,6 +9972,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex)) || + nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) || nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || @@ -10010,6 +10013,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex)) || + nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie) || (ack && nla_put_flag(msg, NL80211_ATTR_ACK))) diff --git a/net/wireless/sme.c b/net/wireless/sme.c index a9dc5c736df0..8b5eddfba1e5 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -961,7 +961,7 @@ int __cfg80211_disconnect(struct cfg80211_registered_device *rdev, /* was it connected by userspace SME? */ if (!wdev->conn) { cfg80211_mlme_down(rdev, dev); - return 0; + goto disconnect; } if (wdev->sme_state == CFG80211_SME_CONNECTING && @@ -987,6 +987,7 @@ int __cfg80211_disconnect(struct cfg80211_registered_device *rdev, return err; } + disconnect: if (wdev->sme_state == CFG80211_SME_CONNECTED) __cfg80211_disconnected(dev, NULL, 0, 0, false); else if (wdev->sme_state == CFG80211_SME_CONNECTING) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index ecd4fcec3c94..5755bc14abbd 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2441,6 +2441,7 @@ TRACE_EVENT(cfg80211_report_wowlan_wakeup, TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY + __field(bool, non_wireless) __field(bool, disconnect) __field(bool, magic_pkt) __field(bool, gtk_rekey_failure) @@ -2449,20 +2450,22 @@ TRACE_EVENT(cfg80211_report_wowlan_wakeup, __field(bool, rfkill_release) __field(s32, pattern_idx) __field(u32, packet_len) - __dynamic_array(u8, packet, wakeup->packet_present_len) + __dynamic_array(u8, packet, + wakeup ? wakeup->packet_present_len : 0) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; - __entry->disconnect = wakeup->disconnect; - __entry->magic_pkt = wakeup->magic_pkt; - __entry->gtk_rekey_failure = wakeup->gtk_rekey_failure; - __entry->eap_identity_req = wakeup->eap_identity_req; - __entry->four_way_handshake = wakeup->four_way_handshake; - __entry->rfkill_release = wakeup->rfkill_release; - __entry->pattern_idx = wakeup->pattern_idx; - __entry->packet_len = wakeup->packet_len; - if (wakeup->packet && wakeup->packet_present_len) + __entry->non_wireless = !wakeup; + __entry->disconnect = wakeup ? wakeup->disconnect : false; + __entry->magic_pkt = wakeup ? wakeup->magic_pkt : false; + __entry->gtk_rekey_failure = wakeup ? wakeup->gtk_rekey_failure : false; + __entry->eap_identity_req = wakeup ? wakeup->eap_identity_req : false; + __entry->four_way_handshake = wakeup ? wakeup->four_way_handshake : false; + __entry->rfkill_release = wakeup ? wakeup->rfkill_release : false; + __entry->pattern_idx = wakeup ? wakeup->pattern_idx : false; + __entry->packet_len = wakeup ? wakeup->packet_len : false; + if (wakeup && wakeup->packet && wakeup->packet_present_len) memcpy(__get_dynamic_array(packet), wakeup->packet, wakeup->packet_present_len); ), diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index bcfda8921b5b..0cf003dfa8fc 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -64,6 +64,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) if (unlikely(x->km.state != XFRM_STATE_VALID)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEINVALID); + err = -EINVAL; goto error; } |