Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--  net/packet/af_packet.c  645
1 file changed, 347 insertions, 298 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b5ab98ca2511..494d628d10a5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
 static int prb_queue_frozen(struct tpacket_kbdq_core *);
 static void prb_open_block(struct tpacket_kbdq_core *,
 		struct tpacket_block_desc *);
-static void prb_retire_rx_blk_timer_expired(struct timer_list *);
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
 		struct tpacket3_hdr *);
@@ -270,8 +269,11 @@ static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
 }
 #endif
 
-static int packet_direct_xmit(struct sk_buff *skb)
+static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb)
 {
+	if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS))
+		return dev_queue_xmit(skb);
+
 #ifdef CONFIG_NETFILTER_EGRESS
 	if (nf_hook_egress_active()) {
 		skb = nf_hook_direct_egress(skb);
@@ -305,11 +307,6 @@ static void packet_cached_dev_reset(struct packet_sock *po)
 	RCU_INIT_POINTER(po->cached_dev, NULL);
 }
 
-static bool packet_use_direct_xmit(const struct packet_sock *po)
-{
-	return po->xmit == packet_direct_xmit;
-}
-
 static u16 packet_pick_tx_queue(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
@@ -339,14 +336,14 @@ static void __register_prot_hook(struct sock *sk)
 {
 	struct packet_sock *po = pkt_sk(sk);
 
-	if (!po->running) {
+	if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
 		if (po->fanout)
 			__fanout_link(sk, po);
 		else
 			dev_add_pack(&po->prot_hook);
 
 		sock_hold(sk);
-		po->running = 1;
+		packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1);
 	}
 }
 
@@ -368,7 +365,7 @@ static void __unregister_prot_hook(struct sock *sk, bool sync)
 
 	lockdep_assert_held_once(&po->bind_lock);
 
-	po->running = 0;
+	packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);
 
 	if (po->fanout)
 		__fanout_unlink(sk, po);
@@ -388,7 +385,7 @@ static void unregister_prot_hook(struct sock *sk, bool sync)
 {
 	struct packet_sock *po = pkt_sk(sk);
 
-	if (po->running)
+	if (packet_sock_flag(po, PACKET_SOCK_RUNNING))
 		__unregister_prot_hook(sk, sync);
 }
 
@@ -403,18 +400,20 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 {
 	union tpacket_uhdr h;
 
+	/* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */
+
 	h.raw = frame;
 	switch (po->tp_version) {
 	case TPACKET_V1:
-		h.h1->tp_status = status;
+		WRITE_ONCE(h.h1->tp_status, status);
 		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
 		break;
 	case TPACKET_V2:
-		h.h2->tp_status = status;
+		WRITE_ONCE(h.h2->tp_status, status);
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
 	case TPACKET_V3:
-		h.h3->tp_status = status;
+		WRITE_ONCE(h.h3->tp_status, status);
 		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
 		break;
 	default:
@@ -431,17 +430,19 @@ static int __packet_get_status(const struct packet_sock *po, void *frame)
 
 	smp_rmb();
 
+	/* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */
+
 	h.raw = frame;
 	switch (po->tp_version) {
 	case TPACKET_V1:
 		flush_dcache_page(pgv_to_page(&h.h1->tp_status));
-		return h.h1->tp_status;
+		return READ_ONCE(h.h1->tp_status);
 	case TPACKET_V2:
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
-		return h.h2->tp_status;
+		return READ_ONCE(h.h2->tp_status);
 	case TPACKET_V3:
 		flush_dcache_page(pgv_to_page(&h.h3->tp_status));
-		return h.h3->tp_status;
+		return READ_ONCE(h.h3->tp_status);
 	default:
 		WARN(1, "TPACKET version not supported.\n");
 		BUG();
@@ -473,7 +474,7 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
 	struct timespec64 ts;
 	__u32 ts_status;
 
-	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+	if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp))))
 		return 0;
 
 	h.raw = frame;
@@ -536,33 +537,54 @@ static void *packet_current_frame(struct packet_sock *po,
 	return packet_lookup_frame(po, rb, rb->head, status);
 }
 
-static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
+static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev)
 {
-	del_timer_sync(&pkc->retire_blk_timer);
+	struct vlan_hdr vhdr, *vh;
+	unsigned int header_len;
+
+	if (!dev)
+		return 0;
+
+	/* In the SOCK_DGRAM scenario, skb data starts at the network
+	 * protocol, which is after the VLAN headers. The outer VLAN
+	 * header is at the hard_header_len offset in non-variable
+	 * length link layer headers. If it's a VLAN device, the
+	 * min_header_len should be used to exclude the VLAN header
+	 * size.
+	 */
+	if (dev->min_header_len == dev->hard_header_len)
+		header_len = dev->hard_header_len;
+	else if (is_vlan_dev(dev))
+		header_len = dev->min_header_len;
+	else
+		return 0;
+
+	vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len,
+				sizeof(vhdr), &vhdr);
+	if (unlikely(!vh))
+		return 0;
+
+	return ntohs(vh->h_vlan_TCI);
 }
 
-static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
-		struct sk_buff_head *rb_queue)
+static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
 {
-	struct tpacket_kbdq_core *pkc;
+	__be16 proto = skb->protocol;
 
-	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-
-	spin_lock_bh(&rb_queue->lock);
-	pkc->delete_blk_timer = 1;
-	spin_unlock_bh(&rb_queue->lock);
+	if (unlikely(eth_type_vlan(proto)))
+		proto = __vlan_get_protocol_offset(skb, proto,
+						   skb_mac_offset(skb), NULL);
 
-	prb_del_retire_blk_timer(pkc);
+	return proto;
 }
 
-static void prb_setup_retire_blk_timer(struct packet_sock *po)
+static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
+					  struct sk_buff_head *rb_queue)
 {
 	struct tpacket_kbdq_core *pkc;
 
 	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
 
-	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
-		    0);
-	pkc->retire_blk_timer.expires = jiffies;
+	hrtimer_cancel(&pkc->retire_blk_timer);
 }
 
 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
@@ -626,60 +648,39 @@ static void init_prb_bdqc(struct packet_sock *po,
 	p1->knum_blocks = req_u->req3.tp_block_nr;
 	p1->hdrlen = po->tp_hdrlen;
 	p1->version = po->tp_version;
-	p1->last_kactive_blk_num = 0;
 	po->stats.stats3.tp_freeze_q_cnt = 0;
 	if (req_u->req3.tp_retire_blk_tov)
-		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+		p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
 	else
-		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
-						req_u->req3.tp_block_size);
-	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+		p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
+						req_u->req3.tp_block_size));
 	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
 	rwlock_init(&p1->blk_fill_in_prog_lock);
 
 	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
 	prb_init_ft_ops(p1, req_u);
-	prb_setup_retire_blk_timer(po);
+	hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
+		      CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+	hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
+		      HRTIMER_MODE_REL_SOFT);
 	prb_open_block(p1, pbd);
 }
 
-/* Do NOT update the last_blk_num first.
- * Assumes sk_buff_head lock is held.
- */
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
-	mod_timer(&pkc->retire_blk_timer,
-			jiffies + pkc->tov_in_jiffies);
-	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
-}
-
 /*
- * Timer logic:
- * 1) We refresh the timer only when we open a block.
- *    By doing this we don't waste cycles refreshing the timer
- *    on packet-by-packet basis.
- *
  * With a 1MB block-size, on a 1Gbps line, it will take
  * i) ~8 ms to fill a block + ii) memcpy etc.
  * In this cut we are not accounting for the memcpy time.
  *
- * So, if the user sets the 'tmo' to 10ms then the timer
- * will never fire while the block is still getting filled
- * (which is what we want). However, the user could choose
- * to close a block early and that's fine.
- *
- * But when the timer does fire, we check whether or not to refresh it.
  * Since the tmo granularity is in msecs, it is not too expensive
  * to refresh the timer, lets say every '8' msecs.
  * Either the user can set the 'tmo' or we can derive it based on
  * a) line-speed and b) block-size.
  * prb_calc_retire_blk_tmo() calculates the tmo.
- *
  */
-static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
 {
 	struct packet_sock *po =
-		from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
+		timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
 	struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
 	unsigned int frozen;
 	struct tpacket_block_desc *pbd;
@@ -689,9 +690,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 	frozen = prb_queue_frozen(pkc);
 	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
 
-	if (unlikely(pkc->delete_blk_timer))
-		goto out;
-
 	/* We only need to plug the race when the block is partially filled.
 	 * tpacket_rcv:
 	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
@@ -707,46 +705,31 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 		write_unlock(&pkc->blk_fill_in_prog_lock);
 	}
 
-	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
-		if (!frozen) {
-			if (!BLOCK_NUM_PKTS(pbd)) {
-				/* An empty block. Just refresh the timer. */
-				goto refresh_timer;
-			}
+	if (!frozen) {
+		if (BLOCK_NUM_PKTS(pbd)) {
+			/* Not an empty block. Need retire the block. */
 			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
-			if (!prb_dispatch_next_block(pkc, po))
-				goto refresh_timer;
-			else
-				goto out;
-		} else {
-			/* Case 1. Queue was frozen because user-space was
-			 * lagging behind.
+			prb_dispatch_next_block(pkc, po);
+		}
+	} else {
+		/* Case 1. Queue was frozen because user-space was
+		 * lagging behind.
+		 */
+		if (!prb_curr_blk_in_use(pbd)) {
+			/* Case 2. queue was frozen,user-space caught up,
+			 * now the link went idle && the timer fired.
+			 * We don't have a block to close.So we open this
+			 * block and restart the timer.
+			 * opening a block thaws the queue,restarts timer
+			 * Thawing/timer-refresh is a side effect.
 			 */
-			if (prb_curr_blk_in_use(pbd)) {
-				/*
-				 * Ok, user-space is still behind.
-				 * So just refresh the timer.
-				 */
-				goto refresh_timer;
-			} else {
-				/* Case 2. queue was frozen,user-space caught up,
-				 * now the link went idle && the timer fired.
-				 * We don't have a block to close.So we open this
-				 * block and restart the timer.
-				 * opening a block thaws the queue,restarts timer
-				 * Thawing/timer-refresh is a side effect.
-				 */
-				prb_open_block(pkc, pbd);
-				goto out;
-			}
+			prb_open_block(pkc, pbd);
 		}
 	}
 
-refresh_timer:
-	_prb_refresh_rx_retire_blk_timer(pkc);
-
-out:
+	hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
 	spin_unlock(&po->sk.sk_receive_queue.lock);
+	return HRTIMER_RESTART;
 }
 
 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
@@ -840,11 +823,18 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
 }
 
 /*
- * Side effect of opening a block:
+ * prb_open_block is called by tpacket_rcv or timer callback.
  *
- * 1) prb_queue is thawed.
- * 2) retire_blk_timer is refreshed.
+ * Reasons why NOT update hrtimer in prb_open_block:
+ * 1) It will increase complexity to distinguish the two caller scenario.
+ * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update
+ * TMO of an already enqueued hrtimer, leading to complex shutdown logic.
  *
+ * One side effect of NOT update hrtimer when called by tpacket_rcv is that
+ * a newly opened block triggered by tpacket_rcv may be retired earlier than
+ * expected. On the other hand, if timeout is updated in prb_open_block, the
+ * frequent reception of network packets that leads to prb_open_block being
+ * called may cause hrtimer to be removed and enqueued repeatedly.
  */
 static void prb_open_block(struct tpacket_kbdq_core *pkc1,
 	struct tpacket_block_desc *pbd1)
@@ -878,7 +868,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
 	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
 
 	prb_thaw_queue(pkc1);
-	_prb_refresh_rx_retire_blk_timer(pkc1);
 
 	smp_wmb();
 }
@@ -1005,10 +994,16 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
 			struct tpacket3_hdr *ppd)
 {
+	struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc);
+
 	if (skb_vlan_tag_present(pkc->skb)) {
 		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
 		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
 		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+	} else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) {
+		ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev);
+		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol);
+		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 	} else {
 		ppd->hv1.tp_vlan_tci = 0;
 		ppd->hv1.tp_vlan_tpid = 0;
@@ -1306,22 +1301,23 @@ static int __packet_rcv_has_room(const struct packet_sock *po,
 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 {
-	int pressure, ret;
+	bool pressure;
+	int ret;
 
 	ret = __packet_rcv_has_room(po, skb);
 	pressure = ret != ROOM_NORMAL;
 
-	if (READ_ONCE(po->pressure) != pressure)
-		WRITE_ONCE(po->pressure, pressure);
+	if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure)
+		packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure);
 
 	return ret;
 }
 
 static void packet_rcv_try_clear_pressure(struct packet_sock *po)
 {
-	if (READ_ONCE(po->pressure) &&
+	if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) &&
 	    __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
-		WRITE_ONCE(po->pressure, 0);
+		packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false);
 }
 
 static void packet_sock_destruct(struct sock *sk)
@@ -1335,8 +1331,6 @@ static void packet_sock_destruct(struct sock *sk)
 		pr_err("Attempt to release alive packet socket: %p\n", sk);
 		return;
 	}
-
-	sk_refcnt_debug_dec(sk);
 }
 
 static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
@@ -1410,7 +1404,8 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
 	i = j = min_t(int, po->rollover->sock, num - 1);
 	do {
 		po_next = pkt_sk(rcu_dereference(f->arr[i]));
-		if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
+		if (po_next != po_skip &&
+		    !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) &&
 		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
 			if (i != j)
 				po->rollover->sock = i;
@@ -1783,21 +1778,22 @@ static int fanout_add(struct sock *sk, struct fanout_args *args)
 	err = -EINVAL;
 
 	spin_lock(&po->bind_lock);
-	if (po->running &&
+	if (po->num &&
 	    match->type == type &&
 	    match->prot_hook.type == po->prot_hook.type &&
 	    match->prot_hook.dev == po->prot_hook.dev) {
 		err = -ENOSPC;
 		if (refcount_read(&match->sk_ref) < match->max_num_members) {
-			__dev_remove_pack(&po->prot_hook);
-
 			/* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
 			WRITE_ONCE(po->fanout, match);
 
 			po->rollover = rollover;
 			rollover = NULL;
 			refcount_set(&match->sk_ref,
 				     refcount_read(&match->sk_ref) + 1);
-			__fanout_link(sk, po);
+			if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
+				__dev_remove_pack(&po->prot_hook);
+				__fanout_link(sk, po);
+			}
 			err = 0;
 		}
 	}
@@ -1936,10 +1932,8 @@ static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
 	/* Move network header to the right position for VLAN tagged packets */
 	if (likely(skb->dev->type == ARPHRD_ETHER) &&
 	    eth_type_vlan(skb->protocol) &&
-	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0) {
-		if (pskb_may_pull(skb, depth))
-			skb_set_network_header(skb, depth);
-	}
+	    vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
+		skb_set_network_header(skb, depth);
 
 	skb_probe_transport_header(skb);
 }
@@ -2035,7 +2029,7 @@ retry:
 		goto retry;
 	}
 
-	if (!dev_validate_header(dev, skb->data, len)) {
+	if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
 		err = -EINVAL;
 		goto out_unlock;
 	}
@@ -2054,11 +2048,10 @@ retry:
 
 	skb->protocol = proto;
 	skb->dev = dev;
-	skb->priority = sk->sk_priority;
-	skb->mark = sk->sk_mark;
-	skb->tstamp = sockc.transmit_time;
-
-	skb_setup_tx_timestamp(skb, sockc.tsflags);
+	skb->priority = sockc.priority;
+	skb->mark = sockc.mark;
+	skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);
+	skb_setup_tx_timestamp(skb, &sockc);
 
 	if (unlikely(extra_len == 4))
 		skb->no_fcs = 1;
@@ -2092,18 +2085,18 @@ static unsigned int run_filter(struct sk_buff *skb,
 }
 
 static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
-			   size_t *len)
+			   size_t *len, int vnet_hdr_sz)
 {
-	struct virtio_net_hdr vnet_hdr;
+	struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
 
-	if (*len < sizeof(vnet_hdr))
+	if (*len < vnet_hdr_sz)
 		return -EINVAL;
-	*len -= sizeof(vnet_hdr);
+	*len -= vnet_hdr_sz;
 
-	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
+	if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
 		return -EINVAL;
 
-	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
+	return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
 }
 
 /*
@@ -2121,13 +2114,13 @@
 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 		      struct packet_type *pt, struct net_device *orig_dev)
 {
-	struct sock *sk;
+	enum skb_drop_reason drop_reason = SKB_CONSUMED;
+	struct sock *sk = NULL;
 	struct sockaddr_ll *sll;
 	struct packet_sock *po;
 	u8 *skb_head = skb->data;
 	int skb_len = skb->len;
 	unsigned int snaplen, res;
-	bool is_drop_n_account = false;
 
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto drop;
@@ -2156,7 +2149,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	snaplen = skb->len;
+	snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
 
 	res = run_filter(skb, sk, snaplen);
 	if (!res)
@@ -2185,7 +2178,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 	sll = &PACKET_SKB_CB(skb)->sa.ll;
 	sll->sll_hatype = dev->type;
 	sll->sll_pkttype = skb->pkt_type;
-	if (unlikely(po->origdev))
+	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
 		sll->sll_ifindex = orig_dev->ifindex;
 	else
 		sll->sll_ifindex = dev->ifindex;
@@ -2217,9 +2210,9 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 	return 0;
 
 drop_n_acct:
-	is_drop_n_account = true;
 	atomic_inc(&po->tp_drops);
-	atomic_inc(&sk->sk_drops);
+	sk_drops_inc(sk);
+	drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;
 
 drop_n_restore:
 	if (skb_head != skb->data && skb_shared(skb)) {
@@ -2227,17 +2220,15 @@ drop_n_restore:
 		skb->len = skb_len;
 	}
 drop:
-	if (!is_drop_n_account)
-		consume_skb(skb);
-	else
-		kfree_skb(skb);
+	sk_skb_reason_drop(sk, skb, drop_reason);
 	return 0;
 }
 
 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		       struct packet_type *pt, struct net_device *orig_dev)
 {
-	struct sock *sk;
+	enum skb_drop_reason drop_reason = SKB_CONSUMED;
+	struct sock *sk = NULL;
 	struct packet_sock *po;
 	struct sockaddr_ll *sll;
 	union tpacket_uhdr h;
@@ -2250,9 +2241,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	struct sk_buff *copy_skb = NULL;
 	struct timespec64 ts;
 	__u32 ts_status;
-	bool is_drop_n_account = false;
 	unsigned int slot_id = 0;
-	bool do_vnet = false;
+	int vnet_hdr_sz = 0;
 
 	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
 	 * We may add members to them until current aligned size without forcing
@@ -2279,7 +2269,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	snaplen = skb->len;
+	snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);
 
 	res = run_filter(skb, sk, snaplen);
 	if (!res)
@@ -2296,6 +2286,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	else if (skb->pkt_type != PACKET_OUTGOING &&
 		 skb_csum_unnecessary(skb))
 		status |= TP_STATUS_CSUM_VALID;
+	if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
+		status |= TP_STATUS_GSO_TCP;
 
 	if (snaplen > res)
 		snaplen = res;
@@ -2308,10 +2300,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		netoff = TPACKET_ALIGN(po->tp_hdrlen +
 				       (maclen < 16 ? 16 : maclen)) +
 				       po->tp_reserve;
-		if (po->has_vnet_hdr) {
-			netoff += sizeof(struct virtio_net_hdr);
-			do_vnet = true;
-		}
+		vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
+		if (vnet_hdr_sz)
+			netoff += vnet_hdr_sz;
 		macoff = netoff - maclen;
 	}
 	if (netoff > USHRT_MAX) {
@@ -2320,7 +2311,7 @@
 	if (po->tp_version <= TPACKET_V2) {
 		if (macoff + snaplen > po->rx_ring.frame_size) {
-			if (po->copy_thresh &&
+			if (READ_ONCE(po->copy_thresh) &&
 			    atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
 				if (skb_shared(skb)) {
 					copy_skb = skb_clone(skb, GFP_ATOMIC);
@@ -2337,7 +2328,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			snaplen = po->rx_ring.frame_size - macoff;
 			if ((int)snaplen < 0) {
 				snaplen = 0;
-				do_vnet = false;
+				vnet_hdr_sz = 0;
 			}
 		}
 	} else if (unlikely(macoff + snaplen >
@@ -2351,7 +2342,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		if (unlikely((int)snaplen < 0)) {
 			snaplen = 0;
 			macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
-			do_vnet = false;
+			vnet_hdr_sz = 0;
 		}
 	}
 	spin_lock(&sk->sk_receive_queue.lock);
@@ -2367,7 +2358,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		__set_bit(slot_id, po->rx_ring.rx_owner_map);
 	}
 
-	if (do_vnet &&
+	if (vnet_hdr_sz &&
 	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
 				    sizeof(struct virtio_net_hdr),
 				    vio_le(), true, 0)) {
@@ -2402,7 +2393,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	 * closer to the time of capture.
 	 */
 	ts_status = tpacket_get_timestamp(skb, &ts,
-					  po->tp_tstamp | SOF_TIMESTAMPING_SOFTWARE);
+					  READ_ONCE(po->tp_tstamp) |
+					  SOF_TIMESTAMPING_SOFTWARE);
 	if (!ts_status)
 		ktime_get_real_ts64(&ts);
@@ -2429,6 +2421,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
 			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
 			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+		} else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
+			h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev);
+			h.h2->tp_vlan_tpid = ntohs(skb->protocol);
+			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 		} else {
 			h.h2->tp_vlan_tci = 0;
 			h.h2->tp_vlan_tpid = 0;
@@ -2458,9 +2454,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
 	sll->sll_family = AF_PACKET;
 	sll->sll_hatype = dev->type;
-	sll->sll_protocol = skb->protocol;
+	sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ?
+		vlan_get_protocol_dgram(skb) : skb->protocol;
 	sll->sll_pkttype = skb->pkt_type;
-	if (unlikely(po->origdev))
+	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
 		sll->sll_ifindex = orig_dev->ifindex;
 	else
 		sll->sll_ifindex = dev->ifindex;
@@ -2496,19 +2493,16 @@ drop_n_restore:
 		skb->len = skb_len;
 	}
 drop:
-	if (!is_drop_n_account)
-		consume_skb(skb);
-	else
-		kfree_skb(skb);
+	sk_skb_reason_drop(sk, skb, drop_reason);
 	return 0;
 
 drop_n_account:
 	spin_unlock(&sk->sk_receive_queue.lock);
 	atomic_inc(&po->tp_drops);
-	is_drop_n_account = true;
+	drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;
 
 	sk->sk_data_ready(sk);
-	kfree_skb(copy_skb);
+	sk_skb_reason_drop(sk, copy_skb, drop_reason);
 	goto drop_n_restore;
 }
 
@@ -2526,8 +2520,7 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
 		ts = __packet_set_timestamp(po, ph, skb);
 		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
 
-		if (!packet_read_pending(&po->tx_ring))
-			complete(&po->skb_completion);
+		complete(&po->skb_completion);
 	}
 
 	sock_wfree(skb);
@@ -2550,16 +2543,26 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
 }
 
 static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
-				 struct virtio_net_hdr *vnet_hdr)
+				 struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
 {
-	if (*len < sizeof(*vnet_hdr))
+	int ret;
+
+	if (*len < vnet_hdr_sz)
 		return -EINVAL;
-	*len -= sizeof(*vnet_hdr);
+	*len -= vnet_hdr_sz;
 
 	if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
 		return -EFAULT;
 
-	return __packet_snd_vnet_parse(vnet_hdr, *len);
+	ret = __packet_snd_vnet_parse(vnet_hdr, *len);
+	if (ret)
+		return ret;
+
+	/* move iter to point to the start of mac header */
+	if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
+		iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));
+
+	return 0;
 }
 
 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
@@ -2577,10 +2580,10 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 
 	skb->protocol = proto;
 	skb->dev = dev;
-	skb->priority = po->sk.sk_priority;
-	skb->mark = po->sk.sk_mark;
-	skb->tstamp = sockc->transmit_time;
-	skb_setup_tx_timestamp(skb, sockc->tsflags);
+	skb->priority = sockc->priority;
+	skb->mark = sockc->mark;
+	skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid);
+	skb_setup_tx_timestamp(skb, sockc);
 	skb_zcopy_set_nouarg(skb, ph.raw);
 
 	skb_reserve(skb, hlen);
@@ -2621,8 +2624,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
 
 		nr_frags = skb_shinfo(skb)->nr_frags;
 		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
-			pr_err("Packet exceed the number of skb frags(%lu)\n",
-			       MAX_SKB_FRAGS);
+			pr_err("Packet exceed the number of skb frags(%u)\n",
+			       (unsigned int)MAX_SKB_FRAGS);
 			return -EFAULT;
 		}
 
@@ -2670,7 +2673,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
 		return -EMSGSIZE;
 	}
 
-	if (unlikely(po->tp_tx_has_off)) {
+	if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) {
 		int off_min, off_max;
 
 		off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
@@ -2721,13 +2724,14 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	void *ph;
 	DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
 	bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
+	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
 	unsigned char *addr = NULL;
 	int tp_len, size_max;
 	void *data;
 	int len_sum = 0;
 	int status = TP_STATUS_AVAILABLE;
 	int hlen, tlen, copylen = 0;
-	long timeo = 0;
+	long timeo;
 
 	mutex_lock(&po->pg_vec_lock);
 
@@ -2778,25 +2782,31 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 	size_max = po->tx_ring.frame_size
 		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
 
-	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
+	if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
 		size_max = dev->mtu + reserve + VLAN_HLEN;
 
+	timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
 	reinit_completion(&po->skb_completion);
 
 	do {
 		ph = packet_current_frame(po, &po->tx_ring,
 					  TP_STATUS_SEND_REQUEST);
 		if (unlikely(ph == NULL)) {
-			if (need_wait && skb) {
-				timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
+			/* Note: packet_read_pending() might be slow if we
+			 * have to call it as it's per_cpu variable, but in
+			 * fast-path we don't have to call it, only when ph
+			 * is NULL, we need to check the pending_refcnt.
+			 */
+			if (need_wait && packet_read_pending(&po->tx_ring)) {
				timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
 				if (timeo <= 0) {
 					err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
 					goto out_put;
 				}
-			}
-			/* check for additional frames */
-			continue;
+				/* check for additional frames */
+				continue;
+			} else
+				break;
 		}
 
 		skb = NULL;
@@ -2807,10 +2817,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 		status = TP_STATUS_SEND_REQUEST;
 		hlen = LL_RESERVED_SPACE(dev);
 		tlen = dev->needed_tailroom;
-		if (po->has_vnet_hdr) {
+		if (vnet_hdr_sz) {
 			vnet_hdr = data;
-			data += sizeof(*vnet_hdr);
-			tp_len -= sizeof(*vnet_hdr);
+			data += vnet_hdr_sz;
+			tp_len -= vnet_hdr_sz;
 			if (tp_len < 0 ||
 			    __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
 				tp_len = -EINVAL;
@@ -2835,13 +2845,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 					  addr, hlen, copylen, &sockc);
 		if (likely(tp_len >= 0) &&
 		    tp_len > dev->mtu + reserve &&
-		    !po->has_vnet_hdr &&
+		    !vnet_hdr_sz &&
 		    !packet_extra_vlan_len_allowed(dev, skb))
 			tp_len = -EMSGSIZE;
 
 		if (unlikely(tp_len < 0)) {
 tpacket_error:
-			if (po->tp_loss) {
+			if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) {
 				__packet_set_status(po, ph,
 						    TP_STATUS_AVAILABLE);
 				packet_increment_head(&po->tx_ring);
@@ -2854,7 +2864,7 @@ tpacket_error:
 			}
 		}
 
-		if (po->has_vnet_hdr) {
+		if (vnet_hdr_sz) {
 			if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
 				tp_len = -EINVAL;
 				goto tpacket_error;
@@ -2867,7 +2877,7 @@ tpacket_error:
 		packet_inc_pending(&po->tx_ring);
 
 		status = TP_STATUS_SEND_REQUEST;
-		err = po->xmit(skb);
+		err = packet_xmit(po, skb);
 		if (unlikely(err != 0)) {
 			if (err > 0)
 				err = net_xmit_errno(err);
@@ -2885,14 +2895,7 @@ tpacket_error:
 		}
 		packet_increment_head(&po->tx_ring);
 		len_sum += tp_len;
-	} while (likely((ph != NULL) ||
-		/* Note: packet_read_pending() might be slow if we have
-		 * to call it as it's per_cpu variable, but in fast-path
-		 * we already short-circuit the loop with the first
-		 * condition, and luckily don't have to go that path
-		 * anyway.
-		 */
-		 (need_wait && packet_read_pending(&po->tx_ring))));
+	} while (1);
 
 	err = len_sum;
 	goto out_put;
@@ -2918,8 +2921,10 @@ static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
 	if (prepad + len < PAGE_SIZE || !linear)
 		linear = len;
+	if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
+		linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
 
 	skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
-				   err, 0);
+				   err, PAGE_ALLOC_COSTLY_ORDER);
 	if (!skb)
 		return NULL;
 
@@ -2944,7 +2949,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 	struct virtio_net_hdr vnet_hdr = { 0 };
 	int offset = 0;
 	struct packet_sock *po = pkt_sk(sk);
-	bool has_vnet_hdr = false;
+	int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
 	int hlen, tlen, linear;
 	int extra_len = 0;
 
@@ -2979,7 +2984,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 		goto out_unlock;
 
 	sockcm_init(&sockc, sk);
-	sockc.mark = sk->sk_mark;
 	if (msg->msg_controllen) {
 		err = sock_cmsg_send(sk, msg, &sockc);
 		if (unlikely(err))
@@ -2988,11 +2992,10 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 	if (sock->type == SOCK_RAW)
 		reserve = dev->hard_header_len;
-	if (po->has_vnet_hdr) {
-		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
+	if (vnet_hdr_sz) {
+		err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
 		if (err)
 			goto out_unlock;
-		has_vnet_hdr = true;
 	}
 
 	if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
@@ -3043,7 +3046,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 		goto out_free;
 	}
 
-	skb_setup_tx_timestamp(skb, sockc.tsflags);
+	skb_setup_tx_timestamp(skb, &sockc);
 
 	if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
 	    !packet_extra_vlan_len_allowed(dev, skb)) {
@@ -3053,24 +3056,25 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
 
 	skb->protocol = proto;
 	skb->dev = dev;
-	skb->priority = sk->sk_priority;
+	skb->priority = sockc.priority;
 	skb->mark = sockc.mark;
-	skb->tstamp = sockc.transmit_time;
+	skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);
 
 	if (unlikely(extra_len == 4))
 		skb->no_fcs = 1;
 
 	packet_parse_headers(skb, sock);
 
-	if (has_vnet_hdr) {
+	if (vnet_hdr_sz) {
 		err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
 		if (err)
 			goto out_free;
-		len += sizeof(vnet_hdr);
+		len += vnet_hdr_sz;
 		virtio_net_hdr_set_proto(skb, &vnet_hdr);
 	}
 
-	err = po->xmit(skb);
+	err = packet_xmit(po, skb);
+
 	if (unlikely(err != 0)) {
 		if (err > 0)
 			err = net_xmit_errno(err);
@@ -3172,7 +3176,6 @@ static int packet_release(struct socket *sock)
 	skb_queue_purge(&sk->sk_receive_queue);
 
 	packet_free_pending(po);
-	sk_refcnt_debug_release(sk);
 
 	sock_put(sk);
 	return 0;
@@ -3193,6 +3196,9 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 	lock_sock(sk);
 	spin_lock(&po->bind_lock);
 
+	if (!proto)
+		proto = po->num;
+
 	rcu_read_lock();
 
 	if (po->fanout) {
@@ -3218,7 +3224,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 	if (need_rehook) {
 		dev_hold(dev);
-		if (po->running) {
+		if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
 			rcu_read_unlock();
 			/* prevents packet_notifier() from calling
 			 * register_prot_hook()
 			 */
@@ -3231,7 +3237,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
 					    dev->ifindex);
 		}
 
-		BUG_ON(po->running);
+		BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING));
 		WRITE_ONCE(po->num, proto);
 		po->prot_hook.type = proto;
 
@@ -3273,11 +3279,12 @@ out_unlock:
  *	Bind a packet socket to a device
  */
 
-static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
+static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr,
 			    int addr_len)
 {
 	struct sock *sk = sock->sk;
-	char name[sizeof(uaddr->sa_data_min) + 1];
+	struct sockaddr *sa = (struct sockaddr *)uaddr;
+	char name[sizeof(sa->sa_data) + 1];
 
 	/*
 	 *	Check legality
@@ -3288,13 +3295,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
 	/* uaddr->sa_data comes from the userspace, it's not guaranteed to be
 	 * zero-terminated.
 	 */
-	memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
-	name[sizeof(uaddr->sa_data_min)] = 0;
+	memcpy(name, sa->sa_data, sizeof(sa->sa_data));
+	name[sizeof(sa->sa_data)] = 0;
 
-	return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
+	return packet_do_bind(sk, name, 0, 0);
 }
 
-static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int packet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
 {
 	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
 	struct sock *sk = sock->sk;
@@ -3308,8 +3315,7 @@ static int packet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len
 	if (sll->sll_family != AF_PACKET)
 		return -EINVAL;
 
-	return packet_do_bind(sk, NULL, sll->sll_ifindex,
-			      sll->sll_protocol ? : pkt_sk(sk)->num);
+	return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol);
 }
 
 static struct proto packet_proto = {
@@ -3347,22 +3353,20 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	if (sock->type == SOCK_PACKET)
 		sock->ops = &packet_ops_spkt;
 
+	po = pkt_sk(sk);
+	err = packet_alloc_pending(po);
+	if (err)
+		goto out_sk_free;
+
 	sock_init_data(sock, sk);
 
-	po = pkt_sk(sk);
 	init_completion(&po->skb_completion);
 	sk->sk_family = PF_PACKET;
 	po->num = proto;
-	po->xmit = dev_queue_xmit;
-
-	err = packet_alloc_pending(po);
-	if (err)
-		goto out2;
 
 	packet_cached_dev_reset(po);
 
 	sk->sk_destruct = packet_sock_destruct;
-	sk_refcnt_debug_inc(sk);
 
 	/*
 	 *	Attach a protocol block
@@ -3391,7 +3395,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock_prot_inuse_add(net, &packet_proto, 1);
 
 	return 0;
-out2:
+out_sk_free:
 	sk_free(sk);
 out:
 	return err;
@@ -3408,7 +3412,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 	struct sock *sk = sock->sk;
 	struct sk_buff *skb;
 	int copied, err;
-	int vnet_hdr_len = 0;
+	int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
 	unsigned int origlen = 0;
 
 	err = -EINVAL;
@@ -3449,11 +3453,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 
 	packet_rcv_try_clear_pressure(pkt_sk(sk));
 
-	if (pkt_sk(sk)->has_vnet_hdr) {
-		err = packet_rcv_vnet(msg, skb, &len);
+	if (vnet_hdr_len) {
+		err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
 		if (err)
 			goto out_free;
-		vnet_hdr_len = sizeof(struct virtio_net_hdr);
 	}
 
 	/* You lose any data beyond the buffer you gave. If it worries
@@ -3476,7 +3479,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		/* Original length was stored in sockaddr_ll fields */
 		origlen = PACKET_SKB_CB(skb)->sa.origlen;
 		sll->sll_family = AF_PACKET;
-		sll->sll_protocol = skb->protocol;
+		sll->sll_protocol = (sock->type == SOCK_DGRAM) ?
+			vlan_get_protocol_dgram(skb) : skb->protocol;
 	}
 
 	sock_recv_cmsgs(msg, sk, skb);
@@ -3513,7 +3517,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
 	}
 
-	if (pkt_sk(sk)->auxdata) {
+	if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
 		struct tpacket_auxdata aux;
 
 		aux.tp_status = TP_STATUS_USER;
@@ -3522,6 +3526,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		else if (skb->pkt_type != PACKET_OUTGOING &&
 			 skb_csum_unnecessary(skb))
 			aux.tp_status |= TP_STATUS_CSUM_VALID;
+		if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
+			aux.tp_status |= TP_STATUS_GSO_TCP;
 
 		aux.tp_len = origlen;
 		aux.tp_snaplen = skb->len;
@@ -3531,6 +3537,21 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
 			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
 			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+		} else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
+			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
+			struct net_device *dev;
+
+			rcu_read_lock();
+			dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex);
+			if (dev) {
+				aux.tp_vlan_tci = vlan_get_tci(skb, dev);
+				aux.tp_vlan_tpid = ntohs(skb->protocol);
+				aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+			} else {
+				aux.tp_vlan_tci = 0;
+				aux.tp_vlan_tpid = 0;
+			}
+			rcu_read_unlock();
 		} else {
 			aux.tp_vlan_tci = 0;
 			aux.tp_vlan_tpid = 0;
@@ -3560,11 +3581,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
 		return -EOPNOTSUPP;
 
 	uaddr->sa_family = AF_PACKET;
-	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min));
+	memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex));
 	if (dev)
-		strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min));
+		strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
 	rcu_read_unlock();
 
 	return sizeof(*uaddr);
@@ -3592,7 +3613,12 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
 	if (dev) {
 		sll->sll_hatype = dev->type;
 		sll->sll_halen = dev->addr_len;
-		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
+
+		/* Let __fortify_memcpy_chk() know the actual buffer size. */
+		memcpy(((struct sockaddr_storage *)sll)->__data +
+		       offsetof(struct sockaddr_ll, sll_addr) -
+		       offsetofend(struct sockaddr_ll, sll_family),
+		       dev->dev_addr, dev->addr_len);
 	} else {
 		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
 		sll->sll_halen = 0;
@@ -3633,15 +3659,15 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
 }
 
 static void packet_dev_mclist_delete(struct net_device *dev,
-				     struct packet_mclist **mlp)
+				     struct packet_mclist **mlp,
+				     struct list_head *list)
 {
 	struct packet_mclist *ml;
 
 	while ((ml = *mlp) != NULL) {
 		if (ml->ifindex == dev->ifindex) {
-			packet_dev_mc(dev, ml, -1);
+			list_add(&ml->remove_list, list);
 			*mlp = ml->next;
-			kfree(ml);
 		} else
 			mlp = &ml->next;
 	}
}
@@ -3689,6 +3715,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
 	memcpy(i->addr, mreq->mr_address, i->alen);
 	memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
 	i->count = 1;
+	INIT_LIST_HEAD(&i->remove_list);
 	i->next = po->mclist;
 	po->mclist = i;
 	err = packet_dev_mc(dev, i, 1);
@@ -3786,28 +3813,30 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 	case PACKET_TX_RING:
 	{
 		union tpacket_req_u req_u;
-		int len;
 
+		ret = -EINVAL;
 		lock_sock(sk);
 		switch (po->tp_version) {
 		case TPACKET_V1:
 		case TPACKET_V2:
-			len = sizeof(req_u.req);
+			if (optlen < sizeof(req_u.req))
+				break;
+			ret = copy_from_sockptr(&req_u.req, optval,
						sizeof(req_u.req)) ?
						-EINVAL : 0;
 			break;
 		case TPACKET_V3:
 		default:
-			len = sizeof(req_u.req3);
+			if (optlen < sizeof(req_u.req3))
+				break;
+			ret = copy_from_sockptr(&req_u.req3, optval,
						sizeof(req_u.req3)) ?
						-EINVAL : 0;
 			break;
 		}
-		if (optlen < len) {
-			ret = -EINVAL;
-		} else {
-			if (copy_from_sockptr(&req_u.req, optval, len))
-				ret = -EFAULT;
-			else
-				ret = packet_set_ring(sk, &req_u, 0,
-						      optname == PACKET_TX_RING);
-		}
+		if (!ret)
+			ret = packet_set_ring(sk, &req_u, 0,
					      optname == PACKET_TX_RING);
 		release_sock(sk);
 		return ret;
 	}
@@ -3820,7 +3849,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (copy_from_sockptr(&val, optval, sizeof(val)))
 			return -EFAULT;
 
-		pkt_sk(sk)->copy_thresh = val;
+		WRITE_ONCE(pkt_sk(sk)->copy_thresh, val);
 		return 0;
 	}
 	case PACKET_VERSION:
@@ -3882,7 +3911,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
 			ret = -EBUSY;
 		} else {
-			po->tp_loss = !!val;
+			packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val);
 			ret = 0;
 		}
 		release_sock(sk);
@@ -3897,9 +3926,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (copy_from_sockptr(&val, optval, sizeof(val)))
 			return -EFAULT;
 
-		lock_sock(sk);
-		po->auxdata = !!val;
-		release_sock(sk);
+		packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
 		return 0;
 	}
 	case PACKET_ORIGDEV:
@@ -3911,14 +3938,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (copy_from_sockptr(&val, optval, sizeof(val)))
 			return -EFAULT;
 
-		lock_sock(sk);
-		po->origdev = !!val;
-		release_sock(sk);
+		packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
 		return 0;
 	}
 	case PACKET_VNET_HDR:
+	case PACKET_VNET_HDR_SZ:
 	{
-		int val;
+		int val, hdr_len;
 
 		if (sock->type != SOCK_RAW)
 			return -EINVAL;
@@ -3927,11 +3953,19 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (copy_from_sockptr(&val, optval, sizeof(val)))
 			return -EFAULT;
 
+		if (optname == PACKET_VNET_HDR_SZ) {
+			if (val && val != sizeof(struct virtio_net_hdr) &&
+			    val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
+				return -EINVAL;
+			hdr_len = val;
+		} else {
+			hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
+		}
 		lock_sock(sk);
 		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
 			ret = -EBUSY;
 		} else {
-			po->has_vnet_hdr = !!val;
+			WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
 			ret = 0;
 		}
 		release_sock(sk);
@@ -3946,7 +3980,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (copy_from_sockptr(&val, optval, sizeof(val)))
 			return -EFAULT;
 
-		po->tp_tstamp = val;
+		WRITE_ONCE(po->tp_tstamp, val);
 		return 0;
 	}
 	case PACKET_FANOUT:
@@ -3979,7 +4013,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (val < 0 || val > 1)
 			return -EINVAL;
 
-		po->prot_hook.ignore_outgoing = !!val;
+		WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val);
 		return 0;
 	}
 	case PACKET_TX_HAS_OFF:
@@ -3993,7 +4027,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		lock_sock(sk);
 		if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
-			po->tp_tx_has_off = !!val;
+			packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val);
 
 		release_sock(sk);
 		return 0;
@@ -4007,7 +4041,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
 		if (copy_from_sockptr(&val, optval, sizeof(val)))
 			return -EFAULT;
 
-		po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
+		packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val);
 		return 0;
 	}
 	default:
@@ -4058,13 +4092,19 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		break;
 	case PACKET_AUXDATA:
-		val = po->auxdata;
+		val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
 		break;
 	case PACKET_ORIGDEV:
-		val = po->origdev;
+		val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
 		break;
 	case PACKET_VNET_HDR:
-		val = po->has_vnet_hdr;
+		val = !!READ_ONCE(po->vnet_hdr_sz);
+		break;
+	case PACKET_VNET_HDR_SZ:
+		val = READ_ONCE(po->vnet_hdr_sz);
+		break;
+	case PACKET_COPY_THRESH:
+		val = READ_ONCE(pkt_sk(sk)->copy_thresh);
 		break;
 	case PACKET_VERSION:
 		val = po->tp_version;
@@ -4094,10 +4134,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		val = po->tp_reserve;
 		break;
 	case PACKET_LOSS:
-		val = po->tp_loss;
+		val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS);
 		break;
 	case PACKET_TIMESTAMP:
-		val = po->tp_tstamp;
+		val = READ_ONCE(po->tp_tstamp);
 		break;
 	case PACKET_FANOUT:
 		val = (po->fanout ?
@@ -4107,7 +4147,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		       0);
 		break;
 	case PACKET_IGNORE_OUTGOING:
-		val = po->prot_hook.ignore_outgoing;
+		val = READ_ONCE(po->prot_hook.ignore_outgoing);
 		break;
 	case PACKET_ROLLOVER_STATS:
 		if (!po->rollover)
@@ -4119,10 +4159,10 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		lv = sizeof(rstats);
 		break;
 	case PACKET_TX_HAS_OFF:
-		val = po->tp_tx_has_off;
+		val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF);
 		break;
 	case PACKET_QDISC_BYPASS:
-		val = packet_use_direct_xmit(po);
+		val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS);
 		break;
 	default:
 		return -ENOPROTOOPT;
@@ -4140,9 +4180,11 @@ static int packet_notifier(struct notifier_block *this,
 			   unsigned long msg, void *ptr)
 {
-	struct sock *sk;
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct net *net = dev_net(dev);
+	struct packet_mclist *ml, *tmp;
+	LIST_HEAD(mclist);
+	struct sock *sk;
 
 	rcu_read_lock();
 	sk_for_each_rcu(sk, &net->packet.sklist) {
@@ -4151,13 +4193,14 @@ static int packet_notifier(struct notifier_block *this,
 		switch (msg) {
 		case NETDEV_UNREGISTER:
 			if (po->mclist)
-				packet_dev_mclist_delete(dev, &po->mclist);
+				packet_dev_mclist_delete(dev, &po->mclist,
							 &mclist);
 			fallthrough;
 
 		case NETDEV_DOWN:
 			if (dev->ifindex == po->ifindex) {
 				spin_lock(&po->bind_lock);
-				if (po->running) {
+				if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
 					__unregister_prot_hook(sk, false);
 					sk->sk_err = ENETDOWN;
 					if (!sock_flag(sk, SOCK_DEAD))
@@ -4184,6 +4227,13 @@ static int packet_notifier(struct notifier_block *this,
 		}
 	}
 	rcu_read_unlock();
+
+	/* packet_dev_mc might grab instance locks so can't run under rcu */
+	list_for_each_entry_safe(ml, tmp, &mclist, remove_list) {
+		packet_dev_mc(dev, ml, -1);
+		kfree(ml);
+	}
+
 	return NOTIFY_DONE;
 }
 
@@ -4272,7 +4322,7 @@ static void packet_mm_open(struct vm_area_struct *vma)
 	struct sock *sk = sock->sk;
 
 	if (sk)
-		atomic_inc(&pkt_sk(sk)->mapped);
+		atomic_long_inc(&pkt_sk(sk)->mapped);
 }
 
 static void packet_mm_close(struct vm_area_struct *vma)
@@ -4282,7 +4332,7 @@ static void packet_mm_close(struct vm_area_struct *vma)
 	struct sock *sk = sock->sk;
 
 	if (sk)
-		atomic_dec(&pkt_sk(sk)->mapped);
+		atomic_long_dec(&pkt_sk(sk)->mapped);
 }
 
 static const struct vm_operations_struct packet_mmap_ops = {
@@ -4377,7 +4427,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 
 	err = -EBUSY;
 	if (!closing) {
-		if (atomic_read(&po->mapped))
+		if (atomic_long_read(&po->mapped))
 			goto out;
 		if (packet_read_pending(rb))
 			goto out;
@@ -4468,19 +4518,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 
 	/* Detach socket from network */
 	spin_lock(&po->bind_lock);
-	was_running = po->running;
+	was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING);
 	num = po->num;
-	if (was_running) {
-		WRITE_ONCE(po->num, 0);
+	WRITE_ONCE(po->num, 0);
+	if (was_running)
 		__unregister_prot_hook(sk, false);
-	}
+
 	spin_unlock(&po->bind_lock);
 
 	synchronize_net();
 
 	err = -EBUSY;
 	mutex_lock(&po->pg_vec_lock);
-	if (closing || atomic_read(&po->mapped) == 0) {
+	if (closing || atomic_long_read(&po->mapped) == 0) {
 		err = 0;
 		spin_lock_bh(&rb_queue->lock);
 		swap(rb->pg_vec, pg_vec);
@@ -4498,17 +4548,17 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
 		skb_queue_purge(rb_queue);
-		if (atomic_read(&po->mapped))
-			pr_err("packet_mmap: vma is busy: %d\n",
-			       atomic_read(&po->mapped));
+		if (atomic_long_read(&po->mapped))
+			pr_err("packet_mmap: vma is busy: %ld\n",
+			       atomic_long_read(&po->mapped));
 	}
 	mutex_unlock(&po->pg_vec_lock);

 	spin_lock(&po->bind_lock);
-	if (was_running) {
-		WRITE_ONCE(po->num, num);
+	WRITE_ONCE(po->num, num);
+	if (was_running)
 		register_prot_hook(sk);
-	}
+
 	spin_unlock(&po->bind_lock);
 	if (pg_vec && (po->tp_version > TPACKET_V2)) {
 		/* Because we don't support block-based V3 on tx-ring */
@@ -4578,7 +4628,7 @@ static int packet_mmap(struct file *file, struct socket *sock,
 		}
 	}
 
-	atomic_inc(&po->mapped);
+	atomic_long_inc(&po->mapped);
 	vma->vm_ops = &packet_mmap_ops;
 	err = 0;
 
@@ -4604,7 +4654,6 @@ static const struct proto_ops packet_ops_spkt = {
 	.sendmsg =	packet_sendmsg_spkt,
 	.recvmsg =	packet_recvmsg,
 	.mmap =		sock_no_mmap,
-	.sendpage =	sock_no_sendpage,
 };
 
 static const struct proto_ops packet_ops = {
@@ -4626,7 +4675,6 @@ static const struct proto_ops packet_ops = {
 	.sendmsg =	packet_sendmsg,
 	.recvmsg =	packet_recvmsg,
 	.mmap =		packet_mmap,
-	.sendpage =	sock_no_sendpage,
 };
 
 static const struct net_proto_family packet_family_ops = {
@@ -4679,9 +4727,9 @@ static int packet_seq_show(struct seq_file *seq, void *v)
 			   s->sk_type,
 			   ntohs(READ_ONCE(po->num)),
 			   READ_ONCE(po->ifindex),
-			   po->running,
+			   packet_sock_flag(po, PACKET_SOCK_RUNNING),
 			   atomic_read(&s->sk_rmem_alloc),
-			   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
+			   from_kuid_munged(seq_user_ns(seq), sk_uid(s)),
 			   sock_i_ino(s));
 	}
 
@@ -4761,5 +4809,6 @@ out:
 module_init(packet_init);
 module_exit(packet_exit);
 
+MODULE_DESCRIPTION("Packet socket support (AF_PACKET)");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_NETPROTO(PF_PACKET);
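
A note on the flag conversions above: the old int/bitfield members of struct packet_sock (po->running, po->auxdata, po->origdev, po->tp_loss, po->tp_tx_has_off, po->pressure) are replaced throughout by packet_sock_flag()/packet_sock_flag_set(). The real accessors live in net/packet/internal.h; a minimal sketch, assuming a single unsigned long bitmap member named flags in struct packet_sock:

enum packet_sock_flags {
	PACKET_SOCK_ORIGDEV,
	PACKET_SOCK_AUXDATA,
	PACKET_SOCK_TX_HAS_OFF,
	PACKET_SOCK_TP_LOSS,
	PACKET_SOCK_RUNNING,
	PACKET_SOCK_PRESSURE,
	PACKET_SOCK_QDISC_BYPASS,
};

static inline void packet_sock_flag_set(struct packet_sock *po,
					enum packet_sock_flags flag,
					bool val)
{
	/* set_bit()/clear_bit() are atomic, which is why the
	 * PACKET_AUXDATA and PACKET_ORIGDEV setsockopt paths above can
	 * drop their lock_sock()/release_sock() pairs. */
	if (val)
		set_bit(flag, &po->flags);
	else
		clear_bit(flag, &po->flags);
}

static inline bool packet_sock_flag(const struct packet_sock *po,
				    enum packet_sock_flags flag)
{
	return test_bit(flag, &po->flags);
}

The same mechanism replaces the po->xmit function pointer: packet_xmit() tests PACKET_SOCK_QDISC_BYPASS instead of comparing po->xmit against packet_direct_xmit.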
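The retire-block timer itself moves from a jiffies-based timer_list to a CLOCK_MONOTONIC soft hrtimer that re-arms itself from its own callback, which is why _prb_refresh_rx_retire_blk_timer() and the delete_blk_timer handshake disappear. A self-contained sketch of that pattern under illustrative names (demo_* is not from the patch):

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/container_of.h>

struct demo_ctx {
	struct hrtimer timer;
	ktime_t interval;
};

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	struct demo_ctx *ctx = container_of(t, struct demo_ctx, timer);

	/* do the periodic work here, then push the expiry forward by
	 * one interval relative to now and ask the core to re-enqueue */
	hrtimer_forward_now(&ctx->timer, ctx->interval);
	return HRTIMER_RESTART;
}

static void demo_start(struct demo_ctx *ctx, unsigned int msecs)
{
	ctx->interval = ms_to_ktime(msecs);
	hrtimer_setup(&ctx->timer, demo_timer_fn, CLOCK_MONOTONIC,
		      HRTIMER_MODE_REL_SOFT);
	hrtimer_start(&ctx->timer, ctx->interval, HRTIMER_MODE_REL_SOFT);
}

static void demo_stop(struct demo_ctx *ctx)
{
	hrtimer_cancel(&ctx->timer);	/* waits for a callback in flight */
}

Because hrtimer_cancel() synchronizes with a running callback, prb_shutdown_retire_blk_timer() no longer needs the rb_queue lock dance that setting delete_blk_timer required.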
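The user interface for the timeout is unchanged by the ktime conversion in init_prb_bdqc(): tp_retire_blk_tov is still given in milliseconds through the TPACKET_V3 ring setup, and 0 still lets prb_calc_retire_blk_tmo() derive a value from link speed and block size. A hypothetical userspace sketch (values are illustrative, error handling trimmed):

#include <string.h>
#include <sys/socket.h>
#include <linux/if_packet.h>

static int setup_v3_ring(int fd)
{
	int ver = TPACKET_V3;
	struct tpacket_req3 req;

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 1 << 20;	/* 1 MiB blocks, ~8 ms at 1 Gbps */
	req.tp_block_nr = 64;
	req.tp_frame_size = 2048;	/* mostly advisory for V3 RX */
	req.tp_frame_nr = (req.tp_block_size / req.tp_frame_size) *
			  req.tp_block_nr;
	req.tp_retire_blk_tov = 10;	/* block retire timeout, in ms */

	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver)))
		return -1;
	return setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
}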
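Finally, the new PACKET_VNET_HDR_SZ option generalizes PACKET_VNET_HDR to the larger virtio_net_hdr_mrg_rxbuf layout, while PACKET_QDISC_BYPASS keeps its ABI even though the kernel now tracks it as a socket flag rather than by swapping po->xmit. A hypothetical usage sketch, assuming UAPI headers recent enough to define PACKET_VNET_HDR_SZ (error handling trimmed):

#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <linux/virtio_net.h>

int main(void)
{
	/* PACKET_VNET_HDR_SZ is only accepted on SOCK_RAW sockets */
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	int hdr_sz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
	int one = 1;

	if (fd < 0)
		return 1;

	/* per the diff, only 0, sizeof(struct virtio_net_hdr) and
	 * sizeof(struct virtio_net_hdr_mrg_rxbuf) are valid sizes */
	if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR_SZ,
		       &hdr_sz, sizeof(hdr_sz)))
		perror("PACKET_VNET_HDR_SZ");

	if (setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS,
		       &one, sizeof(one)))
		perror("PACKET_QDISC_BYPASS");

	return 0;
}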
