diff options
Diffstat (limited to 'net/sched/sch_api.c')
| -rw-r--r-- | net/sched/sch_api.c | 667 |
1 files changed, 436 insertions, 231 deletions
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 72d2c204d5f3..f56b18c8aebf 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -25,7 +25,9 @@ #include <linux/hrtimer.h> #include <linux/slab.h> #include <linux/hashtable.h> +#include <linux/bpf.h> +#include <net/netdev_lock.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -206,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name) for (q = qdisc_base; q; q = q->next) { if (!strcmp(name, q->id)) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -228,7 +230,7 @@ int qdisc_set_default(const char *name) if (!ops) { /* Not found, drop lock and try to load module */ write_unlock(&qdisc_mod_lock); - request_module("sch_%s", name); + request_module(NET_SCH_ALIAS_PREFIX "%s", name); write_lock(&qdisc_mod_lock); ops = qdisc_lookup_default(name); @@ -236,7 +238,7 @@ int qdisc_set_default(const char *name) if (ops) { /* Set new default */ - module_put(default_qdisc_ops->owner); + bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner); default_qdisc_ops = ops; } write_unlock(&qdisc_mod_lock); @@ -309,7 +311,7 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) if (dev_ingress_queue(dev)) q = qdisc_match_from_root( - dev_ingress_queue(dev)->qdisc_sleeping, + rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping), handle); out: return q; @@ -328,22 +330,28 @@ struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) nq = dev_ingress_queue_rcu(dev); if (nq) - q = qdisc_match_from_root(nq->qdisc_sleeping, handle); + q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping), + handle); out: return q; } -static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) +static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid, + struct netlink_ext_ack *extack) { unsigned long cl; const struct Qdisc_class_ops *cops = p->ops->cl_ops; - if (cops == NULL) - return NULL; + if (cops == NULL) { + NL_SET_ERR_MSG(extack, "Parent qdisc is not classful"); + return ERR_PTR(-EOPNOTSUPP); + } cl = cops->find(p, classid); - if (cl == 0) - return NULL; + if (cl == 0) { + NL_SET_ERR_MSG(extack, "Specified class not found"); + return ERR_PTR(-ENOENT); + } return cops->leaf(p, cl); } @@ -357,7 +365,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { if (nla_strcmp(kind, q->id) == 0) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -423,7 +431,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && - !memcmp(&rtab->data, nla_data(tab), 1024)) { + !memcmp(&rtab->data, nla_data(tab), TC_RTAB_SIZE)) { rtab->refcnt++; return rtab; } @@ -433,7 +441,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, if (rtab) { rtab->rate = *r; rtab->refcnt = 1; - memcpy(rtab->data, nla_data(tab), 1024); + memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE); if (r->linklayer == TC_LINKLAYER_UNAWARE) r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; @@ -592,17 +600,6 @@ out: pkt_len = 1; qdisc_skb_cb(skb)->pkt_len = pkt_len; } -EXPORT_SYMBOL(__qdisc_calculate_pkt_len); - -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) -{ - if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { - pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", - txt, qdisc->ops->id, qdisc->handle >> 16); - qdisc->flags |= TCQ_F_WARN_NONWC; - } -} -EXPORT_SYMBOL(qdisc_warn_nonwc); static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { @@ -619,8 +616,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid) { - hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); - wd->timer.function = qdisc_watchdog; + hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED); wd->qdisc = qdisc; } EXPORT_SYMBOL(qdisc_watchdog_init_clockid); @@ -634,19 +630,26 @@ EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns) { - if (test_bit(__QDISC_STATE_DEACTIVATED, - &qdisc_root_sleeping(wd->qdisc)->state)) + bool deactivated; + + rcu_read_lock(); + deactivated = test_bit(__QDISC_STATE_DEACTIVATED, + &qdisc_root_sleeping(wd->qdisc)->state); + rcu_read_unlock(); + if (deactivated) return; if (hrtimer_is_queued(&wd->timer)) { + u64 softexpires; + + softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer)); /* If timer is already set in [expires, expires + delta_ns], * do not reprogram it. */ - if (wd->last_expires - expires <= delta_ns) + if (softexpires - expires <= delta_ns) return; } - wd->last_expires = expires; hrtimer_start_range_ns(&wd->timer, ns_to_ktime(expires), delta_ns, @@ -772,42 +775,33 @@ static u32 qdisc_alloc_handle(struct net_device *dev) void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) { - bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; bool notify; int drops; - if (n == 0 && len == 0) - return; drops = max_t(int, n, 0); rcu_read_lock(); while ((parentid = sch->parent)) { - if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) + if (parentid == TC_H_ROOT) break; if (sch->flags & TCQ_F_NOPARENT) break; - /* Notify parent qdisc only if child qdisc becomes empty. - * - * If child was empty even before update then backlog - * counter is screwed and we skip notification because - * parent class is already passive. - * - * If the original child was offloaded then it is allowed - * to be seem as empty, so the parent is notified anyway. - */ - notify = !sch->q.qlen && !WARN_ON_ONCE(!n && - !qdisc_is_offloaded); + /* Notify parent qdisc only if child qdisc becomes empty. */ + notify = !sch->q.qlen; /* TODO: perform the search on a per txq basis */ - sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); + sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { WARN_ON_ONCE(parentid != TC_H_ROOT); break; } cops = sch->ops->cl_ops; if (notify && cops->qlen_notify) { + /* Note that qlen_notify must be idempotent as it may get called + * multiple times. + */ cl = cops->find(sch, parentid); cops->qlen_notify(sch, cl); } @@ -902,7 +896,8 @@ static void qdisc_offload_graft_root(struct net_device *dev, } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + struct netlink_ext_ack *extack) { struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; @@ -970,7 +965,12 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -989,25 +989,55 @@ static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) return false; } +static int qdisc_get_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, u32 clid, struct Qdisc *q, + struct netlink_ext_ack *extack) +{ + struct sk_buff *skb; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (!tc_qdisc_dump_ignore(q, false)) { + if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0, + RTM_NEWQDISC, extack) < 0) + goto err_out; + } + + if (skb->len) + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); + +err_out: + kfree_skb(skb); + return -EINVAL; +} + static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) + return 0; + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, - 0, RTM_DELQDISC) < 0) + 0, RTM_DELQDISC, extack) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, - old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) goto err_out; } @@ -1022,10 +1052,11 @@ err_out: static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { if (new || old) - qdisc_notify(net, skb, n, clid, old, new); + qdisc_notify(net, skb, n, clid, old, new, extack); if (old) qdisc_put(old); @@ -1063,17 +1094,29 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, if (parent == NULL) { unsigned int i, num_q, ingress; + struct netdev_queue *dev_queue; ingress = 0; num_q = dev->num_tx_queues; if ((q && q->flags & TCQ_F_INGRESS) || (new && new->flags & TCQ_F_INGRESS)) { - num_q = 1; ingress = 1; - if (!dev_ingress_queue(dev)) { + dev_queue = dev_ingress_queue(dev); + if (!dev_queue) { NL_SET_ERR_MSG(extack, "Device does not have an ingress queue"); return -ENOENT; } + + q = rtnl_dereference(dev_queue->qdisc_sleeping); + + /* This is the counterpart of that qdisc_refcount_inc_nz() call in + * __tcf_qdisc_find() for filter requests. + */ + if (!qdisc_refcount_dec_if_one(q)) { + NL_SET_ERR_MSG(extack, + "Current ingress or clsact Qdisc has ongoing filter requests"); + return -EBUSY; + } } if (dev->flags & IFF_UP) @@ -1084,18 +1127,26 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, if (new && new->ops->attach && !ingress) goto skip; - for (i = 0; i < num_q; i++) { - struct netdev_queue *dev_queue = dev_ingress_queue(dev); - - if (!ingress) + if (!ingress) { + for (i = 0; i < num_q; i++) { dev_queue = netdev_get_tx_queue(dev, i); + old = dev_graft_qdisc(dev_queue, new); - old = dev_graft_qdisc(dev_queue, new); - if (new && i > 0) - qdisc_refcount_inc(new); - - if (!ingress) + if (new && i > 0) + qdisc_refcount_inc(new); qdisc_put(old); + } + } else { + old = dev_graft_qdisc(dev_queue, NULL); + + /* {ingress,clsact}_destroy() @old before grafting @new to avoid + * unprotected concurrent accesses to net_device::miniq_{in,e}gress + * pointer(s) in mini_qdisc_pair_swap(). + */ + qdisc_notify(net, skb, n, classid, old, new, extack); + qdisc_destroy(old); + + dev_graft_qdisc(dev_queue, new); } skip: @@ -1105,12 +1156,10 @@ skip: qdisc_refcount_inc(new); rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); if (new && new->ops->attach) new->ops->attach(new); - } else { - notify_and_destroy(net, skb, n, classid, old, new); } if (dev->flags & IFF_UP) @@ -1138,10 +1187,16 @@ skip: return -EINVAL; } + if (new && + !(parent->flags & TCQ_F_MQROOT) && + rcu_access_pointer(new->stab)) { + NL_SET_ERR_MSG(extack, "STAB not supported on a non root"); + return -EINVAL; + } err = cops->graft(parent, cl, new, &old, extack); if (err) return err; - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); } return 0; } @@ -1199,36 +1254,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev, struct qdisc_size_table *stab; ops = qdisc_lookup_ops(kind); -#ifdef CONFIG_MODULES - if (ops == NULL && kind != NULL) { - char name[IFNAMSIZ]; - if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { - /* We dropped the RTNL semaphore in order to - * perform the module load. So, even if we - * succeeded in loading the module we have to - * tell the caller to replay the request. We - * indicate this using -EAGAIN. - * We replay the request because the device may - * go away in the mean time. - */ - rtnl_unlock(); - request_module("sch_%s", name); - rtnl_lock(); - ops = qdisc_lookup_ops(kind); - if (ops != NULL) { - /* We will try again qdisc_lookup_ops, - * so don't keep a reference. - */ - module_put(ops->owner); - err = -EAGAIN; - goto err_out; - } - } - } -#endif - - err = -ENOENT; if (!ops) { + err = -ENOENT; NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown"); goto err_out; } @@ -1242,7 +1269,12 @@ static struct Qdisc *qdisc_create(struct net_device *dev, sch->parent = parent; if (handle == TC_H_INGRESS) { - sch->flags |= TCQ_F_INGRESS; + if (!(sch->flags & TCQ_F_INGRESS)) { + NL_SET_ERR_MSG(extack, + "Specified parent ID is reserved for ingress and clsact Qdiscs"); + err = -EINVAL; + goto err_out3; + } handle = TC_H_MAKE(TC_H_INGRESS, 0); } else { if (handle == 0) { @@ -1266,7 +1298,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, * before again attaching a qdisc. */ if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) { - dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + WRITE_ONCE(dev->tx_queue_len, DEFAULT_TX_QUEUE_LEN); netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); } @@ -1274,20 +1306,21 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (err) goto err_out3; - if (ops->init) { - err = ops->init(sch, tca[TCA_OPTIONS], extack); - if (err != 0) - goto err_out5; - } - if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB], extack); if (IS_ERR(stab)) { err = PTR_ERR(stab); - goto err_out4; + goto err_out3; } rcu_assign_pointer(sch->stab, stab); } + + if (ops->init) { + err = ops->init(sch, tca[TCA_OPTIONS], extack); + if (err != 0) + goto err_out4; + } + if (tca[TCA_RATE]) { err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { @@ -1312,28 +1345,22 @@ static struct Qdisc *qdisc_create(struct net_device *dev, return sch; -err_out5: - /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */ +err_out4: + /* Even if ops->init() failed, we call ops->destroy() + * like qdisc_create_dflt(). + */ if (ops->destroy) ops->destroy(sch); + qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: + lockdep_unregister_key(&sch->root_lock_key); netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: - module_put(ops->owner); + bpf_module_put(ops, ops->owner); err_out: *errp = err; return NULL; - -err_out4: - /* - * Any broken qdiscs that would require a ops->reset() here? - * The qdisc was never in action so it shouldn't be necessary. - */ - qdisc_put_stab(rtnl_dereference(sch->stab)); - if (ops->destroy) - ops->destroy(sch); - goto err_out3; } static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, @@ -1437,27 +1464,18 @@ const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { * Delete/get qdisc. */ -static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, - struct netlink_ext_ack *extack) +static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack, + struct net_device *dev, + struct nlattr *tca[TCA_MAX + 1], + struct tcmsg *tcm) { struct net *net = sock_net(skb->sk); - struct tcmsg *tcm = nlmsg_data(n); - struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; - u32 clid; struct Qdisc *q = NULL; struct Qdisc *p = NULL; + u32 clid; int err; - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, - rtm_tca_policy, extack); - if (err < 0) - return err; - - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { @@ -1467,9 +1485,9 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); return -ENOENT; } - q = qdisc_leaf(p, clid); + q = qdisc_leaf(p, clid, extack); } else if (dev_ingress_queue(dev)) { - q = dev_ingress_queue(dev)->qdisc_sleeping; + q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } } else { q = rtnl_dereference(dev->qdisc); @@ -1478,6 +1496,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); return -ENOENT; } + if (IS_ERR(q)) + return PTR_ERR(q); if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { NL_SET_ERR_MSG(extack, "Invalid handle"); @@ -1492,7 +1512,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { - NL_SET_ERR_MSG(extack, "Invalid qdisc name"); + NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); return -EINVAL; } @@ -1509,41 +1529,67 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, if (err != 0) return err; } else { - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_get_notify(net, skb, n, clid, q, NULL); } return 0; } -/* - * Create/change qdisc. - */ - -static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, - struct netlink_ext_ack *extack) +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct tcmsg *tcm; + struct tcmsg *tcm = nlmsg_data(n); struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; - u32 clid; - struct Qdisc *q, *p; int err; -replay: - /* Reinit, just in case something touches this. */ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; - tcm = nlmsg_data(n); - clid = tcm->tcm_parent; - q = p = NULL; - dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; + netdev_lock_ops(dev); + err = __tc_get_qdisc(skb, n, extack, dev, tca, tcm); + netdev_unlock_ops(dev); + + return err; +} + +static bool req_create_or_replace(struct nlmsghdr *n) +{ + return (n->nlmsg_flags & NLM_F_CREATE && + n->nlmsg_flags & NLM_F_REPLACE); +} + +static bool req_create_exclusive(struct nlmsghdr *n) +{ + return (n->nlmsg_flags & NLM_F_CREATE && + n->nlmsg_flags & NLM_F_EXCL); +} + +static bool req_change(struct nlmsghdr *n) +{ + return (!(n->nlmsg_flags & NLM_F_CREATE) && + !(n->nlmsg_flags & NLM_F_REPLACE) && + !(n->nlmsg_flags & NLM_F_EXCL)); +} + +static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack, + struct net_device *dev, + struct nlattr *tca[TCA_MAX + 1], + struct tcmsg *tcm) +{ + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; + u32 clid; + int err; + + clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { @@ -1553,9 +1599,16 @@ replay: NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } - q = qdisc_leaf(p, clid); + if (p->flags & TCQ_F_INGRESS) { + NL_SET_ERR_MSG(extack, + "Cannot add children to ingress/clsact qdisc"); + return -EOPNOTSUPP; + } + q = qdisc_leaf(p, clid, extack); + if (IS_ERR(q)) + return PTR_ERR(q); } else if (dev_ingress_queue_create(dev)) { - q = dev_ingress_queue(dev)->qdisc_sleeping; + q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } } else { q = rtnl_dereference(dev->qdisc); @@ -1578,13 +1631,22 @@ replay: q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) goto create_n_graft; + if (q->parent != tcm->tcm_parent) { + NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent"); + return -EINVAL; + } if (n->nlmsg_flags & NLM_F_EXCL) { NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { - NL_SET_ERR_MSG(extack, "Invalid qdisc name"); + NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); + return -EINVAL; + } + if (q->flags & TCQ_F_INGRESS) { + NL_SET_ERR_MSG(extack, + "Cannot regraft ingress or clsact Qdiscs"); return -EINVAL; } if (q == p || @@ -1592,6 +1654,10 @@ replay: NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); return -ELOOP; } + if (clid == TC_H_INGRESS) { + NL_SET_ERR_MSG(extack, "Ingress cannot graft directly"); + return -EINVAL; + } qdisc_refcount_inc(q); goto graft; } else { @@ -1602,27 +1668,35 @@ replay: * * We know, that some child q is already * attached to this parent and have choice: - * either to change it or to create/graft new one. + * 1) change it or 2) create/graft new one. + * If the requested qdisc kind is different + * than the existing one, then we choose graft. + * If they are the same then this is "change" + * operation - just let it fallthrough.. * * 1. We are allowed to create/graft only - * if CREATE and REPLACE flags are set. + * if the request is explicitly stating + * "please create if it doesn't exist". * - * 2. If EXCL is set, requestor wanted to say, - * that qdisc tcm_handle is not expected + * 2. If the request is to exclusive create + * then the qdisc tcm_handle is not expected * to exist, so that we choose create/graft too. * * 3. The last case is when no flags are set. + * This will happen when for example tc + * utility issues a "change" command. * Alas, it is sort of hole in API, we * cannot decide what to do unambiguously. - * For now we select create/graft, if - * user gave KIND, which does not match existing. + * For now we select create/graft. */ - if ((n->nlmsg_flags & NLM_F_CREATE) && - (n->nlmsg_flags & NLM_F_REPLACE) && - ((n->nlmsg_flags & NLM_F_EXCL) || - (tca[TCA_KIND] && - nla_strcmp(tca[TCA_KIND], q->ops->id)))) - goto create_n_graft; + if (tca[TCA_KIND] && + nla_strcmp(tca[TCA_KIND], q->ops->id)) { + if (req_create_or_replace(n) || + req_create_exclusive(n)) + goto create_n_graft; + else if (req_change(n)) + goto create_n_graft2; + } } } } else { @@ -1643,12 +1717,12 @@ replay: return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { - NL_SET_ERR_MSG(extack, "Invalid qdisc name"); + NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); return -EINVAL; } err = qdisc_change(q, tca, extack); if (err == 0) - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_notify(sock_net(skb->sk), skb, n, clid, NULL, q, extack); return err; create_n_graft: @@ -1656,6 +1730,7 @@ create_n_graft: NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); return -ENOENT; } +create_n_graft2: if (clid == TC_H_INGRESS) { if (dev_ingress_queue(dev)) { q = qdisc_create(dev, dev_ingress_queue(dev), @@ -1679,11 +1754,8 @@ create_n_graft: tcm->tcm_parent, tcm->tcm_handle, tca, &err, extack); } - if (q == NULL) { - if (err == -EAGAIN) - goto replay; + if (!q) return err; - } graft: err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); @@ -1696,6 +1768,58 @@ graft: return 0; } +static void request_qdisc_module(struct nlattr *kind) +{ + struct Qdisc_ops *ops; + char name[IFNAMSIZ]; + + if (!kind) + return; + + ops = qdisc_lookup_ops(kind); + if (ops) { + bpf_module_put(ops, ops->owner); + return; + } + + if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { + rtnl_unlock(); + request_module(NET_SCH_ALIAS_PREFIX "%s", name); + rtnl_lock(); + } +} + +/* + * Create/change qdisc. + */ +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + struct tcmsg *tcm; + int err; + + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); + if (err < 0) + return err; + + request_qdisc_module(tca[TCA_KIND]); + + tcm = nlmsg_data(n); + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + netdev_lock_ops(dev); + err = __tc_modify_qdisc(skb, n, extack, dev, tca, tcm); + netdev_unlock_ops(dev); + + return err; +} + static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, struct netlink_callback *cb, int *q_idx_p, int s_q_idx, bool recur, @@ -1715,7 +1839,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1737,7 +1861,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1780,17 +1904,23 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; + netdev_lock_ops(dev); if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), skb, cb, &q_idx, s_q_idx, - true, tca[TCA_DUMP_INVISIBLE]) < 0) + true, tca[TCA_DUMP_INVISIBLE]) < 0) { + netdev_unlock_ops(dev); goto done; + } dev_queue = dev_ingress_queue(dev); if (dev_queue && - tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, - &q_idx, s_q_idx, false, - tca[TCA_DUMP_INVISIBLE]) < 0) + tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping), + skb, cb, &q_idx, s_q_idx, false, + tca[TCA_DUMP_INVISIBLE]) < 0) { + netdev_unlock_ops(dev); goto done; + } + netdev_unlock_ops(dev); cont: idx++; @@ -1810,8 +1940,8 @@ done: ************************************************/ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, - unsigned long cl, - u32 portid, u32 seq, u16 flags, int event) + unsigned long cl, u32 portid, u32 seq, u16 flags, + int event, struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1846,7 +1976,12 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1857,7 +1992,30 @@ nla_put_failure: static int tclass_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, - unsigned long cl, int event) + unsigned long cl, int event, struct netlink_ext_ack *extack) +{ + struct sk_buff *skb; + u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; + + if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) + return 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); +} + +static int tclass_get_notify(struct net *net, struct sk_buff *oskb, + struct nlmsghdr *n, struct Qdisc *q, + unsigned long cl, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1866,7 +2024,8 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) { + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS, + extack) < 0) { kfree_skb(skb); return -EINVAL; } @@ -1888,14 +2047,18 @@ static int tclass_del_notify(struct net *net, if (!cops->delete) return -EOPNOTSUPP; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; + if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, - RTM_DELTCLASS) < 0) { - kfree_skb(skb); - return -EINVAL; + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, + RTM_DELTCLASS, extack) < 0) { + kfree_skb(skb); + return -EINVAL; + } + } else { + skb = NULL; } err = cops->delete(q, cl, extack); @@ -1904,8 +2067,8 @@ static int tclass_del_notify(struct net *net, return err; } - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); + err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); return err; } @@ -1994,15 +2157,15 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, #endif -static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, - struct netlink_ext_ack *extack) +static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack, + struct net_device *dev, + struct nlattr *tca[TCA_MAX + 1], + struct tcmsg *tcm) { struct net *net = sock_net(skb->sk); - struct tcmsg *tcm = nlmsg_data(n); - struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; - struct Qdisc *q = NULL; const struct Qdisc_class_ops *cops; + struct Qdisc *q = NULL; unsigned long cl = 0; unsigned long new_cl; u32 portid; @@ -2010,15 +2173,6 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, u32 qid; int err; - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, - rtm_tca_policy, extack); - if (err < 0) - return err; - - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - /* parent == TC_H_UNSPEC - unspecified parent. parent == TC_H_ROOT - class is root, which has no parent. @@ -2100,7 +2254,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, tc_bind_tclass(q, portid, clid, 0); goto out; case RTM_GETTCLASS: - err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); + err = tclass_get_notify(net, skb, n, q, cl, extack); goto out; default: err = -EINVAL; @@ -2113,12 +2267,18 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, return -EOPNOTSUPP; } + /* Prevent creation of traffic classes with classid TC_H_ROOT */ + if (clid == TC_H_ROOT) { + NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT"); + return -EINVAL; + } + new_cl = cl; err = -EOPNOTSUPP; if (cops->change) err = cops->change(q, clid, portid, tca, &new_cl, extack); if (err == 0) { - tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); + tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack); /* We just create a new class, need to do reverse binding. */ if (cl != new_cl) tc_bind_tclass(q, portid, clid, new_cl); @@ -2127,6 +2287,31 @@ out: return err; } +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(n); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + int err; + + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); + if (err < 0) + return err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + netdev_lock_ops(dev); + err = __tc_ctl_tclass(skb, n, extack, dev, tca, tcm); + netdev_unlock_ops(dev); + + return err; +} + struct qdisc_dump_args { struct qdisc_walker w; struct sk_buff *skb; @@ -2140,7 +2325,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTCLASS); + RTM_NEWTCLASS, NULL); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, @@ -2203,20 +2388,12 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, return 0; } -static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +static int __tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb, + struct tcmsg *tcm, struct net_device *dev) { - struct tcmsg *tcm = nlmsg_data(cb->nlh); - struct net *net = sock_net(skb->sk); struct netdev_queue *dev_queue; - struct net_device *dev; int t, s_t; - if (nlmsg_len(cb->nlh) < sizeof(*tcm)) - return 0; - dev = dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return 0; - s_t = cb->args[0]; t = 0; @@ -2226,17 +2403,39 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) dev_queue = dev_ingress_queue(dev); if (dev_queue && - tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, - &t, s_t, false) < 0) + tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping), + skb, tcm, cb, &t, s_t, false) < 0) goto done; done: cb->args[0] = t; - dev_put(dev); return skb->len; } +static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int err; + + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) + return 0; + + dev = dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return 0; + + netdev_lock_ops(dev); + err = __tc_dump_tclass(skb, cb, tcm, dev); + netdev_unlock_ops(dev); + + dev_put(dev); + + return err; +} + #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { @@ -2279,7 +2478,20 @@ static struct pernet_operations psched_net_ops = { .exit = psched_net_exit, }; +#if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); +#endif + +static const struct rtnl_msg_handler psched_rtnl_msg_handlers[] __initconst = { + {.msgtype = RTM_NEWQDISC, .doit = tc_modify_qdisc}, + {.msgtype = RTM_DELQDISC, .doit = tc_get_qdisc}, + {.msgtype = RTM_GETQDISC, .doit = tc_get_qdisc, + .dumpit = tc_dump_qdisc}, + {.msgtype = RTM_NEWTCLASS, .doit = tc_ctl_tclass}, + {.msgtype = RTM_DELTCLASS, .doit = tc_ctl_tclass}, + {.msgtype = RTM_GETTCLASS, .doit = tc_ctl_tclass, + .dumpit = tc_dump_tclass}, +}; static int __init pktsched_init(void) { @@ -2299,14 +2511,7 @@ static int __init pktsched_init(void) register_qdisc(&mq_qdisc_ops); register_qdisc(&noqueue_qdisc_ops); - rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, - 0); - rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0); - rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, - 0); + rtnl_register_many(psched_rtnl_msg_handlers); tc_wrapper_init(); |
