Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/datagram.c | 100
-rw-r--r-- | net/core/dev.c | 137
-rw-r--r-- | net/core/dev_ioctl.c | 1
-rw-r--r-- | net/core/dst.c | 291
-rw-r--r-- | net/core/fib_rules.c | 6
-rw-r--r-- | net/core/filter.c | 684
-rw-r--r-- | net/core/flow_dissector.c | 69
-rw-r--r-- | net/core/lwt_bpf.c | 5
-rw-r--r-- | net/core/lwtunnel.c | 38
-rw-r--r-- | net/core/neighbour.c | 80
-rw-r--r-- | net/core/net-procfs.c | 13
-rw-r--r-- | net/core/net-sysfs.c | 16
-rw-r--r-- | net/core/net_namespace.c | 64
-rw-r--r-- | net/core/netpoll.c | 14
-rw-r--r-- | net/core/pktgen.c | 58
-rw-r--r-- | net/core/rtnetlink.c | 132
-rw-r--r-- | net/core/secure_seq.c | 9
-rw-r--r-- | net/core/skbuff.c | 188
-rw-r--r-- | net/core/sock.c | 89
19 files changed, 1359 insertions, 635 deletions
diff --git a/net/core/datagram.c b/net/core/datagram.c index f9653987c0f9..6877c43cc92d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -161,6 +161,45 @@ done: return skb; } +struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, + struct sk_buff_head *queue, + unsigned int flags, + void (*destructor)(struct sock *sk, + struct sk_buff *skb), + int *peeked, int *off, int *err, + struct sk_buff **last) +{ + struct sk_buff *skb; + int _off = *off; + + *last = queue->prev; + skb_queue_walk(queue, skb) { + if (flags & MSG_PEEK) { + if (_off >= skb->len && (skb->len || _off || + skb->peeked)) { + _off -= skb->len; + continue; + } + if (!skb->len) { + skb = skb_set_peeked(skb); + if (unlikely(IS_ERR(skb))) { + *err = PTR_ERR(skb); + return NULL; + } + } + *peeked = 1; + refcount_inc(&skb->users); + } else { + __skb_unlink(skb, queue); + if (destructor) + destructor(sk, skb); + } + *off = _off; + return skb; + } + return NULL; +} + /** * __skb_try_recv_datagram - Receive a datagram skbuff * @sk: socket @@ -222,40 +261,14 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, * Look at current nfs client by the way... * However, this function was correct in any case. 8) */ - int _off = *off; - - *last = (struct sk_buff *)queue; spin_lock_irqsave(&queue->lock, cpu_flags); - skb_queue_walk(queue, skb) { - *last = skb; - if (flags & MSG_PEEK) { - if (_off >= skb->len && (skb->len || _off || - skb->peeked)) { - _off -= skb->len; - continue; - } - if (!skb->len) { - skb = skb_set_peeked(skb); - if (IS_ERR(skb)) { - error = PTR_ERR(skb); - spin_unlock_irqrestore(&queue->lock, - cpu_flags); - goto no_packet; - } - } - *peeked = 1; - atomic_inc(&skb->users); - } else { - __skb_unlink(skb, queue); - if (destructor) - destructor(sk, skb); - } - spin_unlock_irqrestore(&queue->lock, cpu_flags); - *off = _off; - return skb; - } - + skb = __skb_try_recv_from_queue(sk, queue, flags, destructor, + peeked, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); + if (error) + goto no_packet; + if (skb) + return skb; if (!sk_can_busy_loop(sk)) break; @@ -317,9 +330,7 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) { bool slow; - if (likely(atomic_read(&skb->users) == 1)) - smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) { + if (!skb_unref(skb)) { sk_peek_offset_bwd(sk, len); return; } @@ -335,8 +346,8 @@ void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) } EXPORT_SYMBOL(__skb_free_datagram_locked); -int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, - unsigned int flags, +int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, + struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)) { @@ -344,15 +355,15 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff *skb, if (flags & MSG_PEEK) { err = -ENOENT; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - atomic_dec(&skb->users); + spin_lock_bh(&sk_queue->lock); + if (skb == skb_peek(sk_queue)) { + __skb_unlink(skb, sk_queue); + refcount_dec(&skb->users); if (destructor) destructor(sk, skb); err = 0; } - spin_unlock_bh(&sk->sk_receive_queue.lock); + spin_unlock_bh(&sk_queue->lock); } atomic_inc(&sk->sk_drops); @@ -383,7 +394,8 @@ EXPORT_SYMBOL(__sk_queue_drop_skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) { - int err = 
__sk_queue_drop_skb(sk, skb, flags, NULL); + int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags, + NULL); kfree_skb(skb); sk_mem_reclaim_partial(sk); @@ -602,7 +614,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) skb->data_len += copied; skb->len += copied; skb->truesize += truesize; - atomic_add(truesize, &skb->sk->sk_wmem_alloc); + refcount_add(truesize, &skb->sk->sk_wmem_alloc); while (copied) { int size = min_t(int, copied, PAGE_SIZE - start); skb_fill_page_desc(skb, frag++, pages[n], start, size); diff --git a/net/core/dev.c b/net/core/dev.c index 416137c64bf8..7098fba52be1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -105,6 +105,7 @@ #include <net/dst.h> #include <net/dst_metadata.h> #include <net/pkt_sched.h> +#include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> #include <linux/highmem.h> @@ -142,6 +143,7 @@ #include <linux/hrtimer.h> #include <linux/netfilter_ingress.h> #include <linux/crash_dump.h> +#include <linux/sctp.h> #include "net-sysfs.h" @@ -161,6 +163,7 @@ static int netif_rx_internal(struct sk_buff *skb); static int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev, struct netdev_notifier_info *info); +static struct napi_struct *napi_by_id(unsigned int napi_id); /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl @@ -865,6 +868,31 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) EXPORT_SYMBOL(dev_get_by_index); /** + * dev_get_by_napi_id - find a device by napi_id + * @napi_id: ID of the NAPI struct + * + * Search for an interface by NAPI ID. Returns %NULL if the device + * is not found or a pointer to the device. The device has not had + * its reference counter increased so the caller must be careful + * about locking. The caller must hold RCU lock. + */ + +struct net_device *dev_get_by_napi_id(unsigned int napi_id) +{ + struct napi_struct *napi; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (napi_id < MIN_NAPI_ID) + return NULL; + + napi = napi_by_id(napi_id); + + return napi ? napi->dev : NULL; +} +EXPORT_SYMBOL(dev_get_by_napi_id); + +/** * netdev_get_name - get a netdevice name, knowing its ifindex. * @net: network namespace * @name: a pointer to the buffer where the name will be stored. @@ -1834,7 +1862,7 @@ static inline int deliver_skb(struct sk_buff *skb, { if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) return -ENOMEM; - atomic_inc(&skb->users); + refcount_inc(&skb->users); return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } @@ -2456,10 +2484,10 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) if (unlikely(!skb)) return; - if (likely(atomic_read(&skb->users) == 1)) { + if (likely(refcount_read(&skb->users) == 1)) { smp_rmb(); - atomic_set(&skb->users, 0); - } else if (likely(!atomic_dec_and_test(&skb->users))) { + refcount_set(&skb->users, 0); + } else if (likely(!refcount_dec_and_test(&skb->users))) { return; } get_kfree_skb_cb(skb)->reason = reason; @@ -2612,6 +2640,47 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +int skb_crc32c_csum_help(struct sk_buff *skb) +{ + __le32 crc32c_csum; + int ret = 0, offset, start; + + if (skb->ip_summed != CHECKSUM_PARTIAL) + goto out; + + if (unlikely(skb_is_gso(skb))) + goto out; + + /* Before computing a checksum, we should make sure no frag could + * be modified by an external entity : checksum could be wrong. 
+ */ + if (unlikely(skb_has_shared_frag(skb))) { + ret = __skb_linearize(skb); + if (ret) + goto out; + } + start = skb_checksum_start_offset(skb); + offset = start + offsetof(struct sctphdr, checksum); + if (WARN_ON_ONCE(offset >= skb_headlen(skb))) { + ret = -EINVAL; + goto out; + } + if (skb_cloned(skb) && + !skb_clone_writable(skb, offset + sizeof(__le32))) { + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (ret) + goto out; + } + crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, + skb->len - start, ~(__u32)0, + crc32c_csum_stub)); + *(__le32 *)(skb->data + offset) = crc32c_csum; + skb->ip_summed = CHECKSUM_NONE; + skb->csum_not_inet = 0; +out: + return ret; +} + __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { __be16 type = skb->protocol; @@ -2954,6 +3023,17 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, return skb; } +int skb_csum_hwoffload_help(struct sk_buff *skb, + const netdev_features_t features) +{ + if (unlikely(skb->csum_not_inet)) + return !!(features & NETIF_F_SCTP_CRC) ? 0 : + skb_crc32c_csum_help(skb); + + return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb); +} +EXPORT_SYMBOL(skb_csum_hwoffload_help); + static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { netdev_features_t features; @@ -2992,8 +3072,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device else skb_set_transport_header(skb, skb_checksum_start_offset(skb)); - if (!(features & NETIF_F_CSUM_MASK) && - skb_checksum_help(skb)) + if (skb_csum_hwoffload_help(skb, features)) goto out_kfree_skb; } } @@ -3179,7 +3258,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */ qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -3191,6 +3270,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: *ret = NET_XMIT_SUCCESS; consume_skb(skb); return NULL; @@ -3875,7 +3955,7 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) clist = clist->next; - WARN_ON(atomic_read(&skb->users)); + WARN_ON(refcount_read(&skb->users)); if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) trace_consume_skb(skb); else @@ -3949,7 +4029,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, skb->tc_at_ingress = 1; qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res, false)) { + switch (tcf_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -3960,6 +4040,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: + case TC_ACT_TRAP: consume_skb(skb); return NULL; case TC_ACT_REDIRECT: @@ -4261,13 +4342,12 @@ static struct static_key generic_xdp_needed __read_mostly; static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) { + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); struct bpf_prog *new = xdp->prog; int ret = 0; switch (xdp->command) { - case XDP_SETUP_PROG: { - struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); - + case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); if (old) bpf_prog_put(old); @@ 
-4279,10 +4359,10 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) dev_disable_lro(dev); } break; - } case XDP_QUERY_PROG: - xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog); + xdp->prog_attached = !!old; + xdp->prog_id = old ? old->aux->id : 0; break; default: @@ -4637,9 +4717,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - if (skb->csum_bad) - goto normal; - gro_list_prepare(napi, skb); rcu_read_lock(); @@ -6867,7 +6944,7 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op) +u8 __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op, u32 *prog_id) { struct netdev_xdp xdp; @@ -6876,18 +6953,25 @@ bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op) /* Query must always succeed. */ WARN_ON(xdp_op(dev, &xdp) < 0); + if (prog_id) + *prog_id = xdp.prog_id; + return xdp.prog_attached; } static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op, - struct netlink_ext_ack *extack, + struct netlink_ext_ack *extack, u32 flags, struct bpf_prog *prog) { struct netdev_xdp xdp; memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_SETUP_PROG; + if (flags & XDP_FLAGS_HW_MODE) + xdp.command = XDP_SETUP_PROG_HW; + else + xdp.command = XDP_SETUP_PROG; xdp.extack = extack; + xdp.flags = flags; xdp.prog = prog; return xdp_op(dev, &xdp); @@ -6913,7 +6997,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, ASSERT_RTNL(); xdp_op = xdp_chk = ops->ndo_xdp; - if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE)) + if (!xdp_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) return -EOPNOTSUPP; if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) xdp_op = generic_xdp_install; @@ -6921,10 +7005,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, xdp_chk = generic_xdp_install; if (fd >= 0) { - if (xdp_chk && __dev_xdp_attached(dev, xdp_chk)) + if (xdp_chk && __dev_xdp_attached(dev, xdp_chk, NULL)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, xdp_op)) + __dev_xdp_attached(dev, xdp_op, NULL)) return -EBUSY; prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); @@ -6932,7 +7016,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return PTR_ERR(prog); } - err = dev_xdp_install(dev, xdp_op, extack, prog); + err = dev_xdp_install(dev, xdp_op, extack, flags, prog); if (err < 0 && prog) bpf_prog_put(prog); @@ -7023,7 +7107,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL); /* @@ -7751,7 +7835,7 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, { #if BITS_PER_LONG == 64 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); - memcpy(stats64, netdev_stats, sizeof(*stats64)); + memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + sizeof(*netdev_stats), 0, sizeof(*stats64) - sizeof(*netdev_stats)); @@ -8608,7 +8692,6 @@ static int __init net_dev_init(void) rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead", NULL, dev_cpu_dead); WARN_ON(rc < 0); - dst_subsys_init(); rc = 0; out: return rc; diff --git a/net/core/dev_ioctl.c 
b/net/core/dev_ioctl.c index 27fad31784a8..82fd4c9c4a1b 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -225,6 +225,7 @@ static int net_hwtstamp_validate(struct ifreq *ifr) case HWTSTAMP_FILTER_PTP_V2_EVENT: case HWTSTAMP_FILTER_PTP_V2_SYNC: case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: + case HWTSTAMP_FILTER_NTP_ALL: rx_filter_valid = 1; break; } diff --git a/net/core/dst.c b/net/core/dst.c index 13ba4a090c41..00aa972ad1a1 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -42,108 +42,6 @@ * to dirty as few cache lines as possible in __dst_free(). * As this is not a very strong hint, we dont force an alignment on SMP. */ -static struct { - spinlock_t lock; - struct dst_entry *list; - unsigned long timer_inc; - unsigned long timer_expires; -} dst_garbage = { - .lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock), - .timer_inc = DST_GC_MAX, -}; -static void dst_gc_task(struct work_struct *work); -static void ___dst_free(struct dst_entry *dst); - -static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task); - -static DEFINE_MUTEX(dst_gc_mutex); -/* - * long lived entries are maintained in this list, guarded by dst_gc_mutex - */ -static struct dst_entry *dst_busy_list; - -static void dst_gc_task(struct work_struct *work) -{ - int delayed = 0; - int work_performed = 0; - unsigned long expires = ~0L; - struct dst_entry *dst, *next, head; - struct dst_entry *last = &head; - - mutex_lock(&dst_gc_mutex); - next = dst_busy_list; - -loop: - while ((dst = next) != NULL) { - next = dst->next; - prefetch(&next->next); - cond_resched(); - if (likely(atomic_read(&dst->__refcnt))) { - last->next = dst; - last = dst; - delayed++; - continue; - } - work_performed++; - - dst = dst_destroy(dst); - if (dst) { - /* NOHASH and still referenced. Unless it is already - * on gc list, invalidate it and add to gc list. - * - * Note: this is temporary. Actually, NOHASH dst's - * must be obsoleted when parent is obsoleted. - * But we do not have state "obsoleted, but - * referenced by parent", so it is right. - */ - if (dst->obsolete > 0) - continue; - - ___dst_free(dst); - dst->next = next; - next = dst; - } - } - - spin_lock_bh(&dst_garbage.lock); - next = dst_garbage.list; - if (next) { - dst_garbage.list = NULL; - spin_unlock_bh(&dst_garbage.lock); - goto loop; - } - last->next = NULL; - dst_busy_list = head.next; - if (!dst_busy_list) - dst_garbage.timer_inc = DST_GC_MAX; - else { - /* - * if we freed less than 1/10 of delayed entries, - * we can sleep longer. - */ - if (work_performed <= delayed/10) { - dst_garbage.timer_expires += dst_garbage.timer_inc; - if (dst_garbage.timer_expires > DST_GC_MAX) - dst_garbage.timer_expires = DST_GC_MAX; - dst_garbage.timer_inc += DST_GC_INC; - } else { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - } - expires = dst_garbage.timer_expires; - /* - * if the next desired timer is more than 4 seconds in the - * future then round the timer to whole seconds - */ - if (expires > 4*HZ) - expires = round_jiffies_relative(expires); - schedule_delayed_work(&dst_gc_work, expires); - } - - spin_unlock_bh(&dst_garbage.lock); - mutex_unlock(&dst_gc_mutex); -} - int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); @@ -216,41 +114,12 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, } EXPORT_SYMBOL(dst_alloc); -static void ___dst_free(struct dst_entry *dst) -{ - /* The first case (dev==NULL) is required, when - protocol module is unloaded. 
- */ - if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { - dst->input = dst_discard; - dst->output = dst_discard_out; - } - dst->obsolete = DST_OBSOLETE_DEAD; -} - -void __dst_free(struct dst_entry *dst) -{ - spin_lock_bh(&dst_garbage.lock); - ___dst_free(dst); - dst->next = dst_garbage.list; - dst_garbage.list = dst; - if (dst_garbage.timer_inc > DST_GC_INC) { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, - dst_garbage.timer_expires); - } - spin_unlock_bh(&dst_garbage.lock); -} -EXPORT_SYMBOL(__dst_free); - struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child; smp_rmb(); -again: child = dst->child; if (!(dst->flags & DST_NOCOUNT)) @@ -269,20 +138,8 @@ again: kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; - if (dst) { - int nohash = dst->flags & DST_NOHASH; - - if (atomic_dec_and_test(&dst->__refcnt)) { - /* We were real parent of this dst, so kill child. */ - if (nohash) - goto again; - } else { - /* Child is still referenced, return it for freeing. */ - if (nohash) - return dst; - /* Child is still in his hash table */ - } - } + if (dst) + dst_release_immediate(dst); return NULL; } EXPORT_SYMBOL(dst_destroy); @@ -292,26 +149,62 @@ static void dst_destroy_rcu(struct rcu_head *head) struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); dst = dst_destroy(dst); - if (dst) - __dst_free(dst); } +/* Operations to mark dst as DEAD and clean up the net device referenced + * by dst: + * 1. put the dst under loopback interface and discard all tx/rx packets + * on this route. + * 2. release the net_device + * This function should be called when removing routes from the fib tree + * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to + * make the next dst_ops->check() fail. 
+ */ +void dst_dev_put(struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + + dst->obsolete = DST_OBSOLETE_DEAD; + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev, true); + dst->input = dst_discard; + dst->output = dst_discard_out; + dst->dev = dev_net(dst->dev)->loopback_dev; + dev_hold(dst->dev); + dev_put(dev); +} +EXPORT_SYMBOL(dst_dev_put); + void dst_release(struct dst_entry *dst) { if (dst) { int newrefcnt; - unsigned short nocache = dst->flags & DST_NOCACHE; newrefcnt = atomic_dec_return(&dst->__refcnt); if (unlikely(newrefcnt < 0)) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); - if (!newrefcnt && unlikely(nocache)) + if (!newrefcnt) call_rcu(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); +void dst_release_immediate(struct dst_entry *dst) +{ + if (dst) { + int newrefcnt; + + newrefcnt = atomic_dec_return(&dst->__refcnt); + if (unlikely(newrefcnt < 0)) + net_warn_ratelimited("%s: dst:%p refcnt:%d\n", + __func__, dst, newrefcnt); + if (!newrefcnt) + dst_destroy(dst); + } +} +EXPORT_SYMBOL(dst_release_immediate); + u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); @@ -371,21 +264,25 @@ static int dst_md_discard(struct sk_buff *skb) return 0; } -static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) +static void __metadata_dst_init(struct metadata_dst *md_dst, + enum metadata_type type, u8 optslen) + { struct dst_entry *dst; dst = &md_dst->dst; dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, - DST_METADATA | DST_NOCACHE | DST_NOCOUNT); + DST_METADATA | DST_NOCOUNT); dst->input = dst_md_discard; dst->output = dst_md_discard_out; memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); + md_dst->type = type; } -struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, + gfp_t flags) { struct metadata_dst *md_dst; @@ -393,7 +290,7 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) if (!md_dst) return NULL; - __metadata_dst_init(md_dst, optslen); + __metadata_dst_init(md_dst, type, optslen); return md_dst; } @@ -407,7 +304,8 @@ void metadata_dst_free(struct metadata_dst *md_dst) kfree(md_dst); } -struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) +struct metadata_dst __percpu * +metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) { int cpu; struct metadata_dst __percpu *md_dst; @@ -418,91 +316,8 @@ struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) return NULL; for_each_possible_cpu(cpu) - __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen); + __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); - -/* Dirty hack. We did it in 2.2 (in __dst_free), - * we have _very_ good reasons not to repeat - * this mistake in 2.3, but we have no choice - * now. _It_ _is_ _explicit_ _deliberate_ - * _race_ _condition_. - * - * Commented and originally written by Alexey. 
- */ -static void dst_ifdown(struct dst_entry *dst, struct net_device *dev, - int unregister) -{ - if (dst->ops->ifdown) - dst->ops->ifdown(dst, dev, unregister); - - if (dev != dst->dev) - return; - - if (!unregister) { - dst->input = dst_discard; - dst->output = dst_discard_out; - } else { - dst->dev = dev_net(dst->dev)->loopback_dev; - dev_hold(dst->dev); - dev_put(dev); - } -} - -static int dst_dev_event(struct notifier_block *this, unsigned long event, - void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct dst_entry *dst, *last = NULL; - - switch (event) { - case NETDEV_UNREGISTER_FINAL: - case NETDEV_DOWN: - mutex_lock(&dst_gc_mutex); - for (dst = dst_busy_list; dst; dst = dst->next) { - last = dst; - dst_ifdown(dst, dev, event != NETDEV_DOWN); - } - - spin_lock_bh(&dst_garbage.lock); - dst = dst_garbage.list; - dst_garbage.list = NULL; - /* The code in dst_ifdown places a hold on the loopback device. - * If the gc entry processing is set to expire after a lengthy - * interval, this hold can cause netdev_wait_allrefs() to hang - * out and wait for a long time -- until the the loopback - * interface is released. If we're really unlucky, it'll emit - * pr_emerg messages to console too. Reset the interval here, - * so dst cleanups occur in a more timely fashion. - */ - if (dst_garbage.timer_inc > DST_GC_INC) { - dst_garbage.timer_inc = DST_GC_INC; - dst_garbage.timer_expires = DST_GC_MIN; - mod_delayed_work(system_wq, &dst_gc_work, - dst_garbage.timer_expires); - } - spin_unlock_bh(&dst_garbage.lock); - - if (last) - last->next = dst; - else - dst_busy_list = dst; - for (; dst; dst = dst->next) - dst_ifdown(dst, dev, event != NETDEV_DOWN); - mutex_unlock(&dst_gc_mutex); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block dst_dev_notifier = { - .notifier_call = dst_dev_event, - .priority = -10, /* must be called after other network notifiers */ -}; - -void __init dst_subsys_init(void) -{ - register_netdevice_notifier(&dst_dev_notifier); -} diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 3bba291c6c32..a0093e1b0235 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -46,7 +46,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, if (r == NULL) return -ENOMEM; - atomic_set(&r->refcnt, 1); + refcount_set(&r->refcnt, 1); r->action = FR_ACT_TO_TBL; r->pref = pref; r->table = table; @@ -283,7 +283,7 @@ jumped: if (err != -EAGAIN) { if ((arg->flags & FIB_LOOKUP_NOREF) || - likely(atomic_inc_not_zero(&rule->refcnt))) { + likely(refcount_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } @@ -517,7 +517,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, last = r; } - fib_rule_get(rule); + refcount_set(&rule->refcnt, 1); if (last) list_add_rcu(&rule->list, &last->list); diff --git a/net/core/filter.c b/net/core/filter.c index a6bb95fa87b2..c7f737058d89 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -54,6 +54,7 @@ #include <net/dst.h> #include <net/sock_reuseport.h> #include <net/busy_poll.h> +#include <net/tcp.h> /** * sk_filter_trim_cap - run a packet through a socket filter @@ -352,7 +353,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * bpf_convert_filter - convert filter program * @prog: the user passed filter program * @len: the length of the user passed filter program - * @new_prog: buffer where converted program will be stored + * @new_prog: allocated 'struct bpf_prog' or NULL * @new_len: pointer to store length of converted program * * Remap 'sock_filter' style 
classic BPF (cBPF) instruction set to 'bpf_insn' @@ -364,14 +365,13 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * * 2) 2nd pass to remap in two passes: 1st pass finds new * jump offsets, 2nd pass remapping: - * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); */ static int bpf_convert_filter(struct sock_filter *prog, int len, - struct bpf_insn *new_prog, int *new_len) + struct bpf_prog *new_prog, int *new_len) { - int new_flen = 0, pass = 0, target, i; - struct bpf_insn *new_insn; + int new_flen = 0, pass = 0, target, i, stack_off; + struct bpf_insn *new_insn, *first_insn = NULL; struct sock_filter *fp; int *addrs = NULL; u8 bpf_src; @@ -383,6 +383,7 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, return -EINVAL; if (new_prog) { + first_insn = new_prog->insnsi; addrs = kcalloc(len, sizeof(*addrs), GFP_KERNEL | __GFP_NOWARN); if (!addrs) @@ -390,11 +391,11 @@ static int bpf_convert_filter(struct sock_filter *prog, int len, } do_pass: - new_insn = new_prog; + new_insn = first_insn; fp = prog; /* Classic BPF related prologue emission. */ - if (new_insn) { + if (new_prog) { /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ @@ -415,7 +416,7 @@ do_pass: struct bpf_insn *insn = tmp_insns; if (addrs) - addrs[i] = new_insn - new_prog; + addrs[i] = new_insn - first_insn; switch (fp->code) { /* All arithmetic insns and skb loads map as-is. */ @@ -561,17 +562,25 @@ do_pass: /* Store to stack. */ case BPF_ST: case BPF_STX: + stack_off = fp->k * 4 + 4; *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == BPF_ST ? BPF_REG_A : BPF_REG_X, - -(BPF_MEMWORDS - fp->k) * 4); + -stack_off); + /* check_load_and_stores() verifies that classic BPF can + * load from stack only after write, so tracking + * stack_depth for ST|STX insns is enough + */ + if (new_prog && new_prog->aux->stack_depth < stack_off) + new_prog->aux->stack_depth = stack_off; break; /* Load from stack. */ case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: + stack_off = fp->k * 4 + 4; *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? BPF_REG_A : BPF_REG_X, BPF_REG_FP, - -(BPF_MEMWORDS - fp->k) * 4); + -stack_off); break; /* A = K or X = K */ @@ -619,13 +628,13 @@ do_pass: if (!new_prog) { /* Only calculating new length. */ - *new_len = new_insn - new_prog; + *new_len = new_insn - first_insn; return 0; } pass++; - if (new_flen != new_insn - new_prog) { - new_flen = new_insn - new_prog; + if (new_flen != new_insn - first_insn) { + new_flen = new_insn - first_insn; if (pass > 2) goto err; goto do_pass; @@ -1017,7 +1026,7 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) fp->len = new_len; /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ - err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); + err = bpf_convert_filter(old_prog, old_len, fp, &new_len); if (err) /* 2nd bpf_convert_filter() can fail only if it fails * to allocate memory, remapping must succeed. Note, @@ -1866,6 +1875,24 @@ static const struct bpf_func_proto bpf_set_hash_invalid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) +{ + /* Set user specified hash as L4(+), so that it gets returned + * on skb_get_hash() call unless BPF prog later on triggers a + * skb_clear_hash(). 
+ */ + __skb_set_sw_hash(skb, hash, true); + return 0; +} + +static const struct bpf_func_proto bpf_set_hash_proto = { + .func = bpf_set_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, u16, vlan_tci) { @@ -1985,7 +2012,7 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); - u32 off = skb->network_header - skb->mac_header; + u32 off = skb_mac_header_len(skb); int ret; ret = skb_cow(skb, len_diff); @@ -2021,7 +2048,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) static int bpf_skb_proto_6_to_4(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); - u32 off = skb->network_header - skb->mac_header; + u32 off = skb_mac_header_len(skb); int ret; ret = skb_unclone(skb, GFP_ATOMIC); @@ -2127,6 +2154,124 @@ static const struct bpf_func_proto bpf_skb_change_type_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 bpf_skb_net_base_len(const struct sk_buff *skb) +{ + switch (skb->protocol) { + case htons(ETH_P_IP): + return sizeof(struct iphdr); + case htons(ETH_P_IPV6): + return sizeof(struct ipv6hdr); + default: + return ~0U; + } +} + +static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) +{ + u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); + int ret; + + ret = skb_cow(skb, len_diff); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_push(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* Due to header grow, MSS needs to be downgraded. */ + skb_shinfo(skb)->gso_size -= len_diff; + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + return 0; +} + +static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) +{ + u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); + int ret; + + ret = skb_unclone(skb, GFP_ATOMIC); + if (unlikely(ret < 0)) + return ret; + + ret = bpf_skb_net_hdr_pop(skb, off, len_diff); + if (unlikely(ret < 0)) + return ret; + + if (skb_is_gso(skb)) { + /* Due to header shrink, MSS can be upgraded. */ + skb_shinfo(skb)->gso_size += len_diff; + /* Header must be checked, and gso_segs recomputed. */ + skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; + skb_shinfo(skb)->gso_segs = 0; + } + + return 0; +} + +static u32 __bpf_skb_max_len(const struct sk_buff *skb) +{ + return skb->dev->mtu + skb->dev->hard_header_len; +} + +static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) +{ + bool trans_same = skb->transport_header == skb->network_header; + u32 len_cur, len_diff_abs = abs(len_diff); + u32 len_min = bpf_skb_net_base_len(skb); + u32 len_max = __bpf_skb_max_len(skb); + __be16 proto = skb->protocol; + bool shrink = len_diff < 0; + int ret; + + if (unlikely(len_diff_abs > 0xfffU)) + return -EFAULT; + if (unlikely(proto != htons(ETH_P_IP) && + proto != htons(ETH_P_IPV6))) + return -ENOTSUPP; + + len_cur = skb->len - skb_network_offset(skb); + if (skb_transport_header_was_set(skb) && !trans_same) + len_cur = skb_network_header_len(skb); + if ((shrink && (len_diff_abs >= len_cur || + len_cur - len_diff_abs < len_min)) || + (!shrink && (skb->len + len_diff_abs > len_max && + !skb_is_gso(skb)))) + return -ENOTSUPP; + + ret = shrink ? 
bpf_skb_net_shrink(skb, len_diff_abs) : + bpf_skb_net_grow(skb, len_diff_abs); + + bpf_compute_data_end(skb); + return ret; +} + +BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, + u32, mode, u64, flags) +{ + if (unlikely(flags)) + return -EINVAL; + if (likely(mode == BPF_ADJ_ROOM_NET)) + return bpf_skb_adjust_net(skb, len_diff); + + return -ENOTSUPP; +} + +static const struct bpf_func_proto bpf_skb_adjust_room_proto = { + .func = bpf_skb_adjust_room, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_ANYTHING, +}; + static u32 __bpf_skb_min_len(const struct sk_buff *skb) { u32 min_len = skb_network_offset(skb); @@ -2139,11 +2284,6 @@ static u32 __bpf_skb_min_len(const struct sk_buff *skb) return min_len; } -static u32 __bpf_skb_max_len(const struct sk_buff *skb) -{ - return skb->dev->mtu + skb->dev->hard_header_len; -} - static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) { unsigned int old_len = skb->len; @@ -2280,6 +2420,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_skb_change_proto || func == bpf_skb_change_head || func == bpf_skb_change_tail || + func == bpf_skb_adjust_room || func == bpf_skb_pull_data || func == bpf_clone_redirect || func == bpf_l3_csum_replace || @@ -2539,6 +2680,7 @@ bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) * that is holding verifier mutex. */ md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + METADATA_IP_TUNNEL, GFP_KERNEL); if (!md_dst) return NULL; @@ -2645,6 +2787,110 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = 0; + int val; + + if (!sk_fullsock(sk)) + return -EINVAL; + + if (level == SOL_SOCKET) { + if (optlen != sizeof(int)) + return -EINVAL; + val = *((int *)optval); + + /* Only some socketops are supported */ + switch (optname) { + case SO_RCVBUF: + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); + break; + case SO_SNDBUF: + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); + break; + case SO_MAX_PACING_RATE: + sk->sk_max_pacing_rate = val; + sk->sk_pacing_rate = min(sk->sk_pacing_rate, + sk->sk_max_pacing_rate); + break; + case SO_PRIORITY: + sk->sk_priority = val; + break; + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ?
: 1; + break; + case SO_MARK: + sk->sk_mark = val; + break; + default: + ret = -EINVAL; + } +#ifdef CONFIG_INET + } else if (level == SOL_TCP && + sk->sk_prot->setsockopt == tcp_setsockopt) { + if (optname == TCP_CONGESTION) { + char name[TCP_CA_NAME_MAX]; + + strncpy(name, optval, min_t(long, optlen, + TCP_CA_NAME_MAX-1)); + name[TCP_CA_NAME_MAX-1] = 0; + ret = tcp_set_congestion_control(sk, name, false); + if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + /* replacing an existing ca */ + tcp_reinit_congestion_control(sk, + inet_csk(sk)->icsk_ca_ops); + } else { + struct tcp_sock *tp = tcp_sk(sk); + + if (optlen != sizeof(int)) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > 0) + ret = -EINVAL; + else + tp->snd_cwnd = val; + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) { + ret = -EINVAL; + } else { + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + } + break; + default: + ret = -EINVAL; + } + } +#endif + } else { + ret = -EINVAL; + } + return ret; +} + +static const struct bpf_func_proto bpf_setsockopt_proto = { + .func = bpf_setsockopt, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -2718,6 +2964,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_change_proto_proto; case BPF_FUNC_skb_change_type: return &bpf_skb_change_type_proto; + case BPF_FUNC_skb_adjust_room: + return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; case BPF_FUNC_skb_get_tunnel_key: @@ -2736,6 +2984,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_hash_recalc_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; + case BPF_FUNC_set_hash: + return &bpf_set_hash_proto; case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; case BPF_FUNC_get_smp_processor_id: @@ -2767,12 +3017,6 @@ xdp_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -cg_skb_func_proto(enum bpf_func_id func_id) -{ - return sk_filter_func_proto(func_id); -} - -static const struct bpf_func_proto * lwt_inout_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -2800,6 +3044,17 @@ lwt_inout_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * + sock_ops_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_setsockopt_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -2834,8 +3089,11 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) } } -static bool __is_valid_access(int off, int size) +static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct __sk_buff)) return false; @@ -2844,15 +3102,25 @@ static bool __is_valid_access(int off, int size) return false; switch (off) { - case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: - if (off + size > - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32)) + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; - default: - if (size != sizeof(__u32)) + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (size != size_default) return false; + break; + default: + /* Only narrow read access allowed for now. */ + if (type == BPF_WRITE) { + if (size != size_default) + return false; + } else { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } } return true; @@ -2860,43 +3128,41 @@ static bool __is_valid_access(int off, int size) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { switch (off) { - case offsetof(struct __sk_buff, tc_classid): - case offsetof(struct __sk_buff, data): - case offsetof(struct __sk_buff, data_end): + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): return false; } if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; } } - return __is_valid_access(off, size); + return bpf_skb_is_valid_access(off, size, type, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { switch (off) { - case offsetof(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, tc_classid): return false; } if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, mark): - case offsetof(struct __sk_buff, priority): - case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; @@ -2904,20 +3170,20 @@ static bool lwt_is_valid_access(int off, int size, } switch (off) { - case offsetof(struct __sk_buff, data): - *reg_type = PTR_TO_PACKET; + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct __sk_buff, data_end): - *reg_type = PTR_TO_PACKET_END; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; break; } - return __is_valid_access(off, size); + return bpf_skb_is_valid_access(off, size, type, info); } static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { @@ -2980,16 +3246,15 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { switch (off) { - case offsetof(struct __sk_buff, mark): - case offsetof(struct __sk_buff, tc_index): - case offsetof(struct __sk_buff, priority): - case offsetof(struct __sk_buff, cb[0]) ... 
- offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: - case offsetof(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; default: return false; @@ -2997,15 +3262,15 @@ static bool tc_cls_act_is_valid_access(int off, int size, } switch (off) { - case offsetof(struct __sk_buff, data): - *reg_type = PTR_TO_PACKET; + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; break; - case offsetof(struct __sk_buff, data_end): - *reg_type = PTR_TO_PACKET_END; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; break; } - return __is_valid_access(off, size); + return bpf_skb_is_valid_access(off, size, type, info); } static bool __is_valid_xdp_access(int off, int size) @@ -3022,17 +3287,17 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, - enum bpf_reg_type *reg_type) + struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) return false; switch (off) { case offsetof(struct xdp_md, data): - *reg_type = PTR_TO_PACKET; + info->reg_type = PTR_TO_PACKET; break; case offsetof(struct xdp_md, data_end): - *reg_type = PTR_TO_PACKET_END; + info->reg_type = PTR_TO_PACKET_END; break; } @@ -3045,101 +3310,141 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool __is_valid_sock_ops_access(int off, int size) +{ + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) + return false; + /* The verifier guarantees that size > 0. */ + if (off % size != 0) + return false; + if (size != sizeof(__u32)) + return false; + + return true; +} + +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (type == BPF_WRITE) { + switch (off) { + case offsetof(struct bpf_sock_ops, op) ... 
+ offsetof(struct bpf_sock_ops, replylong[3]): + break; + default: + return false; + } + } + + return __is_valid_sock_ops_access(off, size); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; int off; switch (si->off) { case offsetof(struct __sk_buff, len): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, len)); + bpf_target_off(struct sk_buff, len, 4, + target_size)); break; case offsetof(struct __sk_buff, protocol): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, protocol)); + bpf_target_off(struct sk_buff, protocol, 2, + target_size)); break; case offsetof(struct __sk_buff, vlan_proto): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, vlan_proto)); + bpf_target_off(struct sk_buff, vlan_proto, 2, + target_size)); break; case offsetof(struct __sk_buff, priority): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); - if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, priority)); + bpf_target_off(struct sk_buff, priority, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, priority)); + bpf_target_off(struct sk_buff, priority, 4, + target_size)); break; case offsetof(struct __sk_buff, ingress_ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, skb_iif)); + bpf_target_off(struct sk_buff, skb_iif, 4, + target_size)); break; case offsetof(struct __sk_buff, ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - offsetof(struct net_device, ifindex)); + bpf_target_off(struct net_device, ifindex, 4, + target_size)); break; case offsetof(struct __sk_buff, hash): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, hash)); + bpf_target_off(struct sk_buff, hash, 4, + target_size)); break; case offsetof(struct __sk_buff, mark): - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, mark)); + bpf_target_off(struct sk_buff, mark, 4, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, mark)); + bpf_target_off(struct sk_buff, mark, 4, + target_size)); break; case offsetof(struct __sk_buff, pkt_type): - return convert_skb_access(SKF_AD_PKTTYPE, si->dst_reg, - si->src_reg, insn); + *target_size = 1; + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, + PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); +#endif + break; case offsetof(struct __sk_buff, queue_mapping): - return convert_skb_access(SKF_AD_QUEUE, si->dst_reg, - si->src_reg, 
insn); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, queue_mapping, 2, + target_size)); + break; case offsetof(struct __sk_buff, vlan_present): - return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, - si->dst_reg, si->src_reg, insn); - case offsetof(struct __sk_buff, vlan_tci): - return convert_skb_access(SKF_AD_VLAN_TAG, - si->dst_reg, si->src_reg, insn); + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, vlan_tci, 2, + target_size)); + if (si->off == offsetof(struct __sk_buff, vlan_tci)) { + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, + ~VLAN_TAG_PRESENT); + } else { + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + } + break; case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]) + sizeof(__u32) - 1: + offsetofend(struct __sk_buff, cb[4]) - 1: BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); BUILD_BUG_ON((offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data)) % @@ -3165,6 +3470,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off -= offsetof(struct __sk_buff, tc_classid); off += offsetof(struct sk_buff, cb); off += offsetof(struct qdisc_skb_cb, tc_classid); + *target_size = 2; if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, off); @@ -3190,14 +3496,14 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); - if (type == BPF_WRITE) *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, tc_index)); + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); else *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, tc_index)); + bpf_target_off(struct sk_buff, tc_index, 2, + target_size)); #else if (type == BPF_WRITE) *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); @@ -3208,10 +3514,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, napi_id): #if defined(CONFIG_NET_RX_BUSY_POLL) - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4); - *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, - offsetof(struct sk_buff, napi_id)); + bpf_target_off(struct sk_buff, napi_id, 4, + target_size)); *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #else @@ -3226,7 +3531,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; @@ -3270,22 +3575,22 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; switch (si->off) { case offsetof(struct __sk_buff, ifindex): - BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), si->dst_reg, si->src_reg, offsetof(struct sk_buff, dev)); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, - offsetof(struct net_device, ifindex)); + bpf_target_off(struct 
net_device, ifindex, 4, + target_size)); break; default: - return bpf_convert_ctx_access(type, si, insn_buf, prog); + return bpf_convert_ctx_access(type, si, insn_buf, prog, + target_size); } return insn - insn_buf; @@ -3294,7 +3599,7 @@ static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, static u32 xdp_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, - struct bpf_prog *prog) + struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; @@ -3314,6 +3619,139 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_ops, op) ... + offsetof(struct bpf_sock_ops, replylong[3]): + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, op)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, reply)); + BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) != + FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong)); + off = si->off; + off -= offsetof(struct bpf_sock_ops, op); + off += offsetof(struct bpf_sock_ops_kern, op); + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + off); + break; + + case offsetof(struct bpf_sock_ops, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_family)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_daddr)); + break; + + case offsetof(struct bpf_sock_ops, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_rcv_saddr)); + break; + + case offsetof(struct bpf_sock_ops, remote_ip6[0]) ... + offsetof(struct bpf_sock_ops, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct bpf_sock_ops, remote_ip6[0]); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock_ops, local_ip6[0]) ... 
+	     offsetof(struct bpf_sock_ops, local_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+					  skc_v6_rcv_saddr.s6_addr32[0]) != 4);
+
+		off = si->off;
+		off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+					      struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common,
+					       skc_v6_rcv_saddr.s6_addr32[0]) +
+				      off);
+#else
+		*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+		break;
+
+	case offsetof(struct bpf_sock_ops, remote_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+					      struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_dport));
+#ifndef __BIG_ENDIAN_BITFIELD
+		*insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
+#endif
+		break;
+
+	case offsetof(struct bpf_sock_ops, local_port):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+					      struct bpf_sock_ops_kern, sk),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_sock_ops_kern, sk));
+		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
+				      offsetof(struct sock_common, skc_num));
+		break;
+	}
+	return insn - insn_buf;
+}
+
 const struct bpf_verifier_ops sk_filter_prog_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
@@ -3336,7 +3774,7 @@ const struct bpf_verifier_ops xdp_prog_ops = {
 };

 const struct bpf_verifier_ops cg_skb_prog_ops = {
-	.get_func_proto		= cg_skb_func_proto,
+	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 	.test_run		= bpf_prog_test_run_skb,
@@ -3363,6 +3801,12 @@ const struct bpf_verifier_ops cg_sock_prog_ops = {
 	.convert_ctx_access	= sock_filter_convert_ctx_access,
 };

+const struct bpf_verifier_ops sock_ops_prog_ops = {
+	.get_func_proto		= sock_ops_func_proto,
+	.is_valid_access	= sock_ops_is_valid_access,
+	.convert_ctx_access	= sock_ops_convert_ctx_access,
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 28d94bce4df8..fc5fc4594c90 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -18,6 +18,7 @@
 #include <linux/stddef.h>
 #include <linux/if_ether.h>
 #include <linux/mpls.h>
+#include <linux/tcp.h>

 #include <net/flow_dissector.h>
 #include <scsi/fc/fc_fcoe.h>
@@ -342,6 +343,64 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
 	return FLOW_DISSECT_RET_OUT_PROTO_AGAIN;
 }

+static void
+__skb_flow_dissect_tcp(const struct sk_buff *skb,
+		       struct flow_dissector *flow_dissector,
+		       void *target_container, void *data, int thoff, int hlen)
+{
+	struct flow_dissector_key_tcp *key_tcp;
+	struct tcphdr *th, _th;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP))
+		return;
+
+	th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th);
+	if (!th)
+		return;
+
+	if (unlikely(__tcp_hdrlen(th) < sizeof(_th)))
+		return;
+
+	key_tcp = skb_flow_dissector_target(flow_dissector,
+					    FLOW_DISSECTOR_KEY_TCP,
+					    target_container);
+	key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF));
+}
+
+static void
+__skb_flow_dissect_ipv4(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data, const struct iphdr *iph)
+{
+	struct flow_dissector_key_ip *key_ip;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP))
+		return;
+
+	key_ip = skb_flow_dissector_target(flow_dissector,
+					   FLOW_DISSECTOR_KEY_IP,
+					   target_container);
+	key_ip->tos = iph->tos;
+	key_ip->ttl = iph->ttl;
+}
+
+static void
+__skb_flow_dissect_ipv6(const struct sk_buff *skb,
+			struct flow_dissector *flow_dissector,
+			void *target_container, void *data, const struct ipv6hdr *iph)
+{
+	struct flow_dissector_key_ip *key_ip;
+
+	if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP))
+		return;
+
+	key_ip = skb_flow_dissector_target(flow_dissector,
+					   FLOW_DISSECTOR_KEY_IP,
+					   target_container);
+	key_ip->tos = ipv6_get_dsfield(iph);
+	key_ip->ttl = iph->hop_limit;
+}
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
@@ -444,6 +503,9 @@ ip:
 		}
 	}

+	__skb_flow_dissect_ipv4(skb, flow_dissector,
+				target_container, data, iph);
+
 	if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
 		goto out_good;

@@ -489,6 +551,9 @@ ipv6:
 			goto out_good;
 		}

+	__skb_flow_dissect_ipv6(skb, flow_dissector,
+				target_container, data, iph);
+
 	if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
 		goto out_good;

@@ -683,6 +748,10 @@ ip_proto_again:
 	case IPPROTO_MPLS:
 		proto = htons(ETH_P_MPLS_UC);
 		goto mpls;
+	case IPPROTO_TCP:
+		__skb_flow_dissect_tcp(skb, flow_dissector, target_container,
+				       data, nhoff, hlen);
+		break;
 	default:
 		break;
 	}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index b3bc0a31af9f..1307731ddfe4 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -240,7 +240,8 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {

 static int bpf_build_state(struct nlattr *nla,
 			   unsigned int family, const void *cfg,
-			   struct lwtunnel_state **ts)
+			   struct lwtunnel_state **ts,
+			   struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[LWT_BPF_MAX + 1];
 	struct lwtunnel_state *newts;
@@ -250,7 +251,7 @@ static int bpf_build_state(struct nlattr *nla,
 	if (family != AF_INET && family != AF_INET6)
 		return -EAFNOSUPPORT;

-	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, NULL);
+	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
 	if (ret < 0)
 		return ret;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index cfae3d5fe11f..d9cb3532f1dd 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -103,37 +103,53 @@ EXPORT_SYMBOL(lwtunnel_encap_del_ops);

 int lwtunnel_build_state(u16 encap_type,
 			 struct nlattr *encap, unsigned int family,
-			 const void *cfg, struct lwtunnel_state **lws)
+			 const void *cfg, struct lwtunnel_state **lws,
+			 struct netlink_ext_ack *extack)
 {
 	const struct lwtunnel_encap_ops *ops;
+	bool found = false;
 	int ret = -EINVAL;

 	if (encap_type == LWTUNNEL_ENCAP_NONE ||
-	    encap_type > LWTUNNEL_ENCAP_MAX)
+	    encap_type > LWTUNNEL_ENCAP_MAX) {
+		NL_SET_ERR_MSG_ATTR(extack, encap,
+				    "Unknown LWT encapsulation type");
 		return ret;
+	}

 	ret = -EOPNOTSUPP;
 	rcu_read_lock();
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
 	if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
-		ret = ops->build_state(encap, family, cfg, lws);
+		found = true;
+		ret = ops->build_state(encap, family, cfg, lws, extack);
 		if (ret)
 			module_put(ops->owner);
 	}
 	rcu_read_unlock();

+	/* don't rely on -EOPNOTSUPP to detect match as build_state
+	 * handlers could return it
+	 */
+	if (!found) {
+		NL_SET_ERR_MSG_ATTR(extack, encap,
+				    "LWT encapsulation type not supported");
+	}
+
 	return ret;
 }
 EXPORT_SYMBOL(lwtunnel_build_state);

-int lwtunnel_valid_encap_type(u16 encap_type)
+int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
 {
 	const struct lwtunnel_encap_ops *ops;
 	int ret = -EINVAL;

 	if (encap_type == LWTUNNEL_ENCAP_NONE ||
-	    encap_type > LWTUNNEL_ENCAP_MAX)
+	    encap_type > LWTUNNEL_ENCAP_MAX) {
+		NL_SET_ERR_MSG(extack, "Unknown lwt encapsulation type");
 		return ret;
+	}

 	rcu_read_lock();
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
@@ -153,11 +169,16 @@ int lwtunnel_valid_encap_type(u16 encap_type)
 		}
 	}
 #endif
-	return ops ? 0 : -EOPNOTSUPP;
+	ret = ops ? 0 : -EOPNOTSUPP;
+	if (ret < 0)
+		NL_SET_ERR_MSG(extack, "lwt encapsulation type not supported");
+
+	return ret;
 }
 EXPORT_SYMBOL(lwtunnel_valid_encap_type);

-int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)
+int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
+				   struct netlink_ext_ack *extack)
 {
 	struct rtnexthop *rtnh = (struct rtnexthop *)attr;
 	struct nlattr *nla_entype;
@@ -174,7 +195,8 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)

 		if (nla_entype) {
 			encap_type = nla_get_u16(nla_entype);

-			if (lwtunnel_valid_encap_type(encap_type) != 0)
+			if (lwtunnel_valid_encap_type(encap_type,
+						      extack) != 0)
 				return -EOPNOTSUPP;
 		}
 	}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d274f81fcc2c..e31fc11a8000 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -118,6 +118,50 @@ unsigned long neigh_rand_reach_time(unsigned long base)
 }
 EXPORT_SYMBOL(neigh_rand_reach_time);

+static bool neigh_del(struct neighbour *n, __u8 state,
+		      struct neighbour __rcu **np, struct neigh_table *tbl)
+{
+	bool retval = false;
+
+	write_lock(&n->lock);
+	if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state)) {
+		struct neighbour *neigh;
+
+		neigh = rcu_dereference_protected(n->next,
+						  lockdep_is_held(&tbl->lock));
+		rcu_assign_pointer(*np, neigh);
+		n->dead = 1;
+		retval = true;
+	}
+	write_unlock(&n->lock);
+	if (retval)
+		neigh_cleanup_and_release(n);
+	return retval;
+}
+
+bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
+{
+	struct neigh_hash_table *nht;
+	void *pkey = ndel->primary_key;
+	u32 hash_val;
+	struct neighbour *n;
+	struct neighbour __rcu **np;
+
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+	hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
+	hash_val = hash_val >> (32 - nht->hash_shift);
+
+	np = &nht->hash_buckets[hash_val];
+	while ((n = rcu_dereference_protected(*np,
+					      lockdep_is_held(&tbl->lock)))) {
+		if (n == ndel)
+			return neigh_del(n, 0, np, tbl);
+		np = &n->next;
+	}
+	return false;
+}
+
 static int neigh_forced_gc(struct neigh_table *tbl)
 {
 	int shrunk = 0;
@@ -140,19 +184,10 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 			 *   - nobody refers to it.
 			 *   - it is not permanent
 			 */
-			write_lock(&n->lock);
-			if (atomic_read(&n->refcnt) == 1 &&
-			    !(n->nud_state & NUD_PERMANENT)) {
-				rcu_assign_pointer(*np,
-					rcu_dereference_protected(n->next,
-						  lockdep_is_held(&tbl->lock)));
-				n->dead = 1;
-				shrunk	= 1;
-				write_unlock(&n->lock);
-				neigh_cleanup_and_release(n);
+			if (neigh_del(n, NUD_PERMANENT, np, tbl)) {
+				shrunk = 1;
 				continue;
 			}
-			write_unlock(&n->lock);
 			np = &n->next;
 		}
 	}
@@ -219,7 +254,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 			neigh_del_timer(n);
 			n->dead = 1;

-			if (atomic_read(&n->refcnt) != 1) {
+			if (refcount_read(&n->refcnt) != 1) {
 				/* The most unpleasant situation.
 				   We must destroy neighbour entry,
 				   but someone still uses it.
@@ -300,7 +335,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
 	NEIGH_CACHE_STAT_INC(tbl, allocs);
 	n->tbl		  = tbl;
-	atomic_set(&n->refcnt, 1);
+	refcount_set(&n->refcnt, 1);
 	n->dead		  = 1;
 out:
 	return n;
@@ -409,7 +444,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 	rcu_read_lock_bh();
 	n = __neigh_lookup_noref(tbl, pkey, dev);
 	if (n) {
-		if (!atomic_inc_not_zero(&n->refcnt))
+		if (!refcount_inc_not_zero(&n->refcnt))
 			n = NULL;
 		NEIGH_CACHE_STAT_INC(tbl, hits);
 	}
@@ -438,7 +473,7 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
 	     n = rcu_dereference_bh(n->next)) {
 		if (!memcmp(n->primary_key, pkey, key_len) &&
 		    net_eq(dev_net(n->dev), net)) {
-			if (!atomic_inc_not_zero(&n->refcnt))
+			if (!refcount_inc_not_zero(&n->refcnt))
 				n = NULL;
 			NEIGH_CACHE_STAT_INC(tbl, hits);
 			break;
@@ -674,7 +709,7 @@ static void neigh_parms_destroy(struct neigh_parms *parms);

 static inline void neigh_parms_put(struct neigh_parms *parms)
 {
-	if (atomic_dec_and_test(&parms->refcnt))
+	if (refcount_dec_and_test(&parms->refcnt))
 		neigh_parms_destroy(parms);
 }

@@ -786,7 +821,7 @@ static void neigh_periodic_work(struct work_struct *work)
 			if (time_before(n->used, n->confirmed))
 				n->used = n->confirmed;

-			if (atomic_read(&n->refcnt) == 1 &&
+			if (refcount_read(&n->refcnt) == 1 &&
 			    (state == NUD_FAILED ||
 			     time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
 				*np = n->next;
@@ -1444,7 +1479,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 	p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
 	if (p) {
 		p->tbl		  = tbl;
-		atomic_set(&p->refcnt, 1);
+		refcount_set(&p->refcnt, 1);
 		p->reachable_time =
 				neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
 		dev_hold(dev);
@@ -1507,7 +1542,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
 	INIT_LIST_HEAD(&tbl->parms_list);
 	list_add(&tbl->parms.list, &tbl->parms_list);
 	write_pnet(&tbl->parms.net, &init_net);
-	atomic_set(&tbl->parms.refcnt, 1);
+	refcount_set(&tbl->parms.refcnt, 1);
 	tbl->parms.reachable_time =
 			  neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));

@@ -1649,7 +1684,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
 			   NEIGH_UPDATE_F_OVERRIDE |
 			   NEIGH_UPDATE_F_ADMIN,
 			   NETLINK_CB(skb).portid);
+	write_lock_bh(&tbl->lock);
 	neigh_release(neigh);
+	neigh_remove_one(neigh, tbl);
+	write_unlock_bh(&tbl->lock);

 out:
 	return err;
@@ -1758,7 +1796,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)

 	if ((parms->dev &&
 	     nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
-	    nla_put_u32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt)) ||
+	    nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||
 	    nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
 			NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
 	    /* approximative value for deprecated QUEUE_LEN (in packets) */
@@ -2196,7 +2234,7 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
 		ci.ndm_used	 = jiffies_to_clock_t(now - neigh->used);
 		ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
 		ci.ndm_updated	 = jiffies_to_clock_t(now - neigh->updated);
-		ci.ndm_refcnt	 = atomic_read(&neigh->refcnt) - 1;
+		ci.ndm_refcnt	 = refcount_read(&neigh->refcnt) - 1;
 		read_unlock_bh(&neigh->lock);

 		if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 14d09345f00d..4847964931df 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -363,15 +363,10 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v)

 	netif_addr_lock_bh(dev);
 	netdev_for_each_mc_addr(ha, dev) {
-		int i;
-
-		seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex,
-			   dev->name, ha->refcount, ha->global_use);
-
-		for (i = 0; i < dev->addr_len; i++)
-			seq_printf(seq, "%02x", ha->addr[i]);
-
-		seq_putc(seq, '\n');
+		seq_printf(seq, "%-4d %-15s %-5d %-5d %*phN\n",
+			   dev->ifindex, dev->name,
+			   ha->refcount, ha->global_use,
+			   (int)dev->addr_len, ha->addr);
 	}
 	netif_addr_unlock_bh(dev);
 	return 0;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 65ea0ff4017c..b4f9922b6f23 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -323,7 +323,11 @@ NETDEVICE_SHOW_RW(flags, fmt_hex);

 static int change_tx_queue_len(struct net_device *dev, unsigned long new_len)
 {
-	int res, orig_len = dev->tx_queue_len;
+	unsigned int orig_len = dev->tx_queue_len;
+	int res;
+
+	if (new_len != (unsigned int)new_len)
+		return -ERANGE;

 	if (new_len != orig_len) {
 		dev->tx_queue_len = new_len;
@@ -349,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev,

 	return netdev_store(dev, attr, buf, len, change_tx_queue_len);
 }
-NETDEVICE_SHOW_RW(tx_queue_len, fmt_ulong);
+NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);

 static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
 {
@@ -622,7 +626,7 @@ static struct attribute *netstat_attrs[] = {
 };


-static struct attribute_group netstat_group = {
+static const struct attribute_group netstat_group = {
 	.name  = "statistics",
 	.attrs  = netstat_attrs,
 };
@@ -632,7 +636,7 @@ static struct attribute *wireless_attrs[] = {
 	NULL
 };

-static struct attribute_group wireless_group = {
+static const struct attribute_group wireless_group = {
 	.name = "wireless",
 	.attrs = wireless_attrs,
 };
@@ -1200,7 +1204,7 @@ static struct attribute *dql_attrs[] = {
 	NULL
 };

-static struct attribute_group dql_group = {
+static const struct attribute_group dql_group = {
 	.name  = "byte_queue_limits",
 	.attrs  = dql_attrs,
 };
@@ -1444,7 +1448,7 @@ static void *net_grab_current_ns(void)
 	struct net *ns = current->nsproxy->net_ns;
 #ifdef CONFIG_NET_NS
 	if (ns)
-		atomic_inc(&ns->passive);
+		refcount_inc(&ns->passive);
 #endif
 	return ns;
 }
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 26bbfababff2..8726d051f31d 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -284,7 +284,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	LIST_HEAD(net_exit_list);

 	atomic_set(&net->count, 1);
-	atomic_set(&net->passive, 1);
+	refcount_set(&net->passive, 1);
 	net->dev_base_seq = 1;
 	net->user_ns = user_ns;
 	idr_init(&net->netns_ids);
@@ -380,7 +380,7 @@ static void net_free(struct net *net)
 void net_drop_ns(void *p)
 {
 	struct net *ns = p;
-	if (ns && atomic_dec_and_test(&ns->passive))
+	if (ns && refcount_dec_and_test(&ns->passive))
 		net_free(ns);
 }

@@ -501,6 +501,23 @@ static void cleanup_net(struct work_struct *work)
 		net_drop_ns(net);
 	}
 }
+
+/**
+ * net_ns_barrier - wait until concurrent net_cleanup_work is done
+ *
+ * cleanup_net runs from work queue and will first remove namespaces
+ * from the global list, then run net exit functions.
+ *
+ * Call this in module exit path to make sure that all netns
+ * ->exit ops have been invoked before the function is removed.
+ */
+void net_ns_barrier(void)
+{
+	mutex_lock(&net_mutex);
+	mutex_unlock(&net_mutex);
+}
+EXPORT_SYMBOL(net_ns_barrier);
+
 static DECLARE_WORK(net_cleanup_work, cleanup_net);

 void __put_net(struct net *net)
@@ -596,6 +613,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tb[NETNSA_MAX + 1];
+	struct nlattr *nla;
 	struct net *peer;
 	int nsid, err;

@@ -603,23 +621,35 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
 			  rtnl_net_policy, extack);
 	if (err < 0)
 		return err;
-	if (!tb[NETNSA_NSID])
+	if (!tb[NETNSA_NSID]) {
+		NL_SET_ERR_MSG(extack, "nsid is missing");
 		return -EINVAL;
+	}
 	nsid = nla_get_s32(tb[NETNSA_NSID]);

-	if (tb[NETNSA_PID])
+	if (tb[NETNSA_PID]) {
 		peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
-	else if (tb[NETNSA_FD])
+		nla = tb[NETNSA_PID];
+	} else if (tb[NETNSA_FD]) {
 		peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
-	else
+		nla = tb[NETNSA_FD];
+	} else {
+		NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
 		return -EINVAL;
-	if (IS_ERR(peer))
+	}
+	if (IS_ERR(peer)) {
+		NL_SET_BAD_ATTR(extack, nla);
+		NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
 		return PTR_ERR(peer);
+	}

 	spin_lock_bh(&net->nsid_lock);
 	if (__peernet2id(net, peer) >= 0) {
 		spin_unlock_bh(&net->nsid_lock);
 		err = -EEXIST;
+		NL_SET_BAD_ATTR(extack, nla);
+		NL_SET_ERR_MSG(extack,
+			       "Peer netns already has a nsid assigned");
 		goto out;
 	}

@@ -628,6 +658,10 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err >= 0) {
 		rtnl_net_notifyid(net, RTM_NEWNSID, err);
 		err = 0;
+	} else if (err == -ENOSPC && nsid >= 0) {
+		err = -EEXIST;
+		NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]);
+		NL_SET_ERR_MSG(extack, "The specified nsid is already used");
 	}
 out:
 	put_net(peer);
@@ -670,6 +704,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
 {
 	struct net *net = sock_net(skb->sk);
 	struct nlattr *tb[NETNSA_MAX + 1];
+	struct nlattr *nla;
 	struct sk_buff *msg;
 	struct net *peer;
 	int err, id;
@@ -678,15 +713,22 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
 			  rtnl_net_policy, extack);
 	if (err < 0)
 		return err;
-	if (tb[NETNSA_PID])
+	if (tb[NETNSA_PID]) {
 		peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
-	else if (tb[NETNSA_FD])
+		nla = tb[NETNSA_PID];
+	} else if (tb[NETNSA_FD]) {
 		peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
-	else
+		nla = tb[NETNSA_FD];
+	} else {
+		NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
 		return -EINVAL;
+	}

-	if (IS_ERR(peer))
+	if (IS_ERR(peer)) {
+		NL_SET_BAD_ATTR(extack, nla);
+		NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
 		return PTR_ERR(peer);
+	}

 	msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
 	if (!msg) {
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 29be2466970c..d3408a693166 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -277,7 +277,7 @@ static void zap_completion_queue(void)
 			struct sk_buff *skb = clist;
 			clist = clist->next;
 			if (!skb_irq_freeable(skb)) {
-				atomic_inc(&skb->users);
+				refcount_inc(&skb->users);
 				dev_kfree_skb_any(skb); /* put this one back */
 			} else {
 				__kfree_skb(skb);
@@ -309,7 +309,7 @@ repeat:
 		return NULL;
 	}

-	atomic_set(&skb->users, 1);
+	refcount_set(&skb->users, 1);
 	skb_reserve(skb, reserve);
 	return skb;
 }
@@ -441,7 +441,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 		ip6h->saddr = np->local_ip.in6;
 		ip6h->daddr = np->remote_ip.in6;

-		eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+		eth = skb_push(skb, ETH_HLEN);
 		skb_reset_mac_header(skb);
 		skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
 	} else {
@@ -470,7 +470,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 		put_unaligned(np->remote_ip.ip, &(iph->daddr));
 		iph->check    = ip_fast_csum((unsigned char *)iph, iph->ihl);

-		eth = (struct ethhdr *) skb_push(skb, ETH_HLEN);
+		eth = skb_push(skb, ETH_HLEN);
 		skb_reset_mac_header(skb);
 		skb->protocol = eth->h_proto = htons(ETH_P_IP);
 	}
@@ -632,7 +632,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
 		skb_queue_head_init(&npinfo->txq);
 		INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);

-		atomic_set(&npinfo->refcnt, 1);
+		refcount_set(&npinfo->refcnt, 1);

 		ops = np->dev->netdev_ops;
 		if (ops->ndo_netpoll_setup) {
@@ -642,7 +642,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
 		}
 	} else {
 		npinfo = rtnl_dereference(ndev->npinfo);
-		atomic_inc(&npinfo->refcnt);
+		refcount_inc(&npinfo->refcnt);
 	}

 	npinfo->netpoll = np;
@@ -821,7 +821,7 @@ void __netpoll_cleanup(struct netpoll *np)

 	synchronize_srcu(&netpoll_srcu);

-	if (atomic_dec_and_test(&npinfo->refcnt)) {
+	if (refcount_dec_and_test(&npinfo->refcnt)) {
 		const struct net_device_ops *ops;

 		ops = np->dev->netdev_ops;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 96947f5d41e4..6e1e10ff433a 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2675,7 +2675,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
 			goto err;
 		}
 		/* restore ll */
-		eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
+		eth = skb_push(skb, ETH_HLEN);
 		memcpy(eth, pkt_dev->hh, 2 * ETH_ALEN);
 		eth->h_proto = protocol;

@@ -2714,11 +2714,11 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
 	struct timeval timestamp;
 	struct pktgen_hdr *pgh;

-	pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
+	pgh = skb_put(skb, sizeof(*pgh));
 	datalen -= sizeof(*pgh);

 	if (pkt_dev->nfrags <= 0) {
-		memset(skb_put(skb, datalen), 0, datalen);
+		skb_put_zero(skb, datalen);
 	} else {
 		int frags = pkt_dev->nfrags;
 		int i, len;
@@ -2729,7 +2729,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
 			frags = MAX_SKB_FRAGS;
 		len = datalen - frags * PAGE_SIZE;
 		if (len > 0) {
-			memset(skb_put(skb, len), 0, len);
+			skb_put_zero(skb, len);
 			datalen = frags * PAGE_SIZE;
 		}

@@ -2844,34 +2844,35 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,

 	skb_reserve(skb, 16);

 	/*  Reserve for ethernet and IP header  */
-	eth = (__u8 *) skb_push(skb, 14);
-	mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+	eth = skb_push(skb, 14);
+	mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));
 	if (pkt_dev->nr_labels)
 		mpls_push(mpls, pkt_dev);

 	if (pkt_dev->vlan_id != 0xffff) {
 		if (pkt_dev->svlan_id != 0xffff) {
-			svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+			svlan_tci = skb_put(skb, sizeof(__be16));
 			*svlan_tci = build_tci(pkt_dev->svlan_id,
 					       pkt_dev->svlan_cfi,
 					       pkt_dev->svlan_p);
-			svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+			svlan_encapsulated_proto = skb_put(skb,
+							   sizeof(__be16));
 			*svlan_encapsulated_proto = htons(ETH_P_8021Q);
 		}
-		vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+		vlan_tci = skb_put(skb, sizeof(__be16));
 		*vlan_tci = build_tci(pkt_dev->vlan_id,
 				      pkt_dev->vlan_cfi,
 				      pkt_dev->vlan_p);
-		vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+		vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));
 		*vlan_encapsulated_proto = htons(ETH_P_IP);
 	}

 	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, skb->len);
-	iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr));
+	iph = skb_put(skb, sizeof(struct iphdr));

 	skb_set_transport_header(skb, skb->len);
-	udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
+	udph = skb_put(skb, sizeof(struct udphdr));
 	skb_set_queue_mapping(skb, queue_map);
 	skb->priority = pkt_dev->skb_priority;

@@ -2971,34 +2972,35 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,

 	skb_reserve(skb, 16);

 	/*  Reserve for ethernet and IP header  */
-	eth = (__u8 *) skb_push(skb, 14);
-	mpls = (__be32 *)skb_put(skb, pkt_dev->nr_labels*sizeof(__u32));
+	eth = skb_push(skb, 14);
+	mpls = skb_put(skb, pkt_dev->nr_labels * sizeof(__u32));
 	if (pkt_dev->nr_labels)
 		mpls_push(mpls, pkt_dev);

 	if (pkt_dev->vlan_id != 0xffff) {
 		if (pkt_dev->svlan_id != 0xffff) {
-			svlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+			svlan_tci = skb_put(skb, sizeof(__be16));
 			*svlan_tci = build_tci(pkt_dev->svlan_id,
 					       pkt_dev->svlan_cfi,
 					       pkt_dev->svlan_p);
-			svlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+			svlan_encapsulated_proto = skb_put(skb,
+							   sizeof(__be16));
 			*svlan_encapsulated_proto = htons(ETH_P_8021Q);
 		}
-		vlan_tci = (__be16 *)skb_put(skb, sizeof(__be16));
+		vlan_tci = skb_put(skb, sizeof(__be16));
 		*vlan_tci = build_tci(pkt_dev->vlan_id,
 				      pkt_dev->vlan_cfi,
 				      pkt_dev->vlan_p);
-		vlan_encapsulated_proto = (__be16 *)skb_put(skb, sizeof(__be16));
+		vlan_encapsulated_proto = skb_put(skb, sizeof(__be16));
 		*vlan_encapsulated_proto = htons(ETH_P_IPV6);
 	}

 	skb_reset_mac_header(skb);
 	skb_set_network_header(skb, skb->len);
-	iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
+	iph = skb_put(skb, sizeof(struct ipv6hdr));

 	skb_set_transport_header(skb, skb->len);
-	udph = (struct udphdr *) skb_put(skb, sizeof(struct udphdr));
+	udph = skb_put(skb, sizeof(struct udphdr));
 	skb_set_queue_mapping(skb, queue_map);
 	skb->priority = pkt_dev->skb_priority;

@@ -3361,7 +3363,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
 {
 	ktime_t idle_start = ktime_get();

-	while (atomic_read(&(pkt_dev->skb->users)) != 1) {
+	while (refcount_read(&(pkt_dev->skb->users)) != 1) {
 		if (signal_pending(current))
 			break;

@@ -3418,7 +3420,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 	if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
 		skb = pkt_dev->skb;
 		skb->protocol = eth_type_trans(skb, skb->dev);
-		atomic_add(burst, &skb->users);
+		refcount_add(burst, &skb->users);
 		local_bh_disable();
 		do {
 			ret = netif_receive_skb(skb);
@@ -3426,11 +3428,11 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 				pkt_dev->errors++;
 			pkt_dev->sofar++;
 			pkt_dev->seq_num++;
-			if (atomic_read(&skb->users) != burst) {
+			if (refcount_read(&skb->users) != burst) {
 				/* skb was queued by rps/rfs or taps,
 				 * so cannot reuse this skb
 				 */
-				atomic_sub(burst - 1, &skb->users);
+				WARN_ON(refcount_sub_and_test(burst - 1, &skb->users));
 				/* get out of the loop and wait
 				 * until skb is consumed
 				 */
@@ -3444,7 +3446,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		goto out; /* Skips xmit_mode M_START_XMIT */
 	} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
 		local_bh_disable();
-		atomic_inc(&pkt_dev->skb->users);
+		refcount_inc(&pkt_dev->skb->users);

 		ret = dev_queue_xmit(pkt_dev->skb);
 		switch (ret) {
@@ -3485,7 +3487,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 		pkt_dev->last_ok = 0;
 		goto unlock;
 	}
-	atomic_add(burst, &pkt_dev->skb->users);
+	refcount_add(burst, &pkt_dev->skb->users);

xmit_more:
 	ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);

@@ -3511,11 +3513,11 @@ xmit_more:
 		/* fallthru */
 	case NETDEV_TX_BUSY:
 		/* Retry it next time */
-		atomic_dec(&(pkt_dev->skb->users));
+		refcount_dec(&(pkt_dev->skb->users));
 		pkt_dev->last_ok = 0;
 	}
 	if (unlikely(burst))
-		atomic_sub(burst, &pkt_dev->skb->users);
+		WARN_ON(refcount_sub_and_test(burst, &pkt_dev->skb->users));
unlock:
 	HARD_TX_UNLOCK(odev, txq);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 467a2f4510a7..d1ba90980be1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -16,6 +16,7 @@
  *	Vitaly E. Lavrov		RTA_OK arithmetics was wrong.
  */

+#include <linux/bitops.h>
 #include <linux/errno.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -39,6 +40,7 @@
 #include <linux/if_vlan.h>
 #include <linux/pci.h>
 #include <linux/etherdevice.h>
+#include <linux/bpf.h>

 #include <linux/uaccess.h>

@@ -647,7 +649,7 @@ int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group,
 	NETLINK_CB(skb).dst_group = group;
 	if (echo)
-		atomic_inc(&skb->users);
+		refcount_inc(&skb->users);
 	netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
 	if (echo)
 		err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
@@ -899,7 +901,8 @@ static size_t rtnl_port_size(const struct net_device *dev,
 static size_t rtnl_xdp_size(void)
 {
 	size_t xdp_size = nla_total_size(0) +	/* nest IFLA_XDP */
-			  nla_total_size(1);	/* XDP_ATTACHED */
+			  nla_total_size(1) +	/* XDP_ATTACHED */
+			  nla_total_size(4);	/* XDP_PROG_ID */

 	return xdp_size;
 }
@@ -942,6 +945,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
 	       + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
 	       + rtnl_xdp_size() /* IFLA_XDP */
+	       + nla_total_size(4)  /* IFLA_EVENT */
 	       + nla_total_size(1); /* IFLA_PROTO_DOWN */

 }
@@ -1248,23 +1252,29 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }

-static u8 rtnl_xdp_attached_mode(struct net_device *dev)
+static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
+	const struct bpf_prog *generic_xdp_prog;

 	ASSERT_RTNL();

-	if (rcu_access_pointer(dev->xdp_prog))
+	*prog_id = 0;
+	generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
+	if (generic_xdp_prog) {
+		*prog_id = generic_xdp_prog->aux->id;
 		return XDP_ATTACHED_SKB;
-	if (ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp))
-		return XDP_ATTACHED_DRV;
+	}
+	if (!ops->ndo_xdp)
+		return XDP_ATTACHED_NONE;

-	return XDP_ATTACHED_NONE;
+	return __dev_xdp_attached(dev, ops->ndo_xdp, prog_id);
 }

 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 {
 	struct nlattr *xdp;
+	u32 prog_id;
 	int err;

 	xdp = nla_nest_start(skb, IFLA_XDP);
@@ -1272,10 +1282,16 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 		return -EMSGSIZE;

 	err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
-			 rtnl_xdp_attached_mode(dev));
+			 rtnl_xdp_attached_mode(dev, &prog_id));
 	if (err)
 		goto err_cancel;

+	if (prog_id) {
+		err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
+		if (err)
+			goto err_cancel;
+	}
+
 	nla_nest_end(skb, xdp);
 	return 0;

@@ -1284,9 +1300,40 @@ err_cancel:
 	return err;
 }

+static u32 rtnl_get_event(unsigned long event)
+{
+	u32 rtnl_event_type = IFLA_EVENT_NONE;
+
+	switch (event) {
+	case NETDEV_REBOOT:
+		rtnl_event_type = IFLA_EVENT_REBOOT;
+		break;
+	case NETDEV_FEAT_CHANGE:
+		rtnl_event_type = IFLA_EVENT_FEATURES;
+		break;
+	case NETDEV_BONDING_FAILOVER:
+		rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER;
+		break;
+	case NETDEV_NOTIFY_PEERS:
+		rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS;
+		break;
+	case NETDEV_RESEND_IGMP:
+		rtnl_event_type = IFLA_EVENT_IGMP_RESEND;
+		break;
+	case NETDEV_CHANGEINFODATA:
+		rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS;
+		break;
+	default:
+		break;
+	}
+
+	return rtnl_event_type;
+}
+
 static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 			    int type, u32 pid, u32 seq, u32 change,
-			    unsigned int flags, u32 ext_filter_mask)
+			    unsigned int flags, u32 ext_filter_mask,
+			    u32 event)
 {
 	struct ifinfomsg *ifm;
 	struct nlmsghdr *nlh;
@@ -1335,6 +1382,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
 		goto nla_put_failure;

+	if (event != IFLA_EVENT_NONE) {
+		if (nla_put_u32(skb, IFLA_EVENT, event))
+			goto nla_put_failure;
+	}
+
 	if (rtnl_fill_link_ifmap(skb, dev))
 		goto nla_put_failure;

@@ -1469,6 +1521,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_LINK_NETNSID]	= { .type = NLA_S32 },
 	[IFLA_PROTO_DOWN]	= { .type = NLA_U8 },
 	[IFLA_XDP]		= { .type = NLA_NESTED },
+	[IFLA_EVENT]		= { .type = NLA_U32 },
 	[IFLA_GROUP]		= { .type = NLA_U32 },
 };

@@ -1517,6 +1570,7 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
 	[IFLA_XDP_FD]		= { .type = NLA_S32 },
 	[IFLA_XDP_ATTACHED]	= { .type = NLA_U8 },
 	[IFLA_XDP_FLAGS]	= { .type = NLA_U32 },
+	[IFLA_XDP_PROG_ID]	= { .type = NLA_U32 },
 };

 static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
@@ -1629,7 +1683,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, 0,
 					       flags,
-					       ext_filter_mask);
+					       ext_filter_mask, 0);

 			if (err < 0) {
 				if (likely(skb->len))
@@ -2051,8 +2105,8 @@ static int do_setlink(const struct sk_buff *skb,
 	}

 	if (tb[IFLA_TXQLEN]) {
-		unsigned long value = nla_get_u32(tb[IFLA_TXQLEN]);
-		unsigned long orig_len = dev->tx_queue_len;
+		unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);
+		unsigned int orig_len = dev->tx_queue_len;

 		if (dev->tx_queue_len ^ value) {
 			dev->tx_queue_len = value;
@@ -2189,7 +2243,7 @@ static int do_setlink(const struct sk_buff *skb,
 		if (err < 0)
 			goto errout;

-		if (xdp[IFLA_XDP_ATTACHED]) {
+		if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) {
 			err = -EINVAL;
 			goto errout;
 		}
@@ -2200,8 +2254,7 @@ static int do_setlink(const struct sk_buff *skb,
 				err = -EINVAL;
 				goto errout;
 			}
-			if ((xdp_flags & XDP_FLAGS_SKB_MODE) &&
-			    (xdp_flags & XDP_FLAGS_DRV_MODE)) {
+			if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) {
 				err = -EINVAL;
 				goto errout;
 			}
@@ -2529,7 +2582,7 @@ replay:
 			data = attr;
 		}
 		if (ops->validate) {
-			err = ops->validate(tb, data);
+			err = ops->validate(tb, data, extack);
 			if (err < 0)
 				return err;
 		}
@@ -2548,7 +2601,8 @@ replay:
 			slave_data = slave_attr;
 		}
 		if (m_ops->slave_validate) {
-			err = m_ops->slave_validate(tb, slave_data);
+			err = m_ops->slave_validate(tb, slave_data,
+						    extack);
 			if (err < 0)
 				return err;
 		}
@@ -2567,7 +2621,7 @@ replay:
 			    !ops->changelink)
 				return -EOPNOTSUPP;

-			err = ops->changelink(dev, tb, data);
+			err = ops->changelink(dev, tb, data, extack);
 			if (err < 0)
 				return err;
 			status |= DO_SETLINK_NOTIFY;
@@ -2578,7 +2632,8 @@ replay:
 				return -EOPNOTSUPP;

 			err = m_ops->slave_changelink(master_dev, dev,
-						      tb, slave_data);
+						      tb, slave_data,
+						      extack);
 			if (err < 0)
 				return err;
 			status |= DO_SETLINK_NOTIFY;
@@ -2652,7 +2707,8 @@ replay:
 		dev->ifindex = ifm->ifi_index;

 		if (ops->newlink) {
-			err = ops->newlink(link_net ? : net, dev, tb, data);
+			err = ops->newlink(link_net ? : net, dev, tb, data,
+					   extack);
 			/* Drivers should call free_netdev() in ->destructor
 			 * and unregister it on failure after registration
 			 * so that device could be finally freed in rtnl_unlock.
@@ -2739,7 +2795,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
 		return -ENOBUFS;

 	err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid,
-			       nlh->nlmsg_seq, 0, 0, ext_filter_mask);
+			       nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size */
 		WARN_ON(err == -EMSGSIZE);
@@ -2811,7 +2867,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 }

 struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
-				       unsigned int change, gfp_t flags)
+				       unsigned int change,
+				       u32 event, gfp_t flags)
 {
 	struct net *net = dev_net(dev);
 	struct sk_buff *skb;
@@ -2822,7 +2879,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
 	if (skb == NULL)
 		goto errout;

-	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0);
+	err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in if_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -2843,18 +2900,25 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
 	rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
 }

-void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
-		  gfp_t flags)
+static void rtmsg_ifinfo_event(int type, struct net_device *dev,
+			       unsigned int change, u32 event,
+			       gfp_t flags)
 {
 	struct sk_buff *skb;

 	if (dev->reg_state != NETREG_REGISTERED)
 		return;

-	skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
+	skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags);
 	if (skb)
 		rtmsg_ifinfo_send(skb, dev, flags);
 }
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
+		  gfp_t flags)
+{
+	rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags);
+}
 EXPORT_SYMBOL(rtmsg_ifinfo);

 static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
@@ -4159,6 +4223,18 @@ static void rtnetlink_rcv(struct sk_buff *skb)
 	rtnl_unlock();
 }

+static int rtnetlink_bind(struct net *net, int group)
+{
+	switch (group) {
+	case RTNLGRP_IPV4_MROUTE_R:
+	case RTNLGRP_IPV6_MROUTE_R:
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			return -EPERM;
+		break;
+	}
+	return 0;
+}
+
 static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
@@ -4171,7 +4247,8 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
 	case NETDEV_NOTIFY_PEERS:
 	case NETDEV_RESEND_IGMP:
 	case NETDEV_CHANGEINFODATA:
-		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
+		rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
+				   GFP_KERNEL);
 		break;
 	default:
 		break;
@@ -4192,6 +4269,7 @@ static int __net_init rtnetlink_net_init(struct net *net)
 		.input		= rtnetlink_rcv,
 		.cb_mutex	= &rtnl_mutex,
 		.flags		= NL_CFG_F_NONROOT_RECV,
+		.bind		= rtnetlink_bind,
 	};

 	sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index ae35cce3a40d..7232274de334 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -51,7 +51,8 @@ static u32 seq_scale(u32 seq)
 #endif

 #if IS_ENABLED(CONFIG_IPV6)
-u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
+u32 secure_tcpv6_ts_off(const struct net *net,
+			const __be32 *saddr, const __be32 *daddr)
 {
 	const struct {
 		struct in6_addr saddr;
@@ -61,7 +62,7 @@ u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr)
 		.daddr = *(struct in6_addr *)daddr,
 	};

-	if (sysctl_tcp_timestamps != 1)
+	if (net->ipv4.sysctl_tcp_timestamps != 1)
 		return 0;

 	ts_secret_init();
@@ -113,9 +114,9 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
 #endif

 #ifdef CONFIG_INET
-u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr)
+u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
 {
-	if (sysctl_tcp_timestamps != 1)
+	if (net->ipv4.sysctl_tcp_timestamps != 1)
 		return 0;

 	ts_secret_init();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b1be7c01efe2..8b11341ed69a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -176,7 +176,7 @@ struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node)
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	skb->head = NULL;
 	skb->truesize = sizeof(struct sk_buff);
-	atomic_set(&skb->users, 1);
+	refcount_set(&skb->users, 1);

 	skb->mac_header = (typeof(skb->mac_header))~0U;
out:
@@ -247,7 +247,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	/* Account for allocated memory : skb + skb->head */
 	skb->truesize = SKB_TRUESIZE(size);
 	skb->pfmemalloc = pfmemalloc;
-	atomic_set(&skb->users, 1);
+	refcount_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;
 	skb_reset_tail_pointer(skb);
@@ -268,7 +268,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 		kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
 		skb->fclone = SKB_FCLONE_ORIG;
-		atomic_set(&fclones->fclone_ref, 1);
+		refcount_set(&fclones->fclone_ref, 1);

 		fclones->skb2.fclone = SKB_FCLONE_CLONE;
 	}
@@ -314,7 +314,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)

 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	skb->truesize = SKB_TRUESIZE(size);
-	atomic_set(&skb->users, 1);
+	refcount_set(&skb->users, 1);
 	skb->head = data;
 	skb->data = data;
 	skb_reset_tail_pointer(skb);
@@ -629,7 +629,7 @@ static void kfree_skbmem(struct sk_buff *skb)
 		 * This test would have no chance to be true for the clone,
 		 * while here, branch prediction will be good.
 		 */
-		if (atomic_read(&fclones->fclone_ref) == 1)
+		if (refcount_read(&fclones->fclone_ref) == 1)
 			goto fastpath;
 		break;

@@ -637,18 +637,16 @@ static void kfree_skbmem(struct sk_buff *skb)
 		fclones = container_of(skb, struct sk_buff_fclones, skb2);
 		break;
 	}
-	if (!atomic_dec_and_test(&fclones->fclone_ref))
+	if (!refcount_dec_and_test(&fclones->fclone_ref))
 		return;
fastpath:
 	kmem_cache_free(skbuff_fclone_cache, fclones);
 }

-static void skb_release_head_state(struct sk_buff *skb)
+void skb_release_head_state(struct sk_buff *skb)
 {
 	skb_dst_drop(skb);
-#ifdef CONFIG_XFRM
-	secpath_put(skb->sp);
-#endif
+	secpath_reset(skb);
 	if (skb->destructor) {
 		WARN_ON(in_irq());
 		skb->destructor(skb);
@@ -694,12 +692,9 @@ EXPORT_SYMBOL(__kfree_skb);
 */
 void kfree_skb(struct sk_buff *skb)
 {
-	if (unlikely(!skb))
-		return;
-	if (likely(atomic_read(&skb->users) == 1))
-		smp_rmb();
-	else if (likely(!atomic_dec_and_test(&skb->users)))
+	if (!skb_unref(skb))
 		return;
+
 	trace_kfree_skb(skb, __builtin_return_address(0));
 	__kfree_skb(skb);
 }
@@ -746,17 +741,32 @@ EXPORT_SYMBOL(skb_tx_error);
 */
 void consume_skb(struct sk_buff *skb)
 {
-	if (unlikely(!skb))
-		return;
-	if (likely(atomic_read(&skb->users) == 1))
-		smp_rmb();
-	else if (likely(!atomic_dec_and_test(&skb->users)))
+	if (!skb_unref(skb))
 		return;
+
 	trace_consume_skb(skb);
 	__kfree_skb(skb);
 }
 EXPORT_SYMBOL(consume_skb);

+/**
+ *	consume_stateless_skb - free an skbuff, assuming it is stateless
+ *	@skb: buffer to free
+ *
+ *	Works like consume_skb(), but this variant assumes that all the head
+ *	states have been already dropped.
+ */
+void consume_stateless_skb(struct sk_buff *skb)
+{
+	if (!skb_unref(skb))
+		return;
+
+	trace_consume_skb(skb);
+	if (likely(skb->head))
+		skb_release_data(skb);
+	kfree_skbmem(skb);
+}
+
 void __kfree_skb_flush(void)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -807,10 +817,9 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
 		return;
 	}

-	if (likely(atomic_read(&skb->users) == 1))
-		smp_rmb();
-	else if (likely(!atomic_dec_and_test(&skb->users)))
+	if (!skb_unref(skb))
 		return;
+
 	/* if reaching here SKB is ready to free */
 	trace_consume_skb(skb);

@@ -906,7 +915,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
 	C(head_frag);
 	C(data);
 	C(truesize);
-	atomic_set(&n->users, 1);
+	refcount_set(&n->users, 1);

 	atomic_inc(&(skb_shinfo(skb)->dataref));
 	skb->cloned = 1;
@@ -1018,9 +1027,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 		return NULL;

 	if (skb->fclone == SKB_FCLONE_ORIG &&
-	    atomic_read(&fclones->fclone_ref) == 1) {
+	    refcount_read(&fclones->fclone_ref) == 1) {
 		n = &fclones->skb2;
-		atomic_set(&fclones->fclone_ref, 2);
+		refcount_set(&fclones->fclone_ref, 2);
 	} else {
 		if (skb_pfmemalloc(skb))
 			gfp_mask |= __GFP_MEMALLOC;
@@ -1412,7 +1421,7 @@ EXPORT_SYMBOL(skb_pad);
 *	returned.
 */

-unsigned char *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
+void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
 {
 	if (tail != skb) {
 		skb->data_len += len;
@@ -1431,9 +1440,9 @@ EXPORT_SYMBOL_GPL(pskb_put);
 *	exceed the total buffer size the kernel will panic. A pointer to the
 *	first byte of the extra data is returned.
 */
-unsigned char *skb_put(struct sk_buff *skb, unsigned int len)
+void *skb_put(struct sk_buff *skb, unsigned int len)
 {
-	unsigned char *tmp = skb_tail_pointer(skb);
+	void *tmp = skb_tail_pointer(skb);
 	SKB_LINEAR_ASSERT(skb);
 	skb->tail += len;
 	skb->len  += len;
@@ -1452,7 +1461,7 @@ EXPORT_SYMBOL(skb_put);
 *	start. If this would exceed the total buffer headroom the kernel will
 *	panic. A pointer to the first byte of the extra data is returned.
 */
-unsigned char *skb_push(struct sk_buff *skb, unsigned int len)
+void *skb_push(struct sk_buff *skb, unsigned int len)
 {
 	skb->data -= len;
 	skb->len  += len;
@@ -1472,7 +1481,7 @@ EXPORT_SYMBOL(skb_push);
 *	is returned. Once the data has been pulled future pushes will overwrite
 *	the old data.
 */
-unsigned char *skb_pull(struct sk_buff *skb, unsigned int len)
+void *skb_pull(struct sk_buff *skb, unsigned int len)
 {
 	return skb_pull_inline(skb, len);
 }
@@ -1607,7 +1616,7 @@ EXPORT_SYMBOL(___pskb_trim);
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
-unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
+void *__pskb_pull_tail(struct sk_buff *skb, int delta)
 {
 	/* If skb has not enough free space at tail, get new one
 	 * plus 128 bytes for future expansions. If we have enough
@@ -2243,6 +2252,32 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
 }
 EXPORT_SYMBOL(skb_copy_and_csum_bits);

+static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
+{
+	net_warn_ratelimited(
+		"%s: attempt to compute crc32c without libcrc32c.ko\n",
+		__func__);
+	return 0;
+}
+
+static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
+				       int offset, int len)
+{
+	net_warn_ratelimited(
+		"%s: attempt to compute crc32c without libcrc32c.ko\n",
+		__func__);
+	return 0;
+}
+
+static const struct skb_checksum_ops default_crc32c_ops = {
+	.update  = warn_crc32c_csum_update,
+	.combine = warn_crc32c_csum_combine,
+};
+
+const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
+	&default_crc32c_ops;
+EXPORT_SYMBOL(crc32c_csum_stub);
+
 /**
 *	skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
 *	@from: source buffer
@@ -2620,7 +2655,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 {
 	int pos = skb_headlen(skb);

-	skb_shinfo(skb1)->tx_flags = skb_shinfo(skb)->tx_flags & SKBTX_SHARED_FRAG;
+	skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
+				      SKBTX_SHARED_FRAG;
 	if (len < pos)	/* Split line is inside header. */
 		skb_split_inside_header(skb, skb1, len, pos);
 	else		/* Second chunk has no header, nothing to copy. */
@@ -2988,7 +3024,7 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 		get_page(pfrag->page);

 		skb->truesize += copy;
-		atomic_add(copy, &sk->sk_wmem_alloc);
+		refcount_add(copy, &sk->sk_wmem_alloc);
 		skb->len += copy;
 		skb->data_len += copy;
 		offset += copy;
@@ -3029,7 +3065,7 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags);
 *	that the checksum difference is zero (e.g., a valid IP header)
 *	or you are setting ip_summed to CHECKSUM_NONE.
 */
-unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
+void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
 {
 	unsigned char *data = skb->data;

@@ -3235,8 +3271,8 @@ normal:

 		skb_copy_from_linear_data_offset(head_skb, offset,
 						 skb_put(nskb, hsize), hsize);

-		skb_shinfo(nskb)->tx_flags = skb_shinfo(head_skb)->tx_flags &
-			SKBTX_SHARED_FRAG;
+		skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
+					      SKBTX_SHARED_FRAG;

 		while (pos < offset + len) {
 			if (i >= nfrags) {
@@ -3482,24 +3518,18 @@ void __init skb_init(void)
 						NULL);
 }

-/**
- *	skb_to_sgvec - Fill a scatter-gather list from a socket buffer
- *	@skb: Socket buffer containing the buffers to be mapped
- *	@sg: The scatter-gather list to map into
- *	@offset: The offset into the buffer's contents to start mapping
- *	@len: Length of buffer space to be mapped
- *
- *	Fill the specified scatter-gather list with mappings/pointers into a
- *	region of the buffer space attached to a socket buffer.
- */
 static int
-__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
+	       unsigned int recursion_level)
 {
 	int start = skb_headlen(skb);
 	int i, copy = start - offset;
 	struct sk_buff *frag_iter;
 	int elt = 0;

+	if (unlikely(recursion_level >= 24))
+		return -EMSGSIZE;
+
 	if (copy > 0) {
 		if (copy > len)
 			copy = len;
@@ -3518,6 +3548,8 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
 		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
 		if ((copy = end - offset) > 0) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+				return -EMSGSIZE;

 			if (copy > len)
 				copy = len;
@@ -3532,16 +3564,22 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
 	}

 	skb_walk_frags(skb, frag_iter) {
-		int end;
+		int end, ret;

 		WARN_ON(start > offset + len);

 		end = start + frag_iter->len;
 		if ((copy = end - offset) > 0) {
+			if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+				return -EMSGSIZE;
+
 			if (copy > len)
 				copy = len;
-			elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
-					      copy);
+			ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
+					     copy, recursion_level + 1);
+			if (unlikely(ret < 0))
+				return ret;
+			elt += ret;
 			if ((len -= copy) == 0)
 				return elt;
 			offset += copy;
@@ -3552,6 +3590,31 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
 	return elt;
 }

+/**
+ *	skb_to_sgvec - Fill a scatter-gather list from a socket buffer
+ *	@skb: Socket buffer containing the buffers to be mapped
+ *	@sg: The scatter-gather list to map into
+ *	@offset: The offset into the buffer's contents to start mapping
+ *	@len: Length of buffer space to be mapped
+ *
+ *	Fill the specified scatter-gather list with mappings/pointers into a
+ *	region of the buffer space attached to a socket buffer. Returns either
+ *	the number of scatterlist items used, or -EMSGSIZE if the contents
+ *	could not fit.
+ */
+int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+	int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
+
+	if (nsg <= 0)
+		return nsg;
+
+	sg_mark_end(&sg[nsg - 1]);
+
+	return nsg;
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec);
+
 /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
 * sglist without mark the sg which contain last skb data as the end.
 * So the caller can mannipulate sg list as will when padding new data after
@@ -3574,19 +3637,11 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
 int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
 			int offset, int len)
 {
-	return __skb_to_sgvec(skb, sg, offset, len);
+	return __skb_to_sgvec(skb, sg, offset, len, 0);
 }
 EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);

-int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
-{
-	int nsg = __skb_to_sgvec(skb, sg, offset, len);
-
-	sg_mark_end(&sg[nsg - 1]);
-
-	return nsg;
-}
-EXPORT_SYMBOL_GPL(skb_to_sgvec);

 /**
 *	skb_cow_data - Check that a socket buffer's data buffers are writable
@@ -3789,7 +3844,7 @@ struct sk_buff *skb_clone_sk(struct sk_buff *skb)
 	struct sock *sk = skb->sk;
 	struct sk_buff *clone;

-	if (!sk || !atomic_inc_not_zero(&sk->sk_refcnt))
+	if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
 		return NULL;

 	clone = skb_clone(skb, GFP_ATOMIC);
@@ -3860,7 +3915,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
 	/* Take a reference to prevent skb_orphan() from freeing the socket,
 	 * but only if the socket refcount is not zero.
 	 */
-	if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+	if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
 		*skb_hwtstamps(skb) = *hwtstamps;
 		__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
 		sock_put(sk);
@@ -3878,6 +3933,10 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 	if (!sk)
 		return;

+	if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
+	    skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
+		return;
+
 	tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
 	if (!skb_may_tx_timestamp(sk, tsonly))
 		return;
@@ -3899,7 +3958,8 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
 		return;

 	if (tsonly) {
-		skb_shinfo(skb)->tx_flags = skb_shinfo(orig_skb)->tx_flags;
+		skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
+					     SKBTX_ANY_TSTAMP;
 		skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
 	}

@@ -3937,7 +3997,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
 	/* Take a reference to prevent skb_orphan() from freeing the socket,
 	 * but only if the socket refcount is not zero.
 	 */
-	if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
+	if (likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
 		err = sock_queue_err_skb(sk, skb);
 		sock_put(sk);
 	}
diff --git a/net/core/sock.c b/net/core/sock.c
index 0c3fc16223f9..ac2a404c73eb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1038,6 +1038,10 @@ set_rcvbuf:
 #endif

 	case SO_MAX_PACING_RATE:
+		if (val != ~0U)
+			cmpxchg(&sk->sk_pacing_status,
+				SK_PACING_NONE,
+				SK_PACING_NEEDED);
 		sk->sk_max_pacing_rate = val;
 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
 					 sk->sk_max_pacing_rate);
@@ -1074,6 +1078,18 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
 	}
 }

+static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+{
+	struct user_namespace *user_ns = current_user_ns();
+	int i;
+
+	for (i = 0; i < src->ngroups; i++)
+		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+			return -EFAULT;
+
+	return 0;
+}
+
 int sock_getsockopt(struct socket *sock, int level, int optname,
 		    char __user *optval, int __user *optlen)
 {
@@ -1227,6 +1243,27 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		goto lenout;
 	}

+	case SO_PEERGROUPS:
+	{
+		int ret, n;
+
+		if (!sk->sk_peer_cred)
+			return -ENODATA;
+
+		n = sk->sk_peer_cred->group_info->ngroups;
+		if (len < n * sizeof(gid_t)) {
+			len = n * sizeof(gid_t);
+			return put_user(len, optlen) ? -EFAULT : -ERANGE;
+		}
+		len = n * sizeof(gid_t);
+
+		ret = groups_to_user((gid_t __user *)optval,
+				     sk->sk_peer_cred->group_info);
+		if (ret)
+			return ret;
+		goto lenout;
+	}
+
 	case SO_PEERNAME:
 	{
 		char address[128];
@@ -1491,7 +1528,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		if (likely(sk->sk_net_refcnt))
 			get_net(net);
 		sock_net_set(sk, net);
-		atomic_set(&sk->sk_wmem_alloc, 1);
+		refcount_set(&sk->sk_wmem_alloc, 1);

 		mem_cgroup_sk_alloc(sk);
 		cgroup_sk_alloc(&sk->sk_cgrp_data);
@@ -1515,7 +1552,7 @@ static void __sk_destruct(struct rcu_head *head)
 		sk->sk_destruct(sk);

 	filter = rcu_dereference_check(sk->sk_filter,
-				       atomic_read(&sk->sk_wmem_alloc) == 0);
+				       refcount_read(&sk->sk_wmem_alloc) == 0);
 	if (filter) {
 		sk_filter_uncharge(sk, filter);
 		RCU_INIT_POINTER(sk->sk_filter, NULL);
@@ -1565,7 +1602,7 @@ void sk_free(struct sock *sk)
 	 * some packets are still in some tx queue.
 	 * If not null, sock_wfree() will call __sk_free(sk) later
 	 */
-	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
+	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
 		__sk_free(sk);
 }
 EXPORT_SYMBOL(sk_free);
@@ -1622,7 +1659,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		/*
 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
 		 */
-		atomic_set(&newsk->sk_wmem_alloc, 1);
+		refcount_set(&newsk->sk_wmem_alloc, 1);
 		atomic_set(&newsk->sk_omem_alloc, 0);
 		sk_init_common(newsk);
@@ -1671,7 +1708,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 		 * (Documentation/RCU/rculist_nulls.txt for details)
 		 */
 		smp_wmb();
-		atomic_set(&newsk->sk_refcnt, 2);
+		refcount_set(&newsk->sk_refcnt, 2);

 		/*
 		 * Increment the counter in the same struct proto as the master
@@ -1750,7 +1787,7 @@ void sock_wfree(struct sk_buff *skb)
 		 * Keep a reference on sk_wmem_alloc, this will be released
 		 * after sk_write_space() call
 		 */
-		atomic_sub(len - 1, &sk->sk_wmem_alloc);
+		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
 		sk->sk_write_space(sk);
 		len = 1;
 	}
@@ -1758,7 +1795,7 @@ void sock_wfree(struct sk_buff *skb)
 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
 	 * could not do because of in-flight packets
 	 */
-	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
+	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
 		__sk_free(sk);
 }
 EXPORT_SYMBOL(sock_wfree);
@@ -1770,7 +1807,7 @@ void __sock_wfree(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;

-	if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
+	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
 		__sk_free(sk);
 }

@@ -1792,7 +1829,7 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
 	 * is enough to guarantee sk_free() wont free this sock until
 	 * all in-flight packets are completed
 	 */
-	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
+	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
 }
 EXPORT_SYMBOL(skb_set_owner_w);

@@ -1814,8 +1851,8 @@ void skb_orphan_partial(struct sk_buff *skb)
 	) {
 		struct sock *sk = skb->sk;

-		if (atomic_inc_not_zero(&sk->sk_refcnt)) {
-			atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
 			skb->destructor = sock_efree;
 		}
 	} else {
@@ -1875,7 +1912,7 @@ EXPORT_SYMBOL(sock_i_ino);
 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
 			     gfp_t priority)
 {
-	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
 		struct sk_buff *skb = alloc_skb(size, priority);
 		if (skb) {
 			skb_set_owner_w(skb, sk);
@@ -1950,7 +1987,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
 			break;
 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
-		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
 			break;
 		if (sk->sk_shutdown & SEND_SHUTDOWN)
 			break;
@@ -2072,6 +2109,26 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
 }
 EXPORT_SYMBOL(sock_cmsg_send);

+static void sk_enter_memory_pressure(struct sock *sk)
+{
+	if (!sk->sk_prot->enter_memory_pressure)
+		return;
+
+	sk->sk_prot->enter_memory_pressure(sk);
+}
+
+static void sk_leave_memory_pressure(struct sock *sk)
+{
+	if (sk->sk_prot->leave_memory_pressure) {
+		sk->sk_prot->leave_memory_pressure(sk);
+	} else {
+		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
+
+		if (memory_pressure && *memory_pressure)
+			*memory_pressure = 0;
+	}
+}
+
 /* On 32bit arches, an skb frag is limited to 2^15 */
 #define SKB_FRAG_PAGE_ORDER	get_order(32768)

@@ -2253,7 +2310,7 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 		if (sk->sk_type == SOCK_STREAM) {
 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
 				return 1;
-		} else if (atomic_read(&sk->sk_wmem_alloc) <
+		} else if (refcount_read(&sk->sk_wmem_alloc) <
 			   prot->sysctl_wmem[0])
 				return 1;
 	}
@@ -2520,7 +2577,7 @@ static void sock_def_write_space(struct sock *sk)
 	/* Do not wake up a writer until he can make "significant"
 	 * progress.  --DaveM
 	 */
-	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
 		wq = rcu_dereference(sk->sk_wq);
 		if (skwq_has_sleeper(wq))
 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
@@ -2630,7 +2687,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	 * (Documentation/RCU/rculist_nulls.txt for details)
 	 */
 	smp_wmb();
-	atomic_set(&sk->sk_refcnt, 1);
+	refcount_set(&sk->sk_refcnt, 1);
 	atomic_set(&sk->sk_drops, 0);
 }
 EXPORT_SYMBOL(sock_init_data);
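
The recurring change across neighbour.c, netpoll.c, pktgen.c, skbuff.c, and sock.c above is the conversion of reference counters from atomic_t to refcount_t. refcount_t keeps the same get/put shape but saturates and warns on overflow and underflow instead of silently wrapping. A minimal sketch of the pattern, using a hypothetical demo_obj type that is not part of this patch:

#include <linux/refcount.h>
#include <linux/slab.h>

/* Hypothetical object, used only to illustrate the refcount_t pattern. */
struct demo_obj {
	refcount_t refcnt;
};

static struct demo_obj *demo_obj_alloc(gfp_t gfp)
{
	struct demo_obj *obj = kzalloc(sizeof(*obj), gfp);

	if (obj)
		refcount_set(&obj->refcnt, 1);	/* initial reference */
	return obj;
}

/* Take a reference only if the object is still live (count != 0),
 * mirroring the refcount_inc_not_zero() calls in neigh_lookup() and
 * skb_clone_sk() above.
 */
static struct demo_obj *demo_obj_get(struct demo_obj *obj)
{
	if (obj && !refcount_inc_not_zero(&obj->refcnt))
		return NULL;
	return obj;
}

static void demo_obj_put(struct demo_obj *obj)
{
	/* Saturates and warns on underflow instead of wrapping. */
	if (obj && refcount_dec_and_test(&obj->refcnt))
		kfree(obj);
}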
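
A second behavioral change worth noting: skb_to_sgvec() above no longer returns only an entry count; it now returns -EMSGSIZE when the skb contents do not fit the scatterlist (or when frag-list recursion exceeds 24 levels), so callers must check for a negative return before using the list. A hedged sketch of such a caller, with demo_map_skb being a hypothetical name and not a function from this patch:

#include <linux/scatterlist.h>
#include <linux/skbuff.h>

/* Illustrative caller only: map an skb into a caller-provided
 * scatterlist and propagate the new error return.
 */
static int demo_map_skb(struct sk_buff *skb, struct scatterlist *sg,
			int nents)
{
	int nsg;

	sg_init_table(sg, nents);
	nsg = skb_to_sgvec(skb, sg, 0, skb->len);
	if (nsg < 0)		/* e.g. -EMSGSIZE: sg table too small */
		return nsg;

	/* ... hand the nsg-entry scatterlist to DMA/crypto code ... */
	return nsg;
}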