Diffstat (limited to 'net/core/dev.c')
-rw-r--r--	net/core/dev.c	363
1 file changed, 305 insertions, 58 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 59d4394d2ce8..c0e260870c0a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1084,9 +1084,9 @@ void netdev_state_change(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_state_change);
 
-void netdev_bonding_change(struct net_device *dev, unsigned long event)
+int netdev_bonding_change(struct net_device *dev, unsigned long event)
 {
-	call_netdevice_notifiers(event, dev);
+	return call_netdevice_notifiers(event, dev);
 }
 EXPORT_SYMBOL(netdev_bonding_change);
@@ -1931,7 +1931,7 @@ out_kfree_skb:
 	return rc;
 }
 
-static u32 skb_tx_hashrnd;
+static u32 hashrnd __read_mostly;
 
 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 {
@@ -1949,7 +1949,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 	else
 		hash = skb->protocol;
 
-	hash = jhash_1word(hash, skb_tx_hashrnd);
+	hash = jhash_1word(hash, hashrnd);
 
 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
 }
@@ -1959,10 +1959,9 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
 {
 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
 		if (net_ratelimit()) {
-			WARN(1, "%s selects TX queue %d, but "
+			netdev_warn(dev, "selects TX queue %d, but "
 			     "real number of TX queues is %d\n",
-			     dev->name, queue_index,
-			     dev->real_num_tx_queues);
+			     queue_index, dev->real_num_tx_queues);
 		}
 		return 0;
 	}
@@ -2175,6 +2174,178 @@ int weight_p __read_mostly = 64; /* old backlog weight */
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
+#ifdef CONFIG_SMP
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
+{
+	struct ipv6hdr *ip6;
+	struct iphdr *ip;
+	struct netdev_rx_queue *rxqueue;
+	struct rps_map *map;
+	int cpu = -1;
+	u8 ip_proto;
+	u32 addr1, addr2, ports, ihl;
+
+	rcu_read_lock();
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+		if (unlikely(index >= dev->num_rx_queues)) {
+			if (net_ratelimit()) {
+				netdev_warn(dev, "received packet on queue "
+				    "%u, but number of RX queues is %u\n",
+				    index, dev->num_rx_queues);
+			}
+			goto done;
+		}
+		rxqueue = dev->_rx + index;
+	} else
+		rxqueue = dev->_rx;
+
+	if (!rxqueue->rps_map)
+		goto done;
+
+	if (skb->rxhash)
+		goto got_hash; /* Skip hash computation on packet header */
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		if (!pskb_may_pull(skb, sizeof(*ip)))
+			goto done;
+
+		ip = (struct iphdr *) skb->data;
+		ip_proto = ip->protocol;
+		addr1 = ip->saddr;
+		addr2 = ip->daddr;
+		ihl = ip->ihl;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		if (!pskb_may_pull(skb, sizeof(*ip6)))
+			goto done;
+
+		ip6 = (struct ipv6hdr *) skb->data;
+		ip_proto = ip6->nexthdr;
+		addr1 = ip6->saddr.s6_addr32[3];
+		addr2 = ip6->daddr.s6_addr32[3];
+		ihl = (40 >> 2);
+		break;
+	default:
+		goto done;
+	}
+	ports = 0;
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+		if (pskb_may_pull(skb, (ihl * 4) + 4))
+			ports = *((u32 *) (skb->data + (ihl * 4)));
+		break;
+
+	default:
+		break;
+	}
+
+	skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
+	if (!skb->rxhash)
+		skb->rxhash = 1;
+
+got_hash:
+	map = rcu_dereference(rxqueue->rps_map);
+	if (map) {
+		u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
+
+		if (cpu_online(tcpu)) {
+			cpu = tcpu;
+			goto done;
+		}
+	}
+
+done:
+	rcu_read_unlock();
+	return cpu;
+}
+
+/*
+ * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled
+ * to be sent to kick remote softirq processing.  There are two masks since
+ * the sending of IPIs must be done with interrupts enabled.  The select field
+ * indicates the current mask that enqueue_backlog uses to schedule IPIs.
+ * select is flipped before net_rps_action is called while still under lock,
+ * net_rps_action then uses the non-selected mask to send the IPIs and clears
+ * it without conflicting with enqueue_backlog operation.
+ */
+struct rps_remote_softirq_cpus {
+	cpumask_t mask[2];
+	int select;
+};
+static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus);
+
+/* Called from hardirq (IPI) context */
+static void trigger_softirq(void *data)
+{
+	struct softnet_data *queue = data;
+	__napi_schedule(&queue->backlog);
+	__get_cpu_var(netdev_rx_stat).received_rps++;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * enqueue_to_backlog is called to queue an skb to a per CPU backlog
+ * queue (may be a remote CPU queue).
+ */
+static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
+{
+	struct softnet_data *queue;
+	unsigned long flags;
+
+	queue = &per_cpu(softnet_data, cpu);
+
+	local_irq_save(flags);
+	__get_cpu_var(netdev_rx_stat).total++;
+
+	spin_lock(&queue->input_pkt_queue.lock);
+	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
+		if (queue->input_pkt_queue.qlen) {
+enqueue:
+			__skb_queue_tail(&queue->input_pkt_queue, skb);
+			spin_unlock_irqrestore(&queue->input_pkt_queue.lock,
+			    flags);
+			return NET_RX_SUCCESS;
+		}
+
+		/* Schedule NAPI for backlog device */
+		if (napi_schedule_prep(&queue->backlog)) {
+#ifdef CONFIG_SMP
+			if (cpu != smp_processor_id()) {
+				struct rps_remote_softirq_cpus *rcpus =
+				    &__get_cpu_var(rps_remote_softirq_cpus);
+
+				cpu_set(cpu, rcpus->mask[rcpus->select]);
+				__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+			} else
+				__napi_schedule(&queue->backlog);
+#else
+			__napi_schedule(&queue->backlog);
+#endif
+		}
+		goto enqueue;
+	}
+
+	spin_unlock(&queue->input_pkt_queue.lock);
+
+	__get_cpu_var(netdev_rx_stat).dropped++;
+	local_irq_restore(flags);
+
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
 
 /**
  *	netif_rx - post buffer to the network code
@@ -2193,8 +2364,7 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 int netif_rx(struct sk_buff *skb)
 {
-	struct softnet_data *queue;
-	unsigned long flags;
+	int cpu;
 
 	/* if netpoll wants it, pretend we never saw it */
 	if (netpoll_rx(skb))
@@ -2203,31 +2373,15 @@ int netif_rx(struct sk_buff *skb)
 	if (!skb->tstamp.tv64)
 		net_timestamp(skb);
 
-	/*
-	 * The code is rearranged so that the path is the most
-	 * short when CPU is congested, but is still operating.
-	 */
-	local_irq_save(flags);
-	queue = &__get_cpu_var(softnet_data);
-
-	__get_cpu_var(netdev_rx_stat).total++;
-	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
-		if (queue->input_pkt_queue.qlen) {
-enqueue:
-			__skb_queue_tail(&queue->input_pkt_queue, skb);
-			local_irq_restore(flags);
-			return NET_RX_SUCCESS;
-		}
-
-		napi_schedule(&queue->backlog);
-		goto enqueue;
-	}
-
-	__get_cpu_var(netdev_rx_stat).dropped++;
-	local_irq_restore(flags);
+#ifdef CONFIG_SMP
+	cpu = get_rps_cpu(skb->dev, skb);
+	if (cpu < 0)
+		cpu = smp_processor_id();
+#else
+	cpu = smp_processor_id();
+#endif
 
-	kfree_skb(skb);
-	return NET_RX_DROP;
+	return enqueue_to_backlog(skb, cpu);
 }
 EXPORT_SYMBOL(netif_rx);
@@ -2464,22 +2618,7 @@ void netif_nit_deliver(struct sk_buff *skb)
 	rcu_read_unlock();
 }
 
-/**
- *	netif_receive_skb - process receive buffer from network
- *	@skb: buffer to process
- *
- *	netif_receive_skb() is the main receive data processing function.
- *	It always succeeds. The buffer may be dropped during processing
- *	for congestion control or by the protocol layers.
- *
- *	This function may only be called from softirq context and interrupts
- *	should be enabled.
- *
- *	Return values (usually ignored):
- *	NET_RX_SUCCESS: no congestion
- *	NET_RX_DROP: packet was dropped
- */
-int netif_receive_skb(struct sk_buff *skb)
+int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
 	struct net_device *orig_dev;
@@ -2590,6 +2729,37 @@ out:
 	rcu_read_unlock();
 	return ret;
 }
+
+/**
+ *	netif_receive_skb - process receive buffer from network
+ *	@skb: buffer to process
+ *
+ *	netif_receive_skb() is the main receive data processing function.
+ *	It always succeeds. The buffer may be dropped during processing
+ *	for congestion control or by the protocol layers.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ *
+ *	Return values (usually ignored):
+ *	NET_RX_SUCCESS: no congestion
+ *	NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb(struct sk_buff *skb)
+{
+#ifdef CONFIG_SMP
+	int cpu;
+
+	cpu = get_rps_cpu(skb->dev, skb);
+
+	if (cpu < 0)
+		return __netif_receive_skb(skb);
+	else
+		return enqueue_to_backlog(skb, cpu);
+#else
+	return __netif_receive_skb(skb);
+#endif
+}
 EXPORT_SYMBOL(netif_receive_skb);
 
 /* Network device is going away, flush any packets still pending */
@@ -2916,16 +3086,16 @@ static int process_backlog(struct napi_struct *napi, int quota)
 	do {
 		struct sk_buff *skb;
 
-		local_irq_disable();
+		spin_lock_irq(&queue->input_pkt_queue.lock);
 		skb = __skb_dequeue(&queue->input_pkt_queue);
 		if (!skb) {
 			__napi_complete(napi);
-			local_irq_enable();
+			spin_unlock_irq(&queue->input_pkt_queue.lock);
 			break;
 		}
-		local_irq_enable();
+		spin_unlock_irq(&queue->input_pkt_queue.lock);
 
-		netif_receive_skb(skb);
+		__netif_receive_skb(skb);
 	} while (++work < quota && jiffies == start_time);
 
 	return work;
@@ -3014,6 +3184,24 @@ void netif_napi_del(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(netif_napi_del);
 
+#ifdef CONFIG_SMP
+/*
+ * net_rps_action sends any pending IPI's for rps.  This is only called from
+ * softirq and interrupts must be enabled.
+ */
+static void net_rps_action(cpumask_t *mask)
+{
+	int cpu;
+
+	/* Send pending IPI's to kick RPS processing on remote cpus. */
+	for_each_cpu_mask_nr(cpu, *mask) {
+		struct softnet_data *queue = &per_cpu(softnet_data, cpu);
+		if (cpu_online(cpu))
+			__smp_call_function_single(cpu, &queue->csd, 0);
+	}
+	cpus_clear(*mask);
+}
+#endif
 
 static void net_rx_action(struct softirq_action *h)
 {
@@ -3021,6 +3209,10 @@ static void net_rx_action(struct softirq_action *h)
 	unsigned long time_limit = jiffies + 2;
 	int budget = netdev_budget;
 	void *have;
+#ifdef CONFIG_SMP
+	int select;
+	struct rps_remote_softirq_cpus *rcpus;
+#endif
 
 	local_irq_disable();
@@ -3083,8 +3275,18 @@ static void net_rx_action(struct softirq_action *h)
 		netpoll_poll_unlock(have);
 	}
 out:
+#ifdef CONFIG_SMP
+	rcpus = &__get_cpu_var(rps_remote_softirq_cpus);
+	select = rcpus->select;
+	rcpus->select ^= 1;
+	local_irq_enable();
+	net_rps_action(&rcpus->mask[select]);
+#else
+	local_irq_enable();
+#endif
+
 #ifdef CONFIG_NET_DMA
 	/*
 	 * There may not be any more sk_buffs coming right now, so push
@@ -3329,10 +3531,10 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 {
 	struct netif_rx_stats *s = v;
 
-	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
		   s->total, s->dropped, s->time_squeeze, 0,
		   0, 0, 0, 0, /* was fastroute */
-		   s->cpu_collision);
+		   s->cpu_collision, s->received_rps);
 	return 0;
 }
@@ -5069,6 +5271,23 @@ int register_netdevice(struct net_device *dev)
 
 	dev->iflink = -1;
 
+	if (!dev->num_rx_queues) {
+		/*
+		 * Allocate a single RX queue if driver never called
+		 * alloc_netdev_mq
+		 */
+
+		dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
+		if (!dev->_rx) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		dev->_rx->first = dev->_rx;
+		atomic_set(&dev->_rx->count, 1);
+		dev->num_rx_queues = 1;
+	}
+
 	/* Init, if this function is available */
 	if (dev->netdev_ops->ndo_init) {
 		ret = dev->netdev_ops->ndo_init(dev);
@@ -5426,9 +5645,11 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		void (*setup)(struct net_device *), unsigned int queue_count)
 {
 	struct netdev_queue *tx;
+	struct netdev_rx_queue *rx;
 	struct net_device *dev;
 	size_t alloc_size;
 	struct net_device *p;
+	int i;
 
 	BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -5454,11 +5675,27 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 		goto free_p;
 	}
 
+	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
+	if (!rx) {
+		printk(KERN_ERR "alloc_netdev: Unable to allocate "
+		       "rx queues.\n");
+		goto free_tx;
+	}
+
+	atomic_set(&rx->count, queue_count);
+
+	/*
+	 * Set a pointer to first element in the array which holds the
+	 * reference count.
+	 */
+	for (i = 0; i < queue_count; i++)
+		rx[i].first = rx;
+
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
 
 	if (dev_addr_init(dev))
-		goto free_tx;
+		goto free_rx;
 
 	dev_unicast_init(dev);
@@ -5468,6 +5705,9 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	dev->num_tx_queues = queue_count;
 	dev->real_num_tx_queues = queue_count;
 
+	dev->_rx = rx;
+	dev->num_rx_queues = queue_count;
+
 	dev->gso_max_size = GSO_MAX_SIZE;
 
 	netdev_init_queues(dev);
@@ -5482,9 +5722,10 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
 	strcpy(dev->name, name);
 	return dev;
 
+free_rx:
+	kfree(rx);
 free_tx:
 	kfree(tx);
-
 free_p:
 	kfree(p);
 	return NULL;
@@ -5987,6 +6228,12 @@ static int __init net_dev_init(void)
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 
+#ifdef CONFIG_SMP
+		queue->csd.func = trigger_softirq;
+		queue->csd.info = queue;
+		queue->csd.flags = 0;
+#endif
+
 		queue->backlog.poll = process_backlog;
 		queue->backlog.weight = weight_p;
 		queue->backlog.gro_list = NULL;
@@ -6025,7 +6272,7 @@ subsys_initcall(net_dev_init);
 
 static int __init initialize_hashrnd(void)
 {
-	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
+	get_random_bytes(&hashrnd, sizeof(hashrnd));
 	return 0;
 }
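Both skb_tx_hash() and get_rps_cpu() in the diff above map a 32-bit flow hash onto a small index range with ((u64) hash * len) >> 32 rather than a modulo, so the hash's full range is spread evenly over len buckets. A minimal standalone C sketch of that scaling (hash_to_index and the sample values are illustrative and not part of the patch):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Map a 32-bit hash onto [0, len) by a 64-bit multiply and shift, as the
 * patch does for both TX queue and RPS CPU selection. */
static uint16_t hash_to_index(uint32_t hash, uint16_t len)
{
	return (uint16_t)(((uint64_t)hash * len) >> 32);
}

int main(void)
{
	const uint16_t len = 8;	/* stands in for rps_map->len */
	const uint32_t hashes[] = { 0x00000000u, 0x7fffffffu, 0xdeadbeefu, 0xffffffffu };

	for (size_t i = 0; i < sizeof(hashes) / sizeof(hashes[0]); i++)
		printf("hash %08" PRIx32 " -> index %u\n", hashes[i],
		       (unsigned)hash_to_index(hashes[i], len));
	return 0;
}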
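The rps_remote_softirq_cpus comment in the diff describes a double-buffered CPU mask: enqueue_to_backlog marks remote CPUs in mask[select] with interrupts off, and net_rx_action flips select before draining the other mask with interrupts enabled. A simplified single-threaded sketch of that bookkeeping (plain user-space C, not kernel code; send_ipi stands in for __smp_call_function_single):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct remote_cpus {
	bool mask[2][NR_CPUS];	/* two masks: one fills while the other is drained */
	int select;		/* index of the mask new requests go into */
};

static struct remote_cpus rcpus;

static void send_ipi(int cpu)
{
	printf("kick backlog processing on CPU %d\n", cpu);
}

/* Hot path: record that a remote CPU needs a kick (kernel does this with IRQs off). */
static void mark_remote(int cpu)
{
	rcpus.mask[rcpus.select][cpu] = true;
}

/* Softirq tail: flip the active mask, then drain the now-inactive one. */
static void flush_remote(void)
{
	int drain = rcpus.select;

	rcpus.select ^= 1;	/* later marks land in the other mask */
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (rcpus.mask[drain][cpu]) {
			rcpus.mask[drain][cpu] = false;
			send_ipi(cpu);
		}
	}
}

int main(void)
{
	mark_remote(1);
	mark_remote(3);
	flush_remote();	/* sends the two pending kicks */
	return 0;
}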