summaryrefslogtreecommitdiff
path: root/kernel/bpf
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-16 19:28:34 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-16 19:28:34 -0700
commit51835949dda3783d4639cfa74ce13a3c9829de00 (patch)
tree2b593de5eba6ecc73f7c58fc65fdaffae45c7323 /kernel/bpf
parent0434dbe32053d07d658165be681505120c6b1abc (diff)
parent77ae5e5b00720372af2860efdc4bc652ac682696 (diff)
Merge tag 'net-next-6.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-nextHEADmaster
Pull networking updates from Jakub Kicinski: "Not much excitement - a handful of large patchsets (devmem among them) did not make it in time. Core & protocols: - Use local_lock in addition to local_bh_disable() to protect per-CPU resources in networking, a step closer for local_bh_disable() not to act as a big lock on PREEMPT_RT - Use flex array for netdevice priv area, ensure its cache alignment - Add a sysctl knob to allow user to specify a default rto_min at socket init time. Bit of a big hammer but multiple companies were independently carrying such patch downstream so clearly it's useful - Support scheduling transmission of packets based on CLOCK_TAI - Un-pin TCP TIMEWAIT timer to avoid it firing on CPUs later cordoned off using cpusets - Support multiple L2TPv3 UDP tunnels using the same 5-tuple address - Allow configuration of multipath hash seed, to both allow synchronizing hashing of two routers, and preventing partial accidental sync - Improve TCP compliance with RFC 9293 for simultaneous connect() - Support sending NAT keepalives in IPsec ESP in UDP states. Userspace IKE daemon had to do this before, but the kernel can better keep track of it - Support sending supervision HSR frames with MAC addresses stored in ProxyNodeTable when RedBox (i.e. HSR-SAN) is enabled - Introduce IPPROTO_SMC for selecting SMC when socket is created - Allow UDP GSO transmit from devices with no checksum offload - openvswitch: add packet sampling via psample, separating the sampled traffic from "upcall" packets sent to user space for forwarding - nf_tables: shrink memory consumption for transaction objects Things we sprinkled into general kernel code: - Power Sequencing subsystem (used by Qualcomm Bluetooth driver for QCA6390) [ Already merged separately - Linus ] - Add IRQ information in sysfs for auxiliary bus - Introduce guard definition for local_lock - Add aligned flavor of __cacheline_group_{begin, end}() markings for grouping fields in structures BPF: - Notify user space (via epoll) when a struct_ops object is getting detached/unregistered - Add new kfuncs for a generic, open-coded bits iterator - Enable BPF programs to declare arrays of kptr, bpf_rb_root, and bpf_list_head - Support resilient split BTF which cuts down on duplication and makes BTF as compact as possible WRT BTF from modules - Add support for dumping kfunc prototypes from BTF which enables both detecting as well as dumping compilable prototypes for kfuncs - riscv64 BPF JIT improvements in particular to add 12-argument support for BPF trampolines and to utilize bpf_prog_pack for the latter - Add the capability to offload the netfilter flowtable in XDP layer through kfuncs Driver API: - Allow users to configure IRQ tresholds between which automatic IRQ moderation can choose - Expand Power Sourcing (PoE) status with power, class and failure reason. Support setting power limits - Track additional RSS contexts in the core, make sure configuration changes don't break them - Support IPsec crypto offload for IPv6 ESP and IPv4 UDP-encapsulated ESP data paths - Support updating firmware on SFP modules Tests and tooling: - mptcp: use net/lib.sh to manage netns - TCP-AO and TCP-MD5: replace debug prints used by tests with tracepoints - openvswitch: make test self-contained (don't depend on OvS CLI tools) Drivers: - Ethernet high-speed NICs: - Broadcom (bnxt): - increase the max total outstanding PTP TX packets to 4 - add timestamping statistics support - implement netdev_queue_mgmt_ops - support new RSS context API - Intel (100G, ice, idpf): - implement FEC statistics and dumping signal quality indicators - support E825C products (with 56Gbps PHYs) - nVidia/Mellanox: - support HW-GRO - mlx4/mlx5: support per-queue statistics via netlink - obey the max number of EQs setting in sub-functions - AMD/Solarflare: - support new RSS context API - AMD/Pensando: - ionic: rework fix for doorbell miss to lower overhead and skip it on new HW - Wangxun: - txgbe: support Flow Director perfect filters - Ethernet NICs consumer, embedded and virtual: - Add driver for Tehuti Networks TN40xx chips - Add driver for Meta's internal NIC chips - Add driver for Ethernet MAC on Airoha EN7581 SoCs - Add driver for Renesas Ethernet-TSN devices - Google cloud vNIC: - flow steering support - Microsoft vNIC: - support page sizes other than 4KB on ARM64 - vmware vNIC: - support latency measurement (update to version 9) - VirtIO net: - support for Byte Queue Limits - support configuring thresholds for automatic IRQ moderation - support for AF_XDP Rx zero-copy - Synopsys (stmmac): - support for STM32MP13 SoC - let platforms select the right PCS implementation - TI: - icssg-prueth: add multicast filtering support - icssg-prueth: enable PTP timestamping and PPS - Renesas: - ravb: improve Rx performance 30-400% by using page pool, theaded NAPI and timer-based IRQ coalescing - ravb: add MII support for R-Car V4M - Cadence (macb): - macb: add ARP support to Wake-On-LAN - Cortina: - use phylib for RX and TX pause configuration - Ethernet switches: - nVidia/Mellanox: - support configuration of multipath hash seed - report more accurate max MTU - use page_pool to improve Rx performance - MediaTek: - mt7530: add support for bridge port isolation - Qualcomm: - qca8k: add support for bridge port isolation - Microchip: - lan9371/2: add 100BaseTX PHY support - NXP: - vsc73xx: implement VLAN operations - Ethernet PHYs: - aquantia: enable support for aqr115c - aquantia: add support for PHY LEDs - realtek: add support for rtl8224 2.5Gbps PHY - xpcs: add memory-mapped device support - add BroadR-Reach link mode and support in Broadcom's PHY driver - CAN: - add document for ISO 15765-2 protocol support - mcp251xfd: workaround for erratum DS80000789E, use timestamps to catch when device returns incorrect FIFO status - WiFi: - mac80211/cfg80211: - parse Transmit Power Envelope (TPE) data in mac80211 instead of in drivers - improvements for 6 GHz regulatory flexibility - multi-link improvements - support multiple radios per wiphy - remove DEAUTH_NEED_MGD_TX_PREP flag - Intel (iwlwifi): - bump FW API to 91 for BZ/SC devices - report 64-bit radiotap timestamp - enable P2P low latency by default - handle Transmit Power Envelope (TPE) advertised by AP - remove support for older FW for new devices - fast resume (keeping the device configured) - mvm: re-enable Multi-Link Operation (MLO) - aggregation (A-MSDU) optimizations - MediaTek (mt76): - mt7925 Multi-Link Operation (MLO) support - Qualcomm (ath10k): - LED support for various chipsets - Qualcomm (ath12k): - remove unsupported Tx monitor handling - support channel 2 in 6 GHz band - support Spatial Multiplexing Power Save (SMPS) in 6 GHz band - supprt multiple BSSID (MBSSID) and Enhanced Multi-BSSID Advertisements (EMA) - support dynamic VLAN - add panic handler for resetting the firmware state - DebugFS support for datapath statistics - WCN7850: support for Wake on WLAN - Microchip (wilc1000): - read MAC address during probe to make it visible to user space - suspend/resume improvements - TI (wl18xx): - support newer firmware versions - RealTek (rtw89): - preparation for RTL8852BE-VT support - Wake on WLAN support for WiFi 6 chips - 36-bit PCI DMA support - RealTek (rtlwifi): - RTL8192DU support - Broadcom (brcmfmac): - Management Frame Protection support (to enable WPA3) - Bluetooth: - qualcomm: use the power sequencer for QCA6390 - btusb: mediatek: add ISO data transmission functions - hci_bcm4377: add BCM4388 support - btintel: add support for BlazarU core - btintel: add support for Whale Peak2 - btnxpuart: add support for AW693 A1 chipset - btnxpuart: add support for IW615 chipset - btusb: add Realtek RTL8852BE support ID 0x13d3:0x3591" * tag 'net-next-6.11' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1589 commits) eth: fbnic: Fix spelling mistake "tiggerring" -> "triggering" tcp: Replace strncpy() with strscpy() wifi: ath12k: fix build vs old compiler tcp: Don't access uninit tcp_rsk(req)->ao_keyid in tcp_create_openreq_child(). eth: fbnic: Write the TCAM tables used for RSS control and Rx to host eth: fbnic: Add L2 address programming eth: fbnic: Add basic Rx handling eth: fbnic: Add basic Tx handling eth: fbnic: Add link detection eth: fbnic: Add initial messaging to notify FW of our presence eth: fbnic: Implement Rx queue alloc/start/stop/free eth: fbnic: Implement Tx queue alloc/start/stop/free eth: fbnic: Allocate a netdevice and napi vectors with queues eth: fbnic: Add FW communication mechanism eth: fbnic: Add message parsing for FW messages eth: fbnic: Add register init to set PCIe/Ethernet device config eth: fbnic: Allocate core device specific structures and devlink interface eth: fbnic: Add scaffolding for Meta's NIC driver PCI: Add Meta Platforms vendor ID net/sched: cls_flower: propagate tca[TCA_OPTIONS] to NL_REQ_ATTR_CHECK ...
Diffstat (limited to 'kernel/bpf')
-rw-r--r--kernel/bpf/Makefile8
-rw-r--r--kernel/bpf/bpf_lsm.c1
-rw-r--r--kernel/bpf/bpf_struct_ops.c77
-rw-r--r--kernel/bpf/btf.c509
-rw-r--r--kernel/bpf/core.c8
-rw-r--r--kernel/bpf/cpumap.c35
-rw-r--r--kernel/bpf/crypto.c42
-rw-r--r--kernel/bpf/devmap.c57
-rw-r--r--kernel/bpf/helpers.c164
-rw-r--r--kernel/bpf/log.c6
-rw-r--r--kernel/bpf/syscall.c34
-rw-r--r--kernel/bpf/task_iter.c9
-rw-r--r--kernel/bpf/verifier.c324
13 files changed, 830 insertions, 444 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 7eb9ad3a3ae6..0291eef9ce92 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -50,5 +50,11 @@ endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
-$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
+obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
+
+# Some source files are common to libbpf.
+vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf
+
+$(obj)/%.o: %.c FORCE
$(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 68240c3c6e7d..08a338e1f231 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -280,6 +280,7 @@ BTF_ID(func, bpf_lsm_cred_prepare)
BTF_ID(func, bpf_lsm_file_ioctl)
BTF_ID(func, bpf_lsm_file_lock)
BTF_ID(func, bpf_lsm_file_open)
+BTF_ID(func, bpf_lsm_file_post_open)
BTF_ID(func, bpf_lsm_file_receive)
BTF_ID(func, bpf_lsm_inode_create)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 86c7884abaf8..0d515ec57aa5 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -12,6 +12,7 @@
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>
+#include <linux/poll.h>
struct bpf_struct_ops_value {
struct bpf_struct_ops_common_value common;
@@ -56,6 +57,7 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_link {
struct bpf_link link;
struct bpf_map __rcu *map;
+ wait_queue_head_t wait_hup;
};
static DEFINE_MUTEX(update_mutex);
@@ -571,7 +573,7 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
}
size = arch_prepare_bpf_trampoline(NULL, image + image_off,
- image + PAGE_SIZE,
+ image + image_off + size,
model, flags, tlinks, stub_func);
if (size <= 0) {
if (image != *_image)
@@ -757,7 +759,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- err = st_ops->reg(kdata);
+ err = st_ops->reg(kdata, NULL);
if (likely(!err)) {
/* This refcnt increment on the map here after
* 'st_ops->reg()' is secure since the state of the
@@ -805,7 +807,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);
bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
@@ -1057,10 +1059,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
st_map = (struct bpf_struct_ops_map *)
rcu_dereference_protected(st_link->map, true);
if (st_map) {
- /* st_link->map can be NULL if
- * bpf_struct_ops_link_create() fails to register.
- */
- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
bpf_map_put(&st_map->map);
}
kfree(st_link);
@@ -1075,7 +1074,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
- seq_printf(seq, "map_id:\t%d\n", map->id);
+ if (map)
+ seq_printf(seq, "map_id:\t%d\n", map->id);
rcu_read_unlock();
}
@@ -1088,7 +1088,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
- info->struct_ops.map_id = map->id;
+ if (map)
+ info->struct_ops.map_id = map->id;
rcu_read_unlock();
return 0;
}
@@ -1113,6 +1114,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
mutex_lock(&update_mutex);
old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!old_map) {
+ err = -ENOLINK;
+ goto err_out;
+ }
if (expected_old_map && old_map != expected_old_map) {
err = -EPERM;
goto err_out;
@@ -1125,7 +1130,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
goto err_out;
}
- err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+ err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
if (err)
goto err_out;
@@ -1139,11 +1144,53 @@ err_out:
return err;
}
+static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
+{
+ struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link);
+ struct bpf_struct_ops_map *st_map;
+ struct bpf_map *map;
+
+ mutex_lock(&update_mutex);
+
+ map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
+ if (!map) {
+ mutex_unlock(&update_mutex);
+ return 0;
+ }
+ st_map = container_of(map, struct bpf_struct_ops_map, map);
+
+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
+
+ RCU_INIT_POINTER(st_link->map, NULL);
+ /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
+ * bpf_map_inc() in bpf_struct_ops_map_link_update().
+ */
+ bpf_map_put(&st_map->map);
+
+ mutex_unlock(&update_mutex);
+
+ wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP);
+
+ return 0;
+}
+
+static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
+ struct poll_table_struct *pts)
+{
+ struct bpf_struct_ops_link *st_link = file->private_data;
+
+ poll_wait(file, &st_link->wait_hup, pts);
+
+ return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
+}
+
static const struct bpf_link_ops bpf_struct_ops_map_lops = {
.dealloc = bpf_struct_ops_map_link_dealloc,
+ .detach = bpf_struct_ops_map_link_detach,
.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
.update_map = bpf_struct_ops_map_link_update,
+ .poll = bpf_struct_ops_map_link_poll,
};
int bpf_struct_ops_link_create(union bpf_attr *attr)
@@ -1176,13 +1223,21 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
if (err)
goto err_out;
- err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data);
+ init_waitqueue_head(&link->wait_hup);
+
+ /* Hold the update_mutex such that the subsystem cannot
+ * do link->ops->detach() before the link is fully initialized.
+ */
+ mutex_lock(&update_mutex);
+ err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link);
if (err) {
+ mutex_unlock(&update_mutex);
bpf_link_cleanup(&link_primer);
link = NULL;
goto err_out;
}
RCU_INIT_POINTER(link->map, map);
+ mutex_unlock(&update_mutex);
return bpf_link_settle(&link_primer);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 821063660d9f..520f49f422fe 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -274,6 +274,7 @@ struct btf {
u32 start_str_off; /* first string offset (0 for base BTF) */
char name[MODULE_NAME_LEN];
bool kernel_btf;
+ __u32 *base_id_map; /* map from distilled base BTF -> vmlinux BTF ids */
};
enum verifier_phase {
@@ -414,7 +415,7 @@ const char *btf_type_str(const struct btf_type *t)
struct btf_show {
u64 flags;
void *target; /* target of show operation (seq file, buffer) */
- void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
+ __printf(2, 0) void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
const struct btf *btf;
/* below are used during iteration */
struct {
@@ -530,6 +531,11 @@ static bool btf_type_is_decl_tag_target(const struct btf_type *t)
btf_type_is_var(t) || btf_type_is_typedef(t);
}
+bool btf_is_vmlinux(const struct btf *btf)
+{
+ return btf->kernel_btf && !btf->base_btf;
+}
+
u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
@@ -772,7 +778,7 @@ static bool __btf_name_char_ok(char c, bool first)
return true;
}
-static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
+const char *btf_str_by_offset(const struct btf *btf, u32 offset)
{
while (offset < btf->start_str_off)
btf = btf->base_btf;
@@ -1670,14 +1676,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf)
if (!tab)
return;
- /* For module BTF, we directly assign the sets being registered, so
- * there is nothing to free except kfunc_set_tab.
- */
- if (btf_is_module(btf))
- goto free_tab;
for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
kfree(tab->sets[hook]);
-free_tab:
kfree(tab);
btf->kfunc_set_tab = NULL;
}
@@ -1735,7 +1735,12 @@ static void btf_free(struct btf *btf)
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
- kvfree(btf->data);
+ /* vmlinux does not allocate btf->data, it simply points it at
+ * __start_BTF.
+ */
+ if (!btf_is_vmlinux(btf))
+ kvfree(btf->data);
+ kvfree(btf->base_id_map);
kfree(btf);
}
@@ -1764,6 +1769,23 @@ void btf_put(struct btf *btf)
}
}
+struct btf *btf_base_btf(const struct btf *btf)
+{
+ return btf->base_btf;
+}
+
+const struct btf_header *btf_header(const struct btf *btf)
+{
+ return &btf->hdr;
+}
+
+void btf_set_base_btf(struct btf *btf, const struct btf *base_btf)
+{
+ btf->base_btf = (struct btf *)base_btf;
+ btf->start_id = btf_nr_types(base_btf);
+ btf->start_str_off = base_btf->hdr.str_len;
+}
+
static int env_resolve_init(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
@@ -3442,10 +3464,12 @@ btf_find_graph_root(const struct btf *btf, const struct btf_type *pt,
goto end; \
}
-static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
+static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_type,
+ u32 field_mask, u32 *seen_mask,
int *align, int *sz)
{
int type = 0;
+ const char *name = __btf_name_by_offset(btf, var_type->name_off);
if (field_mask & BPF_SPIN_LOCK) {
if (!strcmp(name, "bpf_spin_lock")) {
@@ -3481,7 +3505,7 @@ static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
- if (field_mask & BPF_KPTR) {
+ if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) {
type = BPF_KPTR_REF;
goto end;
}
@@ -3494,140 +3518,232 @@ end:
#undef field_mask_test_name
+/* Repeat a number of fields for a specified number of times.
+ *
+ * Copy the fields starting from the first field and repeat them for
+ * repeat_cnt times. The fields are repeated by adding the offset of each
+ * field with
+ * (i + 1) * elem_size
+ * where i is the repeat index and elem_size is the size of an element.
+ */
+static int btf_repeat_fields(struct btf_field_info *info,
+ u32 field_cnt, u32 repeat_cnt, u32 elem_size)
+{
+ u32 i, j;
+ u32 cur;
+
+ /* Ensure not repeating fields that should not be repeated. */
+ for (i = 0; i < field_cnt; i++) {
+ switch (info[i].type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ cur = field_cnt;
+ for (i = 0; i < repeat_cnt; i++) {
+ memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0]));
+ for (j = 0; j < field_cnt; j++)
+ info[cur++].off += (i + 1) * elem_size;
+ }
+
+ return 0;
+}
+
static int btf_find_struct_field(const struct btf *btf,
const struct btf_type *t, u32 field_mask,
- struct btf_field_info *info, int info_cnt)
+ struct btf_field_info *info, int info_cnt,
+ u32 level);
+
+/* Find special fields in the struct type of a field.
+ *
+ * This function is used to find fields of special types that is not a
+ * global variable or a direct field of a struct type. It also handles the
+ * repetition if it is the element type of an array.
+ */
+static int btf_find_nested_struct(const struct btf *btf, const struct btf_type *t,
+ u32 off, u32 nelems,
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt, u32 level)
{
- int ret, idx = 0, align, sz, field_type;
- const struct btf_member *member;
+ int ret, err, i;
+
+ level++;
+ if (level >= MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ ret = btf_find_struct_field(btf, t, field_mask, info, info_cnt, level);
+
+ if (ret <= 0)
+ return ret;
+
+ /* Shift the offsets of the nested struct fields to the offsets
+ * related to the container.
+ */
+ for (i = 0; i < ret; i++)
+ info[i].off += off;
+
+ if (nelems > 1) {
+ err = btf_repeat_fields(info, ret, nelems - 1, t->size);
+ if (err == 0)
+ ret *= nelems;
+ else
+ ret = err;
+ }
+
+ return ret;
+}
+
+static int btf_find_field_one(const struct btf *btf,
+ const struct btf_type *var,
+ const struct btf_type *var_type,
+ int var_idx,
+ u32 off, u32 expected_size,
+ u32 field_mask, u32 *seen_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, align, sz, field_type;
struct btf_field_info tmp;
+ const struct btf_array *array;
+ u32 i, nelems = 1;
+
+ /* Walk into array types to find the element type and the number of
+ * elements in the (flattened) array.
+ */
+ for (i = 0; i < MAX_RESOLVE_DEPTH && btf_type_is_array(var_type); i++) {
+ array = btf_array(var_type);
+ nelems *= array->nelems;
+ var_type = btf_type_by_id(btf, array->type);
+ }
+ if (i == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+ if (nelems == 0)
+ return 0;
+
+ field_type = btf_get_field_type(btf, var_type,
+ field_mask, seen_mask, &align, &sz);
+ /* Look into variables of struct types */
+ if (!field_type && __btf_type_is_struct(var_type)) {
+ sz = var_type->size;
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ ret = btf_find_nested_struct(btf, var_type, off, nelems, field_mask,
+ &info[0], info_cnt, level);
+ return ret;
+ }
+
+ if (field_type == 0)
+ return 0;
+ if (field_type < 0)
+ return field_type;
+
+ if (expected_size && expected_size != sz * nelems)
+ return 0;
+ if (off % align)
+ return 0;
+
+ switch (field_type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_WORKQUEUE:
+ case BPF_LIST_NODE:
+ case BPF_RB_NODE:
+ case BPF_REFCOUNT:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
+ info_cnt ? &info[0] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ case BPF_KPTR_PERCPU:
+ ret = btf_find_kptr(btf, var_type, off, sz,
+ info_cnt ? &info[0] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_RB_ROOT:
+ ret = btf_find_graph_root(btf, var, var_type,
+ var_idx, off, sz,
+ info_cnt ? &info[0] : &tmp,
+ field_type);
+ if (ret < 0)
+ return ret;
+ break;
+ default:
+ return -EFAULT;
+ }
+
+ if (ret == BTF_FIELD_IGNORE)
+ return 0;
+ if (nelems > info_cnt)
+ return -E2BIG;
+ if (nelems > 1) {
+ ret = btf_repeat_fields(info, 1, nelems - 1, sz);
+ if (ret < 0)
+ return ret;
+ }
+ return nelems;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
+ struct btf_field_info *info, int info_cnt,
+ u32 level)
+{
+ int ret, idx = 0;
+ const struct btf_member *member;
u32 i, off, seen_mask = 0;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off),
- field_mask, &seen_mask, &align, &sz);
- if (field_type == 0)
- continue;
- if (field_type < 0)
- return field_type;
-
off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % align)
- continue;
- switch (field_type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- case BPF_WORKQUEUE:
- case BPF_LIST_NODE:
- case BPF_RB_NODE:
- case BPF_REFCOUNT:
- ret = btf_find_struct(btf, member_type, off, sz, field_type,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_KPTR_UNREF:
- case BPF_KPTR_REF:
- case BPF_KPTR_PERCPU:
- ret = btf_find_kptr(btf, member_type, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_LIST_HEAD:
- case BPF_RB_ROOT:
- ret = btf_find_graph_root(btf, t, member_type,
- i, off, sz,
- idx < info_cnt ? &info[idx] : &tmp,
- field_type);
- if (ret < 0)
- return ret;
- break;
- default:
- return -EFAULT;
- }
-
- if (ret == BTF_FIELD_IGNORE)
- continue;
- if (idx >= info_cnt)
- return -E2BIG;
- ++idx;
+ ret = btf_find_field_one(btf, t, member_type, i,
+ off, 0,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx, level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
return idx;
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
u32 field_mask, struct btf_field_info *info,
- int info_cnt)
+ int info_cnt, u32 level)
{
- int ret, idx = 0, align, sz, field_type;
+ int ret, idx = 0;
const struct btf_var_secinfo *vsi;
- struct btf_field_info tmp;
u32 i, off, seen_mask = 0;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
- field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off),
- field_mask, &seen_mask, &align, &sz);
- if (field_type == 0)
- continue;
- if (field_type < 0)
- return field_type;
-
off = vsi->offset;
- if (vsi->size != sz)
- continue;
- if (off % align)
- continue;
-
- switch (field_type) {
- case BPF_SPIN_LOCK:
- case BPF_TIMER:
- case BPF_WORKQUEUE:
- case BPF_LIST_NODE:
- case BPF_RB_NODE:
- case BPF_REFCOUNT:
- ret = btf_find_struct(btf, var_type, off, sz, field_type,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_KPTR_UNREF:
- case BPF_KPTR_REF:
- case BPF_KPTR_PERCPU:
- ret = btf_find_kptr(btf, var_type, off, sz,
- idx < info_cnt ? &info[idx] : &tmp);
- if (ret < 0)
- return ret;
- break;
- case BPF_LIST_HEAD:
- case BPF_RB_ROOT:
- ret = btf_find_graph_root(btf, var, var_type,
- -1, off, sz,
- idx < info_cnt ? &info[idx] : &tmp,
- field_type);
- if (ret < 0)
- return ret;
- break;
- default:
- return -EFAULT;
- }
-
- if (ret == BTF_FIELD_IGNORE)
- continue;
- if (idx >= info_cnt)
- return -E2BIG;
- ++idx;
+ ret = btf_find_field_one(btf, var, var_type, -1, off, vsi->size,
+ field_mask, &seen_mask,
+ &info[idx], info_cnt - idx,
+ level);
+ if (ret < 0)
+ return ret;
+ idx += ret;
}
return idx;
}
@@ -3637,9 +3753,9 @@ static int btf_find_field(const struct btf *btf, const struct btf_type *t,
int info_cnt)
{
if (__btf_type_is_struct(t))
- return btf_find_struct_field(btf, t, field_mask, info, info_cnt);
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt, 0);
else if (btf_type_is_datasec(t))
- return btf_find_datasec_var(btf, t, field_mask, info, info_cnt);
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt, 0);
return -EINVAL;
}
@@ -5726,6 +5842,15 @@ static int find_kern_ctx_type_id(enum bpf_prog_type prog_type)
return ctx_type->type;
}
+bool btf_is_projection_of(const char *pname, const char *tname)
+{
+ if (strcmp(pname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
+ return true;
+ if (strcmp(pname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ return true;
+ return false;
+}
+
bool btf_is_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
@@ -5788,9 +5913,7 @@ again:
* int socket_filter_bpf_prog(struct __sk_buff *skb)
* { // no fields of skb are ever used }
*/
- if (strcmp(ctx_tname, "__sk_buff") == 0 && strcmp(tname, "sk_buff") == 0)
- return true;
- if (strcmp(ctx_tname, "xdp_md") == 0 && strcmp(tname, "xdp_buff") == 0)
+ if (btf_is_projection_of(ctx_tname, tname))
return true;
if (strcmp(ctx_tname, tname)) {
/* bpf_user_pt_regs_t is a typedef, so resolve it to
@@ -5982,23 +6105,15 @@ int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_ty
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)
-struct btf *btf_parse_vmlinux(void)
+static struct btf *btf_parse_base(struct btf_verifier_env *env, const char *name,
+ void *data, unsigned int data_size)
{
- struct btf_verifier_env *env = NULL;
- struct bpf_verifier_log *log;
struct btf *btf = NULL;
int err;
if (!IS_ENABLED(CONFIG_DEBUG_INFO_BTF))
return ERR_PTR(-ENOENT);
- env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
- if (!env)
- return ERR_PTR(-ENOMEM);
-
- log = &env->log;
- log->level = BPF_LOG_KERNEL;
-
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
@@ -6006,10 +6121,10 @@ struct btf *btf_parse_vmlinux(void)
}
env->btf = btf;
- btf->data = __start_BTF;
- btf->data_size = __stop_BTF - __start_BTF;
+ btf->data = data;
+ btf->data_size = data_size;
btf->kernel_btf = true;
- snprintf(btf->name, sizeof(btf->name), "vmlinux");
+ snprintf(btf->name, sizeof(btf->name), "%s", name);
err = btf_parse_hdr(env);
if (err)
@@ -6029,20 +6144,11 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
- /* btf_parse_vmlinux() runs under bpf_verifier_lock */
- bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
-
refcount_set(&btf->refcnt, 1);
- err = btf_alloc_id(btf);
- if (err)
- goto errout;
-
- btf_verifier_env_free(env);
return btf;
errout:
- btf_verifier_env_free(env);
if (btf) {
kvfree(btf->types);
kfree(btf);
@@ -6050,19 +6156,61 @@ errout:
return ERR_PTR(err);
}
+struct btf *btf_parse_vmlinux(void)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf;
+ int err;
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+ btf = btf_parse_base(env, "vmlinux", __start_BTF, __stop_BTF - __start_BTF);
+ if (IS_ERR(btf))
+ goto err_out;
+
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
+ err = btf_alloc_id(btf);
+ if (err) {
+ btf_free(btf);
+ btf = ERR_PTR(err);
+ }
+err_out:
+ btf_verifier_env_free(env);
+ return btf;
+}
+
+/* If .BTF_ids section was created with distilled base BTF, both base and
+ * split BTF ids will need to be mapped to actual base/split ids for
+ * BTF now that it has been relocated.
+ */
+static __u32 btf_relocate_id(const struct btf *btf, __u32 id)
+{
+ if (!btf->base_btf || !btf->base_id_map)
+ return id;
+ return btf->base_id_map[id];
+}
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
-static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size)
+static struct btf *btf_parse_module(const char *module_name, const void *data,
+ unsigned int data_size, void *base_data,
+ unsigned int base_data_size)
{
+ struct btf *btf = NULL, *vmlinux_btf, *base_btf = NULL;
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
- struct btf *btf = NULL, *base_btf;
- int err;
+ int err = 0;
- base_btf = bpf_get_btf_vmlinux();
- if (IS_ERR(base_btf))
- return base_btf;
- if (!base_btf)
+ vmlinux_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(vmlinux_btf))
+ return vmlinux_btf;
+ if (!vmlinux_btf)
return ERR_PTR(-EINVAL);
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
@@ -6072,6 +6220,16 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
log = &env->log;
log->level = BPF_LOG_KERNEL;
+ if (base_data) {
+ base_btf = btf_parse_base(env, ".BTF.base", base_data, base_data_size);
+ if (IS_ERR(base_btf)) {
+ err = PTR_ERR(base_btf);
+ goto errout;
+ }
+ } else {
+ base_btf = vmlinux_btf;
+ }
+
btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
if (!btf) {
err = -ENOMEM;
@@ -6111,12 +6269,22 @@ static struct btf *btf_parse_module(const char *module_name, const void *data, u
if (err)
goto errout;
+ if (base_btf != vmlinux_btf) {
+ err = btf_relocate(btf, vmlinux_btf, &btf->base_id_map);
+ if (err)
+ goto errout;
+ btf_free(base_btf);
+ base_btf = vmlinux_btf;
+ }
+
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
errout:
btf_verifier_env_free(env);
+ if (base_btf != vmlinux_btf)
+ btf_free(base_btf);
if (btf) {
kvfree(btf->data);
kvfree(btf->types);
@@ -6693,7 +6861,7 @@ int btf_struct_access(struct bpf_verifier_log *log,
for (i = 0; i < rec->cnt; i++) {
struct btf_field *field = &rec->fields[i];
u32 offset = field->offset;
- if (off < offset + btf_field_type_size(field->type) && offset < off + size) {
+ if (off < offset + field->size && offset < off + size) {
bpf_log(log,
"direct access to %s is disallowed\n",
btf_field_type_name(field->type));
@@ -7370,8 +7538,8 @@ static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
}
-static void btf_seq_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_seq_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
seq_vprintf((struct seq_file *)show->target, fmt, args);
}
@@ -7404,8 +7572,8 @@ struct btf_show_snprintf {
int len; /* length we would have written */
};
-static void btf_snprintf_show(struct btf_show *show, const char *fmt,
- va_list args)
+__printf(2, 0) static void btf_snprintf_show(struct btf_show *show, const char *fmt,
+ va_list args)
{
struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
int len;
@@ -7669,7 +7837,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
err = -ENOMEM;
goto out;
}
- btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
+ btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size,
+ mod->btf_base_data, mod->btf_base_data_size);
if (IS_ERR(btf)) {
kfree(btf_mod);
if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) {
@@ -7993,7 +8162,7 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
bool add_filter = !!kset->filter;
struct btf_kfunc_set_tab *tab;
struct btf_id_set8 *set;
- u32 set_cnt;
+ u32 set_cnt, i;
int ret;
if (hook >= BTF_KFUNC_HOOK_MAX) {
@@ -8039,21 +8208,15 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
goto end;
}
- /* We don't need to allocate, concatenate, and sort module sets, because
- * only one is allowed per hook. Hence, we can directly assign the
- * pointer and return.
- */
- if (!vmlinux_set) {
- tab->sets[hook] = add_set;
- goto do_add_filter;
- }
-
/* In case of vmlinux sets, there may be more than one set being
* registered per hook. To create a unified set, we allocate a new set
* and concatenate all individual sets being registered. While each set
* is individually sorted, they may become unsorted when concatenated,
* hence re-sorting the final set again is required to make binary
* searching the set using btf_id_set8_contains function work.
+ *
+ * For module sets, we need to allocate as we may need to relocate
+ * BTF ids.
*/
set_cnt = set ? set->cnt : 0;
@@ -8083,11 +8246,14 @@ static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
/* Concatenate the two sets */
memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
+ /* Now that the set is copied, update with relocated BTF ids */
+ for (i = set->cnt; i < set->cnt + add_set->cnt; i++)
+ set->pairs[i].id = btf_relocate_id(btf, set->pairs[i].id);
+
set->cnt += add_set->cnt;
sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
-do_add_filter:
if (add_filter) {
hook_filter = &tab->hook_filters[hook];
hook_filter->filters[hook_filter->nr_filters++] = kset->filter;
@@ -8207,7 +8373,7 @@ static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
return PTR_ERR(btf);
for (i = 0; i < kset->set->cnt; i++) {
- ret = btf_check_kfunc_protos(btf, kset->set->pairs[i].id,
+ ret = btf_check_kfunc_protos(btf, btf_relocate_id(btf, kset->set->pairs[i].id),
kset->set->pairs[i].flags);
if (ret)
goto err_out;
@@ -8271,7 +8437,7 @@ static int btf_check_dtor_kfuncs(struct btf *btf, const struct btf_id_dtor_kfunc
u32 nr_args, i;
for (i = 0; i < cnt; i++) {
- dtor_btf_id = dtors[i].kfunc_btf_id;
+ dtor_btf_id = btf_relocate_id(btf, dtors[i].kfunc_btf_id);
dtor_func = btf_type_by_id(btf, dtor_btf_id);
if (!dtor_func || !btf_type_is_func(dtor_func))
@@ -8306,7 +8472,7 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
{
struct btf_id_dtor_kfunc_tab *tab;
struct btf *btf;
- u32 tab_cnt;
+ u32 tab_cnt, i;
int ret;
btf = btf_get_module_btf(owner);
@@ -8357,6 +8523,13 @@ int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_c
btf->dtor_kfunc_tab = tab;
memcpy(tab->dtors + tab->cnt, dtors, add_cnt * sizeof(tab->dtors[0]));
+
+ /* remap BTF ids based on BTF relocation (if any) */
+ for (i = tab_cnt; i < tab_cnt + add_cnt; i++) {
+ tab->dtors[i].btf_id = btf_relocate_id(btf, tab->dtors[i].btf_id);
+ tab->dtors[i].kfunc_btf_id = btf_relocate_id(btf, tab->dtors[i].kfunc_btf_id);
+ }
+
tab->cnt += add_cnt;
sort(tab->dtors, tab->cnt, sizeof(tab->dtors[0]), btf_id_cmp_func, NULL);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 695a0fb2cd4d..7ee62e38faf0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1173,8 +1173,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
}
/* Copy JITed text from rw_header to its final location, the ro_header. */
-int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
- struct bpf_binary_header *ro_header,
+int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
struct bpf_binary_header *rw_header)
{
void *ptr;
@@ -2742,8 +2741,7 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
-void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
- struct btf_mod_pair *used_btfs, u32 len)
+void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len)
{
#ifdef CONFIG_BPF_SYSCALL
struct btf_mod_pair *btf_mod;
@@ -2760,7 +2758,7 @@ void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
{
- __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
+ __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt);
kfree(aux->used_btfs);
}
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a8e34416e960..fbdf5a1aabfe 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -79,8 +79,6 @@ struct bpf_cpu_map {
struct bpf_cpu_map_entry __rcu **cpu_map;
};
-static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
@@ -240,12 +238,14 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
int xdp_n, struct xdp_cpumap_stats *stats,
struct list_head *list)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int nframes;
if (!rcpu->prog)
return xdp_n;
rcu_read_lock_bh();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
@@ -255,6 +255,7 @@ static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
if (unlikely(!list_empty(list)))
cpu_map_bpf_prog_run_skb(rcpu, list, stats);
+ bpf_net_ctx_clear(bpf_net_ctx);
rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
return nframes;
@@ -706,7 +707,6 @@ static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
*/
static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
@@ -723,8 +723,11 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
*/
bq->q[bq->count++] = xdpf;
- if (!bq->flush_node.prev)
+ if (!bq->flush_node.prev) {
+ struct list_head *flush_list = bpf_net_ctx_get_cpu_map_flush_list();
+
list_add(&bq->flush_node, flush_list);
+ }
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
@@ -756,9 +759,8 @@ trace:
return ret;
}
-void __cpu_map_flush(void)
+void __cpu_map_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -768,24 +770,3 @@ void __cpu_map_flush(void)
wake_up_process(bq->obj->kthread);
}
}
-
-#ifdef CONFIG_DEBUG_NET
-bool cpu_map_check_flush(void)
-{
- if (list_empty(this_cpu_ptr(&cpu_map_flush_list)))
- return false;
- __cpu_map_flush();
- return true;
-}
-#endif
-
-static int __init cpu_map_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(cpu_map_flush_list, cpu));
- return 0;
-}
-
-subsys_initcall(cpu_map_init);
diff --git a/kernel/bpf/crypto.c b/kernel/bpf/crypto.c
index 2bee4af91e38..94854cd9c4cc 100644
--- a/kernel/bpf/crypto.c
+++ b/kernel/bpf/crypto.c
@@ -275,7 +275,7 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
if (__bpf_dynptr_is_rdonly(dst))
return -EINVAL;
- siv_len = __bpf_dynptr_size(siv);
+ siv_len = siv ? __bpf_dynptr_size(siv) : 0;
src_len = __bpf_dynptr_size(src);
dst_len = __bpf_dynptr_size(dst);
if (!src_len || !dst_len)
@@ -303,36 +303,44 @@ static int bpf_crypto_crypt(const struct bpf_crypto_ctx *ctx,
/**
* bpf_crypto_decrypt() - Decrypt buffer using configured context and IV provided.
- * @ctx: The crypto context being used. The ctx must be a trusted pointer.
- * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer.
- * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
- * @siv: bpf_dynptr to IV data and state data to be used by decryptor.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the encrypted data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
*
* Decrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
*/
__bpf_kfunc int bpf_crypto_decrypt(struct bpf_crypto_ctx *ctx,
- const struct bpf_dynptr_kern *src,
- const struct bpf_dynptr_kern *dst,
- const struct bpf_dynptr_kern *siv)
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
{
- return bpf_crypto_crypt(ctx, src, dst, siv, true);
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, true);
}
/**
* bpf_crypto_encrypt() - Encrypt buffer using configured context and IV provided.
- * @ctx: The crypto context being used. The ctx must be a trusted pointer.
- * @src: bpf_dynptr to the plain data. Must be a trusted pointer.
- * @dst: bpf_dynptr to buffer where to store the result. Must be a trusted pointer.
- * @siv: bpf_dynptr to IV data and state data to be used by decryptor.
+ * @ctx: The crypto context being used. The ctx must be a trusted pointer.
+ * @src: bpf_dynptr to the plain data. Must be a trusted pointer.
+ * @dst: bpf_dynptr to the buffer where to store the result. Must be a trusted pointer.
+ * @siv__nullable: bpf_dynptr to IV data and state data to be used by decryptor. May be NULL.
*
* Encrypts provided buffer using IV data and the crypto context. Crypto context must be configured.
*/
__bpf_kfunc int bpf_crypto_encrypt(struct bpf_crypto_ctx *ctx,
- const struct bpf_dynptr_kern *src,
- const struct bpf_dynptr_kern *dst,
- const struct bpf_dynptr_kern *siv)
+ const struct bpf_dynptr *src,
+ const struct bpf_dynptr *dst,
+ const struct bpf_dynptr *siv__nullable)
{
- return bpf_crypto_crypt(ctx, src, dst, siv, false);
+ const struct bpf_dynptr_kern *src_kern = (struct bpf_dynptr_kern *)src;
+ const struct bpf_dynptr_kern *dst_kern = (struct bpf_dynptr_kern *)dst;
+ const struct bpf_dynptr_kern *siv_kern = (struct bpf_dynptr_kern *)siv__nullable;
+
+ return bpf_crypto_crypt(ctx, src_kern, dst_kern, siv_kern, false);
}
__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 7f3b34452243..9e0e3b0a18e4 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -83,7 +83,6 @@ struct bpf_dtab {
u32 n_buckets;
};
-static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
@@ -107,7 +106,7 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}
-static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+static int dev_map_alloc_check(union bpf_attr *attr)
{
u32 valsize = attr->value_size;
@@ -121,23 +120,28 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return -EINVAL;
+ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+ /* Hash table size must be power of 2; roundup_pow_of_two()
+ * can overflow into UB on 32-bit arches
+ */
+ if (attr->max_entries > 1UL << 31)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
+{
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
* verifier prevents writes from the BPF side
*/
attr->map_flags |= BPF_F_RDONLY_PROG;
-
-
bpf_map_init_from_attr(&dtab->map, attr);
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- /* hash table size must be power of 2; roundup_pow_of_two() can
- * overflow into UB on 32-bit arches, so check that first
- */
- if (dtab->map.max_entries > 1UL << 31)
- return -EINVAL;
-
+ /* Hash table size must be power of 2 */
dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
-
dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
dtab->map.numa_node);
if (!dtab->dev_index_head)
@@ -196,7 +200,14 @@ static void dev_map_free(struct bpf_map *map)
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
- bpf_clear_redirect_map(map);
+ /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
+ * during NAPI callback and cleared after the XDP redirect. There is no
+ * explicit RCU read section which protects bpf_redirect_info->map but
+ * local_bh_disable() also marks the beginning an RCU section. This
+ * makes the complete softirq callback RCU protected. Thus after
+ * following synchronize_rcu() there no bpf_redirect_info->map == map
+ * assignment.
+ */
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
@@ -406,9 +417,8 @@ out:
* driver before returning from its napi->poll() routine. See the comment above
* xdp_do_flush() in filter.c.
*/
-void __dev_flush(void)
+void __dev_flush(struct list_head *flush_list)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
@@ -419,16 +429,6 @@ void __dev_flush(void)
}
}
-#ifdef CONFIG_DEBUG_NET
-bool dev_check_flush(void)
-{
- if (list_empty(this_cpu_ptr(&dev_flush_list)))
- return false;
- __dev_flush();
- return true;
-}
-#endif
-
/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
* by local_bh_disable() (from XDP calls inside NAPI). The
* rcu_read_lock_bh_held() below makes lockdep accept both.
@@ -453,7 +453,6 @@ static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
- struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
@@ -467,6 +466,8 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
* are only ever modified together.
*/
if (!bq->dev_rx) {
+ struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();
+
bq->dev_rx = dev_rx;
bq->xdp_prog = xdp_prog;
list_add(&bq->flush_node, flush_list);
@@ -1040,6 +1041,7 @@ static u64 dev_map_mem_usage(const struct bpf_map *map)
BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
const struct bpf_map_ops dev_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_get_next_key,
@@ -1054,6 +1056,7 @@ const struct bpf_map_ops dev_map_ops = {
const struct bpf_map_ops dev_map_hash_ops = {
.map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = dev_map_alloc_check,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_hash_get_next_key,
@@ -1153,15 +1156,11 @@ static struct notifier_block dev_map_notifier = {
static int __init dev_map_init(void)
{
- int cpu;
-
/* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
offsetof(struct _bpf_dtab_netdev, dev));
register_netdevice_notifier(&dev_map_notifier);
- for_each_possible_cpu(cpu)
- INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
return 0;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 3243c83ef3e3..b5f0adae8293 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2498,7 +2498,7 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
/**
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
- * @ptr: The dynptr whose data slice to retrieve
+ * @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
* @buffer__opt: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
@@ -2524,9 +2524,10 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset,
+__bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
enum bpf_dynptr_type type;
u32 len = buffer__szk;
int err;
@@ -2568,7 +2569,7 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
/**
* bpf_dynptr_slice_rdwr() - Obtain a writable pointer to the dynptr data.
- * @ptr: The dynptr whose data slice to retrieve
+ * @p: The dynptr whose data slice to retrieve
* @offset: Offset into the dynptr
* @buffer__opt: User-provided buffer to copy contents into. May be NULL
* @buffer__szk: Size (in bytes) of the buffer if present. This is the
@@ -2608,9 +2609,11 @@ __bpf_kfunc void *bpf_dynptr_slice(const struct bpf_dynptr_kern *ptr, u32 offset
* provided buffer, with its contents containing the data, if unable to obtain
* direct pointer)
*/
-__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 offset,
+__bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, u32 offset,
void *buffer__opt, u32 buffer__szk)
{
+ const struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data || __bpf_dynptr_is_rdonly(ptr))
return NULL;
@@ -2636,11 +2639,12 @@ __bpf_kfunc void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr_kern *ptr, u32 o
* will be copied out into the buffer and the user will need to call
* bpf_dynptr_write() to commit changes.
*/
- return bpf_dynptr_slice(ptr, offset, buffer__opt, buffer__szk);
+ return bpf_dynptr_slice(p, offset, buffer__opt, buffer__szk);
}
-__bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 end)
+__bpf_kfunc int bpf_dynptr_adjust(const struct bpf_dynptr *p, u32 start, u32 end)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
u32 size;
if (!ptr->data || start > end)
@@ -2657,36 +2661,45 @@ __bpf_kfunc int bpf_dynptr_adjust(struct bpf_dynptr_kern *ptr, u32 start, u32 en
return 0;
}
-__bpf_kfunc bool bpf_dynptr_is_null(struct bpf_dynptr_kern *ptr)
+__bpf_kfunc bool bpf_dynptr_is_null(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
return !ptr->data;
}
-__bpf_kfunc bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+__bpf_kfunc bool bpf_dynptr_is_rdonly(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data)
return false;
return __bpf_dynptr_is_rdonly(ptr);
}
-__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr_kern *ptr)
+__bpf_kfunc __u32 bpf_dynptr_size(const struct bpf_dynptr *p)
{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data)
return -EINVAL;
return __bpf_dynptr_size(ptr);
}
-__bpf_kfunc int bpf_dynptr_clone(struct bpf_dynptr_kern *ptr,
- struct bpf_dynptr_kern *clone__uninit)
+__bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p,
+ struct bpf_dynptr *clone__uninit)
{
+ struct bpf_dynptr_kern *clone = (struct bpf_dynptr_kern *)clone__uninit;
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)p;
+
if (!ptr->data) {
- bpf_dynptr_set_null(clone__uninit);
+ bpf_dynptr_set_null(clone);
return -EINVAL;
}
- *clone__uninit = *ptr;
+ *clone = *ptr;
return 0;
}
@@ -2786,7 +2799,7 @@ __bpf_kfunc int bpf_wq_start(struct bpf_wq *wq, unsigned int flags)
}
__bpf_kfunc int bpf_wq_set_callback_impl(struct bpf_wq *wq,
- int (callback_fn)(void *map, int *key, struct bpf_wq *wq),
+ int (callback_fn)(void *map, int *key, void *value),
unsigned int flags,
void *aux__ign)
{
@@ -2809,6 +2822,122 @@ __bpf_kfunc void bpf_preempt_enable(void)
preempt_enable();
}
+struct bpf_iter_bits {
+ __u64 __opaque[2];
+} __aligned(8);
+
+struct bpf_iter_bits_kern {
+ union {
+ unsigned long *bits;
+ unsigned long bits_copy;
+ };
+ u32 nr_bits;
+ int bit;
+} __aligned(8);
+
+/**
+ * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area
+ * @it: The new bpf_iter_bits to be created
+ * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over
+ * @nr_words: The size of the specified memory area, measured in 8-byte units.
+ * Due to the limitation of memalloc, it can't be greater than 512.
+ *
+ * This function initializes a new bpf_iter_bits structure for iterating over
+ * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It
+ * copies the data of the memory area to the newly created bpf_iter_bits @it for
+ * subsequent iteration operations.
+ *
+ * On success, 0 is returned. On failure, ERR is returned.
+ */
+__bpf_kfunc int
+bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ u32 nr_bytes = nr_words * sizeof(u64);
+ u32 nr_bits = BYTES_TO_BITS(nr_bytes);
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits));
+ BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) !=
+ __alignof__(struct bpf_iter_bits));
+
+ kit->nr_bits = 0;
+ kit->bits_copy = 0;
+ kit->bit = -1;
+
+ if (!unsafe_ptr__ign || !nr_words)
+ return -EINVAL;
+
+ /* Optimization for u64 mask */
+ if (nr_bits == 64) {
+ err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign);
+ if (err)
+ return -EFAULT;
+
+ kit->nr_bits = nr_bits;
+ return 0;
+ }
+
+ /* Fallback to memalloc */
+ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes);
+ if (!kit->bits)
+ return -ENOMEM;
+
+ err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign);
+ if (err) {
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+ return err;
+ }
+
+ kit->nr_bits = nr_bits;
+ return 0;
+}
+
+/**
+ * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits
+ * @it: The bpf_iter_bits to be checked
+ *
+ * This function returns a pointer to a number representing the value of the
+ * next bit in the bits.
+ *
+ * If there are no further bits available, it returns NULL.
+ */
+__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+ u32 nr_bits = kit->nr_bits;
+ const unsigned long *bits;
+ int bit;
+
+ if (nr_bits == 0)
+ return NULL;
+
+ bits = nr_bits == 64 ? &kit->bits_copy : kit->bits;
+ bit = find_next_bit(bits, nr_bits, kit->bit + 1);
+ if (bit >= nr_bits) {
+ kit->nr_bits = 0;
+ return NULL;
+ }
+
+ kit->bit = bit;
+ return &kit->bit;
+}
+
+/**
+ * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits
+ * @it: The bpf_iter_bits to be destroyed
+ *
+ * Destroy the resource associated with the bpf_iter_bits.
+ */
+__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it)
+{
+ struct bpf_iter_bits_kern *kit = (void *)it;
+
+ if (kit->nr_bits <= 64)
+ return;
+ bpf_mem_free(&bpf_global_ma, kit->bits);
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@@ -2891,6 +3020,9 @@ BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
BTF_ID_FLAGS(func, bpf_wq_start)
BTF_ID_FLAGS(func, bpf_preempt_disable)
BTF_ID_FLAGS(func, bpf_preempt_enable)
+BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
+BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
@@ -2932,7 +3064,9 @@ late_initcall(kfunc_init);
*/
const void *__bpf_dynptr_data(const struct bpf_dynptr_kern *ptr, u32 len)
{
- return bpf_dynptr_slice(ptr, 0, NULL, len);
+ const struct bpf_dynptr *p = (struct bpf_dynptr *)ptr;
+
+ return bpf_dynptr_slice(p, 0, NULL, len);
}
/* Get a pointer to dynptr data up to len bytes for read write access. If
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 4bd8f17a9f24..5aebfc3051e3 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -91,7 +91,7 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
goto fail;
} else {
u64 new_end, new_start;
- u32 buf_start, buf_end, new_n;
+ u32 buf_start, buf_end;
new_end = log->end_pos + n;
if (new_end - log->start_pos >= log->len_total)
@@ -708,7 +708,9 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id));
verbose(env, "(");
if (reg->id)
- verbose_a("id=%d", reg->id);
+ verbose_a("id=%d", reg->id & ~BPF_ADD_CONST);
+ if (reg->id & BPF_ADD_CONST)
+ verbose(env, "%+d", reg->off);
if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (type_is_non_owning_ref(reg->type))
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index f45ed6adc092..869265852d51 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3151,6 +3151,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif
+static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct bpf_link *link = file->private_data;
+
+ return link->ops->poll(file, pts);
+}
+
static const struct file_operations bpf_link_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_link_show_fdinfo,
@@ -3160,6 +3167,16 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};
+static const struct file_operations bpf_link_fops_poll = {
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = bpf_link_show_fdinfo,
+#endif
+ .release = bpf_link_release,
+ .read = bpf_dummy_read,
+ .write = bpf_dummy_write,
+ .poll = bpf_link_poll,
+};
+
static int bpf_link_alloc_id(struct bpf_link *link)
{
int id;
@@ -3202,7 +3219,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
return id;
}
- file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
+ file = anon_inode_getfile("bpf_link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
if (IS_ERR(file)) {
bpf_link_free_id(id);
put_unused_fd(fd);
@@ -3230,7 +3249,9 @@ int bpf_link_settle(struct bpf_link_primer *primer)
int bpf_link_new_fd(struct bpf_link *link)
{
- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
+ return anon_inode_getfd("bpf-link",
+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
+ link, O_CLOEXEC);
}
struct bpf_link *bpf_link_get_from_fd(u32 ufd)
@@ -3240,7 +3261,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
if (!f.file)
return ERR_PTR(-EBADF);
- if (f.file->f_op != &bpf_link_fops) {
+ if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) {
fdput(f);
return ERR_PTR(-EINVAL);
}
@@ -4972,7 +4993,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
uattr);
else if (f.file->f_op == &btf_fops)
err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
- else if (f.file->f_op == &bpf_link_fops)
+ else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll)
err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
attr, uattr);
else
@@ -5107,7 +5128,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (!file)
return -EBADF;
- if (file->f_op == &bpf_link_fops) {
+ if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) {
struct bpf_link *link = file->private_data;
if (link->ops == &bpf_raw_tp_link_lops) {
@@ -5417,10 +5438,11 @@ static int link_detach(union bpf_attr *attr)
return ret;
}
-static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
}
+EXPORT_SYMBOL(bpf_link_inc_not_zero);
struct bpf_link *bpf_link_by_id(u32 id)
{
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index ec4e97c61eef..02aa9db8d796 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -261,6 +261,7 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
u32 saved_tid = info->tid;
struct task_struct *curr_task;
unsigned int curr_fd = info->fd;
+ struct file *f;
/* If this function returns a non-NULL file object,
* it held a reference to the task/file.
@@ -286,12 +287,8 @@ again:
}
rcu_read_lock();
- for (;; curr_fd++) {
- struct file *f;
- f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
- if (!f)
- break;
-
+ f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
+ if (f) {
/* set info->fd */
info->fd = curr_fd;
rcu_read_unlock();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 214a9fa8c6fb..8da132a1ef28 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2982,8 +2982,10 @@ static int check_subprogs(struct bpf_verifier_env *env)
if (code == (BPF_JMP | BPF_CALL) &&
insn[i].src_reg == 0 &&
- insn[i].imm == BPF_FUNC_tail_call)
+ insn[i].imm == BPF_FUNC_tail_call) {
subprog[cur_subprog].has_tail_call = true;
+ subprog[cur_subprog].tail_call_reachable = true;
+ }
if (BPF_CLASS(code) == BPF_LD &&
(BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND))
subprog[cur_subprog].has_ld_abs = true;
@@ -3215,7 +3217,8 @@ static int insn_def_regno(const struct bpf_insn *insn)
case BPF_ST:
return -1;
case BPF_STX:
- if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ if ((BPF_MODE(insn->code) == BPF_ATOMIC ||
+ BPF_MODE(insn->code) == BPF_PROBE_ATOMIC) &&
(insn->imm & BPF_FETCH)) {
if (insn->imm == BPF_CMPXCHG)
return BPF_REG_0;
@@ -3991,7 +3994,7 @@ static bool idset_contains(struct bpf_idset *s, u32 id)
u32 i;
for (i = 0; i < s->count; ++i)
- if (s->ids[i] == id)
+ if (s->ids[i] == (id & ~BPF_ADD_CONST))
return true;
return false;
@@ -4001,7 +4004,7 @@ static int idset_push(struct bpf_idset *s, u32 id)
{
if (WARN_ON_ONCE(s->count >= ARRAY_SIZE(s->ids)))
return -EFAULT;
- s->ids[s->count++] = id;
+ s->ids[s->count++] = id & ~BPF_ADD_CONST;
return 0;
}
@@ -4438,8 +4441,20 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
static void assign_scalar_id_before_mov(struct bpf_verifier_env *env,
struct bpf_reg_state *src_reg)
{
- if (src_reg->type == SCALAR_VALUE && !src_reg->id &&
- !tnum_is_const(src_reg->var_off))
+ if (src_reg->type != SCALAR_VALUE)
+ return;
+
+ if (src_reg->id & BPF_ADD_CONST) {
+ /*
+ * The verifier is processing rX = rY insn and
+ * rY->id has special linked register already.
+ * Cleared it, since multiple rX += const are not supported.
+ */
+ src_reg->id = 0;
+ src_reg->off = 0;
+ }
+
+ if (!src_reg->id && !tnum_is_const(src_reg->var_off))
/* Ensure that src_reg has a valid ID that will be copied to
* dst_reg and then will be used by find_equal_scalars() to
* propagate min/max range.
@@ -5449,7 +5464,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
+ if (reg->smin_value + off < p + field->size &&
p < reg->umax_value + off + size) {
switch (field->type) {
case BPF_KPTR_UNREF:
@@ -7715,6 +7730,13 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
int err;
+ if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env,
+ "arg#%d expected pointer to stack or const struct bpf_dynptr\n",
+ regno);
+ return -EINVAL;
+ }
+
/* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
* ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
*/
@@ -9464,6 +9486,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
}
} else if (arg->arg_type == (ARG_PTR_TO_DYNPTR | MEM_RDONLY)) {
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_PTR_TO_DYNPTR);
+ if (ret)
+ return ret;
+
ret = process_dynptr_func(env, regno, -1, arg->arg_type, 0);
if (ret)
return ret;
@@ -10917,7 +10943,7 @@ enum {
};
BTF_ID_LIST(kf_arg_btf_ids)
-BTF_ID(struct, bpf_dynptr_kern)
+BTF_ID(struct, bpf_dynptr)
BTF_ID(struct, bpf_list_head)
BTF_ID(struct, bpf_list_node)
BTF_ID(struct, bpf_rb_root)
@@ -11190,6 +11216,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (btf_is_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
return KF_ARG_PTR_TO_CTX;
+ if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
+ return KF_ARG_PTR_TO_NULL;
+
if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_ALLOC_BTF_ID;
@@ -11235,9 +11264,6 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_callback(env, meta->btf, &args[argno]))
return KF_ARG_PTR_TO_CALLBACK;
- if (is_kfunc_arg_nullable(meta->btf, &args[argno]) && register_is_null(reg))
- return KF_ARG_PTR_TO_NULL;
-
if (argno + 1 < nargs &&
(is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]) ||
is_kfunc_arg_const_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1])))
@@ -11268,6 +11294,8 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
bool strict_type_match = false;
const struct btf *reg_btf;
const char *reg_ref_tname;
+ bool taking_projection;
+ bool struct_same;
u32 reg_ref_id;
if (base_type(reg->type) == PTR_TO_BTF_ID) {
@@ -11307,11 +11335,19 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
btf_type_ids_nocast_alias(&env->log, reg_btf, reg_ref_id, meta->btf, ref_id))
strict_type_match = true;
- WARN_ON_ONCE(is_kfunc_trusted_args(meta) && reg->off);
+ WARN_ON_ONCE(is_kfunc_release(meta) &&
+ (reg->off || !tnum_is_const(reg->var_off) ||
+ reg->var_off.value));
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
- if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+ struct_same = btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match);
+ /* If kfunc is accepting a projection type (ie. __sk_buff), it cannot
+ * actually use it -- it must cast to the underlying type. So we allow
+ * caller to pass in the underlying type.
+ */
+ taking_projection = btf_is_projection_of(ref_tname, reg_ref_tname);
+ if (!taking_projection && !struct_same) {
verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
btf_type_str(reg_ref_t), reg_ref_tname);
@@ -11651,7 +11687,7 @@ __process_kf_arg_ptr_to_graph_node(struct bpf_verifier_env *env,
node_off = reg->off + reg->var_off.value;
field = reg_find_field_offset(reg, node_off, node_field_type);
- if (!field || field->offset != node_off) {
+ if (!field) {
verbose(env, "%s not found at offset=%u\n", node_type_name, node_off);
return -EINVAL;
}
@@ -11883,12 +11919,8 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
}
-
fallthrough;
case KF_ARG_PTR_TO_CTX:
- /* Trusted arguments have the same offset checks as release arguments */
- arg_type |= OBJ_RELEASE;
- break;
case KF_ARG_PTR_TO_DYNPTR:
case KF_ARG_PTR_TO_ITER:
case KF_ARG_PTR_TO_LIST_HEAD:
@@ -11901,7 +11933,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
case KF_ARG_PTR_TO_WORKQUEUE:
- /* Trusted by default */
break;
default:
WARN_ON_ONCE(1);
@@ -11957,12 +11988,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
enum bpf_arg_type dynptr_arg_type = ARG_PTR_TO_DYNPTR;
int clone_ref_obj_id = 0;
- if (reg->type != PTR_TO_STACK &&
- reg->type != CONST_PTR_TO_DYNPTR) {
- verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
- return -EINVAL;
- }
-
if (reg->type == CONST_PTR_TO_DYNPTR)
dynptr_arg_type |= MEM_RDONLY;
@@ -12701,56 +12726,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return 0;
}
-static bool signed_add_overflows(s64 a, s64 b)
-{
- /* Do the add in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a + (u64)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_add32_overflows(s32 a, s32 b)
-{
- /* Do the add in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a + (u32)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_add16_overflows(s16 a, s16 b)
-{
- /* Do the add in u16, where overflow is well-defined */
- s16 res = (s16)((u16)a + (u16)b);
-
- if (b < 0)
- return res > a;
- return res < a;
-}
-
-static bool signed_sub_overflows(s64 a, s64 b)
-{
- /* Do the sub in u64, where overflow is well-defined */
- s64 res = (s64)((u64)a - (u64)b);
-
- if (b < 0)
- return res < a;
- return res > a;
-}
-
-static bool signed_sub32_overflows(s32 a, s32 b)
-{
- /* Do the sub in u32, where overflow is well-defined */
- s32 res = (s32)((u32)a - (u32)b);
-
- if (b < 0)
- return res < a;
- return res > a;
-}
-
static bool check_reg_sane_offset(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
enum bpf_reg_type type)
@@ -13232,21 +13207,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
* added into the variable offset, and we copy the fixed offset
* from ptr_reg.
*/
- if (signed_add_overflows(smin_ptr, smin_val) ||
- signed_add_overflows(smax_ptr, smax_val)) {
+ if (check_add_overflow(smin_ptr, smin_val, &dst_reg->smin_value) ||
+ check_add_overflow(smax_ptr, smax_val, &dst_reg->smax_value)) {
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr + smin_val;
- dst_reg->smax_value = smax_ptr + smax_val;
}
- if (umin_ptr + umin_val < umin_ptr ||
- umax_ptr + umax_val < umax_ptr) {
+ if (check_add_overflow(umin_ptr, umin_val, &dst_reg->umin_value) ||
+ check_add_overflow(umax_ptr, umax_val, &dst_reg->umax_value)) {
dst_reg->umin_value = 0;
dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value = umin_ptr + umin_val;
- dst_reg->umax_value = umax_ptr + umax_val;
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
@@ -13289,14 +13258,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* A new variable offset is created. If the subtrahend is known
* nonnegative, then any reg->range we had before is still good.
*/
- if (signed_sub_overflows(smin_ptr, smax_val) ||
- signed_sub_overflows(smax_ptr, smin_val)) {
+ if (check_sub_overflow(smin_ptr, smax_val, &dst_reg->smin_value) ||
+ check_sub_overflow(smax_ptr, smin_val, &dst_reg->smax_value)) {
/* Overflow possible, we know nothing */
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value = smin_ptr - smax_val;
- dst_reg->smax_value = smax_ptr - smin_val;
}
if (umin_ptr < umax_val) {
/* Overflow possible, we know nothing */
@@ -13349,71 +13315,56 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
static void scalar32_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
- if (signed_add32_overflows(dst_reg->s32_min_value, smin_val) ||
- signed_add32_overflows(dst_reg->s32_max_value, smax_val)) {
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value += smin_val;
- dst_reg->s32_max_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->s32_min_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->s32_max_value, dst_smax)) {
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
- if (dst_reg->u32_min_value + umin_val < umin_val ||
- dst_reg->u32_max_value + umax_val < umax_val) {
- dst_reg->u32_min_value = 0;
- dst_reg->u32_max_value = U32_MAX;
- } else {
- dst_reg->u32_min_value += umin_val;
- dst_reg->u32_max_value += umax_val;
+ if (check_add_overflow(*dst_umin, src_reg->u32_min_value, dst_umin) ||
+ check_add_overflow(*dst_umax, src_reg->u32_max_value, dst_umax)) {
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
}
static void scalar_min_max_add(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
- if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
- signed_add_overflows(dst_reg->smax_value, smax_val)) {
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value += smin_val;
- dst_reg->smax_value += smax_val;
+ if (check_add_overflow(*dst_smin, src_reg->smin_value, dst_smin) ||
+ check_add_overflow(*dst_smax, src_reg->smax_value, dst_smax)) {
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
- if (dst_reg->umin_value + umin_val < umin_val ||
- dst_reg->umax_value + umax_val < umax_val) {
- dst_reg->umin_value = 0;
- dst_reg->umax_value = U64_MAX;
- } else {
- dst_reg->umin_value += umin_val;
- dst_reg->umax_value += umax_val;
+ if (check_add_overflow(*dst_umin, src_reg->umin_value, dst_umin) ||
+ check_add_overflow(*dst_umax, src_reg->umax_value, dst_umax)) {
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
}
static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- s32 smax_val = src_reg->s32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
u32 umin_val = src_reg->u32_min_value;
u32 umax_val = src_reg->u32_max_value;
- if (signed_sub32_overflows(dst_reg->s32_min_value, smax_val) ||
- signed_sub32_overflows(dst_reg->s32_max_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->s32_max_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->s32_min_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
- } else {
- dst_reg->s32_min_value -= smax_val;
- dst_reg->s32_max_value -= smin_val;
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
}
if (dst_reg->u32_min_value < umax_val) {
/* Overflow possible, we know nothing */
@@ -13429,19 +13380,16 @@ static void scalar32_min_max_sub(struct bpf_reg_state *dst_reg,
static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- s64 smax_val = src_reg->smax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
u64 umin_val = src_reg->umin_value;
u64 umax_val = src_reg->umax_value;
- if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
- signed_sub_overflows(dst_reg->smax_value, smin_val)) {
+ if (check_sub_overflow(*dst_smin, src_reg->smax_value, dst_smin) ||
+ check_sub_overflow(*dst_smax, src_reg->smin_value, dst_smax)) {
/* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
- } else {
- dst_reg->smin_value -= smax_val;
- dst_reg->smax_value -= smin_val;
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
}
if (dst_reg->umin_value < umax_val) {
/* Overflow possible, we know nothing */
@@ -14047,6 +13995,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
+ bool alu32 = (BPF_CLASS(insn->code) != BPF_ALU64);
u8 opcode = BPF_OP(insn->code);
int err;
@@ -14069,11 +14018,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
- else
- /* Make sure ID is cleared otherwise dst_reg min/max could be
- * incorrectly propagated into other registers by find_equal_scalars()
- */
- dst_reg->id = 0;
+
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
@@ -14137,7 +14082,43 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
- return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ err = adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
+ if (err)
+ return err;
+ /*
+ * Compilers can generate the code
+ * r1 = r2
+ * r1 += 0x1
+ * if r2 < 1000 goto ...
+ * use r1 in memory access
+ * So remember constant delta between r2 and r1 and update r1 after
+ * 'if' condition.
+ */
+ if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD &&
+ dst_reg->id && is_reg_const(src_reg, alu32)) {
+ u64 val = reg_const_value(src_reg, alu32);
+
+ if ((dst_reg->id & BPF_ADD_CONST) ||
+ /* prevent overflow in find_equal_scalars() later */
+ val > (u32)S32_MAX) {
+ /*
+ * If the register already went through rX += val
+ * we cannot accumulate another val into rx->off.
+ */
+ dst_reg->off = 0;
+ dst_reg->id = 0;
+ } else {
+ dst_reg->id |= BPF_ADD_CONST;
+ dst_reg->off = val;
+ }
+ } else {
+ /*
+ * Make sure ID is cleared otherwise dst_reg min/max could be
+ * incorrectly propagated into other registers by find_equal_scalars()
+ */
+ dst_reg->id = 0;
+ }
+ return 0;
}
/* check validity of 32-bit and 64-bit arithmetic operations */
@@ -15109,12 +15090,36 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
static void find_equal_scalars(struct bpf_verifier_state *vstate,
struct bpf_reg_state *known_reg)
{
+ struct bpf_reg_state fake_reg;
struct bpf_func_state *state;
struct bpf_reg_state *reg;
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
- if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
+ if (reg->type != SCALAR_VALUE || reg == known_reg)
+ continue;
+ if ((reg->id & ~BPF_ADD_CONST) != (known_reg->id & ~BPF_ADD_CONST))
+ continue;
+ if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) ||
+ reg->off == known_reg->off) {
+ copy_register_state(reg, known_reg);
+ } else {
+ s32 saved_off = reg->off;
+
+ fake_reg.type = SCALAR_VALUE;
+ __mark_reg_known(&fake_reg, (s32)reg->off - (s32)known_reg->off);
+
+ /* reg = known_reg; reg += delta */
copy_register_state(reg, known_reg);
+ /*
+ * Must preserve off, id and add_const flag,
+ * otherwise another find_equal_scalars() will be incorrect.
+ */
+ reg->off = saved_off;
+
+ scalar32_min_max_add(reg, &fake_reg);
+ scalar_min_max_add(reg, &fake_reg);
+ reg->var_off = tnum_add(reg->var_off, fake_reg.var_off);
+ }
}));
}
@@ -16749,6 +16754,10 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
if (!rold->precise && exact == NOT_EXACT)
return true;
+ if ((rold->id & BPF_ADD_CONST) != (rcur->id & BPF_ADD_CONST))
+ return false;
+ if ((rold->id & BPF_ADD_CONST) && (rold->off != rcur->off))
+ return false;
/* Why check_ids() for scalar registers?
*
* Consider the following BPF code:
@@ -18630,8 +18639,7 @@ static void release_maps(struct bpf_verifier_env *env)
/* drop refcnt of maps used by the rejected program */
static void release_btfs(struct bpf_verifier_env *env)
{
- __bpf_free_used_btfs(env->prog->aux, env->used_btfs,
- env->used_btf_cnt);
+ __bpf_free_used_btfs(env->used_btfs, env->used_btf_cnt);
}
/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
@@ -18750,6 +18758,8 @@ static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
{
struct bpf_insn *insn = prog->insnsi;
u32 insn_cnt = prog->len, i;
+ s32 imm;
+ s16 off;
for (i = 0; i < insn_cnt; i++, insn++) {
u8 code = insn->code;
@@ -18761,15 +18771,15 @@ static int adjust_jmp_off(struct bpf_prog *prog, u32 tgt_idx, u32 delta)
if (insn->code == (BPF_JMP32 | BPF_JA)) {
if (i + 1 + insn->imm != tgt_idx)
continue;
- if (signed_add32_overflows(insn->imm, delta))
+ if (check_add_overflow(insn->imm, delta, &imm))
return -ERANGE;
- insn->imm += delta;
+ insn->imm = imm;
} else {
if (i + 1 + insn->off != tgt_idx)
continue;
- if (signed_add16_overflows(insn->imm, delta))
+ if (check_add_overflow(insn->off, delta, &off))
return -ERANGE;
- insn->off += delta;
+ insn->off = off;
}
}
return 0;