diff options
author | Mauro Carvalho Chehab <mchehab@kernel.org> | 2022-12-28 16:07:44 +0000 |
---|---|---|
committer | Mauro Carvalho Chehab <mchehab@kernel.org> | 2022-12-28 16:07:44 +0000 |
commit | 6599e683db1bf22fee74302c47e31b9a42a1c3d2 (patch) | |
tree | b11b13b325fbc0f8d20c27b9b3d304cf535f627f /kernel | |
parent | d4acfa22b634347be33d5906744366742fccd151 (diff) | |
parent | 1b929c02afd37871d5afb9d498426f83432e71c2 (diff) |
Linux 6.2-rc1
* tag 'v6.2-rc1': (14398 commits)
Linux 6.2-rc1
treewide: Convert del_timer*() to timer_shutdown*()
pstore: Properly assign mem_type property
pstore: Make sure CONFIG_PSTORE_PMSG selects CONFIG_RT_MUTEXES
cfi: Fix CFI failure with KASAN
perf python: Fix splitting CC into compiler and options
afs: Stop implementing ->writepage()
afs: remove afs_cache_netfs and afs_zap_permits() declarations
afs: remove variable nr_servers
afs: Fix lost servers_outstanding count
ALSA: usb-audio: Add new quirk FIXED_RATE for JBL Quantum810 Wireless
ALSA: azt3328: Remove the unused function snd_azf3328_codec_outl()
gcov: add support for checksum field
test_maple_tree: add test for mas_spanning_rebalance() on insufficient data
maple_tree: fix mas_spanning_rebalance() on insufficient data
hugetlb: really allocate vma lock for all sharable vmas
kmsan: export kmsan_handle_urb
kmsan: include linux/vmalloc.h
mm/mempolicy: fix memory leak in set_mempolicy_home_node system call
mm, mremap: fix mremap() expanding vma with addr inside vma
...
Diffstat (limited to 'kernel')
149 files changed, 10758 insertions, 4182 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index d754e0be1176..10ef068f598d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,9 +41,6 @@ UBSAN_SANITIZE_kcov.o := n KMSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector -# Don't instrument error handlers -CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI) - obj-y += sched/ obj-y += locking/ obj-y += power/ @@ -69,6 +66,7 @@ endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o diff --git a/kernel/acct.c b/kernel/acct.c index 62200d799b9b..010667ce6080 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -320,7 +320,7 @@ void acct_exit_ns(struct pid_namespace *ns) } /* - * encode an unsigned long into a comp_t + * encode an u64 into a comp_t * * This routine has been adopted from the encode_comp_t() function in * the kern_acct.c file of the FreeBSD operating system. The encoding @@ -331,7 +331,7 @@ void acct_exit_ns(struct pid_namespace *ns) #define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ #define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ -static comp_t encode_comp_t(unsigned long value) +static comp_t encode_comp_t(u64 value) { int exp, rnd; @@ -350,6 +350,8 @@ static comp_t encode_comp_t(unsigned long value) exp++; } + if (exp > (((comp_t) ~0U) >> MANTSIZE)) + return (comp_t) ~0U; /* * Clean it up and polish it off. */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9f8c05228d6d..547c88be8a28 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -806,30 +806,53 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val) } /** - * audit_filter_uring - apply filters to an io_uring operation + * __audit_filter_op - common filter helper for operations (syscall/uring/etc) * @tsk: associated task * @ctx: audit context + * @list: audit filter list + * @name: audit_name (can be NULL) + * @op: current syscall/uring_op + * + * Run the udit filters specified in @list against @tsk using @ctx, + * @name, and @op, as necessary; the caller is responsible for ensuring + * that the call is made while the RCU read lock is held. The @name + * parameter can be NULL, but all others must be specified. + * Returns 1/true if the filter finds a match, 0/false if none are found. */ -static void audit_filter_uring(struct task_struct *tsk, - struct audit_context *ctx) +static int __audit_filter_op(struct task_struct *tsk, + struct audit_context *ctx, + struct list_head *list, + struct audit_names *name, + unsigned long op) { struct audit_entry *e; enum audit_state state; + list_for_each_entry_rcu(e, list, list) { + if (audit_in_mask(&e->rule, op) && + audit_filter_rules(tsk, &e->rule, ctx, name, + &state, false)) { + ctx->current_state = state; + return 1; + } + } + return 0; +} + +/** + * audit_filter_uring - apply filters to an io_uring operation + * @tsk: associated task + * @ctx: audit context + */ +static void audit_filter_uring(struct task_struct *tsk, + struct audit_context *ctx) +{ if (auditd_test_task(tsk)) return; rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT], - list) { - if (audit_in_mask(&e->rule, ctx->uring_op) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, &state, - false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT], + NULL, ctx->uring_op); rcu_read_unlock(); } @@ -841,24 +864,13 @@ static void audit_filter_uring(struct task_struct *tsk, static void audit_filter_syscall(struct task_struct *tsk, struct audit_context *ctx) { - struct audit_entry *e; - enum audit_state state; - if (auditd_test_task(tsk)) return; rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state, false)) { - rcu_read_unlock(); - ctx->current_state = state; - return; - } - } + __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT], + NULL, ctx->major); rcu_read_unlock(); - return; } /* @@ -870,17 +882,8 @@ static int audit_filter_inode_name(struct task_struct *tsk, struct audit_context *ctx) { int h = audit_hash_ino((u32)n->ino); struct list_head *list = &audit_inode_hash[h]; - struct audit_entry *e; - enum audit_state state; - list_for_each_entry_rcu(e, list, list) { - if (audit_in_mask(&e->rule, ctx->major) && - audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) { - ctx->current_state = state; - return 1; - } - } - return 0; + return __audit_filter_op(tsk, ctx, list, n, ctx->major); } /* At syscall exit time, this filter is called if any audit_names have been diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 341c94f208f4..3a12e6b400a2 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -25,7 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif ifeq ($(CONFIG_CGROUPS),y) -obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o +obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o endif obj-$(CONFIG_CGROUP_BPF) += cgroup.o ifeq ($(CONFIG_INET),y) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 832b2659e96e..484706959556 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -306,14 +306,6 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_fields(struct bpf_array *arr, void *val) -{ - if (map_value_has_timer(&arr->map)) - bpf_timer_cancel_and_free(val + arr->map.timer_off); - if (map_value_has_kptrs(&arr->map)) - bpf_map_free_kptrs(&arr->map, val); -} - /* Called from syscall or from eBPF program */ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) @@ -335,13 +327,13 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -349,7 +341,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } return 0; } @@ -386,7 +378,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); - check_and_free_fields(array, per_cpu_ptr(pptr, cpu)); + bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); @@ -409,12 +401,12 @@ static void array_map_free_timers(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - /* We don't reset or free kptr on uref dropping to zero. */ - if (!map_value_has_timer(map)) + /* We don't reset or free fields other than timer on uref dropping to zero. */ + if (!btf_record_has_field(map->record, BPF_TIMER)) return; for (i = 0; i < array->map.max_entries; i++) - bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off); + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ @@ -423,22 +415,21 @@ static void array_map_free(struct bpf_map *map) struct bpf_array *array = container_of(map, struct bpf_array, map); int i; - if (map_value_has_kptrs(map)) { + if (!IS_ERR_OR_NULL(map->record)) { if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { for (i = 0; i < array->map.max_entries; i++) { void __percpu *pptr = array->pptrs[i & array->index_mask]; int cpu; for_each_possible_cpu(cpu) { - bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu)); + bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu)); cond_resched(); } } } else { for (i = 0; i < array->map.max_entries; i++) - bpf_map_free_kptrs(map, array_map_elem_ptr(array, i)); + bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i)); } - bpf_map_free_kptr_off_tab(map); } if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c new file mode 100644 index 000000000000..6cdf6d9ed91d --- /dev/null +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + */ + +#include <linux/types.h> +#include <linux/bpf.h> +#include <linux/bpf_local_storage.h> +#include <uapi/linux/btf.h> +#include <linux/btf_ids.h> + +DEFINE_BPF_STORAGE_CACHE(cgroup_cache); + +static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); + +static void bpf_cgrp_storage_lock(void) +{ + migrate_disable(); + this_cpu_inc(bpf_cgrp_storage_busy); +} + +static void bpf_cgrp_storage_unlock(void) +{ + this_cpu_dec(bpf_cgrp_storage_busy); + migrate_enable(); +} + +static bool bpf_cgrp_storage_trylock(void) +{ + migrate_disable(); + if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { + this_cpu_dec(bpf_cgrp_storage_busy); + migrate_enable(); + return false; + } + return true; +} + +static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) +{ + struct cgroup *cg = owner; + + return &cg->bpf_cgrp_storage; +} + +void bpf_cgrp_storage_free(struct cgroup *cgroup) +{ + struct bpf_local_storage *local_storage; + bool free_cgroup_storage = false; + unsigned long flags; + + rcu_read_lock(); + local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + bpf_cgrp_storage_lock(); + raw_spin_lock_irqsave(&local_storage->lock, flags); + free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage); + raw_spin_unlock_irqrestore(&local_storage->lock, flags); + bpf_cgrp_storage_unlock(); + rcu_read_unlock(); + + if (free_cgroup_storage) + kfree_rcu(local_storage, rcu); +} + +static struct bpf_local_storage_data * +cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit) +{ + struct bpf_local_storage *cgroup_storage; + struct bpf_local_storage_map *smap; + + cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage, + bpf_rcu_lock_held()); + if (!cgroup_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit); +} + +static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct cgroup *cgroup; + int fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return ERR_CAST(cgroup); + + bpf_cgrp_storage_lock(); + sdata = cgroup_storage_lookup(cgroup, map, true); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return sdata ? sdata->data : NULL; +} + +static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct cgroup *cgroup; + int fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + + bpf_cgrp_storage_lock(); + sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, + value, map_flags, GFP_ATOMIC); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return PTR_ERR_OR_ZERO(sdata); +} + +static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = cgroup_storage_lookup(cgroup, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata), true); + return 0; +} + +static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct cgroup *cgroup; + int err, fd; + + fd = *(int *)key; + cgroup = cgroup_get_from_fd(fd); + if (IS_ERR(cgroup)) + return PTR_ERR(cgroup); + + bpf_cgrp_storage_lock(); + err = cgroup_storage_delete(cgroup, map); + bpf_cgrp_storage_unlock(); + cgroup_put(cgroup); + return err; +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +{ + return bpf_local_storage_map_alloc(attr, &cgroup_cache); +} + +static void cgroup_storage_map_free(struct bpf_map *map) +{ + bpf_local_storage_map_free(map, &cgroup_cache, NULL); +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, + void *, value, u64, flags, gfp_t, gfp_flags) +{ + struct bpf_local_storage_data *sdata; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + if (!cgroup) + return (unsigned long)NULL; + + if (!bpf_cgrp_storage_trylock()) + return (unsigned long)NULL; + + sdata = cgroup_storage_lookup(cgroup, map, true); + if (sdata) + goto unlock; + + /* only allocate new storage, when the cgroup is refcounted */ + if (!percpu_ref_is_dying(&cgroup->self.refcnt) && + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, + value, BPF_NOEXIST, gfp_flags); + +unlock: + bpf_cgrp_storage_unlock(); + return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; +} + +BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) +{ + int ret; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!cgroup) + return -EINVAL; + + if (!bpf_cgrp_storage_trylock()) + return -EBUSY; + + ret = cgroup_storage_delete(cgroup, map); + bpf_cgrp_storage_unlock(); + return ret; +} + +const struct bpf_map_ops cgrp_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = cgroup_storage_map_alloc, + .map_free = cgroup_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_cgrp_storage_lookup_elem, + .map_update_elem = bpf_cgrp_storage_update_elem, + .map_delete_elem = bpf_cgrp_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_id = &bpf_local_storage_map_btf_id[0], + .map_owner_storage_ptr = cgroup_storage_ptr, +}; + +const struct bpf_func_proto bpf_cgrp_storage_get_proto = { + .func = bpf_cgrp_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_cgroup_btf_id[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { + .func = bpf_cgrp_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_cgroup_btf_id[0], +}; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 5f7683b19199..05f4c66c9089 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -56,11 +56,9 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, void bpf_inode_storage_free(struct inode *inode) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_inode_storage = false; struct bpf_storage_blob *bsb; - struct hlist_node *n; bsb = bpf_inode(inode); if (!bsb) @@ -74,30 +72,11 @@ void bpf_inode_storage_free(struct inode *inode) return; } - /* Neither the bpf_prog nor the bpf-map's syscall - * could be modifying the local_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * local_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the local_storage->list and - * the map's bucket->list. - */ raw_spin_lock_bh(&local_storage->lock); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - free_inode_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); - } + free_inode_storage = bpf_local_storage_unlink_nolock(local_storage); raw_spin_unlock_bh(&local_storage->lock); rcu_read_unlock(); - /* free_inoode_storage should always be true as long as - * local_storage->list was non-empty. - */ if (free_inode_storage) kfree_rcu(local_storage, rcu); } @@ -226,27 +205,14 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &inode_cache); } static void inode_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, NULL); + bpf_local_storage_map_free(map, &inode_cache, NULL); } -BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, - bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -257,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &inode_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 802fc15b0d73..b39a46e8fb08 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -74,7 +74,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, gfp_flags | __GFP_NOWARN); if (selem) { if (value) - memcpy(SDATA(selem)->data, value, smap->map.value_size); + copy_map_value(&smap->map, SDATA(selem)->data, value); return selem; } @@ -88,8 +88,14 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu) { struct bpf_local_storage *local_storage; + /* If RCU Tasks Trace grace period implies RCU grace period, do + * kfree(), else do kfree_rcu(). + */ local_storage = container_of(rcu, struct bpf_local_storage, rcu); - kfree_rcu(local_storage, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(local_storage); + else + kfree_rcu(local_storage, rcu); } static void bpf_selem_free_rcu(struct rcu_head *rcu) @@ -97,16 +103,19 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) struct bpf_local_storage_elem *selem; selem = container_of(rcu, struct bpf_local_storage_elem, rcu); - kfree_rcu(selem, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(selem); + else + kfree_rcu(selem, rcu); } /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool use_trace_rcu) +static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_mem, bool use_trace_rcu) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -233,6 +242,7 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu) __bpf_selem_unlink_storage(selem, use_trace_rcu); } +/* If cacheit_lockit is false, this lookup function is lockless */ struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, @@ -372,7 +382,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(&smap->map))) + !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) @@ -491,7 +501,7 @@ unlock_err: return ERR_PTR(err); } -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) +static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) { u64 min_usage = U64_MAX; u16 i, res = 0; @@ -515,76 +525,14 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) return res; } -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx) +static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx) { spin_lock(&cache->idx_lock); cache->idx_usage_counts[idx]--; spin_unlock(&cache->idx_lock); } -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, - int __percpu *busy_counter) -{ - struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map_bucket *b; - unsigned int i; - - /* Note that this map might be concurrently cloned from - * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone - * RCU read section to finish before proceeding. New RCU - * read sections should be prevented via bpf_map_inc_not_zero. - */ - synchronize_rcu(); - - /* bpf prog and the userspace can no longer access this map - * now. No new selem (of this map) can be added - * to the owner->storage or to the map bucket's list. - * - * The elem of this map can be cleaned up here - * or when the storage is freed e.g. - * by bpf_sk_storage_free() during __sk_destruct(). - */ - for (i = 0; i < (1U << smap->bucket_log); i++) { - b = &smap->buckets[i]; - - rcu_read_lock(); - /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_local_storage_elem, map_node))) { - if (busy_counter) { - migrate_disable(); - this_cpu_inc(*busy_counter); - } - bpf_selem_unlink(selem, false); - if (busy_counter) { - this_cpu_dec(*busy_counter); - migrate_enable(); - } - cond_resched_rcu(); - } - rcu_read_unlock(); - } - - /* While freeing the storage we may still need to access the map. - * - * e.g. when bpf_sk_storage_free() has unlinked selem from the map - * which then made the above while((selem = ...)) loop - * exit immediately. - * - * However, while freeing the storage one still needs to access the - * smap->elem_size to do the uncharging in - * bpf_selem_unlink_storage_nolock(). - * - * Hence, wait another rcu grace period for the storage to be freed. - */ - synchronize_rcu(); - - kvfree(smap->buckets); - bpf_map_area_free(smap); -} - int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || @@ -604,7 +552,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) +static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; unsigned int i; @@ -654,3 +602,117 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } + +bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) +{ + struct bpf_local_storage_elem *selem; + bool free_storage = false; + struct hlist_node *n; + + /* Neither the bpf_prog nor the bpf_map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added to or deleted from the + * local_storage->list by the bpf_prog or by the bpf_map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + /* If local_storage list has only one element, the + * bpf_selem_unlink_storage_nolock() will return true. + * Otherwise, it will return false. The current loop iteration + * intends to remove all local storage. So the last iteration + * of the loop will set the free_cgroup_storage to true. + */ + free_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false, false); + } + + return free_storage; +} + +struct bpf_map * +bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache) +{ + struct bpf_local_storage_map *smap; + + smap = __bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(cache); + return &smap->map; +} + +void bpf_local_storage_map_free(struct bpf_map *map, + struct bpf_local_storage_cache *cache, + int __percpu *busy_counter) +{ + struct bpf_local_storage_map_bucket *b; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + unsigned int i; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(cache, smap->cache_idx); + + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. + */ + synchronize_rcu(); + + /* bpf prog and the userspace can no longer access this map + * now. No new selem (of this map) can be added + * to the owner->storage or to the map bucket's list. + * + * The elem of this map can be cleaned up here + * or when the storage is freed e.g. + * by bpf_sk_storage_free() during __sk_destruct(). + */ + for (i = 0; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + + rcu_read_lock(); + /* No one is adding to b->list now */ + while ((selem = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(&b->list)), + struct bpf_local_storage_elem, map_node))) { + if (busy_counter) { + migrate_disable(); + this_cpu_inc(*busy_counter); + } + bpf_selem_unlink(selem, false); + if (busy_counter) { + this_cpu_dec(*busy_counter); + migrate_enable(); + } + cond_resched_rcu(); + } + rcu_read_unlock(); + } + + /* While freeing the storage we may still need to access the map. + * + * e.g. when bpf_sk_storage_free() has unlinked selem from the map + * which then made the above while((selem = ...)) loop + * exit immediately. + * + * However, while freeing the storage one still needs to access the + * smap->elem_size to do the uncharging in + * bpf_selem_unlink_storage_nolock(). + * + * Hence, wait another rcu grace period for the storage to be freed. + */ + synchronize_rcu(); + + kvfree(smap->buckets); + bpf_map_area_free(smap); +} diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index d6c9b3705f24..9ea42a45da47 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -151,6 +151,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode) static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .func = bpf_ima_inode_hash, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0], @@ -169,6 +170,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file) static const struct bpf_func_proto bpf_ima_file_hash_proto = { .func = bpf_ima_file_hash, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0], @@ -221,9 +223,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: - return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; + return &bpf_ima_inode_hash_proto; case BPF_FUNC_ima_file_hash: - return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; + return &bpf_ima_file_hash_proto; case BPF_FUNC_get_attach_cookie: return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL; #ifdef CONFIG_NET @@ -343,11 +345,27 @@ BTF_ID(func, bpf_lsm_task_to_inode) BTF_ID(func, bpf_lsm_userns_create) BTF_SET_END(sleepable_lsm_hooks) +BTF_SET_START(untrusted_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog_alloc_security) +BTF_ID(func, bpf_lsm_bpf_prog_free_security) +BTF_ID(func, bpf_lsm_file_alloc_security) +BTF_ID(func, bpf_lsm_file_free_security) +BTF_ID(func, bpf_lsm_sk_alloc_security) +BTF_ID(func, bpf_lsm_sk_free_security) +BTF_ID(func, bpf_lsm_task_free) +BTF_SET_END(untrusted_lsm_hooks) + bool bpf_lsm_is_sleepable_hook(u32 btf_id) { return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); } +bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 84b2d9dba79a..ece9870cab68 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -494,8 +494,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, refcount_set(&kvalue->refcnt, 1); bpf_map_inc(map); - set_memory_ro((long)st_map->image, 1); - set_memory_x((long)st_map->image, 1); + set_memory_rox((long)st_map->image, 1); err = st_ops->reg(kdata); if (likely(!err)) { /* Pair with smp_load_acquire() during lookup_elem(). diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 6f290623347e..1e486055a523 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -71,10 +71,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, void bpf_task_storage_free(struct task_struct *task) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_task_storage = false; - struct hlist_node *n; unsigned long flags; rcu_read_lock(); @@ -85,32 +83,13 @@ void bpf_task_storage_free(struct task_struct *task) return; } - /* Neither the bpf_prog nor the bpf-map's syscall - * could be modifying the local_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * local_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the local_storage->list and - * the map's bucket->list. - */ bpf_task_storage_lock(); raw_spin_lock_irqsave(&local_storage->lock, flags); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - free_task_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); - } + free_task_storage = bpf_local_storage_unlink_nolock(local_storage); raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_task_storage_unlock(); rcu_read_unlock(); - /* free_task_storage should always be true as long as - * local_storage->list was non-empty. - */ if (free_task_storage) kfree_rcu(local_storage, rcu); } @@ -184,7 +163,8 @@ out: return err; } -static int task_storage_delete(struct task_struct *task, struct bpf_map *map) +static int task_storage_delete(struct task_struct *task, struct bpf_map *map, + bool nobusy) { struct bpf_local_storage_data *sdata; @@ -192,6 +172,9 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map) if (!sdata) return -ENOENT; + if (!nobusy) + return -EBUSY; + bpf_selem_unlink(SELEM(sdata), true); return 0; @@ -220,63 +203,108 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) } bpf_task_storage_lock(); - err = task_storage_delete(task, map); + err = task_storage_delete(task, map, true); bpf_task_storage_unlock(); out: put_pid(pid); return err; } -/* *gfp_flags* is a hidden argument provided by the verifier */ -BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags, gfp_t, gfp_flags) +/* Called by bpf_task_storage_get*() helpers */ +static void *__bpf_task_storage_get(struct bpf_map *map, + struct task_struct *task, void *value, + u64 flags, gfp_t gfp_flags, bool nobusy) { struct bpf_local_storage_data *sdata; - WARN_ON_ONCE(!bpf_rcu_lock_held()); - if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) - return (unsigned long)NULL; - - if (!task) - return (unsigned long)NULL; - - if (!bpf_task_storage_trylock()) - return (unsigned long)NULL; - - sdata = task_storage_lookup(task, map, true); + sdata = task_storage_lookup(task, map, nobusy); if (sdata) - goto unlock; + return sdata->data; /* only allocate new storage, when the task is refcounted */ if (refcount_read(&task->usage) && - (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) + (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) { sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, gfp_flags); + return IS_ERR(sdata) ? NULL : sdata->data; + } + + return NULL; +} -unlock: +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) +{ + bool nobusy; + void *data; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) + return (unsigned long)NULL; + + nobusy = bpf_task_storage_trylock(); + data = __bpf_task_storage_get(map, task, value, flags, + gfp_flags, nobusy); + if (nobusy) + bpf_task_storage_unlock(); + return (unsigned long)data; +} + +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) +{ + void *data; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task) + return (unsigned long)NULL; + + bpf_task_storage_lock(); + data = __bpf_task_storage_get(map, task, value, flags, + gfp_flags, true); bpf_task_storage_unlock(); - return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : - (unsigned long)sdata->data; + return (unsigned long)data; } -BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, +BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *, task) { + bool nobusy; int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; - if (!bpf_task_storage_trylock()) - return -EBUSY; + nobusy = bpf_task_storage_trylock(); + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + ret = task_storage_delete(task, map, nobusy); + if (nobusy) + bpf_task_storage_unlock(); + return ret; +} + +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + int ret; + + WARN_ON_ONCE(!bpf_rcu_lock_held()); + if (!task) + return -EINVAL; + bpf_task_storage_lock(); /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. */ - ret = task_storage_delete(task, map); + ret = task_storage_delete(task, map, true); bpf_task_storage_unlock(); return ret; } @@ -288,26 +316,15 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &task_cache); } static void task_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, &bpf_task_storage_busy); + bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); } -BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) +BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -318,10 +335,21 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &task_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = task_storage_ptr, }; +const struct bpf_func_proto bpf_task_storage_get_recur_proto = { + .func = bpf_task_storage_get_recur, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_task_storage_get_proto = { .func = bpf_task_storage_get, .gpl_only = false, @@ -333,6 +361,15 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; +const struct bpf_func_proto bpf_task_storage_delete_recur_proto = { + .func = bpf_task_storage_delete_recur, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], +}; + const struct bpf_func_proto bpf_task_storage_delete_proto = { .func = bpf_task_storage_delete, .gpl_only = false, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 35c07afac924..f7dd8af06413 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -19,6 +19,7 @@ #include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/btf_ids.h> +#include <linux/bpf_lsm.h> #include <linux/skmsg.h> #include <linux/perf_event.h> #include <linux/bsearch.h> @@ -199,11 +200,13 @@ DEFINE_IDR(btf_idr); DEFINE_SPINLOCK(btf_idr_lock); enum btf_kfunc_hook { + BTF_KFUNC_HOOK_COMMON, BTF_KFUNC_HOOK_XDP, BTF_KFUNC_HOOK_TC, BTF_KFUNC_HOOK_STRUCT_OPS, BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, + BTF_KFUNC_HOOK_FMODRET, BTF_KFUNC_HOOK_MAX, }; @@ -237,6 +240,7 @@ struct btf { struct rcu_head rcu; struct btf_kfunc_set_tab *kfunc_set_tab; struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab; + struct btf_struct_metas *struct_meta_tab; /* split BTF support */ struct btf *base_btf; @@ -477,16 +481,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t) return !t || btf_type_nosize(t); } -static bool __btf_type_is_struct(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; -} - -static bool btf_type_is_array(const struct btf_type *t) -{ - return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; -} - static bool btf_type_is_datasec(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; @@ -1642,8 +1636,30 @@ static void btf_free_dtor_kfunc_tab(struct btf *btf) btf->dtor_kfunc_tab = NULL; } +static void btf_struct_metas_free(struct btf_struct_metas *tab) +{ + int i; + + if (!tab) + return; + for (i = 0; i < tab->cnt; i++) { + btf_record_free(tab->types[i].record); + kfree(tab->types[i].field_offs); + } + kfree(tab); +} + +static void btf_free_struct_meta_tab(struct btf *btf) +{ + struct btf_struct_metas *tab = btf->struct_meta_tab; + + btf_struct_metas_free(tab); + btf->struct_meta_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_struct_meta_tab(btf); btf_free_dtor_kfunc_tab(btf); btf_free_kfunc_set_tab(btf); kvfree(btf->types); @@ -3191,7 +3207,7 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } -enum btf_field_type { +enum btf_field_info_type { BTF_FIELD_SPIN_LOCK, BTF_FIELD_TIMER, BTF_FIELD_KPTR, @@ -3203,18 +3219,28 @@ enum { }; struct btf_field_info { - u32 type_id; + enum btf_field_type type; u32 off; - enum bpf_kptr_type type; + union { + struct { + u32 type_id; + } kptr; + struct { + const char *node_name; + u32 value_btf_id; + } list_head; + }; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, enum btf_field_type field_type, + struct btf_field_info *info) { if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; + info->type = field_type; info->off = off; return BTF_FIELD_FOUND; } @@ -3222,9 +3248,12 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t, static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, u32 off, int sz, struct btf_field_info *info) { - enum bpf_kptr_type type; + enum btf_field_type type; u32 res_id; + /* Permit modifiers on the pointer itself */ + if (btf_type_is_volatile(t)) + t = btf_type_by_id(btf, t->type); /* For PTR, sz is always == 8 */ if (!btf_type_is_ptr(t)) return BTF_FIELD_IGNORE; @@ -3248,28 +3277,135 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!__btf_type_is_struct(t)) return -EINVAL; - info->type_id = res_id; - info->off = off; info->type = type; + info->off = off; + info->kptr.type_id = res_id; return BTF_FIELD_FOUND; } -static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_type field_type, +static const char *btf_find_decl_tag_value(const struct btf *btf, + const struct btf_type *pt, + int comp_idx, const char *tag_key) +{ + int i; + + for (i = 1; i < btf_nr_types(btf); i++) { + const struct btf_type *t = btf_type_by_id(btf, i); + int len = strlen(tag_key); + + if (!btf_type_is_decl_tag(t)) + continue; + if (pt != btf_type_by_id(btf, t->type) || + btf_type_decl_tag(t)->component_idx != comp_idx) + continue; + if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len)) + continue; + return __btf_name_by_offset(btf, t->name_off) + len; + } + return NULL; +} + +static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt, + const struct btf_type *t, int comp_idx, + u32 off, int sz, struct btf_field_info *info) +{ + const char *value_type; + const char *list_node; + s32 id; + + if (!__btf_type_is_struct(t)) + return BTF_FIELD_IGNORE; + if (t->size != sz) + return BTF_FIELD_IGNORE; + value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:"); + if (!value_type) + return -EINVAL; + list_node = strstr(value_type, ":"); + if (!list_node) + return -EINVAL; + value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN); + if (!value_type) + return -ENOMEM; + id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT); + kfree(value_type); + if (id < 0) + return id; + list_node++; + if (str_is_empty(list_node)) + return -EINVAL; + info->type = BPF_LIST_HEAD; + info->off = off; + info->list_head.value_btf_id = id; + info->list_head.node_name = list_node; + return BTF_FIELD_FOUND; +} + +static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, + int *align, int *sz) +{ + int type = 0; + + if (field_mask & BPF_SPIN_LOCK) { + if (!strcmp(name, "bpf_spin_lock")) { + if (*seen_mask & BPF_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_SPIN_LOCK; + type = BPF_SPIN_LOCK; + goto end; + } + } + if (field_mask & BPF_TIMER) { + if (!strcmp(name, "bpf_timer")) { + if (*seen_mask & BPF_TIMER) + return -E2BIG; + *seen_mask |= BPF_TIMER; + type = BPF_TIMER; + goto end; + } + } + if (field_mask & BPF_LIST_HEAD) { + if (!strcmp(name, "bpf_list_head")) { + type = BPF_LIST_HEAD; + goto end; + } + } + if (field_mask & BPF_LIST_NODE) { + if (!strcmp(name, "bpf_list_node")) { + type = BPF_LIST_NODE; + goto end; + } + } + /* Only return BPF_KPTR when all other types with matchable names fail */ + if (field_mask & BPF_KPTR) { + type = BPF_KPTR_REF; + goto end; + } + return 0; +end: + *sz = btf_field_type_size(type); + *align = btf_field_type_align(type); + return type; +} + +static int btf_find_struct_field(const struct btf *btf, + const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_member *member; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; off = __btf_member_bit_offset(t, member); if (off % 8) @@ -3277,22 +3413,30 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t return -EINVAL; off /= 8; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, member_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + case BPF_LIST_NODE: + ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; + case BPF_LIST_HEAD: + ret = btf_find_list_head(btf, t, member_type, i, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } @@ -3307,42 +3451,53 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_var_secinfo *vsi; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - off = vsi->offset; - - if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; + + off = vsi->offset; if (vsi->size != sz) continue; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, var_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + case BPF_LIST_NODE: + ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; + case BPF_LIST_HEAD: + ret = btf_find_list_head(btf, var, var_type, -1, off, sz, + idx < info_cnt ? &info[idx] : &tmp); + if (ret < 0) + return ret; + break; default: return -EFAULT; } @@ -3357,169 +3512,327 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - enum btf_field_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { - const char *name; - int sz, align; - - switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - name = "bpf_spin_lock"; - sz = sizeof(struct bpf_spin_lock); - align = __alignof__(struct bpf_spin_lock); - break; - case BTF_FIELD_TIMER: - name = "bpf_timer"; - sz = sizeof(struct bpf_timer); - align = __alignof__(struct bpf_timer); - break; - case BTF_FIELD_KPTR: - name = NULL; - sz = sizeof(u64); - align = 8; - break; - default: - return -EFAULT; - } - if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_struct_field(btf, t, field_mask, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_datasec_var(btf, t, field_mask, info, info_cnt); return -EINVAL; } -/* find 'struct bpf_spin_lock' in map value. - * return >= 0 offset if found - * and < 0 in case of error - */ -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) { - struct btf_field_info info; + struct module *mod = NULL; + const struct btf_type *t; + struct btf *kernel_btf; int ret; + s32 id; - ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info->kptr.type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) + return id; + + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info->type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } + + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } + + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + field->kptr.dtor = (void *)addr; + } + + field->kptr.btf_id = id; + field->kptr.btf = kernel_btf; + field->kptr.module = mod; + return 0; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); + return ret; } -int btf_find_timer(const struct btf *btf, const struct btf_type *t) +static int btf_parse_list_head(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) { - struct btf_field_info info; - int ret; + const struct btf_type *t, *n = NULL; + const struct btf_member *member; + u32 offset; + int i; - ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); - if (ret < 0) - return ret; - if (!ret) + t = btf_type_by_id(btf, info->list_head.value_btf_id); + /* We've already checked that value_btf_id is a struct type. We + * just need to figure out the offset of the list_node, and + * verify its type. + */ + for_each_member(i, t, member) { + if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off))) + continue; + /* Invalid BTF, two members with same name */ + if (n) + return -EINVAL; + n = btf_type_by_id(btf, member->type); + if (!__btf_type_is_struct(n)) + return -EINVAL; + if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off))) + return -EINVAL; + offset = __btf_member_bit_offset(n, member); + if (offset % 8) + return -EINVAL; + offset /= 8; + if (offset % __alignof__(struct bpf_list_node)) + return -EINVAL; + + field->list_head.btf = (struct btf *)btf; + field->list_head.value_btf_id = info->list_head.value_btf_id; + field->list_head.node_offset = offset; + } + if (!n) return -ENOENT; - return info.off; + return 0; } -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t) +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size) { - struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX]; - struct bpf_map_value_off *tab; - struct btf *kernel_btf = NULL; - struct module *mod = NULL; - int ret, i, nr_off; + struct btf_field_info info_arr[BTF_FIELDS_MAX]; + struct btf_record *rec; + u32 next_off = 0; + int ret, i, cnt; - ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) return ERR_PTR(ret); if (!ret) return NULL; - nr_off = ret; - tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN); - if (!tab) + cnt = ret; + /* This needs to be kzalloc to zero out padding and unused fields, see + * comment in btf_record_equal. + */ + rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); + if (!rec) return ERR_PTR(-ENOMEM); - for (i = 0; i < nr_off; i++) { - const struct btf_type *t; - s32 id; + rec->spin_lock_off = -EINVAL; + rec->timer_off = -EINVAL; + for (i = 0; i < cnt; i++) { + if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) { + WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); + ret = -EFAULT; + goto end; + } + if (info_arr[i].off < next_off) { + ret = -EEXIST; + goto end; + } + next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type); - /* Find type in map BTF, and use it to look up the matching type - * in vmlinux or module BTFs, by name and kind. - */ - t = btf_type_by_id(btf, info_arr[i].type_id); - id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), - &kernel_btf); - if (id < 0) { - ret = id; + rec->field_mask |= info_arr[i].type; + rec->fields[i].offset = info_arr[i].off; + rec->fields[i].type = info_arr[i].type; + + switch (info_arr[i].type) { + case BPF_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->spin_lock_off = rec->fields[i].offset; + break; + case BPF_TIMER: + WARN_ON_ONCE(rec->timer_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->timer_off = rec->fields[i].offset; + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; + case BPF_LIST_HEAD: + ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; + case BPF_LIST_NODE: + break; + default: + ret = -EFAULT; goto end; } + rec->cnt++; + } - /* Find and stash the function pointer for the destruction function that - * needs to be eventually invoked from the map free path. - */ - if (info_arr[i].type == BPF_KPTR_REF) { - const struct btf_type *dtor_func; - const char *dtor_func_name; - unsigned long addr; - s32 dtor_btf_id; - - /* This call also serves as a whitelist of allowed objects that - * can be used as a referenced pointer and be stored in a map at - * the same time. - */ - dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); - if (dtor_btf_id < 0) { - ret = dtor_btf_id; - goto end_btf; - } + /* bpf_list_head requires bpf_spin_lock */ + if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) { + ret = -EINVAL; + goto end; + } - dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); - if (!dtor_func) { - ret = -ENOENT; - goto end_btf; - } + return rec; +end: + btf_record_free(rec); + return ERR_PTR(ret); +} - if (btf_is_module(kernel_btf)) { - mod = btf_try_get_module(kernel_btf); - if (!mod) { - ret = -ENXIO; - goto end_btf; - } - } +int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec) +{ + int i; - /* We already verified dtor_func to be btf_type_is_func - * in register_btf_id_dtor_kfuncs. - */ - dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); - addr = kallsyms_lookup_name(dtor_func_name); - if (!addr) { - ret = -EINVAL; - goto end_mod; - } - tab->off[i].kptr.dtor = (void *)addr; - } + /* There are two owning types, kptr_ref and bpf_list_head. The former + * only supports storing kernel types, which can never store references + * to program allocated local types, atleast not yet. Hence we only need + * to ensure that bpf_list_head ownership does not form cycles. + */ + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD)) + return 0; + for (i = 0; i < rec->cnt; i++) { + struct btf_struct_meta *meta; + u32 btf_id; + + if (!(rec->fields[i].type & BPF_LIST_HEAD)) + continue; + btf_id = rec->fields[i].list_head.value_btf_id; + meta = btf_find_struct_meta(btf, btf_id); + if (!meta) + return -EFAULT; + rec->fields[i].list_head.value_rec = meta->record; + + if (!(rec->field_mask & BPF_LIST_NODE)) + continue; - tab->off[i].offset = info_arr[i].off; - tab->off[i].type = info_arr[i].type; - tab->off[i].kptr.btf_id = id; - tab->off[i].kptr.btf = kernel_btf; - tab->off[i].kptr.module = mod; + /* We need to ensure ownership acyclicity among all types. The + * proper way to do it would be to topologically sort all BTF + * IDs based on the ownership edges, since there can be multiple + * bpf_list_head in a type. Instead, we use the following + * reasoning: + * + * - A type can only be owned by another type in user BTF if it + * has a bpf_list_node. + * - A type can only _own_ another type in user BTF if it has a + * bpf_list_head. + * + * We ensure that if a type has both bpf_list_head and + * bpf_list_node, its element types cannot be owning types. + * + * To ensure acyclicity: + * + * When A only has bpf_list_head, ownership chain can be: + * A -> B -> C + * Where: + * - B has both bpf_list_head and bpf_list_node. + * - C only has bpf_list_node. + * + * When A has both bpf_list_head and bpf_list_node, some other + * type already owns it in the BTF domain, hence it can not own + * another owning type through any of the bpf_list_head edges. + * A -> B + * Where: + * - B only has bpf_list_node. + */ + if (meta->record->field_mask & BPF_LIST_HEAD) + return -ELOOP; } - tab->nr_off = nr_off; - return tab; -end_mod: - module_put(mod); -end_btf: - btf_put(kernel_btf); -end: - while (i--) { - btf_put(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); + return 0; +} + +static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv) +{ + const u32 a = *(const u32 *)_a; + const u32 b = *(const u32 *)_b; + + if (a < b) + return -1; + else if (a > b) + return 1; + return 0; +} + +static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv) +{ + struct btf_field_offs *foffs = (void *)priv; + u32 *off_base = foffs->field_off; + u32 *a = _a, *b = _b; + u8 *sz_a, *sz_b; + + sz_a = foffs->field_sz + (a - off_base); + sz_b = foffs->field_sz + (b - off_base); + + swap(*a, *b); + swap(*sz_a, *sz_b); +} + +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec) +{ + struct btf_field_offs *foffs; + u32 i, *off; + u8 *sz; + + BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz)); + if (IS_ERR_OR_NULL(rec)) + return NULL; + + foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN); + if (!foffs) + return ERR_PTR(-ENOMEM); + + off = foffs->field_off; + sz = foffs->field_sz; + for (i = 0; i < rec->cnt; i++) { + off[i] = rec->fields[i].offset; + sz[i] = btf_field_type_size(rec->fields[i].type); } - kfree(tab); - return ERR_PTR(ret); + foffs->cnt = rec->cnt; + + if (foffs->cnt == 1) + return foffs; + sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]), + btf_field_offs_cmp, btf_field_offs_swap, foffs); + return foffs; } static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, @@ -4468,7 +4781,6 @@ static int btf_func_proto_check(struct btf_verifier_env *env, nr_args--; } - err = 0; for (i = 0; i < nr_args; i++) { const struct btf_type *arg_type; u32 arg_type_id; @@ -4477,8 +4789,12 @@ static int btf_func_proto_check(struct btf_verifier_env *env, arg_type = btf_type_by_id(btf, arg_type_id); if (!arg_type) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); - err = -EINVAL; - break; + return -EINVAL; + } + + if (btf_type_is_resolve_source_only(arg_type)) { + btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); + return -EINVAL; } if (args[i].name_off && @@ -4486,25 +4802,23 @@ static int btf_func_proto_check(struct btf_verifier_env *env, !btf_name_valid_identifier(btf, args[i].name_off))) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); - err = -EINVAL; - break; + return -EINVAL; } if (btf_type_needs_resolve(arg_type) && !env_type_is_resolved(env, arg_type_id)) { err = btf_resolve(env, arg_type, arg_type_id); if (err) - break; + return err; } if (!btf_type_id_size(btf, &arg_type_id, NULL)) { btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); - err = -EINVAL; - break; + return -EINVAL; } } - return err; + return 0; } static int btf_func_check(struct btf_verifier_env *env, @@ -4918,6 +5232,119 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return btf_check_sec_info(env, btf_data_size); } +static const char *alloc_obj_fields[] = { + "bpf_spin_lock", + "bpf_list_head", + "bpf_list_node", +}; + +static struct btf_struct_metas * +btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf) +{ + union { + struct btf_id_set set; + struct { + u32 _cnt; + u32 _ids[ARRAY_SIZE(alloc_obj_fields)]; + } _arr; + } aof; + struct btf_struct_metas *tab = NULL; + int i, n, id, ret; + + BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0); + BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32)); + + memset(&aof, 0, sizeof(aof)); + for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) { + /* Try to find whether this special type exists in user BTF, and + * if so remember its ID so we can easily find it among members + * of structs that we iterate in the next loop. + */ + id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT); + if (id < 0) + continue; + aof.set.ids[aof.set.cnt++] = id; + } + + if (!aof.set.cnt) + return NULL; + sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL); + + n = btf_nr_types(btf); + for (i = 1; i < n; i++) { + struct btf_struct_metas *new_tab; + const struct btf_member *member; + struct btf_field_offs *foffs; + struct btf_struct_meta *type; + struct btf_record *record; + const struct btf_type *t; + int j, tab_cnt; + + t = btf_type_by_id(btf, i); + if (!t) { + ret = -EINVAL; + goto free; + } + if (!__btf_type_is_struct(t)) + continue; + + cond_resched(); + + for_each_member(j, t, member) { + if (btf_id_set_contains(&aof.set, member->type)) + goto parse; + } + continue; + parse: + tab_cnt = tab ? tab->cnt : 0; + new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]), + GFP_KERNEL | __GFP_NOWARN); + if (!new_tab) { + ret = -ENOMEM; + goto free; + } + if (!tab) + new_tab->cnt = 0; + tab = new_tab; + + type = &tab->types[tab->cnt]; + type->btf_id = i; + record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size); + /* The record cannot be unset, treat it as an error if so */ + if (IS_ERR_OR_NULL(record)) { + ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT; + goto free; + } + foffs = btf_parse_field_offs(record); + /* We need the field_offs to be valid for a valid record, + * either both should be set or both should be unset. + */ + if (IS_ERR_OR_NULL(foffs)) { + btf_record_free(record); + ret = -EFAULT; + goto free; + } + type->record = record; + type->field_offs = foffs; + tab->cnt++; + } + return tab; +free: + btf_struct_metas_free(tab); + return ERR_PTR(ret); +} + +struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) +{ + struct btf_struct_metas *tab; + + BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0); + tab = btf->struct_meta_tab; + if (!tab) + return NULL; + return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func); +} + static int btf_check_type_tags(struct btf_verifier_env *env, struct btf *btf, int start_id) { @@ -4968,6 +5395,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env, static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, u32 log_level, char __user *log_ubuf, u32 log_size) { + struct btf_struct_metas *struct_meta_tab; struct btf_verifier_env *env = NULL; struct bpf_verifier_log *log; struct btf *btf = NULL; @@ -5036,15 +5464,34 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, if (err) goto errout; + struct_meta_tab = btf_parse_struct_metas(log, btf); + if (IS_ERR(struct_meta_tab)) { + err = PTR_ERR(struct_meta_tab); + goto errout; + } + btf->struct_meta_tab = struct_meta_tab; + + if (struct_meta_tab) { + int i; + + for (i = 0; i < struct_meta_tab->cnt; i++) { + err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record); + if (err < 0) + goto errout_meta; + } + } + if (log->level && bpf_verifier_log_full(log)) { err = -ENOSPC; - goto errout; + goto errout_meta; } btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); return btf; +errout_meta: + btf_free_struct_meta_tab(btf); errout: btf_verifier_env_free(env); if (btf) @@ -5086,7 +5533,7 @@ static u8 bpf_ctx_convert_map[] = { #undef BPF_MAP_TYPE #undef BPF_LINK_TYPE -static const struct btf_member * +const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg) @@ -5159,6 +5606,26 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log, return kern_ctx_type->type; } +int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type) +{ + const struct btf_member *kctx_member; + const struct btf_type *conv_struct; + const struct btf_type *kctx_type; + u32 kctx_type_id; + + conv_struct = bpf_ctx_convert.t; + /* get member for kernel ctx type */ + kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1; + kctx_type_id = kctx_member->type; + kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id); + if (!btf_type_is_struct(kctx_type)) { + bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id); + return -EINVAL; + } + + return kctx_type_id; +} + BTF_ID_LIST(bpf_ctx_convert_btf_id) BTF_ID(struct, bpf_ctx_convert) @@ -5356,6 +5823,22 @@ static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, return nr_args + 1; } +static bool prog_args_trusted(const struct bpf_prog *prog) +{ + enum bpf_attach_type atype = prog->expected_attach_type; + + switch (prog->type) { + case BPF_PROG_TYPE_TRACING: + return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER; + case BPF_PROG_TYPE_LSM: + return bpf_lsm_is_trusted(prog); + case BPF_PROG_TYPE_STRUCT_OPS: + return true; + default: + return false; + } +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -5499,6 +5982,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } info->reg_type = PTR_TO_BTF_ID; + if (prog_args_trusted(prog)) + info->reg_type |= PTR_TRUSTED; + if (tgt_prog) { enum bpf_prog_type tgt_type; @@ -5765,6 +6251,9 @@ error: /* check __percpu tag */ if (strcmp(tag_value, "percpu") == 0) tmp_flag = MEM_PERCPU; + /* check __rcu tag */ + if (strcmp(tag_value, "rcu") == 0) + tmp_flag = MEM_RCU; } stype = btf_type_skip_modifiers(btf, mtype->type, &id); @@ -5794,20 +6283,50 @@ error: return -EINVAL; } -int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype __maybe_unused, +int btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id, enum bpf_type_flag *flag) { + const struct btf *btf = reg->btf; enum bpf_type_flag tmp_flag = 0; + const struct btf_type *t; + u32 id = reg->btf_id; int err; - u32 id; + while (type_is_alloc(reg->type)) { + struct btf_struct_meta *meta; + struct btf_record *rec; + int i; + + meta = btf_find_struct_meta(btf, id); + if (!meta) + break; + rec = meta->record; + for (i = 0; i < rec->cnt; i++) { + struct btf_field *field = &rec->fields[i]; + u32 offset = field->offset; + if (off < offset + btf_field_type_size(field->type) && offset < off + size) { + bpf_log(log, + "direct access to %s is disallowed\n", + btf_field_type_name(field->type)); + return -EACCES; + } + } + break; + } + + t = btf_type_by_id(btf, id); do { err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); switch (err) { case WALK_PTR: + /* For local types, the destination register cannot + * become a pointer again. + */ + if (type_is_alloc(reg->type)) + return SCALAR_VALUE; /* If we found the pointer or scalar on t+off, * we're done. */ @@ -5842,8 +6361,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, * end up with two different module BTFs, but IDs point to the common type in * vmlinux BTF. */ -static bool btf_types_are_same(const struct btf *btf1, u32 id1, - const struct btf *btf2, u32 id2) +bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) { if (id1 != id2) return false; @@ -6125,122 +6644,19 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr return btf_check_func_type_match(log, btf1, t1, btf2, t2); } -static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { -#ifdef CONFIG_NET - [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], - [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], - [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], -#endif -}; - -/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ -static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int rec) -{ - const struct btf_type *member_type; - const struct btf_member *member; - u32 i; - - if (!btf_type_is_struct(t)) - return false; - - for_each_member(i, t, member) { - const struct btf_array *array; - - member_type = btf_type_skip_modifiers(btf, member->type, NULL); - if (btf_type_is_struct(member_type)) { - if (rec >= 3) { - bpf_log(log, "max struct nesting depth exceeded\n"); - return false; - } - if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1)) - return false; - continue; - } - if (btf_type_is_array(member_type)) { - array = btf_type_array(member_type); - if (!array->nelems) - return false; - member_type = btf_type_skip_modifiers(btf, array->type, NULL); - if (!btf_type_is_scalar(member_type)) - return false; - continue; - } - if (!btf_type_is_scalar(member_type)) - return false; - } - return true; -} - -static bool is_kfunc_arg_mem_size(const struct btf *btf, - const struct btf_param *arg, - const struct bpf_reg_state *reg) -{ - int len, sfx_len = sizeof("__sz") - 1; - const struct btf_type *t; - const char *param_name; - - t = btf_type_skip_modifiers(btf, arg->type, NULL); - if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) - return false; - - /* In the future, this can be ported to use BTF tagging */ - param_name = btf_name_by_offset(btf, arg->name_off); - if (str_is_empty(param_name)) - return false; - len = strlen(param_name); - if (len < sfx_len) - return false; - param_name += len - sfx_len; - if (strncmp(param_name, "__sz", sfx_len)) - return false; - - return true; -} - -static bool btf_is_kfunc_arg_mem_size(const struct btf *btf, - const struct btf_param *arg, - const struct bpf_reg_state *reg, - const char *name) -{ - int len, target_len = strlen(name); - const struct btf_type *t; - const char *param_name; - - t = btf_type_skip_modifiers(btf, arg->type, NULL); - if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) - return false; - - param_name = btf_name_by_offset(btf, arg->name_off); - if (str_is_empty(param_name)) - return false; - len = strlen(param_name); - if (len != target_len) - return false; - if (strcmp(param_name, name)) - return false; - - return true; -} - static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok, - struct bpf_kfunc_arg_meta *kfunc_meta, bool processing_call) { enum bpf_prog_type prog_type = resolve_prog_type(env->prog); - bool rel = false, kptr_get = false, trusted_args = false; - bool sleepable = false; struct bpf_verifier_log *log = &env->log; - u32 i, nargs, ref_id, ref_obj_id = 0; - bool is_kfunc = btf_is_kernel(btf); const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - int ref_regno = 0, ret; + u32 i, nargs, ref_id; + int ret; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -6266,14 +6682,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - if (is_kfunc && kfunc_meta) { - /* Only kfunc can be release func */ - rel = kfunc_meta->flags & KF_RELEASE; - kptr_get = kfunc_meta->flags & KF_KPTR_GET; - trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS; - sleepable = kfunc_meta->flags & KF_SLEEPABLE; - } - /* check that BTF function arguments match actual types that the * verifier sees. */ @@ -6281,42 +6689,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1; struct bpf_reg_state *reg = ®s[regno]; - bool obj_ptr = false; t = btf_type_skip_modifiers(btf, args[i].type, NULL); if (btf_type_is_scalar(t)) { - if (is_kfunc && kfunc_meta) { - bool is_buf_size = false; - - /* check for any const scalar parameter of name "rdonly_buf_size" - * or "rdwr_buf_size" - */ - if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, - "rdonly_buf_size")) { - kfunc_meta->r0_rdonly = true; - is_buf_size = true; - } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg, - "rdwr_buf_size")) - is_buf_size = true; - - if (is_buf_size) { - if (kfunc_meta->r0_size) { - bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); - return -EINVAL; - } - - if (!tnum_is_const(reg->var_off)) { - bpf_log(log, "R%d is not a const\n", regno); - return -EINVAL; - } - - kfunc_meta->r0_size = reg->var_off.value; - ret = mark_chain_precision(env, regno); - if (ret) - return ret; - } - } - if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", regno); @@ -6329,88 +6704,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } - /* These register types have special constraints wrt ref_obj_id - * and offset checks. The rest of trusted args don't. - */ - obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID || - reg2btf_ids[base_type(reg->type)]; - - /* Check if argument must be a referenced pointer, args + i has - * been verified to be a pointer (after skipping modifiers). - * PTR_TO_CTX is ok without having non-zero ref_obj_id. - */ - if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) { - bpf_log(log, "R%d must be referenced\n", regno); - return -EINVAL; - } - ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - /* Trusted args have the same offset checks as release arguments */ - if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id)) - arg_type |= OBJ_RELEASE; ret = check_func_arg_reg_off(env, reg, regno, arg_type); if (ret < 0) return ret; - if (is_kfunc && reg->ref_obj_id) { - /* Ensure only one argument is referenced PTR_TO_BTF_ID */ - if (ref_obj_id) { - bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", - regno, reg->ref_obj_id, ref_obj_id); - return -EFAULT; - } - ref_regno = regno; - ref_obj_id = reg->ref_obj_id; - } - - /* kptr_get is only true for kfunc */ - if (i == 0 && kptr_get) { - struct bpf_map_value_off_desc *off_desc; - - if (reg->type != PTR_TO_MAP_VALUE) { - bpf_log(log, "arg#0 expected pointer to map value\n"); - return -EINVAL; - } - - /* check_func_arg_reg_off allows var_off for - * PTR_TO_MAP_VALUE, but we need fixed offset to find - * off_desc. - */ - if (!tnum_is_const(reg->var_off)) { - bpf_log(log, "arg#0 must have constant offset\n"); - return -EINVAL; - } - - off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value); - if (!off_desc || off_desc->type != BPF_KPTR_REF) { - bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n", - reg->off + reg->var_off.value); - return -EINVAL; - } - - if (!btf_type_is_ptr(ref_t)) { - bpf_log(log, "arg#0 BTF type must be a double pointer\n"); - return -EINVAL; - } - - ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id); - ref_tname = btf_name_by_offset(btf, ref_t->name_off); - - if (!btf_type_is_struct(ref_t)) { - bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", - func_name, i, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } - if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf, - off_desc->kptr.btf_id, true)) { - bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n", - func_name, i, btf_type_str(ref_t), ref_tname); - return -EINVAL; - } - /* rest of the arguments can be anything, like normal kfunc */ - } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. */ @@ -6420,109 +6721,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, i, btf_type_str(t)); return -EINVAL; } - } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || - (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { - const struct btf_type *reg_ref_t; - const struct btf *reg_btf; - const char *reg_ref_tname; - u32 reg_ref_id; - - if (!btf_type_is_struct(ref_t)) { - bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n", - func_name, i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - if (reg->type == PTR_TO_BTF_ID) { - reg_btf = reg->btf; - reg_ref_id = reg->btf_id; - } else { - reg_btf = btf_vmlinux; - reg_ref_id = *reg2btf_ids[base_type(reg->type)]; - } - - reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, - ®_ref_id); - reg_ref_tname = btf_name_by_offset(reg_btf, - reg_ref_t->name_off); - if (!btf_struct_ids_match(log, reg_btf, reg_ref_id, - reg->off, btf, ref_id, - trusted_args || (rel && reg->ref_obj_id))) { - bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", - func_name, i, - btf_type_str(ref_t), ref_tname, - regno, btf_type_str(reg_ref_t), - reg_ref_tname); - return -EINVAL; - } } else if (ptr_to_mem_ok && processing_call) { const struct btf_type *resolve_ret; u32 type_size; - if (is_kfunc) { - bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); - bool arg_dynptr = btf_type_is_struct(ref_t) && - !strcmp(ref_tname, - stringify_struct(bpf_dynptr_kern)); - - /* Permit pointer to mem, but only when argument - * type is pointer to scalar, or struct composed - * (recursively) of scalars. - * When arg_mem_size is true, the pointer can be - * void *. - * Also permit initialized local dynamic pointers. - */ - if (!btf_type_is_scalar(ref_t) && - !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && - !arg_dynptr && - (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { - bpf_log(log, - "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", - i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); - return -EINVAL; - } - - if (arg_dynptr) { - if (reg->type != PTR_TO_STACK) { - bpf_log(log, "arg#%d pointer type %s %s not to stack\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - if (!is_dynptr_reg_valid_init(env, reg)) { - bpf_log(log, - "arg#%d pointer type %s %s must be valid and initialized\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - if (!is_dynptr_type_expected(env, reg, - ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) { - bpf_log(log, - "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n", - i, btf_type_str(ref_t), - ref_tname); - return -EINVAL; - } - - continue; - } - - /* Check for mem, len pair */ - if (arg_mem_size) { - if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { - bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", - i, i + 1); - return -EINVAL; - } - i++; - continue; - } - } - resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { bpf_log(log, @@ -6535,36 +6737,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (check_mem_reg(env, reg, regno, type_size)) return -EINVAL; } else { - bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i, - is_kfunc ? "kernel " : "", func_name, func_id); + bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i, + func_name, func_id); return -EINVAL; } } - /* Either both are set, or neither */ - WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); - /* We already made sure ref_obj_id is set only for one argument. We do - * allow (!rel && ref_obj_id), so that passing such referenced - * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when - * is_kfunc is true. - */ - if (rel && !ref_obj_id) { - bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", - func_name); - return -EINVAL; - } - - if (sleepable && !env->prog->aux->sleepable) { - bpf_log(log, "kernel function %s is sleepable but the program is not\n", - func_name); - return -EINVAL; - } - - if (kfunc_meta && ref_obj_id) - kfunc_meta->ref_obj_id = ref_obj_id; - - /* returns argument register number > 0 in case of reference release kfunc */ - return rel ? ref_regno : 0; + return 0; } /* Compare BTF of a function declaration with given bpf_reg_state. @@ -6594,7 +6773,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6637,7 +6816,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, return -EINVAL; is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; - err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true); + err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true); /* Compiler optimizations can remove arguments from static functions * or mismatched type can be passed into a global function. @@ -6648,14 +6827,6 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, return err; } -int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - struct bpf_kfunc_arg_meta *meta) -{ - return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true); -} - /* Convert BTF of a function into bpf_reg_state if possible * Returns: * EFAULT - there is a verifier bug. Abort verification. @@ -7038,23 +7209,6 @@ bool btf_is_module(const struct btf *btf) return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; } -static int btf_id_cmp_func(const void *a, const void *b) -{ - const int *pa = a, *pb = b; - - return *pa - *pb; -} - -bool btf_id_set_contains(const struct btf_id_set *set, u32 id) -{ - return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; -} - -static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) -{ - return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); -} - enum { BTF_MODULE_F_LIVE = (1 << 0), }; @@ -7415,6 +7569,8 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf, static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) { switch (prog_type) { + case BPF_PROG_TYPE_UNSPEC: + return BTF_KFUNC_HOOK_COMMON; case BPF_PROG_TYPE_XDP: return BTF_KFUNC_HOOK_XDP; case BPF_PROG_TYPE_SCHED_CLS: @@ -7443,16 +7599,24 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf, u32 kfunc_btf_id) { enum btf_kfunc_hook hook; + u32 *kfunc_flags; + + kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id); + if (kfunc_flags) + return kfunc_flags; hook = bpf_prog_type_to_kfunc_hook(prog_type); return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } -/* This function must be invoked only from initcalls/module init functions */ -int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, - const struct btf_kfunc_id_set *kset) +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id) +{ + return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); +} + +static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) { - enum btf_kfunc_hook hook; struct btf *btf; int ret; @@ -7471,13 +7635,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, if (IS_ERR(btf)) return PTR_ERR(btf); - hook = bpf_prog_type_to_kfunc_hook(prog_type); ret = btf_populate_kfunc_set(btf, hook, kset->set); btf_put(btf); return ret; } + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) +{ + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __register_btf_kfunc_id_set(hook, kset); +} EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset) +{ + return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset); +} +EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set); + s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 9fcf09f2ef00..06989d278846 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -157,23 +157,37 @@ static const struct seq_operations cgroup_iter_seq_ops = { .show = cgroup_iter_seq_show, }; -BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup) +BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup) static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux) { struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; struct cgroup *cgrp = aux->cgroup.start; + /* bpf_iter_attach_cgroup() has already acquired an extra reference + * for the start cgroup, but the reference may be released after + * cgroup_iter_seq_init(), so acquire another reference for the + * start cgroup. + */ p->start_css = &cgrp->self; + css_get(p->start_css); p->terminate = false; p->visited_all = false; p->order = aux->cgroup.order; return 0; } +static void cgroup_iter_seq_fini(void *priv) +{ + struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv; + + css_put(p->start_css); +} + static const struct bpf_iter_seq_info cgroup_iter_seq_info = { .seq_ops = &cgroup_iter_seq_ops, .init_seq_private = cgroup_iter_seq_init, + .fini_seq_private = cgroup_iter_seq_fini, .seq_priv_size = sizeof(struct cgroup_iter_priv), }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 25a54e04560e..ba3fff17e2f9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -34,6 +34,7 @@ #include <linux/log2.h> #include <linux/bpf_verifier.h> #include <linux/nodemask.h> +#include <linux/bpf_mem_alloc.h> #include <asm/barrier.h> #include <asm/unaligned.h> @@ -60,6 +61,9 @@ #define CTX regs[BPF_REG_CTX] #define IMM insn->imm +struct bpf_mem_alloc bpf_global_ma; +bool bpf_global_ma_set; + /* No hurry in this branch * * Exported for the bpf jit load helper. @@ -864,8 +868,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins list_add_tail(&pack->list, &pack_list); set_vm_flush_reset_perms(pack->ptr); - set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); - set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); + set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); return pack; } @@ -883,8 +886,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) if (ptr) { bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); - set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); - set_memory_x((unsigned long)ptr, size / PAGE_SIZE); + set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); } goto out; } @@ -1032,7 +1034,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = prandom_u32_max(hole) & ~(alignment - 1); + start = get_random_u32_below(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; @@ -1094,7 +1096,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); - start = prandom_u32_max(hole) & ~(alignment - 1); + start = get_random_u32_below(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; @@ -2088,6 +2090,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { + enum bpf_prog_type prog_type = resolve_prog_type(fp); bool ret; if (fp->kprobe_override) @@ -2098,12 +2101,12 @@ bool bpf_prog_map_compatible(struct bpf_map *map, /* There's no owner yet where we could check for * compatibility. */ - map->owner.type = fp->type; + map->owner.type = prog_type; map->owner.jited = fp->jited; map->owner.xdp_has_frags = fp->aux->xdp_has_frags; ret = true; } else { - ret = map->owner.type == fp->type && + ret = map->owner.type == prog_type && map->owner.jited == fp->jited && map->owner.xdp_has_frags == fp->aux->xdp_has_frags; } @@ -2251,8 +2254,14 @@ static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) { struct bpf_prog_array *progs; + /* If RCU Tasks Trace grace period implies RCU grace period, there is + * no need to call kfree_rcu(), just call kfree() directly. + */ progs = container_of(rcu, struct bpf_prog_array, rcu); - kfree_rcu(progs, rcu); + if (rcu_trace_implies_rcu_gp()) + kfree(progs); + else + kfree_rcu(progs, rcu); } void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) @@ -2740,6 +2749,18 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len) return -ENOTSUPP; } +#ifdef CONFIG_BPF_SYSCALL +static int __init bpf_global_ma_init(void) +{ + int ret; + + ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); + bpf_global_ma_set = !ret; + return ret; +} +late_initcall(bpf_global_ma_init); +#endif + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index b5ba34ddd4b6..e0b2d016f0bf 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -4,13 +4,16 @@ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ -/* The 'cpumap' is primarily used as a backend map for XDP BPF helper +/** + * DOC: cpu map + * The 'cpumap' is primarily used as a backend map for XDP BPF helper * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'. * - * Unlike devmap which redirects XDP frames out another NIC device, + * Unlike devmap which redirects XDP frames out to another NIC device, * this map type redirects raw XDP frames to another CPU. The remote * CPU will do SKB-allocation and call the normal network stack. - * + */ +/* * This is a scalability and isolation mechanism, that allow * separating the early driver network XDP layer, from the rest of the * netstack, and assigning dedicated CPUs for this stage. This @@ -85,7 +88,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; - int err = -ENOMEM; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -97,29 +99,26 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); + /* Pre-limit array size based on NR_CPUS, not final CPU check */ + if (attr->max_entries > NR_CPUS) + return ERR_PTR(-E2BIG); + cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE); if (!cmap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&cmap->map, attr); - /* Pre-limit array size based on NR_CPUS, not final CPU check */ - if (cmap->map.max_entries > NR_CPUS) { - err = -E2BIG; - goto free_cmap; - } - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); - if (!cmap->cpu_map) - goto free_cmap; + if (!cmap->cpu_map) { + bpf_map_area_free(cmap); + return ERR_PTR(-ENOMEM); + } return &cmap->map; -free_cmap: - bpf_map_area_free(cmap); - return ERR_PTR(err); } static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -668,9 +667,9 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f9a87dcc5535..d01e4c55b376 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -992,14 +992,14 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } -static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } -static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c index 04f0a045dcaa..fa3e9225aedc 100644 --- a/kernel/bpf/dispatcher.c +++ b/kernel/bpf/dispatcher.c @@ -4,7 +4,7 @@ #include <linux/hash.h> #include <linux/bpf.h> #include <linux/filter.h> -#include <linux/init.h> +#include <linux/static_call.h> /* The BPF dispatcher is a multiway branch code generator. The * dispatcher is a mechanism to avoid the performance penalty of an @@ -91,11 +91,6 @@ int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int n return -ENOTSUPP; } -int __weak __init bpf_arch_init_dispatcher_early(void *ip) -{ - return -ENOTSUPP; -} - static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf) { s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0]; @@ -110,17 +105,11 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *b static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) { - void *old, *new, *tmp; - u32 noff; - int err; - - if (!prev_num_progs) { - old = NULL; - noff = 0; - } else { - old = d->image + d->image_off; + void *new, *tmp; + u32 noff = 0; + + if (prev_num_progs) noff = d->image_off ^ (PAGE_SIZE / 2); - } new = d->num_progs ? d->image + noff : NULL; tmp = d->num_progs ? d->rw_image + noff : NULL; @@ -134,11 +123,15 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs) return; } - err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new); - if (err || !new) - return; + __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func); + + /* Make sure all the callers executing the previous/old half of the + * image leave it, so following update call can modify it safely. + */ + synchronize_rcu(); - d->image_off = noff; + if (new) + d->image_off = noff; } void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index f39ee3e05589..5aa2b5525f79 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -222,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_timer(&htab->map)) + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -231,28 +231,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } -static void htab_free_prealloced_kptrs(struct bpf_htab *htab) +static void htab_free_prealloced_fields(struct bpf_htab *htab) { u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_kptrs(&htab->map)) + if (IS_ERR_OR_NULL(htab->map.record)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); - for (i = 0; i < num_entries; i++) { struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8)); + bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -764,10 +761,7 @@ static void check_and_free_fields(struct bpf_htab *htab, { void *map_value = elem->key + round_up(htab->map.key_size, 8); - if (map_value_has_timer(&htab->map)) - bpf_timer_cancel_and_free(map_value + htab->map.timer_off); - if (map_value_has_kptrs(&htab->map)) - bpf_map_free_kptrs(&htab->map, map_value); + bpf_obj_free_fields(htab->map.record, map_value); } /* It is called from the bpf_lru_list when the LRU needs to delete @@ -1091,7 +1085,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, head = &b->head; if (unlikely(map_flags & BPF_F_LOCK)) { - if (unlikely(!map_value_has_spin_lock(map))) + if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; /* find an element without taking the bucket lock */ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, @@ -1474,12 +1468,8 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We don't reset or free kptr on uref dropping to zero, - * hence just free timer. - */ - bpf_timer_cancel_and_free(l->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + /* We only free timer on uref dropping to zero */ + bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } @@ -1490,8 +1480,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We don't reset or free kptr on uref dropping to zero. */ - if (!map_value_has_timer(&htab->map)) + /* We only free timer on uref dropping to zero */ + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1517,11 +1507,10 @@ static void htab_map_free(struct bpf_map *map) if (!htab_is_prealloc(htab)) { delete_all_elements(htab); } else { - htab_free_prealloced_kptrs(htab); + htab_free_prealloced_fields(htab); prealloc_destroy(htab); } - bpf_map_free_kptr_off_tab(map); free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); bpf_mem_alloc_destroy(&htab->pcpu_ma); @@ -1675,7 +1664,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, elem_map_flags = attr->batch.elem_flags; if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; map_flags = attr->batch.flags; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a6b04faed282..af30c6cbd65d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4,6 +4,7 @@ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/bpf-cgroup.h> +#include <linux/cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> @@ -19,6 +20,7 @@ #include <linux/proc_ns.h> #include <linux/security.h> #include <linux/btf_ids.h> +#include <linux/bpf_mem_alloc.h> #include "../../lib/kstrtox.h" @@ -336,6 +338,7 @@ const struct bpf_func_proto bpf_spin_lock_proto = { .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, + .arg1_btf_id = BPF_PTR_POISON, }; static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock) @@ -358,6 +361,7 @@ const struct bpf_func_proto bpf_spin_unlock_proto = { .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_SPIN_LOCK, + .arg1_btf_id = BPF_PTR_POISON, }; void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, @@ -366,9 +370,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, struct bpf_spin_lock *lock; if (lock_src) - lock = src + map->spin_lock_off; + lock = src + map->record->spin_lock_off; else - lock = dst + map->spin_lock_off; + lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); @@ -657,6 +661,7 @@ BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size, const struct bpf_func_proto bpf_copy_from_user_proto = { .func = bpf_copy_from_user, .gpl_only = false, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, @@ -687,6 +692,7 @@ BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, const struct bpf_func_proto bpf_copy_from_user_task_proto = { .func = bpf_copy_from_user_task, .gpl_only = true, + .might_sleep = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, @@ -1169,7 +1175,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->timer_off; + t->value = (void *)timer - map->record->timer_off; t->map = map; t->prog = NULL; rcu_assign_pointer(t->callback_fn, NULL); @@ -1398,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = { #define DYNPTR_SIZE_MASK 0xFFFFFF #define DYNPTR_RDONLY_BIT BIT(31) -static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr) +static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_RDONLY_BIT; } @@ -1408,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ ptr->size |= type << DYNPTR_TYPE_SHIFT; } -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr) +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr) { return ptr->size & DYNPTR_SIZE_MASK; } @@ -1432,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) memset(ptr, 0, sizeof(*ptr)); } -static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len) +static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len) { u32 size = bpf_dynptr_get_size(ptr); @@ -1477,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src, +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, u32, offset, u64, flags) { int err; @@ -1489,7 +1495,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src if (err) return err; - memcpy(dst, src->data + src->offset + offset, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst, src->data + src->offset + offset, len); return 0; } @@ -1500,12 +1510,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, - .arg3_type = ARG_PTR_TO_DYNPTR, + .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg4_type = ARG_ANYTHING, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src, +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, u32, len, u64, flags) { int err; @@ -1517,7 +1527,11 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, if (err) return err; - memcpy(dst->data + dst->offset + offset, src, len); + /* Source and destination may possibly overlap, hence use memmove to + * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr + * pointing to overlapping PTR_TO_MAP_VALUE regions. + */ + memmove(dst->data + dst->offset + offset, src, len); return 0; } @@ -1526,14 +1540,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg4_type = ARG_CONST_SIZE_OR_ZERO, .arg5_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) +BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len) { int err; @@ -1554,7 +1568,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = { .func = bpf_dynptr_data, .gpl_only = false, .ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL, - .arg1_type = ARG_PTR_TO_DYNPTR, + .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, }; @@ -1663,6 +1677,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_dynptr_write_proto; case BPF_FUNC_dynptr_data: return &bpf_dynptr_data_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_cgrp_storage_get: + return &bpf_cgrp_storage_get_proto; + case BPF_FUNC_cgrp_storage_delete: + return &bpf_cgrp_storage_delete_proto; +#endif default: break; } @@ -1700,20 +1720,401 @@ bpf_base_func_proto(enum bpf_func_id func_id) } } -BTF_SET8_START(tracing_btf_ids) +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock) +{ + struct list_head *head = list_head, *orig_head = list_head; + + BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head)); + BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head)); + + /* Do the actual list draining outside the lock to not hold the lock for + * too long, and also prevent deadlocks if tracing programs end up + * executing on entry/exit of functions called inside the critical + * section, and end up doing map ops that call bpf_list_head_free for + * the same map value again. + */ + __bpf_spin_lock_irqsave(spin_lock); + if (!head->next || list_empty(head)) + goto unlock; + head = head->next; +unlock: + INIT_LIST_HEAD(orig_head); + __bpf_spin_unlock_irqrestore(spin_lock); + + while (head != orig_head) { + void *obj = head; + + obj -= field->list_head.node_offset; + head = head->next; + /* The contained type can also have resources, including a + * bpf_list_head which needs to be freed. + */ + bpf_obj_free_fields(field->list_head.value_rec, obj); + /* bpf_mem_free requires migrate_disable(), since we can be + * called from map free path as well apart from BPF program (as + * part of map ops doing bpf_obj_free_fields). + */ + migrate_disable(); + bpf_mem_free(&bpf_global_ma, obj); + migrate_enable(); + } +} + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in vmlinux BTF"); + +void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + u64 size = local_type_id__k; + void *p; + + p = bpf_mem_alloc(&bpf_global_ma, size); + if (!p) + return NULL; + if (meta) + bpf_obj_init(meta->field_offs, p); + return p; +} + +void bpf_obj_drop_impl(void *p__alloc, void *meta__ign) +{ + struct btf_struct_meta *meta = meta__ign; + void *p = p__alloc; + + if (meta) + bpf_obj_free_fields(meta->record, p); + bpf_mem_free(&bpf_global_ma, p); +} + +static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail) +{ + struct list_head *n = (void *)node, *h = (void *)head; + + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + if (unlikely(!n->next)) + INIT_LIST_HEAD(n); + tail ? list_add_tail(n, h) : list_add(n, h); +} + +void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) +{ + return __bpf_list_add(node, head, false); +} + +void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) +{ + return __bpf_list_add(node, head, true); +} + +static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail) +{ + struct list_head *n, *h = (void *)head; + + if (unlikely(!h->next)) + INIT_LIST_HEAD(h); + if (list_empty(h)) + return NULL; + n = tail ? h->prev : h->next; + list_del_init(n); + return (struct bpf_list_node *)n; +} + +struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) +{ + return __bpf_list_del(head, false); +} + +struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) +{ + return __bpf_list_del(head, true); +} + +/** + * bpf_task_acquire - Acquire a reference to a task. A task acquired by this + * kfunc which is not stored in a map as a kptr, must be released by calling + * bpf_task_release(). + * @p: The task on which a reference is being acquired. + */ +struct task_struct *bpf_task_acquire(struct task_struct *p) +{ + return get_task_struct(p); +} + +/** + * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task + * acquired by this kfunc which is not stored in a map as a kptr, must be + * released by calling bpf_task_release(). + * @p: The task on which a reference is being acquired. + */ +struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p) +{ + /* For the time being this function returns NULL, as it's not currently + * possible to safely acquire a reference to a task with RCU protection + * using get_task_struct() and put_task_struct(). This is due to the + * slightly odd mechanics of p->rcu_users, and how task RCU protection + * works. + * + * A struct task_struct is refcounted by two different refcount_t + * fields: + * + * 1. p->usage: The "true" refcount field which tracks a task's + * lifetime. The task is freed as soon as this + * refcount drops to 0. + * + * 2. p->rcu_users: An "RCU users" refcount field which is statically + * initialized to 2, and is co-located in a union with + * a struct rcu_head field (p->rcu). p->rcu_users + * essentially encapsulates a single p->usage + * refcount, and when p->rcu_users goes to 0, an RCU + * callback is scheduled on the struct rcu_head which + * decrements the p->usage refcount. + * + * There are two important implications to this task refcounting logic + * described above. The first is that + * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as + * after the refcount goes to 0, the RCU callback being scheduled will + * cause the memory backing the refcount to again be nonzero due to the + * fields sharing a union. The other is that we can't rely on RCU to + * guarantee that a task is valid in a BPF program. This is because a + * task could have already transitioned to being in the TASK_DEAD + * state, had its rcu_users refcount go to 0, and its rcu callback + * invoked in which it drops its single p->usage reference. At this + * point the task will be freed as soon as the last p->usage reference + * goes to 0, without waiting for another RCU gp to elapse. The only + * way that a BPF program can guarantee that a task is valid is in this + * scenario is to hold a p->usage refcount itself. + * + * Until we're able to resolve this issue, either by pulling + * p->rcu_users and p->rcu out of the union, or by getting rid of + * p->usage and just using p->rcu_users for refcounting, we'll just + * return NULL here. + */ + return NULL; +} + +/** + * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task + * kptr acquired by this kfunc which is not subsequently stored in a map, must + * be released by calling bpf_task_release(). + * @pp: A pointer to a task kptr on which a reference is being acquired. + */ +struct task_struct *bpf_task_kptr_get(struct task_struct **pp) +{ + /* We must return NULL here until we have clarity on how to properly + * leverage RCU for ensuring a task's lifetime. See the comment above + * in bpf_task_acquire_not_zero() for more details. + */ + return NULL; +} + +/** + * bpf_task_release - Release the reference acquired on a task. + * @p: The task on which a reference is being released. + */ +void bpf_task_release(struct task_struct *p) +{ + if (!p) + return; + + put_task_struct(p); +} + +#ifdef CONFIG_CGROUPS +/** + * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by + * this kfunc which is not stored in a map as a kptr, must be released by + * calling bpf_cgroup_release(). + * @cgrp: The cgroup on which a reference is being acquired. + */ +struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) +{ + cgroup_get(cgrp); + return cgrp; +} + +/** + * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup + * kptr acquired by this kfunc which is not subsequently stored in a map, must + * be released by calling bpf_cgroup_release(). + * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired. + */ +struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp) +{ + struct cgroup *cgrp; + + rcu_read_lock(); + /* Another context could remove the cgroup from the map and release it + * at any time, including after we've done the lookup above. This is + * safe because we're in an RCU read region, so the cgroup is + * guaranteed to remain valid until at least the rcu_read_unlock() + * below. + */ + cgrp = READ_ONCE(*cgrpp); + + if (cgrp && !cgroup_tryget(cgrp)) + /* If the cgroup had been removed from the map and freed as + * described above, cgroup_tryget() will return false. The + * cgroup will be freed at some point after the current RCU gp + * has ended, so just return NULL to the user. + */ + cgrp = NULL; + rcu_read_unlock(); + + return cgrp; +} + +/** + * bpf_cgroup_release - Release the reference acquired on a cgroup. + * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to + * not be freed until the current grace period has ended, even if its refcount + * drops to 0. + * @cgrp: The cgroup on which a reference is being released. + */ +void bpf_cgroup_release(struct cgroup *cgrp) +{ + if (!cgrp) + return; + + cgroup_put(cgrp); +} + +/** + * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor + * array. A cgroup returned by this kfunc which is not subsequently stored in a + * map, must be released by calling bpf_cgroup_release(). + * @cgrp: The cgroup for which we're performing a lookup. + * @level: The level of ancestor to look up. + */ +struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) +{ + struct cgroup *ancestor; + + if (level > cgrp->level || level < 0) + return NULL; + + ancestor = cgrp->ancestors[level]; + cgroup_get(ancestor); + return ancestor; +} +#endif /* CONFIG_CGROUPS */ + +/** + * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up + * in the root pid namespace idr. If a task is returned, it must either be + * stored in a map, or released with bpf_task_release(). + * @pid: The pid of the task being looked up. + */ +struct task_struct *bpf_task_from_pid(s32 pid) +{ + struct task_struct *p; + + rcu_read_lock(); + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (p) + bpf_task_acquire(p); + rcu_read_unlock(); + + return p; +} + +void *bpf_cast_to_kern_ctx(void *obj) +{ + return obj; +} + +void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k) +{ + return obj__ign; +} + +void bpf_rcu_read_lock(void) +{ + rcu_read_lock(); +} + +void bpf_rcu_read_unlock(void) +{ + rcu_read_unlock(); +} + +__diag_pop(); + +BTF_SET8_START(generic_btf_ids) #ifdef CONFIG_KEXEC_CORE BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE) #endif -BTF_SET8_END(tracing_btf_ids) +BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_list_push_front) +BTF_ID_FLAGS(func, bpf_list_push_back) +BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE) +#ifdef CONFIG_CGROUPS +BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) +#endif +BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) +BTF_SET8_END(generic_btf_ids) -static const struct btf_kfunc_id_set tracing_kfunc_set = { +static const struct btf_kfunc_id_set generic_kfunc_set = { .owner = THIS_MODULE, - .set = &tracing_btf_ids, + .set = &generic_btf_ids, +}; + + +BTF_ID_LIST(generic_dtor_ids) +BTF_ID(struct, task_struct) +BTF_ID(func, bpf_task_release) +#ifdef CONFIG_CGROUPS +BTF_ID(struct, cgroup) +BTF_ID(func, bpf_cgroup_release) +#endif + +BTF_SET8_START(common_btf_ids) +BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx) +BTF_ID_FLAGS(func, bpf_rdonly_cast) +BTF_ID_FLAGS(func, bpf_rcu_read_lock) +BTF_ID_FLAGS(func, bpf_rcu_read_unlock) +BTF_SET8_END(common_btf_ids) + +static const struct btf_kfunc_id_set common_kfunc_set = { + .owner = THIS_MODULE, + .set = &common_btf_ids, }; static int __init kfunc_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set); + int ret; + const struct btf_id_dtor_kfunc generic_dtors[] = { + { + .btf_id = generic_dtor_ids[0], + .kfunc_btf_id = generic_dtor_ids[1] + }, +#ifdef CONFIG_CGROUPS + { + .btf_id = generic_dtor_ids[2], + .kfunc_btf_id = generic_dtor_ids[3] + }, +#endif + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set); + ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors, + ARRAY_SIZE(generic_dtors), + THIS_MODULE); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set); } late_initcall(kfunc_init); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 098cf336fae6..e90d9f63edc5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -151,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 135205d0d560..38136ec4e095 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) struct bpf_map *inner_map, *inner_map_meta; u32 inner_map_meta_size; struct fd f; + int ret; f = fdget(inner_map_ufd); inner_map = __bpf_map_get(f); @@ -20,18 +21,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) { - fdput(f); - return ERR_PTR(-EINVAL); + ret = -EINVAL; + goto put; } if (!inner_map->ops->map_meta_equal) { - fdput(f); - return ERR_PTR(-ENOTSUPP); - } - - if (map_value_has_spin_lock(inner_map)) { - fdput(f); - return ERR_PTR(-ENOTSUPP); + ret = -ENOTSUPP; + goto put; } inner_map_meta_size = sizeof(*inner_map_meta); @@ -41,8 +37,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { - fdput(f); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto put; } inner_map_meta->map_type = inner_map->map_type; @@ -50,9 +46,33 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; - inner_map_meta->spin_lock_off = inner_map->spin_lock_off; - inner_map_meta->timer_off = inner_map->timer_off; - inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map); + + inner_map_meta->record = btf_record_dup(inner_map->record); + if (IS_ERR(inner_map_meta->record)) { + /* btf_record_dup returns NULL or valid pointer in case of + * invalid/empty/valid, but ERR_PTR in case of errors. During + * equality NULL or IS_ERR is equivalent. + */ + ret = PTR_ERR(inner_map_meta->record); + goto free; + } + if (inner_map_meta->record) { + struct btf_field_offs *field_offs; + /* If btf_record is !IS_ERR_OR_NULL, then field_offs is always + * valid. + */ + field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN); + if (!field_offs) { + ret = -ENOMEM; + goto free_rec; + } + inner_map_meta->field_offs = field_offs; + } + /* Note: We must use the same BTF, as we also used btf_record_dup above + * which relies on BTF being same for both maps, as some members like + * record->fields.list_head have pointers like value_rec pointing into + * inner_map->btf. + */ if (inner_map->btf) { btf_get(inner_map->btf); inner_map_meta->btf = inner_map->btf; @@ -68,11 +88,19 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) fdput(f); return inner_map_meta; +free_rec: + btf_record_free(inner_map_meta->record); +free: + kfree(inner_map_meta); +put: + fdput(f); + return ERR_PTR(ret); } void bpf_map_meta_free(struct bpf_map *map_meta) { - bpf_map_free_kptr_off_tab(map_meta); + kfree(map_meta->field_offs); + bpf_map_free_record(map_meta); btf_put(map_meta->btf); kfree(map_meta); } @@ -84,9 +112,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && - meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags && - bpf_map_equal_kptr_off_tab(meta0, meta1); + btf_record_equal(meta0->record, meta1->record); } void *bpf_map_fd_get_ptr(struct bpf_map *map, diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index 4901fa1048cd..ebcc3dd0fa19 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -171,9 +171,24 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) memcg = get_memcg(c); old_memcg = set_active_memcg(memcg); for (i = 0; i < cnt; i++) { - obj = __alloc(c, node); - if (!obj) - break; + /* + * free_by_rcu is only manipulated by irq work refill_work(). + * IRQ works on the same CPU are called sequentially, so it is + * safe to use __llist_del_first() here. If alloc_bulk() is + * invoked by the initial prefill, there will be no running + * refill_work(), so __llist_del_first() is fine as well. + * + * In most cases, objects on free_by_rcu are from the same CPU. + * If some objects come from other CPUs, it doesn't incur any + * harm because NUMA_NO_NODE means the preference for current + * numa node and it is not a guarantee. + */ + obj = __llist_del_first(&c->free_by_rcu); + if (!obj) { + obj = __alloc(c, node); + if (!obj) + break; + } if (IS_ENABLED(CONFIG_PREEMPT_RT)) /* In RT irq_work runs in per-cpu kthread, so disable * interrupts to avoid preemption and interrupts and @@ -222,9 +237,13 @@ static void __free_rcu(struct rcu_head *head) static void __free_rcu_tasks_trace(struct rcu_head *head) { - struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); - - call_rcu(&c->rcu, __free_rcu); + /* If RCU Tasks Trace grace period implies RCU grace period, + * there is no need to invoke call_rcu(). + */ + if (rcu_trace_implies_rcu_gp()) + __free_rcu(head); + else + call_rcu(head, __free_rcu); } static void enque_to_free(struct bpf_mem_cache *c, void *obj) @@ -253,8 +272,9 @@ static void do_call_rcu(struct bpf_mem_cache *c) */ __llist_add(llnode, &c->waiting_for_gp); /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. - * Then use call_rcu() to wait for normal progs to finish - * and finally do free_one() on each element. + * If RCU Tasks Trace grace period implies RCU grace period, free + * these elements directly, else use call_rcu() to wait for normal + * progs to finish and finally do free_one() on each element. */ call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace); } @@ -444,9 +464,17 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma) { /* waiting_for_gp lists was drained, but __free_rcu might * still execute. Wait for it now before we freeing percpu caches. + * + * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), + * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used + * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(), + * so if call_rcu(head, __free_rcu) is skipped due to + * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by + * using rcu_trace_implies_rcu_gp() as well. */ rcu_barrier_tasks_trace(); - rcu_barrier(); + if (!rcu_trace_implies_rcu_gp()) + rcu_barrier(); free_mem_alloc_no_barrier(ma); } diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index b6e7f5c5b9ab..034cf87b54e9 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -100,22 +100,21 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems) { struct pcpu_freelist_head *head; - int i, cpu, pcpu_entries; + unsigned int cpu, cpu_idx, i, j, n, m; - pcpu_entries = nr_elems / num_possible_cpus() + 1; - i = 0; + n = nr_elems / num_possible_cpus(); + m = nr_elems % num_possible_cpus(); + cpu_idx = 0; for_each_possible_cpu(cpu) { -again: head = per_cpu_ptr(s->freelist, cpu); - /* No locking required as this is not visible yet. */ - pcpu_freelist_push_node(head, buf); - i++; - buf += elem_size; - if (i == nr_elems) - break; - if (i % pcpu_entries) - goto again; + j = n + (cpu_idx < m ? 1 : 0); + for (i = 0; i < j; i++) { + /* No locking required as this is not visible yet. */ + pcpu_freelist_push_node(head, buf); + buf += elem_size; + } + cpu_idx++; } } diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 9e832acf4692..80f4b4d88aaf 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -447,7 +447,7 @@ BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) const struct bpf_func_proto bpf_ringbuf_reserve_proto = { .func = bpf_ringbuf_reserve, - .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL, + .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, @@ -490,7 +490,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, + .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; @@ -503,7 +503,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, - .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE, + .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7b373a5e861f..64131f88c553 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -175,8 +175,8 @@ static void maybe_wait_bpf_programs(struct bpf_map *map) synchronize_rcu(); } -static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, - void *value, __u64 flags) +static int bpf_map_update_value(struct bpf_map *map, struct file *map_file, + void *key, void *value, __u64 flags) { int err; @@ -190,7 +190,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, map->map_type == BPF_MAP_TYPE_SOCKMAP) { return sock_map_update_elem_sys(map, key, value, flags); } else if (IS_FD_PROG_ARRAY(map)) { - return bpf_fd_array_map_update_elem(map, f.file, key, value, + return bpf_fd_array_map_update_elem(map, map_file, key, value, flags); } @@ -205,12 +205,12 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key, flags); } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); - err = bpf_fd_array_map_update_elem(map, f.file, key, value, + err = bpf_fd_array_map_update_elem(map, map_file, key, value, flags); rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { rcu_read_lock(); - err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + err = bpf_fd_htab_map_update_elem(map, map_file, key, value, flags); rcu_read_unlock(); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { @@ -495,114 +495,181 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -static int bpf_map_kptr_off_cmp(const void *a, const void *b) +static int btf_field_cmp(const void *a, const void *b) { - const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b; + const struct btf_field *f1 = a, *f2 = b; - if (off_desc1->offset < off_desc2->offset) + if (f1->offset < f2->offset) return -1; - else if (off_desc1->offset > off_desc2->offset) + else if (f1->offset > f2->offset) return 1; return 0; } -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset) +struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset, + enum btf_field_type type) { - /* Since members are iterated in btf_find_field in increasing order, - * offsets appended to kptr_off_tab are in increasing order, so we can - * do bsearch to find exact match. - */ - struct bpf_map_value_off *tab; + struct btf_field *field; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type)) + return NULL; + field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp); + if (!field || !(field->type & type)) return NULL; - tab = map->kptr_off_tab; - return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp); + return field; } -void bpf_map_free_kptr_off_tab(struct bpf_map *map) +void btf_record_free(struct btf_record *rec) { - struct bpf_map_value_off *tab = map->kptr_off_tab; int i; - if (!map_value_has_kptrs(map)) + if (IS_ERR_OR_NULL(rec)) return; - for (i = 0; i < tab->nr_off; i++) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + for (i = 0; i < rec->cnt; i++) { + switch (rec->fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (rec->fields[i].kptr.module) + module_put(rec->fields[i].kptr.module); + btf_put(rec->fields[i].kptr.btf); + break; + case BPF_LIST_HEAD: + case BPF_LIST_NODE: + /* Nothing to release for bpf_list_head */ + break; + default: + WARN_ON_ONCE(1); + continue; + } } - kfree(tab); - map->kptr_off_tab = NULL; + kfree(rec); +} + +void bpf_map_free_record(struct bpf_map *map) +{ + btf_record_free(map->record); + map->record = NULL; } -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map) +struct btf_record *btf_record_dup(const struct btf_record *rec) { - struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab; - int size, i; + const struct btf_field *fields; + struct btf_record *new_rec; + int ret, size, i; - if (!map_value_has_kptrs(map)) - return ERR_PTR(-ENOENT); - size = offsetof(struct bpf_map_value_off, off[tab->nr_off]); - new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN); - if (!new_tab) + if (IS_ERR_OR_NULL(rec)) + return NULL; + size = offsetof(struct btf_record, fields[rec->cnt]); + new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN); + if (!new_rec) return ERR_PTR(-ENOMEM); - /* Do a deep copy of the kptr_off_tab */ - for (i = 0; i < tab->nr_off; i++) { - btf_get(tab->off[i].kptr.btf); - if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) { - while (i--) { - if (tab->off[i].kptr.module) - module_put(tab->off[i].kptr.module); - btf_put(tab->off[i].kptr.btf); + /* Do a deep copy of the btf_record */ + fields = rec->fields; + new_rec->cnt = 0; + for (i = 0; i < rec->cnt; i++) { + switch (fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + btf_get(fields[i].kptr.btf); + if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) { + ret = -ENXIO; + goto free; } - kfree(new_tab); - return ERR_PTR(-ENXIO); + break; + case BPF_LIST_HEAD: + case BPF_LIST_NODE: + /* Nothing to acquire for bpf_list_head */ + break; + default: + ret = -EFAULT; + WARN_ON_ONCE(1); + goto free; } + new_rec->cnt++; } - return new_tab; + return new_rec; +free: + btf_record_free(new_rec); + return ERR_PTR(ret); } -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b) +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b) { - struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab; - bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b); + bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b); int size; - if (!a_has_kptr && !b_has_kptr) + if (!a_has_fields && !b_has_fields) return true; - if (a_has_kptr != b_has_kptr) + if (a_has_fields != b_has_fields) return false; - if (tab_a->nr_off != tab_b->nr_off) + if (rec_a->cnt != rec_b->cnt) return false; - size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]); - return !memcmp(tab_a, tab_b, size); + size = offsetof(struct btf_record, fields[rec_a->cnt]); + /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused + * members are zeroed out. So memcmp is safe to do without worrying + * about padding/unused fields. + * + * While spin_lock, timer, and kptr have no relation to map BTF, + * list_head metadata is specific to map BTF, the btf and value_rec + * members in particular. btf is the map BTF, while value_rec points to + * btf_record in that map BTF. + * + * So while by default, we don't rely on the map BTF (which the records + * were parsed from) matching for both records, which is not backwards + * compatible, in case list_head is part of it, we implicitly rely on + * that by way of depending on memcmp succeeding for it. + */ + return !memcmp(rec_a, rec_b, size); } -/* Caller must ensure map_value_has_kptrs is true. Note that this function can - * be called on a map value while the map_value is visible to BPF programs, as - * it ensures the correct synchronization, and we already enforce the same using - * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs. - */ -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) +void bpf_obj_free_timer(const struct btf_record *rec, void *obj) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - unsigned long *btf_id_ptr; - int i; + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) + return; + bpf_timer_cancel_and_free(obj + rec->timer_off); +} - for (i = 0; i < tab->nr_off; i++) { - struct bpf_map_value_off_desc *off_desc = &tab->off[i]; - unsigned long old_ptr; +void bpf_obj_free_fields(const struct btf_record *rec, void *obj) +{ + const struct btf_field *fields; + int i; - btf_id_ptr = map_value + off_desc->offset; - if (off_desc->type == BPF_KPTR_UNREF) { - u64 *p = (u64 *)btf_id_ptr; + if (IS_ERR_OR_NULL(rec)) + return; + fields = rec->fields; + for (i = 0; i < rec->cnt; i++) { + const struct btf_field *field = &fields[i]; + void *field_ptr = obj + field->offset; - WRITE_ONCE(*p, 0); + switch (fields[i].type) { + case BPF_SPIN_LOCK: + break; + case BPF_TIMER: + bpf_timer_cancel_and_free(field_ptr); + break; + case BPF_KPTR_UNREF: + WRITE_ONCE(*(u64 *)field_ptr, 0); + break; + case BPF_KPTR_REF: + field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0)); + break; + case BPF_LIST_HEAD: + if (WARN_ON_ONCE(rec->spin_lock_off < 0)) + continue; + bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off); + break; + case BPF_LIST_NODE: + break; + default: + WARN_ON_ONCE(1); continue; } - old_ptr = xchg(btf_id_ptr, 0); - off_desc->kptr.dtor((void *)old_ptr); } } @@ -610,14 +677,24 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); + struct btf_field_offs *foffs = map->field_offs; + struct btf_record *rec = map->record; security_bpf_map_free(map); - kfree(map->off_arr); bpf_map_release_memcg(map); - /* implementation dependent freeing, map_free callback also does - * bpf_map_free_kptr_off_tab, if needed. - */ + /* implementation dependent freeing */ map->ops->map_free(map); + /* Delay freeing of field_offs and btf_record for maps, as map_free + * callback usually needs access to them. It is better to do it here + * than require each callback to do the free itself manually. + * + * Note that the btf_record stashed in map->inner_map_meta->record was + * already freed using the map_free callback for map in map case which + * eventually calls bpf_map_free_meta, since inner_map_meta is only a + * template bpf_map struct used during verification. + */ + kfree(foffs); + btf_record_free(rec); } static void bpf_map_put_uref(struct bpf_map *map) @@ -778,8 +855,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) struct bpf_map *map = filp->private_data; int err; - if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map) || map_value_has_kptrs(map)) + if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -906,84 +982,6 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv) -{ - const u32 a = *(const u32 *)_a; - const u32 b = *(const u32 *)_b; - - if (a < b) - return -1; - else if (a > b) - return 1; - return 0; -} - -static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv) -{ - struct bpf_map *map = (struct bpf_map *)priv; - u32 *off_base = map->off_arr->field_off; - u32 *a = _a, *b = _b; - u8 *sz_a, *sz_b; - - sz_a = map->off_arr->field_sz + (a - off_base); - sz_b = map->off_arr->field_sz + (b - off_base); - - swap(*a, *b); - swap(*sz_a, *sz_b); -} - -static int bpf_map_alloc_off_arr(struct bpf_map *map) -{ - bool has_spin_lock = map_value_has_spin_lock(map); - bool has_timer = map_value_has_timer(map); - bool has_kptrs = map_value_has_kptrs(map); - struct bpf_map_off_arr *off_arr; - u32 i; - - if (!has_spin_lock && !has_timer && !has_kptrs) { - map->off_arr = NULL; - return 0; - } - - off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN); - if (!off_arr) - return -ENOMEM; - map->off_arr = off_arr; - - off_arr->cnt = 0; - if (has_spin_lock) { - i = off_arr->cnt; - - off_arr->field_off[i] = map->spin_lock_off; - off_arr->field_sz[i] = sizeof(struct bpf_spin_lock); - off_arr->cnt++; - } - if (has_timer) { - i = off_arr->cnt; - - off_arr->field_off[i] = map->timer_off; - off_arr->field_sz[i] = sizeof(struct bpf_timer); - off_arr->cnt++; - } - if (has_kptrs) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - u32 *off = &off_arr->field_off[off_arr->cnt]; - u8 *sz = &off_arr->field_sz[off_arr->cnt]; - - for (i = 0; i < tab->nr_off; i++) { - *off++ = tab->off[i].offset; - *sz++ = sizeof(u64); - } - off_arr->cnt += tab->nr_off; - } - - if (off_arr->cnt == 1) - return 0; - sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]), - map_off_arr_cmp, map_off_arr_swap, map); - return 0; -} - static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { @@ -1006,39 +1004,12 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; - map->spin_lock_off = btf_find_spin_lock(btf, value_type); - - if (map_value_has_spin_lock(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE && - map->map_type != BPF_MAP_TYPE_TASK_STORAGE) - return -ENOTSUPP; - if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > - map->value_size) { - WARN_ONCE(1, - "verifier bug spin_lock_off %d value_size %d\n", - map->spin_lock_off, map->value_size); - return -EFAULT; - } - } + map->record = btf_parse_fields(btf, value_type, + BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD, + map->value_size); + if (!IS_ERR_OR_NULL(map->record)) { + int i; - map->timer_off = btf_find_timer(btf, value_type); - if (map_value_has_timer(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) - return -EOPNOTSUPP; - } - - map->kptr_off_tab = btf_parse_kptrs(btf, value_type); - if (map_value_has_kptrs(map)) { if (!bpf_capable()) { ret = -EPERM; goto free_map_tab; @@ -1047,15 +1018,60 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, ret = -EACCES; goto free_map_tab; } - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { - ret = -EOPNOTSUPP; - goto free_map_tab; + for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) { + switch (map->record->field_mask & (1 << i)) { + case 0: + continue; + case BPF_SPIN_LOCK: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_TIMER: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_LIST_HEAD: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + default: + /* Fail if map_type checks are missing for a field type */ + ret = -EOPNOTSUPP; + goto free_map_tab; + } } } + ret = btf_check_and_fixup_fields(btf, map->record); + if (ret < 0) + goto free_map_tab; + if (map->ops->map_check_btf) { ret = map->ops->map_check_btf(map, btf, key_type, value_type); if (ret < 0) @@ -1064,7 +1080,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, return ret; free_map_tab: - bpf_map_free_kptr_off_tab(map); + bpf_map_free_record(map); return ret; } @@ -1073,6 +1089,7 @@ free_map_tab: static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); + struct btf_field_offs *foffs; struct bpf_map *map; int f_flags; int err; @@ -1117,8 +1134,6 @@ static int map_create(union bpf_attr *attr) mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner.lock); - map->spin_lock_off = -EINVAL; - map->timer_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with @@ -1154,13 +1169,17 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = bpf_map_alloc_off_arr(map); - if (err) + + foffs = btf_parse_field_offs(map->record); + if (IS_ERR(foffs)) { + err = PTR_ERR(foffs); goto free_map; + } + map->field_offs = foffs; err = security_bpf_map_alloc(map); if (err) - goto free_map_off_arr; + goto free_map_field_offs; err = bpf_map_alloc_id(map); if (err) @@ -1184,8 +1203,8 @@ static int map_create(union bpf_attr *attr) free_map_sec: security_bpf_map_free(map); -free_map_off_arr: - kfree(map->off_arr); +free_map_field_offs: + kfree(map->field_offs); free_map: btf_put(map->btf); map->ops->map_free(map); @@ -1332,7 +1351,7 @@ static int map_lookup_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1405,7 +1424,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1423,7 +1442,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) goto free_key; } - err = bpf_map_update_value(map, f, key, value, attr->flags); + err = bpf_map_update_value(map, f.file, key, value, attr->flags); kvfree(value); free_key: @@ -1568,7 +1587,7 @@ int generic_map_delete_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1609,23 +1628,21 @@ int generic_map_delete_batch(struct bpf_map *map, return err; } -int generic_map_update_batch(struct bpf_map *map, +int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr) { void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 value_size, cp, max_count; - int ufd = attr->batch.map_fd; void *key, *value; - struct fd f; int err = 0; if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1645,7 +1662,6 @@ int generic_map_update_batch(struct bpf_map *map, return -ENOMEM; } - f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */ for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, @@ -1653,7 +1669,7 @@ int generic_map_update_batch(struct bpf_map *map, copy_from_user(value, values + cp * value_size, value_size)) break; - err = bpf_map_update_value(map, f, key, value, + err = bpf_map_update_value(map, map_file, key, value, attr->batch.elem_flags); if (err) @@ -1666,7 +1682,6 @@ int generic_map_update_batch(struct bpf_map *map, kvfree(value); kvfree(key); - fdput(f); return err; } @@ -1688,7 +1703,7 @@ int generic_map_lookup_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; value_size = bpf_map_value_size(map); @@ -1810,7 +1825,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1881,8 +1896,7 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map) || map_value_has_kptrs(map)) { + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) { fdput(f); return -ENOTSUPP; } @@ -2117,11 +2131,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, st = per_cpu_ptr(prog->stats, cpu); do { - start = u64_stats_fetch_begin_irq(&st->syncp); + start = u64_stats_fetch_begin(&st->syncp); tnsecs = u64_stats_read(&st->nsecs); tcnt = u64_stats_read(&st->cnt); tmisses = u64_stats_read(&st->misses); - } while (u64_stats_fetch_retry_irq(&st->syncp, start)); + } while (u64_stats_fetch_retry(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; misses += tmisses; @@ -3504,9 +3518,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_LSM: if (ptype == BPF_PROG_TYPE_LSM && prog->expected_attach_type != BPF_LSM_CGROUP) - return -EINVAL; - - ret = cgroup_bpf_prog_attach(attr, ptype, prog); + ret = -EINVAL; + else + ret = cgroup_bpf_prog_attach(attr, ptype, prog); break; default: ret = -EINVAL; @@ -4460,13 +4474,13 @@ put_file: #define BPF_MAP_BATCH_LAST_FIELD batch.flags -#define BPF_DO_BATCH(fn) \ +#define BPF_DO_BATCH(fn, ...) \ do { \ if (!fn) { \ err = -ENOTSUPP; \ goto err_put; \ } \ - err = fn(map, attr, uattr); \ + err = fn(__VA_ARGS__); \ } while (0) static int bpf_map_do_batch(const union bpf_attr *attr, @@ -4500,13 +4514,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr, } if (cmd == BPF_MAP_LOOKUP_BATCH) - BPF_DO_BATCH(map->ops->map_lookup_batch); + BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr); else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) - BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch); + BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr); else if (cmd == BPF_MAP_UPDATE_BATCH) - BPF_DO_BATCH(map->ops->map_update_batch); + BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr); else - BPF_DO_BATCH(map->ops->map_delete_batch); + BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr); err_put: if (has_write) bpf_map_write_active_dec(map); @@ -5133,13 +5147,14 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) run_ctx.bpf_cookie = 0; run_ctx.saved_run_ctx = NULL; - if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) { + if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) { /* recursion detected */ bpf_prog_put(prog); return -EBUSY; } attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); - __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx); + __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */, + &run_ctx); bpf_prog_put(prog); return 0; #endif diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index bf0906e1e2b9..11f5ec0b8016 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -468,8 +468,7 @@ again: if (err < 0) goto out; - set_memory_ro((long)im->image, 1); - set_memory_x((long)im->image, 1); + set_memory_rox((long)im->image, 1); WARN_ON(tr->cur_image && tr->selector == 0); WARN_ON(!tr->cur_image && tr->selector); @@ -864,7 +863,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void) * [2..MAX_U64] - execute bpf prog and record execution time. * This is start time. */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) +static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); @@ -901,7 +900,8 @@ static void notrace update_prog_stats(struct bpf_prog *prog, } } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx) +static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); @@ -912,8 +912,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_ rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx) +static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { /* Runtime stats are exported via actual BPF_LSM_CGROUP @@ -927,8 +927,8 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, return NO_START_TIME; } -void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) +static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); @@ -937,7 +937,8 @@ void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) +u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) { rcu_read_lock_trace(); migrate_disable(); @@ -953,8 +954,8 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) +void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); @@ -964,8 +965,30 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, rcu_read_unlock_trace(); } -u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx) +static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) +{ + rcu_read_lock_trace(); + migrate_disable(); + might_fault(); + + run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx); + + return bpf_prog_start_time(); +} + +static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) +{ + bpf_reset_run_ctx(run_ctx->saved_run_ctx); + + update_prog_stats(prog, start); + migrate_enable(); + rcu_read_unlock_trace(); +} + +static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx) __acquires(RCU) { rcu_read_lock(); @@ -976,8 +999,8 @@ u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx) +static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx) __releases(RCU) { bpf_reset_run_ctx(run_ctx->saved_run_ctx); @@ -997,6 +1020,36 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) percpu_ref_put(&tr->pcref); } +bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) +{ + bool sleepable = prog->aux->sleepable; + + if (bpf_prog_check_recur(prog)) + return sleepable ? __bpf_prog_enter_sleepable_recur : + __bpf_prog_enter_recur; + + if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && + prog->expected_attach_type == BPF_LSM_CGROUP) + return __bpf_prog_enter_lsm_cgroup; + + return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter; +} + +bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) +{ + bool sleepable = prog->aux->sleepable; + + if (bpf_prog_check_recur(prog)) + return sleepable ? __bpf_prog_exit_sleepable_recur : + __bpf_prog_exit_recur; + + if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && + prog->expected_attach_type == BPF_LSM_CGROUP) + return __bpf_prog_exit_lsm_cgroup; + + return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit; +} + int __weak arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7f0a9f6cb889..a5255a0dcbb6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -262,7 +262,7 @@ struct bpf_call_arg_meta { struct btf *ret_btf; u32 ret_btf_id; u32 subprogno; - struct bpf_map_value_off_desc *kptr_off_desc; + struct btf_field *kptr_field; u8 uninit_dynptr_regno; }; @@ -451,17 +451,29 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } -static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +static bool type_is_ptr_alloc_obj(u32 type) { - return reg->type == PTR_TO_MAP_VALUE && - map_value_has_spin_lock(reg->map_ptr); + return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC; } -static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) +static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) { - type = base_type(type); - return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK || - type == PTR_TO_MEM || type == PTR_TO_BTF_ID; + struct btf_record *rec = NULL; + struct btf_struct_meta *meta; + + if (reg->type == PTR_TO_MAP_VALUE) { + rec = reg->map_ptr->record; + } else if (type_is_ptr_alloc_obj(reg->type)) { + meta = btf_find_struct_meta(reg->btf, reg->btf_id); + if (meta) + rec = meta->record; + } + return rec; +} + +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +{ + return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -511,6 +523,23 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_dynptr_data; } +static bool is_callback_calling_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_for_each_map_elem || + func_id == BPF_FUNC_timer_set_callback || + func_id == BPF_FUNC_find_vma || + func_id == BPF_FUNC_loop || + func_id == BPF_FUNC_user_ringbuf_drain; +} + +static bool is_storage_get_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_storage_get || + func_id == BPF_FUNC_inode_storage_get || + func_id == BPF_FUNC_task_storage_get || + func_id == BPF_FUNC_cgrp_storage_get; +} + static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id, const struct bpf_map *map) { @@ -541,7 +570,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) static const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { - char postfix[16] = {0}, prefix[32] = {0}; + char postfix[16] = {0}, prefix[64] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "scalar", @@ -563,7 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env, [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", - [PTR_TO_DYNPTR] = "dynptr_ptr", + [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { @@ -573,16 +602,15 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); } - if (type & MEM_RDONLY) - strncpy(prefix, "rdonly_", 32); - if (type & MEM_ALLOC) - strncpy(prefix, "alloc_", 32); - if (type & MEM_USER) - strncpy(prefix, "user_", 32); - if (type & MEM_PERCPU) - strncpy(prefix, "percpu_", 32); - if (type & PTR_UNTRUSTED) - strncpy(prefix, "untrusted_", 32); + snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", + type & MEM_RDONLY ? "rdonly_" : "", + type & MEM_RINGBUF ? "ringbuf_" : "", + type & MEM_USER ? "user_" : "", + type & MEM_PERCPU ? "percpu_" : "", + type & MEM_RCU ? "rcu_" : "", + type & PTR_UNTRUSTED ? "untrusted_" : "", + type & PTR_TRUSTED ? "trusted_" : "" + ); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); @@ -697,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type) return type == BPF_DYNPTR_TYPE_RINGBUF; } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type, + bool first_slot); + +static void __mark_reg_not_init(const struct bpf_verifier_env *env, + struct bpf_reg_state *reg); + +static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1, + struct bpf_reg_state *sreg2, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(sreg1, type, true); + __mark_dynptr_reg(sreg2, type, false); +} + +static void mark_dynptr_cb_reg(struct bpf_reg_state *reg, + enum bpf_dynptr_type type) +{ + __mark_dynptr_reg(reg, type, true); +} + + static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, enum bpf_arg_type arg_type, int insn_idx) { @@ -718,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (type == BPF_DYNPTR_TYPE_INVALID) return -EINVAL; - state->stack[spi].spilled_ptr.dynptr.first_slot = true; - state->stack[spi].spilled_ptr.dynptr.type = type; - state->stack[spi - 1].spilled_ptr.dynptr.type = type; + mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr, + &state->stack[spi - 1].spilled_ptr, type); if (dynptr_type_refcounted(type)) { /* The id is used to track proper releasing */ @@ -728,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_ if (id < 0) return id; - state->stack[spi].spilled_ptr.id = id; - state->stack[spi - 1].spilled_ptr.id = id; + state->stack[spi].spilled_ptr.ref_obj_id = id; + state->stack[spi - 1].spilled_ptr.ref_obj_id = id; } return 0; @@ -751,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re } /* Invalidate any slices associated with this dynptr */ - if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) { - release_reference(env, state->stack[spi].spilled_ptr.id); - state->stack[spi].spilled_ptr.id = 0; - state->stack[spi - 1].spilled_ptr.id = 0; - } - - state->stack[spi].spilled_ptr.dynptr.first_slot = false; - state->stack[spi].spilled_ptr.dynptr.type = 0; - state->stack[spi - 1].spilled_ptr.dynptr.type = 0; + if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) + WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id)); + __mark_reg_not_init(env, &state->stack[spi].spilled_ptr); + __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr); return 0; } static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); - int i; + int spi, i; + if (reg->type == CONST_PTR_TO_DYNPTR) + return false; + + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS)) return true; @@ -782,13 +829,17 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_ return true; } -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg) +static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; int i; + /* This already represents first slot of initialized bpf_dynptr */ + if (reg->type == CONST_PTR_TO_DYNPTR) + return true; + + spi = get_spi(reg->off); if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || !state->stack[spi].spilled_ptr.dynptr.first_slot) return false; @@ -802,21 +853,24 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, return true; } -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type) +static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + enum bpf_arg_type arg_type) { struct bpf_func_state *state = func(env, reg); enum bpf_dynptr_type dynptr_type; - int spi = get_spi(reg->off); + int spi; /* ARG_PTR_TO_DYNPTR takes any type of dynptr */ if (arg_type == ARG_PTR_TO_DYNPTR) return true; dynptr_type = arg_to_dynptr_type(arg_type); - - return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + if (reg->type == CONST_PTR_TO_DYNPTR) { + return reg->dynptr.type == dynptr_type; + } else { + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type; + } } /* The reg state of a pointer or a bounded scalar was saved when @@ -875,7 +929,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (reg->id) verbose_a("id=%d", reg->id); - if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id) + if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) verbose_a("off=%d", reg->off); @@ -1008,9 +1062,9 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - if (ksize(dst) < bytes) { + if (ksize(dst) < ksize(src)) { kfree(dst); - dst = kmalloc_track_caller(bytes, flags); + dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags); if (!dst) return NULL; } @@ -1027,12 +1081,19 @@ out: */ static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size) { + size_t alloc_size; + void *new_arr; + if (!new_n || old_n == new_n) goto out; - arr = krealloc_array(arr, new_n, size, GFP_KERNEL); - if (!arr) + alloc_size = kmalloc_size_roundup(size_mul(new_n, size)); + new_arr = krealloc(arr, alloc_size, GFP_KERNEL); + if (!new_arr) { + kfree(arr); return NULL; + } + arr = new_arr; if (new_n > old_n) memset(arr + old_n * size, 0, (new_n - old_n) * size); @@ -1199,8 +1260,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, dst_state->frame[i] = NULL; } dst_state->speculative = src->speculative; + dst_state->active_rcu_lock = src->active_rcu_lock; dst_state->curframe = src->curframe; - dst_state->active_spin_lock = src->active_spin_lock; + dst_state->active_lock.ptr = src->active_lock.ptr; + dst_state->active_lock.id = src->active_lock.id; dst_state->branches = src->branches; dst_state->parent = src->parent; dst_state->first_insn_idx = src->first_insn_idx; @@ -1319,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; -static void __mark_reg_not_init(const struct bpf_verifier_env *env, - struct bpf_reg_state *reg); - /* This helper doesn't clear reg->id */ static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm) { @@ -1384,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type, + bool first_slot) +{ + /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for + * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply + * set it unconditionally as it is ignored for STACK_DYNPTR anyway. + */ + __mark_reg_known_zero(reg); + reg->type = CONST_PTR_TO_DYNPTR; + reg->dynptr.type = type; + reg->dynptr.first_slot = first_slot; +} + static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { if (base_type(reg->type) == PTR_TO_MAP_VALUE) { @@ -1395,7 +1468,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (map_value_has_timer(map->inner_map_meta)) + if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; @@ -1684,7 +1757,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; + reg->precise = !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -2493,15 +2566,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return 0; } +static void mark_jmp_point(struct bpf_verifier_env *env, int idx) +{ + env->insn_aux_data[idx].jmp_point = true; +} + +static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].jmp_point; +} + /* for any branch, call, exit record the history of jmps in the given state */ static int push_jmp_history(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { u32 cnt = cur->jmp_history_cnt; struct bpf_idx_pair *p; + size_t alloc_size; + + if (!is_jmp_point(env, env->insn_idx)) + return 0; cnt++; - p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); + alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); + p = krealloc(cur->jmp_history, alloc_size, GFP_USER); if (!p) return -ENOMEM; p[cnt - 1].idx = env->insn_idx; @@ -2653,6 +2741,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (opcode == BPF_CALL) { if (insn->src_reg == BPF_PSEUDO_CALL) return -ENOTSUPP; + /* BPF helpers that invoke callback subprogs are + * equivalent to BPF_PSEUDO_CALL above + */ + if (insn->src_reg == 0 && is_callback_calling_function(insn->imm)) + return -ENOTSUPP; /* regular helper call sets R0 */ *reg_mask &= ~1; if (*reg_mask & 0x3f) { @@ -2742,8 +2835,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, /* big hammer: mark all scalars precise in this path. * pop_stack may still get !precise scalars. + * We also skip current state and go straight to first parent state, + * because precision markings in current non-checkpointed state are + * not needed. See why in the comment in __mark_chain_precision below. */ - for (; st; st = st->parent) + for (st = st->parent; st; st = st->parent) { for (i = 0; i <= st->curframe; i++) { func = st->frame[i]; for (j = 0; j < BPF_REG_FP; j++) { @@ -2761,9 +2857,122 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env, reg->precise = true; } } + } } -static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, +static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st) +{ + struct bpf_func_state *func; + struct bpf_reg_state *reg; + int i, j; + + for (i = 0; i <= st->curframe; i++) { + func = st->frame[i]; + for (j = 0; j < BPF_REG_FP; j++) { + reg = &func->regs[j]; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { + if (!is_spilled_reg(&func->stack[j])) + continue; + reg = &func->stack[j].spilled_ptr; + if (reg->type != SCALAR_VALUE) + continue; + reg->precise = false; + } + } +} + +/* + * __mark_chain_precision() backtracks BPF program instruction sequence and + * chain of verifier states making sure that register *regno* (if regno >= 0) + * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked + * SCALARS, as well as any other registers and slots that contribute to + * a tracked state of given registers/stack slots, depending on specific BPF + * assembly instructions (see backtrack_insns() for exact instruction handling + * logic). This backtracking relies on recorded jmp_history and is able to + * traverse entire chain of parent states. This process ends only when all the + * necessary registers/slots and their transitive dependencies are marked as + * precise. + * + * One important and subtle aspect is that precise marks *do not matter* in + * the currently verified state (current state). It is important to understand + * why this is the case. + * + * First, note that current state is the state that is not yet "checkpointed", + * i.e., it is not yet put into env->explored_states, and it has no children + * states as well. It's ephemeral, and can end up either a) being discarded if + * compatible explored state is found at some point or BPF_EXIT instruction is + * reached or b) checkpointed and put into env->explored_states, branching out + * into one or more children states. + * + * In the former case, precise markings in current state are completely + * ignored by state comparison code (see regsafe() for details). Only + * checkpointed ("old") state precise markings are important, and if old + * state's register/slot is precise, regsafe() assumes current state's + * register/slot as precise and checks value ranges exactly and precisely. If + * states turn out to be compatible, current state's necessary precise + * markings and any required parent states' precise markings are enforced + * after the fact with propagate_precision() logic, after the fact. But it's + * important to realize that in this case, even after marking current state + * registers/slots as precise, we immediately discard current state. So what + * actually matters is any of the precise markings propagated into current + * state's parent states, which are always checkpointed (due to b) case above). + * As such, for scenario a) it doesn't matter if current state has precise + * markings set or not. + * + * Now, for the scenario b), checkpointing and forking into child(ren) + * state(s). Note that before current state gets to checkpointing step, any + * processed instruction always assumes precise SCALAR register/slot + * knowledge: if precise value or range is useful to prune jump branch, BPF + * verifier takes this opportunity enthusiastically. Similarly, when + * register's value is used to calculate offset or memory address, exact + * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to + * what we mentioned above about state comparison ignoring precise markings + * during state comparison, BPF verifier ignores and also assumes precise + * markings *at will* during instruction verification process. But as verifier + * assumes precision, it also propagates any precision dependencies across + * parent states, which are not yet finalized, so can be further restricted + * based on new knowledge gained from restrictions enforced by their children + * states. This is so that once those parent states are finalized, i.e., when + * they have no more active children state, state comparison logic in + * is_state_visited() would enforce strict and precise SCALAR ranges, if + * required for correctness. + * + * To build a bit more intuition, note also that once a state is checkpointed, + * the path we took to get to that state is not important. This is crucial + * property for state pruning. When state is checkpointed and finalized at + * some instruction index, it can be correctly and safely used to "short + * circuit" any *compatible* state that reaches exactly the same instruction + * index. I.e., if we jumped to that instruction from a completely different + * code path than original finalized state was derived from, it doesn't + * matter, current state can be discarded because from that instruction + * forward having a compatible state will ensure we will safely reach the + * exit. States describe preconditions for further exploration, but completely + * forget the history of how we got here. + * + * This also means that even if we needed precise SCALAR range to get to + * finalized state, but from that point forward *that same* SCALAR register is + * never used in a precise context (i.e., it's precise value is not needed for + * correctness), it's correct and safe to mark such register as "imprecise" + * (i.e., precise marking set to false). This is what we rely on when we do + * not set precise marking in current state. If no child state requires + * precision for any given SCALAR register, it's safe to dictate that it can + * be imprecise. If any child state does require this register to be precise, + * we'll mark it precise later retroactively during precise markings + * propagation from child state to parent states. + * + * Skipping precise marking setting in current state is a mild version of + * relying on the above observation. But we can utilize this property even + * more aggressively by proactively forgetting any precise marking in the + * current state (which we inherited from the parent state), right before we + * checkpoint it and branch off into new child state. This is done by + * mark_all_scalars_imprecise() to hopefully get more permissive and generic + * finalized states which help in short circuiting more future states. + */ +static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno, int spi) { struct bpf_verifier_state *st = env->cur_state; @@ -2780,18 +2989,18 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, if (!env->bpf_capable) return 0; - func = st->frame[st->curframe]; + /* Do sanity checks against current state of register and/or stack + * slot, but don't set precise flag in current state, as precision + * tracking in the current state is unnecessary. + */ + func = st->frame[frame]; if (regno >= 0) { reg = &func->regs[regno]; if (reg->type != SCALAR_VALUE) { WARN_ONCE(1, "backtracing misuse"); return -EFAULT; } - if (!reg->precise) - new_marks = true; - else - reg_mask = 0; - reg->precise = true; + new_marks = true; } while (spi >= 0) { @@ -2804,11 +3013,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, stack_mask = 0; break; } - if (!reg->precise) - new_marks = true; - else - stack_mask = 0; - reg->precise = true; + new_marks = true; break; } @@ -2816,12 +3021,42 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, return 0; if (!reg_mask && !stack_mask) return 0; + for (;;) { DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); + + if (last_idx < 0) { + /* we are at the entry into subprog, which + * is expected for global funcs, but only if + * requested precise registers are R1-R5 + * (which are global func's input arguments) + */ + if (st->curframe == 0 && + st->frame[0]->subprogno > 0 && + st->frame[0]->callsite == BPF_MAIN_FUNC && + stack_mask == 0 && (reg_mask & ~0x3e) == 0) { + bitmap_from_u64(mask, reg_mask); + for_each_set_bit(i, mask, 32) { + reg = &st->frame[0]->regs[i]; + if (reg->type != SCALAR_VALUE) { + reg_mask &= ~(1u << i); + continue; + } + reg->precise = true; + } + return 0; + } + + verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n", + st->frame[0]->subprogno, reg_mask, stack_mask); + WARN_ONCE(1, "verifier backtracking bug"); + return -EFAULT; + } + for (i = last_idx;;) { if (skip_first) { err = 0; @@ -2861,7 +3096,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, break; new_marks = false; - func = st->frame[st->curframe]; + func = st->frame[frame]; bitmap_from_u64(mask, reg_mask); for_each_set_bit(i, mask, 32) { reg = &func->regs[i]; @@ -2927,12 +3162,17 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, int mark_chain_precision(struct bpf_verifier_env *env, int regno) { - return __mark_chain_precision(env, regno, -1); + return __mark_chain_precision(env, env->cur_state->curframe, regno, -1); } -static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno) { - return __mark_chain_precision(env, -1, spi); + return __mark_chain_precision(env, frame, regno, -1); +} + +static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi) +{ + return __mark_chain_precision(env, frame, -1, spi); } static bool is_spillable_regtype(enum bpf_reg_type type) @@ -3181,14 +3421,17 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; mark_stack_slot_scratched(env, spi); - if (!env->allow_ptr_leaks - && *stype != NOT_INIT - && *stype != SCALAR_VALUE) { - /* Reject the write if there's are spilled pointers in - * range. If we didn't reject here, the ptr status - * would be erased below (even though not all slots are - * actually overwritten), possibly opening the door to - * leaks. + if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) { + /* Reject the write if range we may write to has not + * been initialized beforehand. If we didn't reject + * here, the ptr status would be erased below (even + * though not all slots are actually overwritten), + * possibly opening the door to leaks. + * + * We do however catch STACK_INVALID case below, and + * only allow reading possibly uninitialized memory + * later for CAP_PERFMON, as the write may not happen to + * that slot. */ verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", insn_idx, i); @@ -3678,15 +3921,15 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, } static int map_kptr_match_type(struct bpf_verifier_env *env, - struct bpf_map_value_off_desc *off_desc, + struct btf_field *kptr_field, struct bpf_reg_state *reg, u32 regno) { - const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id); - int perm_flags = PTR_MAYBE_NULL; + const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); + int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) perm_flags |= PTR_UNTRUSTED; if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags)) @@ -3733,15 +3976,15 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, * strict mode to true for type match. */ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, - off_desc->kptr.btf, off_desc->kptr.btf_id, - off_desc->type == BPF_KPTR_REF)) + kptr_field->kptr.btf, kptr_field->kptr.btf_id, + kptr_field->type == BPF_KPTR_REF)) goto bad_type; return 0; bad_type: verbose(env, "invalid kptr access, R%d type=%s%s ", regno, reg_type_str(env, reg->type), reg_name); verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name); - if (off_desc->type == BPF_KPTR_UNREF) + if (kptr_field->type == BPF_KPTR_UNREF) verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED), targ_name); else @@ -3751,7 +3994,7 @@ bad_type: static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, int value_regno, int insn_idx, - struct bpf_map_value_off_desc *off_desc) + struct btf_field *kptr_field) { struct bpf_insn *insn = &env->prog->insnsi[insn_idx]; int class = BPF_CLASS(insn->code); @@ -3761,7 +4004,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, * - Reject cases where variable offset may touch kptr * - size of access (must be BPF_DW) * - tnum_is_const(reg->var_off) - * - off_desc->offset == off + reg->var_off.value + * - kptr_field->offset == off + reg->var_off.value */ /* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */ if (BPF_MODE(insn->code) != BPF_MEM) { @@ -3772,7 +4015,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We only allow loading referenced kptr, since it will be marked as * untrusted, similar to unreferenced kptr. */ - if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) { + if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) { verbose(env, "store to referenced kptr disallowed\n"); return -EACCES; } @@ -3782,19 +4025,19 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, /* We can simply mark the value_regno receiving the pointer * value from map as PTR_TO_BTF_ID, with the correct type. */ - mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf, - off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); + mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); /* For mark_ptr_or_null_reg */ val_reg->id = ++env->id_gen; } else if (class == BPF_STX) { val_reg = reg_state(env, value_regno); if (!register_is_null(val_reg) && - map_kptr_match_type(env, off_desc, val_reg, value_regno)) + map_kptr_match_type(env, kptr_field, val_reg, value_regno)) return -EACCES; } else if (class == BPF_ST) { if (insn->imm) { verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n", - off_desc->offset); + kptr_field->offset); return -EACCES; } } else { @@ -3813,45 +4056,30 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; struct bpf_map *map = reg->map_ptr; - int err; + struct btf_record *rec; + int err, i; err = check_mem_region_access(env, regno, off, size, map->value_size, zero_size_allowed); if (err) return err; - if (map_value_has_spin_lock(map)) { - u32 lock = map->spin_lock_off; + if (IS_ERR_OR_NULL(map->record)) + return 0; + rec = map->record; + for (i = 0; i < rec->cnt; i++) { + struct btf_field *field = &rec->fields[i]; + u32 p = field->offset; - /* if any part of struct bpf_spin_lock can be touched by - * load/store reject this program. - * To check that [x1, x2) overlaps with [y1, y2) + /* If any part of a field can be touched by load/store, reject + * this program. To check that [x1, x2) overlaps with [y1, y2), * it is sufficient to check x1 < y2 && y1 < x2. */ - if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && - lock < reg->umax_value + off + size) { - verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - if (map_value_has_timer(map)) { - u32 t = map->timer_off; - - if (reg->smin_value + off < t + sizeof(struct bpf_timer) && - t < reg->umax_value + off + size) { - verbose(env, "bpf_timer cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - if (map_value_has_kptrs(map)) { - struct bpf_map_value_off *tab = map->kptr_off_tab; - int i; - - for (i = 0; i < tab->nr_off; i++) { - u32 p = tab->off[i].offset; - - if (reg->smin_value + off < p + sizeof(u64) && - p < reg->umax_value + off + size) { + if (reg->smin_value + off < p + btf_field_type_size(field->type) && + p < reg->umax_value + off + size) { + switch (field->type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: if (src != ACCESS_DIRECT) { verbose(env, "kptr cannot be accessed indirectly by helper\n"); return -EACCES; @@ -3870,10 +4098,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, return -EACCES; } break; + default: + verbose(env, "%s cannot be accessed directly by load/store\n", + btf_field_type_name(field->type)); + return -EACCES; } } } - return err; + return 0; } #define MAX_PACKET_OFF 0xffff @@ -4090,6 +4322,30 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_FLOW_KEYS; } +static bool is_trusted_reg(const struct bpf_reg_state *reg) +{ + /* A referenced register is always trusted. */ + if (reg->ref_obj_id) + return true; + + /* If a register is not referenced, it is trusted if it has the + * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the + * other type modifiers may be safe, but we elect to take an opt-in + * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are + * not. + * + * Eventually, we should make PTR_TRUSTED the single source of truth + * for whether a register is trusted. + */ + return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS && + !bpf_type_has_unsafe_modifiers(reg->type); +} + +static bool is_rcu_reg(const struct bpf_reg_state *reg) +{ + return reg->type & MEM_RCU; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -4506,6 +4762,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, u32 btf_id; int ret; + if (!env->allow_ptr_leaks) { + verbose(env, + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + tname); + return -EPERM; + } + if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) { + verbose(env, + "Cannot access kernel 'struct %s' from non-GPL compatible program\n", + tname); + return -EINVAL; + } if (off < 0) { verbose(env, "R%d is ptr_%s invalid negative access: off=%d\n", @@ -4536,17 +4804,28 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } - if (env->ops->btf_struct_access) { - ret = env->ops->btf_struct_access(&env->log, reg->btf, t, - off, size, atype, &btf_id, &flag); + if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) { + if (!btf_is_kernel(reg->btf)) { + verbose(env, "verifier internal error: reg->btf must be kernel btf\n"); + return -EFAULT; + } + ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } else { - if (atype != BPF_READ) { + /* Writes are permitted with default btf_struct_access for + * program allocated objects (which always have ref_obj_id > 0), + * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC. + */ + if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { verbose(env, "only read is supported\n"); return -EACCES; } - ret = btf_struct_access(&env->log, reg->btf, t, off, size, - atype, &btf_id, &flag); + if (type_is_alloc(reg->type) && !reg->ref_obj_id) { + verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n"); + return -EFAULT; + } + + ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } if (ret < 0) @@ -4558,6 +4837,30 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_flag(reg->type) & PTR_UNTRUSTED) flag |= PTR_UNTRUSTED; + /* By default any pointer obtained from walking a trusted pointer is + * no longer trusted except the rcu case below. + */ + flag &= ~PTR_TRUSTED; + + if (flag & MEM_RCU) { + /* Mark value register as MEM_RCU only if it is protected by + * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU + * itself can already indicate trustedness inside the rcu + * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since + * it could be null in some cases. + */ + if (!env->cur_state->active_rcu_lock || + !(is_trusted_reg(reg) || is_rcu_reg(reg))) + flag &= ~MEM_RCU; + else + flag |= PTR_MAYBE_NULL; + } else if (reg->type & MEM_RCU) { + /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged + * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. + */ + flag |= PTR_UNTRUSTED; + } + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -4572,6 +4875,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; + struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; @@ -4592,9 +4896,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id); tname = btf_name_by_offset(btf_vmlinux, t->name_off); - if (!env->allow_ptr_to_map_access) { + if (!env->allow_ptr_leaks) { verbose(env, - "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", + "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n", tname); return -EPERM; } @@ -4610,7 +4914,10 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); + /* Simulate access to a PTR_TO_BTF_ID */ + memset(&map_reg, 0, sizeof(map_reg)); + mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); + ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag); if (ret < 0) return ret; @@ -4746,7 +5053,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_MAP_VALUE) { - struct bpf_map_value_off_desc *kptr_off_desc = NULL; + struct btf_field *kptr_field = NULL; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { @@ -4760,11 +5067,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; if (tnum_is_const(reg->var_off)) - kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr, - off + reg->var_off.value); - if (kptr_off_desc) { - err = check_map_kptr_access(env, regno, value_regno, insn_idx, - kptr_off_desc); + kptr_field = btf_record_find(reg->map_ptr->record, + off + reg->var_off.value, BPF_KPTR); + if (kptr_field) { + err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field); } else if (t == BPF_READ && value_regno >= 0) { struct bpf_map *map = reg->map_ptr; @@ -5155,10 +5461,6 @@ static int check_stack_range_initialized( } if (is_spilled_reg(&state->stack[spi]) && - base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) - goto mark; - - if (is_spilled_reg(&state->stack[spi]) && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { if (clobber) { @@ -5188,6 +5490,11 @@ mark: mark_reg_read(env, &state->stack[spi].spilled_ptr, state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64); + /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not + * be sure that whether stack slot is written to or not. Hence, + * we must still conservatively propagate reads upwards even if + * helper may write to the entire memory range. + */ } return update_stack_depth(env, state, min_off); } @@ -5369,8 +5676,8 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return err; } -int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno) +static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno) { struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); @@ -5398,23 +5705,26 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state } /* Implementation details: - * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL. + * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL. * Two bpf_map_lookups (even with the same key) will have different reg->id. - * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after - * value_or_null->value transition, since the verifier only cares about - * the range of access to valid map value pointer and doesn't care about actual - * address of the map element. + * Two separate bpf_obj_new will also have different reg->id. + * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier + * clears reg->id after value_or_null->value transition, since the verifier only + * cares about the range of access to valid map value pointer and doesn't care + * about actual address of the map element. * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps * reg->id > 0 after value_or_null->value transition. By doing so * two bpf_map_lookups will be considered two different pointers that - * point to different bpf_spin_locks. + * point to different bpf_spin_locks. Likewise for pointers to allocated objects + * returned from bpf_obj_new. * The verifier allows taking only one bpf_spin_lock at a time to avoid * dead-locks. * Since only one bpf_spin_lock is allowed the checks are simpler than * reg_is_refcounted() logic. The verifier needs to remember only * one spin_lock instead of array of acquired_refs. - * cur_state->active_spin_lock remembers which map value element got locked - * and clears it after bpf_spin_unlock. + * cur_state->active_lock remembers which map value element or allocated + * object got locked and clears it after bpf_spin_unlock. */ static int process_spin_lock(struct bpf_verifier_env *env, int regno, bool is_lock) @@ -5422,8 +5732,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; struct bpf_verifier_state *cur = env->cur_state; bool is_const = tnum_is_const(reg->var_off); - struct bpf_map *map = reg->map_ptr; u64 val = reg->var_off.value; + struct bpf_map *map = NULL; + struct btf *btf = NULL; + struct btf_record *rec; if (!is_const) { verbose(env, @@ -5431,49 +5743,78 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, regno); return -EINVAL; } - if (!map->btf) { - verbose(env, - "map '%s' has to have BTF in order to use bpf_spin_lock\n", - map->name); - return -EINVAL; - } - if (!map_value_has_spin_lock(map)) { - if (map->spin_lock_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_spin_lock'\n", - map->name); - else if (map->spin_lock_off == -ENOENT) + if (reg->type == PTR_TO_MAP_VALUE) { + map = reg->map_ptr; + if (!map->btf) { verbose(env, - "map '%s' doesn't have 'struct bpf_spin_lock'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_spin_lock is mangled\n", + "map '%s' has to have BTF in order to use bpf_spin_lock\n", map->name); + return -EINVAL; + } + } else { + btf = reg->btf; + } + + rec = reg_btf_record(reg); + if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) { + verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local", + map ? map->name : "kptr"); return -EINVAL; } - if (map->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", - val + reg->off); + if (rec->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", + val + reg->off, rec->spin_lock_off); return -EINVAL; } if (is_lock) { - if (cur->active_spin_lock) { + if (cur->active_lock.ptr) { verbose(env, "Locking two bpf_spin_locks are not allowed\n"); return -EINVAL; } - cur->active_spin_lock = reg->id; + if (map) + cur->active_lock.ptr = map; + else + cur->active_lock.ptr = btf; + cur->active_lock.id = reg->id; } else { - if (!cur->active_spin_lock) { + struct bpf_func_state *fstate = cur_func(env); + void *ptr; + int i; + + if (map) + ptr = map; + else + ptr = btf; + + if (!cur->active_lock.ptr) { verbose(env, "bpf_spin_unlock without taking a lock\n"); return -EINVAL; } - if (cur->active_spin_lock != reg->id) { + if (cur->active_lock.ptr != ptr || + cur->active_lock.id != reg->id) { verbose(env, "bpf_spin_unlock of different lock\n"); return -EINVAL; } - cur->active_spin_lock = 0; + cur->active_lock.ptr = NULL; + cur->active_lock.id = 0; + + for (i = fstate->acquired_refs - 1; i >= 0; i--) { + int err; + + /* Complain on error because this reference state cannot + * be freed before this point, as bpf_spin_lock critical + * section does not allow functions that release the + * allocated object immediately. + */ + if (!fstate->refs[i].release_on_unlock) + continue; + err = release_reference(env, fstate->refs[i].id); + if (err) { + verbose(env, "failed to release release_on_unlock reference"); + return err; + } + } } return 0; } @@ -5497,24 +5838,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_timer(map)) { - if (map->timer_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_timer'\n", - map->name); - else if (map->timer_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_timer'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_timer is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_TIMER)) { + verbose(env, "map '%s' has no valid bpf_timer\n", map->name); return -EINVAL; } - if (map->timer_off != val + reg->off) { + if (map->record->timer_off != val + reg->off) { verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->timer_off); + val + reg->off, map->record->timer_off); return -EINVAL; } if (meta->map_ptr) { @@ -5530,10 +5860,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; - struct bpf_map_value_off_desc *off_desc; struct bpf_map *map_ptr = reg->map_ptr; + struct btf_field *kptr_field; u32 kptr_off; - int ret; if (!tnum_is_const(reg->var_off)) { verbose(env, @@ -5546,30 +5875,136 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno, map_ptr->name); return -EINVAL; } - if (!map_value_has_kptrs(map_ptr)) { - ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab); - if (ret == -E2BIG) - verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name, - BPF_MAP_VALUE_OFF_MAX); - else if (ret == -EEXIST) - verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name); - else - verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); + if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) { + verbose(env, "map '%s' has no valid kptr\n", map_ptr->name); return -EINVAL; } meta->map_ptr = map_ptr; kptr_off = reg->off + reg->var_off.value; - off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off); - if (!off_desc) { + kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR); + if (!kptr_field) { verbose(env, "off=%d doesn't point to kptr\n", kptr_off); return -EACCES; } - if (off_desc->type != BPF_KPTR_REF) { + if (kptr_field->type != BPF_KPTR_REF) { verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off); return -EACCES; } - meta->kptr_off_desc = off_desc; + meta->kptr_field = kptr_field; + return 0; +} + +/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK + * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR. + * + * In both cases we deal with the first 8 bytes, but need to mark the next 8 + * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of + * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object. + * + * Mutability of bpf_dynptr is at two levels, one is at the level of struct + * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct + * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can + * mutate the view of the dynptr and also possibly destroy it. In the latter + * case, it cannot mutate the bpf_dynptr itself but it can still mutate the + * memory that dynptr points to. + * + * The verifier will keep track both levels of mutation (bpf_dynptr's in + * reg->type and the memory's in reg->dynptr.type), but there is no support for + * readonly dynptr view yet, hence only the first case is tracked and checked. + * + * This is consistent with how C applies the const modifier to a struct object, + * where the pointer itself inside bpf_dynptr becomes const but not what it + * points to. + * + * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument + * type, and declare it as 'const struct bpf_dynptr *' in their prototype. + */ +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + + /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an + * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*): + */ + if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) { + verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n"); + return -EFAULT; + } + /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to + * check_func_arg_reg_off's logic. We only need to check offset + * alignment for PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) { + verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off); + return -EINVAL; + } + /* MEM_UNINIT - Points to memory that is an appropriate candidate for + * constructing a mutable bpf_dynptr object. + * + * Currently, this is only possible with PTR_TO_STACK + * pointing to a region of at least 16 bytes which doesn't + * contain an existing bpf_dynptr. + * + * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be + * mutated or destroyed. However, the memory it points to + * may be mutated. + * + * None - Points to a initialized dynptr that can be mutated and + * destroyed, including mutation of the memory it points + * to. + */ + if (arg_type & MEM_UNINIT) { + if (!is_dynptr_reg_valid_uninit(env, reg)) { + verbose(env, "Dynptr has to be an uninitialized dynptr\n"); + return -EINVAL; + } + + /* We only support one dynptr being uninitialized at the moment, + * which is sufficient for the helper functions we have right now. + */ + if (meta->uninit_dynptr_regno) { + verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); + return -EFAULT; + } + + meta->uninit_dynptr_regno = regno; + } else /* MEM_RDONLY and None case from above */ { + /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */ + if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) { + verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n"); + return -EINVAL; + } + + if (!is_dynptr_reg_valid_init(env, reg)) { + verbose(env, + "Expected an initialized dynptr as arg #%d\n", + regno); + return -EINVAL; + } + + /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */ + if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { + const char *err_extra = ""; + + switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { + case DYNPTR_TYPE_LOCAL: + err_extra = "local"; + break; + case DYNPTR_TYPE_RINGBUF: + err_extra = "ringbuf"; + break; + default: + err_extra = "<unknown>"; + break; + } + verbose(env, + "Expected a dynptr of type %s as arg #%d\n", + err_extra, regno); + return -EINVAL; + } + } return 0; } @@ -5634,16 +6069,6 @@ struct bpf_reg_types { u32 *btf_id; }; -static const struct bpf_reg_types map_key_value_types = { - .types = { - PTR_TO_STACK, - PTR_TO_PACKET, - PTR_TO_PACKET_META, - PTR_TO_MAP_KEY, - PTR_TO_MAP_VALUE, - }, -}; - static const struct bpf_reg_types sock_types = { .types = { PTR_TO_SOCK_COMMON, @@ -5661,6 +6086,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = { PTR_TO_TCP_SOCK, PTR_TO_XDP_SOCK, PTR_TO_BTF_ID, + PTR_TO_BTF_ID | PTR_TRUSTED, }, .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], }; @@ -5674,7 +6100,7 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_MEM | MEM_ALLOC, + PTR_TO_MEM | MEM_RINGBUF, PTR_TO_BUF, }, }; @@ -5689,14 +6115,31 @@ static const struct bpf_reg_types int_ptr_types = { }, }; +static const struct bpf_reg_types spin_lock_types = { + .types = { + PTR_TO_MAP_VALUE, + PTR_TO_BTF_ID | MEM_ALLOC, + } +}; + static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; +static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; -static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; -static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; +static const struct bpf_reg_types btf_ptr_types = { + .types = { + PTR_TO_BTF_ID, + PTR_TO_BTF_ID | PTR_TRUSTED, + PTR_TO_BTF_ID | MEM_RCU, + }, +}; +static const struct bpf_reg_types percpu_btf_ptr_types = { + .types = { + PTR_TO_BTF_ID | MEM_PERCPU, + PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED, + } +}; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5705,13 +6148,13 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } } static const struct bpf_reg_types dynptr_types = { .types = { PTR_TO_STACK, - PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL, + CONST_PTR_TO_DYNPTR, } }; static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { - [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, - [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, + [ARG_PTR_TO_MAP_KEY] = &mem_types, + [ARG_PTR_TO_MAP_VALUE] = &mem_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, @@ -5725,7 +6168,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, + [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, @@ -5784,7 +6227,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EACCES; found: - if (reg->type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This * allows bpf_sk_release to work for multiple socket types. @@ -5801,7 +6244,7 @@ found: } if (meta->func_id == BPF_FUNC_kptr_xchg) { - if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno)) + if (map_kptr_match_type(env, meta->kptr_field, reg, regno)) return -EACCES; } else { if (arg_btf_id == BPF_PTR_POISON) { @@ -5820,6 +6263,11 @@ found: return -EACCES; } } + } else if (type_is_alloc(reg->type)) { + if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) { + verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n"); + return -EFAULT; + } } return 0; @@ -5829,64 +6277,80 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type) { - enum bpf_reg_type type = reg->type; - bool fixed_off_ok = false; + u32 type = reg->type; - switch ((u32)type) { - /* Pointer types where reg offset is explicitly allowed: */ - case PTR_TO_STACK: - if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) { - verbose(env, "cannot pass in dynptr at an offset\n"); + /* When referenced register is passed to release function, its fixed + * offset must be 0. + * + * We will check arg_type_is_release reg has ref_obj_id when storing + * meta->release_regno. + */ + if (arg_type_is_release(arg_type)) { + /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it + * may not directly point to the object being released, but to + * dynptr pointing to such object, which might be at some offset + * on the stack. In that case, we simply to fallback to the + * default handling. + */ + if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK) + return 0; + /* Doing check_ptr_off_reg check for the offset will catch this + * because fixed_off_ok is false, but checking here allows us + * to give the user a better error message. + */ + if (reg->off) { + verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n", + regno); return -EINVAL; } - fallthrough; + return __check_ptr_off_reg(env, reg, regno, false); + } + + switch (type) { + /* Pointer types where both fixed and variable offset is explicitly allowed: */ + case PTR_TO_STACK: case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: case PTR_TO_MEM: case PTR_TO_MEM | MEM_RDONLY: - case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_MEM | MEM_RINGBUF: case PTR_TO_BUF: case PTR_TO_BUF | MEM_RDONLY: case SCALAR_VALUE: - /* Some of the argument types nevertheless require a - * zero register offset. - */ - if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM) - return 0; - break; + return 0; /* All the rest must be rejected, except PTR_TO_BTF_ID which allows * fixed offset. */ case PTR_TO_BTF_ID: + case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_RCU: + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: /* When referenced PTR_TO_BTF_ID is passed to release function, - * it's fixed offset must be 0. In the other cases, fixed offset - * can be non-zero. + * its fixed offset must be 0. In the other cases, fixed offset + * can be non-zero. This was already checked above. So pass + * fixed_off_ok as true to allow fixed offset for all other + * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we + * still need to do checks instead of returning. */ - if (arg_type_is_release(arg_type) && reg->off) { - verbose(env, "R%d must have zero offset when passed to release func\n", - regno); - return -EINVAL; - } - /* For arg is release pointer, fixed_off_ok must be false, but - * we already checked and rejected reg->off != 0 above, so set - * to true to allow fixed offset for all other cases. - */ - fixed_off_ok = true; - break; + return __check_ptr_off_reg(env, reg, regno, true); default: - break; + return __check_ptr_off_reg(env, reg, regno, false); } - return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } -static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; + + if (reg->type == CONST_PTR_TO_DYNPTR) + return reg->ref_obj_id; - return state->stack[spi].spilled_ptr.id; + spi = get_spi(reg->off); + return state->stack[spi].spilled_ptr.ref_obj_id; } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -5935,7 +6399,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, goto skip_type_check; /* arg_btf_id and arg_size are in a union. */ - if (base_type(arg_type) == ARG_PTR_TO_BTF_ID) + if (base_type(arg_type) == ARG_PTR_TO_BTF_ID || + base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); @@ -5950,11 +6415,22 @@ skip_type_check: if (arg_type_is_release(arg_type)) { if (arg_type_is_dynptr(arg_type)) { struct bpf_func_state *state = func(env, reg); - int spi = get_spi(reg->off); + int spi; - if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || - !state->stack[spi].spilled_ptr.id) { - verbose(env, "arg %d is an unacquired reference\n", regno); + /* Only dynptr created on stack can be released, thus + * the get_spi and stack state checks for spilled_ptr + * should only be done before process_dynptr_func for + * PTR_TO_STACK. + */ + if (reg->type == PTR_TO_STACK) { + spi = get_spi(reg->off); + if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) || + !state->stack[spi].spilled_ptr.ref_obj_id) { + verbose(env, "arg %d is an unacquired reference\n", regno); + return -EINVAL; + } + } else { + verbose(env, "cannot release unowned const bpf_dynptr\n"); return -EINVAL; } } else if (!reg->ref_obj_id && !register_is_null(reg)) { @@ -6051,19 +6527,22 @@ skip_type_check: break; case ARG_PTR_TO_SPIN_LOCK: if (meta->func_id == BPF_FUNC_spin_lock) { - if (process_spin_lock(env, regno, true)) - return -EACCES; + err = process_spin_lock(env, regno, true); + if (err) + return err; } else if (meta->func_id == BPF_FUNC_spin_unlock) { - if (process_spin_lock(env, regno, false)) - return -EACCES; + err = process_spin_lock(env, regno, false); + if (err) + return err; } else { verbose(env, "verifier internal error\n"); return -EFAULT; } break; case ARG_PTR_TO_TIMER: - if (process_timer_func(env, regno, meta)) - return -EACCES; + err = process_timer_func(env, regno, meta); + if (err) + return err; break; case ARG_PTR_TO_FUNC: meta->subprogno = reg->subprogno; @@ -6086,52 +6565,9 @@ skip_type_check: err = check_mem_size_reg(env, reg, regno, true, meta); break; case ARG_PTR_TO_DYNPTR: - /* We only need to check for initialized / uninitialized helper - * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the - * assumption is that if it is, that a helper function - * initialized the dynptr on behalf of the BPF program. - */ - if (base_type(reg->type) == PTR_TO_DYNPTR) - break; - if (arg_type & MEM_UNINIT) { - if (!is_dynptr_reg_valid_uninit(env, reg)) { - verbose(env, "Dynptr has to be an uninitialized dynptr\n"); - return -EINVAL; - } - - /* We only support one dynptr being uninitialized at the moment, - * which is sufficient for the helper functions we have right now. - */ - if (meta->uninit_dynptr_regno) { - verbose(env, "verifier internal error: multiple uninitialized dynptr args\n"); - return -EFAULT; - } - - meta->uninit_dynptr_regno = regno; - } else if (!is_dynptr_reg_valid_init(env, reg)) { - verbose(env, - "Expected an initialized dynptr as arg #%d\n", - arg + 1); - return -EINVAL; - } else if (!is_dynptr_type_expected(env, reg, arg_type)) { - const char *err_extra = ""; - - switch (arg_type & DYNPTR_TYPE_FLAG_MASK) { - case DYNPTR_TYPE_LOCAL: - err_extra = "local"; - break; - case DYNPTR_TYPE_RINGBUF: - err_extra = "ringbuf"; - break; - default: - err_extra = "<unknown>"; - break; - } - verbose(env, - "Expected a dynptr of type %s as arg #%d\n", - err_extra, arg + 1); - return -EINVAL; - } + err = process_dynptr_func(env, regno, arg_type, meta); + if (err) + return err; break; case ARG_CONST_ALLOC_SIZE_OR_ZERO: if (!tnum_is_const(reg->var_off)) { @@ -6198,8 +6634,9 @@ skip_type_check: break; } case ARG_PTR_TO_KPTR: - if (process_kptr_func(env, regno, meta)) - return -EACCES; + err = process_kptr_func(env, regno, meta); + if (err) + return err; break; } @@ -6360,6 +6797,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_task_storage_delete) goto error; break; + case BPF_MAP_TYPE_CGRP_STORAGE: + if (func_id != BPF_FUNC_cgrp_storage_get && + func_id != BPF_FUNC_cgrp_storage_delete) + goto error; + break; case BPF_MAP_TYPE_BLOOM_FILTER: if (func_id != BPF_FUNC_map_peek_elem && func_id != BPF_FUNC_map_push_elem) @@ -6472,6 +6914,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) goto error; break; + case BPF_FUNC_cgrp_storage_get: + case BPF_FUNC_cgrp_storage_delete: + if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) + goto error; + break; default: break; } @@ -6543,9 +6990,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn) int i; for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) { - if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i]) - return false; - + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID) + return !!fn->arg_btf_id[i]; + if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK) + return fn->arg_btf_id[i] == BPF_PTR_POISON; if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] && /* arg_btf_id and arg_size are in a union. */ (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM || @@ -6618,8 +7066,12 @@ static int release_reference(struct bpf_verifier_env *env, return err; bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ - if (reg->ref_obj_id == ref_obj_id) - __mark_reg_unknown(env, reg); + if (reg->ref_obj_id == ref_obj_id) { + if (!env->allow_ptr_leaks) + __mark_reg_not_init(env, reg); + else + __mark_reg_unknown(env, reg); + } })); return 0; @@ -6642,6 +7094,10 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env, struct bpf_func_state *callee, int insn_idx); +static int set_callee_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, int insn_idx); + static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx, int subprog, set_callee_state_fn set_callee_state_cb) @@ -6692,6 +7148,16 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } } + /* set_callee_state is used for direct subprog calls, but we are + * interested in validating only BPF helpers that can call subprogs as + * callbacks + */ + if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) { + verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } + if (insn->code == (BPF_JMP | BPF_CALL) && insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) { @@ -6736,11 +7202,11 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn /* Transfer references to the callee */ err = copy_reference_state(callee, caller); if (err) - return err; + goto err_out; err = set_callee_state_cb(env, caller, callee, *insn_idx); if (err) - return err; + goto err_out; clear_caller_saved_regs(env, caller->regs); @@ -6757,6 +7223,11 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn print_verifier_state(env, callee, true); } return 0; + +err_out: + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return err; } int map_set_for_each_callback_args(struct bpf_verifier_env *env, @@ -6933,11 +7404,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env, { /* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void * callback_ctx, u64 flags); - * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx); + * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx); */ __mark_reg_not_init(env, &callee->regs[BPF_REG_0]); - callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL; - __mark_reg_known_zero(&callee->regs[BPF_REG_1]); + mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL); callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; /* unused */ @@ -6970,8 +7440,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return -EINVAL; } - state->curframe--; - caller = state->frame[state->curframe]; + caller = state->frame[state->curframe - 1]; if (callee->in_callback_fn) { /* enforce R0 return value range [0, 1]. */ struct tnum range = callee->callback_ret_range; @@ -7010,7 +7479,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) } /* clear everything in the callee */ free_func_state(callee); - state->frame[state->curframe + 1] = NULL; + state->frame[state->curframe--] = NULL; return 0; } @@ -7270,6 +7739,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EINVAL; } + if (!env->prog->aux->sleepable && fn->might_sleep) { + verbose(env, "helper call might sleep in a non-sleepable prog\n"); + return -EINVAL; + } + /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { @@ -7288,6 +7762,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } + if (env->cur_state->active_rcu_lock) { + if (fn->might_sleep) { + verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n", + func_id_name(func_id), func_id); + return -EINVAL; + } + + if (env->prog->aux->sleepable && is_storage_get_function(func_id)) + env->insn_aux_data[insn_idx].storage_get_func_atomic = true; + } + meta.func_id = func_id; /* check args */ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) { @@ -7316,7 +7801,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr + * is safe to do directly. + */ if (meta.uninit_dynptr_regno) { + if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n"); + return -EFAULT; + } /* we write BPF_DW bits (8 bytes) at a time */ for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) { err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno, @@ -7334,15 +7827,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (meta.release_regno) { err = -EINVAL; - if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) + /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot + * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr + * is safe to do directly. + */ + if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) { + if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) { + verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n"); + return -EFAULT; + } err = unmark_stack_slots_dynptr(env, ®s[meta.release_regno]); - else if (meta.ref_obj_id) + } else if (meta.ref_obj_id) { err = release_reference(env, meta.ref_obj_id); - /* meta.ref_obj_id can only be 0 if register that is meant to be - * released is NULL, which must be > R0. - */ - else if (register_is_null(®s[meta.release_regno])) + } else if (register_is_null(®s[meta.release_regno])) { + /* meta.ref_obj_id can only be 0 if register that is meant to be + * released is NULL, which must be > R0. + */ err = 0; + } if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", func_id_name(func_id), func_id); @@ -7416,11 +7918,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return -EFAULT; } - if (base_type(reg->type) != PTR_TO_DYNPTR) - /* Find the id of the dynptr we're - * tracking the reference of - */ - meta.ref_obj_id = stack_slot_get_id(env, reg); + meta.ref_obj_id = dynptr_ref_obj_id(env, reg); break; } } @@ -7475,7 +7973,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_type) && - map_value_has_spin_lock(meta.map_ptr)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -7491,7 +7989,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; break; - case RET_PTR_TO_ALLOC_MEM: + case RET_PTR_TO_MEM: mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; @@ -7539,8 +8037,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; if (func_id == BPF_FUNC_kptr_xchg) { - ret_btf = meta.kptr_off_desc->kptr.btf; - ret_btf_id = meta.kptr_off_desc->kptr.btf_id; + ret_btf = meta.kptr_field->kptr.btf; + ret_btf_id = meta.kptr_field->kptr.btf_id; } else { if (fn->ret_btf_id == BPF_PTR_POISON) { verbose(env, "verifier internal error:"); @@ -7654,19 +8152,926 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } +struct bpf_kfunc_call_arg_meta { + /* In parameters */ + struct btf *btf; + u32 func_id; + u32 kfunc_flags; + const struct btf_type *func_proto; + const char *func_name; + /* Out parameters */ + u32 ref_obj_id; + u8 release_regno; + bool r0_rdonly; + u32 ret_btf_id; + u64 r0_size; + struct { + u64 value; + bool found; + } arg_constant; + struct { + struct btf *btf; + u32 btf_id; + } arg_obj_drop; + struct { + struct btf_field *field; + } arg_list_head; +}; + +static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_ACQUIRE; +} + +static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RET_NULL; +} + +static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RELEASE; +} + +static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_TRUSTED_ARGS; +} + +static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_SLEEPABLE; +} + +static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_DESTRUCTIVE; +} + +static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->kfunc_flags & KF_RCU; +} + +static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) +{ + return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); +} + +static bool __kfunc_param_match_suffix(const struct btf *btf, + const struct btf_param *arg, + const char *suffix) +{ + int suffix_len = strlen(suffix), len; + const char *param_name; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len < suffix_len) + return false; + param_name += len - suffix_len; + return !strncmp(param_name, suffix, suffix_len); +} + +static bool is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + const struct btf_type *t; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + return __kfunc_param_match_suffix(btf, arg, "__sz"); +} + +static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__k"); +} + +static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__ign"); +} + +static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg) +{ + return __kfunc_param_match_suffix(btf, arg, "__alloc"); +} + +static bool is_kfunc_arg_scalar_with_name(const struct btf *btf, + const struct btf_param *arg, + const char *name) +{ + int len, target_len = strlen(name); + const char *param_name; + + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len != target_len) + return false; + if (strcmp(param_name, name)) + return false; + + return true; +} + +enum { + KF_ARG_DYNPTR_ID, + KF_ARG_LIST_HEAD_ID, + KF_ARG_LIST_NODE_ID, +}; + +BTF_ID_LIST(kf_arg_btf_ids) +BTF_ID(struct, bpf_dynptr_kern) +BTF_ID(struct, bpf_list_head) +BTF_ID(struct, bpf_list_node) + +static bool __is_kfunc_ptr_arg_type(const struct btf *btf, + const struct btf_param *arg, int type) +{ + const struct btf_type *t; + u32 res_id; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!t) + return false; + if (!btf_type_is_ptr(t)) + return false; + t = btf_type_skip_modifiers(btf, t->type, &res_id); + if (!t) + return false; + return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]); +} + +static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID); +} + +static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID); +} + +static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg) +{ + return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID); +} + +/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ +static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env, + const struct btf *btf, + const struct btf_type *t, int rec) +{ + const struct btf_type *member_type; + const struct btf_member *member; + u32 i; + + if (!btf_type_is_struct(t)) + return false; + + for_each_member(i, t, member) { + const struct btf_array *array; + + member_type = btf_type_skip_modifiers(btf, member->type, NULL); + if (btf_type_is_struct(member_type)) { + if (rec >= 3) { + verbose(env, "max struct nesting depth exceeded\n"); + return false; + } + if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1)) + return false; + continue; + } + if (btf_type_is_array(member_type)) { + array = btf_array(member_type); + if (!array->nelems) + return false; + member_type = btf_type_skip_modifiers(btf, array->type, NULL); + if (!btf_type_is_scalar(member_type)) + return false; + continue; + } + if (!btf_type_is_scalar(member_type)) + return false; + } + return true; +} + + +static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { +#ifdef CONFIG_NET + [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], + [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP], +#endif +}; + +enum kfunc_ptr_arg_type { + KF_ARG_PTR_TO_CTX, + KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */ + KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */ + KF_ARG_PTR_TO_DYNPTR, + KF_ARG_PTR_TO_LIST_HEAD, + KF_ARG_PTR_TO_LIST_NODE, + KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */ + KF_ARG_PTR_TO_MEM, + KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */ +}; + +enum special_kfunc_type { + KF_bpf_obj_new_impl, + KF_bpf_obj_drop_impl, + KF_bpf_list_push_front, + KF_bpf_list_push_back, + KF_bpf_list_pop_front, + KF_bpf_list_pop_back, + KF_bpf_cast_to_kern_ctx, + KF_bpf_rdonly_cast, + KF_bpf_rcu_read_lock, + KF_bpf_rcu_read_unlock, +}; + +BTF_SET_START(special_kfunc_set) +BTF_ID(func, bpf_obj_new_impl) +BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_list_push_front) +BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_pop_front) +BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_cast_to_kern_ctx) +BTF_ID(func, bpf_rdonly_cast) +BTF_SET_END(special_kfunc_set) + +BTF_ID_LIST(special_kfunc_list) +BTF_ID(func, bpf_obj_new_impl) +BTF_ID(func, bpf_obj_drop_impl) +BTF_ID(func, bpf_list_push_front) +BTF_ID(func, bpf_list_push_back) +BTF_ID(func, bpf_list_pop_front) +BTF_ID(func, bpf_list_pop_back) +BTF_ID(func, bpf_cast_to_kern_ctx) +BTF_ID(func, bpf_rdonly_cast) +BTF_ID(func, bpf_rcu_read_lock) +BTF_ID(func, bpf_rcu_read_unlock) + +static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock]; +} + +static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta) +{ + return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock]; +} + +static enum kfunc_ptr_arg_type +get_kfunc_ptr_arg_type(struct bpf_verifier_env *env, + struct bpf_kfunc_call_arg_meta *meta, + const struct btf_type *t, const struct btf_type *ref_t, + const char *ref_tname, const struct btf_param *args, + int argno, int nargs) +{ + u32 regno = argno + 1; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = ®s[regno]; + bool arg_mem_size = false; + + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) + return KF_ARG_PTR_TO_CTX; + + /* In this function, we verify the kfunc's BTF as per the argument type, + * leaving the rest of the verification with respect to the register + * type to our caller. When a set of conditions hold in the BTF type of + * arguments, we resolve it to a known kfunc_ptr_arg_type. + */ + if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno)) + return KF_ARG_PTR_TO_CTX; + + if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_ALLOC_BTF_ID; + + if (is_kfunc_arg_kptr_get(meta, argno)) { + if (!btf_type_is_ptr(ref_t)) { + verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n"); + return -EINVAL; + } + ref_t = btf_type_by_id(meta->btf, ref_t->type); + ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off); + if (!btf_type_is_struct(ref_t)) { + verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n", + meta->func_name, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + return KF_ARG_PTR_TO_KPTR; + } + + if (is_kfunc_arg_dynptr(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_DYNPTR; + + if (is_kfunc_arg_list_head(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_LIST_HEAD; + + if (is_kfunc_arg_list_node(meta->btf, &args[argno])) + return KF_ARG_PTR_TO_LIST_NODE; + + if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) { + if (!btf_type_is_struct(ref_t)) { + verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n", + meta->func_name, argno, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + return KF_ARG_PTR_TO_BTF_ID; + } + + if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], ®s[regno + 1])) + arg_mem_size = true; + + /* This is the catch all argument type of register types supported by + * check_helper_mem_access. However, we only allow when argument type is + * pointer to scalar, or struct composed (recursively) of scalars. When + * arg_mem_size is true, the pointer can be void *. + */ + if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) && + (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { + verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", + argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); + return -EINVAL; + } + return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM; +} + +static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const struct btf_type *ref_t, + const char *ref_tname, u32 ref_id, + struct bpf_kfunc_call_arg_meta *meta, + int argno) +{ + const struct btf_type *reg_ref_t; + bool strict_type_match = false; + const struct btf *reg_btf; + const char *reg_ref_tname; + u32 reg_ref_id; + + if (base_type(reg->type) == PTR_TO_BTF_ID) { + reg_btf = reg->btf; + reg_ref_id = reg->btf_id; + } else { + reg_btf = btf_vmlinux; + reg_ref_id = *reg2btf_ids[base_type(reg->type)]; + } + + if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id)) + strict_type_match = true; + + reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, ®_ref_id); + reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off); + if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) { + verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n", + meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1, + btf_type_str(reg_ref_t), reg_ref_tname); + return -EINVAL; + } + return 0; +} + +static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, + const struct btf_type *ref_t, + const char *ref_tname, + struct bpf_kfunc_call_arg_meta *meta, + int argno) +{ + struct btf_field *kptr_field; + + /* check_func_arg_reg_off allows var_off for + * PTR_TO_MAP_VALUE, but we need fixed offset to find + * off_desc. + */ + if (!tnum_is_const(reg->var_off)) { + verbose(env, "arg#0 must have constant offset\n"); + return -EINVAL; + } + + kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR); + if (!kptr_field || kptr_field->type != BPF_KPTR_REF) { + verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n", + reg->off + reg->var_off.value); + return -EINVAL; + } + + if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf, + kptr_field->kptr.btf_id, true)) { + verbose(env, "kernel function %s args#%d expected pointer to %s %s\n", + meta->func_name, argno, btf_type_str(ref_t), ref_tname); + return -EINVAL; + } + return 0; +} + +static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id) +{ + struct bpf_func_state *state = cur_func(env); + struct bpf_reg_state *reg; + int i; + + /* bpf_spin_lock only allows calling list_push and list_pop, no BPF + * subprogs, no global functions. This means that the references would + * not be released inside the critical section but they may be added to + * the reference state, and the acquired_refs are never copied out for a + * different frame as BPF to BPF calls don't work in bpf_spin_lock + * critical sections. + */ + if (!ref_obj_id) { + verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n"); + return -EFAULT; + } + for (i = 0; i < state->acquired_refs; i++) { + if (state->refs[i].id == ref_obj_id) { + if (state->refs[i].release_on_unlock) { + verbose(env, "verifier internal error: expected false release_on_unlock"); + return -EFAULT; + } + state->refs[i].release_on_unlock = true; + /* Now mark everyone sharing same ref_obj_id as untrusted */ + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->ref_obj_id == ref_obj_id) + reg->type |= PTR_UNTRUSTED; + })); + return 0; + } + } + verbose(env, "verifier internal error: ref state missing for ref_obj_id\n"); + return -EFAULT; +} + +/* Implementation details: + * + * Each register points to some region of memory, which we define as an + * allocation. Each allocation may embed a bpf_spin_lock which protects any + * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same + * allocation. The lock and the data it protects are colocated in the same + * memory region. + * + * Hence, everytime a register holds a pointer value pointing to such + * allocation, the verifier preserves a unique reg->id for it. + * + * The verifier remembers the lock 'ptr' and the lock 'id' whenever + * bpf_spin_lock is called. + * + * To enable this, lock state in the verifier captures two values: + * active_lock.ptr = Register's type specific pointer + * active_lock.id = A unique ID for each register pointer value + * + * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two + * supported register types. + * + * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of + * allocated objects is the reg->btf pointer. + * + * The active_lock.id is non-unique for maps supporting direct_value_addr, as we + * can establish the provenance of the map value statically for each distinct + * lookup into such maps. They always contain a single map value hence unique + * IDs for each pseudo load pessimizes the algorithm and rejects valid programs. + * + * So, in case of global variables, they use array maps with max_entries = 1, + * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point + * into the same map value as max_entries is 1, as described above). + * + * In case of inner map lookups, the inner map pointer has same map_ptr as the + * outer map pointer (in verifier context), but each lookup into an inner map + * assigns a fresh reg->id to the lookup, so while lookups into distinct inner + * maps from the same outer map share the same map_ptr as active_lock.ptr, they + * will get different reg->id assigned to each lookup, hence different + * active_lock.id. + * + * In case of allocated objects, active_lock.ptr is the reg->btf, and the + * reg->id is a unique ID preserved after the NULL pointer check on the pointer + * returned from bpf_obj_new. Each allocation receives a new reg->id. + */ +static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg) +{ + void *ptr; + u32 id; + + switch ((int)reg->type) { + case PTR_TO_MAP_VALUE: + ptr = reg->map_ptr; + break; + case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: + ptr = reg->btf; + break; + default: + verbose(env, "verifier internal error: unknown reg type for lock check\n"); + return -EFAULT; + } + id = reg->id; + + if (!env->cur_state->active_lock.ptr) + return -EINVAL; + if (env->cur_state->active_lock.ptr != ptr || + env->cur_state->active_lock.id != id) { + verbose(env, "held lock and object are not in the same allocation\n"); + return -EINVAL; + } + return 0; +} + +static bool is_bpf_list_api_kfunc(u32 btf_id) +{ + return btf_id == special_kfunc_list[KF_bpf_list_push_front] || + btf_id == special_kfunc_list[KF_bpf_list_push_back] || + btf_id == special_kfunc_list[KF_bpf_list_pop_front] || + btf_id == special_kfunc_list[KF_bpf_list_pop_back]; +} + +static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + struct btf_field *field; + struct btf_record *rec; + u32 list_head_off; + + if (meta->btf != btf_vmlinux || !is_bpf_list_api_kfunc(meta->func_id)) { + verbose(env, "verifier internal error: bpf_list_head argument for unknown kfunc\n"); + return -EFAULT; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, + "R%d doesn't have constant offset. bpf_list_head has to be at the constant offset\n", + regno); + return -EINVAL; + } + + rec = reg_btf_record(reg); + list_head_off = reg->off + reg->var_off.value; + field = btf_record_find(rec, list_head_off, BPF_LIST_HEAD); + if (!field) { + verbose(env, "bpf_list_head not found at offset=%u\n", list_head_off); + return -EINVAL; + } + + /* All functions require bpf_list_head to be protected using a bpf_spin_lock */ + if (check_reg_allocation_locked(env, reg)) { + verbose(env, "bpf_spin_lock at off=%d must be held for bpf_list_head\n", + rec->spin_lock_off); + return -EINVAL; + } + + if (meta->arg_list_head.field) { + verbose(env, "verifier internal error: repeating bpf_list_head arg\n"); + return -EFAULT; + } + meta->arg_list_head.field = field; + return 0; +} + +static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + struct bpf_kfunc_call_arg_meta *meta) +{ + const struct btf_type *et, *t; + struct btf_field *field; + struct btf_record *rec; + u32 list_node_off; + + if (meta->btf != btf_vmlinux || + (meta->func_id != special_kfunc_list[KF_bpf_list_push_front] && + meta->func_id != special_kfunc_list[KF_bpf_list_push_back])) { + verbose(env, "verifier internal error: bpf_list_node argument for unknown kfunc\n"); + return -EFAULT; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, + "R%d doesn't have constant offset. bpf_list_node has to be at the constant offset\n", + regno); + return -EINVAL; + } + + rec = reg_btf_record(reg); + list_node_off = reg->off + reg->var_off.value; + field = btf_record_find(rec, list_node_off, BPF_LIST_NODE); + if (!field || field->offset != list_node_off) { + verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off); + return -EINVAL; + } + + field = meta->arg_list_head.field; + + et = btf_type_by_id(field->list_head.btf, field->list_head.value_btf_id); + t = btf_type_by_id(reg->btf, reg->btf_id); + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->list_head.btf, + field->list_head.value_btf_id, true)) { + verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d " + "in struct %s, but arg is at offset=%d in struct %s\n", + field->list_head.node_offset, btf_name_by_offset(field->list_head.btf, et->name_off), + list_node_off, btf_name_by_offset(reg->btf, t->name_off)); + return -EINVAL; + } + + if (list_node_off != field->list_head.node_offset) { + verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n", + list_node_off, field->list_head.node_offset, + btf_name_by_offset(field->list_head.btf, et->name_off)); + return -EINVAL; + } + /* Set arg#1 for expiration after unlock */ + return ref_set_release_on_unlock(env, reg->ref_obj_id); +} + +static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta) +{ + const char *func_name = meta->func_name, *ref_tname; + const struct btf *btf = meta->btf; + const struct btf_param *args; + u32 i, nargs; + int ret; + + args = (const struct btf_param *)(meta->func_proto + 1); + nargs = btf_type_vlen(meta->func_proto); + if (nargs > MAX_BPF_FUNC_REG_ARGS) { + verbose(env, "Function %s has %d > %d args\n", func_name, nargs, + MAX_BPF_FUNC_REG_ARGS); + return -EINVAL; + } + + /* Check that BTF function arguments match actual types that the + * verifier sees. + */ + for (i = 0; i < nargs; i++) { + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[i + 1]; + const struct btf_type *t, *ref_t, *resolve_ret; + enum bpf_arg_type arg_type = ARG_DONTCARE; + u32 regno = i + 1, ref_id, type_size; + bool is_ret_buf_sz = false; + int kf_arg_type; + + t = btf_type_skip_modifiers(btf, args[i].type, NULL); + + if (is_kfunc_arg_ignore(btf, &args[i])) + continue; + + if (btf_type_is_scalar(t)) { + if (reg->type != SCALAR_VALUE) { + verbose(env, "R%d is not a scalar\n", regno); + return -EINVAL; + } + + if (is_kfunc_arg_constant(meta->btf, &args[i])) { + if (meta->arg_constant.found) { + verbose(env, "verifier internal error: only one constant argument permitted\n"); + return -EFAULT; + } + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d must be a known constant\n", regno); + return -EINVAL; + } + ret = mark_chain_precision(env, regno); + if (ret < 0) + return ret; + meta->arg_constant.found = true; + meta->arg_constant.value = reg->var_off.value; + } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) { + meta->r0_rdonly = true; + is_ret_buf_sz = true; + } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) { + is_ret_buf_sz = true; + } + + if (is_ret_buf_sz) { + if (meta->r0_size) { + verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc"); + return -EINVAL; + } + + if (!tnum_is_const(reg->var_off)) { + verbose(env, "R%d is not a const\n", regno); + return -EINVAL; + } + + meta->r0_size = reg->var_off.value; + ret = mark_chain_precision(env, regno); + if (ret) + return ret; + } + continue; + } + + if (!btf_type_is_ptr(t)) { + verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t)); + return -EINVAL; + } + + if (reg->ref_obj_id) { + if (is_kfunc_release(meta) && meta->ref_obj_id) { + verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, + meta->ref_obj_id); + return -EFAULT; + } + meta->ref_obj_id = reg->ref_obj_id; + if (is_kfunc_release(meta)) + meta->release_regno = regno; + } + + ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); + ref_tname = btf_name_by_offset(btf, ref_t->name_off); + + kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs); + if (kf_arg_type < 0) + return kf_arg_type; + + switch (kf_arg_type) { + case KF_ARG_PTR_TO_ALLOC_BTF_ID: + case KF_ARG_PTR_TO_BTF_ID: + if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) + break; + + if (!is_trusted_reg(reg)) { + if (!is_kfunc_rcu(meta)) { + verbose(env, "R%d must be referenced or trusted\n", regno); + return -EINVAL; + } + if (!is_rcu_reg(reg)) { + verbose(env, "R%d must be a rcu pointer\n", regno); + return -EINVAL; + } + } + + fallthrough; + case KF_ARG_PTR_TO_CTX: + /* Trusted arguments have the same offset checks as release arguments */ + arg_type |= OBJ_RELEASE; + break; + case KF_ARG_PTR_TO_KPTR: + case KF_ARG_PTR_TO_DYNPTR: + case KF_ARG_PTR_TO_LIST_HEAD: + case KF_ARG_PTR_TO_LIST_NODE: + case KF_ARG_PTR_TO_MEM: + case KF_ARG_PTR_TO_MEM_SIZE: + /* Trusted by default */ + break; + default: + WARN_ON_ONCE(1); + return -EFAULT; + } + + if (is_kfunc_release(meta) && reg->ref_obj_id) + arg_type |= OBJ_RELEASE; + ret = check_func_arg_reg_off(env, reg, regno, arg_type); + if (ret < 0) + return ret; + + switch (kf_arg_type) { + case KF_ARG_PTR_TO_CTX: + if (reg->type != PTR_TO_CTX) { + verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t)); + return -EINVAL; + } + + if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog)); + if (ret < 0) + return -EINVAL; + meta->ret_btf_id = ret; + } + break; + case KF_ARG_PTR_TO_ALLOC_BTF_ID: + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); + return -EINVAL; + } + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + if (meta->btf == btf_vmlinux && + meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + meta->arg_obj_drop.btf = reg->btf; + meta->arg_obj_drop.btf_id = reg->btf_id; + } + break; + case KF_ARG_PTR_TO_KPTR: + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "arg#0 expected pointer to map value\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_DYNPTR: + if (reg->type != PTR_TO_STACK && + reg->type != CONST_PTR_TO_DYNPTR) { + verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i); + return -EINVAL; + } + + ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_LIST_HEAD: + if (reg->type != PTR_TO_MAP_VALUE && + reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to map value or allocated object\n", i); + return -EINVAL; + } + if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_LIST_NODE: + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) { + verbose(env, "arg#%d expected pointer to allocated object\n", i); + return -EINVAL; + } + if (!reg->ref_obj_id) { + verbose(env, "allocated object must be referenced\n"); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_BTF_ID: + /* Only base_type is checked, further checks are done here */ + if ((base_type(reg->type) != PTR_TO_BTF_ID || + (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && + !reg2btf_ids[base_type(reg->type)]) { + verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); + verbose(env, "expected %s or socket\n", + reg_type_str(env, base_type(reg->type) | + (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); + return -EINVAL; + } + ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_MEM: + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); + if (IS_ERR(resolve_ret)) { + verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret)); + return -EINVAL; + } + ret = check_mem_reg(env, reg, regno, type_size); + if (ret < 0) + return ret; + break; + case KF_ARG_PTR_TO_MEM_SIZE: + ret = check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1); + if (ret < 0) { + verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1); + return ret; + } + /* Skip next '__sz' argument */ + i++; + break; + } + } + + if (is_kfunc_release(meta) && !meta->release_regno) { + verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; + } + + return 0; +} + static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); - struct bpf_kfunc_arg_meta meta = { 0 }; const char *func_name, *ptr_type_name; + bool sleepable, rcu_lock, rcu_unlock; + struct bpf_kfunc_call_arg_meta meta; u32 i, nargs, func_id, ptr_type_id; int err, insn_idx = *insn_idx_p; const struct btf_param *args; + const struct btf_type *ret_t; struct btf *desc_btf; u32 *kfunc_flags; - bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) @@ -7687,24 +9092,68 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, func_name); return -EACCES; } - if (*kfunc_flags & KF_DESTRUCTIVE && !capable(CAP_SYS_BOOT)) { - verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capabilities\n"); + + /* Prepare kfunc call metadata */ + memset(&meta, 0, sizeof(meta)); + meta.btf = desc_btf; + meta.func_id = func_id; + meta.kfunc_flags = *kfunc_flags; + meta.func_proto = func_proto; + meta.func_name = func_name; + + if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) { + verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n"); + return -EACCES; + } + + sleepable = is_kfunc_sleepable(&meta); + if (sleepable && !env->prog->aux->sleepable) { + verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name); return -EACCES; } - acq = *kfunc_flags & KF_ACQUIRE; + rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta); + rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta); + if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) { + verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name); + return -EACCES; + } - meta.flags = *kfunc_flags; + if (env->cur_state->active_rcu_lock) { + struct bpf_func_state *state; + struct bpf_reg_state *reg; + + if (rcu_lock) { + verbose(env, "nested rcu read lock (kernel function %s)\n", func_name); + return -EINVAL; + } else if (rcu_unlock) { + bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({ + if (reg->type & MEM_RCU) { + reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL); + reg->type |= PTR_UNTRUSTED; + } + })); + env->cur_state->active_rcu_lock = false; + } else if (sleepable) { + verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name); + return -EACCES; + } + } else if (rcu_lock) { + env->cur_state->active_rcu_lock = true; + } else if (rcu_unlock) { + verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name); + return -EINVAL; + } /* Check the arguments */ - err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta); + err = check_kfunc_args(env, &meta); if (err < 0) return err; /* In case of release function, we get register number of refcounted - * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now + * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now. */ - if (err) { - err = release_reference(env, regs[err].ref_obj_id); + if (meta.release_regno) { + err = release_reference(env, regs[meta.release_regno].ref_obj_id); if (err) { verbose(env, "kfunc %s#%d reference has not been acquired before\n", func_name, func_id); @@ -7718,18 +9167,92 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); - if (acq && !btf_type_is_struct_ptr(desc_btf, t)) { - verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); - return -EINVAL; + if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) { + /* Only exception is bpf_obj_new_impl */ + if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) { + verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); + return -EINVAL; + } } if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); mark_btf_func_reg_size(env, BPF_REG_0, t->size); } else if (btf_type_is_ptr(t)) { - ptr_type = btf_type_skip_modifiers(desc_btf, t->type, - &ptr_type_id); - if (!btf_type_is_struct(ptr_type)) { + ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id); + + if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) { + if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + struct btf *ret_btf; + u32 ret_btf_id; + + if (unlikely(!bpf_global_ma_set)) + return -ENOMEM; + + if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) { + verbose(env, "local type ID argument must be in range [0, U32_MAX]\n"); + return -EINVAL; + } + + ret_btf = env->prog->aux->btf; + ret_btf_id = meta.arg_constant.value; + + /* This may be NULL due to user not supplying a BTF */ + if (!ret_btf) { + verbose(env, "bpf_obj_new requires prog BTF\n"); + return -EINVAL; + } + + ret_t = btf_type_by_id(ret_btf, ret_btf_id); + if (!ret_t || !__btf_type_is_struct(ret_t)) { + verbose(env, "bpf_obj_new type ID argument must be of a struct\n"); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = ret_btf; + regs[BPF_REG_0].btf_id = ret_btf_id; + + env->insn_aux_data[insn_idx].obj_new_size = ret_t->size; + env->insn_aux_data[insn_idx].kptr_struct_meta = + btf_find_struct_meta(ret_btf, ret_btf_id); + } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + env->insn_aux_data[insn_idx].kptr_struct_meta = + btf_find_struct_meta(meta.arg_obj_drop.btf, + meta.arg_obj_drop.btf_id); + } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] || + meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) { + struct btf_field *field = meta.arg_list_head.field; + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC; + regs[BPF_REG_0].btf = field->list_head.btf; + regs[BPF_REG_0].btf_id = field->list_head.value_btf_id; + regs[BPF_REG_0].off = field->list_head.node_offset; + } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta.ret_btf_id; + } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value); + if (!ret_t || !btf_type_is_struct(ret_t)) { + verbose(env, + "kfunc bpf_rdonly_cast type ID argument must be of a struct\n"); + return -EINVAL; + } + + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED; + regs[BPF_REG_0].btf = desc_btf; + regs[BPF_REG_0].btf_id = meta.arg_constant.value; + } else { + verbose(env, "kernel function %s unhandled dynamic return type\n", + meta.func_name); + return -EFAULT; + } + } else if (!__btf_type_is_struct(ptr_type)) { if (!meta.r0_size) { ptr_type_name = btf_name_by_offset(desc_btf, ptr_type->name_off); @@ -7757,20 +9280,24 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; } - if (*kfunc_flags & KF_RET_NULL) { + + if (is_kfunc_ret_null(&meta)) { regs[BPF_REG_0].type |= PTR_MAYBE_NULL; /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ regs[BPF_REG_0].id = ++env->id_gen; } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); - if (acq) { + if (is_kfunc_acquire(&meta)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) return id; - regs[BPF_REG_0].id = id; + if (is_kfunc_ret_null(&meta)) + regs[BPF_REG_0].id = id; regs[BPF_REG_0].ref_obj_id = id; } + if (reg_may_point_to_spin_lock(®s[BPF_REG_0]) && !regs[BPF_REG_0].id) + regs[BPF_REG_0].id = ++env->id_gen; } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ nargs = btf_type_vlen(func_proto); @@ -9198,6 +10725,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, return err; return adjust_ptr_min_max_vals(env, insn, dst_reg, src_reg); + } else if (dst_reg->precise) { + /* if dst_reg is precise, src_reg should be precise as well */ + err = mark_chain_precision(env, insn->src_reg); + if (err) + return err; } } else { /* Pretend the src is a reg with a known value, since we only @@ -9937,17 +11469,20 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, bool is_null) { if (type_may_be_null(reg->type) && reg->id == id && - !WARN_ON_ONCE(!reg->id)) { - if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || - !tnum_equals_const(reg->var_off, 0) || - reg->off)) { - /* Old offset (both fixed and variable parts) should - * have been known-zero, because we don't allow pointer - * arithmetic on pointers that might be NULL. If we - * see this happening, don't convert the register. - */ + (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) { + /* Old offset (both fixed and variable parts) should have been + * known-zero, because we don't allow pointer arithmetic on + * pointers that might be NULL. If we see this happening, don't + * convert the register. + * + * But in some cases, some helpers that return local kptrs + * advance offset for the returned pointer. In those cases, it + * is fine to expect to see reg->off. + */ + if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0))) + return; + if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off)) return; - } if (is_null) { reg->type = SCALAR_VALUE; /* We don't need id and ref_obj_id from this point @@ -10121,6 +11656,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_verifier_state *other_branch; struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; + struct bpf_reg_state *eq_branch_regs; u8 opcode = BPF_OP(insn->code); bool is_jmp32; int pred = -1; @@ -10230,8 +11766,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. * this is only legit if both are scalars (or pointers to the same - * object, I suppose, but we don't support that right now), because - * otherwise the different base pointers mean the offsets aren't + * object, I suppose, see the PTR_MAYBE_NULL related if block below), + * because otherwise the different base pointers mean the offsets aren't * comparable. */ if (BPF_SRC(insn->code) == BPF_X) { @@ -10280,6 +11816,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]); } + /* if one pointer register is compared to another pointer + * register check if PTR_MAYBE_NULL could be lifted. + * E.g. register A - maybe null + * register B - not null + * for JNE A, B, ... - A is not null in the false branch; + * for JEQ A, B, ... - A is not null in the true branch. + */ + if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X && + __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) && + type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) { + eq_branch_regs = NULL; + switch (opcode) { + case BPF_JEQ: + eq_branch_regs = other_branch_regs; + break; + case BPF_JNE: + eq_branch_regs = regs; + break; + default: + /* do nothing */ + break; + } + if (eq_branch_regs) { + if (type_may_be_null(src_reg->type)) + mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]); + else + mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]); + } + } + /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). * NOTE: these optimizations below are related with pointer comparison * which will never be JMP32. @@ -10386,8 +11952,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - if (map_value_has_spin_lock(map)) - dst_reg->id = ++env->id_gen; + WARN_ON_ONCE(map->max_entries != 1); + /* We want reg->id to be same (0) as map_value is not distinct */ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { dst_reg->type = CONST_PTR_TO_MAP; @@ -10465,11 +12031,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - if (env->cur_state->active_spin_lock) { + if (env->cur_state->active_lock.ptr) { verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); return -EINVAL; } + if (env->cur_state->active_rcu_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n"); + return -EINVAL; + } + if (regs[ctx_reg].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -10671,7 +12242,7 @@ static int check_return_code(struct bpf_verifier_env *env) * 3 let S be a stack * 4 S.push(v) * 5 while S is not empty - * 6 t <- S.pop() + * 6 t <- S.peek() * 7 if t is what we're looking for: * 8 return t * 9 for all edges e in G.adjacentEdges(t) do @@ -10720,11 +12291,16 @@ static struct bpf_verifier_state_list **explored_state( return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; } -static void init_explored_state(struct bpf_verifier_env *env, int idx) +static void mark_prune_point(struct bpf_verifier_env *env, int idx) { env->insn_aux_data[idx].prune_point = true; } +static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) +{ + return env->insn_aux_data[insn_idx].prune_point; +} + enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1, @@ -10753,9 +12329,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return -EINVAL; } - if (e == BRANCH) + if (e == BRANCH) { /* mark branch target for state pruning */ - init_explored_state(env, w); + mark_prune_point(env, w); + mark_jmp_point(env, w); + } if (insn_state[w] == 0) { /* tree-edge */ @@ -10782,8 +12360,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, return DONE_EXPLORING; } -static int visit_func_call_insn(int t, int insn_cnt, - struct bpf_insn *insns, +static int visit_func_call_insn(int t, struct bpf_insn *insns, struct bpf_verifier_env *env, bool visit_callee) { @@ -10793,10 +12370,12 @@ static int visit_func_call_insn(int t, int insn_cnt, if (ret) return ret; - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + mark_prune_point(env, t + 1); + /* when we exit from subprog, we need to record non-linear history */ + mark_jmp_point(env, t + 1); + if (visit_callee) { - init_explored_state(env, t); + mark_prune_point(env, t); ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, /* It's ok to allow recursion from CFG point of * view. __check_func_call() will do the actual @@ -10812,13 +12391,13 @@ static int visit_func_call_insn(int t, int insn_cnt, * DONE_EXPLORING - the instruction was fully explored * KEEP_EXPLORING - there is still work to be done before it is fully explored */ -static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) +static int visit_insn(int t, struct bpf_verifier_env *env) { struct bpf_insn *insns = env->prog->insnsi; int ret; if (bpf_pseudo_func(insns + t)) - return visit_func_call_insn(t, insn_cnt, insns, env, true); + return visit_func_call_insn(t, insns, env, true); /* All non-branch instructions have a single fall-through edge. */ if (BPF_CLASS(insns[t].code) != BPF_JMP && @@ -10831,13 +12410,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) case BPF_CALL: if (insns[t].imm == BPF_FUNC_timer_set_callback) - /* Mark this call insn to trigger is_state_visited() check - * before call itself is processed by __check_func_call(). - * Otherwise new async state will be pushed for further - * exploration. + /* Mark this call insn as a prune point to trigger + * is_state_visited() check before call itself is + * processed by __check_func_call(). Otherwise new + * async state will be pushed for further exploration. */ - init_explored_state(env, t); - return visit_func_call_insn(t, insn_cnt, insns, env, + mark_prune_point(env, t); + return visit_func_call_insn(t, insns, env, insns[t].src_reg == BPF_PSEUDO_CALL); case BPF_JA: @@ -10850,22 +12429,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) if (ret) return ret; - /* unconditional jmp is not a good pruning point, - * but it's marked, since backtracking needs - * to record jmp history in is_state_visited(). - */ - init_explored_state(env, t + insns[t].off + 1); - /* tell verifier to check for equivalent states - * after every call and jump - */ - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); + mark_prune_point(env, t + insns[t].off + 1); + mark_jmp_point(env, t + insns[t].off + 1); return ret; default: /* conditional jump with two edges */ - init_explored_state(env, t); + mark_prune_point(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); if (ret) return ret; @@ -10901,7 +12473,7 @@ static int check_cfg(struct bpf_verifier_env *env) while (env->cfg.cur_stack > 0) { int t = insn_stack[env->cfg.cur_stack - 1]; - ret = visit_insn(t, insn_cnt, env); + ret = visit_insn(t, env); switch (ret) { case DONE_EXPLORING: insn_state[t] = EXPLORED; @@ -11492,15 +13064,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; - if (rold->type == PTR_TO_STACK) - /* two stack pointers are equal only if they're pointing to - * the same stack frame, since fp-8 in foo != fp-8 in bar - */ - return equal && rold->frameno == rcur->frameno; - - if (equal) - return true; - if (rold->type == NOT_INIT) /* explored state can't have used this */ return true; @@ -11508,10 +13071,12 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return false; switch (base_type(rold->type)) { case SCALAR_VALUE: + if (equal) + return true; if (env->explore_alu_limits) return false; if (rcur->type == SCALAR_VALUE) { - if (!rold->precise && !rcur->precise) + if (!rold->precise) return true; /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && @@ -11554,7 +13119,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && - tnum_in(rold->var_off, rcur->var_off); + tnum_in(rold->var_off, rcur->var_off) && + check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -11578,20 +13144,14 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, /* new val must satisfy old val knowledge */ return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_CTX: - case CONST_PTR_TO_MAP: - case PTR_TO_PACKET_END: - case PTR_TO_FLOW_KEYS: - case PTR_TO_SOCKET: - case PTR_TO_SOCK_COMMON: - case PTR_TO_TCP_SOCK: - case PTR_TO_XDP_SOCK: - /* Only valid matches are exact, which memcmp() above - * would have accepted + case PTR_TO_STACK: + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != fp-8 in bar */ + return equal && rold->frameno == rcur->frameno; default: - /* Don't know what's going on, just say it's not safe */ - return false; + /* Only valid matches are exact, which memcmp() */ + return equal; } /* Shouldn't get here; if we do, say it's not safe */ @@ -11701,7 +13261,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat { int i; - memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); for (i = 0; i < MAX_BPF_REG; i++) if (!regsafe(env, &old->regs[i], &cur->regs[i], env->idmap_scratch)) @@ -11725,13 +13284,28 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->curframe != cur->curframe) return false; + memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch)); + /* Verification state from speculative execution simulation * must never prune a non-speculative execution one. */ if (old->speculative && !cur->speculative) return false; - if (old->active_spin_lock != cur->active_spin_lock) + if (old->active_lock.ptr != cur->active_lock.ptr) + return false; + + /* Old and cur active_lock's have to be either both present + * or both absent. + */ + if (!!old->active_lock.id != !!cur->active_lock.id) + return false; + + if (old->active_lock.id && + !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch)) + return false; + + if (old->active_rcu_lock != cur->active_rcu_lock) return false; /* for states to be equal callsites have to be the same @@ -11834,34 +13408,36 @@ static int propagate_precision(struct bpf_verifier_env *env, { struct bpf_reg_state *state_reg; struct bpf_func_state *state; - int i, err = 0; + int i, err = 0, fr; - state = old->frame[old->curframe]; - state_reg = state->regs; - for (i = 0; i < BPF_REG_FP; i++, state_reg++) { - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "propagating r%d\n", i); - err = mark_chain_precision(env, i); - if (err < 0) - return err; - } + for (fr = old->curframe; fr >= 0; fr--) { + state = old->frame[fr]; + state_reg = state->regs; + for (i = 0; i < BPF_REG_FP; i++, state_reg++) { + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame %d: propagating r%d\n", i, fr); + err = mark_chain_precision_frame(env, fr, i); + if (err < 0) + return err; + } - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (!is_spilled_reg(&state->stack[i])) - continue; - state_reg = &state->stack[i].spilled_ptr; - if (state_reg->type != SCALAR_VALUE || - !state_reg->precise) - continue; - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "propagating fp%d\n", - (-i - 1) * BPF_REG_SIZE); - err = mark_chain_precision_stack(env, i); - if (err < 0) - return err; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (!is_spilled_reg(&state->stack[i])) + continue; + state_reg = &state->stack[i].spilled_ptr; + if (state_reg->type != SCALAR_VALUE || + !state_reg->precise) + continue; + if (env->log.level & BPF_LOG_LEVEL2) + verbose(env, "frame %d: propagating fp%d\n", + (-i - 1) * BPF_REG_SIZE, fr); + err = mark_chain_precision_stack_frame(env, fr, i); + if (err < 0) + return err; + } } return 0; } @@ -11893,13 +13469,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) int i, j, err, states_cnt = 0; bool add_new_state = env->test_state_freq ? true : false; - cur->last_insn_idx = env->prev_insn_idx; - if (!env->insn_aux_data[insn_idx].prune_point) - /* this 'insn_idx' instruction wasn't marked, so we will not - * be doing state search here - */ - return 0; - /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 * Do not add new state for future pruning if the verifier hasn't seen @@ -12034,10 +13603,10 @@ next: env->max_states_per_insn = states_cnt; if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) - return push_jmp_history(env, cur); + return 0; if (!add_new_state) - return push_jmp_history(env, cur); + return 0; /* There were no equivalent states, remember the current one. * Technically the current state is not proven to be safe yet, @@ -12056,6 +13625,10 @@ next: env->prev_jmps_processed = env->jmps_processed; env->prev_insn_processed = env->insn_processed; + /* forget precise markings we inherited, see __mark_chain_precision */ + if (env->bpf_capable) + mark_all_scalars_imprecise(env, cur); + /* add new state to the head of linked list */ new = &new_sl->state; err = copy_verifier_state(new, cur); @@ -12173,21 +13746,31 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, env->insn_idx); - if (err < 0) - return err; - if (err == 1) { - /* found equivalent state, can prune the search */ - if (env->log.level & BPF_LOG_LEVEL) { - if (do_print_state) - verbose(env, "\nfrom %d to %d%s: safe\n", - env->prev_insn_idx, env->insn_idx, - env->cur_state->speculative ? - " (speculative execution)" : ""); - else - verbose(env, "%d: safe\n", env->insn_idx); + state->last_insn_idx = env->prev_insn_idx; + + if (is_prune_point(env, env->insn_idx)) { + err = is_state_visited(env, env->insn_idx); + if (err < 0) + return err; + if (err == 1) { + /* found equivalent state, can prune the search */ + if (env->log.level & BPF_LOG_LEVEL) { + if (do_print_state) + verbose(env, "\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); + else + verbose(env, "%d: safe\n", env->insn_idx); + } + goto process_bpf_exit; } - goto process_bpf_exit; + } + + if (is_jmp_point(env, env->insn_idx)) { + err = push_jmp_history(env, state); + if (err) + return err; } if (signal_pending(current)) @@ -12370,11 +13953,14 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_spin_lock && - (insn->src_reg == BPF_PSEUDO_CALL || - insn->imm != BPF_FUNC_spin_unlock)) { - verbose(env, "function calls are not allowed while holding a lock\n"); - return -EINVAL; + if (env->cur_state->active_lock.ptr) { + if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) || + (insn->src_reg == BPF_PSEUDO_CALL) || + (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && + (insn->off != 0 || !is_bpf_list_api_kfunc(insn->imm)))) { + verbose(env, "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); @@ -12407,11 +13993,16 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - if (env->cur_state->active_spin_lock) { + if (env->cur_state->active_lock.ptr) { verbose(env, "bpf_spin_unlock is missing\n"); return -EINVAL; } + if (env->cur_state->active_rcu_lock) { + verbose(env, "bpf_rcu_read_unlock is missing\n"); + return -EINVAL; + } + /* We must do check_reference_leak here before * prepare_func_exit to handle the case when * state->curframe > 0, it may be a callback @@ -12664,7 +14255,14 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); - if (map_value_has_spin_lock(map)) { + if (btf_record_has_field(map->record, BPF_LIST_HEAD)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_list_head yet\n"); + return -EINVAL; + } + } + + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -12681,7 +14279,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (map_value_has_timer(map)) { + if (btf_record_has_field(map->record, BPF_TIMER)) { if (is_tracing_prog_type(prog_type)) { verbose(env, "tracing progs cannot use bpf_timer yet\n"); return -EINVAL; @@ -12714,10 +14312,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: break; default: verbose(env, - "Sleepable programs can only use array, hash, and ringbuf maps\n"); + "Sleepable programs can only use array, hash, ringbuf and local storage maps\n"); return -EINVAL; } @@ -13373,6 +14972,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn)) continue; + /* Zero-extension is done by the caller. */ + if (bpf_pseudo_kfunc_call(&insn)) + continue; + if (WARN_ON(load_reg == -1)) { verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n"); return -EFAULT; @@ -13500,6 +15103,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) break; case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: + /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike + * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot + * be said once it is marked PTR_UNTRUSTED, hence we must handle + * any faults for loads into such types. BPF_WRITE is disallowed + * for this case. + */ + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); @@ -13865,8 +15475,8 @@ static int fixup_call_args(struct bpf_verifier_env *env) return err; } -static int fixup_kfunc_call(struct bpf_verifier_env *env, - struct bpf_insn *insn) +static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + struct bpf_insn *insn_buf, int insn_idx, int *cnt) { const struct bpf_kfunc_desc *desc; @@ -13876,7 +15486,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, } /* insn->imm has the btf func_id. Replace it with - * an address (relative to __bpf_base_call). + * an address (relative to __bpf_call_base). */ desc = find_kfunc_desc(env->prog, insn->imm, insn->off); if (!desc) { @@ -13885,8 +15495,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, return -EFAULT; } + *cnt = 0; insn->imm = desc->imm; - + if (insn->off) + return 0; + if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size; + + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size); + insn_buf[1] = addr[0]; + insn_buf[2] = addr[1]; + insn_buf[3] = *insn; + *cnt = 4; + } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) { + struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta; + struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) }; + + insn_buf[0] = addr[0]; + insn_buf[1] = addr[1]; + insn_buf[2] = *insn; + *cnt = 3; + } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] || + desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) { + insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1); + *cnt = 1; + } return 0; } @@ -14028,9 +15663,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) continue; if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { - ret = fixup_kfunc_call(env, insn); + ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt); if (ret) return ret; + if (cnt == 0) + continue; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } @@ -14148,13 +15793,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } - if (insn->imm == BPF_FUNC_task_storage_get || - insn->imm == BPF_FUNC_sk_storage_get || - insn->imm == BPF_FUNC_inode_storage_get) { - if (env->prog->aux->sleepable) - insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); - else + if (is_storage_get_function(insn->imm)) { + if (!env->prog->aux->sleepable || + env->insn_aux_data[i + delta].storage_get_func_atomic) insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); + else + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); insn_buf[1] = *insn; cnt = 2; @@ -14224,7 +15868,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, - (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, (int (*)(struct bpf_map *map, bpf_callback_t callback_fn, @@ -14603,6 +16247,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) BPF_MAIN_FUNC /* callsite */, 0 /* frameno */, subprog); + state->first_insn_idx = env->subprog_info[subprog].start; + state->last_insn_idx = -1; regs = state->frame[state->curframe]->regs; if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) { @@ -15012,12 +16658,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: - /* fentry/fexit/fmod_ret progs can be sleepable only if they are + + /* fentry/fexit/fmod_ret progs can be sleepable if they are * attached to ALLOW_ERROR_INJECTION and are not in denylist. */ if (!check_non_sleepable_error_inject(btf_id) && within_error_injection_list(addr)) ret = 0; + /* fentry/fexit/fmod_ret progs can also be sleepable if they are + * in the fmodret id set with the KF_SLEEPABLE flag. + */ + else { + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id); + + if (flags && (*flags & KF_SLEEPABLE)) + ret = 0; + } break; case BPF_PROG_TYPE_LSM: /* LSM progs check that they are attached to bpf_lsm_*() funcs. @@ -15038,7 +16694,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "can't modify return codes of BPF programs\n"); return -EINVAL; } - ret = check_attach_modify_return(addr, tname); + ret = -EINVAL; + if (btf_kfunc_is_modify_return(btf, btf_id) || + !check_attach_modify_return(addr, tname)) + ret = 0; if (ret) { bpf_log(log, "%s() is not modifiable\n", tname); return ret; @@ -15227,10 +16886,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->allow_uninit_stack = bpf_allow_uninit_stack(); - env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); + env->rcu_tag_supported = btf_vmlinux && + btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0; if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/capability.c b/kernel/capability.c index 765194f5d678..860fd22117c1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -489,8 +489,8 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, struct user_namespace *mnt_userns, const struct inode *inode) { - return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) && - kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode)); + return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) && + vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode)); } /** diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index fd4020835ec6..367b0a42ada9 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -167,7 +167,6 @@ struct cgroup_mgctx { extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; -extern struct file_system_type cgroup_fs_type; /* iterate across the hierarchies */ #define for_each_root(root) \ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2319946715e0..c099cf3fa02d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -248,6 +248,12 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype cfts[], bool is_add); +#ifdef CONFIG_DEBUG_CGROUP_REF +#define CGROUP_REF_FN_ATTRS noinline +#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn); +#include <linux/cgroup_refcnt.h> +#endif + /** * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID * @ssid: subsys ID of interest @@ -2860,14 +2866,12 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, * take an rcu_read_lock. */ spin_lock_irq(&css_set_lock); - rcu_read_lock(); task = leader; do { cgroup_migrate_add_task(task, mgctx); if (!threadgroup) break; } while_each_thread(leader, task); - rcu_read_unlock(); spin_unlock_irq(&css_set_lock); return cgroup_migrate_execute(mgctx); @@ -5349,6 +5353,7 @@ static void css_free_rwork_fn(struct work_struct *work) atomic_dec(&cgrp->root->nr_cgrps); cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); + bpf_cgrp_storage_free(cgrp); if (cgroup_parent(cgrp)) { /* diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index b474289c15b8..a29c0b13706b 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -550,11 +550,15 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Call with callback_lock or cpuset_rwsem held. + * Call with callback_lock or cpuset_rwsem held. The check can be skipped + * if on default hierarchy. */ -static void cpuset_update_task_spread_flag(struct cpuset *cs, +static void cpuset_update_task_spread_flags(struct cpuset *cs, struct task_struct *tsk) { + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + return; + if (is_spread_page(cs)) task_set_spread_page(tsk); else @@ -2153,7 +2157,7 @@ static void update_tasks_flags(struct cpuset *cs) css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) - cpuset_update_task_spread_flag(cs, task); + cpuset_update_task_spread_flags(cs, task); css_task_iter_end(&it); } @@ -2509,12 +2513,28 @@ static void cpuset_attach(struct cgroup_taskset *tset) struct cgroup_subsys_state *css; struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + bool cpus_updated, mems_updated; cgroup_taskset_first(tset, &css); cs = css_cs(css); lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ percpu_down_write(&cpuset_rwsem); + cpus_updated = !cpumask_equal(cs->effective_cpus, + oldcs->effective_cpus); + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); + + /* + * In the default hierarchy, enabling cpuset in the child cgroups + * will trigger a number of cpuset_attach() calls with no change + * in effective cpus and mems. In that case, we can optimize out + * by skipping the task iteration and update. + */ + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + !cpus_updated && !mems_updated) { + cpuset_attach_nodemask_to = cs->effective_mems; + goto out; + } guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2530,14 +2550,19 @@ static void cpuset_attach(struct cgroup_taskset *tset) WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); - cpuset_update_task_spread_flag(cs, task); + cpuset_update_task_spread_flags(cs, task); } /* * Change mm for all threadgroup leaders. This is expensive and may - * sleep and should be moved outside migration path proper. + * sleep and should be moved outside migration path proper. Skip it + * if there is no change in effective_mems and CS_MEMORY_MIGRATE is + * not set. */ cpuset_attach_nodemask_to = cs->effective_mems; + if (!is_memory_migrate(cs) && !mems_updated) + goto out; + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); @@ -2560,6 +2585,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) } } +out: cs->old_mems_allowed = cpuset_attach_nodemask_to; cs->attach_in_progress--; @@ -3046,11 +3072,15 @@ static struct cftype dfl_files[] = { }; -/* - * cpuset_css_alloc - allocate a cpuset css - * cgrp: control group that the new cpuset will be part of +/** + * cpuset_css_alloc - Allocate a cpuset css + * @parent_css: Parent css of the control group that the new cpuset will be + * part of + * Return: cpuset css on success, -ENOMEM on failure. + * + * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return + * top cpuset css otherwise. */ - static struct cgroup_subsys_state * cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -3630,11 +3660,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block cpuset_track_online_nodes_nb = { - .notifier_call = cpuset_track_online_nodes, - .priority = 10, /* ??! */ -}; - /** * cpuset_init_smp - initialize cpus_allowed * @@ -3652,7 +3677,7 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask); top_cpuset.effective_mems = node_states[N_MEMORY]; - register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI); cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); BUG_ON(!cpuset_migrate_mm_wq); diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 8a44b93da0f3..c2f9c912df1c 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -7,5 +7,6 @@ CONFIG_KERNEL_XZ=y # CONFIG_KERNEL_LZO is not set # CONFIG_KERNEL_LZ4 is not set # CONFIG_SLAB is not set -# CONFIG_SLUB is not set -CONFIG_SLOB=y +# CONFIG_SLOB_DEPRECATED is not set +CONFIG_SLUB=y +CONFIG_SLUB_TINY=y diff --git a/kernel/cpu.c b/kernel/cpu.c index bbad5e375d3b..6c0a92ca6bb5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -663,21 +663,51 @@ static bool cpuhp_next_state(bool bringup, return true; } -static int cpuhp_invoke_callback_range(bool bringup, - unsigned int cpu, - struct cpuhp_cpu_state *st, - enum cpuhp_state target) +static int __cpuhp_invoke_callback_range(bool bringup, + unsigned int cpu, + struct cpuhp_cpu_state *st, + enum cpuhp_state target, + bool nofail) { enum cpuhp_state state; - int err = 0; + int ret = 0; while (cpuhp_next_state(bringup, &state, st, target)) { + int err; + err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL); - if (err) + if (!err) + continue; + + if (nofail) { + pr_warn("CPU %u %s state %s (%d) failed (%d)\n", + cpu, bringup ? "UP" : "DOWN", + cpuhp_get_step(st->state)->name, + st->state, err); + ret = -1; + } else { + ret = err; break; + } } - return err; + return ret; +} + +static inline int cpuhp_invoke_callback_range(bool bringup, + unsigned int cpu, + struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false); +} + +static inline void cpuhp_invoke_callback_range_nofail(bool bringup, + unsigned int cpu, + struct cpuhp_cpu_state *st, + enum cpuhp_state target) +{ + __cpuhp_invoke_callback_range(bringup, cpu, st, target, true); } static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) @@ -999,7 +1029,6 @@ static int take_cpu_down(void *_param) struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); int err, cpu = smp_processor_id(); - int ret; /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); @@ -1012,13 +1041,10 @@ static int take_cpu_down(void *_param) */ WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1)); - /* Invoke the former CPU_DYING callbacks */ - ret = cpuhp_invoke_callback_range(false, cpu, st, target); - /* - * DYING must not fail! + * Invoke the former CPU_DYING callbacks. DYING must not fail! */ - WARN_ON_ONCE(ret); + cpuhp_invoke_callback_range_nofail(false, cpu, st, target); /* Give up timekeeping duties */ tick_handover_do_timer(); @@ -1296,16 +1322,14 @@ void notify_cpu_starting(unsigned int cpu) { struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - int ret; rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */ cpumask_set_cpu(cpu, &cpus_booted_once_mask); - ret = cpuhp_invoke_callback_range(true, cpu, st, target); /* * STARTING must not fail! */ - WARN_ON_ONCE(ret); + cpuhp_invoke_callback_range_nofail(true, cpu, st, target); } /* @@ -2326,8 +2350,10 @@ static ssize_t target_store(struct device *dev, struct device_attribute *attr, if (st->state < target) ret = cpu_up(dev->id, target); - else + else if (st->state > target) ret = cpu_down(dev->id, target); + else if (WARN_ON(st->target != target)) + st->target = target; out: unlock_device_hotplug(); return ret ? ret : count; @@ -2688,6 +2714,7 @@ void __init boot_cpu_hotplug_init(void) cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask); #endif this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); + this_cpu_write(cpuhp_state.target, CPUHP_ONLINE); } /* diff --git a/kernel/crash_core.c b/kernel/crash_core.c index a0eb4d5cf557..87ef6096823f 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -383,6 +383,9 @@ void vmcoreinfo_append_str(const char *fmt, ...) memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); vmcoreinfo_size += r; + + WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES, + "vmcoreinfo data exceeds allocated size, truncating"); } /* diff --git a/kernel/cred.c b/kernel/cred.c index e10c15f51c1f..811ad654abd1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -701,9 +701,9 @@ void __init cred_init(void) * override a task's own credentials so that work can be done on behalf of that * task that requires a different subjective context. * - * @daemon is used to provide a base for the security record, but can be NULL. - * If @daemon is supplied, then the security data will be derived from that; - * otherwise they'll be set to 0 and no groups, full capabilities and no keys. + * @daemon is used to provide a base cred, with the security data derived from + * that; if this is "&init_task", they'll be set to 0, no groups, full + * capabilities, and no keys. * * The caller may change these controls afterwards if desired. * @@ -714,17 +714,16 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) const struct cred *old; struct cred *new; + if (WARN_ON_ONCE(!daemon)) + return NULL; + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); if (!new) return NULL; kdebug("prepare_kernel_cred() alloc %p", new); - if (daemon) - old = get_task_cred(daemon); - else - old = get_cred(&init_cred); - + old = get_task_cred(daemon); validate_creds(old); *new = *old; diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 67d3c48a1522..5c7e9ba7cd6b 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -545,6 +545,7 @@ static void kdb_msg_write(const char *msg, int msg_len) { struct console *c; const char *cp; + int cookie; int len; if (msg_len == 0) @@ -558,8 +559,20 @@ static void kdb_msg_write(const char *msg, int msg_len) cp++; } - for_each_console(c) { - if (!(c->flags & CON_ENABLED)) + /* + * The console_srcu_read_lock() only provides safe console list + * traversal. The use of the ->write() callback relies on all other + * CPUs being stopped at the moment and console drivers being able to + * handle reentrance when @oops_in_progress is set. + * + * There is no guarantee that every console driver can handle + * reentrance in this way; the developer deploying the debugger + * is responsible for ensuring that the console drivers they + * have selected handle reentrance appropriately. + */ + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + if (!(console_srcu_read_flags(c) & CON_ENABLED)) continue; if (c == dbg_io_ops->cons) continue; @@ -577,6 +590,7 @@ static void kdb_msg_write(const char *msg, int msg_len) --oops_in_progress; touch_nmi_watchdog(); } + console_srcu_read_unlock(cookie); } int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 33437d620644..68106e3791f6 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -498,6 +498,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, WARN_ON_ONCE(!dev->coherent_dma_mask); + /* + * DMA allocations can never be turned back into a page pointer, so + * requesting compound pages doesn't make sense (and can't even be + * supported at all by various backends). + */ + if (WARN_ON_ONCE(flag & __GFP_COMP)) + return NULL; + if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; @@ -552,6 +560,8 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size, return NULL; if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM))) return NULL; + if (WARN_ON_ONCE(gfp & __GFP_COMP)) + return NULL; size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) @@ -637,6 +647,8 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES)) return NULL; + if (WARN_ON_ONCE(gfp & __GFP_COMP)) + return NULL; if (ops && ops->alloc_noncontiguous) sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 339a990554e7..a34c38bbe28f 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -300,6 +300,37 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, return; } +static void *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags, + int (*remap)(void *tlb, unsigned long nslabs)) +{ + size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); + void *tlb; + + /* + * By default allocate the bounce buffer memory from low memory, but + * allow to pick a location everywhere for hypervisors with guest + * memory encryption. + */ + if (flags & SWIOTLB_ANY) + tlb = memblock_alloc(bytes, PAGE_SIZE); + else + tlb = memblock_alloc_low(bytes, PAGE_SIZE); + + if (!tlb) { + pr_warn("%s: Failed to allocate %zu bytes tlb structure\n", + __func__, bytes); + return NULL; + } + + if (remap && remap(tlb, nslabs) < 0) { + memblock_free(tlb, PAGE_ALIGN(bytes)); + pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes); + return NULL; + } + + return tlb; +} + /* * Statically reserve bounce buffer space and initialize bounce buffer data * structures for the software IO TLB used to implement the DMA API. @@ -310,7 +341,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, struct io_tlb_mem *mem = &io_tlb_default_mem; unsigned long nslabs; size_t alloc_size; - size_t bytes; void *tlb; if (!addressing_limit && !swiotlb_force_bounce) @@ -326,31 +356,16 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags, swiotlb_adjust_nareas(num_possible_cpus()); nslabs = default_nslabs; - /* - * By default allocate the bounce buffer memory from low memory, but - * allow to pick a location everywhere for hypervisors with guest - * memory encryption. - */ -retry: - bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT); - if (flags & SWIOTLB_ANY) - tlb = memblock_alloc(bytes, PAGE_SIZE); - else - tlb = memblock_alloc_low(bytes, PAGE_SIZE); - if (!tlb) { - pr_warn("%s: failed to allocate tlb structure\n", __func__); - return; - } - - if (remap && remap(tlb, nslabs) < 0) { - memblock_free(tlb, PAGE_ALIGN(bytes)); - + while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) { + if (nslabs <= IO_TLB_MIN_SLABS) + return; nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); - if (nslabs >= IO_TLB_MIN_SLABS) - goto retry; + } - pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes); - return; + if (default_nslabs != nslabs) { + pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs", + default_nslabs, nslabs); + default_nslabs = nslabs; } alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); diff --git a/kernel/events/core.c b/kernel/events/core.c index 4ec3717003d5..eacc3702654d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -155,12 +155,6 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - static void perf_ctx_lock(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -184,6 +178,14 @@ static bool is_kernel_event(struct perf_event *event) return READ_ONCE(event->owner) == TASK_TOMBSTONE; } +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +struct perf_event_context *perf_cpu_task_ctx(void) +{ + lockdep_assert_irqs_disabled(); + return this_cpu_ptr(&perf_cpu_context)->task_ctx; +} + /* * On task ctx scheduling... * @@ -217,7 +219,7 @@ static int event_function(void *info) struct event_function_struct *efs = info; struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; int ret = 0; @@ -314,7 +316,7 @@ again: static void event_function_local(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct task_struct *task = READ_ONCE(ctx->task); struct perf_event_context *task_ctx = NULL; @@ -388,7 +390,6 @@ static DEFINE_MUTEX(perf_sched_mutex); static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(int, perf_sched_cb_usages); static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; @@ -448,7 +449,7 @@ static void update_perf_cpu_limits(void) WRITE_ONCE(perf_sample_allowed_ns, tmp); } -static bool perf_rotate_context(struct perf_cpu_context *cpuctx); +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc); int perf_proc_update_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -571,12 +572,6 @@ void perf_sample_event_took(u64 sample_len_ns) static atomic64_t perf_event_id; -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); @@ -691,13 +686,31 @@ do { \ ___p; \ }) +static void perf_ctx_disable(struct perf_event_context *ctx) +{ + struct perf_event_pmu_context *pmu_ctx; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + perf_pmu_disable(pmu_ctx->pmu); +} + +static void perf_ctx_enable(struct perf_event_context *ctx) +{ + struct perf_event_pmu_context *pmu_ctx; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + perf_pmu_enable(pmu_ctx->pmu); +} + +static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type); +static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type); + #ifdef CONFIG_CGROUP_PERF static inline bool perf_cgroup_match(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); /* @event doesn't care about cgroup */ if (!event->cgrp) @@ -823,54 +836,39 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) } } -static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); - /* * reschedule events based on the cgroup constraint of task. */ static void perf_cgroup_switch(struct task_struct *task) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_cgroup *cgrp; - struct perf_cpu_context *cpuctx, *tmp; - struct list_head *list; - unsigned long flags; - - /* - * Disable interrupts and preemption to avoid this CPU's - * cgrp_cpuctx_entry to change under us. - */ - local_irq_save(flags); cgrp = perf_cgroup_from_task(task, NULL); - list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { - WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); - if (READ_ONCE(cpuctx->cgrp) == cgrp) - continue; - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + if (READ_ONCE(cpuctx->cgrp) == cgrp) + return; - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to update_cgrp_time_from_cpuctx() in - * ctx_sched_out() - */ - cpuctx->cgrp = cgrp; - /* - * set cgrp before ctxsw in to allow - * perf_cgroup_set_timestamp() in ctx_sched_in() - * to not have to pass task around - */ - cpu_ctx_sched_in(cpuctx, EVENT_ALL); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_ctx_disable(&cpuctx->ctx); - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + ctx_sched_out(&cpuctx->ctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to update_cgrp_time_from_cpuctx() in + * ctx_sched_out() + */ + cpuctx->cgrp = cgrp; + /* + * set cgrp before ctxsw in to allow + * perf_cgroup_set_timestamp() in ctx_sched_in() + * to not have to pass task around + */ + ctx_sched_in(&cpuctx->ctx, EVENT_ALL); - local_irq_restore(flags); + perf_ctx_enable(&cpuctx->ctx); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } static int perf_cgroup_ensure_storage(struct perf_event *event, @@ -888,7 +886,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event, heap_size++; for_each_possible_cpu(cpu) { - cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); if (heap_size <= cpuctx->heap_size) continue; @@ -972,8 +970,6 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct return; cpuctx->cgrp = perf_cgroup_from_task(current, ctx); - list_add(&cpuctx->cgrp_cpuctx_entry, - per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); } static inline void @@ -994,7 +990,6 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c return; cpuctx->cgrp = NULL; - list_del(&cpuctx->cgrp_cpuctx_entry); } #else /* !CONFIG_CGROUP_PERF */ @@ -1069,34 +1064,30 @@ static void perf_cgroup_switch(struct task_struct *task) */ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_pmu_context *cpc; bool rotations; lockdep_assert_irqs_disabled(); - cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); - rotations = perf_rotate_context(cpuctx); + cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer); + rotations = perf_rotate_context(cpc); - raw_spin_lock(&cpuctx->hrtimer_lock); + raw_spin_lock(&cpc->hrtimer_lock); if (rotations) - hrtimer_forward_now(hr, cpuctx->hrtimer_interval); + hrtimer_forward_now(hr, cpc->hrtimer_interval); else - cpuctx->hrtimer_active = 0; - raw_spin_unlock(&cpuctx->hrtimer_lock); + cpc->hrtimer_active = 0; + raw_spin_unlock(&cpc->hrtimer_lock); return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; } -static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; + struct pmu *pmu = cpc->epc.pmu; u64 interval; - /* no multiplexing needed for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return; - /* * check default is sane, if not set then force to * default interval (1/tick) @@ -1105,34 +1096,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) if (interval < 1) interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - raw_spin_lock_init(&cpuctx->hrtimer_lock); + raw_spin_lock_init(&cpc->hrtimer_lock); hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); timer->function = perf_mux_hrtimer_handler; } -static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) +static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc) { - struct hrtimer *timer = &cpuctx->hrtimer; - struct pmu *pmu = cpuctx->ctx.pmu; + struct hrtimer *timer = &cpc->hrtimer; unsigned long flags; - /* not for SW PMU */ - if (pmu->task_ctx_nr == perf_sw_context) - return 0; - - raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); - if (!cpuctx->hrtimer_active) { - cpuctx->hrtimer_active = 1; - hrtimer_forward_now(timer, cpuctx->hrtimer_interval); + raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags); + if (!cpc->hrtimer_active) { + cpc->hrtimer_active = 1; + hrtimer_forward_now(timer, cpc->hrtimer_interval); hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); } - raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); + raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags); return 0; } +static int perf_mux_hrtimer_restart_ipi(void *arg) +{ + return perf_mux_hrtimer_restart(arg); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -1147,32 +1138,9 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static DEFINE_PER_CPU(struct list_head, active_ctx_list); - -/* - * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and - * perf_event_task_tick() are fully serialized because they're strictly cpu - * affine and perf_event_ctx{activate,deactivate} are called with IRQs - * disabled, while perf_event_task_tick is called from IRQ context. - */ -static void perf_event_ctx_activate(struct perf_event_context *ctx) -{ - struct list_head *head = this_cpu_ptr(&active_ctx_list); - - lockdep_assert_irqs_disabled(); - - WARN_ON(!list_empty(&ctx->active_ctx_list)); - - list_add(&ctx->active_ctx_list, head); -} - -static void perf_event_ctx_deactivate(struct perf_event_context *ctx) +static void perf_assert_pmu_disabled(struct pmu *pmu) { - lockdep_assert_irqs_disabled(); - - WARN_ON(list_empty(&ctx->active_ctx_list)); - - list_del_init(&ctx->active_ctx_list); + WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0); } static void get_ctx(struct perf_event_context *ctx) @@ -1199,7 +1167,6 @@ static void free_ctx(struct rcu_head *head) struct perf_event_context *ctx; ctx = container_of(head, struct perf_event_context, rcu_head); - free_task_ctx_data(ctx->pmu, ctx->task_ctx_data); kfree(ctx); } @@ -1384,7 +1351,7 @@ static u64 primary_event_id(struct perf_event *event) * the context could get moved to another task. */ static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) +perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_event_context *ctx; @@ -1400,7 +1367,7 @@ retry: */ local_irq_save(*flags); rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); + ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might @@ -1413,7 +1380,7 @@ retry: * can't get swapped on us any more. */ raw_spin_lock(&ctx->lock); - if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { + if (ctx != rcu_dereference(task->perf_event_ctxp)) { raw_spin_unlock(&ctx->lock); rcu_read_unlock(); local_irq_restore(*flags); @@ -1440,12 +1407,12 @@ retry: * reference count so that the context can't get freed. */ static struct perf_event_context * -perf_pin_task_context(struct task_struct *task, int ctxn) +perf_pin_task_context(struct task_struct *task) { struct perf_event_context *ctx; unsigned long flags; - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -1593,14 +1560,22 @@ static inline struct cgroup *event_cgroup(const struct perf_event *event) * which provides ordering when rotating groups for the same CPU. */ static __always_inline int -perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, - const u64 left_group_index, const struct perf_event *right) +perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu, + const struct cgroup *left_cgroup, const u64 left_group_index, + const struct perf_event *right) { if (left_cpu < right->cpu) return -1; if (left_cpu > right->cpu) return 1; + if (left_pmu) { + if (left_pmu < right->pmu_ctx->pmu) + return -1; + if (left_pmu > right->pmu_ctx->pmu) + return 1; + } + #ifdef CONFIG_CGROUP_PERF { const struct cgroup *right_cgroup = event_cgroup(right); @@ -1643,12 +1618,13 @@ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, static inline bool __group_less(struct rb_node *a, const struct rb_node *b) { struct perf_event *e = __node_2_pe(a); - return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index, - __node_2_pe(b)) < 0; + return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e), + e->group_index, __node_2_pe(b)) < 0; } struct __group_key { int cpu; + struct pmu *pmu; struct cgroup *cgroup; }; @@ -1657,14 +1633,25 @@ static inline int __group_cmp(const void *key, const struct rb_node *node) const struct __group_key *a = key; const struct perf_event *b = __node_2_pe(node); - /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */ - return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b); + /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b); +} + +static inline int +__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */ + return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b), + b->group_index, b); } /* - * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for - * key (see perf_event_groups_less). This places it last inside the CPU - * subtree. + * Insert @event into @groups' tree; using + * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index} + * as key. This places it last inside the {cpu,pmu,cgroup} subtree. */ static void perf_event_groups_insert(struct perf_event_groups *groups, @@ -1714,14 +1701,15 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx) } /* - * Get the leftmost event in the cpu/cgroup subtree. + * Get the leftmost event in the {cpu,pmu,cgroup} subtree. */ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, - struct cgroup *cgrp) + struct pmu *pmu, struct cgroup *cgrp) { struct __group_key key = { .cpu = cpu, + .pmu = pmu, .cgroup = cgrp, }; struct rb_node *node; @@ -1733,14 +1721,12 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu, return NULL; } -/* - * Like rb_entry_next_safe() for the @cpu subtree. - */ static struct perf_event * -perf_event_groups_next(struct perf_event *event) +perf_event_groups_next(struct perf_event *event, struct pmu *pmu) { struct __group_key key = { .cpu = event->cpu, + .pmu = pmu, .cgroup = event_cgroup(event), }; struct rb_node *next; @@ -1752,6 +1738,10 @@ perf_event_groups_next(struct perf_event *event) return NULL; } +#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \ + for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \ + event; event = perf_event_groups_next(event, pmu)) + /* * Iterate through the whole groups tree. */ @@ -1796,6 +1786,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) perf_cgroup_event_enable(event, ctx); ctx->generation++; + event->pmu_ctx->nr_events++; } /* @@ -1941,7 +1932,8 @@ static void perf_group_attach(struct perf_event *event) lockdep_assert_held(&event->ctx->lock); /* - * We can have double attach due to group movement in perf_event_open. + * We can have double attach due to group movement (move_group) in + * perf_event_open(). */ if (event->attach_state & PERF_ATTACH_GROUP) return; @@ -2006,6 +1998,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) } ctx->generation++; + event->pmu_ctx->nr_events--; } static int @@ -2022,13 +2015,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event) static void put_event(struct perf_event *event); static void event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, struct perf_event_context *ctx); static void perf_put_aux_event(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *iter; /* @@ -2057,7 +2048,7 @@ static void perf_put_aux_event(struct perf_event *event) * state so that we don't try to schedule it again. Note * that perf_event_enable() will clear the ERROR status. */ - event_sched_out(iter, cpuctx, ctx); + event_sched_out(iter, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } } @@ -2108,8 +2099,8 @@ static int perf_get_aux_event(struct perf_event *event, static inline struct list_head *get_event_list(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active; + return event->attr.pinned ? &event->pmu_ctx->pinned_active : + &event->pmu_ctx->flexible_active; } /* @@ -2120,10 +2111,7 @@ static inline struct list_head *get_event_list(struct perf_event *event) */ static inline void perf_remove_sibling_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, event->ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } @@ -2212,53 +2200,22 @@ static bool is_orphaned_event(struct perf_event *event) return event->state == PERF_EVENT_STATE_DEAD; } -static inline int __pmu_filter_match(struct perf_event *event) -{ - struct pmu *pmu = event->pmu; - return pmu->filter_match ? pmu->filter_match(event) : 1; -} - -/* - * Check whether we should attempt to schedule an event group based on - * PMU-specific filtering. An event group can consist of HW and SW events, - * potentially with a SW leader, so we must check all the filters, to - * determine whether a group is schedulable: - */ -static inline int pmu_filter_match(struct perf_event *event) -{ - struct perf_event *sibling; - unsigned long flags; - int ret = 1; - - if (!__pmu_filter_match(event)) - return 0; - - local_irq_save(flags); - for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) { - ret = 0; - break; - } - } - local_irq_restore(flags); - - return ret; -} - static inline int event_filter_match(struct perf_event *event) { return (event->cpu == -1 || event->cpu == smp_processor_id()) && - perf_cgroup_match(event) && pmu_filter_match(event); + perf_cgroup_match(event); } static void -event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_out(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); enum perf_event_state state = PERF_EVENT_STATE_INACTIVE; + // XXX cpc serialization, probably per-cpu IRQ disabled + WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -2291,6 +2248,7 @@ event_sched_out(struct perf_event *event, !event->pending_work) { event->pending_work = 1; dec = false; + WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount)); task_work_add(current, &event->pending_task, TWA_RESUME); } if (dec) @@ -2300,42 +2258,37 @@ event_sched_out(struct perf_event *event, perf_event_set_state(event, state); if (!is_software_event(event)) - cpuctx->active_oncpu--; - if (!--ctx->nr_active) - perf_event_ctx_deactivate(ctx); + cpc->active_oncpu--; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq--; - if (event->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; + if (event->attr.exclusive || !cpc->active_oncpu) + cpc->exclusive = 0; perf_pmu_enable(event->pmu); } static void -group_sched_out(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event; if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; - perf_pmu_disable(ctx->pmu); + perf_assert_pmu_disabled(group_event->pmu_ctx->pmu); - event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, ctx); /* * Schedule out siblings (if any): */ for_each_sibling_event(event, group_event) - event_sched_out(event, cpuctx, ctx); - - perf_pmu_enable(ctx->pmu); + event_sched_out(event, ctx); } #define DETACH_GROUP 0x01UL #define DETACH_CHILD 0x02UL +#define DETACH_DEAD 0x04UL /* * Cross CPU call to remove a performance event @@ -2349,6 +2302,7 @@ __perf_remove_from_context(struct perf_event *event, struct perf_event_context *ctx, void *info) { + struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx; unsigned long flags = (unsigned long)info; if (ctx->is_active & EVENT_TIME) { @@ -2356,19 +2310,38 @@ __perf_remove_from_context(struct perf_event *event, update_cgrp_time_from_cpuctx(cpuctx, false); } - event_sched_out(event, cpuctx, ctx); + /* + * Ensure event_sched_out() switches to OFF, at the very least + * this avoids raising perf_pending_task() at this time. + */ + if (flags & DETACH_DEAD) + event->pending_disable = 1; + event_sched_out(event, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); if (flags & DETACH_CHILD) perf_child_detach(event); list_del_event(event, ctx); + if (flags & DETACH_DEAD) + event->state = PERF_EVENT_STATE_DEAD; + + if (!pmu_ctx->nr_events) { + pmu_ctx->rotate_necessary = 0; + + if (ctx->task && ctx->is_active) { + struct perf_cpu_pmu_context *cpc; + + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = NULL; + } + } if (!ctx->nr_events && ctx->is_active) { if (ctx == &cpuctx->ctx) update_cgrp_time_from_cpuctx(cpuctx, true); ctx->is_active = 0; - ctx->rotate_necessary = 0; if (ctx->task) { WARN_ON_ONCE(cpuctx->task_ctx != ctx); cpuctx->task_ctx = NULL; @@ -2398,12 +2371,8 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); - /* - * Cgroup events are per-cpu events, and must IPI because of - * cgrp_cpuctx_list. - */ - if (!ctx->is_active && !is_cgroup_event(event)) { - __perf_remove_from_context(event, __get_cpu_context(ctx), + if (!ctx->is_active) { + __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); return; @@ -2429,13 +2398,17 @@ static void __perf_event_disable(struct perf_event *event, update_cgrp_time_from_event(event); } + perf_pmu_disable(event->pmu_ctx->pmu); + if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); + group_sched_out(event, ctx); else - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_OFF); perf_cgroup_event_disable(event, ctx); + + perf_pmu_enable(event->pmu_ctx->pmu); } /* @@ -2497,10 +2470,10 @@ static void perf_log_throttle(struct perf_event *event, int enable); static void perf_log_itrace_start(struct perf_event *event); static int -event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +event_sched_in(struct perf_event *event, struct perf_event_context *ctx) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); int ret = 0; WARN_ON_ONCE(event->ctx != ctx); @@ -2541,14 +2514,12 @@ event_sched_in(struct perf_event *event, } if (!is_software_event(event)) - cpuctx->active_oncpu++; - if (!ctx->nr_active++) - perf_event_ctx_activate(ctx); + cpc->active_oncpu++; if (event->attr.freq && event->attr.sample_freq) ctx->nr_freq++; if (event->attr.exclusive) - cpuctx->exclusive = 1; + cpc->exclusive = 1; out: perf_pmu_enable(event->pmu); @@ -2557,26 +2528,24 @@ out: } static int -group_sched_in(struct perf_event *group_event, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) +group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = ctx->pmu; + struct pmu *pmu = group_event->pmu_ctx->pmu; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) + if (event_sched_in(group_event, ctx)) goto error; /* * Schedule in siblings as one group (if any): */ for_each_sibling_event(event, group_event) { - if (event_sched_in(event, cpuctx, ctx)) { + if (event_sched_in(event, ctx)) { partial_group = event; goto group_error; } @@ -2595,9 +2564,9 @@ group_error: if (event == partial_group) break; - event_sched_out(event, cpuctx, ctx); + event_sched_out(event, ctx); } - event_sched_out(group_event, cpuctx, ctx); + event_sched_out(group_event, ctx); error: pmu->cancel_txn(pmu); @@ -2607,10 +2576,11 @@ error: /* * Work out whether we can put this event group on the CPU now. */ -static int group_can_go_on(struct perf_event *event, - struct perf_cpu_context *cpuctx, - int can_add_hw) +static int group_can_go_on(struct perf_event *event, int can_add_hw) { + struct perf_event_pmu_context *epc = event->pmu_ctx; + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context); + /* * Groups consisting entirely of software events can always go on. */ @@ -2620,7 +2590,7 @@ static int group_can_go_on(struct perf_event *event, * If an exclusive group is already on, no other hardware * events can go on. */ - if (cpuctx->exclusive) + if (cpc->exclusive) return 0; /* * If this group is exclusive and there are already @@ -2642,36 +2612,29 @@ static void add_event_to_ctx(struct perf_event *event, perf_group_attach(event); } -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type); -static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type); - -static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - enum event_type_t event_type) +static void task_ctx_sched_out(struct perf_event_context *ctx, + enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + if (!cpuctx->task_ctx) return; if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - ctx_sched_out(ctx, cpuctx, event_type); + ctx_sched_out(ctx, event_type); } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, EVENT_PINNED); + ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); + ctx_sched_in(ctx, EVENT_FLEXIBLE); } /* @@ -2689,11 +2652,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, * event_type is a bit mask of the types of events involved. For CPU events, * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE. */ +/* + * XXX: ctx_resched() reschedule entire perf_event_context while adding new + * event to the context or enabling existing event in the context. We can + * probably optimize it by rescheduling only affected pmu_ctx. + */ static void ctx_resched(struct perf_cpu_context *cpuctx, struct perf_event_context *task_ctx, enum event_type_t event_type) { - enum event_type_t ctx_event_type; bool cpu_event = !!(event_type & EVENT_CPU); /* @@ -2703,11 +2670,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, if (event_type & EVENT_PINNED) event_type |= EVENT_FLEXIBLE; - ctx_event_type = event_type & EVENT_ALL; + event_type &= EVENT_ALL; - perf_pmu_disable(cpuctx->ctx.pmu); - if (task_ctx) - task_ctx_sched_out(cpuctx, task_ctx, event_type); + perf_ctx_disable(&cpuctx->ctx); + if (task_ctx) { + perf_ctx_disable(task_ctx); + task_ctx_sched_out(task_ctx, event_type); + } /* * Decide which cpu ctx groups to schedule out based on the types @@ -2717,17 +2686,20 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, * - otherwise, do nothing more. */ if (cpu_event) - cpu_ctx_sched_out(cpuctx, ctx_event_type); - else if (ctx_event_type & EVENT_PINNED) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + ctx_sched_out(&cpuctx->ctx, event_type); + else if (event_type & EVENT_PINNED) + ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, task_ctx); - perf_pmu_enable(cpuctx->ctx.pmu); + + perf_ctx_enable(&cpuctx->ctx); + if (task_ctx) + perf_ctx_enable(task_ctx); } void perf_pmu_resched(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; perf_ctx_lock(cpuctx, task_ctx); @@ -2745,7 +2717,7 @@ static int __perf_install_in_context(void *info) { struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *task_ctx = cpuctx->task_ctx; bool reprogram = true; int ret = 0; @@ -2787,7 +2759,7 @@ static int __perf_install_in_context(void *info) #endif if (reprogram) { - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); add_event_to_ctx(event, ctx); ctx_resched(cpuctx, task_ctx, get_event_type(event)); } else { @@ -2820,7 +2792,7 @@ perf_install_in_context(struct perf_event_context *ctx, WARN_ON_ONCE(!exclusive_event_installable(event, ctx)); if (event->cpu != -1) - event->cpu = cpu; + WARN_ON_ONCE(event->cpu != cpu); /* * Ensures that if we can observe event->ctx, both the event and ctx @@ -2832,8 +2804,6 @@ perf_install_in_context(struct perf_event_context *ctx, * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. - * Similarly, cgroup events for the context also needs the IPI to - * manipulate the cgrp_cpuctx_list. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. @@ -2935,7 +2905,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (ctx->is_active) - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE); perf_cgroup_event_enable(event, ctx); @@ -2944,7 +2914,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (!event_filter_match(event)) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME); + ctx_sched_in(ctx, EVENT_TIME); return; } @@ -2953,7 +2923,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME); + ctx_sched_in(ctx, EVENT_TIME); return; } @@ -3222,11 +3192,52 @@ out: return err; } -static void ctx_sched_out(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx, + enum event_type_t event_type) { + struct perf_event_context *ctx = pmu_ctx->ctx; struct perf_event *event, *tmp; + struct pmu *pmu = pmu_ctx->pmu; + + if (ctx->task && !ctx->is_active) { + struct perf_cpu_pmu_context *cpc; + + cpc = this_cpu_ptr(pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = NULL; + } + + if (!event_type) + return; + + perf_pmu_disable(pmu); + if (event_type & EVENT_PINNED) { + list_for_each_entry_safe(event, tmp, + &pmu_ctx->pinned_active, + active_list) + group_sched_out(event, ctx); + } + + if (event_type & EVENT_FLEXIBLE) { + list_for_each_entry_safe(event, tmp, + &pmu_ctx->flexible_active, + active_list) + group_sched_out(event, ctx); + /* + * Since we cleared EVENT_FLEXIBLE, also clear + * rotate_necessary, is will be reset by + * ctx_flexible_sched_in() when needed. + */ + pmu_ctx->rotate_necessary = 0; + } + perf_pmu_enable(pmu); +} + +static void +ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *pmu_ctx; int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); @@ -3274,27 +3285,8 @@ static void ctx_sched_out(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (!ctx->nr_active || !(is_active & EVENT_ALL)) - return; - - perf_pmu_disable(ctx->pmu); - if (is_active & EVENT_PINNED) { - list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) - group_sched_out(event, cpuctx, ctx); - } - - if (is_active & EVENT_FLEXIBLE) { - list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list) - group_sched_out(event, cpuctx, ctx); - - /* - * Since we cleared EVENT_FLEXIBLE, also clear - * rotate_necessary, is will be reset by - * ctx_flexible_sched_in() when needed. - */ - ctx->rotate_necessary = 0; - } - perf_pmu_enable(ctx->pmu); + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) + __pmu_ctx_sched_out(pmu_ctx, is_active); } /* @@ -3399,26 +3391,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -static void perf_event_context_sched_out(struct task_struct *task, int ctxn, - struct task_struct *next) +#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \ + for (pos1 = list_first_entry(head1, typeof(*pos1), member), \ + pos2 = list_first_entry(head2, typeof(*pos2), member); \ + !list_entry_is_head(pos1, head1, member) && \ + !list_entry_is_head(pos2, head2, member); \ + pos1 = list_next_entry(pos1, member), \ + pos2 = list_next_entry(pos2, member)) + +static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx, + struct perf_event_context *next_ctx) { - struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; + struct perf_event_pmu_context *prev_epc, *next_epc; + + if (!prev_ctx->nr_task_data) + return; + + double_list_for_each_entry(prev_epc, next_epc, + &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list, + pmu_ctx_entry) { + + if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu)) + continue; + + /* + * PMU specific parts of task perf context can require + * additional synchronization. As an example of such + * synchronization see implementation details of Intel + * LBR call stack data profiling; + */ + if (prev_epc->pmu->swap_task_ctx) + prev_epc->pmu->swap_task_ctx(prev_epc, next_epc); + else + swap(prev_epc->task_ctx_data, next_epc->task_ctx_data); + } +} + +static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in) +{ + struct perf_event_pmu_context *pmu_ctx; + struct perf_cpu_pmu_context *cpc; + + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + + if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task) + pmu_ctx->pmu->sched_task(pmu_ctx, sched_in); + } +} + +static void +perf_event_context_sched_out(struct task_struct *task, struct task_struct *next) +{ + struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent, *next_parent; - struct perf_cpu_context *cpuctx; int do_switch = 1; - struct pmu *pmu; if (likely(!ctx)) return; - pmu = ctx->pmu; - cpuctx = __get_cpu_context(ctx); - if (!cpuctx->task_ctx) - return; - rcu_read_lock(); - next_ctx = next->perf_event_ctxp[ctxn]; + next_ctx = rcu_dereference(next->perf_event_ctxp); if (!next_ctx) goto unlock; @@ -3443,7 +3477,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - perf_pmu_disable(pmu); + perf_ctx_disable(ctx); /* PMIs are disabled; ctx->nr_pending is stable. */ if (local_read(&ctx->nr_pending) || @@ -3460,21 +3494,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, WRITE_ONCE(ctx->task, next); WRITE_ONCE(next_ctx->task, task); - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(ctx, false); + perf_ctx_sched_task_cb(ctx, false); + perf_event_swap_task_ctx_data(ctx, next_ctx); - /* - * PMU specific parts of task perf context can require - * additional synchronization. As an example of such - * synchronization see implementation details of Intel - * LBR call stack data profiling; - */ - if (pmu->swap_task_ctx) - pmu->swap_task_ctx(ctx, next_ctx); - else - swap(ctx->task_ctx_data, next_ctx->task_ctx_data); - - perf_pmu_enable(pmu); + perf_ctx_enable(ctx); /* * RCU_INIT_POINTER here is safe because we've not @@ -3483,8 +3506,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, * since those values are always verified under * ctx->lock which we're now holding. */ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); - RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp, ctx); do_switch = 0; @@ -3498,38 +3521,40 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - perf_pmu_disable(pmu); + perf_ctx_disable(ctx); inside_switch: - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(ctx, false); - task_ctx_sched_out(cpuctx, ctx, EVENT_ALL); + perf_ctx_sched_task_cb(ctx, false); + task_ctx_sched_out(ctx, EVENT_ALL); - perf_pmu_enable(pmu); + perf_ctx_enable(ctx); raw_spin_unlock(&ctx->lock); } } static DEFINE_PER_CPU(struct list_head, sched_cb_list); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); void perf_sched_cb_dec(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); this_cpu_dec(perf_sched_cb_usages); + barrier(); - if (!--cpuctx->sched_cb_usage) - list_del(&cpuctx->sched_cb_entry); + if (!--cpc->sched_cb_usage) + list_del(&cpc->sched_cb_entry); } void perf_sched_cb_inc(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context); - if (!cpuctx->sched_cb_usage++) - list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + if (!cpc->sched_cb_usage++) + list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + barrier(); this_cpu_inc(perf_sched_cb_usages); } @@ -3541,19 +3566,21 @@ void perf_sched_cb_inc(struct pmu *pmu) * PEBS requires this to provide PID/TID information. This requires we flush * all queued PEBS records before we context switch to a new task. */ -static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in) +static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu; - pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */ + pmu = cpc->epc.pmu; + /* software PMUs will not have sched_task */ if (WARN_ON_ONCE(!pmu->sched_task)) return; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); - pmu->sched_task(cpuctx->task_ctx, sched_in); + pmu->sched_task(cpc->task_epc, sched_in); perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -3563,26 +3590,20 @@ static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, bool sched_in) { - struct perf_cpu_context *cpuctx; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_cpu_pmu_context *cpc; - if (prev == next) + /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */ + if (prev == next || cpuctx->task_ctx) return; - list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { - /* will be handled in perf_event_context_sched_in/out */ - if (cpuctx->task_ctx) - continue; - - __perf_pmu_sched_task(cpuctx, sched_in); - } + list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry) + __perf_pmu_sched_task(cpc, sched_in); } static void perf_event_switch(struct task_struct *task, struct task_struct *next_prev, bool sched_in); -#define for_each_task_context_nr(ctxn) \ - for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) - /* * Called from scheduler to remove the events of the current task, * with interrupts disabled. @@ -3597,16 +3618,13 @@ static void perf_event_switch(struct task_struct *task, void __perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { - int ctxn; - if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); if (atomic_read(&nr_switch_events)) perf_event_switch(task, next, false); - for_each_task_context_nr(ctxn) - perf_event_context_sched_out(task, ctxn, next); + perf_event_context_sched_out(task, next); /* * if cgroup events exist on this CPU, then we need @@ -3617,15 +3635,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_switch(next); } -/* - * Called with IRQs disabled - */ -static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) -{ - ctx_sched_out(&cpuctx->ctx, cpuctx, event_type); -} - static bool perf_less_group_idx(const void *l, const void *r) { const struct perf_event *le = *(const struct perf_event **)l; @@ -3657,21 +3666,39 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event) } } -static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, +static void __link_epc(struct perf_event_pmu_context *pmu_ctx) +{ + struct perf_cpu_pmu_context *cpc; + + if (!pmu_ctx->ctx->task) + return; + + cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context); + WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx); + cpc->task_epc = pmu_ctx; +} + +static noinline int visit_groups_merge(struct perf_event_context *ctx, struct perf_event_groups *groups, int cpu, + struct pmu *pmu, int (*func)(struct perf_event *, void *), void *data) { #ifdef CONFIG_CGROUP_PERF struct cgroup_subsys_state *css = NULL; #endif + struct perf_cpu_context *cpuctx = NULL; /* Space for per CPU and/or any CPU event iterators. */ struct perf_event *itrs[2]; struct min_heap event_heap; struct perf_event **evt; int ret; - if (cpuctx) { + if (pmu->filter && pmu->filter(pmu, cpu)) + return 0; + + if (!ctx->task) { + cpuctx = this_cpu_ptr(&perf_cpu_context); event_heap = (struct min_heap){ .data = cpuctx->heap, .nr = 0, @@ -3691,17 +3718,22 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, .size = ARRAY_SIZE(itrs), }; /* Events not within a CPU context may be on any CPU. */ - __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL)); + __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL)); } evt = event_heap.data; - __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL)); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL)); #ifdef CONFIG_CGROUP_PERF for (; css; css = css->parent) - __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup)); + __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup)); #endif + if (event_heap.nr) { + __link_epc((*evt)->pmu_ctx); + perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu); + } + min_heapify_all(&event_heap, &perf_min_heap); while (event_heap.nr) { @@ -3709,7 +3741,7 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, if (ret) return ret; - *evt = perf_event_groups_next(*evt); + *evt = perf_event_groups_next(*evt, pmu); if (*evt) min_heapify(&event_heap, 0, &perf_min_heap); else @@ -3751,7 +3783,6 @@ static inline void group_update_userpage(struct perf_event *group_event) static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int *can_add_hw = data; if (event->state <= PERF_EVENT_STATE_OFF) @@ -3760,8 +3791,8 @@ static int merge_sched_in(struct perf_event *event, void *data) if (!event_filter_match(event)) return 0; - if (group_can_go_on(event, cpuctx, *can_add_hw)) { - if (!group_sched_in(event, cpuctx, ctx)) + if (group_can_go_on(event, *can_add_hw)) { + if (!group_sched_in(event, ctx)) list_add_tail(&event->active_list, get_event_list(event)); } @@ -3771,8 +3802,11 @@ static int merge_sched_in(struct perf_event *event, void *data) perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); } else { - ctx->rotate_necessary = 1; - perf_mux_hrtimer_restart(cpuctx); + struct perf_cpu_pmu_context *cpc; + + event->pmu_ctx->rotate_necessary = 1; + cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context); + perf_mux_hrtimer_restart(cpc); group_update_userpage(event); } } @@ -3780,39 +3814,53 @@ static int merge_sched_in(struct perf_event *event, void *data) return 0; } -static void -ctx_pinned_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { + struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; - if (ctx != &cpuctx->ctx) - cpuctx = NULL; - - visit_groups_merge(cpuctx, &ctx->pinned_groups, - smp_processor_id(), - merge_sched_in, &can_add_hw); + if (pmu) { + visit_groups_merge(ctx, &ctx->pinned_groups, + smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); + } else { + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + can_add_hw = 1; + visit_groups_merge(ctx, &ctx->pinned_groups, + smp_processor_id(), pmu_ctx->pmu, + merge_sched_in, &can_add_hw); + } + } } -static void -ctx_flexible_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx) +static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu) { + struct perf_event_pmu_context *pmu_ctx; int can_add_hw = 1; - if (ctx != &cpuctx->ctx) - cpuctx = NULL; + if (pmu) { + visit_groups_merge(ctx, &ctx->flexible_groups, + smp_processor_id(), pmu, + merge_sched_in, &can_add_hw); + } else { + list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) { + can_add_hw = 1; + visit_groups_merge(ctx, &ctx->flexible_groups, + smp_processor_id(), pmu_ctx->pmu, + merge_sched_in, &can_add_hw); + } + } +} - visit_groups_merge(cpuctx, &ctx->flexible_groups, - smp_processor_id(), - merge_sched_in, &can_add_hw); +static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu) +{ + ctx_flexible_sched_in(ctx, pmu); } static void -ctx_sched_in(struct perf_event_context *ctx, - struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); int is_active = ctx->is_active; lockdep_assert_held(&ctx->lock); @@ -3846,39 +3894,32 @@ ctx_sched_in(struct perf_event_context *ctx, * in order to give them the best chance of going on. */ if (is_active & EVENT_PINNED) - ctx_pinned_sched_in(ctx, cpuctx); + ctx_pinned_sched_in(ctx, NULL); /* Then walk through the lower prio flexible groups */ if (is_active & EVENT_FLEXIBLE) - ctx_flexible_sched_in(ctx, cpuctx); + ctx_flexible_sched_in(ctx, NULL); } -static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type) +static void perf_event_context_sched_in(struct task_struct *task) { - struct perf_event_context *ctx = &cpuctx->ctx; - - ctx_sched_in(ctx, cpuctx, event_type); -} + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; -static void perf_event_context_sched_in(struct perf_event_context *ctx, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; + rcu_read_lock(); + ctx = rcu_dereference(task->perf_event_ctxp); + if (!ctx) + goto rcu_unlock; - cpuctx = __get_cpu_context(ctx); + if (cpuctx->task_ctx == ctx) { + perf_ctx_lock(cpuctx, ctx); + perf_ctx_disable(ctx); - /* - * HACK: for HETEROGENEOUS the task context might have switched to a - * different PMU, force (re)set the context, - */ - pmu = ctx->pmu = cpuctx->ctx.pmu; + perf_ctx_sched_task_cb(ctx, true); - if (cpuctx->task_ctx == ctx) { - if (cpuctx->sched_cb_usage) - __perf_pmu_sched_task(cpuctx, true); - return; + perf_ctx_enable(ctx); + perf_ctx_unlock(cpuctx, ctx); + goto rcu_unlock; } perf_ctx_lock(cpuctx, ctx); @@ -3889,7 +3930,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, if (!ctx->nr_events) goto unlock; - perf_pmu_disable(pmu); + perf_ctx_disable(ctx); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -3898,17 +3939,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * However, if task's ctx is not carrying any pinned * events, no need to flip the cpuctx's events around. */ - if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { + perf_ctx_disable(&cpuctx->ctx); + ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE); + } + perf_event_sched_in(cpuctx, ctx); - if (cpuctx->sched_cb_usage && pmu->sched_task) - pmu->sched_task(cpuctx->task_ctx, true); + perf_ctx_sched_task_cb(cpuctx->task_ctx, true); - perf_pmu_enable(pmu); + if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) + perf_ctx_enable(&cpuctx->ctx); + + perf_ctx_enable(ctx); unlock: perf_ctx_unlock(cpuctx, ctx); +rcu_unlock: + rcu_read_unlock(); } /* @@ -3925,16 +3973,7 @@ unlock: void __perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - struct perf_event_context *ctx; - int ctxn; - - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (likely(!ctx)) - continue; - - perf_event_context_sched_in(ctx, task); - } + perf_event_context_sched_in(task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); @@ -4053,8 +4092,8 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo * events. At the same time, make sure, having freq events does not change * the rate of unthrottling as that would introduce bias. */ -static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, - int needs_unthr) +static void +perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) { struct perf_event *event; struct hw_perf_event *hwc; @@ -4066,16 +4105,16 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, * - context have events in frequency mode (needs freq adjust) * - there are events to unthrottle on this cpu */ - if (!(ctx->nr_freq || needs_unthr)) + if (!(ctx->nr_freq || unthrottle)) return; raw_spin_lock(&ctx->lock); - perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; + // XXX use visit thingy to avoid the -1,cpu match if (!event_filter_match(event)) continue; @@ -4116,7 +4155,6 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, perf_pmu_enable(event->pmu); } - perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4138,72 +4176,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event) /* pick an event from the flexible_groups to rotate */ static inline struct perf_event * -ctx_event_to_rotate(struct perf_event_context *ctx) +ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx) { struct perf_event *event; + struct rb_node *node; + struct rb_root *tree; + struct __group_key key = { + .pmu = pmu_ctx->pmu, + }; /* pick the first active flexible event */ - event = list_first_entry_or_null(&ctx->flexible_active, + event = list_first_entry_or_null(&pmu_ctx->flexible_active, struct perf_event, active_list); + if (event) + goto out; /* if no active flexible event, pick the first event */ - if (!event) { - event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree), - typeof(*event), group_node); + tree = &pmu_ctx->ctx->flexible_groups.tree; + + if (!pmu_ctx->ctx->task) { + key.cpu = smp_processor_id(); + + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + goto out; } + key.cpu = -1; + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) { + event = __node_2_pe(node); + goto out; + } + + key.cpu = smp_processor_id(); + node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup); + if (node) + event = __node_2_pe(node); + +out: /* * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in() * finds there are unschedulable events, it will set it again. */ - ctx->rotate_necessary = 0; + pmu_ctx->rotate_necessary = 0; return event; } -static bool perf_rotate_context(struct perf_cpu_context *cpuctx) +static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_pmu_context *cpu_epc, *task_epc = NULL; struct perf_event *cpu_event = NULL, *task_event = NULL; - struct perf_event_context *task_ctx = NULL; int cpu_rotate, task_rotate; + struct pmu *pmu; /* * Since we run this from IRQ context, nobody can install new * events, thus the event count values are stable. */ - cpu_rotate = cpuctx->ctx.rotate_necessary; - task_ctx = cpuctx->task_ctx; - task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; + cpu_epc = &cpc->epc; + pmu = cpu_epc->pmu; + task_epc = cpc->task_epc; + + cpu_rotate = cpu_epc->rotate_necessary; + task_rotate = task_epc ? task_epc->rotate_necessary : 0; if (!(cpu_rotate || task_rotate)) return false; perf_ctx_lock(cpuctx, cpuctx->task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); + perf_pmu_disable(pmu); if (task_rotate) - task_event = ctx_event_to_rotate(task_ctx); + task_event = ctx_event_to_rotate(task_epc); if (cpu_rotate) - cpu_event = ctx_event_to_rotate(&cpuctx->ctx); + cpu_event = ctx_event_to_rotate(cpu_epc); /* * As per the order given at ctx_resched() first 'pop' task flexible * and then, if needed CPU flexible. */ - if (task_event || (task_ctx && cpu_event)) - ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); - if (cpu_event) - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (task_event || (task_epc && cpu_event)) { + update_context_time(task_epc->ctx); + __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE); + } - if (task_event) - rotate_ctx(task_ctx, task_event); - if (cpu_event) + if (cpu_event) { + update_context_time(&cpuctx->ctx); + __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE); rotate_ctx(&cpuctx->ctx, cpu_event); + __pmu_ctx_sched_in(&cpuctx->ctx, pmu); + } - perf_event_sched_in(cpuctx, task_ctx); + if (task_event) + rotate_ctx(task_epc->ctx, task_event); + + if (task_event || (task_epc && cpu_event)) + __pmu_ctx_sched_in(task_epc->ctx, pmu); - perf_pmu_enable(cpuctx->ctx.pmu); + perf_pmu_enable(pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); return true; @@ -4211,8 +4286,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) void perf_event_task_tick(void) { - struct list_head *head = this_cpu_ptr(&active_ctx_list); - struct perf_event_context *ctx, *tmp; + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); + struct perf_event_context *ctx; int throttled; lockdep_assert_irqs_disabled(); @@ -4221,8 +4296,13 @@ void perf_event_task_tick(void) throttled = __this_cpu_xchg(perf_throttled_count, 0); tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); - list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) - perf_adjust_freq_unthr_context(ctx, throttled); + perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled); + + rcu_read_lock(); + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_adjust_freq_unthr_context(ctx, !!throttled); + rcu_read_unlock(); } static int event_enable_on_exec(struct perf_event *event, @@ -4244,9 +4324,9 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_event_enable_on_exec(int ctxn) +static void perf_event_enable_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event_context *clone_ctx = NULL; enum event_type_t event_type = 0; struct perf_cpu_context *cpuctx; struct perf_event *event; @@ -4254,13 +4334,16 @@ static void perf_event_enable_on_exec(int ctxn) int enabled = 0; local_irq_save(flags); - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx || !ctx->nr_events) + if (WARN_ON_ONCE(current->perf_event_ctxp != ctx)) goto out; - cpuctx = __get_cpu_context(ctx); + if (!ctx->nr_events) + goto out; + + cpuctx = this_cpu_ptr(&perf_cpu_context); perf_ctx_lock(cpuctx, ctx); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); + list_for_each_entry(event, &ctx->event_list, event_entry) { enabled |= event_enable_on_exec(event, ctx); event_type |= get_event_type(event); @@ -4273,7 +4356,7 @@ static void perf_event_enable_on_exec(int ctxn) clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME); + ctx_sched_in(ctx, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); @@ -4292,17 +4375,13 @@ static void perf_event_exit_event(struct perf_event *event, * Removes all events from the current task that have been marked * remove-on-exec, and feeds their values back to parent events. */ -static void perf_event_remove_on_exec(int ctxn) +static void perf_event_remove_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_event_context *clone_ctx = NULL; struct perf_event *event, *next; unsigned long flags; bool modified = false; - ctx = perf_pin_task_context(current, ctxn); - if (!ctx) - return; - mutex_lock(&ctx->mutex); if (WARN_ON_ONCE(ctx->task != current)) @@ -4323,13 +4402,11 @@ static void perf_event_remove_on_exec(int ctxn) raw_spin_lock_irqsave(&ctx->lock, flags); if (modified) clone_ctx = unclone_ctx(ctx); - --ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); unlock: mutex_unlock(&ctx->mutex); - put_ctx(ctx); if (clone_ctx) put_ctx(clone_ctx); } @@ -4365,7 +4442,7 @@ static void __perf_event_read(void *info) struct perf_read_data *data = info; struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct pmu *pmu = event->pmu; /* @@ -4591,17 +4668,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->active_ctx_list); + INIT_LIST_HEAD(&ctx->pmu_ctx_list); perf_event_groups_init(&ctx->pinned_groups); perf_event_groups_init(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); - INIT_LIST_HEAD(&ctx->pinned_active); - INIT_LIST_HEAD(&ctx->flexible_active); refcount_set(&ctx->refcount, 1); } +static void +__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu) +{ + epc->pmu = pmu; + INIT_LIST_HEAD(&epc->pmu_ctx_entry); + INIT_LIST_HEAD(&epc->pinned_active); + INIT_LIST_HEAD(&epc->flexible_active); + atomic_set(&epc->refcount, 1); +} + static struct perf_event_context * -alloc_perf_context(struct pmu *pmu, struct task_struct *task) +alloc_perf_context(struct task_struct *task) { struct perf_event_context *ctx; @@ -4612,7 +4697,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) __perf_event_init_context(ctx); if (task) ctx->task = get_task_struct(task); - ctx->pmu = pmu; return ctx; } @@ -4641,15 +4725,12 @@ find_lively_task_by_vpid(pid_t vpid) * Returns a matching context with refcount and pincount. */ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, - struct perf_event *event) +find_get_context(struct task_struct *task, struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; - void *task_ctx_data = NULL; unsigned long flags; - int ctxn, err; - int cpu = event->cpu; + int err; if (!task) { /* Must be root to operate on a CPU event: */ @@ -4657,7 +4738,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, if (err) return ERR_PTR(err); - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); ctx = &cpuctx->ctx; get_ctx(ctx); raw_spin_lock_irqsave(&ctx->lock, flags); @@ -4668,43 +4749,22 @@ find_get_context(struct pmu *pmu, struct task_struct *task, } err = -EINVAL; - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto errout; - - if (event->attach_state & PERF_ATTACH_TASK_DATA) { - task_ctx_data = alloc_task_ctx_data(pmu); - if (!task_ctx_data) { - err = -ENOMEM; - goto errout; - } - } - retry: - ctx = perf_lock_task_context(task, ctxn, &flags); + ctx = perf_lock_task_context(task, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; - if (task_ctx_data && !ctx->task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) put_ctx(clone_ctx); } else { - ctx = alloc_perf_context(pmu, task); + ctx = alloc_perf_context(task); err = -ENOMEM; if (!ctx) goto errout; - if (task_ctx_data) { - ctx->task_ctx_data = task_ctx_data; - task_ctx_data = NULL; - } - err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -4713,12 +4773,12 @@ retry: */ if (task->flags & PF_EXITING) err = -ESRCH; - else if (task->perf_event_ctxp[ctxn]) + else if (task->perf_event_ctxp) err = -EAGAIN; else { get_ctx(ctx); ++ctx->pin_count; - rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + rcu_assign_pointer(task->perf_event_ctxp, ctx); } mutex_unlock(&task->perf_event_mutex); @@ -4731,21 +4791,146 @@ retry: } } - free_task_ctx_data(pmu, task_ctx_data); return ctx; errout: - free_task_ctx_data(pmu, task_ctx_data); return ERR_PTR(err); } +static struct perf_event_pmu_context * +find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx, + struct perf_event *event) +{ + struct perf_event_pmu_context *new = NULL, *epc; + void *task_ctx_data = NULL; + + if (!ctx->task) { + struct perf_cpu_pmu_context *cpc; + + cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu); + epc = &cpc->epc; + + if (!epc->ctx) { + atomic_set(&epc->refcount, 1); + epc->embedded = 1; + raw_spin_lock_irq(&ctx->lock); + list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + epc->ctx = ctx; + raw_spin_unlock_irq(&ctx->lock); + } else { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + } + + return epc; + } + + new = kzalloc(sizeof(*epc), GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + task_ctx_data = alloc_task_ctx_data(pmu); + if (!task_ctx_data) { + kfree(new); + return ERR_PTR(-ENOMEM); + } + } + + __perf_init_event_pmu_context(new, pmu); + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because perf_event_init_task() doesn't actually hold the + * child_ctx->mutex. + */ + + raw_spin_lock_irq(&ctx->lock); + list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) { + if (epc->pmu == pmu) { + WARN_ON_ONCE(epc->ctx != ctx); + atomic_inc(&epc->refcount); + goto found_epc; + } + } + + epc = new; + new = NULL; + + list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list); + epc->ctx = ctx; + +found_epc: + if (task_ctx_data && !epc->task_ctx_data) { + epc->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + ctx->nr_task_data++; + } + raw_spin_unlock_irq(&ctx->lock); + + free_task_ctx_data(pmu, task_ctx_data); + kfree(new); + + return epc; +} + +static void get_pmu_ctx(struct perf_event_pmu_context *epc) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount)); +} + +static void free_epc_rcu(struct rcu_head *head) +{ + struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head); + + kfree(epc->task_ctx_data); + kfree(epc); +} + +static void put_pmu_ctx(struct perf_event_pmu_context *epc) +{ + unsigned long flags; + + if (!atomic_dec_and_test(&epc->refcount)) + return; + + if (epc->ctx) { + struct perf_event_context *ctx = epc->ctx; + + /* + * XXX + * + * lockdep_assert_held(&ctx->mutex); + * + * can't because of the call-site in _free_event()/put_event() + * which isn't always called under ctx->mutex. + */ + + WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry)); + raw_spin_lock_irqsave(&ctx->lock, flags); + list_del_init(&epc->pmu_ctx_entry); + epc->ctx = NULL; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + + WARN_ON_ONCE(!list_empty(&epc->pinned_active)); + WARN_ON_ONCE(!list_empty(&epc->flexible_active)); + + if (epc->embedded) + return; + + call_rcu(&epc->rcu_head, free_epc_rcu); +} + static void perf_event_free_filter(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { - struct perf_event *event; + struct perf_event *event = container_of(head, typeof(*event), rcu_head); - event = container_of(head, struct perf_event, rcu_head); if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); @@ -4883,7 +5068,7 @@ static void perf_sched_delayed(struct work_struct *work) * * 1) cpu-wide events in the presence of per-task events, * 2) per-task events in the presence of cpu-wide events, - * 3) two matching events on the same context. + * 3) two matching events on the same perf_event_context. * * The former two cases are handled in the allocation path (perf_event_alloc(), * _free_event()), the latter -- before the first perf_install_in_context(). @@ -5007,6 +5192,9 @@ static void _free_event(struct perf_event *event) if (event->hw.target) put_task_struct(event->hw.target); + if (event->pmu_ctx) + put_pmu_ctx(event->pmu_ctx); + /* * perf_event_free_task() relies on put_ctx() being 'last', in particular * all task references must be cleaned up. @@ -5107,8 +5295,8 @@ int perf_event_release_kernel(struct perf_event *event) LIST_HEAD(free_list); /* - * If we got here through err_file: fput(event_file); we will not have - * attached to a context yet. + * If we got here through err_alloc: free_event(event); we will not + * have attached to a context yet. */ if (!ctx) { WARN_ON_ONCE(event->attach_state & @@ -5121,9 +5309,7 @@ int perf_event_release_kernel(struct perf_event *event) ctx = perf_event_ctx_lock(event); WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, DETACH_GROUP); - raw_spin_lock_irq(&ctx->lock); /* * Mark this event as STATE_DEAD, there is no external reference to it * anymore. @@ -5135,8 +5321,7 @@ int perf_event_release_kernel(struct perf_event *event) * Thus this guarantees that we will in fact observe and kill _ALL_ * child events. */ - event->state = PERF_EVENT_STATE_DEAD; - raw_spin_unlock_irq(&ctx->lock); + perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD); perf_event_ctx_unlock(event, ctx); @@ -5543,7 +5728,7 @@ static void __perf_event_period(struct perf_event *event, active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { - perf_pmu_disable(ctx->pmu); + perf_pmu_disable(event->pmu); /* * We could be throttled; unthrottle now to avoid the tick * trying to unthrottle while we already re-started the event. @@ -5559,7 +5744,7 @@ static void __perf_event_period(struct perf_event *event, if (active) { event->pmu->start(event, PERF_EF_RELOAD); - perf_pmu_enable(ctx->pmu); + perf_pmu_enable(event->pmu); } } @@ -6577,6 +6762,8 @@ static void perf_pending_task(struct callback_head *head) if (rctx >= 0) perf_swevent_put_recursion_context(rctx); preempt_enable_notrace(); + + put_event(event); } #ifdef CONFIG_GUEST_PERF_EVENTS @@ -7306,7 +7493,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) return pud_leaf_size(pud); pmdp = pmd_offset_lockless(pudp, pud, addr); - pmd = READ_ONCE(*pmdp); + pmd = pmdp_get_lockless(pmdp); if (!pmd_present(pmd)) return 0; @@ -7720,7 +7907,6 @@ perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { struct perf_event_context *ctx; - int ctxn; rcu_read_lock(); preempt_disable(); @@ -7737,11 +7923,9 @@ perf_iterate_sb(perf_iterate_f output, void *data, perf_iterate_sb_cpu(output, data); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) - perf_iterate_ctx(ctx, output, data, false); - } + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_iterate_ctx(ctx, output, data, false); done: preempt_enable(); rcu_read_unlock(); @@ -7783,20 +7967,17 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) void perf_event_exec(void) { struct perf_event_context *ctx; - int ctxn; - for_each_task_context_nr(ctxn) { - perf_event_enable_on_exec(ctxn); - perf_event_remove_on_exec(ctxn); + ctx = perf_pin_task_context(current); + if (!ctx) + return; - rcu_read_lock(); - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (ctx) { - perf_iterate_ctx(ctx, perf_event_addr_filters_exec, - NULL, true); - } - rcu_read_unlock(); - } + perf_event_enable_on_exec(ctx); + perf_event_remove_on_exec(ctx); + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); + + perf_unpin_context(ctx); + put_ctx(ctx); } struct remote_output { @@ -7836,8 +8017,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; - struct pmu *pmu = event->ctx->pmu; - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct remote_output ro = { .rb = event->rb, }; @@ -8626,7 +8806,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) static void perf_addr_filters_adjust(struct vm_area_struct *vma) { struct perf_event_context *ctx; - int ctxn; /* * Data tracing isn't supported yet and as such there is no need @@ -8636,13 +8815,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) return; rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); - if (!ctx) - continue; - + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); - } rcu_read_unlock(); } @@ -9030,7 +9205,7 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, PERF_RECORD_KSYMBOL_TYPE_BPF, (u64)(unsigned long)subprog->bpf_func, subprog->jited_len, unregister, - prog->aux->ksym.name); + subprog->aux->ksym.name); } } } @@ -9273,6 +9448,19 @@ int perf_event_account_interrupt(struct perf_event *event) return __perf_event_account_interrupt(event, 1); } +static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs) +{ + /* + * Due to interrupt latency (AKA "skid"), we may enter the + * kernel before taking an overflow, even if the PMU is only + * counting user events. + */ + if (event->attr.exclude_kernel && !user_mode(regs)) + return false; + + return true; +} + /* * Generic event overflow handling, sampling. */ @@ -9307,15 +9495,38 @@ static int __perf_event_overflow(struct perf_event *event, if (event->attr.sigtrap) { /* - * Should not be able to return to user space without processing - * pending_sigtrap (kernel events can overflow multiple times). + * The desired behaviour of sigtrap vs invalid samples is a bit + * tricky; on the one hand, one should not loose the SIGTRAP if + * it is the first event, on the other hand, we should also not + * trigger the WARN or override the data address. */ - WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel); + bool valid_sample = sample_is_allowed(event, regs); + unsigned int pending_id = 1; + + if (regs) + pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1; if (!event->pending_sigtrap) { - event->pending_sigtrap = 1; + event->pending_sigtrap = pending_id; local_inc(&event->ctx->nr_pending); + } else if (event->attr.exclude_kernel && valid_sample) { + /* + * Should not be able to return to user space without + * consuming pending_sigtrap; with exceptions: + * + * 1. Where !exclude_kernel, events can overflow again + * in the kernel without returning to user space. + * + * 2. Events that can overflow again before the IRQ- + * work without user space progress (e.g. hrtimer). + * To approximate progress (with false negatives), + * check 32-bit hash of the current IP. + */ + WARN_ON_ONCE(event->pending_sigtrap != pending_id); } - event->pending_addr = data->addr; + + event->pending_addr = 0; + if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR)) + event->pending_addr = data->addr; irq_work_queue(&event->pending_irq); } @@ -9781,6 +9992,44 @@ static struct pmu perf_swevent = { #ifdef CONFIG_EVENT_TRACING +static void tp_perf_event_destroy(struct perf_event *event) +{ + perf_trace_destroy(event); +} + +static int perf_tp_event_init(struct perf_event *event) +{ + int err; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + err = perf_trace_init(event); + if (err) + return err; + + event->destroy = tp_perf_event_destroy; + + return 0; +} + +static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_tp_event_init, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, + .read = perf_swevent_read, +}; + static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { @@ -9830,6 +10079,44 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); +static void __perf_tp_event_target_task(u64 count, void *record, + struct pt_regs *regs, + struct perf_sample_data *data, + struct perf_event *event) +{ + struct trace_entry *entry = record; + + if (event->attr.config != entry->type) + return; + /* Cannot deliver synchronous signal to other task. */ + if (event->attr.sigtrap) + return; + if (perf_tp_event_match(event, data, regs)) + perf_swevent_event(event, count, data, regs); +} + +static void perf_tp_event_target_task(u64 count, void *record, + struct pt_regs *regs, + struct perf_sample_data *data, + struct perf_event_context *ctx) +{ + unsigned int cpu = smp_processor_id(); + struct pmu *pmu = &perf_tracepoint; + struct perf_event *event, *sibling; + + perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) { + __perf_tp_event_target_task(count, record, regs, data, event); + for_each_sibling_event(sibling, event) + __perf_tp_event_target_task(count, record, regs, data, sibling); + } + + perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) { + __perf_tp_event_target_task(count, record, regs, data, event); + for_each_sibling_event(sibling, event) + __perf_tp_event_target_task(count, record, regs, data, sibling); + } +} + void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, struct task_struct *task) @@ -9861,26 +10148,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, */ if (task && task != current) { struct perf_event_context *ctx; - struct trace_entry *entry = record; rcu_read_lock(); - ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); + ctx = rcu_dereference(task->perf_event_ctxp); if (!ctx) goto unlock; - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { - if (event->cpu != smp_processor_id()) - continue; - if (event->attr.type != PERF_TYPE_TRACEPOINT) - continue; - if (event->attr.config != entry->type) - continue; - /* Cannot deliver synchronous signal to other task. */ - if (event->attr.sigtrap) - continue; - if (perf_tp_event_match(event, &data, regs)) - perf_swevent_event(event, count, &data, regs); - } + raw_spin_lock(&ctx->lock); + perf_tp_event_target_task(count, record, regs, &data, ctx); + raw_spin_unlock(&ctx->lock); unlock: rcu_read_unlock(); } @@ -9889,44 +10165,6 @@ unlock: } EXPORT_SYMBOL_GPL(perf_tp_event); -static void tp_perf_event_destroy(struct perf_event *event) -{ - perf_trace_destroy(event); -} - -static int perf_tp_event_init(struct perf_event *event) -{ - int err; - - if (event->attr.type != PERF_TYPE_TRACEPOINT) - return -ENOENT; - - /* - * no branch sampling for tracepoint events - */ - if (has_branch_stack(event)) - return -EOPNOTSUPP; - - err = perf_trace_init(event); - if (err) - return err; - - event->destroy = tp_perf_event_destroy; - - return 0; -} - -static struct pmu perf_tracepoint = { - .task_ctx_nr = perf_sw_context, - - .event_init = perf_tp_event_init, - .add = perf_trace_add, - .del = perf_trace_del, - .start = perf_swevent_start, - .stop = perf_swevent_stop, - .read = perf_swevent_read, -}; - #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) /* * Flags in config, used by dynamic PMU kprobe and uprobe @@ -11013,36 +11251,9 @@ static int perf_event_idx_default(struct perf_event *event) return 0; } -/* - * Ensures all contexts with the same task_ctx_nr have the same - * pmu_cpu_context too. - */ -static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) -{ - struct pmu *pmu; - - if (ctxn < 0) - return NULL; - - list_for_each_entry(pmu, &pmus, entry) { - if (pmu->task_ctx_nr == ctxn) - return pmu->pmu_cpu_context; - } - - return NULL; -} - static void free_pmu_context(struct pmu *pmu) { - /* - * Static contexts such as perf_sw_context have a global lifetime - * and may be shared between different PMUs. Avoid freeing them - * when a single PMU is going away. - */ - if (pmu->task_ctx_nr > perf_invalid_context) - return; - - free_percpu(pmu->pmu_cpu_context); + free_percpu(pmu->cpu_pmu_context); } /* @@ -11106,12 +11317,11 @@ perf_event_mux_interval_ms_store(struct device *dev, /* update all cpuctx for this PMU */ cpus_read_lock(); for_each_online_cpu(cpu) { - struct perf_cpu_context *cpuctx; - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + struct perf_cpu_pmu_context *cpc; + cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - cpu_function_call(cpu, - (remote_function_f)perf_mux_hrtimer_restart, cpuctx); + cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc); } cpus_read_unlock(); mutex_unlock(&mux_interval_mutex); @@ -11148,13 +11358,15 @@ static int pmu_dev_alloc(struct pmu *pmu) pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); - ret = dev_set_name(pmu->dev, "%s", pmu->name); - if (ret) - goto free_dev; dev_set_drvdata(pmu->dev, pmu); pmu->dev->bus = &pmu_bus; pmu->dev->release = pmu_dev_release; + + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; + ret = device_add(pmu->dev); if (ret) goto free_dev; @@ -11222,47 +11434,19 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) } skip_type: - if (pmu->task_ctx_nr == perf_hw_context) { - static int hw_context_taken = 0; - - /* - * Other than systems with heterogeneous CPUs, it never makes - * sense for two PMUs to share perf_hw_context. PMUs which are - * uncore must use perf_invalid_context. - */ - if (WARN_ON_ONCE(hw_context_taken && - !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS))) - pmu->task_ctx_nr = perf_invalid_context; - - hw_context_taken = 1; - } - - pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); - if (pmu->pmu_cpu_context) - goto got_cpu_context; - ret = -ENOMEM; - pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); - if (!pmu->pmu_cpu_context) + pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context); + if (!pmu->cpu_pmu_context) goto free_dev; for_each_possible_cpu(cpu) { - struct perf_cpu_context *cpuctx; - - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx); - lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); - lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.pmu = pmu; - cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); - - __perf_mux_hrtimer_init(cpuctx, cpu); + struct perf_cpu_pmu_context *cpc; - cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); - cpuctx->heap = cpuctx->heap_default; + cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu); + __perf_init_event_pmu_context(&cpc->epc, pmu); + __perf_mux_hrtimer_init(cpc, cpu); } -got_cpu_context: if (!pmu->start_txn) { if (pmu->pmu_enable) { /* @@ -11741,10 +11925,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } /* - * Disallow uncore-cgroup events, they don't make sense as the cgroup will - * be different on other CPUs in the uncore mask. + * Disallow uncore-task events. Similarly, disallow uncore-cgroup + * events (they don't make sense as the cgroup will be different + * on other CPUs in the uncore mask). */ - if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) { + if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) { err = -EINVAL; goto err_pmu; } @@ -12091,37 +12276,6 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) return 0; } -/* - * Variation on perf_event_ctx_lock_nested(), except we take two context - * mutexes. - */ -static struct perf_event_context * -__perf_event_ctx_lock_double(struct perf_event *group_leader, - struct perf_event_context *ctx) -{ - struct perf_event_context *gctx; - -again: - rcu_read_lock(); - gctx = READ_ONCE(group_leader->ctx); - if (!refcount_inc_not_zero(&gctx->refcount)) { - rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); - - mutex_lock_double(&gctx->mutex, &ctx->mutex); - - if (group_leader->ctx != gctx) { - mutex_unlock(&ctx->mutex); - mutex_unlock(&gctx->mutex); - put_ctx(gctx); - goto again; - } - - return gctx; -} - static bool perf_check_permission(struct perf_event_attr *attr, struct task_struct *task) { @@ -12167,9 +12321,10 @@ SYSCALL_DEFINE5(perf_event_open, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *event, *sibling; struct perf_event_attr attr; - struct perf_event_context *ctx, *gctx; + struct perf_event_context *ctx; struct file *event_file = NULL; struct fd group = {NULL, 0}; struct task_struct *task = NULL; @@ -12299,42 +12454,53 @@ SYSCALL_DEFINE5(perf_event_open, if (pmu->task_ctx_nr == perf_sw_context) event->event_caps |= PERF_EV_CAP_SOFTWARE; - if (group_leader) { - if (is_software_event(event) && - !in_software_context(group_leader)) { - /* - * If the event is a sw event, but the group_leader - * is on hw context. - * - * Allow the addition of software events to hw - * groups, this is safe because software events - * never fail to schedule. - */ - pmu = group_leader->ctx->pmu; - } else if (!is_software_event(event) && - is_software_event(group_leader) && - (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { - /* - * In case the group is a pure software group, and we - * try to add a hardware event, move the whole group to - * the hardware context. - */ - move_group = 1; - } + if (task) { + err = down_read_interruptible(&task->signal->exec_update_lock); + if (err) + goto err_alloc; + + /* + * We must hold exec_update_lock across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). + */ + err = -EACCES; + if (!perf_check_permission(&attr, task)) + goto err_cred; } /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event); + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_alloc; + goto err_cred; + } + + mutex_lock(&ctx->mutex); + + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + + if (!task) { + /* + * Check if the @cpu we're creating an event for is online. + * + * We use the perf_cpu_context::ctx::mutex to serialize against + * the hotplug notifiers. See perf_event_{init,exit}_cpu(). + */ + struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu); + + if (!cpuctx->online) { + err = -ENODEV; + goto err_locked; + } } - /* - * Look up the group leader (we will attach this event to it): - */ if (group_leader) { err = -EINVAL; @@ -12343,11 +12509,11 @@ SYSCALL_DEFINE5(perf_event_open, * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) - goto err_context; + goto err_locked; /* All events in a group should have the same clock */ if (group_leader->clock != event->clock) - goto err_context; + goto err_locked; /* * Make sure we're both events for the same CPU; @@ -12355,145 +12521,76 @@ SYSCALL_DEFINE5(perf_event_open, * you can never concurrently schedule them anyhow. */ if (group_leader->cpu != event->cpu) - goto err_context; - - /* - * Make sure we're both on the same task, or both - * per-CPU events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + goto err_locked; /* - * Do not allow to attach to a group in a different task - * or CPU context. If we're moving SW events, we'll fix - * this up later, so allow that. - * - * Racy, not holding group_leader->ctx->mutex, see comment with - * perf_event_ctx_lock(). + * Make sure we're both on the same context; either task or cpu. */ - if (!move_group && group_leader->ctx != ctx) - goto err_context; + if (group_leader->ctx != ctx) + goto err_locked; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) - goto err_context; - } - - if (output_event) { - err = perf_event_set_output(event, output_event); - if (err) - goto err_context; - } - - event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, - f_flags); - if (IS_ERR(event_file)) { - err = PTR_ERR(event_file); - event_file = NULL; - goto err_context; - } - - if (task) { - err = down_read_interruptible(&task->signal->exec_update_lock); - if (err) - goto err_file; - - /* - * We must hold exec_update_lock across this and any potential - * perf_install_in_context() call for this new event to - * serialize against exec() altering our credentials (and the - * perf_event_exit_task() that could imply). - */ - err = -EACCES; - if (!perf_check_permission(&attr, task)) - goto err_cred; - } - - if (move_group) { - gctx = __perf_event_ctx_lock_double(group_leader, ctx); - - if (gctx->task == TASK_TOMBSTONE) { - err = -ESRCH; goto err_locked; - } - /* - * Check if we raced against another sys_perf_event_open() call - * moving the software group underneath us. - */ - if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + if (is_software_event(event) && + !in_software_context(group_leader)) { /* - * If someone moved the group out from under us, check - * if this new event wound up on the same ctx, if so - * its the regular !move_group case, otherwise fail. + * If the event is a sw event, but the group_leader + * is on hw context. + * + * Allow the addition of software events to hw + * groups, this is safe because software events + * never fail to schedule. + * + * Note the comment that goes with struct + * perf_event_pmu_context. */ - if (gctx != ctx) { - err = -EINVAL; - goto err_locked; - } else { - perf_event_ctx_unlock(group_leader, gctx); - move_group = 0; - goto not_move_group; + pmu = group_leader->pmu_ctx->pmu; + } else if (!is_software_event(event)) { + if (is_software_event(group_leader) && + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; } - } - - /* - * Failure to create exclusive events returns -EBUSY. - */ - err = -EBUSY; - if (!exclusive_event_installable(group_leader, ctx)) - goto err_locked; - for_each_sibling_event(sibling, group_leader) { - if (!exclusive_event_installable(sibling, ctx)) + /* Don't allow group of multiple hw events from different pmus */ + if (!in_software_context(group_leader) && + group_leader->pmu_ctx->pmu != pmu) goto err_locked; } - } else { - mutex_lock(&ctx->mutex); - - /* - * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, - * see the group_leader && !move_group test earlier. - */ - if (group_leader && group_leader->ctx != ctx) { - err = -EINVAL; - goto err_locked; - } } -not_move_group: - if (ctx->task == TASK_TOMBSTONE) { - err = -ESRCH; + /* + * Now that we're certain of the pmu; find the pmu_ctx. + */ + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); goto err_locked; } + event->pmu_ctx = pmu_ctx; - if (!perf_event_validate_size(event)) { - err = -E2BIG; - goto err_locked; + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_context; } - if (!task) { - /* - * Check if the @cpu we're creating an event for is online. - * - * We use the perf_cpu_context::ctx::mutex to serialize against - * the hotplug notifiers. See perf_event_{init,exit}_cpu(). - */ - struct perf_cpu_context *cpuctx = - container_of(ctx, struct perf_cpu_context, ctx); - - if (!cpuctx->online) { - err = -ENODEV; - goto err_locked; - } + if (!perf_event_validate_size(event)) { + err = -E2BIG; + goto err_context; } if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) { err = -EINVAL; - goto err_locked; + goto err_context; } /* @@ -12502,36 +12599,33 @@ not_move_group: */ if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; - goto err_locked; + goto err_context; } WARN_ON_ONCE(ctx->parent_ctx); + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); + event_file = NULL; + goto err_context; + } + /* * This is the point on no return; we cannot fail hereafter. This is * where we start modifying current state. */ if (move_group) { - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. - */ perf_remove_from_context(group_leader, 0); - put_ctx(gctx); + put_pmu_ctx(group_leader->pmu_ctx); for_each_sibling_event(sibling, group_leader) { perf_remove_from_context(sibling, 0); - put_ctx(gctx); + put_pmu_ctx(sibling->pmu_ctx); } /* - * Wait for everybody to stop referencing the events through - * the old lists, before installing it on new lists. - */ - synchronize_rcu(); - - /* * Install the group siblings before the group leader. * * Because a group leader will try and install the entire group @@ -12542,9 +12636,10 @@ not_move_group: * reachable through the group lists. */ for_each_sibling_event(sibling, group_leader) { + sibling->pmu_ctx = pmu_ctx; + get_pmu_ctx(pmu_ctx); perf_event__state_init(sibling); perf_install_in_context(ctx, sibling, sibling->cpu); - get_ctx(ctx); } /* @@ -12552,9 +12647,10 @@ not_move_group: * event. What we want here is event in the initial * startup state, ready to be add into new context. */ + group_leader->pmu_ctx = pmu_ctx; + get_pmu_ctx(pmu_ctx); perf_event__state_init(group_leader); perf_install_in_context(ctx, group_leader, group_leader->cpu); - get_ctx(ctx); } /* @@ -12571,8 +12667,6 @@ not_move_group: perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); - if (move_group) - perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); if (task) { @@ -12594,25 +12688,17 @@ not_move_group: fd_install(event_fd, event_file); return event_fd; +err_context: + /* event->pmu_ctx freed by free_event() */ err_locked: - if (move_group) - perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_cred: if (task) up_read(&task->signal->exec_update_lock); -err_file: - fput(event_file); -err_context: - perf_unpin_context(ctx); - put_ctx(ctx); err_alloc: - /* - * If event_file is set, the fput() above will have called ->release() - * and that will take care of freeing the event. - */ - if (!event_file) - free_event(event); + free_event(event); err_task: if (task) put_task_struct(task); @@ -12638,8 +12724,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, perf_overflow_handler_t overflow_handler, void *context) { + struct perf_event_pmu_context *pmu_ctx; struct perf_event_context *ctx; struct perf_event *event; + struct pmu *pmu; int err; /* @@ -12658,14 +12746,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, /* Mark owner so we could distinguish it from user events. */ event->owner = TASK_TOMBSTONE; + pmu = event->pmu; + + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; /* * Get the target context (task or percpu): */ - ctx = find_get_context(event->pmu, task, event); + ctx = find_get_context(task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_free; + goto err_alloc; } WARN_ON_ONCE(ctx->parent_ctx); @@ -12675,6 +12767,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err_unlock; } + pmu_ctx = find_get_pmu_context(pmu, ctx, event); + if (IS_ERR(pmu_ctx)) { + err = PTR_ERR(pmu_ctx); + goto err_unlock; + } + event->pmu_ctx = pmu_ctx; + if (!task) { /* * Check if the @cpu we're creating an event for is online. @@ -12686,13 +12785,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, container_of(ctx, struct perf_cpu_context, ctx); if (!cpuctx->online) { err = -ENODEV; - goto err_unlock; + goto err_pmu_ctx; } } if (!exclusive_event_installable(event, ctx)) { err = -EBUSY; - goto err_unlock; + goto err_pmu_ctx; } perf_install_in_context(ctx, event, event->cpu); @@ -12701,44 +12800,61 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_pmu_ctx: + put_pmu_ctx(pmu_ctx); err_unlock: mutex_unlock(&ctx->mutex); perf_unpin_context(ctx); put_ctx(ctx); -err_free: +err_alloc: free_event(event); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); -void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +static void __perf_pmu_remove(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, + struct perf_event_groups *groups, + struct list_head *events) { - struct perf_event_context *src_ctx; - struct perf_event_context *dst_ctx; - struct perf_event *event, *tmp; - LIST_HEAD(events); - - src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; - dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; + struct perf_event *event, *sibling; - /* - * See perf_event_ctx_lock() for comments on the details - * of swizzling perf_event::ctx. - */ - mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); - list_for_each_entry_safe(event, tmp, &src_ctx->event_list, - event_entry) { + perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) { perf_remove_from_context(event, 0); - unaccount_event_cpu(event, src_cpu); - put_ctx(src_ctx); - list_add(&event->migrate_entry, &events); + unaccount_event_cpu(event, cpu); + put_pmu_ctx(event->pmu_ctx); + list_add(&event->migrate_entry, events); + + for_each_sibling_event(sibling, event) { + perf_remove_from_context(sibling, 0); + unaccount_event_cpu(sibling, cpu); + put_pmu_ctx(sibling->pmu_ctx); + list_add(&sibling->migrate_entry, events); + } } +} - /* - * Wait for the events to quiesce before re-instating them. - */ - synchronize_rcu(); +static void __perf_pmu_install_event(struct pmu *pmu, + struct perf_event_context *ctx, + int cpu, struct perf_event *event) +{ + struct perf_event_pmu_context *epc; + + event->cpu = cpu; + epc = find_get_pmu_context(pmu, ctx, event); + event->pmu_ctx = epc; + + if (event->state >= PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_INACTIVE; + account_event_cpu(event, cpu); + perf_install_in_context(ctx, event, cpu); +} + +static void __perf_pmu_install(struct perf_event_context *ctx, + int cpu, struct pmu *pmu, struct list_head *events) +{ + struct perf_event *event, *tmp; /* * Re-instate events in 2 passes. @@ -12748,30 +12864,48 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) * leader will enable its siblings, even if those are still on the old * context. */ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { if (event->group_leader == event) continue; list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } /* * Once all the siblings are setup properly, install the group leaders * to make it go. */ - list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_for_each_entry_safe(event, tmp, events, migrate_entry) { list_del(&event->migrate_entry); - if (event->state >= PERF_EVENT_STATE_OFF) - event->state = PERF_EVENT_STATE_INACTIVE; - account_event_cpu(event, dst_cpu); - perf_install_in_context(dst_ctx, event, dst_cpu); - get_ctx(dst_ctx); + __perf_pmu_install_event(pmu, ctx, cpu, event); } +} + +void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) +{ + struct perf_event_context *src_ctx, *dst_ctx; + LIST_HEAD(events); + + src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx; + dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx; + + /* + * See perf_event_ctx_lock() for comments on the details + * of swizzling perf_event::ctx. + */ + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); + + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events); + __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events); + + /* + * Wait for the events to quiesce before re-instating them. + */ + synchronize_rcu(); + + __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events); + mutex_unlock(&dst_ctx->mutex); mutex_unlock(&src_ctx->mutex); } @@ -12851,14 +12985,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx) perf_event_wakeup(event); } -static void perf_event_exit_task_context(struct task_struct *child, int ctxn) +static void perf_event_exit_task_context(struct task_struct *child) { struct perf_event_context *child_ctx, *clone_ctx = NULL; struct perf_event *child_event, *next; WARN_ON_ONCE(child != current); - child_ctx = perf_pin_task_context(child, ctxn); + child_ctx = perf_pin_task_context(child); if (!child_ctx) return; @@ -12880,13 +13014,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * in. */ raw_spin_lock_irq(&child_ctx->lock); - task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL); + task_ctx_sched_out(child_ctx, EVENT_ALL); /* * Now that the context is inactive, destroy the task <-> ctx relation * and mark the context dead. */ - RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + RCU_INIT_POINTER(child->perf_event_ctxp, NULL); put_ctx(child_ctx); /* cannot be last */ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); put_task_struct(current); /* cannot be last */ @@ -12921,7 +13055,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) void perf_event_exit_task(struct task_struct *child) { struct perf_event *event, *tmp; - int ctxn; mutex_lock(&child->perf_event_mutex); list_for_each_entry_safe(event, tmp, &child->perf_event_list, @@ -12937,8 +13070,7 @@ void perf_event_exit_task(struct task_struct *child) } mutex_unlock(&child->perf_event_mutex); - for_each_task_context_nr(ctxn) - perf_event_exit_task_context(child, ctxn); + perf_event_exit_task_context(child); /* * The perf_event_exit_task_context calls perf_event_task @@ -12981,56 +13113,51 @@ void perf_event_free_task(struct task_struct *task) { struct perf_event_context *ctx; struct perf_event *event, *tmp; - int ctxn; - for_each_task_context_nr(ctxn) { - ctx = task->perf_event_ctxp[ctxn]; - if (!ctx) - continue; + ctx = rcu_access_pointer(task->perf_event_ctxp); + if (!ctx) + return; - mutex_lock(&ctx->mutex); - raw_spin_lock_irq(&ctx->lock); - /* - * Destroy the task <-> ctx relation and mark the context dead. - * - * This is important because even though the task hasn't been - * exposed yet the context has been (through child_list). - */ - RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL); - WRITE_ONCE(ctx->task, TASK_TOMBSTONE); - put_task_struct(task); /* cannot be last */ - raw_spin_unlock_irq(&ctx->lock); + mutex_lock(&ctx->mutex); + raw_spin_lock_irq(&ctx->lock); + /* + * Destroy the task <-> ctx relation and mark the context dead. + * + * This is important because even though the task hasn't been + * exposed yet the context has been (through child_list). + */ + RCU_INIT_POINTER(task->perf_event_ctxp, NULL); + WRITE_ONCE(ctx->task, TASK_TOMBSTONE); + put_task_struct(task); /* cannot be last */ + raw_spin_unlock_irq(&ctx->lock); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) - perf_free_event(event, ctx); - mutex_unlock(&ctx->mutex); + list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) + perf_free_event(event, ctx); - /* - * perf_event_release_kernel() could've stolen some of our - * child events and still have them on its free_list. In that - * case we must wait for these events to have been freed (in - * particular all their references to this task must've been - * dropped). - * - * Without this copy_process() will unconditionally free this - * task (irrespective of its reference count) and - * _free_event()'s put_task_struct(event->hw.target) will be a - * use-after-free. - * - * Wait for all events to drop their context reference. - */ - wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); - put_ctx(ctx); /* must be last */ - } + mutex_unlock(&ctx->mutex); + + /* + * perf_event_release_kernel() could've stolen some of our + * child events and still have them on its free_list. In that + * case we must wait for these events to have been freed (in + * particular all their references to this task must've been + * dropped). + * + * Without this copy_process() will unconditionally free this + * task (irrespective of its reference count) and + * _free_event()'s put_task_struct(event->hw.target) will be a + * use-after-free. + * + * Wait for all events to drop their context reference. + */ + wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1); + put_ctx(ctx); /* must be last */ } void perf_event_delayed_put(struct task_struct *task) { - int ctxn; - - for_each_task_context_nr(ctxn) - WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); + WARN_ON_ONCE(task->perf_event_ctxp); } struct file *perf_event_get(unsigned int fd) @@ -13080,6 +13207,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event_context *child_ctx) { enum perf_event_state parent_state = parent_event->state; + struct perf_event_pmu_context *pmu_ctx; struct perf_event *child_event; unsigned long flags; @@ -13100,17 +13228,12 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; - - if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) && - !child_ctx->task_ctx_data) { - struct pmu *pmu = child_event->pmu; - - child_ctx->task_ctx_data = alloc_task_ctx_data(pmu); - if (!child_ctx->task_ctx_data) { - free_event(child_event); - return ERR_PTR(-ENOMEM); - } + pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event); + if (IS_ERR(pmu_ctx)) { + free_event(child_event); + return NULL; } + child_event->pmu_ctx = pmu_ctx; /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) @@ -13233,11 +13356,11 @@ static int inherit_group(struct perf_event *parent_event, static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, - struct task_struct *child, int ctxn, + struct task_struct *child, u64 clone_flags, int *inherited_all) { - int ret; struct perf_event_context *child_ctx; + int ret; if (!event->attr.inherit || (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) || @@ -13247,7 +13370,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return 0; } - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -13255,16 +13378,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * First allocate and initialize a context for the * child. */ - child_ctx = alloc_perf_context(parent_ctx->pmu, child); + child_ctx = alloc_perf_context(child); if (!child_ctx) return -ENOMEM; - child->perf_event_ctxp[ctxn] = child_ctx; + child->perf_event_ctxp = child_ctx; } - ret = inherit_group(event, parent, parent_ctx, - child, child_ctx); - + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) *inherited_all = 0; @@ -13274,8 +13395,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -static int perf_event_init_context(struct task_struct *child, int ctxn, - u64 clone_flags) +static int perf_event_init_context(struct task_struct *child, u64 clone_flags) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -13285,14 +13405,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, unsigned long flags; int ret = 0; - if (likely(!parent->perf_event_ctxp[ctxn])) + if (likely(!parent->perf_event_ctxp)) return 0; /* * If the parent's context is a clone, pin it so it won't get * swapped under us. */ - parent_ctx = perf_pin_task_context(parent, ctxn); + parent_ctx = perf_pin_task_context(parent); if (!parent_ctx) return 0; @@ -13315,8 +13435,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, */ perf_event_groups_for_each(event, &parent_ctx->pinned_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, clone_flags, - &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -13332,8 +13451,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, perf_event_groups_for_each(event, &parent_ctx->flexible_groups) { ret = inherit_task_group(event, parent, parent_ctx, - child, ctxn, clone_flags, - &inherited_all); + child, clone_flags, &inherited_all); if (ret) goto out_unlock; } @@ -13341,7 +13459,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn, raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = child->perf_event_ctxp; if (child_ctx && inherited_all) { /* @@ -13377,18 +13495,16 @@ out_unlock: */ int perf_event_init_task(struct task_struct *child, u64 clone_flags) { - int ctxn, ret; + int ret; - memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + child->perf_event_ctxp = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); - for_each_task_context_nr(ctxn) { - ret = perf_event_init_context(child, ctxn, clone_flags); - if (ret) { - perf_event_free_task(child); - return ret; - } + ret = perf_event_init_context(child, clone_flags); + if (ret) { + perf_event_free_task(child); + return ret; } return 0; @@ -13397,6 +13513,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags) static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; + struct perf_cpu_context *cpuctx; int cpu; zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); @@ -13404,15 +13521,19 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); -#ifdef CONFIG_CGROUP_PERF - INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu)); -#endif INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); + + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); + cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask); + cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default); + cpuctx->heap = cpuctx->heap_default; } } @@ -13434,12 +13555,12 @@ static void perf_swevent_init_cpu(unsigned int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); struct perf_event_context *ctx = __info; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event *event; raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_TIME); + ctx_sched_out(ctx, EVENT_TIME); list_for_each_entry(event, &ctx->event_list, event_entry) __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); raw_spin_unlock(&ctx->lock); @@ -13449,18 +13570,16 @@ static void perf_event_exit_cpu_context(int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; + // XXX simplify cpuctx->online mutex_lock(&pmus_lock); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); - cpuctx->online = 0; - mutex_unlock(&ctx->mutex); - } + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + cpuctx->online = 0; + mutex_unlock(&ctx->mutex); cpumask_clear_cpu(cpu, perf_online_mask); mutex_unlock(&pmus_lock); } @@ -13474,20 +13593,17 @@ int perf_event_init_cpu(unsigned int cpu) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; perf_swevent_init_cpu(cpu); mutex_lock(&pmus_lock); cpumask_set_cpu(cpu, perf_online_mask); - list_for_each_entry(pmu, &pmus, entry) { - cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - ctx = &cpuctx->ctx; + cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); + ctx = &cpuctx->ctx; - mutex_lock(&ctx->mutex); - cpuctx->online = 1; - mutex_unlock(&ctx->mutex); - } + mutex_lock(&ctx->mutex); + cpuctx->online = 1; + mutex_unlock(&ctx->mutex); mutex_unlock(&pmus_lock); return 0; @@ -13624,9 +13740,12 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css) static int __perf_cgroup_move(void *info) { struct task_struct *task = info; - rcu_read_lock(); - perf_cgroup_switch(task); - rcu_read_unlock(); + + preempt_disable(); + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_switch(task); + preempt_enable(); + return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index 35e0a31a0315..15dc2ec80c46 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -67,11 +67,58 @@ #include <linux/io_uring.h> #include <linux/kprobes.h> #include <linux/rethook.h> +#include <linux/sysfs.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/mmu_context.h> +/* + * The default value should be high enough to not crash a system that randomly + * crashes its kernel from time to time, but low enough to at least not permit + * overflowing 32-bit refcounts or the ldsem writer count. + */ +static unsigned int oops_limit = 10000; + +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_exit_table[] = { + { + .procname = "oops_limit", + .data = &oops_limit, + .maxlen = sizeof(oops_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { } +}; + +static __init int kernel_exit_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_exit_table); + return 0; +} +late_initcall(kernel_exit_sysctls_init); +#endif + +static atomic_t oops_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&oops_count)); +} + +static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count); + +static __init int kernel_exit_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_exit_sysfs_init); +#endif + static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -884,6 +931,7 @@ void __noreturn make_task_dead(int signr) * Then do everything else. */ struct task_struct *tsk = current; + unsigned int limit; if (unlikely(in_interrupt())) panic("Aiee, killing interrupt handler!"); @@ -898,6 +946,20 @@ void __noreturn make_task_dead(int signr) } /* + * Every time the system oopses, if the oops happens while a reference + * to an object was held, the reference leaks. + * If the oops doesn't also leak memory, repeated oopsing can cause + * reference counters to wrap around (if they're not using refcount_t). + * This means that repeated oopsing can make unexploitable-looking bugs + * exploitable through repeated oopsing. + * To make sure this can't happen, place an upper bound on how often the + * kernel may oops without panic(). + */ + limit = READ_ONCE(oops_limit); + if (atomic_inc_return(&oops_count) >= limit && limit) + panic("Oopsed too often (kernel.oops_limit is %d)", limit); + + /* * We're taking recursive faults here in make_task_dead. Safest is to just * leave this task alone and wait for reboot. */ diff --git a/kernel/fork.c b/kernel/fork.c index 08969f5aa38d..9f7fe3541897 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -75,7 +75,6 @@ #include <linux/freezer.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> -#include <linux/random.h> #include <linux/tty.h> #include <linux/fs_struct.h> #include <linux/magic.h> @@ -97,6 +96,7 @@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> +#include <linux/stackprotector.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -535,6 +535,9 @@ void put_task_stack(struct task_struct *tsk) void free_task(struct task_struct *tsk) { +#ifdef CONFIG_SECCOMP + WARN_ON_ONCE(tsk->seccomp.filter); +#endif release_user_cpus_ptr(tsk); scs_release(tsk); @@ -753,8 +756,13 @@ static void check_mm(struct mm_struct *mm) "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); + long x = percpu_counter_sum(&mm->rss_stat[i]); + + if (likely(!x)) + continue; + /* Making sure this is not due to race with CPU offlining. */ + x = percpu_counter_sum_all(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); @@ -779,6 +787,8 @@ static void check_mm(struct mm_struct *mm) */ void __mmdrop(struct mm_struct *mm) { + int i; + BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); WARN_ON_ONCE(mm == current->active_mm); @@ -788,6 +798,9 @@ void __mmdrop(struct mm_struct *mm) check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); + + for (i = 0; i < NR_MM_COUNTERS; i++) + percpu_counter_destroy(&mm->rss_stat[i]); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1107,6 +1120,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { + int i; + mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); @@ -1148,10 +1163,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (init_new_context(p, mm)) goto fail_nocontext; + for (i = 0; i < NR_MM_COUNTERS; i++) + if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) + goto fail_pcpu; + mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; +fail_pcpu: + while (i > 0) + percpu_counter_destroy(&mm->rss_stat[--i]); fail_nocontext: mm_free_pgd(mm); fail_nopgd: @@ -2043,15 +2065,6 @@ static __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } - /* - * If the new process will be in a different time namespace - * do not allow it to share VM or a thread group with the forking task. - */ - if (clone_flags & (CLONE_THREAD | CLONE_VM)) { - if (nsp->time_ns != nsp->time_ns_for_children) - return ERR_PTR(-EINVAL); - } - if (clone_flags & CLONE_PIDFD) { /* * - CLONE_DETACHED is blocked so that we can potentially @@ -2406,12 +2419,6 @@ static __latent_entropy struct task_struct *copy_process( spin_lock(¤t->sighand->siglock); - /* - * Copy seccomp details explicitly here, in case they were changed - * before holding sighand lock. - */ - copy_seccomp(p); - rv_task_fork(p); rseq_fork(p, clone_flags); @@ -2428,6 +2435,14 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } + /* No more failure paths after this point. */ + + /* + * Copy seccomp details explicitly here, in case they were changed + * before holding sighand lock. + */ + copy_seccomp(p); + init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2592,11 +2607,6 @@ struct task_struct * __init fork_idle(int cpu) return task; } -struct mm_struct *copy_init_mm(void) -{ - return dup_mm(NULL, &init_mm); -} - /* * This is like kernel_clone(), but shaved down and tailored to just * creating io_uring workers. It returns a created task, or an error pointer. @@ -3015,10 +3025,27 @@ static void sighand_ctor(void *data) init_waitqueue_head(&sighand->signalfd_wqh); } -void __init proc_caches_init(void) +void __init mm_cache_init(void) { unsigned int mm_size; + /* + * The mm_cpumask is located at the end of mm_struct, and is + * dynamically sized based on the maximum CPU number this system + * can have, taking hotplug into account (nr_cpu_ids). + */ + mm_size = sizeof(struct mm_struct) + cpumask_size(); + + mm_cachep = kmem_cache_create_usercopy("mm_struct", + mm_size, ARCH_MIN_MMSTRUCT_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct mm_struct, saved_auxv), + sizeof_field(struct mm_struct, saved_auxv), + NULL); +} + +void __init proc_caches_init(void) +{ sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3036,19 +3063,6 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - /* - * The mm_cpumask is located at the end of mm_struct, and is - * dynamically sized based on the maximum CPU number this system - * can have, taking hotplug into account (nr_cpu_ids). - */ - mm_size = sizeof(struct mm_struct) + cpumask_size(); - - mm_cachep = kmem_cache_create_usercopy("mm_struct", - mm_size, ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, - offsetof(struct mm_struct, saved_auxv), - sizeof_field(struct mm_struct, saved_auxv), - NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); diff --git a/kernel/futex/core.c b/kernel/futex/core.c index b22ef1efe751..514e4582b863 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -638,6 +638,7 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, bool pi, bool pending_op) { u32 uval, nval, mval; + pid_t owner; int err; /* Futex address must be 32bit aligned */ @@ -659,6 +660,10 @@ retry: * 2. A woken up waiter is killed before it can acquire the * futex in user space. * + * In the second case, the wake up notification could be generated + * by the unlock path in user space after setting the futex value + * to zero or by the kernel after setting the OWNER_DIED bit below. + * * In both cases the TID validation below prevents a wakeup of * potential waiters which can cause these waiters to block * forever. @@ -667,24 +672,27 @@ retry: * * 1) task->robust_list->list_op_pending != NULL * @pending_op == true - * 2) User space futex value == 0 + * 2) The owner part of user space futex value == 0 * 3) Regular futex: @pi == false * * If these conditions are met, it is safe to attempt waking up a * potential waiter without touching the user space futex value and - * trying to set the OWNER_DIED bit. The user space futex value is - * uncontended and the rest of the user space mutex state is - * consistent, so a woken waiter will just take over the - * uncontended futex. Setting the OWNER_DIED bit would create - * inconsistent state and malfunction of the user space owner died - * handling. + * trying to set the OWNER_DIED bit. If the futex value is zero, + * the rest of the user space mutex state is consistent, so a woken + * waiter will just take over the uncontended futex. Setting the + * OWNER_DIED bit would create inconsistent state and malfunction + * of the user space owner died handling. Otherwise, the OWNER_DIED + * bit is already set, and the woken waiter is expected to deal with + * this. */ - if (pending_op && !pi && !uval) { + owner = uval & FUTEX_TID_MASK; + + if (pending_op && !pi && !owner) { futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); return 0; } - if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + if (owner != task_pid_vnr(curr)) return 0; /* diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c index cbb0bed958ab..7670a811a565 100644 --- a/kernel/gcov/clang.c +++ b/kernel/gcov/clang.c @@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) for (i = 0; i < sfn_ptr->num_counters; i++) dfn_ptr->counters[i] += sfn_ptr->counters[i]; + + sfn_ptr = list_next_entry(sfn_ptr, head); } } diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 7971e989e425..74a4ef1da9ad 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -82,6 +82,7 @@ struct gcov_fn_info { * @version: gcov version magic indicating the gcc version used for compilation * @next: list head for a singly-linked list * @stamp: uniquifying time stamp + * @checksum: unique object checksum * @filename: name of the associated gcov data file * @merge: merge functions (null for unused counter type) * @n_functions: number of instrumented functions @@ -94,6 +95,10 @@ struct gcov_info { unsigned int version; struct gcov_info *next; unsigned int stamp; + /* Since GCC 12.1 a checksum field is added. */ +#if (__GNUC__ >= 12) + unsigned int checksum; +#endif const char *filename; void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int); unsigned int n_functions; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index db3d174c53d4..b64c44ae4c25 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -86,15 +86,10 @@ config GENERIC_IRQ_IPI depends on SMP select IRQ_DOMAIN_HIERARCHY -# Generic MSI interrupt support -config GENERIC_MSI_IRQ - bool - # Generic MSI hierarchical interrupt domain support -config GENERIC_MSI_IRQ_DOMAIN +config GENERIC_MSI_IRQ bool select IRQ_DOMAIN_HIERARCHY - select GENERIC_MSI_IRQ config IRQ_MSI_IOMMU bool diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 8ac37e8e738a..49e7bc871fec 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1561,10 +1561,10 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } -static struct device *irq_get_parent_device(struct irq_data *data) +static struct device *irq_get_pm_device(struct irq_data *data) { if (data->domain) - return data->domain->dev; + return data->domain->pm_dev; return NULL; } @@ -1578,7 +1578,7 @@ static struct device *irq_get_parent_device(struct irq_data *data) */ int irq_chip_pm_get(struct irq_data *data) { - struct device *dev = irq_get_parent_device(data); + struct device *dev = irq_get_pm_device(data); int retval = 0; if (IS_ENABLED(CONFIG_PM) && dev) @@ -1597,7 +1597,7 @@ int irq_chip_pm_get(struct irq_data *data) */ int irq_chip_pm_put(struct irq_data *data) { - struct device *dev = irq_get_parent_device(data); + struct device *dev = irq_get_pm_device(data); int retval = 0; if (IS_ENABLED(CONFIG_PM) && dev) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index f09c60393e55..5fdc0b557579 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -52,6 +52,7 @@ enum { * IRQS_PENDING - irq is pending and replayed later * IRQS_SUSPENDED - irq is suspended * IRQS_NMI - irq line is used to deliver NMIs + * IRQS_SYSFS - descriptor has been added to sysfs */ enum { IRQS_AUTODETECT = 0x00000001, @@ -64,6 +65,7 @@ enum { IRQS_SUSPENDED = 0x00000800, IRQS_TIMINGS = 0x00001000, IRQS_NMI = 0x00002000, + IRQS_SYSFS = 0x00004000, }; #include "debug.h" diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a91f9001103c..fd0996274401 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -288,22 +288,25 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc) if (irq_kobj_base) { /* * Continue even in case of failure as this is nothing - * crucial. + * crucial and failures in the late irq_sysfs_init() + * cannot be rolled back. */ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) pr_warn("Failed to add kobject for irq %d\n", irq); + else + desc->istate |= IRQS_SYSFS; } } static void irq_sysfs_del(struct irq_desc *desc) { /* - * If irq_sysfs_init() has not yet been invoked (early boot), then - * irq_kobj_base is NULL and the descriptor was never added. - * kobject_del() complains about a object with no parent, so make - * it conditional. + * Only invoke kobject_del() when kobject_add() was successfully + * invoked for the descriptor. This covers both early boot, where + * sysfs is not initialized yet, and the case of a failed + * kobject_add() invocation. */ - if (irq_kobj_base) + if (desc->istate & IRQS_SYSFS) kobject_del(&desc->kobj); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 40fe7806cc8c..5b7cf28df290 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -321,7 +321,7 @@ static int irq_try_set_affinity(struct irq_data *data, } static bool irq_set_affinity_deactivated(struct irq_data *data, - const struct cpumask *mask, bool force) + const struct cpumask *mask) { struct irq_desc *desc = irq_data_to_desc(data); @@ -354,7 +354,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (!chip || !chip->irq_set_affinity) return -EINVAL; - if (irq_set_affinity_deactivated(data, mask, force)) + if (irq_set_affinity_deactivated(data, mask)) return 0; if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index a9ee535293eb..955267bbc2be 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -19,8 +19,31 @@ #include "internals.h" +/** + * struct msi_ctrl - MSI internal management control structure + * @domid: ID of the domain on which management operations should be done + * @first: First (hardware) slot index to operate on + * @last: Last (hardware) slot index to operate on + * @nirqs: The number of Linux interrupts to allocate. Can be larger + * than the range due to PCI/multi-MSI. + */ +struct msi_ctrl { + unsigned int domid; + unsigned int first; + unsigned int last; + unsigned int nirqs; +}; + +/* Invalid Xarray index which is outside of any searchable range */ +#define MSI_XA_MAX_INDEX (ULONG_MAX - 1) +/* The maximum domain size */ +#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1) + +static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl); +static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid); static inline int msi_sysfs_create_group(struct device *dev); + /** * msi_alloc_desc - Allocate an initialized msi_desc * @dev: Pointer to the device for which this is allocated @@ -33,7 +56,7 @@ static inline int msi_sysfs_create_group(struct device *dev); * Return: pointer to allocated &msi_desc on success or %NULL on failure */ static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec, - const struct irq_affinity_desc *affinity) + const struct irq_affinity_desc *affinity) { struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); @@ -58,25 +81,56 @@ static void msi_free_desc(struct msi_desc *desc) kfree(desc); } -static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index) +static int msi_insert_desc(struct device *dev, struct msi_desc *desc, + unsigned int domid, unsigned int index) { + struct msi_device_data *md = dev->msi.data; + struct xarray *xa = &md->__domains[domid].store; + unsigned int hwsize; int ret; - desc->msi_index = index; - ret = xa_insert(&md->__store, index, desc, GFP_KERNEL); - if (ret) - msi_free_desc(desc); + hwsize = msi_domain_get_hwsize(dev, domid); + + if (index == MSI_ANY_INDEX) { + struct xa_limit limit = { .min = 0, .max = hwsize - 1 }; + unsigned int index; + + /* Let the xarray allocate a free index within the limit */ + ret = xa_alloc(xa, &index, desc, limit, GFP_KERNEL); + if (ret) + goto fail; + + desc->msi_index = index; + return 0; + } else { + if (index >= hwsize) { + ret = -ERANGE; + goto fail; + } + + desc->msi_index = index; + ret = xa_insert(xa, index, desc, GFP_KERNEL); + if (ret) + goto fail; + return 0; + } +fail: + msi_free_desc(desc); return ret; } /** - * msi_add_msi_desc - Allocate and initialize a MSI descriptor + * msi_domain_insert_msi_desc - Allocate and initialize a MSI descriptor and + * insert it at @init_desc->msi_index + * * @dev: Pointer to the device for which the descriptor is allocated + * @domid: The id of the interrupt domain to which the desriptor is added * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor * * Return: 0 on success or an appropriate failure code. */ -int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) +int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid, + struct msi_desc *init_desc) { struct msi_desc *desc; @@ -88,40 +142,8 @@ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) /* Copy type specific data to the new descriptor. */ desc->pci = init_desc->pci; - return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index); -} -/** - * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors - * @dev: Pointer to the device for which the descriptors are allocated - * @index: Index for the first MSI descriptor - * @ndesc: Number of descriptors to allocate - * - * Return: 0 on success or an appropriate failure code. - */ -static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc) -{ - unsigned int idx, last = index + ndesc - 1; - struct msi_desc *desc; - int ret; - - lockdep_assert_held(&dev->msi.data->mutex); - - for (idx = index; idx <= last; idx++) { - desc = msi_alloc_desc(dev, 1, NULL); - if (!desc) - goto fail_mem; - ret = msi_insert_desc(dev->msi.data, desc, idx); - if (ret) - goto fail; - } - return 0; - -fail_mem: - ret = -ENOMEM; -fail: - msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last); - return ret; + return msi_insert_desc(dev, desc, domid, init_desc->msi_index); } static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter) @@ -138,28 +160,97 @@ static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter) return false; } +static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl) +{ + unsigned int hwsize; + + if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS || + (dev->msi.domain && + !dev->msi.data->__domains[ctrl->domid].domain))) + return false; + + hwsize = msi_domain_get_hwsize(dev, ctrl->domid); + if (WARN_ON_ONCE(ctrl->first > ctrl->last || + ctrl->first >= hwsize || + ctrl->last >= hwsize)) + return false; + return true; +} + +static void msi_domain_free_descs(struct device *dev, struct msi_ctrl *ctrl) +{ + struct msi_desc *desc; + struct xarray *xa; + unsigned long idx; + + lockdep_assert_held(&dev->msi.data->mutex); + + if (!msi_ctrl_valid(dev, ctrl)) + return; + + xa = &dev->msi.data->__domains[ctrl->domid].store; + xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) { + xa_erase(xa, idx); + + /* Leak the descriptor when it is still referenced */ + if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED))) + continue; + msi_free_desc(desc); + } +} + /** - * msi_free_msi_descs_range - Free MSI descriptors of a device - * @dev: Device to free the descriptors - * @filter: Descriptor state filter - * @first_index: Index to start freeing from - * @last_index: Last index to be freed + * msi_domain_free_msi_descs_range - Free a range of MSI descriptors of a device in an irqdomain + * @dev: Device for which to free the descriptors + * @domid: Id of the domain to operate on + * @first: Index to start freeing from (inclusive) + * @last: Last index to be freed (inclusive) */ -void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, - unsigned int first_index, unsigned int last_index) +void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + struct msi_ctrl ctrl = { + .domid = domid, + .first = first, + .last = last, + }; + + msi_domain_free_descs(dev, &ctrl); +} + +/** + * msi_domain_add_simple_msi_descs - Allocate and initialize MSI descriptors + * @dev: Pointer to the device for which the descriptors are allocated + * @ctrl: Allocation control struct + * + * Return: 0 on success or an appropriate failure code. + */ +static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *ctrl) { - struct xarray *xa = &dev->msi.data->__store; struct msi_desc *desc; - unsigned long idx; + unsigned int idx; + int ret; lockdep_assert_held(&dev->msi.data->mutex); - xa_for_each_range(xa, idx, desc, first_index, last_index) { - if (msi_desc_match(desc, filter)) { - xa_erase(xa, idx); - msi_free_desc(desc); - } + if (!msi_ctrl_valid(dev, ctrl)) + return -EINVAL; + + for (idx = ctrl->first; idx <= ctrl->last; idx++) { + desc = msi_alloc_desc(dev, 1, NULL); + if (!desc) + goto fail_mem; + ret = msi_insert_desc(dev, desc, ctrl->domid, idx); + if (ret) + goto fail; } + return 0; + +fail_mem: + ret = -ENOMEM; +fail: + msi_domain_free_descs(dev, ctrl); + return ret; } void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) @@ -178,9 +269,13 @@ EXPORT_SYMBOL_GPL(get_cached_msi_msg); static void msi_device_data_release(struct device *dev, void *res) { struct msi_device_data *md = res; + int i; - WARN_ON_ONCE(!xa_empty(&md->__store)); - xa_destroy(&md->__store); + for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) { + msi_remove_device_irq_domain(dev, i); + WARN_ON_ONCE(!xa_empty(&md->__domains[i].store)); + xa_destroy(&md->__domains[i].store); + } dev->msi.data = NULL; } @@ -197,7 +292,7 @@ static void msi_device_data_release(struct device *dev, void *res) int msi_setup_device_data(struct device *dev) { struct msi_device_data *md; - int ret; + int ret, i; if (dev->msi.data) return 0; @@ -212,7 +307,18 @@ int msi_setup_device_data(struct device *dev) return ret; } - xa_init(&md->__store); + for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) + xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC); + + /* + * If @dev::msi::domain is set and is a global MSI domain, copy the + * pointer into the domain array so all code can operate on domain + * ids. The NULL pointer check is required to keep the legacy + * architecture specific PCI/MSI support working. + */ + if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain)) + md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain; + mutex_init(&md->mutex); dev->msi.data = md; devres_add(dev, md); @@ -235,27 +341,30 @@ EXPORT_SYMBOL_GPL(msi_lock_descs); */ void msi_unlock_descs(struct device *dev) { - /* Invalidate the index wich was cached by the iterator */ - dev->msi.data->__iter_idx = MSI_MAX_INDEX; + /* Invalidate the index which was cached by the iterator */ + dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX; mutex_unlock(&dev->msi.data->mutex); } EXPORT_SYMBOL_GPL(msi_unlock_descs); -static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter) +static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid, + enum msi_desc_filter filter) { + struct xarray *xa = &md->__domains[domid].store; struct msi_desc *desc; - xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) { + xa_for_each_start(xa, md->__iter_idx, desc, md->__iter_idx) { if (msi_desc_match(desc, filter)) return desc; } - md->__iter_idx = MSI_MAX_INDEX; + md->__iter_idx = MSI_XA_MAX_INDEX; return NULL; } /** - * msi_first_desc - Get the first MSI descriptor of a device + * msi_domain_first_desc - Get the first MSI descriptor of an irqdomain associated to a device * @dev: Device to operate on + * @domid: The id of the interrupt domain which should be walked. * @filter: Descriptor state filter * * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs() @@ -264,23 +373,26 @@ static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_ * Return: Pointer to the first MSI descriptor matching the search * criteria, NULL if none found. */ -struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter) +struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid, + enum msi_desc_filter filter) { struct msi_device_data *md = dev->msi.data; - if (WARN_ON_ONCE(!md)) + if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS)) return NULL; lockdep_assert_held(&md->mutex); md->__iter_idx = 0; - return msi_find_desc(md, filter); + return msi_find_desc(md, domid, filter); } -EXPORT_SYMBOL_GPL(msi_first_desc); +EXPORT_SYMBOL_GPL(msi_domain_first_desc); /** * msi_next_desc - Get the next MSI descriptor of a device * @dev: Device to operate on + * @domid: The id of the interrupt domain which should be walked. + * @filter: Descriptor state filter * * The first invocation of msi_next_desc() has to be preceeded by a * successful invocation of __msi_first_desc(). Consecutive invocations are @@ -290,11 +402,12 @@ EXPORT_SYMBOL_GPL(msi_first_desc); * Return: Pointer to the next MSI descriptor matching the search * criteria, NULL if none found. */ -struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter) +struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid, + enum msi_desc_filter filter) { struct msi_device_data *md = dev->msi.data; - if (WARN_ON_ONCE(!md)) + if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS)) return NULL; lockdep_assert_held(&md->mutex); @@ -303,30 +416,38 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter) return NULL; md->__iter_idx++; - return msi_find_desc(md, filter); + return msi_find_desc(md, domid, filter); } EXPORT_SYMBOL_GPL(msi_next_desc); /** - * msi_get_virq - Return Linux interrupt number of a MSI interrupt + * msi_domain_get_virq - Lookup the Linux interrupt number for a MSI index on a interrupt domain * @dev: Device to operate on + * @domid: Domain ID of the interrupt domain associated to the device * @index: MSI interrupt index to look for (0-based) * * Return: The Linux interrupt number on success (> 0), 0 if not found */ -unsigned int msi_get_virq(struct device *dev, unsigned int index) +unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index) { struct msi_desc *desc; unsigned int ret = 0; - bool pcimsi; + bool pcimsi = false; + struct xarray *xa; if (!dev->msi.data) return 0; - pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false; + if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS)) + return 0; + + /* This check is only valid for the PCI default MSI domain */ + if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN) + pcimsi = to_pci_dev(dev)->msi_enabled; msi_lock_descs(dev); - desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index); + xa = &dev->msi.data->__domains[domid].store; + desc = xa_load(xa, pcimsi ? 0 : index); if (desc && desc->irq) { /* * PCI-MSI has only one descriptor for multiple interrupts. @@ -340,10 +461,11 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index) ret = desc->irq; } } + msi_unlock_descs(dev); return ret; } -EXPORT_SYMBOL_GPL(msi_get_virq); +EXPORT_SYMBOL_GPL(msi_domain_get_virq); #ifdef CONFIG_SYSFS static struct attribute *msi_dev_attrs[] = { @@ -459,7 +581,39 @@ static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *d static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { } #endif /* !CONFIG_SYSFS */ -#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid) +{ + struct irq_domain *domain; + + lockdep_assert_held(&dev->msi.data->mutex); + + if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS)) + return NULL; + + domain = dev->msi.data->__domains[domid].domain; + if (!domain) + return NULL; + + if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain))) + return NULL; + + return domain; +} + +static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid) +{ + struct msi_domain_info *info; + struct irq_domain *domain; + + domain = msi_get_device_domain(dev, domid); + if (domain) { + info = domain->host_data; + return info->hwsize; + } + /* No domain, default to MSI_XA_DOMAIN_SIZE */ + return MSI_XA_DOMAIN_SIZE; +} + static inline void irq_chip_write_msi_msg(struct irq_data *data, struct msi_msg *msg) { @@ -613,21 +767,11 @@ static int msi_domain_ops_init(struct irq_domain *domain, return 0; } -static int msi_domain_ops_check(struct irq_domain *domain, - struct msi_domain_info *info, - struct device *dev) -{ - return 0; -} - static struct msi_domain_ops msi_domain_ops_default = { .get_hwirq = msi_domain_ops_get_hwirq, .msi_init = msi_domain_ops_init, - .msi_check = msi_domain_ops_check, .msi_prepare = msi_domain_ops_prepare, .set_desc = msi_domain_ops_set_desc, - .domain_alloc_irqs = __msi_domain_alloc_irqs, - .domain_free_irqs = __msi_domain_free_irqs, }; static void msi_domain_update_dom_ops(struct msi_domain_info *info) @@ -639,11 +783,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) return; } - if (ops->domain_alloc_irqs == NULL) - ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs; - if (ops->domain_free_irqs == NULL) - ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs; - if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS)) return; @@ -651,8 +790,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info) ops->get_hwirq = msi_domain_ops_default.get_hwirq; if (ops->msi_init == NULL) ops->msi_init = msi_domain_ops_default.msi_init; - if (ops->msi_check == NULL) - ops->msi_check = msi_domain_ops_default.msi_check; if (ops->msi_prepare == NULL) ops->msi_prepare = msi_domain_ops_default.msi_prepare; if (ops->set_desc == NULL) @@ -668,6 +805,40 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info) chip->irq_set_affinity = msi_domain_set_affinity; } +static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode, + struct msi_domain_info *info, + unsigned int flags, + struct irq_domain *parent) +{ + struct irq_domain *domain; + + if (info->hwsize > MSI_XA_DOMAIN_SIZE) + return NULL; + + /* + * Hardware size 0 is valid for backwards compatibility and for + * domains which are not backed by a hardware table. Grant the + * maximum index space. + */ + if (!info->hwsize) + info->hwsize = MSI_XA_DOMAIN_SIZE; + + msi_domain_update_dom_ops(info); + if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) + msi_domain_update_chip_ops(info); + + domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0, + fwnode, &msi_domain_ops, info); + + if (domain) { + if (!domain->name && info->chip) + domain->name = info->chip->name; + irq_domain_update_bus_token(domain, info->bus_token); + } + + return domain; +} + /** * msi_create_irq_domain - Create an MSI interrupt domain * @fwnode: Optional fwnode of the interrupt controller @@ -680,19 +851,210 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, struct msi_domain_info *info, struct irq_domain *parent) { + return __msi_create_irq_domain(fwnode, info, 0, parent); +} + +/** + * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down + * in the domain hierarchy + * @dev: The device for which the domain should be created + * @domain: The domain in the hierarchy this op is being called on + * @msi_parent_domain: The IRQ_DOMAIN_FLAG_MSI_PARENT domain for the child to + * be created + * @msi_child_info: The MSI domain info of the IRQ_DOMAIN_FLAG_MSI_DEVICE + * domain to be created + * + * Return: true on success, false otherwise + * + * This is the most complex problem of per device MSI domains and the + * underlying interrupt domain hierarchy: + * + * The device domain to be initialized requests the broadest feature set + * possible and the underlying domain hierarchy puts restrictions on it. + * + * That's trivial for a simple parent->child relationship, but it gets + * interesting with an intermediate domain: root->parent->child. The + * intermediate 'parent' can expand the capabilities which the 'root' + * domain is providing. So that creates a classic hen and egg problem: + * Which entity is doing the restrictions/expansions? + * + * One solution is to let the root domain handle the initialization that's + * why there is the @domain and the @msi_parent_domain pointer. + */ +bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain, + struct irq_domain *msi_parent_domain, + struct msi_domain_info *msi_child_info) +{ + struct irq_domain *parent = domain->parent; + + if (WARN_ON_ONCE(!parent || !parent->msi_parent_ops || + !parent->msi_parent_ops->init_dev_msi_info)) + return false; + + return parent->msi_parent_ops->init_dev_msi_info(dev, parent, msi_parent_domain, + msi_child_info); +} + +/** + * msi_create_device_irq_domain - Create a device MSI interrupt domain + * @dev: Pointer to the device + * @domid: Domain id + * @template: MSI domain info bundle used as template + * @hwsize: Maximum number of MSI table entries (0 if unknown or unlimited) + * @domain_data: Optional pointer to domain specific data which is set in + * msi_domain_info::data + * @chip_data: Optional pointer to chip specific data which is set in + * msi_domain_info::chip_data + * + * Return: True on success, false otherwise + * + * There is no firmware node required for this interface because the per + * device domains are software constructs which are actually closer to the + * hardware reality than any firmware can describe them. + * + * The domain name and the irq chip name for a MSI device domain are + * composed by: "$(PREFIX)$(CHIPNAME)-$(DEVNAME)" + * + * $PREFIX: Optional prefix provided by the underlying MSI parent domain + * via msi_parent_ops::prefix. If that pointer is NULL the prefix + * is empty. + * $CHIPNAME: The name of the irq_chip in @template + * $DEVNAME: The name of the device + * + * This results in understandable chip names and hardware interrupt numbers + * in e.g. /proc/interrupts + * + * PCI-MSI-0000:00:1c.0 0-edge Parent domain has no prefix + * IR-PCI-MSI-0000:00:1c.4 0-edge Same with interrupt remapping prefix 'IR-' + * + * IR-PCI-MSIX-0000:3d:00.0 0-edge Hardware interrupt numbers reflect + * IR-PCI-MSIX-0000:3d:00.0 1-edge the real MSI-X index on that device + * IR-PCI-MSIX-0000:3d:00.0 2-edge + * + * On IMS domains the hardware interrupt number is either a table entry + * index or a purely software managed index but it is guaranteed to be + * unique. + * + * The domain pointer is stored in @dev::msi::data::__irqdomains[]. All + * subsequent operations on the domain depend on the domain id. + * + * The domain is automatically freed when the device is removed via devres + * in the context of @dev::msi::data freeing, but it can also be + * independently removed via @msi_remove_device_irq_domain(). + */ +bool msi_create_device_irq_domain(struct device *dev, unsigned int domid, + const struct msi_domain_template *template, + unsigned int hwsize, void *domain_data, + void *chip_data) +{ + struct irq_domain *domain, *parent = dev->msi.domain; + const struct msi_parent_ops *pops; + struct msi_domain_template *bundle; + struct fwnode_handle *fwnode; + + if (!irq_domain_is_msi_parent(parent)) + return false; + + if (domid >= MSI_MAX_DEVICE_IRQDOMAINS) + return false; + + bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL); + if (!bundle) + return false; + + bundle->info.hwsize = hwsize; + bundle->info.chip = &bundle->chip; + bundle->info.ops = &bundle->ops; + bundle->info.data = domain_data; + bundle->info.chip_data = chip_data; + + pops = parent->msi_parent_ops; + snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s", + pops->prefix ? : "", bundle->chip.name, dev_name(dev)); + bundle->chip.name = bundle->name; + + fwnode = irq_domain_alloc_named_fwnode(bundle->name); + if (!fwnode) + goto free_bundle; + + if (msi_setup_device_data(dev)) + goto free_fwnode; + + msi_lock_descs(dev); + + if (WARN_ON_ONCE(msi_get_device_domain(dev, domid))) + goto fail; + + if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info)) + goto fail; + + domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent); + if (!domain) + goto fail; + + domain->dev = dev; + dev->msi.data->__domains[domid].domain = domain; + msi_unlock_descs(dev); + return true; + +fail: + msi_unlock_descs(dev); +free_fwnode: + kfree(fwnode); +free_bundle: + kfree(bundle); + return false; +} + +/** + * msi_remove_device_irq_domain - Free a device MSI interrupt domain + * @dev: Pointer to the device + * @domid: Domain id + */ +void msi_remove_device_irq_domain(struct device *dev, unsigned int domid) +{ + struct msi_domain_info *info; struct irq_domain *domain; - msi_domain_update_dom_ops(info); - if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) - msi_domain_update_chip_ops(info); + msi_lock_descs(dev); - domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0, - fwnode, &msi_domain_ops, info); + domain = msi_get_device_domain(dev, domid); - if (domain && !domain->name && info->chip) - domain->name = info->chip->name; + if (!domain || !irq_domain_is_msi_device(domain)) + goto unlock; - return domain; + dev->msi.data->__domains[domid].domain = NULL; + info = domain->host_data; + irq_domain_remove(domain); + kfree(container_of(info, struct msi_domain_template, info)); + +unlock: + msi_unlock_descs(dev); +} + +/** + * msi_match_device_irq_domain - Match a device irq domain against a bus token + * @dev: Pointer to the device + * @domid: Domain id + * @bus_token: Bus token to match against the domain bus token + * + * Return: True if device domain exists and bus tokens match. + */ +bool msi_match_device_irq_domain(struct device *dev, unsigned int domid, + enum irq_domain_bus_token bus_token) +{ + struct msi_domain_info *info; + struct irq_domain *domain; + bool ret = false; + + msi_lock_descs(dev); + domain = msi_get_device_domain(dev, domid); + if (domain && irq_domain_is_msi_device(domain)) { + info = domain->host_data; + ret = info->bus_token == bus_token; + } + msi_unlock_descs(dev); + return ret; } int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, @@ -700,13 +1062,8 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - int ret; - - ret = ops->msi_check(domain, info, dev); - if (ret == 0) - ret = ops->msi_prepare(domain, dev, nvec, arg); - return ret; + return ops->msi_prepare(domain, dev, nvec, arg); } int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, @@ -714,16 +1071,27 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; + struct msi_ctrl ctrl = { + .domid = MSI_DEFAULT_DOMAIN, + .first = virq_base, + .last = virq_base + nvec - 1, + }; struct msi_desc *desc; + struct xarray *xa; int ret, virq; + if (!msi_ctrl_valid(dev, &ctrl)) + return -EINVAL; + msi_lock_descs(dev); - ret = msi_add_simple_msi_descs(dev, virq_base, nvec); + ret = msi_domain_add_simple_msi_descs(dev, &ctrl); if (ret) goto unlock; + xa = &dev->msi.data->__domains[ctrl.domid].store; + for (virq = virq_base; virq < virq_base + nvec; virq++) { - desc = xa_load(&dev->msi.data->__store, virq); + desc = xa_load(xa, virq); desc->irq = virq; ops->set_desc(arg, desc); @@ -739,7 +1107,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, fail: for (--virq; virq >= virq_base; virq--) irq_domain_free_irqs_common(domain, virq, 1); - msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1); + msi_domain_free_descs(dev, &ctrl); unlock: msi_unlock_descs(dev); return ret; @@ -764,6 +1132,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain, switch(domain->bus_token) { case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSIX: case DOMAIN_BUS_VMD_MSI: break; default: @@ -789,6 +1159,8 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc, { switch(domain->bus_token) { case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSI: + case DOMAIN_BUS_PCI_DEVICE_MSIX: case DOMAIN_BUS_VMD_MSI: if (IS_ENABLED(CONFIG_PCI_MSI)) break; @@ -850,18 +1222,19 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag return 0; } -int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain, + struct msi_ctrl *ctrl) { + struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store; struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; + unsigned int vflags = 0, allocated = 0; msi_alloc_info_t arg = { }; - unsigned int vflags = 0; struct msi_desc *desc; - int allocated = 0; + unsigned long idx; int i, ret, virq; - ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); + ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg); if (ret) return ret; @@ -883,11 +1256,21 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, * MSI affinity setting requires a special quirk (X86) when * reservation mode is active. */ - if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) + if (info->flags & MSI_FLAG_NOMASK_QUIRK) vflags |= VIRQ_NOMASK_QUIRK; } - msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) { + xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) { + if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED)) + continue; + + /* This should return -ECONFUSED... */ + if (WARN_ON_ONCE(allocated >= ctrl->nirqs)) + return -EINVAL; + + if (ops->prepare_desc) + ops->prepare_desc(domain, &arg, desc); + ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, @@ -913,76 +1296,213 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return 0; } -static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info, - struct device *dev, - unsigned int num_descs) +static int msi_domain_alloc_simple_msi_descs(struct device *dev, + struct msi_domain_info *info, + struct msi_ctrl *ctrl) { if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS)) return 0; - return msi_add_simple_msi_descs(dev, 0, num_descs); + return msi_domain_add_simple_msi_descs(dev, ctrl); +} + +static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl) +{ + struct msi_domain_info *info; + struct msi_domain_ops *ops; + struct irq_domain *domain; + int ret; + + if (!msi_ctrl_valid(dev, ctrl)) + return -EINVAL; + + domain = msi_get_device_domain(dev, ctrl->domid); + if (!domain) + return -ENODEV; + + info = domain->host_data; + + ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl); + if (ret) + return ret; + + ops = info->ops; + if (ops->domain_alloc_irqs) + return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs); + + return __msi_domain_alloc_irqs(dev, domain, ctrl); +} + +static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl) +{ + int ret = __msi_domain_alloc_locked(dev, ctrl); + + if (ret) + msi_domain_free_locked(dev, ctrl); + return ret; } /** - * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain - * @domain: The domain to allocate from + * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain * @dev: Pointer to device struct of the device for which the interrupts * are allocated - * @nvec: The number of interrupts to allocate + * @domid: Id of the interrupt domain to operate on + * @first: First index to allocate (inclusive) + * @last: Last index to allocate (inclusive) * * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() - * pair. Use this for MSI irqdomains which implement their own vector + * pair. Use this for MSI irqdomains which implement their own descriptor * allocation/free. * * Return: %0 on success or an error code. */ -int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev, - int nvec) +int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) { - struct msi_domain_info *info = domain->host_data; - struct msi_domain_ops *ops = info->ops; - int ret; - - lockdep_assert_held(&dev->msi.data->mutex); + struct msi_ctrl ctrl = { + .domid = domid, + .first = first, + .last = last, + .nirqs = last + 1 - first, + }; + + return msi_domain_alloc_locked(dev, &ctrl); +} - ret = msi_domain_add_simple_msi_descs(info, dev, nvec); - if (ret) - return ret; +/** + * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @domid: Id of the interrupt domain to operate on + * @first: First index to allocate (inclusive) + * @last: Last index to allocate (inclusive) + * + * Return: %0 on success or an error code. + */ +int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + int ret; - ret = ops->domain_alloc_irqs(domain, dev, nvec); - if (ret) - msi_domain_free_irqs_descs_locked(domain, dev); + msi_lock_descs(dev); + ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last); + msi_unlock_descs(dev); return ret; } /** - * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain - * @domain: The domain to allocate from + * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain + * * @dev: Pointer to device struct of the device for which the interrupts * are allocated - * @nvec: The number of interrupts to allocate + * @domid: Id of the interrupt domain to operate on + * @nirqs: The number of interrupts to allocate + * + * This function scans all MSI descriptors of the MSI domain and allocates interrupts + * for all unassigned ones. That function is to be used for MSI domain usage where + * the descriptor allocation is handled at the call site, e.g. PCI/MSI[X]. * * Return: %0 on success or an error code. */ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) +int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs) { + struct msi_ctrl ctrl = { + .domid = domid, + .first = 0, + .last = msi_domain_get_hwsize(dev, domid) - 1, + .nirqs = nirqs, + }; + + return msi_domain_alloc_locked(dev, &ctrl); +} + +/** + * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at + * a given index - or at the next free index + * + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @domid: Id of the interrupt domain to operate on + * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation + * uses the next free index. + * @affdesc: Optional pointer to an interrupt affinity descriptor structure + * @icookie: Optional pointer to a domain specific per instance cookie. If + * non-NULL the content of the cookie is stored in msi_desc::data. + * Must be NULL for MSI-X allocations + * + * This requires a MSI interrupt domain which lets the core code manage the + * MSI descriptors. + * + * Return: struct msi_map + * + * On success msi_map::index contains the allocated index number and + * msi_map::virq the corresponding Linux interrupt number + * + * On failure msi_map::index contains the error code and msi_map::virq + * is %0. + */ +struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index, + const struct irq_affinity_desc *affdesc, + union msi_instance_cookie *icookie) +{ + struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, }; + struct irq_domain *domain; + struct msi_map map = { }; + struct msi_desc *desc; int ret; msi_lock_descs(dev); - ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec); + domain = msi_get_device_domain(dev, domid); + if (!domain) { + map.index = -ENODEV; + goto unlock; + } + + desc = msi_alloc_desc(dev, 1, affdesc); + if (!desc) { + map.index = -ENOMEM; + goto unlock; + } + + if (icookie) + desc->data.icookie = *icookie; + + ret = msi_insert_desc(dev, desc, domid, index); + if (ret) { + map.index = ret; + goto unlock; + } + + ctrl.first = ctrl.last = desc->msi_index; + + ret = __msi_domain_alloc_irqs(dev, domain, &ctrl); + if (ret) { + map.index = ret; + msi_domain_free_locked(dev, &ctrl); + } else { + map.index = desc->msi_index; + map.virq = desc->irq; + } +unlock: msi_unlock_descs(dev); - return ret; + return map; } -void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain, + struct msi_ctrl *ctrl) { + struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store; struct msi_domain_info *info = domain->host_data; struct irq_data *irqd; struct msi_desc *desc; + unsigned long idx; int i; - /* Only handle MSI entries which have an interrupt associated */ - msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) { + xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) { + /* Only handle MSI entries which have an interrupt associated */ + if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED)) + continue; + /* Make sure all interrupts are deactivated */ for (i = 0; i < desc->nvec_used; i++) { irqd = irq_domain_get_irq_data(domain, desc->irq + i); @@ -997,44 +1517,99 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) } } -static void msi_domain_free_msi_descs(struct msi_domain_info *info, - struct device *dev) +static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl) { + struct msi_domain_info *info; + struct msi_domain_ops *ops; + struct irq_domain *domain; + + if (!msi_ctrl_valid(dev, ctrl)) + return; + + domain = msi_get_device_domain(dev, ctrl->domid); + if (!domain) + return; + + info = domain->host_data; + ops = info->ops; + + if (ops->domain_free_irqs) + ops->domain_free_irqs(domain, dev); + else + __msi_domain_free_irqs(dev, domain, ctrl); + + if (ops->msi_post_free) + ops->msi_post_free(domain, dev); + if (info->flags & MSI_FLAG_FREE_MSI_DESCS) - msi_free_msi_descs(dev); + msi_domain_free_descs(dev, ctrl); } /** - * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev - * @domain: The domain to managing the interrupts + * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain + * associated to @dev with msi_lock held * @dev: Pointer to device struct of the device for which the interrupts - * are free + * are freed + * @domid: Id of the interrupt domain to operate on + * @first: First index to free (inclusive) + * @last: Last index to free (inclusive) + */ +void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + struct msi_ctrl ctrl = { + .domid = domid, + .first = first, + .last = last, + }; + msi_domain_free_locked(dev, &ctrl); +} + +/** + * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain + * associated to @dev + * @dev: Pointer to device struct of the device for which the interrupts + * are freed + * @domid: Id of the interrupt domain to operate on + * @first: First index to free (inclusive) + * @last: Last index to free (inclusive) + */ +void msi_domain_free_irqs_range(struct device *dev, unsigned int domid, + unsigned int first, unsigned int last) +{ + msi_lock_descs(dev); + msi_domain_free_irqs_range_locked(dev, domid, first, last); + msi_unlock_descs(dev); +} + +/** + * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain + * associated to a device + * @dev: Pointer to device struct of the device for which the interrupts + * are freed + * @domid: The id of the domain to operate on * * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() * pair. Use this for MSI irqdomains which implement their own vector * allocation. */ -void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev) +void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid) { - struct msi_domain_info *info = domain->host_data; - struct msi_domain_ops *ops = info->ops; - - lockdep_assert_held(&dev->msi.data->mutex); - - ops->domain_free_irqs(domain, dev); - msi_domain_free_msi_descs(info, dev); + msi_domain_free_irqs_range_locked(dev, domid, 0, + msi_domain_get_hwsize(dev, domid) - 1); } /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev - * @domain: The domain to managing the interrupts + * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain + * associated to a device * @dev: Pointer to device struct of the device for which the interrupts - * are free + * are freed + * @domid: The id of the domain to operate on */ -void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +void msi_domain_free_irqs_all(struct device *dev, unsigned int domid) { msi_lock_descs(dev); - msi_domain_free_irqs_descs_locked(domain, dev); + msi_domain_free_irqs_all_locked(dev, domid); msi_unlock_descs(dev); } @@ -1048,5 +1623,3 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) { return (struct msi_domain_info *)domain->host_data; } - -#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 714ac4c3b556..d9c822bbffb8 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -113,11 +113,40 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -void static_key_slow_inc_cpuslocked(struct static_key *key) +/* + * static_key_fast_inc_not_disabled - adds a user for a static key + * @key: static key that must be already enabled + * + * The caller must make sure that the static key can't get disabled while + * in this function. It doesn't patch jump labels, only adds a user to + * an already enabled static key. + * + * Returns true if the increment was done. Unlike refcount_t the ref counter + * is not saturated, but will fail to increment on overflow. + */ +bool static_key_fast_inc_not_disabled(struct static_key *key) { - int v, v1; + int v; STATIC_KEY_CHECK_USE(key); + /* + * Negative key->enabled has a special meaning: it sends + * static_key_slow_inc() down the slow path, and it is non-zero + * so it counts as "enabled" in jump_label_update(). Note that + * atomic_inc_unless_negative() checks >= 0, so roll our own. + */ + v = atomic_read(&key->enabled); + do { + if (v <= 0 || (v + 1) < 0) + return false; + } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); + + return true; +} +EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled); + +bool static_key_slow_inc_cpuslocked(struct static_key *key) +{ lockdep_assert_cpus_held(); /* @@ -126,17 +155,9 @@ void static_key_slow_inc_cpuslocked(struct static_key *key) * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see * static_key_enabled(&key) for jumps to be updated properly. - * - * So give a special meaning to negative key->enabled: it sends - * static_key_slow_inc() down the slow path, and it is non-zero - * so it counts as "enabled" in jump_label_update(). Note that - * atomic_inc_unless_negative() checks >= 0, so roll our own. */ - for (v = atomic_read(&key->enabled); v > 0; v = v1) { - v1 = atomic_cmpxchg(&key->enabled, v, v + 1); - if (likely(v1 == v)) - return; - } + if (static_key_fast_inc_not_disabled(key)) + return true; jump_label_lock(); if (atomic_read(&key->enabled) == 0) { @@ -148,16 +169,23 @@ void static_key_slow_inc_cpuslocked(struct static_key *key) */ atomic_set_release(&key->enabled, 1); } else { - atomic_inc(&key->enabled); + if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) { + jump_label_unlock(); + return false; + } } jump_label_unlock(); + return true; } -void static_key_slow_inc(struct static_key *key) +bool static_key_slow_inc(struct static_key *key) { + bool ret; + cpus_read_lock(); - static_key_slow_inc_cpuslocked(key); + ret = static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(static_key_slow_inc); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 60c20f301a6b..83f499182c9a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -146,7 +146,7 @@ static unsigned int get_symbol_offset(unsigned long pos) return name - kallsyms_names; } -static unsigned long kallsyms_sym_address(int idx) +unsigned long kallsyms_sym_address(int idx) { if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) return kallsyms_addresses[idx]; @@ -187,26 +187,100 @@ static bool cleanup_symbol_name(char *s) return false; } +static int compare_symbol_name(const char *name, char *namebuf) +{ + int ret; + + ret = strcmp(name, namebuf); + if (!ret) + return ret; + + if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf)) + return 0; + + return ret; +} + +static unsigned int get_symbol_seq(int index) +{ + unsigned int i, seq = 0; + + for (i = 0; i < 3; i++) + seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i]; + + return seq; +} + +static int kallsyms_lookup_names(const char *name, + unsigned int *start, + unsigned int *end) +{ + int ret; + int low, mid, high; + unsigned int seq, off; + char namebuf[KSYM_NAME_LEN]; + + low = 0; + high = kallsyms_num_syms - 1; + + while (low <= high) { + mid = low + (high - low) / 2; + seq = get_symbol_seq(mid); + off = get_symbol_offset(seq); + kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); + ret = compare_symbol_name(name, namebuf); + if (ret > 0) + low = mid + 1; + else if (ret < 0) + high = mid - 1; + else + break; + } + + if (low > high) + return -ESRCH; + + low = mid; + while (low) { + seq = get_symbol_seq(low - 1); + off = get_symbol_offset(seq); + kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); + if (compare_symbol_name(name, namebuf)) + break; + low--; + } + *start = low; + + if (end) { + high = mid; + while (high < kallsyms_num_syms - 1) { + seq = get_symbol_seq(high + 1); + off = get_symbol_offset(seq); + kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); + if (compare_symbol_name(name, namebuf)) + break; + high++; + } + *end = high; + } + + return 0; +} + /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { - char namebuf[KSYM_NAME_LEN]; - unsigned long i; - unsigned int off; + int ret; + unsigned int i; /* Skip the search for empty string. */ if (!*name) return 0; - for (i = 0, off = 0; i < kallsyms_num_syms; i++) { - off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - - if (strcmp(namebuf, name) == 0) - return kallsyms_sym_address(i); + ret = kallsyms_lookup_names(name, &i, NULL); + if (!ret) + return kallsyms_sym_address(get_symbol_seq(i)); - if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0) - return kallsyms_sym_address(i); - } return module_kallsyms_lookup_name(name); } @@ -233,6 +307,24 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, return 0; } +int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long), + const char *name, void *data) +{ + int ret; + unsigned int i, start, end; + + ret = kallsyms_lookup_names(name, &start, &end); + if (ret) + return 0; + + for (i = start; !ret && i <= end; i++) { + ret = fn(data, kallsyms_sym_address(get_symbol_seq(i))); + cond_resched(); + } + + return ret; +} + static unsigned long get_symbol_pos(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h index 2d0c6f2f0243..27fabdcc40f5 100644 --- a/kernel/kallsyms_internal.h +++ b/kernel/kallsyms_internal.h @@ -26,5 +26,6 @@ extern const char kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; extern const unsigned int kallsyms_markers[] __weak; +extern const u8 kallsyms_seqs_of_names[] __weak; #endif // LINUX_KALLSYMS_INTERNAL_H_ diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c new file mode 100644 index 000000000000..f35d9cc1aab1 --- /dev/null +++ b/kernel/kallsyms_selftest.c @@ -0,0 +1,485 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Test the function and performance of kallsyms + * + * Copyright (C) Huawei Technologies Co., Ltd., 2022 + * + * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei + */ + +#define pr_fmt(fmt) "kallsyms_selftest: " fmt + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/random.h> +#include <linux/sched/clock.h> +#include <linux/kthread.h> +#include <linux/vmalloc.h> + +#include "kallsyms_internal.h" +#include "kallsyms_selftest.h" + + +#define MAX_NUM_OF_RECORDS 64 + +struct test_stat { + int min; + int max; + int save_cnt; + int real_cnt; + int perf; + u64 sum; + char *name; + unsigned long addr; + unsigned long addrs[MAX_NUM_OF_RECORDS]; +}; + +struct test_item { + char *name; + unsigned long addr; +}; + +#define ITEM_FUNC(s) \ + { \ + .name = #s, \ + .addr = (unsigned long)s, \ + } + +#define ITEM_DATA(s) \ + { \ + .name = #s, \ + .addr = (unsigned long)&s, \ + } + + +static int kallsyms_test_var_bss_static; +static int kallsyms_test_var_data_static = 1; +int kallsyms_test_var_bss; +int kallsyms_test_var_data = 1; + +static int kallsyms_test_func_static(void) +{ + kallsyms_test_var_bss_static++; + kallsyms_test_var_data_static++; + + return 0; +} + +int kallsyms_test_func(void) +{ + return kallsyms_test_func_static(); +} + +__weak int kallsyms_test_func_weak(void) +{ + kallsyms_test_var_bss++; + kallsyms_test_var_data++; + return 0; +} + +static struct test_item test_items[] = { + ITEM_FUNC(kallsyms_test_func_static), + ITEM_FUNC(kallsyms_test_func), + ITEM_FUNC(kallsyms_test_func_weak), + ITEM_FUNC(vmalloc), + ITEM_FUNC(vfree), +#ifdef CONFIG_KALLSYMS_ALL + ITEM_DATA(kallsyms_test_var_bss_static), + ITEM_DATA(kallsyms_test_var_data_static), + ITEM_DATA(kallsyms_test_var_bss), + ITEM_DATA(kallsyms_test_var_data), + ITEM_DATA(vmap_area_list), +#endif +}; + +static char stub_name[KSYM_NAME_LEN]; + +static int stat_symbol_len(void *data, const char *name, struct module *mod, unsigned long addr) +{ + *(u32 *)data += strlen(name); + + return 0; +} + +static void test_kallsyms_compression_ratio(void) +{ + u32 pos, off, len, num; + u32 ratio, total_size, total_len = 0; + + kallsyms_on_each_symbol(stat_symbol_len, &total_len); + + /* + * A symbol name cannot start with a number. This stub name helps us + * traverse the entire symbol table without finding a match. It's used + * for subsequent performance tests, and its length is the average + * length of all symbol names. + */ + memset(stub_name, '4', sizeof(stub_name)); + pos = total_len / kallsyms_num_syms; + stub_name[pos] = 0; + + pos = 0; + num = 0; + off = 0; + while (pos < kallsyms_num_syms) { + len = kallsyms_names[off]; + num++; + off++; + pos++; + if ((len & 0x80) != 0) { + len = (len & 0x7f) | (kallsyms_names[off] << 7); + num++; + off++; + } + off += len; + } + + /* + * 1. The length fields is not counted + * 2. The memory occupied by array kallsyms_token_table[] and + * kallsyms_token_index[] needs to be counted. + */ + total_size = off - num; + pos = kallsyms_token_index[0xff]; + total_size += pos + strlen(&kallsyms_token_table[pos]) + 1; + total_size += 0x100 * sizeof(u16); + + pr_info(" ---------------------------------------------------------\n"); + pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n"); + pr_info("|---------------------------------------------------------|\n"); + ratio = (u32)div_u64(10000ULL * total_size, total_len); + pr_info("| %10d | %10d | %10d | %2d.%-2d |\n", + kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100); + pr_info(" ---------------------------------------------------------\n"); +} + +static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr) +{ + u64 t0, t1, t; + unsigned long flags; + struct test_stat *stat = (struct test_stat *)data; + + local_irq_save(flags); + t0 = sched_clock(); + (void)kallsyms_lookup_name(name); + t1 = sched_clock(); + local_irq_restore(flags); + + t = t1 - t0; + if (t < stat->min) + stat->min = t; + + if (t > stat->max) + stat->max = t; + + stat->real_cnt++; + stat->sum += t; + + return 0; +} + +static void test_perf_kallsyms_lookup_name(void) +{ + struct test_stat stat; + + memset(&stat, 0, sizeof(stat)); + stat.min = INT_MAX; + kallsyms_on_each_symbol(lookup_name, &stat); + pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt); + pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n", + stat.min, stat.max, div_u64(stat.sum, stat.real_cnt)); +} + +static bool match_cleanup_name(const char *s, const char *name) +{ + char *p; + int len; + + if (!IS_ENABLED(CONFIG_LTO_CLANG)) + return false; + + p = strchr(s, '.'); + if (!p) + return false; + + len = strlen(name); + if (p - s != len) + return false; + + return !strncmp(s, name, len); +} + +static int find_symbol(void *data, const char *name, struct module *mod, unsigned long addr) +{ + struct test_stat *stat = (struct test_stat *)data; + + if (strcmp(name, stat->name) == 0 || + (!stat->perf && match_cleanup_name(name, stat->name))) { + stat->real_cnt++; + stat->addr = addr; + + if (stat->save_cnt < MAX_NUM_OF_RECORDS) { + stat->addrs[stat->save_cnt] = addr; + stat->save_cnt++; + } + + if (stat->real_cnt == stat->max) + return 1; + } + + return 0; +} + +static void test_perf_kallsyms_on_each_symbol(void) +{ + u64 t0, t1; + unsigned long flags; + struct test_stat stat; + + memset(&stat, 0, sizeof(stat)); + stat.max = INT_MAX; + stat.name = stub_name; + stat.perf = 1; + local_irq_save(flags); + t0 = sched_clock(); + kallsyms_on_each_symbol(find_symbol, &stat); + t1 = sched_clock(); + local_irq_restore(flags); + pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0); +} + +static int match_symbol(void *data, unsigned long addr) +{ + struct test_stat *stat = (struct test_stat *)data; + + stat->real_cnt++; + stat->addr = addr; + + if (stat->save_cnt < MAX_NUM_OF_RECORDS) { + stat->addrs[stat->save_cnt] = addr; + stat->save_cnt++; + } + + if (stat->real_cnt == stat->max) + return 1; + + return 0; +} + +static void test_perf_kallsyms_on_each_match_symbol(void) +{ + u64 t0, t1; + unsigned long flags; + struct test_stat stat; + + memset(&stat, 0, sizeof(stat)); + stat.max = INT_MAX; + stat.name = stub_name; + local_irq_save(flags); + t0 = sched_clock(); + kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat); + t1 = sched_clock(); + local_irq_restore(flags); + pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0); +} + +static int test_kallsyms_basic_function(void) +{ + int i, j, ret; + int next = 0, nr_failed = 0; + char *prefix; + unsigned short rand; + unsigned long addr, lookup_addr; + char namebuf[KSYM_NAME_LEN]; + struct test_stat *stat, *stat2; + + stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL); + if (!stat) + return -ENOMEM; + stat2 = stat + 1; + + prefix = "kallsyms_lookup_name() for"; + for (i = 0; i < ARRAY_SIZE(test_items); i++) { + addr = kallsyms_lookup_name(test_items[i].name); + if (addr != test_items[i].addr) { + nr_failed++; + pr_info("%s %s failed: addr=%lx, expect %lx\n", + prefix, test_items[i].name, addr, test_items[i].addr); + } + } + + prefix = "kallsyms_on_each_symbol() for"; + for (i = 0; i < ARRAY_SIZE(test_items); i++) { + memset(stat, 0, sizeof(*stat)); + stat->max = INT_MAX; + stat->name = test_items[i].name; + kallsyms_on_each_symbol(find_symbol, stat); + if (stat->addr != test_items[i].addr || stat->real_cnt != 1) { + nr_failed++; + pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n", + prefix, test_items[i].name, + stat->real_cnt, stat->addr, test_items[i].addr); + } + } + + prefix = "kallsyms_on_each_match_symbol() for"; + for (i = 0; i < ARRAY_SIZE(test_items); i++) { + memset(stat, 0, sizeof(*stat)); + stat->max = INT_MAX; + stat->name = test_items[i].name; + kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat); + if (stat->addr != test_items[i].addr || stat->real_cnt != 1) { + nr_failed++; + pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n", + prefix, test_items[i].name, + stat->real_cnt, stat->addr, test_items[i].addr); + } + } + + if (nr_failed) { + kfree(stat); + return -ESRCH; + } + + for (i = 0; i < kallsyms_num_syms; i++) { + addr = kallsyms_sym_address(i); + if (!is_ksym_addr(addr)) + continue; + + ret = lookup_symbol_name(addr, namebuf); + if (unlikely(ret)) { + namebuf[0] = 0; + goto failed; + } + + /* + * The first '.' may be the initial letter, in which case the + * entire symbol name will be truncated to an empty string in + * cleanup_symbol_name(). Do not test these symbols. + * + * For example: + * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head + * .E_read_words + * .E_leading_bytes + * .E_trailing_bytes + * .E_write_words + * .E_copy + * .str.292.llvm.12122243386960820698 + * .str.24.llvm.12122243386960820698 + * .str.29.llvm.12122243386960820698 + * .str.75.llvm.12122243386960820698 + * .str.99.llvm.12122243386960820698 + */ + if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0]) + continue; + + lookup_addr = kallsyms_lookup_name(namebuf); + + memset(stat, 0, sizeof(*stat)); + stat->max = INT_MAX; + kallsyms_on_each_match_symbol(match_symbol, namebuf, stat); + + /* + * kallsyms_on_each_symbol() is too slow, randomly select some + * symbols for test. + */ + if (i >= next) { + memset(stat2, 0, sizeof(*stat2)); + stat2->max = INT_MAX; + stat2->name = namebuf; + kallsyms_on_each_symbol(find_symbol, stat2); + + /* + * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol() + * need to get the same traversal result. + */ + if (stat->addr != stat2->addr || + stat->real_cnt != stat2->real_cnt || + memcmp(stat->addrs, stat2->addrs, + stat->save_cnt * sizeof(stat->addrs[0]))) + goto failed; + + /* + * The average of random increments is 128, that is, one of + * them is tested every 128 symbols. + */ + get_random_bytes(&rand, sizeof(rand)); + next = i + (rand & 0xff) + 1; + } + + /* Need to be found at least once */ + if (!stat->real_cnt) + goto failed; + + /* + * kallsyms_lookup_name() returns the address of the first + * symbol found and cannot be NULL. + */ + if (!lookup_addr || lookup_addr != stat->addrs[0]) + goto failed; + + /* + * If the addresses of all matching symbols are recorded, the + * target address needs to be exist. + */ + if (stat->real_cnt <= MAX_NUM_OF_RECORDS) { + for (j = 0; j < stat->save_cnt; j++) { + if (stat->addrs[j] == addr) + break; + } + + if (j == stat->save_cnt) + goto failed; + } + } + + kfree(stat); + + return 0; + +failed: + pr_info("Test for %dth symbol failed: (%s) addr=%lx", i, namebuf, addr); + kfree(stat); + return -ESRCH; +} + +static int test_entry(void *p) +{ + int ret; + + do { + schedule_timeout(5 * HZ); + } while (system_state != SYSTEM_RUNNING); + + pr_info("start\n"); + ret = test_kallsyms_basic_function(); + if (ret) { + pr_info("abort\n"); + return 0; + } + + test_kallsyms_compression_ratio(); + test_perf_kallsyms_lookup_name(); + test_perf_kallsyms_on_each_symbol(); + test_perf_kallsyms_on_each_match_symbol(); + pr_info("finish\n"); + + return 0; +} + +static int __init kallsyms_test_init(void) +{ + struct task_struct *t; + + t = kthread_create(test_entry, NULL, "kallsyms_test"); + if (IS_ERR(t)) { + pr_info("Create kallsyms selftest task failed\n"); + return PTR_ERR(t); + } + kthread_bind(t, 0); + wake_up_process(t); + + return 0; +} +late_initcall(kallsyms_test_init); diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h new file mode 100644 index 000000000000..c0ca548e2a22 --- /dev/null +++ b/kernel/kallsyms_selftest.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef LINUX_KALLSYMS_SELFTEST_H_ +#define LINUX_KALLSYMS_SELFTEST_H_ + +#include <linux/types.h> + +extern int kallsyms_test_var_bss; +extern int kallsyms_test_var_data; + +extern int kallsyms_test_func(void); +extern int kallsyms_test_func_weak(void); + +#endif // LINUX_KALLSYMS_SELFTEST_H_ diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index 4f35d1bced6a..8cf70f068d92 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -17,4 +17,5 @@ KCSAN_INSTRUMENT_BARRIERS_selftest.o := y obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer +CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN) obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index fe12dfe254ec..54d077e1a2dc 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -14,10 +14,12 @@ #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> +#include <linux/minmax.h> #include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/preempt.h> #include <linux/sched.h> +#include <linux/string.h> #include <linux/uaccess.h> #include "encoding.h" @@ -1308,3 +1310,51 @@ noinline void __tsan_atomic_signal_fence(int memorder) } } EXPORT_SYMBOL(__tsan_atomic_signal_fence); + +#ifdef __HAVE_ARCH_MEMSET +void *__tsan_memset(void *s, int c, size_t count); +noinline void *__tsan_memset(void *s, int c, size_t count) +{ + /* + * Instead of not setting up watchpoints where accessed size is greater + * than MAX_ENCODABLE_SIZE, truncate checked size to MAX_ENCODABLE_SIZE. + */ + size_t check_len = min_t(size_t, count, MAX_ENCODABLE_SIZE); + + check_access(s, check_len, KCSAN_ACCESS_WRITE, _RET_IP_); + return memset(s, c, count); +} +#else +void *__tsan_memset(void *s, int c, size_t count) __alias(memset); +#endif +EXPORT_SYMBOL(__tsan_memset); + +#ifdef __HAVE_ARCH_MEMMOVE +void *__tsan_memmove(void *dst, const void *src, size_t len); +noinline void *__tsan_memmove(void *dst, const void *src, size_t len) +{ + size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE); + + check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_); + check_access(src, check_len, 0, _RET_IP_); + return memmove(dst, src, len); +} +#else +void *__tsan_memmove(void *dst, const void *src, size_t len) __alias(memmove); +#endif +EXPORT_SYMBOL(__tsan_memmove); + +#ifdef __HAVE_ARCH_MEMCPY +void *__tsan_memcpy(void *dst, const void *src, size_t len); +noinline void *__tsan_memcpy(void *dst, const void *src, size_t len) +{ + size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE); + + check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_); + check_access(src, check_len, 0, _RET_IP_); + return memcpy(dst, src, len); +} +#else +void *__tsan_memcpy(void *dst, const void *src, size_t len) __alias(memcpy); +#endif +EXPORT_SYMBOL(__tsan_memcpy); diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c index 67794404042a..e95ce7d7a76e 100644 --- a/kernel/kcsan/report.c +++ b/kernel/kcsan/report.c @@ -492,8 +492,7 @@ static void print_report(enum kcsan_value_change value_change, dump_stack_print_info(KERN_DEFAULT); pr_err("==================================================================\n"); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("KCSAN"); } static void release_report(unsigned long *flags, struct other_info *other_info) diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 00cdf8fa5693..8679322450f2 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -22,13 +22,6 @@ #define ITERS_PER_TEST 2000 -/* Test requirements. */ -static bool __init test_requires(void) -{ - /* random should be initialized for the below tests */ - return get_random_u32() + get_random_u32() != 0; -} - /* * Test watchpoint encode and decode: check that encoding some access's info, * and then subsequent decode preserves the access's info. @@ -38,8 +31,8 @@ static bool __init test_encode_decode(void) int i; for (i = 0; i < ITERS_PER_TEST; ++i) { - size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1; - bool is_write = !!prandom_u32_max(2); + size_t size = get_random_u32_inclusive(1, MAX_ENCODABLE_SIZE); + bool is_write = !!get_random_u32_below(2); unsigned long verif_masked_addr; long encoded_watchpoint; bool verif_is_write; @@ -259,7 +252,6 @@ static int __init kcsan_selftest(void) pr_err("selftest: " #do_test " failed"); \ } while (0) - RUN_TEST(test_requires); RUN_TEST(test_encode_decode); RUN_TEST(test_matching_access); RUN_TEST(test_barrier); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index ca2743f9c634..969e8f52f7da 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -561,23 +561,17 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) static int kimage_set_destination(struct kimage *image, unsigned long destination) { - int result; - destination &= PAGE_MASK; - result = kimage_add_entry(image, destination | IND_DESTINATION); - return result; + return kimage_add_entry(image, destination | IND_DESTINATION); } static int kimage_add_page(struct kimage *image, unsigned long page) { - int result; - page &= PAGE_MASK; - result = kimage_add_entry(image, page | IND_SOURCE); - return result; + return kimage_add_entry(image, page | IND_SOURCE); } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 45637511e0de..dd5983010b7b 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -1141,7 +1141,7 @@ int crash_exclude_mem_range(struct crash_mem *mem, { int i, j; unsigned long long start, end, p_start, p_end; - struct crash_mem_range temp_range = {0, 0}; + struct range temp_range = {0, 0}; for (i = 0; i < mem->nr_ranges; i++) { start = mem->ranges[i].start; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index cd9f5a66a690..1c18ecf9f98b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1766,7 +1766,13 @@ static int __unregister_kprobe_top(struct kprobe *p) if ((list_p != p) && (list_p->post_handler)) goto noclean; } - ap->post_handler = NULL; + /* + * For the kprobe-on-ftrace case, we keep the + * post_handler setting to identify this aggrprobe + * armed with kprobe_ipmodify_ops. + */ + if (!kprobe_ftrace(ap)) + ap->post_handler = NULL; } noclean: /* @@ -2207,13 +2213,9 @@ int register_kretprobe(struct kretprobe *rp) rp->kp.post_handler = NULL; /* Pre-allocate memory for max kretprobe instances */ - if (rp->maxactive <= 0) { -#ifdef CONFIG_PREEMPTION + if (rp->maxactive <= 0) rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); -#else - rp->maxactive = num_possible_cpus(); -#endif - } + #ifdef CONFIG_KRETPROBE_ON_RETHOOK rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler); if (!rp->rh) @@ -2358,6 +2360,14 @@ static void kill_kprobe(struct kprobe *p) lockdep_assert_held(&kprobe_mutex); + /* + * The module is going away. We should disarm the kprobe which + * is using ftrace, because ftrace framework is still available at + * 'MODULE_STATE_GOING' notification. + */ + if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) + disarm_kprobe_ftrace(p); + p->flags |= KPROBE_FLAG_GONE; if (kprobe_aggrprobe(p)) { /* @@ -2374,14 +2384,6 @@ static void kill_kprobe(struct kprobe *p) * the original probed function (which will be freed soon) any more. */ arch_remove_kprobe(p); - - /* - * The module is going away. We should disarm the kprobe which - * is using ftrace, because ftrace framework is still available at - * 'MODULE_STATE_GOING' notification. - */ - if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) - disarm_kprobe_ftrace(p); } /* Disable one kprobe */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 65dba9076f31..2df00b789b90 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -6,6 +6,7 @@ * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> */ +#include <asm/byteorder.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/sysfs.h> @@ -20,6 +21,14 @@ #include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ +#if defined(__LITTLE_ENDIAN) +#define CPU_BYTEORDER_STRING "little" +#elif defined(__BIG_ENDIAN) +#define CPU_BYTEORDER_STRING "big" +#else +#error Unknown byteorder +#endif + #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -34,6 +43,14 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, } KERNEL_ATTR_RO(uevent_seqnum); +/* cpu byteorder */ +static ssize_t cpu_byteorder_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING); +} +KERNEL_ATTR_RO(cpu_byteorder); + #ifdef CONFIG_UEVENT_HELPER /* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, @@ -215,6 +232,7 @@ EXPORT_SYMBOL_GPL(kernel_kobj); static struct attribute * kernel_attrs[] = { &fscaps_attr.attr, &uevent_seqnum_attr.attr, + &cpu_byteorder_attr.attr, #ifdef CONFIG_UEVENT_HELPER &uevent_helper_attr.attr, #endif diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 9ada0bc5247b..201f0c0482fb 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -125,20 +125,10 @@ struct klp_find_arg { unsigned long pos; }; -static int klp_find_callback(void *data, const char *name, - struct module *mod, unsigned long addr) +static int klp_match_callback(void *data, unsigned long addr) { struct klp_find_arg *args = data; - if ((mod && !args->objname) || (!mod && args->objname)) - return 0; - - if (strcmp(args->name, name)) - return 0; - - if (args->objname && strcmp(args->objname, mod->name)) - return 0; - args->addr = addr; args->count++; @@ -153,6 +143,23 @@ static int klp_find_callback(void *data, const char *name, return 0; } +static int klp_find_callback(void *data, const char *name, + struct module *mod, unsigned long addr) +{ + struct klp_find_arg *args = data; + + if ((mod && !args->objname) || (!mod && args->objname)) + return 0; + + if (strcmp(args->name, name)) + return 0; + + if (args->objname && strcmp(args->objname, mod->name)) + return 0; + + return klp_match_callback(data, addr); +} + static int klp_find_object_symbol(const char *objname, const char *name, unsigned long sympos, unsigned long *addr) { @@ -167,7 +174,7 @@ static int klp_find_object_symbol(const char *objname, const char *name, if (objname) module_kallsyms_on_each_symbol(klp_find_callback, &args); else - kallsyms_on_each_symbol(klp_find_callback, &args); + kallsyms_on_each_match_symbol(klp_match_callback, name, &args); /* * Ensure an address was found. If sympos is 0, ensure symbol is unique; diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 4c4f5a776d80..4152c71507e2 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func); + ftrace_regs_set_instruction_pointer(fregs, (unsigned long)func->new_func); unlock: ftrace_test_recursion_unlock(bit); diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 30187b1d8275..f1b25ec581e0 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -196,36 +196,36 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries, struct klp_ops *ops; int i; - for (i = 0; i < nr_entries; i++) { - address = entries[i]; + if (klp_target_state == KLP_UNPATCHED) { + /* + * Check for the to-be-unpatched function + * (the func itself). + */ + func_addr = (unsigned long)func->new_func; + func_size = func->new_size; + } else { + /* + * Check for the to-be-patched function + * (the previous func). + */ + ops = klp_find_ops(func->old_func); - if (klp_target_state == KLP_UNPATCHED) { - /* - * Check for the to-be-unpatched function - * (the func itself). - */ - func_addr = (unsigned long)func->new_func; - func_size = func->new_size; + if (list_is_singular(&ops->func_stack)) { + /* original function */ + func_addr = (unsigned long)func->old_func; + func_size = func->old_size; } else { - /* - * Check for the to-be-patched function - * (the previous func). - */ - ops = klp_find_ops(func->old_func); - - if (list_is_singular(&ops->func_stack)) { - /* original function */ - func_addr = (unsigned long)func->old_func; - func_size = func->old_size; - } else { - /* previously patched function */ - struct klp_func *prev; - - prev = list_next_entry(func, stack_node); - func_addr = (unsigned long)prev->new_func; - func_size = prev->new_size; - } + /* previously patched function */ + struct klp_func *prev; + + prev = list_next_entry(func, stack_node); + func_addr = (unsigned long)prev->new_func; + func_size = prev->new_size; } + } + + for (i = 0; i < nr_entries; i++) { + address = entries[i]; if (address >= func_addr && address < func_addr + func_size) return -EAGAIN; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index ea925731fa40..0db4093d17b8 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -7,7 +7,6 @@ obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o # Avoid recursion lockdep -> sanitizer -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n -KMSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 43efb2a04160..29dc253d03af 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -399,7 +399,7 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = prandom_u32_max(n + 1); + r = get_random_u32_below(n + 1); if (r != n) { tmp = order[n]; order[n] = order[r]; @@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work) { struct stress *stress = container_of(work, typeof(*stress), work); const int nlocks = stress->nlocks; - struct ww_mutex *lock = stress->locks + prandom_u32_max(nlocks); + struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks); int err; do { diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 26ea5d04f56c..424b3bc58f3f 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -221,9 +221,10 @@ endchoice config MODULE_DECOMPRESS bool "Support in-kernel module decompression" - depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ + depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD select ZLIB_INFLATE if MODULE_COMPRESS_GZIP select XZ_DEC if MODULE_COMPRESS_XZ + select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD help Support for decompressing kernel modules by the kernel itself diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index c033572d83f0..bb79ac1a6d8f 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -50,7 +50,7 @@ static struct page *module_get_next_page(struct load_info *info) return page; } -#ifdef CONFIG_MODULE_COMPRESS_GZIP +#if defined(CONFIG_MODULE_COMPRESS_GZIP) #include <linux/zlib.h> #define MODULE_COMPRESSION gzip #define MODULE_DECOMPRESS_FN module_gzip_decompress @@ -114,8 +114,8 @@ static ssize_t module_gzip_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); - if (!page) { - retval = -ENOMEM; + if (IS_ERR(page)) { + retval = PTR_ERR(page); goto out_inflate_end; } @@ -141,7 +141,7 @@ out: kfree(s.workspace); return retval; } -#elif CONFIG_MODULE_COMPRESS_XZ +#elif defined(CONFIG_MODULE_COMPRESS_XZ) #include <linux/xz.h> #define MODULE_COMPRESSION xz #define MODULE_DECOMPRESS_FN module_xz_decompress @@ -173,8 +173,8 @@ static ssize_t module_xz_decompress(struct load_info *info, do { struct page *page = module_get_next_page(info); - if (!page) { - retval = -ENOMEM; + if (IS_ERR(page)) { + retval = PTR_ERR(page); goto out; } @@ -199,6 +199,94 @@ static ssize_t module_xz_decompress(struct load_info *info, xz_dec_end(xz_dec); return retval; } +#elif defined(CONFIG_MODULE_COMPRESS_ZSTD) +#include <linux/zstd.h> +#define MODULE_COMPRESSION zstd +#define MODULE_DECOMPRESS_FN module_zstd_decompress + +static ssize_t module_zstd_decompress(struct load_info *info, + const void *buf, size_t size) +{ + static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd }; + ZSTD_outBuffer zstd_dec; + ZSTD_inBuffer zstd_buf; + zstd_frame_header header; + size_t wksp_size; + void *wksp = NULL; + ZSTD_DStream *dstream; + size_t ret; + size_t new_size = 0; + int retval; + + if (size < sizeof(signature) || + memcmp(buf, signature, sizeof(signature))) { + pr_err("not a zstd compressed module\n"); + return -EINVAL; + } + + zstd_buf.src = buf; + zstd_buf.pos = 0; + zstd_buf.size = size; + + ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size); + if (ret != 0) { + pr_err("ZSTD-compressed data has an incomplete frame header\n"); + retval = -EINVAL; + goto out; + } + if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) { + pr_err("ZSTD-compressed data has too large a window size\n"); + retval = -EINVAL; + goto out; + } + + wksp_size = zstd_dstream_workspace_bound(header.windowSize); + wksp = kmalloc(wksp_size, GFP_KERNEL); + if (!wksp) { + retval = -ENOMEM; + goto out; + } + + dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size); + if (!dstream) { + pr_err("Can't initialize ZSTD stream\n"); + retval = -ENOMEM; + goto out; + } + + do { + struct page *page = module_get_next_page(info); + + if (!IS_ERR(page)) { + retval = PTR_ERR(page); + goto out; + } + + zstd_dec.dst = kmap_local_page(page); + zstd_dec.pos = 0; + zstd_dec.size = PAGE_SIZE; + + ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf); + kunmap(page); + retval = zstd_get_error_code(ret); + if (retval) + break; + + new_size += zstd_dec.pos; + } while (zstd_dec.pos == PAGE_SIZE && ret != 0); + + if (retval) { + pr_err("ZSTD-decompression failed with status %d\n", retval); + retval = -EINVAL; + goto out; + } + + retval = new_size; + + out: + kfree(wksp); + return retval; +} #else #error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS" #endif diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c index f5c5c9175333..4523f99b0358 100644 --- a/kernel/module/kallsyms.c +++ b/kernel/module/kallsyms.c @@ -494,7 +494,6 @@ unsigned long module_kallsyms_lookup_name(const char *name) return ret; } -#ifdef CONFIG_LIVEPATCH int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data) @@ -531,4 +530,3 @@ out: mutex_unlock(&module_mutex); return ret; } -#endif /* CONFIG_LIVEPATCH */ diff --git a/kernel/module/main.c b/kernel/module/main.c index d02d39c7174e..48568a0f5651 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -85,9 +85,6 @@ struct mod_tree_root mod_data_tree __cacheline_aligned = { }; #endif -#define module_addr_min mod_tree.addr_min -#define module_addr_max mod_tree.addr_max - struct symsearch { const struct kernel_symbol *start, *stop; const s32 *crcs; @@ -1674,6 +1671,11 @@ static int elf_validity_check(struct load_info *info) info->hdr->e_machine); goto no_exec; } + if (!module_elf_check_arch(info->hdr)) { + pr_err("Invalid module architecture in ELF header: %u\n", + info->hdr->e_machine); + goto no_exec; + } if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) { pr_err("Invalid ELF section header size\n"); goto no_exec; @@ -2247,6 +2249,11 @@ static void flush_module_icache(const struct module *mod) (unsigned long)mod->core_layout.base + mod->core_layout.size); } +bool __weak module_elf_check_arch(Elf_Ehdr *hdr) +{ + return true; +} + int __weak module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, char *secstrings, diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c index ce68f821dcd1..c921bf044050 100644 --- a/kernel/module/sysfs.c +++ b/kernel/module/sysfs.c @@ -340,7 +340,7 @@ static int mod_sysfs_init(struct module *mod) int err; struct kobject *kobj; - if (!module_sysfs_initialized) { + if (!module_kset) { pr_err("%s: module sysfs not initialized\n", mod->name); err = -EINVAL; goto out; diff --git a/kernel/notifier.c b/kernel/notifier.c index 0d5bd62c480e..ab75637fd904 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -62,7 +62,7 @@ static int notifier_chain_unregister(struct notifier_block **nl, * value of this parameter is -1. * @nr_calls: Records the number of notifications sent. Don't care * value of this field is NULL. - * @returns: notifier_call_chain returns the value returned by the + * Return: notifier_call_chain returns the value returned by the * last notifier function called. */ static int notifier_call_chain(struct notifier_block **nl, @@ -105,13 +105,13 @@ NOKPROBE_SYMBOL(notifier_call_chain); * @val_up: Value passed unmodified to the notifier function * @val_down: Value passed unmodified to the notifier function when recovering * from an error on @val_up - * @v Pointer passed unmodified to the notifier function + * @v: Pointer passed unmodified to the notifier function * * NOTE: It is important the @nl chain doesn't change between the two * invocations of notifier_call_chain() such that we visit the * exact same notifier callbacks; this rules out any RCU usage. * - * Returns: the return value of the @val_up call. + * Return: the return value of the @val_up call. */ static int notifier_call_chain_robust(struct notifier_block **nl, unsigned long val_up, unsigned long val_down, diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index eec72ca962e2..a487ff24129b 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -157,7 +157,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWCGROUP | CLONE_NEWTIME)))) { - if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) { + if ((flags & CLONE_VM) || + likely(old_ns->time_ns_for_children == old_ns->time_ns)) { get_nsproxy(old_ns); return 0; } @@ -179,7 +180,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); - timens_on_fork(new_ns, tsk); + if ((flags & CLONE_VM) == 0) + timens_on_fork(new_ns, tsk); tsk->nsproxy = new_ns; return 0; @@ -254,6 +256,23 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +int exec_task_namespaces(void) +{ + struct task_struct *tsk = current; + struct nsproxy *new; + + if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns) + return 0; + + new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); + if (IS_ERR(new)) + return PTR_ERR(new); + + timens_on_fork(new, tsk); + switch_task_namespaces(tsk, new); + return 0; +} + static int check_setns_flags(unsigned long flags) { if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | diff --git a/kernel/padata.c b/kernel/padata.c index e5819bb8bd1d..e007b8a4b738 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -83,8 +83,16 @@ static struct padata_work *padata_work_alloc(void) return pw; } -static void padata_work_init(struct padata_work *pw, work_func_t work_fn, - void *data, int flags) +/* + * This function is marked __ref because this function may be optimized in such + * a way that it directly refers to work_fn's address, which causes modpost to + * complain when work_fn is marked __init. This scenario was observed with clang + * LTO, where padata_work_init() was optimized to refer directly to + * padata_mt_helper() because the calls to padata_work_init() with other work_fn + * values were eliminated or inlined. + */ +static void __ref padata_work_init(struct padata_work *pw, work_func_t work_fn, + void *data, int flags) { if (flags & PADATA_WORK_ONSTACK) INIT_WORK_ONSTACK(&pw->pw_work, work_fn); @@ -207,14 +215,16 @@ int padata_do_parallel(struct padata_shell *ps, pw = padata_work_alloc(); spin_unlock(&padata_works_lock); + if (!pw) { + /* Maximum works limit exceeded, run in the current task. */ + padata->parallel(padata); + } + rcu_read_unlock_bh(); if (pw) { padata_work_init(pw, padata_parallel_worker, padata, 0); queue_work(pinst->parallel_wq, &pw->pw_work); - } else { - /* Maximum works limit exceeded, run in the current task. */ - padata->parallel(padata); } return 0; @@ -388,13 +398,16 @@ void padata_do_serial(struct padata_priv *padata) int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr); struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu); struct padata_priv *cur; + struct list_head *pos; spin_lock(&reorder->lock); /* Sort in ascending order of sequence number. */ - list_for_each_entry_reverse(cur, &reorder->list, list) + list_for_each_prev(pos, &reorder->list) { + cur = list_entry(pos, struct padata_priv, list); if (cur->seq_nr < padata->seq_nr) break; - list_add(&padata->list, &cur->list); + } + list_add(&padata->list, pos); spin_unlock(&reorder->lock); /* diff --git a/kernel/panic.c b/kernel/panic.c index da323209f583..463c9295bc28 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -25,6 +25,7 @@ #include <linux/kexec.h> #include <linux/panic_notifier.h> #include <linux/sched.h> +#include <linux/string_helpers.h> #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> @@ -32,6 +33,7 @@ #include <linux/bug.h> #include <linux/ratelimit.h> #include <linux/debugfs.h> +#include <linux/sysfs.h> #include <trace/events/error_report.h> #include <asm/sections.h> @@ -58,6 +60,7 @@ bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; unsigned long panic_on_taint; bool panic_on_taint_nousertaint = false; +static unsigned int warn_limit __read_mostly; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -75,8 +78,9 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL) +#ifdef CONFIG_SYSCTL static struct ctl_table kern_panic_table[] = { +#ifdef CONFIG_SMP { .procname = "oops_all_cpu_backtrace", .data = &sysctl_oops_all_cpu_backtrace, @@ -86,6 +90,14 @@ static struct ctl_table kern_panic_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, +#endif + { + .procname = "warn_limit", + .data = &warn_limit, + .maxlen = sizeof(warn_limit), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { } }; @@ -97,6 +109,25 @@ static __init int kernel_panic_sysctls_init(void) late_initcall(kernel_panic_sysctls_init); #endif +static atomic_t warn_count = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%d\n", atomic_read(&warn_count)); +} + +static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count); + +static __init int kernel_panic_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL); + return 0; +} +late_initcall(kernel_panic_sysfs_init); +#endif + static long no_blink(int state) { return 0; @@ -199,6 +230,19 @@ static void panic_print_sys_info(bool console_flush) ftrace_dump(DUMP_ALL); } +void check_panic_on_warn(const char *origin) +{ + unsigned int limit; + + if (panic_on_warn) + panic("%s: panic_on_warn set ...\n", origin); + + limit = READ_ONCE(warn_limit); + if (atomic_inc_return(&warn_count) >= limit && limit) + panic("%s: system warned too often (kernel.warn_limit is %d)", + origin, limit); +} + /** * panic - halt the system * @fmt: The text string to print @@ -617,8 +661,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, if (regs) show_regs(regs); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("kernel"); if (!regs) dump_stack(); @@ -744,8 +787,8 @@ static int __init panic_on_taint_setup(char *s) if (s && !strcmp(s, "nousertaint")) panic_on_taint_nousertaint = true; - pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", - panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n", + panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint)); return 0; } diff --git a/kernel/params.c b/kernel/params.c index 5b92310425c5..14d66070757b 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -926,7 +926,7 @@ static const struct sysfs_ops module_sysfs_ops = { .store = module_attr_store, }; -static int uevent_filter(struct kobject *kobj) +static int uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); @@ -940,7 +940,6 @@ static const struct kset_uevent_ops module_uevent_ops = { }; struct kset *module_kset; -int module_sysfs_initialized; static void module_kobj_release(struct kobject *kobj) { @@ -954,7 +953,11 @@ struct kobj_type module_ktype = { }; /* - * param_sysfs_init - wrapper for built-in params support + * param_sysfs_init - create "module" kset + * + * This must be done before the initramfs is unpacked and + * request_module() thus becomes possible, because otherwise the + * module load would fail in mod_sysfs_init. */ static int __init param_sysfs_init(void) { @@ -964,13 +967,25 @@ static int __init param_sysfs_init(void) __FILE__, __LINE__); return -ENOMEM; } - module_sysfs_initialized = 1; + + return 0; +} +subsys_initcall(param_sysfs_init); + +/* + * param_sysfs_builtin_init - add sysfs version and parameter + * attributes for built-in modules + */ +static int __init param_sysfs_builtin_init(void) +{ + if (!module_kset) + return -ENOMEM; version_sysfs_builtin(); param_sysfs_builtin(); return 0; } -subsys_initcall(param_sysfs_init); +late_initcall(param_sysfs_builtin_init); #endif /* CONFIG_SYSFS */ diff --git a/kernel/power/process.c b/kernel/power/process.c index ddd9988327fe..6c1c7e566d35 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -27,6 +27,8 @@ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; static int try_to_freeze_tasks(bool user_only) { + const char *what = user_only ? "user space processes" : + "remaining freezable tasks"; struct task_struct *g, *p; unsigned long end_time; unsigned int todo; @@ -36,6 +38,8 @@ static int try_to_freeze_tasks(bool user_only) bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; + pr_info("Freezing %s\n", what); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -82,9 +86,8 @@ static int try_to_freeze_tasks(bool user_only) elapsed_msecs = ktime_to_ms(elapsed); if (todo) { - pr_cont("\n"); - pr_err("Freezing of tasks %s after %d.%03d seconds " - "(%d tasks refusing to freeze, wq_busy=%d):\n", + pr_err("Freezing %s %s after %d.%03d seconds " + "(%d tasks refusing to freeze, wq_busy=%d):\n", what, wakeup ? "aborted" : "failed", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); @@ -101,8 +104,8 @@ static int try_to_freeze_tasks(bool user_only) read_unlock(&tasklist_lock); } } else { - pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, - elapsed_msecs % 1000); + pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n", + what, elapsed_msecs / 1000, elapsed_msecs % 1000); } return todo ? -EBUSY : 0; @@ -130,14 +133,11 @@ int freeze_processes(void) static_branch_inc(&freezer_active); pm_wakeup_clear(0); - pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); - if (!error) { + if (!error) __usermodehelper_set_disable_depth(UMH_DISABLED); - pr_cont("done."); - } - pr_cont("\n"); + BUG_ON(in_atomic()); /* @@ -166,14 +166,9 @@ int freeze_kernel_threads(void) { int error; - pr_info("Freezing remaining freezable tasks ... "); - pm_nosig_freezing = true; error = try_to_freeze_tasks(false); - if (!error) - pr_cont("done."); - pr_cont("\n"); BUG_ON(in_atomic()); if (error) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 2a406753af90..cd8b7b35f1e8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1723,8 +1723,8 @@ static unsigned long minimum_image_size(unsigned long saveable) * /sys/power/reserved_size, respectively). To make this happen, we compute the * total number of available page frames and allocate at least * - * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 - * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) + * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2 + * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) * * of them, which corresponds to the maximum size of a hibernation image. * @@ -2259,10 +2259,14 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) + if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) { memory_bm_set_bit(bm, buf[j]); - else + } else { + if (!pfn_valid(buf[j])) + pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n", + (unsigned long long)PFN_PHYS(buf[j])); return -EFAULT; + } } return 0; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e4f1e7478b52..7decf1e9c486 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -79,13 +79,20 @@ int oops_in_progress; EXPORT_SYMBOL(oops_in_progress); /* - * console_sem protects the console_drivers list, and also - * provides serialisation for access to the entire console - * driver system. + * console_mutex protects console_list updates and console->flags updates. + * The flags are synchronized only for consoles that are registered, i.e. + * accessible via the console list. + */ +static DEFINE_MUTEX(console_mutex); + +/* + * console_sem protects updates to console->seq and console_suspended, + * and also provides serialization for console printing. */ static DEFINE_SEMAPHORE(console_sem); -struct console *console_drivers; -EXPORT_SYMBOL_GPL(console_drivers); +HLIST_HEAD(console_list); +EXPORT_SYMBOL_GPL(console_list); +DEFINE_STATIC_SRCU(console_srcu); /* * System may need to suppress printk message under certain @@ -103,6 +110,19 @@ static int __read_mostly suppress_panic_printk; static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; + +void lockdep_assert_console_list_lock_held(void) +{ + lockdep_assert_held(&console_mutex); +} +EXPORT_SYMBOL(lockdep_assert_console_list_lock_held); +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +bool console_srcu_read_lock_is_held(void) +{ + return srcu_read_lock_held(&console_srcu); +} #endif enum devkmsg_log_bits { @@ -220,6 +240,69 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ +/** + * console_list_lock - Lock the console list + * + * For console list or console->flags updates + */ +void console_list_lock(void) +{ + /* + * In unregister_console() and console_force_preferred_locked(), + * synchronize_srcu() is called with the console_list_lock held. + * Therefore it is not allowed that the console_list_lock is taken + * with the srcu_lock held. + * + * Detecting if this context is really in the read-side critical + * section is only possible if the appropriate debug options are + * enabled. + */ + WARN_ON_ONCE(debug_lockdep_rcu_enabled() && + srcu_read_lock_held(&console_srcu)); + + mutex_lock(&console_mutex); +} +EXPORT_SYMBOL(console_list_lock); + +/** + * console_list_unlock - Unlock the console list + * + * Counterpart to console_list_lock() + */ +void console_list_unlock(void) +{ + mutex_unlock(&console_mutex); +} +EXPORT_SYMBOL(console_list_unlock); + +/** + * console_srcu_read_lock - Register a new reader for the + * SRCU-protected console list + * + * Use for_each_console_srcu() to iterate the console list + * + * Context: Any context. + * Return: A cookie to pass to console_srcu_read_unlock(). + */ +int console_srcu_read_lock(void) +{ + return srcu_read_lock_nmisafe(&console_srcu); +} +EXPORT_SYMBOL(console_srcu_read_lock); + +/** + * console_srcu_read_unlock - Unregister an old reader from + * the SRCU-protected console list + * @cookie: cookie returned from console_srcu_read_lock() + * + * Counterpart to console_srcu_read_lock() + */ +void console_srcu_read_unlock(int cookie) +{ + srcu_read_unlock_nmisafe(&console_srcu, cookie); +} +EXPORT_SYMBOL(console_srcu_read_unlock); + /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -1814,13 +1897,13 @@ static void console_lock_spinning_enable(void) * safe to start busy waiting for the lock. Second, it checks if * there is a busy waiter and passes the lock rights to her. * - * Important: Callers lose the lock if there was a busy waiter. - * They must not touch items synchronized by console_lock - * in this case. + * Important: Callers lose both the console_lock and the SRCU read lock if + * there was a busy waiter. They must not touch items synchronized by + * console_lock or SRCU read lock in this case. * * Return: 1 if the lock rights were passed, 0 otherwise. */ -static int console_lock_spinning_disable_and_check(void) +static int console_lock_spinning_disable_and_check(int cookie) { int waiter; @@ -1840,6 +1923,12 @@ static int console_lock_spinning_disable_and_check(void) spin_release(&console_owner_dep_map, _THIS_IP_); /* + * Preserve lockdep lock ordering. Release the SRCU read lock before + * releasing the console_lock. + */ + console_srcu_read_unlock(cookie); + + /* * Hand off console_lock to waiter. The waiter will perform * the up(). After this, the waiter is the console_lock owner. */ @@ -2322,7 +2411,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, char *text, size_t text_len, struct dev_printk_info *dev_info) { return 0; } static void console_lock_spinning_enable(void) { } -static int console_lock_spinning_disable_and_check(void) { return 0; } +static int console_lock_spinning_disable_and_check(int cookie) { return 0; } static void call_console_driver(struct console *con, const char *text, size_t len, char *dropped_text) { @@ -2391,7 +2480,7 @@ static int __add_preferred_console(char *name, int idx, char *options, return -E2BIG; if (!brl_options) preferred_console = i; - strlcpy(c->name, name, sizeof(c->name)); + strscpy(c->name, name, sizeof(c->name)); c->options = options; set_user_specified(c, user_specified); braille_set_options(c, brl_options); @@ -2553,10 +2642,10 @@ static int console_cpu_notify(unsigned int cpu) } /** - * console_lock - lock the console system for exclusive use. + * console_lock - block the console subsystem from printing * - * Acquires a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Acquires a lock which guarantees that no consoles will + * be in or enter their write() callback. * * Can sleep, returns nothing. */ @@ -2573,10 +2662,10 @@ void console_lock(void) EXPORT_SYMBOL(console_lock); /** - * console_trylock - try to lock the console system for exclusive use. + * console_trylock - try to block the console subsystem from printing * - * Try to acquire a lock which guarantees that the caller has exclusive - * access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that no consoles will + * be in or enter their write() callback. * * returns 1 on success, and 0 on failure to acquire the lock. */ @@ -2623,11 +2712,13 @@ static bool abandon_console_lock_in_panic(void) * Check if the given console is currently capable and allowed to print * records. * - * Requires the console_lock. + * Requires the console_srcu_read_lock. */ static inline bool console_is_usable(struct console *con) { - if (!(con->flags & CON_ENABLED)) + short flags = console_srcu_read_flags(con); + + if (!(flags & CON_ENABLED)) return false; if (!con->write) @@ -2638,8 +2729,7 @@ static inline bool console_is_usable(struct console *con) * allocated. So unless they're explicitly marked as being able to * cope (CON_ANYTIME) don't call them until this CPU is officially up. */ - if (!cpu_online(raw_smp_processor_id()) && - !(con->flags & CON_ANYTIME)) + if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME)) return false; return true; @@ -2664,16 +2754,18 @@ static void __console_unlock(void) * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL. * * @handover will be set to true if a printk waiter has taken over the - * console_lock, in which case the caller is no longer holding the - * console_lock. Otherwise it is set to false. + * console_lock, in which case the caller is no longer holding both the + * console_lock and the SRCU read lock. Otherwise it is set to false. + * + * @cookie is the cookie from the SRCU read lock. * * Returns false if the given console has no next record to print, otherwise * true. * - * Requires the console_lock. + * Requires the console_lock and the SRCU read lock. */ static bool console_emit_next_record(struct console *con, char *text, char *ext_text, - char *dropped_text, bool *handover) + char *dropped_text, bool *handover, int cookie) { static int panic_console_dropped; struct printk_info info; @@ -2733,7 +2825,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_ con->seq++; - *handover = console_lock_spinning_disable_and_check(); + *handover = console_lock_spinning_disable_and_check(cookie); printk_safe_exit_irqrestore(flags); skip: return true; @@ -2770,6 +2862,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove bool any_usable = false; struct console *con; bool any_progress; + int cookie; *next_seq = 0; *handover = false; @@ -2777,23 +2870,29 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove do { any_progress = false; - for_each_console(con) { + cookie = console_srcu_read_lock(); + for_each_console_srcu(con) { bool progress; if (!console_is_usable(con)) continue; any_usable = true; - if (con->flags & CON_EXTENDED) { + if (console_srcu_read_flags(con) & CON_EXTENDED) { /* Extended consoles do not print "dropped messages". */ progress = console_emit_next_record(con, &text[0], &ext_text[0], NULL, - handover); + handover, cookie); } else { progress = console_emit_next_record(con, &text[0], NULL, &dropped_text[0], - handover); + handover, cookie); } + + /* + * If a handover has occurred, the SRCU read lock + * is already released. + */ if (*handover) return false; @@ -2807,21 +2906,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove /* Allow panic_cpu to take over the consoles safely. */ if (abandon_console_lock_in_panic()) - return false; + goto abandon; if (do_cond_resched) cond_resched(); } + console_srcu_read_unlock(cookie); } while (any_progress); return any_usable; + +abandon: + console_srcu_read_unlock(cookie); + return false; } /** - * console_unlock - unlock the console system + * console_unlock - unblock the console subsystem from printing * - * Releases the console_lock which the caller holds on the console system - * and the console driver list. + * Releases the console_lock which the caller holds to block printing of + * the console subsystem. * * While the console_lock was held, console output may have been buffered * by printk(). If this is the case, console_unlock(); emits @@ -2899,10 +3003,14 @@ EXPORT_SYMBOL(console_conditional_schedule); void console_unblank(void) { struct console *c; + int cookie; /* - * console_unblank can no longer be called in interrupt context unless - * oops_in_progress is set to 1.. + * Stop console printing because the unblank() callback may + * assume the console is not within its write() callback. + * + * If @oops_in_progress is set, this may be an atomic context. + * In that case, attempt a trylock as best-effort. */ if (oops_in_progress) { if (down_trylock_console_sem() != 0) @@ -2912,9 +3020,14 @@ void console_unblank(void) console_locked = 1; console_may_schedule = 0; - for_each_console(c) - if ((c->flags & CON_ENABLED) && c->unblank) + + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank) c->unblank(); + } + console_srcu_read_unlock(cookie); + console_unlock(); if (!oops_in_progress) @@ -2941,11 +3054,21 @@ void console_flush_on_panic(enum con_flush_mode mode) if (mode == CONSOLE_REPLAY_ALL) { struct console *c; + int cookie; u64 seq; seq = prb_first_valid_seq(prb); - for_each_console(c) + + cookie = console_srcu_read_lock(); + for_each_console_srcu(c) { + /* + * If the above console_trylock() failed, this is an + * unsynchronized assignment. But in that case, the + * kernel is in "hope and pray" mode anyway. + */ c->seq = seq; + } + console_srcu_read_unlock(cookie); } console_unlock(); } @@ -2957,15 +3080,25 @@ struct tty_driver *console_device(int *index) { struct console *c; struct tty_driver *driver = NULL; + int cookie; + /* + * Take console_lock to serialize device() callback with + * other console operations. For example, fg_console is + * modified under console_lock when switching vt. + */ console_lock(); - for_each_console(c) { + + cookie = console_srcu_read_lo |