aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile4
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/auditsc.c75
-rw-r--r--kernel/bpf/Makefile2
-rw-r--r--kernel/bpf/arraymap.c29
-rw-r--r--kernel/bpf/bpf_cgrp_storage.c246
-rw-r--r--kernel/bpf/bpf_inode_storage.c42
-rw-r--r--kernel/bpf/bpf_local_storage.c208
-rw-r--r--kernel/bpf/bpf_lsm.c22
-rw-r--r--kernel/bpf/bpf_struct_ops.c3
-rw-r--r--kernel/bpf/bpf_task_storage.c161
-rw-r--r--kernel/bpf/btf.c1308
-rw-r--r--kernel/bpf/cgroup_iter.c16
-rw-r--r--kernel/bpf/core.c39
-rw-r--r--kernel/bpf/cpumap.c33
-rw-r--r--kernel/bpf/devmap.c4
-rw-r--r--kernel/bpf/dispatcher.c33
-rw-r--r--kernel/bpf/hashtab.c37
-rw-r--r--kernel/bpf/helpers.c439
-rw-r--r--kernel/bpf/local_storage.c2
-rw-r--r--kernel/bpf/map_in_map.c61
-rw-r--r--kernel/bpf/memalloc.c46
-rw-r--r--kernel/bpf/percpu_freelist.c23
-rw-r--r--kernel/bpf/ringbuf.c6
-rw-r--r--kernel/bpf/syscall.c475
-rw-r--r--kernel/bpf/trampoline.c83
-rw-r--r--kernel/bpf/verifier.c2688
-rw-r--r--kernel/capability.c4
-rw-r--r--kernel/cgroup/cgroup-internal.h1
-rw-r--r--kernel/cgroup/cgroup.c9
-rw-r--r--kernel/cgroup/cpuset.c55
-rw-r--r--kernel/configs/tiny.config5
-rw-r--r--kernel/cpu.c61
-rw-r--r--kernel/crash_core.c3
-rw-r--r--kernel/cred.c15
-rw-r--r--kernel/debug/kdb/kdb_io.c18
-rw-r--r--kernel/dma/mapping.c12
-rw-r--r--kernel/dma/swiotlb.c63
-rw-r--r--kernel/events/core.c2167
-rw-r--r--kernel/exit.c62
-rw-r--r--kernel/fork.c86
-rw-r--r--kernel/futex/core.c26
-rw-r--r--kernel/gcov/clang.c2
-rw-r--r--kernel/gcov/gcc_4_7.c5
-rw-r--r--kernel/irq/Kconfig7
-rw-r--r--kernel/irq/chip.c8
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/irqdesc.c15
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/msi.c915
-rw-r--r--kernel/jump_label.c58
-rw-r--r--kernel/kallsyms.c116
-rw-r--r--kernel/kallsyms_internal.h1
-rw-r--r--kernel/kallsyms_selftest.c485
-rw-r--r--kernel/kallsyms_selftest.h13
-rw-r--r--kernel/kcsan/Makefile1
-rw-r--r--kernel/kcsan/core.c50
-rw-r--r--kernel/kcsan/report.c3
-rw-r--r--kernel/kcsan/selftest.c12
-rw-r--r--kernel/kexec_core.c10
-rw-r--r--kernel/kexec_file.c2
-rw-r--r--kernel/kprobes.c32
-rw-r--r--kernel/ksysfs.c18
-rw-r--r--kernel/livepatch/core.c31
-rw-r--r--kernel/livepatch/patch.c2
-rw-r--r--kernel/livepatch/transition.c54
-rw-r--r--kernel/locking/Makefile1
-rw-r--r--kernel/locking/test-ww_mutex.c4
-rw-r--r--kernel/module/Kconfig3
-rw-r--r--kernel/module/decompress.c100
-rw-r--r--kernel/module/kallsyms.c2
-rw-r--r--kernel/module/main.c13
-rw-r--r--kernel/module/sysfs.c2
-rw-r--r--kernel/notifier.c6
-rw-r--r--kernel/nsproxy.c23
-rw-r--r--kernel/padata.c27
-rw-r--r--kernel/panic.c53
-rw-r--r--kernel/params.c25
-rw-r--r--kernel/power/process.c25
-rw-r--r--kernel/power/snapshot.c12
-rw-r--r--kernel/printk/printk.c498
-rw-r--r--kernel/printk/printk_ringbuffer.c2
-rw-r--r--kernel/rcu/Kconfig22
-rw-r--r--kernel/rcu/Kconfig.debug3
-rw-r--r--kernel/rcu/rcu.h16
-rw-r--r--kernel/rcu/rcuscale.c69
-rw-r--r--kernel/rcu/rcutorture.c72
-rw-r--r--kernel/rcu/srcutiny.c10
-rw-r--r--kernel/rcu/srcutree.c100
-rw-r--r--kernel/rcu/sync.c2
-rw-r--r--kernel/rcu/tasks.h4
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c162
-rw-r--r--kernel/rcu/tree.h12
-rw-r--r--kernel/rcu/tree_exp.h2
-rw-r--r--kernel/rcu/tree_nocb.h259
-rw-r--r--kernel/rcu/tree_plugin.h5
-rw-r--r--kernel/rcu/update.c18
-rw-r--r--kernel/relay.c8
-rw-r--r--kernel/resource.c30
-rw-r--r--kernel/rseq.c19
-rw-r--r--kernel/sched/core.c327
-rw-r--r--kernel/sched/cpufreq_schedutil.c30
-rw-r--r--kernel/sched/deadline.c7
-rw-r--r--kernel/sched/fair.c323
-rw-r--r--kernel/sched/psi.c100
-rw-r--r--kernel/sched/sched.h92
-rw-r--r--kernel/sched/stats.h22
-rw-r--r--kernel/sched/wait.c18
-rw-r--r--kernel/scs.c14
-rw-r--r--kernel/signal.c3
-rw-r--r--kernel/static_call_inline.c23
-rw-r--r--kernel/sysctl.c50
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/namespace.c18
-rw-r--r--kernel/time/timer.c427
-rw-r--r--kernel/trace/Kconfig15
-rw-r--r--kernel/trace/blktrace.c7
-rw-r--r--kernel/trace/bpf_trace.c113
-rw-r--r--kernel/trace/ftrace.c52
-rw-r--r--kernel/trace/kprobe_event_gen_test.c48
-rw-r--r--kernel/trace/rethook.c4
-rw-r--r--kernel/trace/ring_buffer.c115
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/rv/monitors/wip/wip.h2
-rw-r--r--kernel/trace/rv/monitors/wwnr/wwnr.h2
-rw-r--r--kernel/trace/synth_event_gen_test.c16
-rw-r--r--kernel/trace/trace.c129
-rw-r--r--kernel/trace/trace.h32
-rw-r--r--kernel/trace/trace_dynevent.c2
-rw-r--r--kernel/trace/trace_eprobe.c8
-rw-r--r--kernel/trace/trace_event_perf.c16
-rw-r--r--kernel/trace/trace_events.c79
-rw-r--r--kernel/trace/trace_events_hist.c197
-rw-r--r--kernel/trace/trace_events_synth.c9
-rw-r--r--kernel/trace/trace_events_trigger.c19
-rw-r--r--kernel/trace/trace_events_user.c7
-rw-r--r--kernel/trace/trace_kprobe.c2
-rw-r--r--kernel/trace/trace_osnoise.c250
-rw-r--r--kernel/trace/trace_output.c71
-rw-r--r--kernel/trace/trace_probe.c67
-rw-r--r--kernel/trace/trace_probe.h19
-rw-r--r--kernel/trace/trace_probe_tmpl.h47
-rw-r--r--kernel/trace/trace_selftest.c9
-rw-r--r--kernel/trace/trace_syscalls.c2
-rw-r--r--kernel/trace/trace_uprobe.c3
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/workqueue.c2
149 files changed, 10758 insertions, 4182 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d754e0be1176..10ef068f598d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -41,9 +41,6 @@ UBSAN_SANITIZE_kcov.o := n
KMSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
-# Don't instrument error handlers
-CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
-
obj-y += sched/
obj-y += locking/
obj-y += power/
@@ -69,6 +66,7 @@ endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_KALLSYMS_SELFTEST) += kallsyms_selftest.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_CRASH_CORE) += crash_core.o
obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 62200d799b9b..010667ce6080 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -320,7 +320,7 @@ void acct_exit_ns(struct pid_namespace *ns)
}
/*
- * encode an unsigned long into a comp_t
+ * encode an u64 into a comp_t
*
* This routine has been adopted from the encode_comp_t() function in
* the kern_acct.c file of the FreeBSD operating system. The encoding
@@ -331,7 +331,7 @@ void acct_exit_ns(struct pid_namespace *ns)
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
-static comp_t encode_comp_t(unsigned long value)
+static comp_t encode_comp_t(u64 value)
{
int exp, rnd;
@@ -350,6 +350,8 @@ static comp_t encode_comp_t(unsigned long value)
exp++;
}
+ if (exp > (((comp_t) ~0U) >> MANTSIZE))
+ return (comp_t) ~0U;
/*
* Clean it up and polish it off.
*/
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9f8c05228d6d..547c88be8a28 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -806,30 +806,53 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
}
/**
- * audit_filter_uring - apply filters to an io_uring operation
+ * __audit_filter_op - common filter helper for operations (syscall/uring/etc)
* @tsk: associated task
* @ctx: audit context
+ * @list: audit filter list
+ * @name: audit_name (can be NULL)
+ * @op: current syscall/uring_op
+ *
+ * Run the udit filters specified in @list against @tsk using @ctx,
+ * @name, and @op, as necessary; the caller is responsible for ensuring
+ * that the call is made while the RCU read lock is held. The @name
+ * parameter can be NULL, but all others must be specified.
+ * Returns 1/true if the filter finds a match, 0/false if none are found.
*/
-static void audit_filter_uring(struct task_struct *tsk,
- struct audit_context *ctx)
+static int __audit_filter_op(struct task_struct *tsk,
+ struct audit_context *ctx,
+ struct list_head *list,
+ struct audit_names *name,
+ unsigned long op)
{
struct audit_entry *e;
enum audit_state state;
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, op) &&
+ audit_filter_rules(tsk, &e->rule, ctx, name,
+ &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/**
+ * audit_filter_uring - apply filters to an io_uring operation
+ * @tsk: associated task
+ * @ctx: audit context
+ */
+static void audit_filter_uring(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
if (auditd_test_task(tsk))
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
- list) {
- if (audit_in_mask(&e->rule, ctx->uring_op) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL, &state,
- false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
+ NULL, ctx->uring_op);
rcu_read_unlock();
}
@@ -841,24 +864,13 @@ static void audit_filter_uring(struct task_struct *tsk,
static void audit_filter_syscall(struct task_struct *tsk,
struct audit_context *ctx)
{
- struct audit_entry *e;
- enum audit_state state;
-
if (auditd_test_task(tsk))
return;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ __audit_filter_op(tsk, ctx, &audit_filter_list[AUDIT_FILTER_EXIT],
+ NULL, ctx->major);
rcu_read_unlock();
- return;
}
/*
@@ -870,17 +882,8 @@ static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_context *ctx) {
int h = audit_hash_ino((u32)n->ino);
struct list_head *list = &audit_inode_hash[h];
- struct audit_entry *e;
- enum audit_state state;
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
- ctx->current_state = state;
- return 1;
- }
- }
- return 0;
+ return __audit_filter_op(tsk, ctx, list, n, ctx->major);
}
/* At syscall exit time, this filter is called if any audit_names have been
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 341c94f208f4..3a12e6b400a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -25,7 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
ifeq ($(CONFIG_CGROUPS),y)
-obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += cgroup_iter.o bpf_cgrp_storage.o
endif
obj-$(CONFIG_CGROUP_BPF) += cgroup.o
ifeq ($(CONFIG_INET),y)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 832b2659e96e..484706959556 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -306,14 +306,6 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return 0;
}
-static void check_and_free_fields(struct bpf_array *arr, void *val)
-{
- if (map_value_has_timer(&arr->map))
- bpf_timer_cancel_and_free(val + arr->map.timer_off);
- if (map_value_has_kptrs(&arr->map))
- bpf_map_free_kptrs(&arr->map, val);
-}
-
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
@@ -335,13 +327,13 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
return -EEXIST;
if (unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
val = this_cpu_ptr(array->pptrs[index & array->index_mask]);
copy_map_value(map, val, value);
- check_and_free_fields(array, val);
+ bpf_obj_free_fields(array->map.record, val);
} else {
val = array->value +
(u64)array->elem_size * (index & array->index_mask);
@@ -349,7 +341,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
- check_and_free_fields(array, val);
+ bpf_obj_free_fields(array->map.record, val);
}
return 0;
}
@@ -386,7 +378,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off);
- check_and_free_fields(array, per_cpu_ptr(pptr, cpu));
+ bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu));
off += size;
}
rcu_read_unlock();
@@ -409,12 +401,12 @@ static void array_map_free_timers(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- /* We don't reset or free kptr on uref dropping to zero. */
- if (!map_value_has_timer(map))
+ /* We don't reset or free fields other than timer on uref dropping to zero. */
+ if (!btf_record_has_field(map->record, BPF_TIMER))
return;
for (i = 0; i < array->map.max_entries; i++)
- bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off);
+ bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i));
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -423,22 +415,21 @@ static void array_map_free(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- if (map_value_has_kptrs(map)) {
+ if (!IS_ERR_OR_NULL(map->record)) {
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
for (i = 0; i < array->map.max_entries; i++) {
void __percpu *pptr = array->pptrs[i & array->index_mask];
int cpu;
for_each_possible_cpu(cpu) {
- bpf_map_free_kptrs(map, per_cpu_ptr(pptr, cpu));
+ bpf_obj_free_fields(map->record, per_cpu_ptr(pptr, cpu));
cond_resched();
}
}
} else {
for (i = 0; i < array->map.max_entries; i++)
- bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
+ bpf_obj_free_fields(map->record, array_map_elem_ptr(array, i));
}
- bpf_map_free_kptr_off_tab(map);
}
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
new file mode 100644
index 000000000000..6cdf6d9ed91d
--- /dev/null
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+
+DEFINE_BPF_STORAGE_CACHE(cgroup_cache);
+
+static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
+
+static void bpf_cgrp_storage_lock(void)
+{
+ migrate_disable();
+ this_cpu_inc(bpf_cgrp_storage_busy);
+}
+
+static void bpf_cgrp_storage_unlock(void)
+{
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+}
+
+static bool bpf_cgrp_storage_trylock(void)
+{
+ migrate_disable();
+ if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
+ this_cpu_dec(bpf_cgrp_storage_busy);
+ migrate_enable();
+ return false;
+ }
+ return true;
+}
+
+static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner)
+{
+ struct cgroup *cg = owner;
+
+ return &cg->bpf_cgrp_storage;
+}
+
+void bpf_cgrp_storage_free(struct cgroup *cgroup)
+{
+ struct bpf_local_storage *local_storage;
+ bool free_cgroup_storage = false;
+ unsigned long flags;
+
+ rcu_read_lock();
+ local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ bpf_cgrp_storage_lock();
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ free_cgroup_storage = bpf_local_storage_unlink_nolock(local_storage);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_cgrp_storage_unlock();
+ rcu_read_unlock();
+
+ if (free_cgroup_storage)
+ kfree_rcu(local_storage, rcu);
+}
+
+static struct bpf_local_storage_data *
+cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit)
+{
+ struct bpf_local_storage *cgroup_storage;
+ struct bpf_local_storage_map *smap;
+
+ cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage,
+ bpf_rcu_lock_held());
+ if (!cgroup_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit);
+}
+
+static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return ERR_CAST(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return sdata ? sdata->data : NULL;
+}
+
+static int bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct cgroup *cgroup;
+ int fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, map_flags, GFP_ATOMIC);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = cgroup_storage_lookup(cgroup, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), true);
+ return 0;
+}
+
+static int bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct cgroup *cgroup;
+ int err, fd;
+
+ fd = *(int *)key;
+ cgroup = cgroup_get_from_fd(fd);
+ if (IS_ERR(cgroup))
+ return PTR_ERR(cgroup);
+
+ bpf_cgrp_storage_lock();
+ err = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ cgroup_put(cgroup);
+ return err;
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache);
+}
+
+static void cgroup_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &cgroup_cache, NULL);
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ if (!cgroup)
+ return (unsigned long)NULL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return (unsigned long)NULL;
+
+ sdata = cgroup_storage_lookup(cgroup, map, true);
+ if (sdata)
+ goto unlock;
+
+ /* only allocate new storage, when the cgroup is refcounted */
+ if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
+ value, BPF_NOEXIST, gfp_flags);
+
+unlock:
+ bpf_cgrp_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
+}
+
+BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!cgroup)
+ return -EINVAL;
+
+ if (!bpf_cgrp_storage_trylock())
+ return -EBUSY;
+
+ ret = cgroup_storage_delete(cgroup, map);
+ bpf_cgrp_storage_unlock();
+ return ret;
+}
+
+const struct bpf_map_ops cgrp_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = cgroup_storage_map_alloc,
+ .map_free = cgroup_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_cgrp_storage_lookup_elem,
+ .map_update_elem = bpf_cgrp_storage_update_elem,
+ .map_delete_elem = bpf_cgrp_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_owner_storage_ptr = cgroup_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_get_proto = {
+ .func = bpf_cgrp_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_cgrp_storage_delete_proto = {
+ .func = bpf_cgrp_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_cgroup_btf_id[0],
+};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 5f7683b19199..05f4c66c9089 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -56,11 +56,9 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
void bpf_inode_storage_free(struct inode *inode)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_inode_storage = false;
struct bpf_storage_blob *bsb;
- struct hlist_node *n;
bsb = bpf_inode(inode);
if (!bsb)
@@ -74,30 +72,11 @@ void bpf_inode_storage_free(struct inode *inode)
return;
}
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
raw_spin_lock_bh(&local_storage->lock);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_inode_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, false);
- }
+ free_inode_storage = bpf_local_storage_unlink_nolock(local_storage);
raw_spin_unlock_bh(&local_storage->lock);
rcu_read_unlock();
- /* free_inoode_storage should always be true as long as
- * local_storage->list was non-empty.
- */
if (free_inode_storage)
kfree_rcu(local_storage, rcu);
}
@@ -226,27 +205,14 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &inode_cache);
}
static void inode_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, NULL);
+ bpf_local_storage_map_free(map, &inode_cache, NULL);
}
-BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct,
- bpf_local_storage_map)
const struct bpf_map_ops inode_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -257,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = {
.map_update_elem = bpf_fd_inode_storage_update_elem,
.map_delete_elem = bpf_fd_inode_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_id = &inode_storage_map_btf_ids[0],
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = inode_storage_ptr,
};
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 802fc15b0d73..b39a46e8fb08 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -74,7 +74,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
gfp_flags | __GFP_NOWARN);
if (selem) {
if (value)
- memcpy(SDATA(selem)->data, value, smap->map.value_size);
+ copy_map_value(&smap->map, SDATA(selem)->data, value);
return selem;
}
@@ -88,8 +88,14 @@ void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
+ /* If RCU Tasks Trace grace period implies RCU grace period, do
+ * kfree(), else do kfree_rcu().
+ */
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- kfree_rcu(local_storage, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(local_storage);
+ else
+ kfree_rcu(local_storage, rcu);
}
static void bpf_selem_free_rcu(struct rcu_head *rcu)
@@ -97,16 +103,19 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
struct bpf_local_storage_elem *selem;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- kfree_rcu(selem, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(selem);
+ else
+ kfree_rcu(selem, rcu);
}
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
*/
-bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
- struct bpf_local_storage_elem *selem,
- bool uncharge_mem, bool use_trace_rcu)
+static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ bool uncharge_mem, bool use_trace_rcu)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -233,6 +242,7 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu)
__bpf_selem_unlink_storage(selem, use_trace_rcu);
}
+/* If cacheit_lockit is false, this lookup function is lockless */
struct bpf_local_storage_data *
bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
@@ -372,7 +382,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
/* BPF_F_LOCK can only be used in a value with spin_lock */
unlikely((map_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(&smap->map)))
+ !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
return ERR_PTR(-EINVAL);
if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
@@ -491,7 +501,7 @@ unlock_err:
return ERR_PTR(err);
}
-u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
{
u64 min_usage = U64_MAX;
u16 i, res = 0;
@@ -515,76 +525,14 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
return res;
}
-void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
- u16 idx)
+static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx)
{
spin_lock(&cache->idx_lock);
cache->idx_usage_counts[idx]--;
spin_unlock(&cache->idx_lock);
}
-void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
- int __percpu *busy_counter)
-{
- struct bpf_local_storage_elem *selem;
- struct bpf_local_storage_map_bucket *b;
- unsigned int i;
-
- /* Note that this map might be concurrently cloned from
- * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
- * RCU read section to finish before proceeding. New RCU
- * read sections should be prevented via bpf_map_inc_not_zero.
- */
- synchronize_rcu();
-
- /* bpf prog and the userspace can no longer access this map
- * now. No new selem (of this map) can be added
- * to the owner->storage or to the map bucket's list.
- *
- * The elem of this map can be cleaned up here
- * or when the storage is freed e.g.
- * by bpf_sk_storage_free() during __sk_destruct().
- */
- for (i = 0; i < (1U << smap->bucket_log); i++) {
- b = &smap->buckets[i];
-
- rcu_read_lock();
- /* No one is adding to b->list now */
- while ((selem = hlist_entry_safe(
- rcu_dereference_raw(hlist_first_rcu(&b->list)),
- struct bpf_local_storage_elem, map_node))) {
- if (busy_counter) {
- migrate_disable();
- this_cpu_inc(*busy_counter);
- }
- bpf_selem_unlink(selem, false);
- if (busy_counter) {
- this_cpu_dec(*busy_counter);
- migrate_enable();
- }
- cond_resched_rcu();
- }
- rcu_read_unlock();
- }
-
- /* While freeing the storage we may still need to access the map.
- *
- * e.g. when bpf_sk_storage_free() has unlinked selem from the map
- * which then made the above while((selem = ...)) loop
- * exit immediately.
- *
- * However, while freeing the storage one still needs to access the
- * smap->elem_size to do the uncharging in
- * bpf_selem_unlink_storage_nolock().
- *
- * Hence, wait another rcu grace period for the storage to be freed.
- */
- synchronize_rcu();
-
- kvfree(smap->buckets);
- bpf_map_area_free(smap);
-}
-
int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
{
if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
@@ -604,7 +552,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
return 0;
}
-struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
+static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr)
{
struct bpf_local_storage_map *smap;
unsigned int i;
@@ -654,3 +602,117 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
return 0;
}
+
+bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage)
+{
+ struct bpf_local_storage_elem *selem;
+ bool free_storage = false;
+ struct hlist_node *n;
+
+ /* Neither the bpf_prog nor the bpf_map's syscall
+ * could be modifying the local_storage->list now.
+ * Thus, no elem can be added to or deleted from the
+ * local_storage->list by the bpf_prog or by the bpf_map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ /* If local_storage list has only one element, the
+ * bpf_selem_unlink_storage_nolock() will return true.
+ * Otherwise, it will return false. The current loop iteration
+ * intends to remove all local storage. So the last iteration
+ * of the loop will set the free_cgroup_storage to true.
+ */
+ free_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, false, false);
+ }
+
+ return free_storage;
+}
+
+struct bpf_map *
+bpf_local_storage_map_alloc(union bpf_attr *attr,
+ struct bpf_local_storage_cache *cache)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = __bpf_local_storage_map_alloc(attr);
+ if (IS_ERR(smap))
+ return ERR_CAST(smap);
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
+ return &smap->map;
+}
+
+void bpf_local_storage_map_free(struct bpf_map *map,
+ struct bpf_local_storage_cache *cache,
+ int __percpu *busy_counter)
+{
+ struct bpf_local_storage_map_bucket *b;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(cache, smap->cache_idx);
+
+ /* Note that this map might be concurrently cloned from
+ * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
+ synchronize_rcu();
+
+ /* bpf prog and the userspace can no longer access this map
+ * now. No new selem (of this map) can be added
+ * to the owner->storage or to the map bucket's list.
+ *
+ * The elem of this map can be cleaned up here
+ * or when the storage is freed e.g.
+ * by bpf_sk_storage_free() during __sk_destruct().
+ */
+ for (i = 0; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+
+ rcu_read_lock();
+ /* No one is adding to b->list now */
+ while ((selem = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(&b->list)),
+ struct bpf_local_storage_elem, map_node))) {
+ if (busy_counter) {
+ migrate_disable();
+ this_cpu_inc(*busy_counter);
+ }
+ bpf_selem_unlink(selem, false);
+ if (busy_counter) {
+ this_cpu_dec(*busy_counter);
+ migrate_enable();
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+ }
+
+ /* While freeing the storage we may still need to access the map.
+ *
+ * e.g. when bpf_sk_storage_free() has unlinked selem from the map
+ * which then made the above while((selem = ...)) loop
+ * exit immediately.
+ *
+ * However, while freeing the storage one still needs to access the
+ * smap->elem_size to do the uncharging in
+ * bpf_selem_unlink_storage_nolock().
+ *
+ * Hence, wait another rcu grace period for the storage to be freed.
+ */
+ synchronize_rcu();
+
+ kvfree(smap->buckets);
+ bpf_map_area_free(smap);
+}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index d6c9b3705f24..9ea42a45da47 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -151,6 +151,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.func = bpf_ima_inode_hash,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
@@ -169,6 +170,7 @@ BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
static const struct bpf_func_proto bpf_ima_file_hash_proto = {
.func = bpf_ima_file_hash,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
@@ -221,9 +223,9 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_bprm_opts_set:
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
- return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+ return &bpf_ima_inode_hash_proto;
case BPF_FUNC_ima_file_hash:
- return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
+ return &bpf_ima_file_hash_proto;
case BPF_FUNC_get_attach_cookie:
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
#ifdef CONFIG_NET
@@ -343,11 +345,27 @@ BTF_ID(func, bpf_lsm_task_to_inode)
BTF_ID(func, bpf_lsm_userns_create)
BTF_SET_END(sleepable_lsm_hooks)
+BTF_SET_START(untrusted_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_prog_free_security)
+BTF_ID(func, bpf_lsm_file_alloc_security)
+BTF_ID(func, bpf_lsm_file_free_security)
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+BTF_ID(func, bpf_lsm_task_free)
+BTF_SET_END(untrusted_lsm_hooks)
+
bool bpf_lsm_is_sleepable_hook(u32 btf_id)
{
return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
}
+bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
+{
+ return !btf_id_set_contains(&untrusted_lsm_hooks, prog->aux->attach_btf_id);
+}
+
const struct bpf_prog_ops lsm_prog_ops = {
};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 84b2d9dba79a..ece9870cab68 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -494,8 +494,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
refcount_set(&kvalue->refcnt, 1);
bpf_map_inc(map);
- set_memory_ro((long)st_map->image, 1);
- set_memory_x((long)st_map->image, 1);
+ set_memory_rox((long)st_map->image, 1);
err = st_ops->reg(kdata);
if (likely(!err)) {
/* Pair with smp_load_acquire() during lookup_elem().
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 6f290623347e..1e486055a523 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -71,10 +71,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
void bpf_task_storage_free(struct task_struct *task)
{
- struct bpf_local_storage_elem *selem;
struct bpf_local_storage *local_storage;
bool free_task_storage = false;
- struct hlist_node *n;
unsigned long flags;
rcu_read_lock();
@@ -85,32 +83,13 @@ void bpf_task_storage_free(struct task_struct *task)
return;
}
- /* Neither the bpf_prog nor the bpf-map's syscall
- * could be modifying the local_storage->list now.
- * Thus, no elem can be added-to or deleted-from the
- * local_storage->list by the bpf_prog or by the bpf-map's syscall.
- *
- * It is racing with bpf_local_storage_map_free() alone
- * when unlinking elem from the local_storage->list and
- * the map's bucket->list.
- */
bpf_task_storage_lock();
raw_spin_lock_irqsave(&local_storage->lock, flags);
- hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
- /* Always unlink from map before unlinking from
- * local_storage.
- */
- bpf_selem_unlink_map(selem);
- free_task_storage = bpf_selem_unlink_storage_nolock(
- local_storage, selem, false, false);
- }
+ free_task_storage = bpf_local_storage_unlink_nolock(local_storage);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_task_storage_unlock();
rcu_read_unlock();
- /* free_task_storage should always be true as long as
- * local_storage->list was non-empty.
- */
if (free_task_storage)
kfree_rcu(local_storage, rcu);
}
@@ -184,7 +163,8 @@ out:
return err;
}
-static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map,
+ bool nobusy)
{
struct bpf_local_storage_data *sdata;
@@ -192,6 +172,9 @@ static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
if (!sdata)
return -ENOENT;
+ if (!nobusy)
+ return -EBUSY;
+
bpf_selem_unlink(SELEM(sdata), true);
return 0;
@@ -220,63 +203,108 @@ static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
}
bpf_task_storage_lock();
- err = task_storage_delete(task, map);
+ err = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
out:
put_pid(pid);
return err;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
+/* Called by bpf_task_storage_get*() helpers */
+static void *__bpf_task_storage_get(struct bpf_map *map,
+ struct task_struct *task, void *value,
+ u64 flags, gfp_t gfp_flags, bool nobusy)
{
struct bpf_local_storage_data *sdata;
- WARN_ON_ONCE(!bpf_rcu_lock_held());
- if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
- return (unsigned long)NULL;
-
- if (!task)
- return (unsigned long)NULL;
-
- if (!bpf_task_storage_trylock())
- return (unsigned long)NULL;
-
- sdata = task_storage_lookup(task, map, true);
+ sdata = task_storage_lookup(task, map, nobusy);
if (sdata)
- goto unlock;
+ return sdata->data;
/* only allocate new storage, when the task is refcounted */
if (refcount_read(&task->usage) &&
- (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST, gfp_flags);
+ return IS_ERR(sdata) ? NULL : sdata->data;
+ }
+
+ return NULL;
+}
-unlock:
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get_recur, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ bool nobusy;
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ nobusy = bpf_task_storage_trylock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return (unsigned long)data;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ void *data;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~BPF_LOCAL_STORAGE_GET_F_CREATE || !task)
+ return (unsigned long)NULL;
+
+ bpf_task_storage_lock();
+ data = __bpf_task_storage_get(map, task, value, flags,
+ gfp_flags, true);
bpf_task_storage_unlock();
- return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
- (unsigned long)sdata->data;
+ return (unsigned long)data;
}
-BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+BPF_CALL_2(bpf_task_storage_delete_recur, struct bpf_map *, map, struct task_struct *,
task)
{
+ bool nobusy;
int ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
- if (!bpf_task_storage_trylock())
- return -EBUSY;
+ nobusy = bpf_task_storage_trylock();
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed. Either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map, nobusy);
+ if (nobusy)
+ bpf_task_storage_unlock();
+ return ret;
+}
+
+BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+ bpf_task_storage_lock();
/* This helper must only be called from places where the lifetime of the task
* is guaranteed. Either by being refcounted or by being protected
* by an RCU read-side critical section.
*/
- ret = task_storage_delete(task, map);
+ ret = task_storage_delete(task, map, true);
bpf_task_storage_unlock();
return ret;
}
@@ -288,26 +316,15 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- struct bpf_local_storage_map *smap;
-
- smap = bpf_local_storage_map_alloc(attr);
- if (IS_ERR(smap))
- return ERR_CAST(smap);
-
- smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
- return &smap->map;
+ return bpf_local_storage_map_alloc(attr, &task_cache);
}
static void task_storage_map_free(struct bpf_map *map)
{
- struct bpf_local_storage_map *smap;
-
- smap = (struct bpf_local_storage_map *)map;
- bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
- bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
+ bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy);
}
-BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map)
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map)
const struct bpf_map_ops task_storage_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = bpf_local_storage_map_alloc_check,
@@ -318,10 +335,21 @@ const struct bpf_map_ops task_storage_map_ops = {
.map_update_elem = bpf_pid_task_storage_update_elem,
.map_delete_elem = bpf_pid_task_storage_delete_elem,
.map_check_btf = bpf_local_storage_map_check_btf,
- .map_btf_id = &task_storage_map_btf_ids[0],
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
.map_owner_storage_ptr = task_storage_ptr,
};
+const struct bpf_func_proto bpf_task_storage_get_recur_proto = {
+ .func = bpf_task_storage_get_recur,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
const struct bpf_func_proto bpf_task_storage_get_proto = {
.func = bpf_task_storage_get,
.gpl_only = false,
@@ -333,6 +361,15 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.arg4_type = ARG_ANYTHING,
};
+const struct bpf_func_proto bpf_task_storage_delete_recur_proto = {
+ .func = bpf_task_storage_delete_recur,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
+
const struct bpf_func_proto bpf_task_storage_delete_proto = {
.func = bpf_task_storage_delete,
.gpl_only = false,
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 35c07afac924..f7dd8af06413 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -19,6 +19,7 @@
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_lsm.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
@@ -199,11 +200,13 @@ DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_COMMON,
BTF_KFUNC_HOOK_XDP,
BTF_KFUNC_HOOK_TC,
BTF_KFUNC_HOOK_STRUCT_OPS,
BTF_KFUNC_HOOK_TRACING,
BTF_KFUNC_HOOK_SYSCALL,
+ BTF_KFUNC_HOOK_FMODRET,
BTF_KFUNC_HOOK_MAX,
};
@@ -237,6 +240,7 @@ struct btf {
struct rcu_head rcu;
struct btf_kfunc_set_tab *kfunc_set_tab;
struct btf_id_dtor_kfunc_tab *dtor_kfunc_tab;
+ struct btf_struct_metas *struct_meta_tab;
/* split BTF support */
struct btf *base_btf;
@@ -477,16 +481,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t)
return !t || btf_type_nosize(t);
}
-static bool __btf_type_is_struct(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
-}
-
-static bool btf_type_is_array(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
-}
-
static bool btf_type_is_datasec(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
@@ -1642,8 +1636,30 @@ static void btf_free_dtor_kfunc_tab(struct btf *btf)
btf->dtor_kfunc_tab = NULL;
}
+static void btf_struct_metas_free(struct btf_struct_metas *tab)
+{
+ int i;
+
+ if (!tab)
+ return;
+ for (i = 0; i < tab->cnt; i++) {
+ btf_record_free(tab->types[i].record);
+ kfree(tab->types[i].field_offs);
+ }
+ kfree(tab);
+}
+
+static void btf_free_struct_meta_tab(struct btf *btf)
+{
+ struct btf_struct_metas *tab = btf->struct_meta_tab;
+
+ btf_struct_metas_free(tab);
+ btf->struct_meta_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_struct_meta_tab(btf);
btf_free_dtor_kfunc_tab(btf);
btf_free_kfunc_set_tab(btf);
kvfree(btf->types);
@@ -3191,7 +3207,7 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-enum btf_field_type {
+enum btf_field_info_type {
BTF_FIELD_SPIN_LOCK,
BTF_FIELD_TIMER,
BTF_FIELD_KPTR,
@@ -3203,18 +3219,28 @@ enum {
};
struct btf_field_info {
- u32 type_id;
+ enum btf_field_type type;
u32 off;
- enum bpf_kptr_type type;
+ union {
+ struct {
+ u32 type_id;
+ } kptr;
+ struct {
+ const char *node_name;
+ u32 value_btf_id;
+ } list_head;
+ };
};
static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
- u32 off, int sz, struct btf_field_info *info)
+ u32 off, int sz, enum btf_field_type field_type,
+ struct btf_field_info *info)
{
if (!__btf_type_is_struct(t))
return BTF_FIELD_IGNORE;
if (t->size != sz)
return BTF_FIELD_IGNORE;
+ info->type = field_type;
info->off = off;
return BTF_FIELD_FOUND;
}
@@ -3222,9 +3248,12 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
u32 off, int sz, struct btf_field_info *info)
{
- enum bpf_kptr_type type;
+ enum btf_field_type type;
u32 res_id;
+ /* Permit modifiers on the pointer itself */
+ if (btf_type_is_volatile(t))
+ t = btf_type_by_id(btf, t->type);
/* For PTR, sz is always == 8 */
if (!btf_type_is_ptr(t))
return BTF_FIELD_IGNORE;
@@ -3248,28 +3277,135 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
if (!__btf_type_is_struct(t))
return -EINVAL;
- info->type_id = res_id;
- info->off = off;
info->type = type;
+ info->off = off;
+ info->kptr.type_id = res_id;
return BTF_FIELD_FOUND;
}
-static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align,
- enum btf_field_type field_type,
+static const char *btf_find_decl_tag_value(const struct btf *btf,
+ const struct btf_type *pt,
+ int comp_idx, const char *tag_key)
+{
+ int i;
+
+ for (i = 1; i < btf_nr_types(btf); i++) {
+ const struct btf_type *t = btf_type_by_id(btf, i);
+ int len = strlen(tag_key);
+
+ if (!btf_type_is_decl_tag(t))
+ continue;
+ if (pt != btf_type_by_id(btf, t->type) ||
+ btf_type_decl_tag(t)->component_idx != comp_idx)
+ continue;
+ if (strncmp(__btf_name_by_offset(btf, t->name_off), tag_key, len))
+ continue;
+ return __btf_name_by_offset(btf, t->name_off) + len;
+ }
+ return NULL;
+}
+
+static int btf_find_list_head(const struct btf *btf, const struct btf_type *pt,
+ const struct btf_type *t, int comp_idx,
+ u32 off, int sz, struct btf_field_info *info)
+{
+ const char *value_type;
+ const char *list_node;
+ s32 id;
+
+ if (!__btf_type_is_struct(t))
+ return BTF_FIELD_IGNORE;
+ if (t->size != sz)
+ return BTF_FIELD_IGNORE;
+ value_type = btf_find_decl_tag_value(btf, pt, comp_idx, "contains:");
+ if (!value_type)
+ return -EINVAL;
+ list_node = strstr(value_type, ":");
+ if (!list_node)
+ return -EINVAL;
+ value_type = kstrndup(value_type, list_node - value_type, GFP_KERNEL | __GFP_NOWARN);
+ if (!value_type)
+ return -ENOMEM;
+ id = btf_find_by_name_kind(btf, value_type, BTF_KIND_STRUCT);
+ kfree(value_type);
+ if (id < 0)
+ return id;
+ list_node++;
+ if (str_is_empty(list_node))
+ return -EINVAL;
+ info->type = BPF_LIST_HEAD;
+ info->off = off;
+ info->list_head.value_btf_id = id;
+ info->list_head.node_name = list_node;
+ return BTF_FIELD_FOUND;
+}
+
+static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask,
+ int *align, int *sz)
+{
+ int type = 0;
+
+ if (field_mask & BPF_SPIN_LOCK) {
+ if (!strcmp(name, "bpf_spin_lock")) {
+ if (*seen_mask & BPF_SPIN_LOCK)
+ return -E2BIG;
+ *seen_mask |= BPF_SPIN_LOCK;
+ type = BPF_SPIN_LOCK;
+ goto end;
+ }
+ }
+ if (field_mask & BPF_TIMER) {
+ if (!strcmp(name, "bpf_timer")) {
+ if (*seen_mask & BPF_TIMER)
+ return -E2BIG;
+ *seen_mask |= BPF_TIMER;
+ type = BPF_TIMER;
+ goto end;
+ }
+ }
+ if (field_mask & BPF_LIST_HEAD) {
+ if (!strcmp(name, "bpf_list_head")) {
+ type = BPF_LIST_HEAD;
+ goto end;
+ }
+ }
+ if (field_mask & BPF_LIST_NODE) {
+ if (!strcmp(name, "bpf_list_node")) {
+ type = BPF_LIST_NODE;
+ goto end;
+ }
+ }
+ /* Only return BPF_KPTR when all other types with matchable names fail */
+ if (field_mask & BPF_KPTR) {
+ type = BPF_KPTR_REF;
+ goto end;
+ }
+ return 0;
+end:
+ *sz = btf_field_type_size(type);
+ *align = btf_field_type_align(type);
+ return type;
+}
+
+static int btf_find_struct_field(const struct btf *btf,
+ const struct btf_type *t, u32 field_mask,
struct btf_field_info *info, int info_cnt)
{
+ int ret, idx = 0, align, sz, field_type;
const struct btf_member *member;
struct btf_field_info tmp;
- int ret, idx = 0;
- u32 i, off;
+ u32 i, off, seen_mask = 0;
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
- if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
continue;
+ if (field_type < 0)
+ return field_type;
off = __btf_member_bit_offset(t, member);
if (off % 8)
@@ -3277,22 +3413,30 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
return -EINVAL;
off /= 8;
if (off % align)
- return -EINVAL;
+ continue;
switch (field_type) {
- case BTF_FIELD_SPIN_LOCK:
- case BTF_FIELD_TIMER:
- ret = btf_find_struct(btf, member_type, off, sz,
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ ret = btf_find_struct(btf, member_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
- case BTF_FIELD_KPTR:
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
ret = btf_find_kptr(btf, member_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
+ case BPF_LIST_HEAD:
+ ret = btf_find_list_head(btf, t, member_type, i, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
default:
return -EFAULT;
}
@@ -3307,42 +3451,53 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
}
static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
- const char *name, int sz, int align,
- enum btf_field_type field_type,
- struct btf_field_info *info, int info_cnt)
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
{
+ int ret, idx = 0, align, sz, field_type;
const struct btf_var_secinfo *vsi;
struct btf_field_info tmp;
- int ret, idx = 0;
- u32 i, off;
+ u32 i, off, seen_mask = 0;
for_each_vsi(i, t, vsi) {
const struct btf_type *var = btf_type_by_id(btf, vsi->type);
const struct btf_type *var_type = btf_type_by_id(btf, var->type);
- off = vsi->offset;
-
- if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+ field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off),
+ field_mask, &seen_mask, &align, &sz);
+ if (field_type == 0)
continue;
+ if (field_type < 0)
+ return field_type;
+
+ off = vsi->offset;
if (vsi->size != sz)
continue;
if (off % align)
- return -EINVAL;
+ continue;
switch (field_type) {
- case BTF_FIELD_SPIN_LOCK:
- case BTF_FIELD_TIMER:
- ret = btf_find_struct(btf, var_type, off, sz,
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ case BPF_LIST_NODE:
+ ret = btf_find_struct(btf, var_type, off, sz, field_type,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
- case BTF_FIELD_KPTR:
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
ret = btf_find_kptr(btf, var_type, off, sz,
idx < info_cnt ? &info[idx] : &tmp);
if (ret < 0)
return ret;
break;
+ case BPF_LIST_HEAD:
+ ret = btf_find_list_head(btf, var, var_type, -1, off, sz,
+ idx < info_cnt ? &info[idx] : &tmp);
+ if (ret < 0)
+ return ret;
+ break;
default:
return -EFAULT;
}
@@ -3357,169 +3512,327 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
}
static int btf_find_field(const struct btf *btf, const struct btf_type *t,
- enum btf_field_type field_type,
- struct btf_field_info *info, int info_cnt)
+ u32 field_mask, struct btf_field_info *info,
+ int info_cnt)
{
- const char *name;
- int sz, align;
-
- switch (field_type) {
- case BTF_FIELD_SPIN_LOCK:
- name = "bpf_spin_lock";
- sz = sizeof(struct bpf_spin_lock);
- align = __alignof__(struct bpf_spin_lock);
- break;
- case BTF_FIELD_TIMER:
- name = "bpf_timer";
- sz = sizeof(struct bpf_timer);
- align = __alignof__(struct bpf_timer);
- break;
- case BTF_FIELD_KPTR:
- name = NULL;
- sz = sizeof(u64);
- align = 8;
- break;
- default:
- return -EFAULT;
- }
-
if (__btf_type_is_struct(t))
- return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt);
+ return btf_find_struct_field(btf, t, field_mask, info, info_cnt);
else if (btf_type_is_datasec(t))
- return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt);
+ return btf_find_datasec_var(btf, t, field_mask, info, info_cnt);
return -EINVAL;
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_parse_kptr(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
{
- struct btf_field_info info;
+ struct module *mod = NULL;
+ const struct btf_type *t;
+ struct btf *kernel_btf;
int ret;
+ s32 id;
- ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1);
- if (ret < 0)
- return ret;
- if (!ret)
- return -ENOENT;
- return info.off;
+ /* Find type in map BTF, and use it to look up the matching type
+ * in vmlinux or module BTFs, by name and kind.
+ */
+ t = btf_type_by_id(btf, info->kptr.type_id);
+ id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
+ &kernel_btf);
+ if (id < 0)
+ return id;
+
+ /* Find and stash the function pointer for the destruction function that
+ * needs to be eventually invoked from the map free path.
+ */
+ if (info->type == BPF_KPTR_REF) {
+ const struct btf_type *dtor_func;
+ const char *dtor_func_name;
+ unsigned long addr;
+ s32 dtor_btf_id;
+
+ /* This call also serves as a whitelist of allowed objects that
+ * can be used as a referenced pointer and be stored in a map at
+ * the same time.
+ */
+ dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
+ if (dtor_btf_id < 0) {
+ ret = dtor_btf_id;
+ goto end_btf;
+ }
+
+ dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
+ if (!dtor_func) {
+ ret = -ENOENT;
+ goto end_btf;
+ }
+
+ if (btf_is_module(kernel_btf)) {
+ mod = btf_try_get_module(kernel_btf);
+ if (!mod) {
+ ret = -ENXIO;
+ goto end_btf;
+ }
+ }
+
+ /* We already verified dtor_func to be btf_type_is_func
+ * in register_btf_id_dtor_kfuncs.
+ */
+ dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
+ addr = kallsyms_lookup_name(dtor_func_name);
+ if (!addr) {
+ ret = -EINVAL;
+ goto end_mod;
+ }
+ field->kptr.dtor = (void *)addr;
+ }
+
+ field->kptr.btf_id = id;
+ field->kptr.btf = kernel_btf;
+ field->kptr.module = mod;
+ return 0;
+end_mod:
+ module_put(mod);
+end_btf:
+ btf_put(kernel_btf);
+ return ret;
}
-int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+static int btf_parse_list_head(const struct btf *btf, struct btf_field *field,
+ struct btf_field_info *info)
{
- struct btf_field_info info;
- int ret;
+ const struct btf_type *t, *n = NULL;
+ const struct btf_member *member;
+ u32 offset;
+ int i;
- ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1);
- if (ret < 0)
- return ret;
- if (!ret)
+ t = btf_type_by_id(btf, info->list_head.value_btf_id);
+ /* We've already checked that value_btf_id is a struct type. We
+ * just need to figure out the offset of the list_node, and
+ * verify its type.
+ */
+ for_each_member(i, t, member) {
+ if (strcmp(info->list_head.node_name, __btf_name_by_offset(btf, member->name_off)))
+ continue;
+ /* Invalid BTF, two members with same name */
+ if (n)
+ return -EINVAL;
+ n = btf_type_by_id(btf, member->type);
+ if (!__btf_type_is_struct(n))
+ return -EINVAL;
+ if (strcmp("bpf_list_node", __btf_name_by_offset(btf, n->name_off)))
+ return -EINVAL;
+ offset = __btf_member_bit_offset(n, member);
+ if (offset % 8)
+ return -EINVAL;
+ offset /= 8;
+ if (offset % __alignof__(struct bpf_list_node))
+ return -EINVAL;
+
+ field->list_head.btf = (struct btf *)btf;
+ field->list_head.value_btf_id = info->list_head.value_btf_id;
+ field->list_head.node_offset = offset;
+ }
+ if (!n)
return -ENOENT;
- return info.off;
+ return 0;
}
-struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf,
- const struct btf_type *t)
+struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t,
+ u32 field_mask, u32 value_size)
{
- struct btf_field_info info_arr[BPF_MAP_VALUE_OFF_MAX];
- struct bpf_map_value_off *tab;
- struct btf *kernel_btf = NULL;
- struct module *mod = NULL;
- int ret, i, nr_off;
+ struct btf_field_info info_arr[BTF_FIELDS_MAX];
+ struct btf_record *rec;
+ u32 next_off = 0;
+ int ret, i, cnt;
- ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr));
+ ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr));
if (ret < 0)
return ERR_PTR(ret);
if (!ret)
return NULL;
- nr_off = ret;
- tab = kzalloc(offsetof(struct bpf_map_value_off, off[nr_off]), GFP_KERNEL | __GFP_NOWARN);
- if (!tab)
+ cnt = ret;
+ /* This needs to be kzalloc to zero out padding and unused fields, see
+ * comment in btf_record_equal.
+ */
+ rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN);
+ if (!rec)
return ERR_PTR(-ENOMEM);
- for (i = 0; i < nr_off; i++) {
- const struct btf_type *t;
- s32 id;
+ rec->spin_lock_off = -EINVAL;
+ rec->timer_off = -EINVAL;
+ for (i = 0; i < cnt; i++) {
+ if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) {
+ WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size);
+ ret = -EFAULT;
+ goto end;
+ }
+ if (info_arr[i].off < next_off) {
+ ret = -EEXIST;
+ goto end;
+ }
+ next_off = info_arr[i].off + btf_field_type_size(info_arr[i].type);
- /* Find type in map BTF, and use it to look up the matching type
- * in vmlinux or module BTFs, by name and kind.
- */
- t = btf_type_by_id(btf, info_arr[i].type_id);
- id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info),
- &kernel_btf);
- if (id < 0) {
- ret = id;
+ rec->field_mask |= info_arr[i].type;
+ rec->fields[i].offset = info_arr[i].off;
+ rec->fields[i].type = info_arr[i].type;
+
+ switch (info_arr[i].type) {
+ case BPF_SPIN_LOCK:
+ WARN_ON_ONCE(rec->spin_lock_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->spin_lock_off = rec->fields[i].offset;
+ break;
+ case BPF_TIMER:
+ WARN_ON_ONCE(rec->timer_off >= 0);
+ /* Cache offset for faster lookup at runtime */
+ rec->timer_off = rec->fields[i].offset;
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_HEAD:
+ ret = btf_parse_list_head(btf, &rec->fields[i], &info_arr[i]);
+ if (ret < 0)
+ goto end;
+ break;
+ case BPF_LIST_NODE:
+ break;
+ default:
+ ret = -EFAULT;
goto end;
}
+ rec->cnt++;
+ }
- /* Find and stash the function pointer for the destruction function that
- * needs to be eventually invoked from the map free path.
- */
- if (info_arr[i].type == BPF_KPTR_REF) {
- const struct btf_type *dtor_func;
- const char *dtor_func_name;
- unsigned long addr;
- s32 dtor_btf_id;
-
- /* This call also serves as a whitelist of allowed objects that
- * can be used as a referenced pointer and be stored in a map at
- * the same time.
- */
- dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id);
- if (dtor_btf_id < 0) {
- ret = dtor_btf_id;
- goto end_btf;
- }
+ /* bpf_list_head requires bpf_spin_lock */
+ if (btf_record_has_field(rec, BPF_LIST_HEAD) && rec->spin_lock_off < 0) {
+ ret = -EINVAL;
+ goto end;
+ }
- dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id);
- if (!dtor_func) {
- ret = -ENOENT;
- goto end_btf;
- }
+ return rec;
+end:
+ btf_record_free(rec);
+ return ERR_PTR(ret);
+}
- if (btf_is_module(kernel_btf)) {
- mod = btf_try_get_module(kernel_btf);
- if (!mod) {
- ret = -ENXIO;
- goto end_btf;
- }
- }
+int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
+{
+ int i;
- /* We already verified dtor_func to be btf_type_is_func
- * in register_btf_id_dtor_kfuncs.
- */
- dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off);
- addr = kallsyms_lookup_name(dtor_func_name);
- if (!addr) {
- ret = -EINVAL;
- goto end_mod;
- }
- tab->off[i].kptr.dtor = (void *)addr;
- }
+ /* There are two owning types, kptr_ref and bpf_list_head. The former
+ * only supports storing kernel types, which can never store references
+ * to program allocated local types, atleast not yet. Hence we only need
+ * to ensure that bpf_list_head ownership does not form cycles.
+ */
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_LIST_HEAD))
+ return 0;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_struct_meta *meta;
+ u32 btf_id;
+
+ if (!(rec->fields[i].type & BPF_LIST_HEAD))
+ continue;
+ btf_id = rec->fields[i].list_head.value_btf_id;
+ meta = btf_find_struct_meta(btf, btf_id);
+ if (!meta)
+ return -EFAULT;
+ rec->fields[i].list_head.value_rec = meta->record;
+
+ if (!(rec->field_mask & BPF_LIST_NODE))
+ continue;
- tab->off[i].offset = info_arr[i].off;
- tab->off[i].type = info_arr[i].type;
- tab->off[i].kptr.btf_id = id;
- tab->off[i].kptr.btf = kernel_btf;
- tab->off[i].kptr.module = mod;
+ /* We need to ensure ownership acyclicity among all types. The
+ * proper way to do it would be to topologically sort all BTF
+ * IDs based on the ownership edges, since there can be multiple
+ * bpf_list_head in a type. Instead, we use the following
+ * reasoning:
+ *
+ * - A type can only be owned by another type in user BTF if it
+ * has a bpf_list_node.
+ * - A type can only _own_ another type in user BTF if it has a
+ * bpf_list_head.
+ *
+ * We ensure that if a type has both bpf_list_head and
+ * bpf_list_node, its element types cannot be owning types.
+ *
+ * To ensure acyclicity:
+ *
+ * When A only has bpf_list_head, ownership chain can be:
+ * A -> B -> C
+ * Where:
+ * - B has both bpf_list_head and bpf_list_node.
+ * - C only has bpf_list_node.
+ *
+ * When A has both bpf_list_head and bpf_list_node, some other
+ * type already owns it in the BTF domain, hence it can not own
+ * another owning type through any of the bpf_list_head edges.
+ * A -> B
+ * Where:
+ * - B only has bpf_list_node.
+ */
+ if (meta->record->field_mask & BPF_LIST_HEAD)
+ return -ELOOP;
}
- tab->nr_off = nr_off;
- return tab;
-end_mod:
- module_put(mod);
-end_btf:
- btf_put(kernel_btf);
-end:
- while (i--) {
- btf_put(tab->off[i].kptr.btf);
- if (tab->off[i].kptr.module)
- module_put(tab->off[i].kptr.module);
+ return 0;
+}
+
+static int btf_field_offs_cmp(const void *_a, const void *_b, const void *priv)
+{
+ const u32 a = *(const u32 *)_a;
+ const u32 b = *(const u32 *)_b;
+
+ if (a < b)
+ return -1;
+ else if (a > b)
+ return 1;
+ return 0;
+}
+
+static void btf_field_offs_swap(void *_a, void *_b, int size, const void *priv)
+{
+ struct btf_field_offs *foffs = (void *)priv;
+ u32 *off_base = foffs->field_off;
+ u32 *a = _a, *b = _b;
+ u8 *sz_a, *sz_b;
+
+ sz_a = foffs->field_sz + (a - off_base);
+ sz_b = foffs->field_sz + (b - off_base);
+
+ swap(*a, *b);
+ swap(*sz_a, *sz_b);
+}
+
+struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec)
+{
+ struct btf_field_offs *foffs;
+ u32 i, *off;
+ u8 *sz;
+
+ BUILD_BUG_ON(ARRAY_SIZE(foffs->field_off) != ARRAY_SIZE(foffs->field_sz));
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+
+ foffs = kzalloc(sizeof(*foffs), GFP_KERNEL | __GFP_NOWARN);
+ if (!foffs)
+ return ERR_PTR(-ENOMEM);
+
+ off = foffs->field_off;
+ sz = foffs->field_sz;
+ for (i = 0; i < rec->cnt; i++) {
+ off[i] = rec->fields[i].offset;
+ sz[i] = btf_field_type_size(rec->fields[i].type);
}
- kfree(tab);
- return ERR_PTR(ret);
+ foffs->cnt = rec->cnt;
+
+ if (foffs->cnt == 1)
+ return foffs;
+ sort_r(foffs->field_off, foffs->cnt, sizeof(foffs->field_off[0]),
+ btf_field_offs_cmp, btf_field_offs_swap, foffs);
+ return foffs;
}
static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
@@ -4468,7 +4781,6 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
nr_args--;
}
- err = 0;
for (i = 0; i < nr_args; i++) {
const struct btf_type *arg_type;
u32 arg_type_id;
@@ -4477,8 +4789,12 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
arg_type = btf_type_by_id(btf, arg_type_id);
if (!arg_type) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
+ }
+
+ if (btf_type_is_resolve_source_only(arg_type)) {
+ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
+ return -EINVAL;
}
if (args[i].name_off &&
@@ -4486,25 +4802,23 @@ static int btf_func_proto_check(struct btf_verifier_env *env,
!btf_name_valid_identifier(btf, args[i].name_off))) {
btf_verifier_log_type(env, t,
"Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
if (btf_type_needs_resolve(arg_type) &&
!env_type_is_resolved(env, arg_type_id)) {
err = btf_resolve(env, arg_type, arg_type_id);
if (err)
- break;
+ return err;
}
if (!btf_type_id_size(btf, &arg_type_id, NULL)) {
btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1);
- err = -EINVAL;
- break;
+ return -EINVAL;
}
}
- return err;
+ return 0;
}
static int btf_func_check(struct btf_verifier_env *env,
@@ -4918,6 +5232,119 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return btf_check_sec_info(env, btf_data_size);
}
+static const char *alloc_obj_fields[] = {
+ "bpf_spin_lock",
+ "bpf_list_head",
+ "bpf_list_node",
+};
+
+static struct btf_struct_metas *
+btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
+{
+ union {
+ struct btf_id_set set;
+ struct {
+ u32 _cnt;
+ u32 _ids[ARRAY_SIZE(alloc_obj_fields)];
+ } _arr;
+ } aof;
+ struct btf_struct_metas *tab = NULL;
+ int i, n, id, ret;
+
+ BUILD_BUG_ON(offsetof(struct btf_id_set, cnt) != 0);
+ BUILD_BUG_ON(sizeof(struct btf_id_set) != sizeof(u32));
+
+ memset(&aof, 0, sizeof(aof));
+ for (i = 0; i < ARRAY_SIZE(alloc_obj_fields); i++) {
+ /* Try to find whether this special type exists in user BTF, and
+ * if so remember its ID so we can easily find it among members
+ * of structs that we iterate in the next loop.
+ */
+ id = btf_find_by_name_kind(btf, alloc_obj_fields[i], BTF_KIND_STRUCT);
+ if (id < 0)
+ continue;
+ aof.set.ids[aof.set.cnt++] = id;
+ }
+
+ if (!aof.set.cnt)
+ return NULL;
+ sort(&aof.set.ids, aof.set.cnt, sizeof(aof.set.ids[0]), btf_id_cmp_func, NULL);
+
+ n = btf_nr_types(btf);
+ for (i = 1; i < n; i++) {
+ struct btf_struct_metas *new_tab;
+ const struct btf_member *member;
+ struct btf_field_offs *foffs;
+ struct btf_struct_meta *type;
+ struct btf_record *record;
+ const struct btf_type *t;
+ int j, tab_cnt;
+
+ t = btf_type_by_id(btf, i);
+ if (!t) {
+ ret = -EINVAL;
+ goto free;
+ }
+ if (!__btf_type_is_struct(t))
+ continue;
+
+ cond_resched();
+
+ for_each_member(j, t, member) {
+ if (btf_id_set_contains(&aof.set, member->type))
+ goto parse;
+ }
+ continue;
+ parse:
+ tab_cnt = tab ? tab->cnt : 0;
+ new_tab = krealloc(tab, offsetof(struct btf_struct_metas, types[tab_cnt + 1]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_tab) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ if (!tab)
+ new_tab->cnt = 0;
+ tab = new_tab;
+
+ type = &tab->types[tab->cnt];
+ type->btf_id = i;
+ record = btf_parse_fields(btf, t, BPF_SPIN_LOCK | BPF_LIST_HEAD | BPF_LIST_NODE, t->size);
+ /* The record cannot be unset, treat it as an error if so */
+ if (IS_ERR_OR_NULL(record)) {
+ ret = PTR_ERR_OR_ZERO(record) ?: -EFAULT;
+ goto free;
+ }
+ foffs = btf_parse_field_offs(record);
+ /* We need the field_offs to be valid for a valid record,
+ * either both should be set or both should be unset.
+ */
+ if (IS_ERR_OR_NULL(foffs)) {
+ btf_record_free(record);
+ ret = -EFAULT;
+ goto free;
+ }
+ type->record = record;
+ type->field_offs = foffs;
+ tab->cnt++;
+ }
+ return tab;
+free:
+ btf_struct_metas_free(tab);
+ return ERR_PTR(ret);
+}
+
+struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id)
+{
+ struct btf_struct_metas *tab;
+
+ BUILD_BUG_ON(offsetof(struct btf_struct_meta, btf_id) != 0);
+ tab = btf->struct_meta_tab;
+ if (!tab)
+ return NULL;
+ return bsearch(&btf_id, tab->types, tab->cnt, sizeof(tab->types[0]), btf_id_cmp_func);
+}
+
static int btf_check_type_tags(struct btf_verifier_env *env,
struct btf *btf, int start_id)
{
@@ -4968,6 +5395,7 @@ static int btf_check_type_tags(struct btf_verifier_env *env,
static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
u32 log_level, char __user *log_ubuf, u32 log_size)
{
+ struct btf_struct_metas *struct_meta_tab;
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
struct btf *btf = NULL;
@@ -5036,15 +5464,34 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
if (err)
goto errout;
+ struct_meta_tab = btf_parse_struct_metas(log, btf);
+ if (IS_ERR(struct_meta_tab)) {
+ err = PTR_ERR(struct_meta_tab);
+ goto errout;
+ }
+ btf->struct_meta_tab = struct_meta_tab;
+
+ if (struct_meta_tab) {
+ int i;
+
+ for (i = 0; i < struct_meta_tab->cnt; i++) {
+ err = btf_check_and_fixup_fields(btf, struct_meta_tab->types[i].record);
+ if (err < 0)
+ goto errout_meta;
+ }
+ }
+
if (log->level && bpf_verifier_log_full(log)) {
err = -ENOSPC;
- goto errout;
+ goto errout_meta;
}
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
return btf;
+errout_meta:
+ btf_free_struct_meta_tab(btf);
errout:
btf_verifier_env_free(env);
if (btf)
@@ -5086,7 +5533,7 @@ static u8 bpf_ctx_convert_map[] = {
#undef BPF_MAP_TYPE
#undef BPF_LINK_TYPE
-static const struct btf_member *
+const struct btf_member *
btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
@@ -5159,6 +5606,26 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
return kern_ctx_type->type;
}
+int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type)
+{
+ const struct btf_member *kctx_member;
+ const struct btf_type *conv_struct;
+ const struct btf_type *kctx_type;
+ u32 kctx_type_id;
+
+ conv_struct = bpf_ctx_convert.t;
+ /* get member for kernel ctx type */
+ kctx_member = btf_type_member(conv_struct) + bpf_ctx_convert_map[prog_type] * 2 + 1;
+ kctx_type_id = kctx_member->type;
+ kctx_type = btf_type_by_id(btf_vmlinux, kctx_type_id);
+ if (!btf_type_is_struct(kctx_type)) {
+ bpf_log(log, "kern ctx type id %u is not a struct\n", kctx_type_id);
+ return -EINVAL;
+ }
+
+ return kctx_type_id;
+}
+
BTF_ID_LIST(bpf_ctx_convert_btf_id)
BTF_ID(struct, bpf_ctx_convert)
@@ -5356,6 +5823,22 @@ static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto,
return nr_args + 1;
}
+static bool prog_args_trusted(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type atype = prog->expected_attach_type;
+
+ switch (prog->type) {
+ case BPF_PROG_TYPE_TRACING:
+ return atype == BPF_TRACE_RAW_TP || atype == BPF_TRACE_ITER;
+ case BPF_PROG_TYPE_LSM:
+ return bpf_lsm_is_trusted(prog);
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return true;
+ default:
+ return false;
+ }
+}
+
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
@@ -5499,6 +5982,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
info->reg_type = PTR_TO_BTF_ID;
+ if (prog_args_trusted(prog))
+ info->reg_type |= PTR_TRUSTED;
+
if (tgt_prog) {
enum bpf_prog_type tgt_type;
@@ -5765,6 +6251,9 @@ error:
/* check __percpu tag */
if (strcmp(tag_value, "percpu") == 0)
tmp_flag = MEM_PERCPU;
+ /* check __rcu tag */
+ if (strcmp(tag_value, "rcu") == 0)
+ tmp_flag = MEM_RCU;
}
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
@@ -5794,20 +6283,50 @@ error:
return -EINVAL;
}
-int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
- const struct btf_type *t, int off, int size,
- enum bpf_access_type atype __maybe_unused,
+int btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size, enum bpf_access_type atype __maybe_unused,
u32 *next_btf_id, enum bpf_type_flag *flag)
{
+ const struct btf *btf = reg->btf;
enum bpf_type_flag tmp_flag = 0;
+ const struct btf_type *t;
+ u32 id = reg->btf_id;
int err;
- u32 id;
+ while (type_is_alloc(reg->type)) {
+ struct btf_struct_meta *meta;
+ struct btf_record *rec;
+ int i;
+
+ meta = btf_find_struct_meta(btf, id);
+ if (!meta)
+ break;
+ rec = meta->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 offset = field->offset;
+ if (off < offset + btf_field_type_size(field->type) && offset < off + size) {
+ bpf_log(log,
+ "direct access to %s is disallowed\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
+ }
+ }
+ break;
+ }
+
+ t = btf_type_by_id(btf, id);
do {
err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
switch (err) {
case WALK_PTR:
+ /* For local types, the destination register cannot
+ * become a pointer again.
+ */
+ if (type_is_alloc(reg->type))
+ return SCALAR_VALUE;
/* If we found the pointer or scalar on t+off,
* we're done.
*/
@@ -5842,8 +6361,8 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
* end up with two different module BTFs, but IDs point to the common type in
* vmlinux BTF.
*/
-static bool btf_types_are_same(const struct btf *btf1, u32 id1,
- const struct btf *btf2, u32 id2)
+bool btf_types_are_same(const struct btf *btf1, u32 id1,
+ const struct btf *btf2, u32 id2)
{
if (id1 != id2)
return false;
@@ -6125,122 +6644,19 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr
return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
-#ifdef CONFIG_NET
- [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
- [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
- [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
-#endif
-};
-
-/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
-static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
- const struct btf *btf,
- const struct btf_type *t, int rec)
-{
- const struct btf_type *member_type;
- const struct btf_member *member;
- u32 i;
-
- if (!btf_type_is_struct(t))
- return false;
-
- for_each_member(i, t, member) {
- const struct btf_array *array;
-
- member_type = btf_type_skip_modifiers(btf, member->type, NULL);
- if (btf_type_is_struct(member_type)) {
- if (rec >= 3) {
- bpf_log(log, "max struct nesting depth exceeded\n");
- return false;
- }
- if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
- return false;
- continue;
- }
- if (btf_type_is_array(member_type)) {
- array = btf_type_array(member_type);
- if (!array->nelems)
- return false;
- member_type = btf_type_skip_modifiers(btf, array->type, NULL);
- if (!btf_type_is_scalar(member_type))
- return false;
- continue;
- }
- if (!btf_type_is_scalar(member_type))
- return false;
- }
- return true;
-}
-
-static bool is_kfunc_arg_mem_size(const struct btf *btf,
- const struct btf_param *arg,
- const struct bpf_reg_state *reg)
-{
- int len, sfx_len = sizeof("__sz") - 1;
- const struct btf_type *t;
- const char *param_name;
-
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
- return false;
-
- /* In the future, this can be ported to use BTF tagging */
- param_name = btf_name_by_offset(btf, arg->name_off);
- if (str_is_empty(param_name))
- return false;
- len = strlen(param_name);
- if (len < sfx_len)
- return false;
- param_name += len - sfx_len;
- if (strncmp(param_name, "__sz", sfx_len))
- return false;
-
- return true;
-}
-
-static bool btf_is_kfunc_arg_mem_size(const struct btf *btf,
- const struct btf_param *arg,
- const struct bpf_reg_state *reg,
- const char *name)
-{
- int len, target_len = strlen(name);
- const struct btf_type *t;
- const char *param_name;
-
- t = btf_type_skip_modifiers(btf, arg->type, NULL);
- if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
- return false;
-
- param_name = btf_name_by_offset(btf, arg->name_off);
- if (str_is_empty(param_name))
- return false;
- len = strlen(param_name);
- if (len != target_len)
- return false;
- if (strcmp(param_name, name))
- return false;
-
- return true;
-}
-
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok,
- struct bpf_kfunc_arg_meta *kfunc_meta,
bool processing_call)
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
- bool rel = false, kptr_get = false, trusted_args = false;
- bool sleepable = false;
struct bpf_verifier_log *log = &env->log;
- u32 i, nargs, ref_id, ref_obj_id = 0;
- bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
- int ref_regno = 0, ret;
+ u32 i, nargs, ref_id;
+ int ret;
t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
@@ -6266,14 +6682,6 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
- if (is_kfunc && kfunc_meta) {
- /* Only kfunc can be release func */
- rel = kfunc_meta->flags & KF_RELEASE;
- kptr_get = kfunc_meta->flags & KF_KPTR_GET;
- trusted_args = kfunc_meta->flags & KF_TRUSTED_ARGS;
- sleepable = kfunc_meta->flags & KF_SLEEPABLE;
- }
-
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
@@ -6281,42 +6689,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
enum bpf_arg_type arg_type = ARG_DONTCARE;
u32 regno = i + 1;
struct bpf_reg_state *reg = &regs[regno];
- bool obj_ptr = false;
t = btf_type_skip_modifiers(btf, args[i].type, NULL);
if (btf_type_is_scalar(t)) {
- if (is_kfunc && kfunc_meta) {
- bool is_buf_size = false;
-
- /* check for any const scalar parameter of name "rdonly_buf_size"
- * or "rdwr_buf_size"
- */
- if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
- "rdonly_buf_size")) {
- kfunc_meta->r0_rdonly = true;
- is_buf_size = true;
- } else if (btf_is_kfunc_arg_mem_size(btf, &args[i], reg,
- "rdwr_buf_size"))
- is_buf_size = true;
-
- if (is_buf_size) {
- if (kfunc_meta->r0_size) {
- bpf_log(log, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
- return -EINVAL;
- }
-
- if (!tnum_is_const(reg->var_off)) {
- bpf_log(log, "R%d is not a const\n", regno);
- return -EINVAL;
- }
-
- kfunc_meta->r0_size = reg->var_off.value;
- ret = mark_chain_precision(env, regno);
- if (ret)
- return ret;
- }
- }
-
if (reg->type == SCALAR_VALUE)
continue;
bpf_log(log, "R%d is not a scalar\n", regno);
@@ -6329,88 +6704,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
- /* These register types have special constraints wrt ref_obj_id
- * and offset checks. The rest of trusted args don't.
- */
- obj_ptr = reg->type == PTR_TO_CTX || reg->type == PTR_TO_BTF_ID ||
- reg2btf_ids[base_type(reg->type)];
-
- /* Check if argument must be a referenced pointer, args + i has
- * been verified to be a pointer (after skipping modifiers).
- * PTR_TO_CTX is ok without having non-zero ref_obj_id.
- */
- if (is_kfunc && trusted_args && (obj_ptr && reg->type != PTR_TO_CTX) && !reg->ref_obj_id) {
- bpf_log(log, "R%d must be referenced\n", regno);
- return -EINVAL;
- }
-
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- /* Trusted args have the same offset checks as release arguments */
- if ((trusted_args && obj_ptr) || (rel && reg->ref_obj_id))
- arg_type |= OBJ_RELEASE;
ret = check_func_arg_reg_off(env, reg, regno, arg_type);
if (ret < 0)
return ret;
- if (is_kfunc && reg->ref_obj_id) {
- /* Ensure only one argument is referenced PTR_TO_BTF_ID */
- if (ref_obj_id) {
- bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
- regno, reg->ref_obj_id, ref_obj_id);
- return -EFAULT;
- }
- ref_regno = regno;
- ref_obj_id = reg->ref_obj_id;
- }
-
- /* kptr_get is only true for kfunc */
- if (i == 0 && kptr_get) {
- struct bpf_map_value_off_desc *off_desc;
-
- if (reg->type != PTR_TO_MAP_VALUE) {
- bpf_log(log, "arg#0 expected pointer to map value\n");
- return -EINVAL;
- }
-
- /* check_func_arg_reg_off allows var_off for
- * PTR_TO_MAP_VALUE, but we need fixed offset to find
- * off_desc.
- */
- if (!tnum_is_const(reg->var_off)) {
- bpf_log(log, "arg#0 must have constant offset\n");
- return -EINVAL;
- }
-
- off_desc = bpf_map_kptr_off_contains(reg->map_ptr, reg->off + reg->var_off.value);
- if (!off_desc || off_desc->type != BPF_KPTR_REF) {
- bpf_log(log, "arg#0 no referenced kptr at map value offset=%llu\n",
- reg->off + reg->var_off.value);
- return -EINVAL;
- }
-
- if (!btf_type_is_ptr(ref_t)) {
- bpf_log(log, "arg#0 BTF type must be a double pointer\n");
- return -EINVAL;
- }
-
- ref_t = btf_type_skip_modifiers(btf, ref_t->type, &ref_id);
- ref_tname = btf_name_by_offset(btf, ref_t->name_off);
-
- if (!btf_type_is_struct(ref_t)) {
- bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
- func_name, i, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- if (!btf_struct_ids_match(log, btf, ref_id, 0, off_desc->kptr.btf,
- off_desc->kptr.btf_id, true)) {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s\n",
- func_name, i, btf_type_str(ref_t), ref_tname);
- return -EINVAL;
- }
- /* rest of the arguments can be anything, like normal kfunc */
- } else if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
@@ -6420,109 +6721,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
i, btf_type_str(t));
return -EINVAL;
}
- } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
- (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
- const struct btf_type *reg_ref_t;
- const struct btf *reg_btf;
- const char *reg_ref_tname;
- u32 reg_ref_id;
-
- if (!btf_type_is_struct(ref_t)) {
- bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
- func_name, i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- if (reg->type == PTR_TO_BTF_ID) {
- reg_btf = reg->btf;
- reg_ref_id = reg->btf_id;
- } else {
- reg_btf = btf_vmlinux;
- reg_ref_id = *reg2btf_ids[base_type(reg->type)];
- }
-
- reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
- &reg_ref_id);
- reg_ref_tname = btf_name_by_offset(reg_btf,
- reg_ref_t->name_off);
- if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
- reg->off, btf, ref_id,
- trusted_args || (rel && reg->ref_obj_id))) {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
- func_name, i,
- btf_type_str(ref_t), ref_tname,
- regno, btf_type_str(reg_ref_t),
- reg_ref_tname);
- return -EINVAL;
- }
} else if (ptr_to_mem_ok && processing_call) {
const struct btf_type *resolve_ret;
u32 type_size;
- if (is_kfunc) {
- bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
- bool arg_dynptr = btf_type_is_struct(ref_t) &&
- !strcmp(ref_tname,
- stringify_struct(bpf_dynptr_kern));
-
- /* Permit pointer to mem, but only when argument
- * type is pointer to scalar, or struct composed
- * (recursively) of scalars.
- * When arg_mem_size is true, the pointer can be
- * void *.
- * Also permit initialized local dynamic pointers.
- */
- if (!btf_type_is_scalar(ref_t) &&
- !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
- !arg_dynptr &&
- (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
- bpf_log(log,
- "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
- i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
- return -EINVAL;
- }
-
- if (arg_dynptr) {
- if (reg->type != PTR_TO_STACK) {
- bpf_log(log, "arg#%d pointer type %s %s not to stack\n",
- i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- if (!is_dynptr_reg_valid_init(env, reg)) {
- bpf_log(log,
- "arg#%d pointer type %s %s must be valid and initialized\n",
- i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- if (!is_dynptr_type_expected(env, reg,
- ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL)) {
- bpf_log(log,
- "arg#%d pointer type %s %s points to unsupported dynamic pointer type\n",
- i, btf_type_str(ref_t),
- ref_tname);
- return -EINVAL;
- }
-
- continue;
- }
-
- /* Check for mem, len pair */
- if (arg_mem_size) {
- if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
- bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n",
- i, i + 1);
- return -EINVAL;
- }
- i++;
- continue;
- }
- }
-
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
if (IS_ERR(resolve_ret)) {
bpf_log(log,
@@ -6535,36 +6737,13 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (check_mem_reg(env, reg, regno, type_size))
return -EINVAL;
} else {
- bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
- is_kfunc ? "kernel " : "", func_name, func_id);
+ bpf_log(log, "reg type unsupported for arg#%d function %s#%d\n", i,
+ func_name, func_id);
return -EINVAL;
}
}
- /* Either both are set, or neither */
- WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno));
- /* We already made sure ref_obj_id is set only for one argument. We do
- * allow (!rel && ref_obj_id), so that passing such referenced
- * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when
- * is_kfunc is true.
- */
- if (rel && !ref_obj_id) {
- bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
- func_name);
- return -EINVAL;
- }
-
- if (sleepable && !env->prog->aux->sleepable) {
- bpf_log(log, "kernel function %s is sleepable but the program is not\n",
- func_name);
- return -EINVAL;
- }
-
- if (kfunc_meta && ref_obj_id)
- kfunc_meta->ref_obj_id = ref_obj_id;
-
- /* returns argument register number > 0 in case of reference release kfunc */
- return rel ? ref_regno : 0;
+ return 0;
}
/* Compare BTF of a function declaration with given bpf_reg_state.
@@ -6594,7 +6773,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, false);
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, false);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
@@ -6637,7 +6816,7 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, NULL, true);
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, true);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
@@ -6648,14 +6827,6 @@ int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog,
return err;
}
-int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
- const struct btf *btf, u32 func_id,
- struct bpf_reg_state *regs,
- struct bpf_kfunc_arg_meta *meta)
-{
- return btf_check_func_arg_match(env, btf, func_id, regs, true, meta, true);
-}
-
/* Convert BTF of a function into bpf_reg_state if possible
* Returns:
* EFAULT - there is a verifier bug. Abort verification.
@@ -7038,23 +7209,6 @@ bool btf_is_module(const struct btf *btf)
return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
}
-static int btf_id_cmp_func(const void *a, const void *b)
-{
- const int *pa = a, *pb = b;
-
- return *pa - *pb;
-}
-
-bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
-{
- return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
-}
-
-static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
-{
- return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func);
-}
-
enum {
BTF_MODULE_F_LIVE = (1 << 0),
};
@@ -7415,6 +7569,8 @@ static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
{
switch (prog_type) {
+ case BPF_PROG_TYPE_UNSPEC:
+ return BTF_KFUNC_HOOK_COMMON;
case BPF_PROG_TYPE_XDP:
return BTF_KFUNC_HOOK_XDP;
case BPF_PROG_TYPE_SCHED_CLS:
@@ -7443,16 +7599,24 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf,
u32 kfunc_btf_id)
{
enum btf_kfunc_hook hook;
+ u32 *kfunc_flags;
+
+ kfunc_flags = __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_COMMON, kfunc_btf_id);
+ if (kfunc_flags)
+ return kfunc_flags;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
}
-/* This function must be invoked only from initcalls/module init functions */
-int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
- const struct btf_kfunc_id_set *kset)
+u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id)
+{
+ return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id);
+}
+
+static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
{
- enum btf_kfunc_hook hook;
struct btf *btf;
int ret;
@@ -7471,13 +7635,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
if (IS_ERR(btf))
return PTR_ERR(btf);
- hook = bpf_prog_type_to_kfunc_hook(prog_type);
ret = btf_populate_kfunc_set(btf, hook, kset->set);
btf_put(btf);
return ret;
}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __register_btf_kfunc_id_set(hook, kset);
+}
EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset)
+{
+ return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset);
+}
+EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set);
+
s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id)
{
struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab;
diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c
index 9fcf09f2ef00..06989d278846 100644
--- a/kernel/bpf/cgroup_iter.c
+++ b/kernel/bpf/cgroup_iter.c
@@ -157,23 +157,37 @@ static const struct seq_operations cgroup_iter_seq_ops = {
.show = cgroup_iter_seq_show,
};
-BTF_ID_LIST_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
+BTF_ID_LIST_GLOBAL_SINGLE(bpf_cgroup_btf_id, struct, cgroup)
static int cgroup_iter_seq_init(void *priv, struct bpf_iter_aux_info *aux)
{
struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
struct cgroup *cgrp = aux->cgroup.start;
+ /* bpf_iter_attach_cgroup() has already acquired an extra reference
+ * for the start cgroup, but the reference may be released after
+ * cgroup_iter_seq_init(), so acquire another reference for the
+ * start cgroup.
+ */
p->start_css = &cgrp->self;
+ css_get(p->start_css);
p->terminate = false;
p->visited_all = false;
p->order = aux->cgroup.order;
return 0;
}
+static void cgroup_iter_seq_fini(void *priv)
+{
+ struct cgroup_iter_priv *p = (struct cgroup_iter_priv *)priv;
+
+ css_put(p->start_css);
+}
+
static const struct bpf_iter_seq_info cgroup_iter_seq_info = {
.seq_ops = &cgroup_iter_seq_ops,
.init_seq_private = cgroup_iter_seq_init,
+ .fini_seq_private = cgroup_iter_seq_fini,
.seq_priv_size = sizeof(struct cgroup_iter_priv),
};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 25a54e04560e..ba3fff17e2f9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -34,6 +34,7 @@
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <linux/nodemask.h>
+#include <linux/bpf_mem_alloc.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@@ -60,6 +61,9 @@
#define CTX regs[BPF_REG_CTX]
#define IMM insn->imm
+struct bpf_mem_alloc bpf_global_ma;
+bool bpf_global_ma_set;
+
/* No hurry in this branch
*
* Exported for the bpf jit load helper.
@@ -864,8 +868,7 @@ static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_ins
list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
- set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
- set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
return pack;
}
@@ -883,8 +886,7 @@ void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns)
if (ptr) {
bpf_fill_ill_insns(ptr, size);
set_vm_flush_reset_perms(ptr);
- set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
- set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
+ set_memory_rox((unsigned long)ptr, size / PAGE_SIZE);
}
goto out;
}
@@ -1032,7 +1034,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
- start = prandom_u32_max(hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
/* Leave a random number of instructions before BPF code. */
*image_ptr = &hdr->image[start];
@@ -1094,7 +1096,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
- start = prandom_u32_max(hole) & ~(alignment - 1);
+ start = get_random_u32_below(hole) & ~(alignment - 1);
*image_ptr = &ro_header->image[start];
*rw_image = &(*rw_header)->image[start];
@@ -2088,6 +2090,7 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_map_compatible(struct bpf_map *map,
const struct bpf_prog *fp)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(fp);
bool ret;
if (fp->kprobe_override)
@@ -2098,12 +2101,12 @@ bool bpf_prog_map_compatible(struct bpf_map *map,
/* There's no owner yet where we could check for
* compatibility.
*/
- map->owner.type = fp->type;
+ map->owner.type = prog_type;
map->owner.jited = fp->jited;
map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
ret = true;
} else {
- ret = map->owner.type == fp->type &&
+ ret = map->owner.type == prog_type &&
map->owner.jited == fp->jited &&
map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
@@ -2251,8 +2254,14 @@ static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
{
struct bpf_prog_array *progs;
+ /* If RCU Tasks Trace grace period implies RCU grace period, there is
+ * no need to call kfree_rcu(), just call kfree() directly.
+ */
progs = container_of(rcu, struct bpf_prog_array, rcu);
- kfree_rcu(progs, rcu);
+ if (rcu_trace_implies_rcu_gp())
+ kfree(progs);
+ else
+ kfree_rcu(progs, rcu);
}
void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
@@ -2740,6 +2749,18 @@ int __weak bpf_arch_text_invalidate(void *dst, size_t len)
return -ENOTSUPP;
}
+#ifdef CONFIG_BPF_SYSCALL
+static int __init bpf_global_ma_init(void)
+{
+ int ret;
+
+ ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false);
+ bpf_global_ma_set = !ret;
+ return ret;
+}
+late_initcall(bpf_global_ma_init);
+#endif
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b5ba34ddd4b6..e0b2d016f0bf 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -4,13 +4,16 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*/
-/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+/**
+ * DOC: cpu map
+ * The 'cpumap' is primarily used as a backend map for XDP BPF helper
* call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
*
- * Unlike devmap which redirects XDP frames out another NIC device,
+ * Unlike devmap which redirects XDP frames out to another NIC device,
* this map type redirects raw XDP frames to another CPU. The remote
* CPU will do SKB-allocation and call the normal network stack.
- *
+ */
+/*
* This is a scalability and isolation mechanism, that allow
* separating the early driver network XDP layer, from the rest of the
* netstack, and assigning dedicated CPUs for this stage. This
@@ -85,7 +88,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
- int err = -ENOMEM;
if (!bpf_capable())
return ERR_PTR(-EPERM);
@@ -97,29 +99,26 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
+ /* Pre-limit array size based on NR_CPUS, not final CPU check */
+ if (attr->max_entries > NR_CPUS)
+ return ERR_PTR(-E2BIG);
+
cmap = bpf_map_area_alloc(sizeof(*cmap), NUMA_NO_NODE);
if (!cmap)
return ERR_PTR(-ENOMEM);
bpf_map_init_from_attr(&cmap->map, attr);
- /* Pre-limit array size based on NR_CPUS, not final CPU check */
- if (cmap->map.max_entries > NR_CPUS) {
- err = -E2BIG;
- goto free_cmap;
- }
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
- if (!cmap->cpu_map)
- goto free_cmap;
+ if (!cmap->cpu_map) {
+ bpf_map_area_free(cmap);
+ return ERR_PTR(-ENOMEM);
+ }
return &cmap->map;
-free_cmap:
- bpf_map_area_free(cmap);
- return ERR_PTR(err);
}
static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
@@ -668,9 +667,9 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags)
{
- return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ return __bpf_xdp_redirect_map(map, index, flags, 0,
__cpu_map_lookup_elem);
}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f9a87dcc5535..d01e4c55b376 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -992,14 +992,14 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
map, key, value, map_flags);
}
-static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
__dev_map_lookup_elem);
}
-static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
return __bpf_xdp_redirect_map(map, ifindex, flags,
BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index 04f0a045dcaa..fa3e9225aedc 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -4,7 +4,7 @@
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
-#include <linux/init.h>
+#include <linux/static_call.h>
/* The BPF dispatcher is a multiway branch code generator. The
* dispatcher is a mechanism to avoid the performance penalty of an
@@ -91,11 +91,6 @@ int __weak arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int n
return -ENOTSUPP;
}
-int __weak __init bpf_arch_init_dispatcher_early(void *ip)
-{
- return -ENOTSUPP;
-}
-
static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *buf)
{
s64 ips[BPF_DISPATCHER_MAX] = {}, *ipsp = &ips[0];
@@ -110,17 +105,11 @@ static int bpf_dispatcher_prepare(struct bpf_dispatcher *d, void *image, void *b
static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
{
- void *old, *new, *tmp;
- u32 noff;
- int err;
-
- if (!prev_num_progs) {
- old = NULL;
- noff = 0;
- } else {
- old = d->image + d->image_off;
+ void *new, *tmp;
+ u32 noff = 0;
+
+ if (prev_num_progs)
noff = d->image_off ^ (PAGE_SIZE / 2);
- }
new = d->num_progs ? d->image + noff : NULL;
tmp = d->num_progs ? d->rw_image + noff : NULL;
@@ -134,11 +123,15 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
return;
}
- err = bpf_arch_text_poke(d->func, BPF_MOD_JUMP, old, new);
- if (err || !new)
- return;
+ __BPF_DISPATCHER_UPDATE(d, new ?: (void *)&bpf_dispatcher_nop_func);
+
+ /* Make sure all the callers executing the previous/old half of the
+ * image leave it, so following update call can modify it safely.
+ */
+ synchronize_rcu();
- d->image_off = noff;
+ if (new)
+ d->image_off = noff;
}
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index f39ee3e05589..5aa2b5525f79 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -222,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int i;
- if (!map_value_has_timer(&htab->map))
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
@@ -231,28 +231,25 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab)
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_timer_cancel_and_free(elem->key +
- round_up(htab->map.key_size, 8) +
- htab->map.timer_off);
+ bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
}
-static void htab_free_prealloced_kptrs(struct bpf_htab *htab)
+static void htab_free_prealloced_fields(struct bpf_htab *htab)
{
u32 num_entries = htab->map.max_entries;
int i;
- if (!map_value_has_kptrs(&htab->map))
+ if (IS_ERR_OR_NULL(htab->map.record))
return;
if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
-
for (i = 0; i < num_entries; i++) {
struct htab_elem *elem;
elem = get_htab_elem(htab, i);
- bpf_map_free_kptrs(&htab->map, elem->key + round_up(htab->map.key_size, 8));
+ bpf_obj_free_fields(htab->map.record, elem->key + round_up(htab->map.key_size, 8));
cond_resched();
}
}
@@ -764,10 +761,7 @@ static void check_and_free_fields(struct bpf_htab *htab,
{
void *map_value = elem->key + round_up(htab->map.key_size, 8);
- if (map_value_has_timer(&htab->map))
- bpf_timer_cancel_and_free(map_value + htab->map.timer_off);
- if (map_value_has_kptrs(&htab->map))
- bpf_map_free_kptrs(&htab->map, map_value);
+ bpf_obj_free_fields(htab->map.record, map_value);
}
/* It is called from the bpf_lru_list when the LRU needs to delete
@@ -1091,7 +1085,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
head = &b->head;
if (unlikely(map_flags & BPF_F_LOCK)) {
- if (unlikely(!map_value_has_spin_lock(map)))
+ if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
/* find an element without taking the bucket lock */
l_old = lookup_nulls_elem_raw(head, hash, key, key_size,
@@ -1474,12 +1468,8 @@ static void htab_free_malloced_timers(struct bpf_htab *htab)
struct htab_elem *l;
hlist_nulls_for_each_entry(l, n, head, hash_node) {
- /* We don't reset or free kptr on uref dropping to zero,
- * hence just free timer.
- */
- bpf_timer_cancel_and_free(l->key +
- round_up(htab->map.key_size, 8) +
- htab->map.timer_off);
+ /* We only free timer on uref dropping to zero */
+ bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8));
}
cond_resched_rcu();
}
@@ -1490,8 +1480,8 @@ static void htab_map_free_timers(struct bpf_map *map)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
- /* We don't reset or free kptr on uref dropping to zero. */
- if (!map_value_has_timer(&htab->map))
+ /* We only free timer on uref dropping to zero */
+ if (!btf_record_has_field(htab->map.record, BPF_TIMER))
return;
if (!htab_is_prealloc(htab))
htab_free_malloced_timers(htab);
@@ -1517,11 +1507,10 @@ static void htab_map_free(struct bpf_map *map)
if (!htab_is_prealloc(htab)) {
delete_all_elements(htab);
} else {
- htab_free_prealloced_kptrs(htab);
+ htab_free_prealloced_fields(htab);
prealloc_destroy(htab);
}
- bpf_map_free_kptr_off_tab(map);
free_percpu(htab->extra_elems);
bpf_map_area_free(htab->buckets);
bpf_mem_alloc_destroy(&htab->pcpu_ma);
@@ -1675,7 +1664,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
elem_map_flags = attr->batch.elem_flags;
if ((elem_map_flags & ~BPF_F_LOCK) ||
- ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
+ ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
map_flags = attr->batch.flags;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a6b04faed282..af30c6cbd65d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -4,6 +4,7 @@
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bpf-cgroup.h>
+#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@@ -19,6 +20,7 @@
#include <linux/proc_ns.h>
#include <linux/security.h>
#include <linux/btf_ids.h>
+#include <linux/bpf_mem_alloc.h>
#include "../../lib/kstrtox.h"
@@ -336,6 +338,7 @@ const struct bpf_func_proto bpf_spin_lock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
static inline void __bpf_spin_unlock_irqrestore(struct bpf_spin_lock *lock)
@@ -358,6 +361,7 @@ const struct bpf_func_proto bpf_spin_unlock_proto = {
.gpl_only = false,
.ret_type = RET_VOID,
.arg1_type = ARG_PTR_TO_SPIN_LOCK,
+ .arg1_btf_id = BPF_PTR_POISON,
};
void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
@@ -366,9 +370,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
struct bpf_spin_lock *lock;
if (lock_src)
- lock = src + map->spin_lock_off;
+ lock = src + map->record->spin_lock_off;
else
- lock = dst + map->spin_lock_off;
+ lock = dst + map->record->spin_lock_off;
preempt_disable();
__bpf_spin_lock_irqsave(lock);
copy_map_value(map, dst, src);
@@ -657,6 +661,7 @@ BPF_CALL_3(bpf_copy_from_user, void *, dst, u32, size,
const struct bpf_func_proto bpf_copy_from_user_proto = {
.func = bpf_copy_from_user,
.gpl_only = false,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
@@ -687,6 +692,7 @@ BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
const struct bpf_func_proto bpf_copy_from_user_task_proto = {
.func = bpf_copy_from_user_task,
.gpl_only = true,
+ .might_sleep = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
@@ -1169,7 +1175,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
ret = -ENOMEM;
goto out;
}
- t->value = (void *)timer - map->timer_off;
+ t->value = (void *)timer - map->record->timer_off;
t->map = map;
t->prog = NULL;
rcu_assign_pointer(t->callback_fn, NULL);
@@ -1398,7 +1404,7 @@ static const struct bpf_func_proto bpf_kptr_xchg_proto = {
#define DYNPTR_SIZE_MASK 0xFFFFFF
#define DYNPTR_RDONLY_BIT BIT(31)
-static bool bpf_dynptr_is_rdonly(struct bpf_dynptr_kern *ptr)
+static bool bpf_dynptr_is_rdonly(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_RDONLY_BIT;
}
@@ -1408,7 +1414,7 @@ static void bpf_dynptr_set_type(struct bpf_dynptr_kern *ptr, enum bpf_dynptr_typ
ptr->size |= type << DYNPTR_TYPE_SHIFT;
}
-u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr)
+u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr)
{
return ptr->size & DYNPTR_SIZE_MASK;
}
@@ -1432,7 +1438,7 @@ void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr)
memset(ptr, 0, sizeof(*ptr));
}
-static int bpf_dynptr_check_off_len(struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
+static int bpf_dynptr_check_off_len(const struct bpf_dynptr_kern *ptr, u32 offset, u32 len)
{
u32 size = bpf_dynptr_get_size(ptr);
@@ -1477,7 +1483,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
};
-BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src,
+BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src,
u32, offset, u64, flags)
{
int err;
@@ -1489,7 +1495,11 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
if (err)
return err;
- memcpy(dst, src->data + src->offset + offset, len);
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst, src->data + src->offset + offset, len);
return 0;
}
@@ -1500,12 +1510,12 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_UNINIT_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_PTR_TO_DYNPTR,
+ .arg3_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
.arg4_type = ARG_ANYTHING,
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
+BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src,
u32, len, u64, flags)
{
int err;
@@ -1517,7 +1527,11 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *,
if (err)
return err;
- memcpy(dst->data + dst->offset + offset, src, len);
+ /* Source and destination may possibly overlap, hence use memmove to
+ * copy the data. E.g. bpf_dynptr_from_mem may create two dynptr
+ * pointing to overlapping PTR_TO_MAP_VALUE regions.
+ */
+ memmove(dst->data + dst->offset + offset, src, len);
return 0;
}
@@ -1526,14 +1540,14 @@ static const struct bpf_func_proto bpf_dynptr_write_proto = {
.func = bpf_dynptr_write,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_DYNPTR,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
.arg5_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
+BPF_CALL_3(bpf_dynptr_data, const struct bpf_dynptr_kern *, ptr, u32, offset, u32, len)
{
int err;
@@ -1554,7 +1568,7 @@ static const struct bpf_func_proto bpf_dynptr_data_proto = {
.func = bpf_dynptr_data,
.gpl_only = false,
.ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
- .arg1_type = ARG_PTR_TO_DYNPTR,
+ .arg1_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
};
@@ -1663,6 +1677,12 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_dynptr_write_proto;
case BPF_FUNC_dynptr_data:
return &bpf_dynptr_data_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_cgrp_storage_get:
+ return &bpf_cgrp_storage_get_proto;
+ case BPF_FUNC_cgrp_storage_delete:
+ return &bpf_cgrp_storage_delete_proto;
+#endif
default:
break;
}
@@ -1700,20 +1720,401 @@ bpf_base_func_proto(enum bpf_func_id func_id)
}
}
-BTF_SET8_START(tracing_btf_ids)
+void bpf_list_head_free(const struct btf_field *field, void *list_head,
+ struct bpf_spin_lock *spin_lock)
+{
+ struct list_head *head = list_head, *orig_head = list_head;
+
+ BUILD_BUG_ON(sizeof(struct list_head) > sizeof(struct bpf_list_head));
+ BUILD_BUG_ON(__alignof__(struct list_head) > __alignof__(struct bpf_list_head));
+
+ /* Do the actual list draining outside the lock to not hold the lock for
+ * too long, and also prevent deadlocks if tracing programs end up
+ * executing on entry/exit of functions called inside the critical
+ * section, and end up doing map ops that call bpf_list_head_free for
+ * the same map value again.
+ */
+ __bpf_spin_lock_irqsave(spin_lock);
+ if (!head->next || list_empty(head))
+ goto unlock;
+ head = head->next;
+unlock:
+ INIT_LIST_HEAD(orig_head);
+ __bpf_spin_unlock_irqrestore(spin_lock);
+
+ while (head != orig_head) {
+ void *obj = head;
+
+ obj -= field->list_head.node_offset;
+ head = head->next;
+ /* The contained type can also have resources, including a
+ * bpf_list_head which needs to be freed.
+ */
+ bpf_obj_free_fields(field->list_head.value_rec, obj);
+ /* bpf_mem_free requires migrate_disable(), since we can be
+ * called from map free path as well apart from BPF program (as
+ * part of map ops doing bpf_obj_free_fields).
+ */
+ migrate_disable();
+ bpf_mem_free(&bpf_global_ma, obj);
+ migrate_enable();
+ }
+}
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
+
+void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ u64 size = local_type_id__k;
+ void *p;
+
+ p = bpf_mem_alloc(&bpf_global_ma, size);
+ if (!p)
+ return NULL;
+ if (meta)
+ bpf_obj_init(meta->field_offs, p);
+ return p;
+}
+
+void bpf_obj_drop_impl(void *p__alloc, void *meta__ign)
+{
+ struct btf_struct_meta *meta = meta__ign;
+ void *p = p__alloc;
+
+ if (meta)
+ bpf_obj_free_fields(meta->record, p);
+ bpf_mem_free(&bpf_global_ma, p);
+}
+
+static void __bpf_list_add(struct bpf_list_node *node, struct bpf_list_head *head, bool tail)
+{
+ struct list_head *n = (void *)node, *h = (void *)head;
+
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+ if (unlikely(!n->next))
+ INIT_LIST_HEAD(n);
+ tail ? list_add_tail(n, h) : list_add(n, h);
+}
+
+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node)
+{
+ return __bpf_list_add(node, head, false);
+}
+
+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node)
+{
+ return __bpf_list_add(node, head, true);
+}
+
+static struct bpf_list_node *__bpf_list_del(struct bpf_list_head *head, bool tail)
+{
+ struct list_head *n, *h = (void *)head;
+
+ if (unlikely(!h->next))
+ INIT_LIST_HEAD(h);
+ if (list_empty(h))
+ return NULL;
+ n = tail ? h->prev : h->next;
+ list_del_init(n);
+ return (struct bpf_list_node *)n;
+}
+
+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, false);
+}
+
+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head)
+{
+ return __bpf_list_del(head, true);
+}
+
+/**
+ * bpf_task_acquire - Acquire a reference to a task. A task acquired by this
+ * kfunc which is not stored in a map as a kptr, must be released by calling
+ * bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+struct task_struct *bpf_task_acquire(struct task_struct *p)
+{
+ return get_task_struct(p);
+}
+
+/**
+ * bpf_task_acquire_not_zero - Acquire a reference to a rcu task object. A task
+ * acquired by this kfunc which is not stored in a map as a kptr, must be
+ * released by calling bpf_task_release().
+ * @p: The task on which a reference is being acquired.
+ */
+struct task_struct *bpf_task_acquire_not_zero(struct task_struct *p)
+{
+ /* For the time being this function returns NULL, as it's not currently
+ * possible to safely acquire a reference to a task with RCU protection
+ * using get_task_struct() and put_task_struct(). This is due to the
+ * slightly odd mechanics of p->rcu_users, and how task RCU protection
+ * works.
+ *
+ * A struct task_struct is refcounted by two different refcount_t
+ * fields:
+ *
+ * 1. p->usage: The "true" refcount field which tracks a task's
+ * lifetime. The task is freed as soon as this
+ * refcount drops to 0.
+ *
+ * 2. p->rcu_users: An "RCU users" refcount field which is statically
+ * initialized to 2, and is co-located in a union with
+ * a struct rcu_head field (p->rcu). p->rcu_users
+ * essentially encapsulates a single p->usage
+ * refcount, and when p->rcu_users goes to 0, an RCU
+ * callback is scheduled on the struct rcu_head which
+ * decrements the p->usage refcount.
+ *
+ * There are two important implications to this task refcounting logic
+ * described above. The first is that
+ * refcount_inc_not_zero(&p->rcu_users) cannot be used anywhere, as
+ * after the refcount goes to 0, the RCU callback being scheduled will
+ * cause the memory backing the refcount to again be nonzero due to the
+ * fields sharing a union. The other is that we can't rely on RCU to
+ * guarantee that a task is valid in a BPF program. This is because a
+ * task could have already transitioned to being in the TASK_DEAD
+ * state, had its rcu_users refcount go to 0, and its rcu callback
+ * invoked in which it drops its single p->usage reference. At this
+ * point the task will be freed as soon as the last p->usage reference
+ * goes to 0, without waiting for another RCU gp to elapse. The only
+ * way that a BPF program can guarantee that a task is valid is in this
+ * scenario is to hold a p->usage refcount itself.
+ *
+ * Until we're able to resolve this issue, either by pulling
+ * p->rcu_users and p->rcu out of the union, or by getting rid of
+ * p->usage and just using p->rcu_users for refcounting, we'll just
+ * return NULL here.
+ */
+ return NULL;
+}
+
+/**
+ * bpf_task_kptr_get - Acquire a reference on a struct task_struct kptr. A task
+ * kptr acquired by this kfunc which is not subsequently stored in a map, must
+ * be released by calling bpf_task_release().
+ * @pp: A pointer to a task kptr on which a reference is being acquired.
+ */
+struct task_struct *bpf_task_kptr_get(struct task_struct **pp)
+{
+ /* We must return NULL here until we have clarity on how to properly
+ * leverage RCU for ensuring a task's lifetime. See the comment above
+ * in bpf_task_acquire_not_zero() for more details.
+ */
+ return NULL;
+}
+
+/**
+ * bpf_task_release - Release the reference acquired on a task.
+ * @p: The task on which a reference is being released.
+ */
+void bpf_task_release(struct task_struct *p)
+{
+ if (!p)
+ return;
+
+ put_task_struct(p);
+}
+
+#ifdef CONFIG_CGROUPS
+/**
+ * bpf_cgroup_acquire - Acquire a reference to a cgroup. A cgroup acquired by
+ * this kfunc which is not stored in a map as a kptr, must be released by
+ * calling bpf_cgroup_release().
+ * @cgrp: The cgroup on which a reference is being acquired.
+ */
+struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp)
+{
+ cgroup_get(cgrp);
+ return cgrp;
+}
+
+/**
+ * bpf_cgroup_kptr_get - Acquire a reference on a struct cgroup kptr. A cgroup
+ * kptr acquired by this kfunc which is not subsequently stored in a map, must
+ * be released by calling bpf_cgroup_release().
+ * @cgrpp: A pointer to a cgroup kptr on which a reference is being acquired.
+ */
+struct cgroup *bpf_cgroup_kptr_get(struct cgroup **cgrpp)
+{
+ struct cgroup *cgrp;
+
+ rcu_read_lock();
+ /* Another context could remove the cgroup from the map and release it
+ * at any time, including after we've done the lookup above. This is
+ * safe because we're in an RCU read region, so the cgroup is
+ * guaranteed to remain valid until at least the rcu_read_unlock()
+ * below.
+ */
+ cgrp = READ_ONCE(*cgrpp);
+
+ if (cgrp && !cgroup_tryget(cgrp))
+ /* If the cgroup had been removed from the map and freed as
+ * described above, cgroup_tryget() will return false. The
+ * cgroup will be freed at some point after the current RCU gp
+ * has ended, so just return NULL to the user.
+ */
+ cgrp = NULL;
+ rcu_read_unlock();
+
+ return cgrp;
+}
+
+/**
+ * bpf_cgroup_release - Release the reference acquired on a cgroup.
+ * If this kfunc is invoked in an RCU read region, the cgroup is guaranteed to
+ * not be freed until the current grace period has ended, even if its refcount
+ * drops to 0.
+ * @cgrp: The cgroup on which a reference is being released.
+ */
+void bpf_cgroup_release(struct cgroup *cgrp)
+{
+ if (!cgrp)
+ return;
+
+ cgroup_put(cgrp);
+}
+
+/**
+ * bpf_cgroup_ancestor - Perform a lookup on an entry in a cgroup's ancestor
+ * array. A cgroup returned by this kfunc which is not subsequently stored in a
+ * map, must be released by calling bpf_cgroup_release().
+ * @cgrp: The cgroup for which we're performing a lookup.
+ * @level: The level of ancestor to look up.
+ */
+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
+{
+ struct cgroup *ancestor;
+
+ if (level > cgrp->level || level < 0)
+ return NULL;
+
+ ancestor = cgrp->ancestors[level];
+ cgroup_get(ancestor);
+ return ancestor;
+}
+#endif /* CONFIG_CGROUPS */
+
+/**
+ * bpf_task_from_pid - Find a struct task_struct from its pid by looking it up
+ * in the root pid namespace idr. If a task is returned, it must either be
+ * stored in a map, or released with bpf_task_release().
+ * @pid: The pid of the task being looked up.
+ */
+struct task_struct *bpf_task_from_pid(s32 pid)
+{
+ struct task_struct *p;
+
+ rcu_read_lock();
+ p = find_task_by_pid_ns(pid, &init_pid_ns);
+ if (p)
+ bpf_task_acquire(p);
+ rcu_read_unlock();
+
+ return p;
+}
+
+void *bpf_cast_to_kern_ctx(void *obj)
+{
+ return obj;
+}
+
+void *bpf_rdonly_cast(void *obj__ign, u32 btf_id__k)
+{
+ return obj__ign;
+}
+
+void bpf_rcu_read_lock(void)
+{
+ rcu_read_lock();
+}
+
+void bpf_rcu_read_unlock(void)
+{
+ rcu_read_unlock();
+}
+
+__diag_pop();
+
+BTF_SET8_START(generic_btf_ids)
#ifdef CONFIG_KEXEC_CORE
BTF_ID_FLAGS(func, crash_kexec, KF_DESTRUCTIVE)
#endif
-BTF_SET8_END(tracing_btf_ids)
+BTF_ID_FLAGS(func, bpf_obj_new_impl, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_obj_drop_impl, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_list_push_front)
+BTF_ID_FLAGS(func, bpf_list_push_back)
+BTF_ID_FLAGS(func, bpf_list_pop_front, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_list_pop_back, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_task_acquire_not_zero, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_task_release, KF_RELEASE)
+#ifdef CONFIG_CGROUPS
+BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
+BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL)
+#endif
+BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
+BTF_SET8_END(generic_btf_ids)
-static const struct btf_kfunc_id_set tracing_kfunc_set = {
+static const struct btf_kfunc_id_set generic_kfunc_set = {
.owner = THIS_MODULE,
- .set = &tracing_btf_ids,
+ .set = &generic_btf_ids,
+};
+
+
+BTF_ID_LIST(generic_dtor_ids)
+BTF_ID(struct, task_struct)
+BTF_ID(func, bpf_task_release)
+#ifdef CONFIG_CGROUPS
+BTF_ID(struct, cgroup)
+BTF_ID(func, bpf_cgroup_release)
+#endif
+
+BTF_SET8_START(common_btf_ids)
+BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
+BTF_ID_FLAGS(func, bpf_rdonly_cast)
+BTF_ID_FLAGS(func, bpf_rcu_read_lock)
+BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
+BTF_SET8_END(common_btf_ids)
+
+static const struct btf_kfunc_id_set common_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &common_btf_ids,
};
static int __init kfunc_init(void)
{
- return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &tracing_kfunc_set);
+ int ret;
+ const struct btf_id_dtor_kfunc generic_dtors[] = {
+ {
+ .btf_id = generic_dtor_ids[0],
+ .kfunc_btf_id = generic_dtor_ids[1]
+ },
+#ifdef CONFIG_CGROUPS
+ {
+ .btf_id = generic_dtor_ids[2],
+ .kfunc_btf_id = generic_dtor_ids[3]
+ },
+#endif
+ };
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
+ ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
+ ARRAY_SIZE(generic_dtors),
+ THIS_MODULE);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_UNSPEC, &common_kfunc_set);
}
late_initcall(kfunc_init);
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 098cf336fae6..e90d9f63edc5 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -151,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return -EINVAL;
if (unlikely((flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)))
return -EINVAL;
storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map,
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 135205d0d560..38136ec4e095 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
struct bpf_map *inner_map, *inner_map_meta;
u32 inner_map_meta_size;
struct fd f;
+ int ret;
f = fdget(inner_map_ufd);
inner_map = __bpf_map_get(f);
@@ -20,18 +21,13 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
/* Does not support >1 level map-in-map */
if (inner_map->inner_map_meta) {
- fdput(f);
- return ERR_PTR(-EINVAL);
+ ret = -EINVAL;
+ goto put;
}
if (!inner_map->ops->map_meta_equal) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
- }
-
- if (map_value_has_spin_lock(inner_map)) {
- fdput(f);
- return ERR_PTR(-ENOTSUPP);
+ ret = -ENOTSUPP;
+ goto put;
}
inner_map_meta_size = sizeof(*inner_map_meta);
@@ -41,8 +37,8 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
if (!inner_map_meta) {
- fdput(f);
- return ERR_PTR(-ENOMEM);
+ ret = -ENOMEM;
+ goto put;
}
inner_map_meta->map_type = inner_map->map_type;
@@ -50,9 +46,33 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
inner_map_meta->max_entries = inner_map->max_entries;
- inner_map_meta->spin_lock_off = inner_map->spin_lock_off;
- inner_map_meta->timer_off = inner_map->timer_off;
- inner_map_meta->kptr_off_tab = bpf_map_copy_kptr_off_tab(inner_map);
+
+ inner_map_meta->record = btf_record_dup(inner_map->record);
+ if (IS_ERR(inner_map_meta->record)) {
+ /* btf_record_dup returns NULL or valid pointer in case of
+ * invalid/empty/valid, but ERR_PTR in case of errors. During
+ * equality NULL or IS_ERR is equivalent.
+ */
+ ret = PTR_ERR(inner_map_meta->record);
+ goto free;
+ }
+ if (inner_map_meta->record) {
+ struct btf_field_offs *field_offs;
+ /* If btf_record is !IS_ERR_OR_NULL, then field_offs is always
+ * valid.
+ */
+ field_offs = kmemdup(inner_map->field_offs, sizeof(*inner_map->field_offs), GFP_KERNEL | __GFP_NOWARN);
+ if (!field_offs) {
+ ret = -ENOMEM;
+ goto free_rec;
+ }
+ inner_map_meta->field_offs = field_offs;
+ }
+ /* Note: We must use the same BTF, as we also used btf_record_dup above
+ * which relies on BTF being same for both maps, as some members like
+ * record->fields.list_head have pointers like value_rec pointing into
+ * inner_map->btf.
+ */
if (inner_map->btf) {
btf_get(inner_map->btf);
inner_map_meta->btf = inner_map->btf;
@@ -68,11 +88,19 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
fdput(f);
return inner_map_meta;
+free_rec:
+ btf_record_free(inner_map_meta->record);
+free:
+ kfree(inner_map_meta);
+put:
+ fdput(f);
+ return ERR_PTR(ret);
}
void bpf_map_meta_free(struct bpf_map *map_meta)
{
- bpf_map_free_kptr_off_tab(map_meta);
+ kfree(map_meta->field_offs);
+ bpf_map_free_record(map_meta);
btf_put(map_meta->btf);
kfree(map_meta);
}
@@ -84,9 +112,8 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
return meta0->map_type == meta1->map_type &&
meta0->key_size == meta1->key_size &&
meta0->value_size == meta1->value_size &&
- meta0->timer_off == meta1->timer_off &&
meta0->map_flags == meta1->map_flags &&
- bpf_map_equal_kptr_off_tab(meta0, meta1);
+ btf_record_equal(meta0->record, meta1->record);
}
void *bpf_map_fd_get_ptr(struct bpf_map *map,
diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 4901fa1048cd..ebcc3dd0fa19 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -171,9 +171,24 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
memcg = get_memcg(c);
old_memcg = set_active_memcg(memcg);
for (i = 0; i < cnt; i++) {
- obj = __alloc(c, node);
- if (!obj)
- break;
+ /*
+ * free_by_rcu is only manipulated by irq work refill_work().
+ * IRQ works on the same CPU are called sequentially, so it is
+ * safe to use __llist_del_first() here. If alloc_bulk() is
+ * invoked by the initial prefill, there will be no running
+ * refill_work(), so __llist_del_first() is fine as well.
+ *
+ * In most cases, objects on free_by_rcu are from the same CPU.
+ * If some objects come from other CPUs, it doesn't incur any
+ * harm because NUMA_NO_NODE means the preference for current
+ * numa node and it is not a guarantee.
+ */
+ obj = __llist_del_first(&c->free_by_rcu);
+ if (!obj) {
+ obj = __alloc(c, node);
+ if (!obj)
+ break;
+ }
if (IS_ENABLED(CONFIG_PREEMPT_RT))
/* In RT irq_work runs in per-cpu kthread, so disable
* interrupts to avoid preemption and interrupts and
@@ -222,9 +237,13 @@ static void __free_rcu(struct rcu_head *head)
static void __free_rcu_tasks_trace(struct rcu_head *head)
{
- struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
-
- call_rcu(&c->rcu, __free_rcu);
+ /* If RCU Tasks Trace grace period implies RCU grace period,
+ * there is no need to invoke call_rcu().
+ */
+ if (rcu_trace_implies_rcu_gp())
+ __free_rcu(head);
+ else
+ call_rcu(head, __free_rcu);
}
static void enque_to_free(struct bpf_mem_cache *c, void *obj)
@@ -253,8 +272,9 @@ static void do_call_rcu(struct bpf_mem_cache *c)
*/
__llist_add(llnode, &c->waiting_for_gp);
/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
- * Then use call_rcu() to wait for normal progs to finish
- * and finally do free_one() on each element.
+ * If RCU Tasks Trace grace period implies RCU grace period, free
+ * these elements directly, else use call_rcu() to wait for normal
+ * progs to finish and finally do free_one() on each element.
*/
call_rcu_tasks_trace(&c->rcu, __free_rcu_tasks_trace);
}
@@ -444,9 +464,17 @@ static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
/* waiting_for_gp lists was drained, but __free_rcu might
* still execute. Wait for it now before we freeing percpu caches.
+ *
+ * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
+ * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
+ * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
+ * so if call_rcu(head, __free_rcu) is skipped due to
+ * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
+ * using rcu_trace_implies_rcu_gp() as well.
*/
rcu_barrier_tasks_trace();
- rcu_barrier();
+ if (!rcu_trace_implies_rcu_gp())
+ rcu_barrier();
free_mem_alloc_no_barrier(ma);
}
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index b6e7f5c5b9ab..034cf87b54e9 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -100,22 +100,21 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems)
{
struct pcpu_freelist_head *head;
- int i, cpu, pcpu_entries;
+ unsigned int cpu, cpu_idx, i, j, n, m;
- pcpu_entries = nr_elems / num_possible_cpus() + 1;
- i = 0;
+ n = nr_elems / num_possible_cpus();
+ m = nr_elems % num_possible_cpus();
+ cpu_idx = 0;
for_each_possible_cpu(cpu) {
-again:
head = per_cpu_ptr(s->freelist, cpu);
- /* No locking required as this is not visible yet. */
- pcpu_freelist_push_node(head, buf);
- i++;
- buf += elem_size;
- if (i == nr_elems)
- break;
- if (i % pcpu_entries)
- goto again;
+ j = n + (cpu_idx < m ? 1 : 0);
+ for (i = 0; i < j; i++) {
+ /* No locking required as this is not visible yet. */
+ pcpu_freelist_push_node(head, buf);
+ buf += elem_size;
+ }
+ cpu_idx++;
}
}
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 9e832acf4692..80f4b4d88aaf 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -447,7 +447,7 @@ BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
.func = bpf_ringbuf_reserve,
- .ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
+ .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
@@ -490,7 +490,7 @@ BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_submit_proto = {
.func = bpf_ringbuf_submit,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
@@ -503,7 +503,7 @@ BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
const struct bpf_func_proto bpf_ringbuf_discard_proto = {
.func = bpf_ringbuf_discard,
.ret_type = RET_VOID,
- .arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
+ .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE,
.arg2_type = ARG_ANYTHING,
};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 7b373a5e861f..64131f88c553 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -175,8 +175,8 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
synchronize_rcu();
}
-static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
- void *value, __u64 flags)
+static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
+ void *key, void *value, __u64 flags)
{
int err;
@@ -190,7 +190,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
map->map_type == BPF_MAP_TYPE_SOCKMAP) {
return sock_map_update_elem_sys(map, key, value, flags);
} else if (IS_FD_PROG_ARRAY(map)) {
- return bpf_fd_array_map_update_elem(map, f.file, key, value,
+ return bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
}
@@ -205,12 +205,12 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
flags);
} else if (IS_FD_ARRAY(map)) {
rcu_read_lock();
- err = bpf_fd_array_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_array_map_update_elem(map, map_file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
rcu_read_lock();
- err = bpf_fd_htab_map_update_elem(map, f.file, key, value,
+ err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
flags);
rcu_read_unlock();
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
@@ -495,114 +495,181 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
-static int bpf_map_kptr_off_cmp(const void *a, const void *b)
+static int btf_field_cmp(const void *a, const void *b)
{
- const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;
+ const struct btf_field *f1 = a, *f2 = b;
- if (off_desc1->offset < off_desc2->offset)
+ if (f1->offset < f2->offset)
return -1;
- else if (off_desc1->offset > off_desc2->offset)
+ else if (f1->offset > f2->offset)
return 1;
return 0;
}
-struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset)
+struct btf_field *btf_record_find(const struct btf_record *rec, u32 offset,
+ enum btf_field_type type)
{
- /* Since members are iterated in btf_find_field in increasing order,
- * offsets appended to kptr_off_tab are in increasing order, so we can
- * do bsearch to find exact match.
- */
- struct bpf_map_value_off *tab;
+ struct btf_field *field;
- if (!map_value_has_kptrs(map))
+ if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & type))
+ return NULL;
+ field = bsearch(&offset, rec->fields, rec->cnt, sizeof(rec->fields[0]), btf_field_cmp);
+ if (!field || !(field->type & type))
return NULL;
- tab = map->kptr_off_tab;
- return bsearch(&offset, tab->off, tab->nr_off, sizeof(tab->off[0]), bpf_map_kptr_off_cmp);
+ return field;
}
-void bpf_map_free_kptr_off_tab(struct bpf_map *map)
+void btf_record_free(struct btf_record *rec)
{
- struct bpf_map_value_off *tab = map->kptr_off_tab;
int i;
- if (!map_value_has_kptrs(map))
+ if (IS_ERR_OR_NULL(rec))
return;
- for (i = 0; i < tab->nr_off; i++) {
- if (tab->off[i].kptr.module)
- module_put(tab->off[i].kptr.module);
- btf_put(tab->off[i].kptr.btf);
+ for (i = 0; i < rec->cnt; i++) {
+ switch (rec->fields[i].type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ if (rec->fields[i].kptr.module)
+ module_put(rec->fields[i].kptr.module);
+ btf_put(rec->fields[i].kptr.btf);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ /* Nothing to release for bpf_list_head */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
}
- kfree(tab);
- map->kptr_off_tab = NULL;
+ kfree(rec);
+}
+
+void bpf_map_free_record(struct bpf_map *map)
+{
+ btf_record_free(map->record);
+ map->record = NULL;
}
-struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map)
+struct btf_record *btf_record_dup(const struct btf_record *rec)
{
- struct bpf_map_value_off *tab = map->kptr_off_tab, *new_tab;
- int size, i;
+ const struct btf_field *fields;
+ struct btf_record *new_rec;
+ int ret, size, i;
- if (!map_value_has_kptrs(map))
- return ERR_PTR(-ENOENT);
- size = offsetof(struct bpf_map_value_off, off[tab->nr_off]);
- new_tab = kmemdup(tab, size, GFP_KERNEL | __GFP_NOWARN);
- if (!new_tab)
+ if (IS_ERR_OR_NULL(rec))
+ return NULL;
+ size = offsetof(struct btf_record, fields[rec->cnt]);
+ new_rec = kmemdup(rec, size, GFP_KERNEL | __GFP_NOWARN);
+ if (!new_rec)
return ERR_PTR(-ENOMEM);
- /* Do a deep copy of the kptr_off_tab */
- for (i = 0; i < tab->nr_off; i++) {
- btf_get(tab->off[i].kptr.btf);
- if (tab->off[i].kptr.module && !try_module_get(tab->off[i].kptr.module)) {
- while (i--) {
- if (tab->off[i].kptr.module)
- module_put(tab->off[i].kptr.module);
- btf_put(tab->off[i].kptr.btf);
+ /* Do a deep copy of the btf_record */
+ fields = rec->fields;
+ new_rec->cnt = 0;
+ for (i = 0; i < rec->cnt; i++) {
+ switch (fields[i].type) {
+ case BPF_SPIN_LOCK:
+ case BPF_TIMER:
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ btf_get(fields[i].kptr.btf);
+ if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
+ ret = -ENXIO;
+ goto free;
}
- kfree(new_tab);
- return ERR_PTR(-ENXIO);
+ break;
+ case BPF_LIST_HEAD:
+ case BPF_LIST_NODE:
+ /* Nothing to acquire for bpf_list_head */
+ break;
+ default:
+ ret = -EFAULT;
+ WARN_ON_ONCE(1);
+ goto free;
}
+ new_rec->cnt++;
}
- return new_tab;
+ return new_rec;
+free:
+ btf_record_free(new_rec);
+ return ERR_PTR(ret);
}
-bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b)
+bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b)
{
- struct bpf_map_value_off *tab_a = map_a->kptr_off_tab, *tab_b = map_b->kptr_off_tab;
- bool a_has_kptr = map_value_has_kptrs(map_a), b_has_kptr = map_value_has_kptrs(map_b);
+ bool a_has_fields = !IS_ERR_OR_NULL(rec_a), b_has_fields = !IS_ERR_OR_NULL(rec_b);
int size;
- if (!a_has_kptr && !b_has_kptr)
+ if (!a_has_fields && !b_has_fields)
return true;
- if (a_has_kptr != b_has_kptr)
+ if (a_has_fields != b_has_fields)
return false;
- if (tab_a->nr_off != tab_b->nr_off)
+ if (rec_a->cnt != rec_b->cnt)
return false;
- size = offsetof(struct bpf_map_value_off, off[tab_a->nr_off]);
- return !memcmp(tab_a, tab_b, size);
+ size = offsetof(struct btf_record, fields[rec_a->cnt]);
+ /* btf_parse_fields uses kzalloc to allocate a btf_record, so unused
+ * members are zeroed out. So memcmp is safe to do without worrying
+ * about padding/unused fields.
+ *
+ * While spin_lock, timer, and kptr have no relation to map BTF,
+ * list_head metadata is specific to map BTF, the btf and value_rec
+ * members in particular. btf is the map BTF, while value_rec points to
+ * btf_record in that map BTF.
+ *
+ * So while by default, we don't rely on the map BTF (which the records
+ * were parsed from) matching for both records, which is not backwards
+ * compatible, in case list_head is part of it, we implicitly rely on
+ * that by way of depending on memcmp succeeding for it.
+ */
+ return !memcmp(rec_a, rec_b, size);
}
-/* Caller must ensure map_value_has_kptrs is true. Note that this function can
- * be called on a map value while the map_value is visible to BPF programs, as
- * it ensures the correct synchronization, and we already enforce the same using
- * the bpf_kptr_xchg helper on the BPF program side for referenced kptrs.
- */
-void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
+void bpf_obj_free_timer(const struct btf_record *rec, void *obj)
{
- struct bpf_map_value_off *tab = map->kptr_off_tab;
- unsigned long *btf_id_ptr;
- int i;
+ if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER)))
+ return;
+ bpf_timer_cancel_and_free(obj + rec->timer_off);
+}
- for (i = 0; i < tab->nr_off; i++) {
- struct bpf_map_value_off_desc *off_desc = &tab->off[i];
- unsigned long old_ptr;
+void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
+{
+ const struct btf_field *fields;
+ int i;
- btf_id_ptr = map_value + off_desc->offset;
- if (off_desc->type == BPF_KPTR_UNREF) {
- u64 *p = (u64 *)btf_id_ptr;
+ if (IS_ERR_OR_NULL(rec))
+ return;
+ fields = rec->fields;
+ for (i = 0; i < rec->cnt; i++) {
+ const struct btf_field *field = &fields[i];
+ void *field_ptr = obj + field->offset;
- WRITE_ONCE(*p, 0);
+ switch (fields[i].type) {
+ case BPF_SPIN_LOCK:
+ break;
+ case BPF_TIMER:
+ bpf_timer_cancel_and_free(field_ptr);
+ break;
+ case BPF_KPTR_UNREF:
+ WRITE_ONCE(*(u64 *)field_ptr, 0);
+ break;
+ case BPF_KPTR_REF:
+ field->kptr.dtor((void *)xchg((unsigned long *)field_ptr, 0));
+ break;
+ case BPF_LIST_HEAD:
+ if (WARN_ON_ONCE(rec->spin_lock_off < 0))
+ continue;
+ bpf_list_head_free(field, field_ptr, obj + rec->spin_lock_off);
+ break;
+ case BPF_LIST_NODE:
+ break;
+ default:
+ WARN_ON_ONCE(1);
continue;
}
- old_ptr = xchg(btf_id_ptr, 0);
- off_desc->kptr.dtor((void *)old_ptr);
}
}
@@ -610,14 +677,24 @@ void bpf_map_free_kptrs(struct bpf_map *map, void *map_value)
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
+ struct btf_field_offs *foffs = map->field_offs;
+ struct btf_record *rec = map->record;
security_bpf_map_free(map);
- kfree(map->off_arr);
bpf_map_release_memcg(map);
- /* implementation dependent freeing, map_free callback also does
- * bpf_map_free_kptr_off_tab, if needed.
- */
+ /* implementation dependent freeing */
map->ops->map_free(map);
+ /* Delay freeing of field_offs and btf_record for maps, as map_free
+ * callback usually needs access to them. It is better to do it here
+ * than require each callback to do the free itself manually.
+ *
+ * Note that the btf_record stashed in map->inner_map_meta->record was
+ * already freed using the map_free callback for map in map case which
+ * eventually calls bpf_map_free_meta, since inner_map_meta is only a
+ * template bpf_map struct used during verification.
+ */
+ kfree(foffs);
+ btf_record_free(rec);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -778,8 +855,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
struct bpf_map *map = filp->private_data;
int err;
- if (!map->ops->map_mmap || map_value_has_spin_lock(map) ||
- map_value_has_timer(map) || map_value_has_kptrs(map))
+ if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
if (!(vma->vm_flags & VM_SHARED))
@@ -906,84 +982,6 @@ int map_check_no_btf(const struct bpf_map *map,
return -ENOTSUPP;
}
-static int map_off_arr_cmp(const void *_a, const void *_b, const void *priv)
-{
- const u32 a = *(const u32 *)_a;
- const u32 b = *(const u32 *)_b;
-
- if (a < b)
- return -1;
- else if (a > b)
- return 1;
- return 0;
-}
-
-static void map_off_arr_swap(void *_a, void *_b, int size, const void *priv)
-{
- struct bpf_map *map = (struct bpf_map *)priv;
- u32 *off_base = map->off_arr->field_off;
- u32 *a = _a, *b = _b;
- u8 *sz_a, *sz_b;
-
- sz_a = map->off_arr->field_sz + (a - off_base);
- sz_b = map->off_arr->field_sz + (b - off_base);
-
- swap(*a, *b);
- swap(*sz_a, *sz_b);
-}
-
-static int bpf_map_alloc_off_arr(struct bpf_map *map)
-{
- bool has_spin_lock = map_value_has_spin_lock(map);
- bool has_timer = map_value_has_timer(map);
- bool has_kptrs = map_value_has_kptrs(map);
- struct bpf_map_off_arr *off_arr;
- u32 i;
-
- if (!has_spin_lock && !has_timer && !has_kptrs) {
- map->off_arr = NULL;
- return 0;
- }
-
- off_arr = kmalloc(sizeof(*map->off_arr), GFP_KERNEL | __GFP_NOWARN);
- if (!off_arr)
- return -ENOMEM;
- map->off_arr = off_arr;
-
- off_arr->cnt = 0;
- if (has_spin_lock) {
- i = off_arr->cnt;
-
- off_arr->field_off[i] = map->spin_lock_off;
- off_arr->field_sz[i] = sizeof(struct bpf_spin_lock);
- off_arr->cnt++;
- }
- if (has_timer) {
- i = off_arr->cnt;
-
- off_arr->field_off[i] = map->timer_off;
- off_arr->field_sz[i] = sizeof(struct bpf_timer);
- off_arr->cnt++;
- }
- if (has_kptrs) {
- struct bpf_map_value_off *tab = map->kptr_off_tab;
- u32 *off = &off_arr->field_off[off_arr->cnt];
- u8 *sz = &off_arr->field_sz[off_arr->cnt];
-
- for (i = 0; i < tab->nr_off; i++) {
- *off++ = tab->off[i].offset;
- *sz++ = sizeof(u64);
- }
- off_arr->cnt += tab->nr_off;
- }
-
- if (off_arr->cnt == 1)
- return 0;
- sort_r(off_arr->field_off, off_arr->cnt, sizeof(off_arr->field_off[0]),
- map_off_arr_cmp, map_off_arr_swap, map);
- return 0;
-}
-
static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 btf_key_id, u32 btf_value_id)
{
@@ -1006,39 +1004,12 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
if (!value_type || value_size != map->value_size)
return -EINVAL;
- map->spin_lock_off = btf_find_spin_lock(btf, value_type);
-
- if (map_value_has_spin_lock(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
- map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
- map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
- map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
- return -ENOTSUPP;
- if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
- map->value_size) {
- WARN_ONCE(1,
- "verifier bug spin_lock_off %d value_size %d\n",
- map->spin_lock_off, map->value_size);
- return -EFAULT;
- }
- }
+ map->record = btf_parse_fields(btf, value_type,
+ BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD,
+ map->value_size);
+ if (!IS_ERR_OR_NULL(map->record)) {
+ int i;
- map->timer_off = btf_find_timer(btf, value_type);
- if (map_value_has_timer(map)) {
- if (map->map_flags & BPF_F_RDONLY_PROG)
- return -EACCES;
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY)
- return -EOPNOTSUPP;
- }
-
- map->kptr_off_tab = btf_parse_kptrs(btf, value_type);
- if (map_value_has_kptrs(map)) {
if (!bpf_capable()) {
ret = -EPERM;
goto free_map_tab;
@@ -1047,15 +1018,60 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
ret = -EACCES;
goto free_map_tab;
}
- if (map->map_type != BPF_MAP_TYPE_HASH &&
- map->map_type != BPF_MAP_TYPE_LRU_HASH &&
- map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
- ret = -EOPNOTSUPP;
- goto free_map_tab;
+ for (i = 0; i < sizeof(map->record->field_mask) * 8; i++) {
+ switch (map->record->field_mask & (1 << i)) {
+ case 0:
+ continue;
+ case BPF_SPIN_LOCK:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_INODE_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_TASK_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_TIMER:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ case BPF_LIST_HEAD:
+ if (map->map_type != BPF_MAP_TYPE_HASH &&
+ map->map_type != BPF_MAP_TYPE_LRU_HASH &&
+ map->map_type != BPF_MAP_TYPE_ARRAY) {
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
+ break;
+ default:
+ /* Fail if map_type checks are missing for a field type */
+ ret = -EOPNOTSUPP;
+ goto free_map_tab;
+ }
}
}
+ ret = btf_check_and_fixup_fields(btf, map->record);
+ if (ret < 0)
+ goto free_map_tab;
+
if (map->ops->map_check_btf) {
ret = map->ops->map_check_btf(map, btf, key_type, value_type);
if (ret < 0)
@@ -1064,7 +1080,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
free_map_tab:
- bpf_map_free_kptr_off_tab(map);
+ bpf_map_free_record(map);
return ret;
}
@@ -1073,6 +1089,7 @@ free_map_tab:
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
+ struct btf_field_offs *foffs;
struct bpf_map *map;
int f_flags;
int err;
@@ -1117,8 +1134,6 @@ static int map_create(union bpf_attr *attr)
mutex_init(&map->freeze_mutex);
spin_lock_init(&map->owner.lock);
- map->spin_lock_off = -EINVAL;
- map->timer_off = -EINVAL;
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
* the bpf_prog.o must have BTF to begin with
@@ -1154,13 +1169,17 @@ static int map_create(union bpf_attr *attr)
attr->btf_vmlinux_value_type_id;
}
- err = bpf_map_alloc_off_arr(map);
- if (err)
+
+ foffs = btf_parse_field_offs(map->record);
+ if (IS_ERR(foffs)) {
+ err = PTR_ERR(foffs);
goto free_map;
+ }
+ map->field_offs = foffs;
err = security_bpf_map_alloc(map);
if (err)
- goto free_map_off_arr;
+ goto free_map_field_offs;
err = bpf_map_alloc_id(map);
if (err)
@@ -1184,8 +1203,8 @@ static int map_create(union bpf_attr *attr)
free_map_sec:
security_bpf_map_free(map);
-free_map_off_arr:
- kfree(map->off_arr);
+free_map_field_offs:
+ kfree(map->field_offs);
free_map:
btf_put(map->btf);
map->ops->map_free(map);
@@ -1332,7 +1351,7 @@ static int map_lookup_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1405,7 +1424,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1423,7 +1442,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto free_key;
}
- err = bpf_map_update_value(map, f, key, value, attr->flags);
+ err = bpf_map_update_value(map, f.file, key, value, attr->flags);
kvfree(value);
free_key:
@@ -1568,7 +1587,7 @@ int generic_map_delete_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1609,23 +1628,21 @@ int generic_map_delete_batch(struct bpf_map *map,
return err;
}
-int generic_map_update_batch(struct bpf_map *map,
+int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
- int ufd = attr->batch.map_fd;
void *key, *value;
- struct fd f;
int err = 0;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
return -EINVAL;
}
@@ -1645,7 +1662,6 @@ int generic_map_update_batch(struct bpf_map *map,
return -ENOMEM;
}
- f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
@@ -1653,7 +1669,7 @@ int generic_map_update_batch(struct bpf_map *map,
copy_from_user(value, values + cp * value_size, value_size))
break;
- err = bpf_map_update_value(map, f, key, value,
+ err = bpf_map_update_value(map, map_file, key, value,
attr->batch.elem_flags);
if (err)
@@ -1666,7 +1682,6 @@ int generic_map_update_batch(struct bpf_map *map,
kvfree(value);
kvfree(key);
- fdput(f);
return err;
}
@@ -1688,7 +1703,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
return -EINVAL;
if ((attr->batch.elem_flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map))
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK))
return -EINVAL;
value_size = bpf_map_value_size(map);
@@ -1810,7 +1825,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
}
if ((attr->flags & BPF_F_LOCK) &&
- !map_value_has_spin_lock(map)) {
+ !btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
err = -EINVAL;
goto err_put;
}
@@ -1881,8 +1896,7 @@ static int map_freeze(const union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS ||
- map_value_has_timer(map) || map_value_has_kptrs(map)) {
+ if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) {
fdput(f);
return -ENOTSUPP;
}
@@ -2117,11 +2131,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,
st = per_cpu_ptr(prog->stats, cpu);
do {
- start = u64_stats_fetch_begin_irq(&st->syncp);
+ start = u64_stats_fetch_begin(&st->syncp);
tnsecs = u64_stats_read(&st->nsecs);
tcnt = u64_stats_read(&st->cnt);
tmisses = u64_stats_read(&st->misses);
- } while (u64_stats_fetch_retry_irq(&st->syncp, start));
+ } while (u64_stats_fetch_retry(&st->syncp, start));
nsecs += tnsecs;
cnt += tcnt;
misses += tmisses;
@@ -3504,9 +3518,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_LSM:
if (ptype == BPF_PROG_TYPE_LSM &&
prog->expected_attach_type != BPF_LSM_CGROUP)
- return -EINVAL;
-
- ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+ ret = -EINVAL;
+ else
+ ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
default:
ret = -EINVAL;
@@ -4460,13 +4474,13 @@ put_file:
#define BPF_MAP_BATCH_LAST_FIELD batch.flags
-#define BPF_DO_BATCH(fn) \
+#define BPF_DO_BATCH(fn, ...) \
do { \
if (!fn) { \
err = -ENOTSUPP; \
goto err_put; \
} \
- err = fn(map, attr, uattr); \
+ err = fn(__VA_ARGS__); \
} while (0)
static int bpf_map_do_batch(const union bpf_attr *attr,
@@ -4500,13 +4514,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
}
if (cmd == BPF_MAP_LOOKUP_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_batch, map, attr, uattr);
else if (cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH)
- BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch);
+ BPF_DO_BATCH(map->ops->map_lookup_and_delete_batch, map, attr, uattr);
else if (cmd == BPF_MAP_UPDATE_BATCH)
- BPF_DO_BATCH(map->ops->map_update_batch);
+ BPF_DO_BATCH(map->ops->map_update_batch, map, f.file, attr, uattr);
else
- BPF_DO_BATCH(map->ops->map_delete_batch);
+ BPF_DO_BATCH(map->ops->map_delete_batch, map, attr, uattr);
err_put:
if (has_write)
bpf_map_write_active_dec(map);
@@ -5133,13 +5147,14 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
run_ctx.bpf_cookie = 0;
run_ctx.saved_run_ctx = NULL;
- if (!__bpf_prog_enter_sleepable(prog, &run_ctx)) {
+ if (!__bpf_prog_enter_sleepable_recur(prog, &run_ctx)) {
/* recursion detected */
bpf_prog_put(prog);
return -EBUSY;
}
attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
- __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */, &run_ctx);
+ __bpf_prog_exit_sleepable_recur(prog, 0 /* bpf_prog_run does runtime stats */,
+ &run_ctx);
bpf_prog_put(prog);
return 0;
#endif
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index bf0906e1e2b9..11f5ec0b8016 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -468,8 +468,7 @@ again:
if (err < 0)
goto out;
- set_memory_ro((long)im->image, 1);
- set_memory_x((long)im->image, 1);
+ set_memory_rox((long)im->image, 1);
WARN_ON(tr->cur_image && tr->selector == 0);
WARN_ON(!tr->cur_image && tr->selector);
@@ -864,7 +863,7 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
* [2..MAX_U64] - execute bpf prog and record execution time.
* This is start time.
*/
-u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
rcu_read_lock();
@@ -901,7 +900,8 @@ static void notrace update_prog_stats(struct bpf_prog *prog,
}
}
-void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx)
+static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
@@ -912,8 +912,8 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_
rcu_read_unlock();
}
-u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
- struct bpf_tramp_run_ctx *run_ctx)
+static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
/* Runtime stats are exported via actual BPF_LSM_CGROUP
@@ -927,8 +927,8 @@ u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
return NO_START_TIME;
}
-void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx)
+static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
@@ -937,7 +937,8 @@ void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
rcu_read_unlock();
}
-u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
+u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
{
rcu_read_lock_trace();
migrate_disable();
@@ -953,8 +954,8 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_r
return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx)
+void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
@@ -964,8 +965,30 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
rcu_read_unlock_trace();
}
-u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
- struct bpf_tramp_run_ctx *run_ctx)
+static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return bpf_prog_start_time();
+}
+
+static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ update_prog_stats(prog, start);
+ migrate_enable();
+ rcu_read_unlock_trace();
+}
+
+static u64 notrace __bpf_prog_enter(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
__acquires(RCU)
{
rcu_read_lock();
@@ -976,8 +999,8 @@ u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog,
return bpf_prog_start_time();
}
-void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start,
- struct bpf_tramp_run_ctx *run_ctx)
+static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
__releases(RCU)
{
bpf_reset_run_ctx(run_ctx->saved_run_ctx);
@@ -997,6 +1020,36 @@ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
percpu_ref_put(&tr->pcref);
}
+bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_enter_sleepable_recur :
+ __bpf_prog_enter_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_enter_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter;
+}
+
+bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog)
+{
+ bool sleepable = prog->aux->sleepable;
+
+ if (bpf_prog_check_recur(prog))
+ return sleepable ? __bpf_prog_exit_sleepable_recur :
+ __bpf_prog_exit_recur;
+
+ if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type == BPF_LSM_CGROUP)
+ return __bpf_prog_exit_lsm_cgroup;
+
+ return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit;
+}
+
int __weak
arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7f0a9f6cb889..a5255a0dcbb6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -262,7 +262,7 @@ struct bpf_call_arg_meta {
struct btf *ret_btf;
u32 ret_btf_id;
u32 subprogno;
- struct bpf_map_value_off_desc *kptr_off_desc;
+ struct btf_field *kptr_field;
u8 uninit_dynptr_regno;
};
@@ -451,17 +451,29 @@ static bool reg_type_not_null(enum bpf_reg_type type)
type == PTR_TO_SOCK_COMMON;
}
-static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+static bool type_is_ptr_alloc_obj(u32 type)
{
- return reg->type == PTR_TO_MAP_VALUE &&
- map_value_has_spin_lock(reg->map_ptr);
+ return base_type(type) == PTR_TO_BTF_ID && type_flag(type) & MEM_ALLOC;
}
-static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
+static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg)
{
- type = base_type(type);
- return type == PTR_TO_SOCKET || type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_MEM || type == PTR_TO_BTF_ID;
+ struct btf_record *rec = NULL;
+ struct btf_struct_meta *meta;
+
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ rec = reg->map_ptr->record;
+ } else if (type_is_ptr_alloc_obj(reg->type)) {
+ meta = btf_find_struct_meta(reg->btf, reg->btf_id);
+ if (meta)
+ rec = meta->record;
+ }
+ return rec;
+}
+
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+ return btf_record_has_field(reg_btf_record(reg), BPF_SPIN_LOCK);
}
static bool type_is_rdonly_mem(u32 type)
@@ -511,6 +523,23 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_dynptr_data;
}
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_for_each_map_elem ||
+ func_id == BPF_FUNC_timer_set_callback ||
+ func_id == BPF_FUNC_find_vma ||
+ func_id == BPF_FUNC_loop ||
+ func_id == BPF_FUNC_user_ringbuf_drain;
+}
+
+static bool is_storage_get_function(enum bpf_func_id func_id)
+{
+ return func_id == BPF_FUNC_sk_storage_get ||
+ func_id == BPF_FUNC_inode_storage_get ||
+ func_id == BPF_FUNC_task_storage_get ||
+ func_id == BPF_FUNC_cgrp_storage_get;
+}
+
static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -541,7 +570,7 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
static const char *reg_type_str(struct bpf_verifier_env *env,
enum bpf_reg_type type)
{
- char postfix[16] = {0}, prefix[32] = {0};
+ char postfix[16] = {0}, prefix[64] = {0};
static const char * const str[] = {
[NOT_INIT] = "?",
[SCALAR_VALUE] = "scalar",
@@ -563,7 +592,7 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
[PTR_TO_MAP_KEY] = "map_key",
- [PTR_TO_DYNPTR] = "dynptr_ptr",
+ [CONST_PTR_TO_DYNPTR] = "dynptr_ptr",
};
if (type & PTR_MAYBE_NULL) {
@@ -573,16 +602,15 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
strncpy(postfix, "_or_null", 16);
}
- if (type & MEM_RDONLY)
- strncpy(prefix, "rdonly_", 32);
- if (type & MEM_ALLOC)
- strncpy(prefix, "alloc_", 32);
- if (type & MEM_USER)
- strncpy(prefix, "user_", 32);
- if (type & MEM_PERCPU)
- strncpy(prefix, "percpu_", 32);
- if (type & PTR_UNTRUSTED)
- strncpy(prefix, "untrusted_", 32);
+ snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s",
+ type & MEM_RDONLY ? "rdonly_" : "",
+ type & MEM_RINGBUF ? "ringbuf_" : "",
+ type & MEM_USER ? "user_" : "",
+ type & MEM_PERCPU ? "percpu_" : "",
+ type & MEM_RCU ? "rcu_" : "",
+ type & PTR_UNTRUSTED ? "untrusted_" : "",
+ type & PTR_TRUSTED ? "trusted_" : ""
+ );
snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
prefix, str[base_type(type)], postfix);
@@ -697,6 +725,28 @@ static bool dynptr_type_refcounted(enum bpf_dynptr_type type)
return type == BPF_DYNPTR_TYPE_RINGBUF;
}
+static void __mark_dynptr_reg(struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type,
+ bool first_slot);
+
+static void __mark_reg_not_init(const struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg);
+
+static void mark_dynptr_stack_regs(struct bpf_reg_state *sreg1,
+ struct bpf_reg_state *sreg2,
+ enum bpf_dynptr_type type)
+{
+ __mark_dynptr_reg(sreg1, type, true);
+ __mark_dynptr_reg(sreg2, type, false);
+}
+
+static void mark_dynptr_cb_reg(struct bpf_reg_state *reg,
+ enum bpf_dynptr_type type)
+{
+ __mark_dynptr_reg(reg, type, true);
+}
+
+
static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
enum bpf_arg_type arg_type, int insn_idx)
{
@@ -718,9 +768,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (type == BPF_DYNPTR_TYPE_INVALID)
return -EINVAL;
- state->stack[spi].spilled_ptr.dynptr.first_slot = true;
- state->stack[spi].spilled_ptr.dynptr.type = type;
- state->stack[spi - 1].spilled_ptr.dynptr.type = type;
+ mark_dynptr_stack_regs(&state->stack[spi].spilled_ptr,
+ &state->stack[spi - 1].spilled_ptr, type);
if (dynptr_type_refcounted(type)) {
/* The id is used to track proper releasing */
@@ -728,8 +777,8 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (id < 0)
return id;
- state->stack[spi].spilled_ptr.id = id;
- state->stack[spi - 1].spilled_ptr.id = id;
+ state->stack[spi].spilled_ptr.ref_obj_id = id;
+ state->stack[spi - 1].spilled_ptr.ref_obj_id = id;
}
return 0;
@@ -751,25 +800,23 @@ static int unmark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_re
}
/* Invalidate any slices associated with this dynptr */
- if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type)) {
- release_reference(env, state->stack[spi].spilled_ptr.id);
- state->stack[spi].spilled_ptr.id = 0;
- state->stack[spi - 1].spilled_ptr.id = 0;
- }
-
- state->stack[spi].spilled_ptr.dynptr.first_slot = false;
- state->stack[spi].spilled_ptr.dynptr.type = 0;
- state->stack[spi - 1].spilled_ptr.dynptr.type = 0;
+ if (dynptr_type_refcounted(state->stack[spi].spilled_ptr.dynptr.type))
+ WARN_ON_ONCE(release_reference(env, state->stack[spi].spilled_ptr.ref_obj_id));
+ __mark_reg_not_init(env, &state->stack[spi].spilled_ptr);
+ __mark_reg_not_init(env, &state->stack[spi - 1].spilled_ptr);
return 0;
}
static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
- int i;
+ int spi, i;
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return false;
+
+ spi = get_spi(reg->off);
if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS))
return true;
@@ -782,13 +829,17 @@ static bool is_dynptr_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_
return true;
}
-bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg)
+static bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
+ int spi;
int i;
+ /* This already represents first slot of initialized bpf_dynptr */
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return true;
+
+ spi = get_spi(reg->off);
if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
!state->stack[spi].spilled_ptr.dynptr.first_slot)
return false;
@@ -802,21 +853,24 @@ bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env,
return true;
}
-bool is_dynptr_type_expected(struct bpf_verifier_env *env,
- struct bpf_reg_state *reg,
- enum bpf_arg_type arg_type)
+static bool is_dynptr_type_expected(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ enum bpf_arg_type arg_type)
{
struct bpf_func_state *state = func(env, reg);
enum bpf_dynptr_type dynptr_type;
- int spi = get_spi(reg->off);
+ int spi;
/* ARG_PTR_TO_DYNPTR takes any type of dynptr */
if (arg_type == ARG_PTR_TO_DYNPTR)
return true;
dynptr_type = arg_to_dynptr_type(arg_type);
-
- return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+ if (reg->type == CONST_PTR_TO_DYNPTR) {
+ return reg->dynptr.type == dynptr_type;
+ } else {
+ spi = get_spi(reg->off);
+ return state->stack[spi].spilled_ptr.dynptr.type == dynptr_type;
+ }
}
/* The reg state of a pointer or a bounded scalar was saved when
@@ -875,7 +929,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (reg->id)
verbose_a("id=%d", reg->id);
- if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id)
+ if (reg->ref_obj_id)
verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (t != SCALAR_VALUE)
verbose_a("off=%d", reg->off);
@@ -1008,9 +1062,9 @@ static void *copy_array(void *dst, const void *src, size_t n, size_t size, gfp_t
if (unlikely(check_mul_overflow(n, size, &bytes)))
return NULL;
- if (ksize(dst) < bytes) {
+ if (ksize(dst) < ksize(src)) {
kfree(dst);
- dst = kmalloc_track_caller(bytes, flags);
+ dst = kmalloc_track_caller(kmalloc_size_roundup(bytes), flags);
if (!dst)
return NULL;
}
@@ -1027,12 +1081,19 @@ out:
*/
static void *realloc_array(void *arr, size_t old_n, size_t new_n, size_t size)
{
+ size_t alloc_size;
+ void *new_arr;
+
if (!new_n || old_n == new_n)
goto out;
- arr = krealloc_array(arr, new_n, size, GFP_KERNEL);
- if (!arr)
+ alloc_size = kmalloc_size_roundup(size_mul(new_n, size));
+ new_arr = krealloc(arr, alloc_size, GFP_KERNEL);
+ if (!new_arr) {
+ kfree(arr);
return NULL;
+ }
+ arr = new_arr;
if (new_n > old_n)
memset(arr + old_n * size, 0, (new_n - old_n) * size);
@@ -1199,8 +1260,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->frame[i] = NULL;
}
dst_state->speculative = src->speculative;
+ dst_state->active_rcu_lock = src->active_rcu_lock;
dst_state->curframe = src->curframe;
- dst_state->active_spin_lock = src->active_spin_lock;
+ dst_state->active_lock.ptr = src->active_lock.ptr;
+ dst_state->active_lock.id = src->active_lock.id;
dst_state->branches = src->branches;
dst_state->parent = src->parent;
dst_state->first_insn_idx = src->first_insn_idx;
@@ -1319,9 +1382,6 @@ static const int caller_saved[CALLER_SAVED_REGS] = {
BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
};
-static void __mark_reg_not_init(const struct bpf_verifier_env *env,
- struct bpf_reg_state *reg);
-
/* This helper doesn't clear reg->id */
static void ___mark_reg_known(struct bpf_reg_state *reg, u64 imm)
{
@@ -1384,6 +1444,19 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
__mark_reg_known_zero(regs + regno);
}
+static void __mark_dynptr_reg(struct bpf_reg_state *reg, enum bpf_dynptr_type type,
+ bool first_slot)
+{
+ /* reg->type has no meaning for STACK_DYNPTR, but when we set reg for
+ * callback arguments, it does need to be CONST_PTR_TO_DYNPTR, so simply
+ * set it unconditionally as it is ignored for STACK_DYNPTR anyway.
+ */
+ __mark_reg_known_zero(reg);
+ reg->type = CONST_PTR_TO_DYNPTR;
+ reg->dynptr.type = type;
+ reg->dynptr.first_slot = first_slot;
+}
+
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
@@ -1395,7 +1468,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
/* transfer reg's id which is unique for every map_lookup_elem
* as UID of the inner map.
*/
- if (map_value_has_timer(map->inner_map_meta))
+ if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER))
reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
@@ -1684,7 +1757,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env,
reg->type = SCALAR_VALUE;
reg->var_off = tnum_unknown;
reg->frameno = 0;
- reg->precise = env->subprog_cnt > 1 || !env->bpf_capable;
+ reg->precise = !env->bpf_capable;
__mark_reg_unbounded(reg);
}
@@ -2493,15 +2566,30 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].jmp_point = true;
+}
+
+static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].jmp_point;
+}
+
/* for any branch, call, exit record the history of jmps in the given state */
static int push_jmp_history(struct bpf_verifier_env *env,
struct bpf_verifier_state *cur)
{
u32 cnt = cur->jmp_history_cnt;
struct bpf_idx_pair *p;
+ size_t alloc_size;
+
+ if (!is_jmp_point(env, env->insn_idx))
+ return 0;
cnt++;
- p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
if (!p)
return -ENOMEM;
p[cnt - 1].idx = env->insn_idx;
@@ -2653,6 +2741,11 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (opcode == BPF_CALL) {
if (insn->src_reg == BPF_PSEUDO_CALL)
return -ENOTSUPP;
+ /* BPF helpers that invoke callback subprogs are
+ * equivalent to BPF_PSEUDO_CALL above
+ */
+ if (insn->src_reg == 0 && is_callback_calling_function(insn->imm))
+ return -ENOTSUPP;
/* regular helper call sets R0 */
*reg_mask &= ~1;
if (*reg_mask & 0x3f) {
@@ -2742,8 +2835,11 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
/* big hammer: mark all scalars precise in this path.
* pop_stack may still get !precise scalars.
+ * We also skip current state and go straight to first parent state,
+ * because precision markings in current non-checkpointed state are
+ * not needed. See why in the comment in __mark_chain_precision below.
*/
- for (; st; st = st->parent)
+ for (st = st->parent; st; st = st->parent) {
for (i = 0; i <= st->curframe; i++) {
func = st->frame[i];
for (j = 0; j < BPF_REG_FP; j++) {
@@ -2761,9 +2857,122 @@ static void mark_all_scalars_precise(struct bpf_verifier_env *env,
reg->precise = true;
}
}
+ }
}
-static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
+static void mark_all_scalars_imprecise(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (!is_spilled_reg(&func->stack[j]))
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = false;
+ }
+ }
+}
+
+/*
+ * __mark_chain_precision() backtracks BPF program instruction sequence and
+ * chain of verifier states making sure that register *regno* (if regno >= 0)
+ * and/or stack slot *spi* (if spi >= 0) are marked as precisely tracked
+ * SCALARS, as well as any other registers and slots that contribute to
+ * a tracked state of given registers/stack slots, depending on specific BPF
+ * assembly instructions (see backtrack_insns() for exact instruction handling
+ * logic). This backtracking relies on recorded jmp_history and is able to
+ * traverse entire chain of parent states. This process ends only when all the
+ * necessary registers/slots and their transitive dependencies are marked as
+ * precise.
+ *
+ * One important and subtle aspect is that precise marks *do not matter* in
+ * the currently verified state (current state). It is important to understand
+ * why this is the case.
+ *
+ * First, note that current state is the state that is not yet "checkpointed",
+ * i.e., it is not yet put into env->explored_states, and it has no children
+ * states as well. It's ephemeral, and can end up either a) being discarded if
+ * compatible explored state is found at some point or BPF_EXIT instruction is
+ * reached or b) checkpointed and put into env->explored_states, branching out
+ * into one or more children states.
+ *
+ * In the former case, precise markings in current state are completely
+ * ignored by state comparison code (see regsafe() for details). Only
+ * checkpointed ("old") state precise markings are important, and if old
+ * state's register/slot is precise, regsafe() assumes current state's
+ * register/slot as precise and checks value ranges exactly and precisely. If
+ * states turn out to be compatible, current state's necessary precise
+ * markings and any required parent states' precise markings are enforced
+ * after the fact with propagate_precision() logic, after the fact. But it's
+ * important to realize that in this case, even after marking current state
+ * registers/slots as precise, we immediately discard current state. So what
+ * actually matters is any of the precise markings propagated into current
+ * state's parent states, which are always checkpointed (due to b) case above).
+ * As such, for scenario a) it doesn't matter if current state has precise
+ * markings set or not.
+ *
+ * Now, for the scenario b), checkpointing and forking into child(ren)
+ * state(s). Note that before current state gets to checkpointing step, any
+ * processed instruction always assumes precise SCALAR register/slot
+ * knowledge: if precise value or range is useful to prune jump branch, BPF
+ * verifier takes this opportunity enthusiastically. Similarly, when
+ * register's value is used to calculate offset or memory address, exact
+ * knowledge of SCALAR range is assumed, checked, and enforced. So, similar to
+ * what we mentioned above about state comparison ignoring precise markings
+ * during state comparison, BPF verifier ignores and also assumes precise
+ * markings *at will* during instruction verification process. But as verifier
+ * assumes precision, it also propagates any precision dependencies across
+ * parent states, which are not yet finalized, so can be further restricted
+ * based on new knowledge gained from restrictions enforced by their children
+ * states. This is so that once those parent states are finalized, i.e., when
+ * they have no more active children state, state comparison logic in
+ * is_state_visited() would enforce strict and precise SCALAR ranges, if
+ * required for correctness.
+ *
+ * To build a bit more intuition, note also that once a state is checkpointed,
+ * the path we took to get to that state is not important. This is crucial
+ * property for state pruning. When state is checkpointed and finalized at
+ * some instruction index, it can be correctly and safely used to "short
+ * circuit" any *compatible* state that reaches exactly the same instruction
+ * index. I.e., if we jumped to that instruction from a completely different
+ * code path than original finalized state was derived from, it doesn't
+ * matter, current state can be discarded because from that instruction
+ * forward having a compatible state will ensure we will safely reach the
+ * exit. States describe preconditions for further exploration, but completely
+ * forget the history of how we got here.
+ *
+ * This also means that even if we needed precise SCALAR range to get to
+ * finalized state, but from that point forward *that same* SCALAR register is
+ * never used in a precise context (i.e., it's precise value is not needed for
+ * correctness), it's correct and safe to mark such register as "imprecise"
+ * (i.e., precise marking set to false). This is what we rely on when we do
+ * not set precise marking in current state. If no child state requires
+ * precision for any given SCALAR register, it's safe to dictate that it can
+ * be imprecise. If any child state does require this register to be precise,
+ * we'll mark it precise later retroactively during precise markings
+ * propagation from child state to parent states.
+ *
+ * Skipping precise marking setting in current state is a mild version of
+ * relying on the above observation. But we can utilize this property even
+ * more aggressively by proactively forgetting any precise marking in the
+ * current state (which we inherited from the parent state), right before we
+ * checkpoint it and branch off into new child state. This is done by
+ * mark_all_scalars_imprecise() to hopefully get more permissive and generic
+ * finalized states which help in short circuiting more future states.
+ */
+static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int regno,
int spi)
{
struct bpf_verifier_state *st = env->cur_state;
@@ -2780,18 +2989,18 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
if (!env->bpf_capable)
return 0;
- func = st->frame[st->curframe];
+ /* Do sanity checks against current state of register and/or stack
+ * slot, but don't set precise flag in current state, as precision
+ * tracking in the current state is unnecessary.
+ */
+ func = st->frame[frame];
if (regno >= 0) {
reg = &func->regs[regno];
if (reg->type != SCALAR_VALUE) {
WARN_ONCE(1, "backtracing misuse");
return -EFAULT;
}
- if (!reg->precise)
- new_marks = true;
- else
- reg_mask = 0;
- reg->precise = true;
+ new_marks = true;
}
while (spi >= 0) {
@@ -2804,11 +3013,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
stack_mask = 0;
break;
}
- if (!reg->precise)
- new_marks = true;
- else
- stack_mask = 0;
- reg->precise = true;
+ new_marks = true;
break;
}
@@ -2816,12 +3021,42 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
return 0;
if (!reg_mask && !stack_mask)
return 0;
+
for (;;) {
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+
+ if (last_idx < 0) {
+ /* we are at the entry into subprog, which
+ * is expected for global funcs, but only if
+ * requested precise registers are R1-R5
+ * (which are global func's input arguments)
+ */
+ if (st->curframe == 0 &&
+ st->frame[0]->subprogno > 0 &&
+ st->frame[0]->callsite == BPF_MAIN_FUNC &&
+ stack_mask == 0 && (reg_mask & ~0x3e) == 0) {
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ reg = &st->frame[0]->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ reg_mask &= ~(1u << i);
+ continue;
+ }
+ reg->precise = true;
+ }
+ return 0;
+ }
+
+ verbose(env, "BUG backtracing func entry subprog %d reg_mask %x stack_mask %llx\n",
+ st->frame[0]->subprogno, reg_mask, stack_mask);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+
for (i = last_idx;;) {
if (skip_first) {
err = 0;
@@ -2861,7 +3096,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
break;
new_marks = false;
- func = st->frame[st->curframe];
+ func = st->frame[frame];
bitmap_from_u64(mask, reg_mask);
for_each_set_bit(i, mask, 32) {
reg = &func->regs[i];
@@ -2927,12 +3162,17 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
int mark_chain_precision(struct bpf_verifier_env *env, int regno)
{
- return __mark_chain_precision(env, regno, -1);
+ return __mark_chain_precision(env, env->cur_state->curframe, regno, -1);
}
-static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+static int mark_chain_precision_frame(struct bpf_verifier_env *env, int frame, int regno)
{
- return __mark_chain_precision(env, -1, spi);
+ return __mark_chain_precision(env, frame, regno, -1);
+}
+
+static int mark_chain_precision_stack_frame(struct bpf_verifier_env *env, int frame, int spi)
+{
+ return __mark_chain_precision(env, frame, -1, spi);
}
static bool is_spillable_regtype(enum bpf_reg_type type)
@@ -3181,14 +3421,17 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
mark_stack_slot_scratched(env, spi);
- if (!env->allow_ptr_leaks
- && *stype != NOT_INIT
- && *stype != SCALAR_VALUE) {
- /* Reject the write if there's are spilled pointers in
- * range. If we didn't reject here, the ptr status
- * would be erased below (even though not all slots are
- * actually overwritten), possibly opening the door to
- * leaks.
+ if (!env->allow_ptr_leaks && *stype != STACK_MISC && *stype != STACK_ZERO) {
+ /* Reject the write if range we may write to has not
+ * been initialized beforehand. If we didn't reject
+ * here, the ptr status would be erased below (even
+ * though not all slots are actually overwritten),
+ * possibly opening the door to leaks.
+ *
+ * We do however catch STACK_INVALID case below, and
+ * only allow reading possibly uninitialized memory
+ * later for CAP_PERFMON, as the write may not happen to
+ * that slot.
*/
verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d",
insn_idx, i);
@@ -3678,15 +3921,15 @@ int check_ptr_off_reg(struct bpf_verifier_env *env,
}
static int map_kptr_match_type(struct bpf_verifier_env *env,
- struct bpf_map_value_off_desc *off_desc,
+ struct btf_field *kptr_field,
struct bpf_reg_state *reg, u32 regno)
{
- const char *targ_name = kernel_type_name(off_desc->kptr.btf, off_desc->kptr.btf_id);
- int perm_flags = PTR_MAYBE_NULL;
+ const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
+ int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED;
const char *reg_name = "";
/* Only unreferenced case accepts untrusted pointers */
- if (off_desc->type == BPF_KPTR_UNREF)
+ if (kptr_field->type == BPF_KPTR_UNREF)
perm_flags |= PTR_UNTRUSTED;
if (base_type(reg->type) != PTR_TO_BTF_ID || (type_flag(reg->type) & ~perm_flags))
@@ -3733,15 +3976,15 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
* strict mode to true for type match.
*/
if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off,
- off_desc->kptr.btf, off_desc->kptr.btf_id,
- off_desc->type == BPF_KPTR_REF))
+ kptr_field->kptr.btf, kptr_field->kptr.btf_id,
+ kptr_field->type == BPF_KPTR_REF))
goto bad_type;
return 0;
bad_type:
verbose(env, "invalid kptr access, R%d type=%s%s ", regno,
reg_type_str(env, reg->type), reg_name);
verbose(env, "expected=%s%s", reg_type_str(env, PTR_TO_BTF_ID), targ_name);
- if (off_desc->type == BPF_KPTR_UNREF)
+ if (kptr_field->type == BPF_KPTR_UNREF)
verbose(env, " or %s%s\n", reg_type_str(env, PTR_TO_BTF_ID | PTR_UNTRUSTED),
targ_name);
else
@@ -3751,7 +3994,7 @@ bad_type:
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
int value_regno, int insn_idx,
- struct bpf_map_value_off_desc *off_desc)
+ struct btf_field *kptr_field)
{
struct bpf_insn *insn = &env->prog->insnsi[insn_idx];
int class = BPF_CLASS(insn->code);
@@ -3761,7 +4004,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
* - Reject cases where variable offset may touch kptr
* - size of access (must be BPF_DW)
* - tnum_is_const(reg->var_off)
- * - off_desc->offset == off + reg->var_off.value
+ * - kptr_field->offset == off + reg->var_off.value
*/
/* Only BPF_[LDX,STX,ST] | BPF_MEM | BPF_DW is supported */
if (BPF_MODE(insn->code) != BPF_MEM) {
@@ -3772,7 +4015,7 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
/* We only allow loading referenced kptr, since it will be marked as
* untrusted, similar to unreferenced kptr.
*/
- if (class != BPF_LDX && off_desc->type == BPF_KPTR_REF) {
+ if (class != BPF_LDX && kptr_field->type == BPF_KPTR_REF) {
verbose(env, "store to referenced kptr disallowed\n");
return -EACCES;
}
@@ -3782,19 +4025,19 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
/* We can simply mark the value_regno receiving the pointer
* value from map as PTR_TO_BTF_ID, with the correct type.
*/
- mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, off_desc->kptr.btf,
- off_desc->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
+ mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED);
/* For mark_ptr_or_null_reg */
val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) {
val_reg = reg_state(env, value_regno);
if (!register_is_null(val_reg) &&
- map_kptr_match_type(env, off_desc, val_reg, value_regno))
+ map_kptr_match_type(env, kptr_field, val_reg, value_regno))
return -EACCES;
} else if (class == BPF_ST) {
if (insn->imm) {
verbose(env, "BPF_ST imm must be 0 when storing to kptr at off=%u\n",
- off_desc->offset);
+ kptr_field->offset);
return -EACCES;
}
} else {
@@ -3813,45 +4056,30 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_reg_state *reg = &state->regs[regno];
struct bpf_map *map = reg->map_ptr;
- int err;
+ struct btf_record *rec;
+ int err, i;
err = check_mem_region_access(env, regno, off, size, map->value_size,
zero_size_allowed);
if (err)
return err;
- if (map_value_has_spin_lock(map)) {
- u32 lock = map->spin_lock_off;
+ if (IS_ERR_OR_NULL(map->record))
+ return 0;
+ rec = map->record;
+ for (i = 0; i < rec->cnt; i++) {
+ struct btf_field *field = &rec->fields[i];
+ u32 p = field->offset;
- /* if any part of struct bpf_spin_lock can be touched by
- * load/store reject this program.
- * To check that [x1, x2) overlaps with [y1, y2)
+ /* If any part of a field can be touched by load/store, reject
+ * this program. To check that [x1, x2) overlaps with [y1, y2),
* it is sufficient to check x1 < y2 && y1 < x2.
*/
- if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
- lock < reg->umax_value + off + size) {
- verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
- return -EACCES;
- }
- }
- if (map_value_has_timer(map)) {
- u32 t = map->timer_off;
-
- if (reg->smin_value + off < t + sizeof(struct bpf_timer) &&
- t < reg->umax_value + off + size) {
- verbose(env, "bpf_timer cannot be accessed directly by load/store\n");
- return -EACCES;
- }
- }
- if (map_value_has_kptrs(map)) {
- struct bpf_map_value_off *tab = map->kptr_off_tab;
- int i;
-
- for (i = 0; i < tab->nr_off; i++) {
- u32 p = tab->off[i].offset;
-
- if (reg->smin_value + off < p + sizeof(u64) &&
- p < reg->umax_value + off + size) {
+ if (reg->smin_value + off < p + btf_field_type_size(field->type) &&
+ p < reg->umax_value + off + size) {
+ switch (field->type) {
+ case BPF_KPTR_UNREF:
+ case BPF_KPTR_REF:
if (src != ACCESS_DIRECT) {
verbose(env, "kptr cannot be accessed indirectly by helper\n");
return -EACCES;
@@ -3870,10 +4098,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
}
break;
+ default:
+ verbose(env, "%s cannot be accessed directly by load/store\n",
+ btf_field_type_name(field->type));
+ return -EACCES;
}
}
}
- return err;
+ return 0;
}
#define MAX_PACKET_OFF 0xffff
@@ -4090,6 +4322,30 @@ static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno)
return reg->type == PTR_TO_FLOW_KEYS;
}
+static bool is_trusted_reg(const struct bpf_reg_state *reg)
+{
+ /* A referenced register is always trusted. */
+ if (reg->ref_obj_id)
+ return true;
+
+ /* If a register is not referenced, it is trusted if it has the
+ * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the
+ * other type modifiers may be safe, but we elect to take an opt-in
+ * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are
+ * not.
+ *
+ * Eventually, we should make PTR_TRUSTED the single source of truth
+ * for whether a register is trusted.
+ */
+ return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS &&
+ !bpf_type_has_unsafe_modifiers(reg->type);
+}
+
+static bool is_rcu_reg(const struct bpf_reg_state *reg)
+{
+ return reg->type & MEM_RCU;
+}
+
static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int off, int size, bool strict)
@@ -4506,6 +4762,18 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
u32 btf_id;
int ret;
+ if (!env->allow_ptr_leaks) {
+ verbose(env,
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ tname);
+ return -EPERM;
+ }
+ if (!env->prog->gpl_compatible && btf_is_kernel(reg->btf)) {
+ verbose(env,
+ "Cannot access kernel 'struct %s' from non-GPL compatible program\n",
+ tname);
+ return -EINVAL;
+ }
if (off < 0) {
verbose(env,
"R%d is ptr_%s invalid negative access: off=%d\n",
@@ -4536,17 +4804,28 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
- if (env->ops->btf_struct_access) {
- ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
- off, size, atype, &btf_id, &flag);
+ if (env->ops->btf_struct_access && !type_is_alloc(reg->type)) {
+ if (!btf_is_kernel(reg->btf)) {
+ verbose(env, "verifier internal error: reg->btf must be kernel btf\n");
+ return -EFAULT;
+ }
+ ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
} else {
- if (atype != BPF_READ) {
+ /* Writes are permitted with default btf_struct_access for
+ * program allocated objects (which always have ref_obj_id > 0),
+ * but not for untrusted PTR_TO_BTF_ID | MEM_ALLOC.
+ */
+ if (atype != BPF_READ && reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
verbose(env, "only read is supported\n");
return -EACCES;
}
- ret = btf_struct_access(&env->log, reg->btf, t, off, size,
- atype, &btf_id, &flag);
+ if (type_is_alloc(reg->type) && !reg->ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id for allocated object must be non-zero\n");
+ return -EFAULT;
+ }
+
+ ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag);
}
if (ret < 0)
@@ -4558,6 +4837,30 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
if (type_flag(reg->type) & PTR_UNTRUSTED)
flag |= PTR_UNTRUSTED;
+ /* By default any pointer obtained from walking a trusted pointer is
+ * no longer trusted except the rcu case below.
+ */
+ flag &= ~PTR_TRUSTED;
+
+ if (flag & MEM_RCU) {
+ /* Mark value register as MEM_RCU only if it is protected by
+ * bpf_rcu_read_lock() and the ptr reg is rcu or trusted. MEM_RCU
+ * itself can already indicate trustedness inside the rcu
+ * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
+ * it could be null in some cases.
+ */
+ if (!env->cur_state->active_rcu_lock ||
+ !(is_trusted_reg(reg) || is_rcu_reg(reg)))
+ flag &= ~MEM_RCU;
+ else
+ flag |= PTR_MAYBE_NULL;
+ } else if (reg->type & MEM_RCU) {
+ /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
+ * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
+ */
+ flag |= PTR_UNTRUSTED;
+ }
+
if (atype == BPF_READ && value_regno >= 0)
mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
@@ -4572,6 +4875,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
{
struct bpf_reg_state *reg = regs + regno;
struct bpf_map *map = reg->map_ptr;
+ struct bpf_reg_state map_reg;
enum bpf_type_flag flag = 0;
const struct btf_type *t;
const char *tname;
@@ -4592,9 +4896,9 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
t = btf_type_by_id(btf_vmlinux, *map->ops->map_btf_id);
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- if (!env->allow_ptr_to_map_access) {
+ if (!env->allow_ptr_leaks) {
verbose(env,
- "%s access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
+ "'struct %s' access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN\n",
tname);
return -EPERM;
}
@@ -4610,7 +4914,10 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag);
+ /* Simulate access to a PTR_TO_BTF_ID */
+ memset(&map_reg, 0, sizeof(map_reg));
+ mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0);
+ ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag);
if (ret < 0)
return ret;
@@ -4746,7 +5053,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_MAP_VALUE) {
- struct bpf_map_value_off_desc *kptr_off_desc = NULL;
+ struct btf_field *kptr_field = NULL;
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
@@ -4760,11 +5067,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err)
return err;
if (tnum_is_const(reg->var_off))
- kptr_off_desc = bpf_map_kptr_off_contains(reg->map_ptr,
- off + reg->var_off.value);
- if (kptr_off_desc) {
- err = check_map_kptr_access(env, regno, value_regno, insn_idx,
- kptr_off_desc);
+ kptr_field = btf_record_find(reg->map_ptr->record,
+ off + reg->var_off.value, BPF_KPTR);
+ if (kptr_field) {
+ err = check_map_kptr_access(env, regno, value_regno, insn_idx, kptr_field);
} else if (t == BPF_READ && value_regno >= 0) {
struct bpf_map *map = reg->map_ptr;
@@ -5155,10 +5461,6 @@ static int check_stack_range_initialized(
}
if (is_spilled_reg(&state->stack[spi]) &&
- base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID)
- goto mark;
-
- if (is_spilled_reg(&state->stack[spi]) &&
(state->stack[spi].spilled_ptr.type == SCALAR_VALUE ||
env->allow_ptr_leaks)) {
if (clobber) {
@@ -5188,6 +5490,11 @@ mark:
mark_reg_read(env, &state->stack[spi].spilled_ptr,
state->stack[spi].spilled_ptr.parent,
REG_LIVE_READ64);
+ /* We do not set REG_LIVE_WRITTEN for stack slot, as we can not
+ * be sure that whether stack slot is written to or not. Hence,
+ * we must still conservatively propagate reads upwards even if
+ * helper may write to the entire memory range.
+ */
}
return update_stack_depth(env, state, min_off);
}
@@ -5369,8 +5676,8 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return err;
}
-int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
- u32 regno)
+static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
{
struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
bool may_be_null = type_may_be_null(mem_reg->type);
@@ -5398,23 +5705,26 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state
}
/* Implementation details:
- * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL.
+ * bpf_obj_new returns PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL.
* Two bpf_map_lookups (even with the same key) will have different reg->id.
- * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
- * value_or_null->value transition, since the verifier only cares about
- * the range of access to valid map value pointer and doesn't care about actual
- * address of the map element.
+ * Two separate bpf_obj_new will also have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE or PTR_TO_BTF_ID | MEM_ALLOC, the verifier
+ * clears reg->id after value_or_null->value transition, since the verifier only
+ * cares about the range of access to valid map value pointer and doesn't care
+ * about actual address of the map element.
* For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
* reg->id > 0 after value_or_null->value transition. By doing so
* two bpf_map_lookups will be considered two different pointers that
- * point to different bpf_spin_locks.
+ * point to different bpf_spin_locks. Likewise for pointers to allocated objects
+ * returned from bpf_obj_new.
* The verifier allows taking only one bpf_spin_lock at a time to avoid
* dead-locks.
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_state->active_spin_lock remembers which map value element got locked
- * and clears it after bpf_spin_unlock.
+ * cur_state->active_lock remembers which map value element or allocated
+ * object got locked and clears it after bpf_spin_unlock.
*/
static int process_spin_lock(struct bpf_verifier_env *env, int regno,
bool is_lock)
@@ -5422,8 +5732,10 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
- struct bpf_map *map = reg->map_ptr;
u64 val = reg->var_off.value;
+ struct bpf_map *map = NULL;
+ struct btf *btf = NULL;
+ struct btf_record *rec;
if (!is_const) {
verbose(env,
@@ -5431,49 +5743,78 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
regno);
return -EINVAL;
}
- if (!map->btf) {
- verbose(env,
- "map '%s' has to have BTF in order to use bpf_spin_lock\n",
- map->name);
- return -EINVAL;
- }
- if (!map_value_has_spin_lock(map)) {
- if (map->spin_lock_off == -E2BIG)
- verbose(env,
- "map '%s' has more than one 'struct bpf_spin_lock'\n",
- map->name);
- else if (map->spin_lock_off == -ENOENT)
+ if (reg->type == PTR_TO_MAP_VALUE) {
+ map = reg->map_ptr;
+ if (!map->btf) {
verbose(env,
- "map '%s' doesn't have 'struct bpf_spin_lock'\n",
- map->name);
- else
- verbose(env,
- "map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+ "map '%s' has to have BTF in order to use bpf_spin_lock\n",
map->name);
+ return -EINVAL;
+ }
+ } else {
+ btf = reg->btf;
+ }
+
+ rec = reg_btf_record(reg);
+ if (!btf_record_has_field(rec, BPF_SPIN_LOCK)) {
+ verbose(env, "%s '%s' has no valid bpf_spin_lock\n", map ? "map" : "local",
+ map ? map->name : "kptr");
return -EINVAL;
}
- if (map->spin_lock_off != val + reg->off) {
- verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
- val + reg->off);
+ if (rec->spin_lock_off != val + reg->off) {
+ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n",
+ val + reg->off, rec->spin_lock_off);
return -EINVAL;
}
if (is_lock) {
- if (cur->active_spin_lock) {
+ if (cur->active_lock.ptr) {
verbose(env,
"Locking two bpf_spin_locks are not allowed\n");
return -EINVAL;
}
- cur->active_spin_lock = reg->id;
+ if (map)
+ cur->active_lock.ptr = map;
+ else
+ cur->active_lock.ptr = btf;
+ cur->active_lock.id = reg->id;
} else {
- if (!cur->active_spin_lock) {
+ struct bpf_func_state *fstate = cur_func(env);
+ void *ptr;
+ int i;
+
+ if (map)
+ ptr = map;
+ else
+ ptr = btf;
+
+ if (!cur->active_lock.ptr) {
verbose(env, "bpf_spin_unlock without taking a lock\n");
return -EINVAL;
}
- if (cur->active_spin_lock != reg->id) {
+ if (cur->active_lock.ptr != ptr ||
+ cur->active_lock.id != reg->id) {
verbose(env, "bpf_spin_unlock of different lock\n");
return -EINVAL;
}
- cur->active_spin_lock = 0;
+ cur->active_lock.ptr = NULL;
+ cur->active_lock.id = 0;
+
+ for (i = fstate->acquired_refs - 1; i >= 0; i--) {
+ int err;
+
+ /* Complain on error because this reference state cannot
+ * be freed before this point, as bpf_spin_lock critical
+ * section does not allow functions that release the
+ * allocated object immediately.
+ */
+ if (!fstate->refs[i].release_on_unlock)
+ continue;
+ err = release_reference(env, fstate->refs[i].id);
+ if (err) {
+ verbose(env, "failed to release release_on_unlock reference");
+ return err;
+ }
+ }
}
return 0;
}
@@ -5497,24 +5838,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
map->name);
return -EINVAL;
}
- if (!map_value_has_timer(map)) {
- if (map->timer_off == -E2BIG)
- verbose(env,
- "map '%s' has more than one 'struct bpf_timer'\n",
- map->name);
- else if (map->timer_off == -ENOENT)
- verbose(env,
- "map '%s' doesn't have 'struct bpf_timer'\n",
- map->name);
- else
- verbose(env,
- "map '%s' is not a struct type or bpf_timer is mangled\n",
- map->name);
+ if (!btf_record_has_field(map->record, BPF_TIMER)) {
+ verbose(env, "map '%s' has no valid bpf_timer\n", map->name);
return -EINVAL;
}
- if (map->timer_off != val + reg->off) {
+ if (map->record->timer_off != val + reg->off) {
verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n",
- val + reg->off, map->timer_off);
+ val + reg->off, map->record->timer_off);
return -EINVAL;
}
if (meta->map_ptr) {
@@ -5530,10 +5860,9 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- struct bpf_map_value_off_desc *off_desc;
struct bpf_map *map_ptr = reg->map_ptr;
+ struct btf_field *kptr_field;
u32 kptr_off;
- int ret;
if (!tnum_is_const(reg->var_off)) {
verbose(env,
@@ -5546,30 +5875,136 @@ static int process_kptr_func(struct bpf_verifier_env *env, int regno,
map_ptr->name);
return -EINVAL;
}
- if (!map_value_has_kptrs(map_ptr)) {
- ret = PTR_ERR_OR_ZERO(map_ptr->kptr_off_tab);
- if (ret == -E2BIG)
- verbose(env, "map '%s' has more than %d kptr\n", map_ptr->name,
- BPF_MAP_VALUE_OFF_MAX);
- else if (ret == -EEXIST)
- verbose(env, "map '%s' has repeating kptr BTF tags\n", map_ptr->name);
- else
- verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
+ if (!btf_record_has_field(map_ptr->record, BPF_KPTR)) {
+ verbose(env, "map '%s' has no valid kptr\n", map_ptr->name);
return -EINVAL;
}
meta->map_ptr = map_ptr;
kptr_off = reg->off + reg->var_off.value;
- off_desc = bpf_map_kptr_off_contains(map_ptr, kptr_off);
- if (!off_desc) {
+ kptr_field = btf_record_find(map_ptr->record, kptr_off, BPF_KPTR);
+ if (!kptr_field) {
verbose(env, "off=%d doesn't point to kptr\n", kptr_off);
return -EACCES;
}
- if (off_desc->type != BPF_KPTR_REF) {
+ if (kptr_field->type != BPF_KPTR_REF) {
verbose(env, "off=%d kptr isn't referenced kptr\n", kptr_off);
return -EACCES;
}
- meta->kptr_off_desc = off_desc;
+ meta->kptr_field = kptr_field;
+ return 0;
+}
+
+/* There are two register types representing a bpf_dynptr, one is PTR_TO_STACK
+ * which points to a stack slot, and the other is CONST_PTR_TO_DYNPTR.
+ *
+ * In both cases we deal with the first 8 bytes, but need to mark the next 8
+ * bytes as STACK_DYNPTR in case of PTR_TO_STACK. In case of
+ * CONST_PTR_TO_DYNPTR, we are guaranteed to get the beginning of the object.
+ *
+ * Mutability of bpf_dynptr is at two levels, one is at the level of struct
+ * bpf_dynptr itself, i.e. whether the helper is receiving a pointer to struct
+ * bpf_dynptr or pointer to const struct bpf_dynptr. In the former case, it can
+ * mutate the view of the dynptr and also possibly destroy it. In the latter
+ * case, it cannot mutate the bpf_dynptr itself but it can still mutate the
+ * memory that dynptr points to.
+ *
+ * The verifier will keep track both levels of mutation (bpf_dynptr's in
+ * reg->type and the memory's in reg->dynptr.type), but there is no support for
+ * readonly dynptr view yet, hence only the first case is tracked and checked.
+ *
+ * This is consistent with how C applies the const modifier to a struct object,
+ * where the pointer itself inside bpf_dynptr becomes const but not what it
+ * points to.
+ *
+ * Helpers which do not mutate the bpf_dynptr set MEM_RDONLY in their argument
+ * type, and declare it as 'const struct bpf_dynptr *' in their prototype.
+ */
+int process_dynptr_func(struct bpf_verifier_env *env, int regno,
+ enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+
+ /* MEM_UNINIT and MEM_RDONLY are exclusive, when applied to an
+ * ARG_PTR_TO_DYNPTR (or ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_*):
+ */
+ if ((arg_type & (MEM_UNINIT | MEM_RDONLY)) == (MEM_UNINIT | MEM_RDONLY)) {
+ verbose(env, "verifier internal error: misconfigured dynptr helper type flags\n");
+ return -EFAULT;
+ }
+ /* CONST_PTR_TO_DYNPTR already has fixed and var_off as 0 due to
+ * check_func_arg_reg_off's logic. We only need to check offset
+ * alignment for PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK && (reg->off % BPF_REG_SIZE)) {
+ verbose(env, "cannot pass in dynptr at an offset=%d\n", reg->off);
+ return -EINVAL;
+ }
+ /* MEM_UNINIT - Points to memory that is an appropriate candidate for
+ * constructing a mutable bpf_dynptr object.
+ *
+ * Currently, this is only possible with PTR_TO_STACK
+ * pointing to a region of at least 16 bytes which doesn't
+ * contain an existing bpf_dynptr.
+ *
+ * MEM_RDONLY - Points to a initialized bpf_dynptr that will not be
+ * mutated or destroyed. However, the memory it points to
+ * may be mutated.
+ *
+ * None - Points to a initialized dynptr that can be mutated and
+ * destroyed, including mutation of the memory it points
+ * to.
+ */
+ if (arg_type & MEM_UNINIT) {
+ if (!is_dynptr_reg_valid_uninit(env, reg)) {
+ verbose(env, "Dynptr has to be an uninitialized dynptr\n");
+ return -EINVAL;
+ }
+
+ /* We only support one dynptr being uninitialized at the moment,
+ * which is sufficient for the helper functions we have right now.
+ */
+ if (meta->uninit_dynptr_regno) {
+ verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
+ return -EFAULT;
+ }
+
+ meta->uninit_dynptr_regno = regno;
+ } else /* MEM_RDONLY and None case from above */ {
+ /* For the reg->type == PTR_TO_STACK case, bpf_dynptr is never const */
+ if (reg->type == CONST_PTR_TO_DYNPTR && !(arg_type & MEM_RDONLY)) {
+ verbose(env, "cannot pass pointer to const bpf_dynptr, the helper mutates it\n");
+ return -EINVAL;
+ }
+
+ if (!is_dynptr_reg_valid_init(env, reg)) {
+ verbose(env,
+ "Expected an initialized dynptr as arg #%d\n",
+ regno);
+ return -EINVAL;
+ }
+
+ /* Fold modifiers (in this case, MEM_RDONLY) when checking expected type */
+ if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) {
+ const char *err_extra = "";
+
+ switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
+ case DYNPTR_TYPE_LOCAL:
+ err_extra = "local";
+ break;
+ case DYNPTR_TYPE_RINGBUF:
+ err_extra = "ringbuf";
+ break;
+ default:
+ err_extra = "<unknown>";
+ break;
+ }
+ verbose(env,
+ "Expected a dynptr of type %s as arg #%d\n",
+ err_extra, regno);
+ return -EINVAL;
+ }
+ }
return 0;
}
@@ -5634,16 +6069,6 @@ struct bpf_reg_types {
u32 *btf_id;
};
-static const struct bpf_reg_types map_key_value_types = {
- .types = {
- PTR_TO_STACK,
- PTR_TO_PACKET,
- PTR_TO_PACKET_META,
- PTR_TO_MAP_KEY,
- PTR_TO_MAP_VALUE,
- },
-};
-
static const struct bpf_reg_types sock_types = {
.types = {
PTR_TO_SOCK_COMMON,
@@ -5661,6 +6086,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = {
PTR_TO_TCP_SOCK,
PTR_TO_XDP_SOCK,
PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
},
.btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
};
@@ -5674,7 +6100,7 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
- PTR_TO_MEM | MEM_ALLOC,
+ PTR_TO_MEM | MEM_RINGBUF,
PTR_TO_BUF,
},
};
@@ -5689,14 +6115,31 @@ static const struct bpf_reg_types int_ptr_types = {
},
};
+static const struct bpf_reg_types spin_lock_types = {
+ .types = {
+ PTR_TO_MAP_VALUE,
+ PTR_TO_BTF_ID | MEM_ALLOC,
+ }
+};
+
static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
-static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } };
+static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } };
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
-static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
-static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } };
+static const struct bpf_reg_types btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID,
+ PTR_TO_BTF_ID | PTR_TRUSTED,
+ PTR_TO_BTF_ID | MEM_RCU,
+ },
+};
+static const struct bpf_reg_types percpu_btf_ptr_types = {
+ .types = {
+ PTR_TO_BTF_ID | MEM_PERCPU,
+ PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED,
+ }
+};
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
@@ -5705,13 +6148,13 @@ static const struct bpf_reg_types kptr_types = { .types = { PTR_TO_MAP_VALUE } }
static const struct bpf_reg_types dynptr_types = {
.types = {
PTR_TO_STACK,
- PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL,
+ CONST_PTR_TO_DYNPTR,
}
};
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
- [ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
- [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
+ [ARG_PTR_TO_MAP_KEY] = &mem_types,
+ [ARG_PTR_TO_MAP_VALUE] = &mem_types,
[ARG_CONST_SIZE] = &scalar_types,
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
@@ -5725,7 +6168,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
- [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
+ [ARG_PTR_TO_RINGBUF_MEM] = &ringbuf_mem_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
@@ -5784,7 +6227,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
return -EACCES;
found:
- if (reg->type == PTR_TO_BTF_ID) {
+ if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) {
/* For bpf_sk_release, it needs to match against first member
* 'struct sock_common', hence make an exception for it. This
* allows bpf_sk_release to work for multiple socket types.
@@ -5801,7 +6244,7 @@ found:
}
if (meta->func_id == BPF_FUNC_kptr_xchg) {
- if (map_kptr_match_type(env, meta->kptr_off_desc, reg, regno))
+ if (map_kptr_match_type(env, meta->kptr_field, reg, regno))
return -EACCES;
} else {
if (arg_btf_id == BPF_PTR_POISON) {
@@ -5820,6 +6263,11 @@ found:
return -EACCES;
}
}
+ } else if (type_is_alloc(reg->type)) {
+ if (meta->func_id != BPF_FUNC_spin_lock && meta->func_id != BPF_FUNC_spin_unlock) {
+ verbose(env, "verifier internal error: unimplemented handling of MEM_ALLOC\n");
+ return -EFAULT;
+ }
}
return 0;
@@ -5829,64 +6277,80 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg, int regno,
enum bpf_arg_type arg_type)
{
- enum bpf_reg_type type = reg->type;
- bool fixed_off_ok = false;
+ u32 type = reg->type;
- switch ((u32)type) {
- /* Pointer types where reg offset is explicitly allowed: */
- case PTR_TO_STACK:
- if (arg_type_is_dynptr(arg_type) && reg->off % BPF_REG_SIZE) {
- verbose(env, "cannot pass in dynptr at an offset\n");
+ /* When referenced register is passed to release function, its fixed
+ * offset must be 0.
+ *
+ * We will check arg_type_is_release reg has ref_obj_id when storing
+ * meta->release_regno.
+ */
+ if (arg_type_is_release(arg_type)) {
+ /* ARG_PTR_TO_DYNPTR with OBJ_RELEASE is a bit special, as it
+ * may not directly point to the object being released, but to
+ * dynptr pointing to such object, which might be at some offset
+ * on the stack. In that case, we simply to fallback to the
+ * default handling.
+ */
+ if (arg_type_is_dynptr(arg_type) && type == PTR_TO_STACK)
+ return 0;
+ /* Doing check_ptr_off_reg check for the offset will catch this
+ * because fixed_off_ok is false, but checking here allows us
+ * to give the user a better error message.
+ */
+ if (reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func or trusted arg to kfunc\n",
+ regno);
return -EINVAL;
}
- fallthrough;
+ return __check_ptr_off_reg(env, reg, regno, false);
+ }
+
+ switch (type) {
+ /* Pointer types where both fixed and variable offset is explicitly allowed: */
+ case PTR_TO_STACK:
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
case PTR_TO_MEM:
case PTR_TO_MEM | MEM_RDONLY:
- case PTR_TO_MEM | MEM_ALLOC:
+ case PTR_TO_MEM | MEM_RINGBUF:
case PTR_TO_BUF:
case PTR_TO_BUF | MEM_RDONLY:
case SCALAR_VALUE:
- /* Some of the argument types nevertheless require a
- * zero register offset.
- */
- if (base_type(arg_type) != ARG_PTR_TO_ALLOC_MEM)
- return 0;
- break;
+ return 0;
/* All the rest must be rejected, except PTR_TO_BTF_ID which allows
* fixed offset.
*/
case PTR_TO_BTF_ID:
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | PTR_TRUSTED:
+ case PTR_TO_BTF_ID | MEM_RCU:
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
/* When referenced PTR_TO_BTF_ID is passed to release function,
- * it's fixed offset must be 0. In the other cases, fixed offset
- * can be non-zero.
+ * its fixed offset must be 0. In the other cases, fixed offset
+ * can be non-zero. This was already checked above. So pass
+ * fixed_off_ok as true to allow fixed offset for all other
+ * cases. var_off always must be 0 for PTR_TO_BTF_ID, hence we
+ * still need to do checks instead of returning.
*/
- if (arg_type_is_release(arg_type) && reg->off) {
- verbose(env, "R%d must have zero offset when passed to release func\n",
- regno);
- return -EINVAL;
- }
- /* For arg is release pointer, fixed_off_ok must be false, but
- * we already checked and rejected reg->off != 0 above, so set
- * to true to allow fixed offset for all other cases.
- */
- fixed_off_ok = true;
- break;
+ return __check_ptr_off_reg(env, reg, regno, true);
default:
- break;
+ return __check_ptr_off_reg(env, reg, regno, false);
}
- return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
}
-static u32 stack_slot_get_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+static u32 dynptr_ref_obj_id(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
+ int spi;
+
+ if (reg->type == CONST_PTR_TO_DYNPTR)
+ return reg->ref_obj_id;
- return state->stack[spi].spilled_ptr.id;
+ spi = get_spi(reg->off);
+ return state->stack[spi].spilled_ptr.ref_obj_id;
}
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
@@ -5935,7 +6399,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
goto skip_type_check;
/* arg_btf_id and arg_size are in a union. */
- if (base_type(arg_type) == ARG_PTR_TO_BTF_ID)
+ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID ||
+ base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK)
arg_btf_id = fn->arg_btf_id[arg];
err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
@@ -5950,11 +6415,22 @@ skip_type_check:
if (arg_type_is_release(arg_type)) {
if (arg_type_is_dynptr(arg_type)) {
struct bpf_func_state *state = func(env, reg);
- int spi = get_spi(reg->off);
+ int spi;
- if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
- !state->stack[spi].spilled_ptr.id) {
- verbose(env, "arg %d is an unacquired reference\n", regno);
+ /* Only dynptr created on stack can be released, thus
+ * the get_spi and stack state checks for spilled_ptr
+ * should only be done before process_dynptr_func for
+ * PTR_TO_STACK.
+ */
+ if (reg->type == PTR_TO_STACK) {
+ spi = get_spi(reg->off);
+ if (!is_spi_bounds_valid(state, spi, BPF_DYNPTR_NR_SLOTS) ||
+ !state->stack[spi].spilled_ptr.ref_obj_id) {
+ verbose(env, "arg %d is an unacquired reference\n", regno);
+ return -EINVAL;
+ }
+ } else {
+ verbose(env, "cannot release unowned const bpf_dynptr\n");
return -EINVAL;
}
} else if (!reg->ref_obj_id && !register_is_null(reg)) {
@@ -6051,19 +6527,22 @@ skip_type_check:
break;
case ARG_PTR_TO_SPIN_LOCK:
if (meta->func_id == BPF_FUNC_spin_lock) {
- if (process_spin_lock(env, regno, true))
- return -EACCES;
+ err = process_spin_lock(env, regno, true);
+ if (err)
+ return err;
} else if (meta->func_id == BPF_FUNC_spin_unlock) {
- if (process_spin_lock(env, regno, false))
- return -EACCES;
+ err = process_spin_lock(env, regno, false);
+ if (err)
+ return err;
} else {
verbose(env, "verifier internal error\n");
return -EFAULT;
}
break;
case ARG_PTR_TO_TIMER:
- if (process_timer_func(env, regno, meta))
- return -EACCES;
+ err = process_timer_func(env, regno, meta);
+ if (err)
+ return err;
break;
case ARG_PTR_TO_FUNC:
meta->subprogno = reg->subprogno;
@@ -6086,52 +6565,9 @@ skip_type_check:
err = check_mem_size_reg(env, reg, regno, true, meta);
break;
case ARG_PTR_TO_DYNPTR:
- /* We only need to check for initialized / uninitialized helper
- * dynptr args if the dynptr is not PTR_TO_DYNPTR, as the
- * assumption is that if it is, that a helper function
- * initialized the dynptr on behalf of the BPF program.
- */
- if (base_type(reg->type) == PTR_TO_DYNPTR)
- break;
- if (arg_type & MEM_UNINIT) {
- if (!is_dynptr_reg_valid_uninit(env, reg)) {
- verbose(env, "Dynptr has to be an uninitialized dynptr\n");
- return -EINVAL;
- }
-
- /* We only support one dynptr being uninitialized at the moment,
- * which is sufficient for the helper functions we have right now.
- */
- if (meta->uninit_dynptr_regno) {
- verbose(env, "verifier internal error: multiple uninitialized dynptr args\n");
- return -EFAULT;
- }
-
- meta->uninit_dynptr_regno = regno;
- } else if (!is_dynptr_reg_valid_init(env, reg)) {
- verbose(env,
- "Expected an initialized dynptr as arg #%d\n",
- arg + 1);
- return -EINVAL;
- } else if (!is_dynptr_type_expected(env, reg, arg_type)) {
- const char *err_extra = "";
-
- switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
- case DYNPTR_TYPE_LOCAL:
- err_extra = "local";
- break;
- case DYNPTR_TYPE_RINGBUF:
- err_extra = "ringbuf";
- break;
- default:
- err_extra = "<unknown>";
- break;
- }
- verbose(env,
- "Expected a dynptr of type %s as arg #%d\n",
- err_extra, arg + 1);
- return -EINVAL;
- }
+ err = process_dynptr_func(env, regno, arg_type, meta);
+ if (err)
+ return err;
break;
case ARG_CONST_ALLOC_SIZE_OR_ZERO:
if (!tnum_is_const(reg->var_off)) {
@@ -6198,8 +6634,9 @@ skip_type_check:
break;
}
case ARG_PTR_TO_KPTR:
- if (process_kptr_func(env, regno, meta))
- return -EACCES;
+ err = process_kptr_func(env, regno, meta);
+ if (err)
+ return err;
break;
}
@@ -6360,6 +6797,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_task_storage_delete)
goto error;
break;
+ case BPF_MAP_TYPE_CGRP_STORAGE:
+ if (func_id != BPF_FUNC_cgrp_storage_get &&
+ func_id != BPF_FUNC_cgrp_storage_delete)
+ goto error;
+ break;
case BPF_MAP_TYPE_BLOOM_FILTER:
if (func_id != BPF_FUNC_map_peek_elem &&
func_id != BPF_FUNC_map_push_elem)
@@ -6472,6 +6914,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE)
goto error;
break;
+ case BPF_FUNC_cgrp_storage_get:
+ case BPF_FUNC_cgrp_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_CGRP_STORAGE)
+ goto error;
+ break;
default:
break;
}
@@ -6543,9 +6990,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
int i;
for (i = 0; i < ARRAY_SIZE(fn->arg_type); i++) {
- if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
- return false;
-
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID)
+ return !!fn->arg_btf_id[i];
+ if (base_type(fn->arg_type[i]) == ARG_PTR_TO_SPIN_LOCK)
+ return fn->arg_btf_id[i] == BPF_PTR_POISON;
if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
/* arg_btf_id and arg_size are in a union. */
(base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
@@ -6618,8 +7066,12 @@ static int release_reference(struct bpf_verifier_env *env,
return err;
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
- if (reg->ref_obj_id == ref_obj_id)
- __mark_reg_unknown(env, reg);
+ if (reg->ref_obj_id == ref_obj_id) {
+ if (!env->allow_ptr_leaks)
+ __mark_reg_not_init(env, reg);
+ else
+ __mark_reg_unknown(env, reg);
+ }
}));
return 0;
@@ -6642,6 +7094,10 @@ typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
struct bpf_func_state *callee,
int insn_idx);
+static int set_callee_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee, int insn_idx);
+
static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx, int subprog,
set_callee_state_fn set_callee_state_cb)
@@ -6692,6 +7148,16 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
}
+ /* set_callee_state is used for direct subprog calls, but we are
+ * interested in validating only BPF helpers that can call subprogs as
+ * callbacks
+ */
+ if (set_callee_state_cb != set_callee_state && !is_callback_calling_function(insn->imm)) {
+ verbose(env, "verifier bug: helper %s#%d is not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ }
+
if (insn->code == (BPF_JMP | BPF_CALL) &&
insn->src_reg == 0 &&
insn->imm == BPF_FUNC_timer_set_callback) {
@@ -6736,11 +7202,11 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* Transfer references to the callee */
err = copy_reference_state(callee, caller);
if (err)
- return err;
+ goto err_out;
err = set_callee_state_cb(env, caller, callee, *insn_idx);
if (err)
- return err;
+ goto err_out;
clear_caller_saved_regs(env, caller->regs);
@@ -6757,6 +7223,11 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
print_verifier_state(env, callee, true);
}
return 0;
+
+err_out:
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return err;
}
int map_set_for_each_callback_args(struct bpf_verifier_env *env,
@@ -6933,11 +7404,10 @@ static int set_user_ringbuf_callback_state(struct bpf_verifier_env *env,
{
/* bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void
* callback_ctx, u64 flags);
- * callback_fn(struct bpf_dynptr_t* dynptr, void *callback_ctx);
+ * callback_fn(const struct bpf_dynptr_t* dynptr, void *callback_ctx);
*/
__mark_reg_not_init(env, &callee->regs[BPF_REG_0]);
- callee->regs[BPF_REG_1].type = PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL;
- __mark_reg_known_zero(&callee->regs[BPF_REG_1]);
+ mark_dynptr_cb_reg(&callee->regs[BPF_REG_1], BPF_DYNPTR_TYPE_LOCAL);
callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
/* unused */
@@ -6970,8 +7440,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return -EINVAL;
}
- state->curframe--;
- caller = state->frame[state->curframe];
+ caller = state->frame[state->curframe - 1];
if (callee->in_callback_fn) {
/* enforce R0 return value range [0, 1]. */
struct tnum range = callee->callback_ret_range;
@@ -7010,7 +7479,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
}
/* clear everything in the callee */
free_func_state(callee);
- state->frame[state->curframe + 1] = NULL;
+ state->frame[state->curframe--] = NULL;
return 0;
}
@@ -7270,6 +7739,11 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EINVAL;
}
+ if (!env->prog->aux->sleepable && fn->might_sleep) {
+ verbose(env, "helper call might sleep in a non-sleepable prog\n");
+ return -EINVAL;
+ }
+
/* With LD_ABS/IND some JITs save/restore skb from r1. */
changes_data = bpf_helper_changes_pkt_data(fn->func);
if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) {
@@ -7288,6 +7762,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
+ if (env->cur_state->active_rcu_lock) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in rcu_read_lock region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->sleepable && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -7316,7 +7801,15 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs = cur_regs(env);
+ /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be reinitialized by any dynptr helper. Hence, mark_stack_slots_dynptr
+ * is safe to do directly.
+ */
if (meta.uninit_dynptr_regno) {
+ if (regs[meta.uninit_dynptr_regno].type == CONST_PTR_TO_DYNPTR) {
+ verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be initialized\n");
+ return -EFAULT;
+ }
/* we write BPF_DW bits (8 bytes) at a time */
for (i = 0; i < BPF_DYNPTR_SIZE; i += 8) {
err = check_mem_access(env, insn_idx, meta.uninit_dynptr_regno,
@@ -7334,15 +7827,24 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (meta.release_regno) {
err = -EINVAL;
- if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1]))
+ /* This can only be set for PTR_TO_STACK, as CONST_PTR_TO_DYNPTR cannot
+ * be released by any dynptr helper. Hence, unmark_stack_slots_dynptr
+ * is safe to do directly.
+ */
+ if (arg_type_is_dynptr(fn->arg_type[meta.release_regno - BPF_REG_1])) {
+ if (regs[meta.release_regno].type == CONST_PTR_TO_DYNPTR) {
+ verbose(env, "verifier internal error: CONST_PTR_TO_DYNPTR cannot be released\n");
+ return -EFAULT;
+ }
err = unmark_stack_slots_dynptr(env, &regs[meta.release_regno]);
- else if (meta.ref_obj_id)
+ } else if (meta.ref_obj_id) {
err = release_reference(env, meta.ref_obj_id);
- /* meta.ref_obj_id can only be 0 if register that is meant to be
- * released is NULL, which must be > R0.
- */
- else if (register_is_null(&regs[meta.release_regno]))
+ } else if (register_is_null(&regs[meta.release_regno])) {
+ /* meta.ref_obj_id can only be 0 if register that is meant to be
+ * released is NULL, which must be > R0.
+ */
err = 0;
+ }
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
func_id_name(func_id), func_id);
@@ -7416,11 +7918,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -EFAULT;
}
- if (base_type(reg->type) != PTR_TO_DYNPTR)
- /* Find the id of the dynptr we're
- * tracking the reference of
- */
- meta.ref_obj_id = stack_slot_get_id(env, reg);
+ meta.ref_obj_id = dynptr_ref_obj_id(env, reg);
break;
}
}
@@ -7475,7 +7973,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].map_uid = meta.map_uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
if (!type_may_be_null(ret_type) &&
- map_value_has_spin_lock(meta.map_ptr)) {
+ btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
break;
@@ -7491,7 +7989,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
break;
- case RET_PTR_TO_ALLOC_MEM:
+ case RET_PTR_TO_MEM:
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
@@ -7539,8 +8037,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
if (func_id == BPF_FUNC_kptr_xchg) {
- ret_btf = meta.kptr_off_desc->kptr.btf;
- ret_btf_id = meta.kptr_off_desc->kptr.btf_id;
+ ret_btf = meta.kptr_field->kptr.btf;
+ ret_btf_id = meta.kptr_field->kptr.btf_id;
} else {
if (fn->ret_btf_id == BPF_PTR_POISON) {
verbose(env, "verifier internal error:");
@@ -7654,19 +8152,926 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
+struct bpf_kfunc_call_arg_meta {
+ /* In parameters */
+ struct btf *btf;
+ u32 func_id;
+ u32 kfunc_flags;
+ const struct btf_type *func_proto;
+ const char *func_name;
+ /* Out parameters */
+ u32 ref_obj_id;
+ u8 release_regno;
+ bool r0_rdonly;
+ u32 ret_btf_id;
+ u64 r0_size;
+ struct {
+ u64 value;
+ bool found;
+ } arg_constant;
+ struct {
+ struct btf *btf;
+ u32 btf_id;
+ } arg_obj_drop;
+ struct {
+ struct btf_field *field;
+ } arg_list_head;
+};
+
+static bool is_kfunc_acquire(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_ACQUIRE;
+}
+
+static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RET_NULL;
+}
+
+static bool is_kfunc_release(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RELEASE;
+}
+
+static bool is_kfunc_trusted_args(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_TRUSTED_ARGS;
+}
+
+static bool is_kfunc_sleepable(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_SLEEPABLE;
+}
+
+static bool is_kfunc_destructive(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_DESTRUCTIVE;
+}
+
+static bool is_kfunc_rcu(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->kfunc_flags & KF_RCU;
+}
+
+static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg)
+{
+ return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET);
+}
+
+static bool __kfunc_param_match_suffix(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *suffix)
+{
+ int suffix_len = strlen(suffix), len;
+ const char *param_name;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len < suffix_len)
+ return false;
+ param_name += len - suffix_len;
+ return !strncmp(param_name, suffix, suffix_len);
+}
+
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ const struct btf_type *t;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ return __kfunc_param_match_suffix(btf, arg, "__sz");
+}
+
+static bool is_kfunc_arg_constant(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__k");
+}
+
+static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__ign");
+}
+
+static bool is_kfunc_arg_alloc_obj(const struct btf *btf, const struct btf_param *arg)
+{
+ return __kfunc_param_match_suffix(btf, arg, "__alloc");
+}
+
+static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
+ const struct btf_param *arg,
+ const char *name)
+{
+ int len, target_len = strlen(name);
+ const char *param_name;
+
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len != target_len)
+ return false;
+ if (strcmp(param_name, name))
+ return false;
+
+ return true;
+}
+
+enum {
+ KF_ARG_DYNPTR_ID,
+ KF_ARG_LIST_HEAD_ID,
+ KF_ARG_LIST_NODE_ID,
+};
+
+BTF_ID_LIST(kf_arg_btf_ids)
+BTF_ID(struct, bpf_dynptr_kern)
+BTF_ID(struct, bpf_list_head)
+BTF_ID(struct, bpf_list_node)
+
+static bool __is_kfunc_ptr_arg_type(const struct btf *btf,
+ const struct btf_param *arg, int type)
+{
+ const struct btf_type *t;
+ u32 res_id;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!t)
+ return false;
+ if (!btf_type_is_ptr(t))
+ return false;
+ t = btf_type_skip_modifiers(btf, t->type, &res_id);
+ if (!t)
+ return false;
+ return btf_types_are_same(btf, res_id, btf_vmlinux, kf_arg_btf_ids[type]);
+}
+
+static bool is_kfunc_arg_dynptr(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_DYNPTR_ID);
+}
+
+static bool is_kfunc_arg_list_head(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_HEAD_ID);
+}
+
+static bool is_kfunc_arg_list_node(const struct btf *btf, const struct btf_param *arg)
+{
+ return __is_kfunc_ptr_arg_type(btf, arg, KF_ARG_LIST_NODE_ID);
+}
+
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_env *env,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ verbose(env, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(env, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
+
+
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
+enum kfunc_ptr_arg_type {
+ KF_ARG_PTR_TO_CTX,
+ KF_ARG_PTR_TO_ALLOC_BTF_ID, /* Allocated object */
+ KF_ARG_PTR_TO_KPTR, /* PTR_TO_KPTR but type specific */
+ KF_ARG_PTR_TO_DYNPTR,
+ KF_ARG_PTR_TO_LIST_HEAD,
+ KF_ARG_PTR_TO_LIST_NODE,
+ KF_ARG_PTR_TO_BTF_ID, /* Also covers reg2btf_ids conversions */
+ KF_ARG_PTR_TO_MEM,
+ KF_ARG_PTR_TO_MEM_SIZE, /* Size derived from next argument, skip it */
+};
+
+enum special_kfunc_type {
+ KF_bpf_obj_new_impl,
+ KF_bpf_obj_drop_impl,
+ KF_bpf_list_push_front,
+ KF_bpf_list_push_back,
+ KF_bpf_list_pop_front,
+ KF_bpf_list_pop_back,
+ KF_bpf_cast_to_kern_ctx,
+ KF_bpf_rdonly_cast,
+ KF_bpf_rcu_read_lock,
+ KF_bpf_rcu_read_unlock,
+};
+
+BTF_SET_START(special_kfunc_set)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_list_push_front)
+BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_SET_END(special_kfunc_set)
+
+BTF_ID_LIST(special_kfunc_list)
+BTF_ID(func, bpf_obj_new_impl)
+BTF_ID(func, bpf_obj_drop_impl)
+BTF_ID(func, bpf_list_push_front)
+BTF_ID(func, bpf_list_push_back)
+BTF_ID(func, bpf_list_pop_front)
+BTF_ID(func, bpf_list_pop_back)
+BTF_ID(func, bpf_cast_to_kern_ctx)
+BTF_ID(func, bpf_rdonly_cast)
+BTF_ID(func, bpf_rcu_read_lock)
+BTF_ID(func, bpf_rcu_read_unlock)
+
+static bool is_kfunc_bpf_rcu_read_lock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_lock];
+}
+
+static bool is_kfunc_bpf_rcu_read_unlock(struct bpf_kfunc_call_arg_meta *meta)
+{
+ return meta->func_id == special_kfunc_list[KF_bpf_rcu_read_unlock];
+}
+
+static enum kfunc_ptr_arg_type
+get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ const struct btf_type *t, const struct btf_type *ref_t,
+ const char *ref_tname, const struct btf_param *args,
+ int argno, int nargs)
+{
+ u32 regno = argno + 1;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[regno];
+ bool arg_mem_size = false;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx])
+ return KF_ARG_PTR_TO_CTX;
+
+ /* In this function, we verify the kfunc's BTF as per the argument type,
+ * leaving the rest of the verification with respect to the register
+ * type to our caller. When a set of conditions hold in the BTF type of
+ * arguments, we resolve it to a known kfunc_ptr_arg_type.
+ */
+ if (btf_get_prog_ctx_type(&env->log, meta->btf, t, resolve_prog_type(env->prog), argno))
+ return KF_ARG_PTR_TO_CTX;
+
+ if (is_kfunc_arg_alloc_obj(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_ALLOC_BTF_ID;
+
+ if (is_kfunc_arg_kptr_get(meta, argno)) {
+ if (!btf_type_is_ptr(ref_t)) {
+ verbose(env, "arg#0 BTF type must be a double pointer for kptr_get kfunc\n");
+ return -EINVAL;
+ }
+ ref_t = btf_type_by_id(meta->btf, ref_t->type);
+ ref_tname = btf_name_by_offset(meta->btf, ref_t->name_off);
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#0 pointer type %s %s is not supported\n",
+ meta->func_name, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_KPTR;
+ }
+
+ if (is_kfunc_arg_dynptr(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_DYNPTR;
+
+ if (is_kfunc_arg_list_head(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_HEAD;
+
+ if (is_kfunc_arg_list_node(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_LIST_NODE;
+
+ if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
+ if (!btf_type_is_struct(ref_t)) {
+ verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return KF_ARG_PTR_TO_BTF_ID;
+ }
+
+ if (argno + 1 < nargs && is_kfunc_arg_mem_size(meta->btf, &args[argno + 1], &regs[regno + 1]))
+ arg_mem_size = true;
+
+ /* This is the catch all argument type of register types supported by
+ * check_helper_mem_access. However, we only allow when argument type is
+ * pointer to scalar, or struct composed (recursively) of scalars. When
+ * arg_mem_size is true, the pointer can be void *.
+ */
+ if (!btf_type_is_scalar(ref_t) && !__btf_type_is_scalar_struct(env, meta->btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
+ verbose(env, "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ argno, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+ return -EINVAL;
+ }
+ return arg_mem_size ? KF_ARG_PTR_TO_MEM_SIZE : KF_ARG_PTR_TO_MEM;
+}
+
+static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname, u32 ref_id,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ const struct btf_type *reg_ref_t;
+ bool strict_type_match = false;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ u32 reg_ref_id;
+
+ if (base_type(reg->type) == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else {
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
+ }
+
+ if (is_kfunc_trusted_args(meta) || (is_kfunc_release(meta) && reg->ref_obj_id))
+ strict_type_match = true;
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf, reg_ref_t->name_off);
+ if (!btf_struct_ids_match(&env->log, reg_btf, reg_ref_id, reg->off, meta->btf, ref_id, strict_type_match)) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname, argno + 1,
+ btf_type_str(reg_ref_t), reg_ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_kptr(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ const struct btf_type *ref_t,
+ const char *ref_tname,
+ struct bpf_kfunc_call_arg_meta *meta,
+ int argno)
+{
+ struct btf_field *kptr_field;
+
+ /* check_func_arg_reg_off allows var_off for
+ * PTR_TO_MAP_VALUE, but we need fixed offset to find
+ * off_desc.
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "arg#0 must have constant offset\n");
+ return -EINVAL;
+ }
+
+ kptr_field = btf_record_find(reg->map_ptr->record, reg->off + reg->var_off.value, BPF_KPTR);
+ if (!kptr_field || kptr_field->type != BPF_KPTR_REF) {
+ verbose(env, "arg#0 no referenced kptr at map value offset=%llu\n",
+ reg->off + reg->var_off.value);
+ return -EINVAL;
+ }
+
+ if (!btf_struct_ids_match(&env->log, meta->btf, ref_t->type, 0, kptr_field->kptr.btf,
+ kptr_field->kptr.btf_id, true)) {
+ verbose(env, "kernel function %s args#%d expected pointer to %s %s\n",
+ meta->func_name, argno, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int ref_set_release_on_unlock(struct bpf_verifier_env *env, u32 ref_obj_id)
+{
+ struct bpf_func_state *state = cur_func(env);
+ struct bpf_reg_state *reg;
+ int i;
+
+ /* bpf_spin_lock only allows calling list_push and list_pop, no BPF
+ * subprogs, no global functions. This means that the references would
+ * not be released inside the critical section but they may be added to
+ * the reference state, and the acquired_refs are never copied out for a
+ * different frame as BPF to BPF calls don't work in bpf_spin_lock
+ * critical sections.
+ */
+ if (!ref_obj_id) {
+ verbose(env, "verifier internal error: ref_obj_id is zero for release_on_unlock\n");
+ return -EFAULT;
+ }
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].id == ref_obj_id) {
+ if (state->refs[i].release_on_unlock) {
+ verbose(env, "verifier internal error: expected false release_on_unlock");
+ return -EFAULT;
+ }
+ state->refs[i].release_on_unlock = true;
+ /* Now mark everyone sharing same ref_obj_id as untrusted */
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->ref_obj_id == ref_obj_id)
+ reg->type |= PTR_UNTRUSTED;
+ }));
+ return 0;
+ }
+ }
+ verbose(env, "verifier internal error: ref state missing for ref_obj_id\n");
+ return -EFAULT;
+}
+
+/* Implementation details:
+ *
+ * Each register points to some region of memory, which we define as an
+ * allocation. Each allocation may embed a bpf_spin_lock which protects any
+ * special BPF objects (bpf_list_head, bpf_rb_root, etc.) part of the same
+ * allocation. The lock and the data it protects are colocated in the same
+ * memory region.
+ *
+ * Hence, everytime a register holds a pointer value pointing to such
+ * allocation, the verifier preserves a unique reg->id for it.
+ *
+ * The verifier remembers the lock 'ptr' and the lock 'id' whenever
+ * bpf_spin_lock is called.
+ *
+ * To enable this, lock state in the verifier captures two values:
+ * active_lock.ptr = Register's type specific pointer
+ * active_lock.id = A unique ID for each register pointer value
+ *
+ * Currently, PTR_TO_MAP_VALUE and PTR_TO_BTF_ID | MEM_ALLOC are the two
+ * supported register types.
+ *
+ * The active_lock.ptr in case of map values is the reg->map_ptr, and in case of
+ * allocated objects is the reg->btf pointer.
+ *
+ * The active_lock.id is non-unique for maps supporting direct_value_addr, as we
+ * can establish the provenance of the map value statically for each distinct
+ * lookup into such maps. They always contain a single map value hence unique
+ * IDs for each pseudo load pessimizes the algorithm and rejects valid programs.
+ *
+ * So, in case of global variables, they use array maps with max_entries = 1,
+ * hence their active_lock.ptr becomes map_ptr and id = 0 (since they all point
+ * into the same map value as max_entries is 1, as described above).
+ *
+ * In case of inner map lookups, the inner map pointer has same map_ptr as the
+ * outer map pointer (in verifier context), but each lookup into an inner map
+ * assigns a fresh reg->id to the lookup, so while lookups into distinct inner
+ * maps from the same outer map share the same map_ptr as active_lock.ptr, they
+ * will get different reg->id assigned to each lookup, hence different
+ * active_lock.id.
+ *
+ * In case of allocated objects, active_lock.ptr is the reg->btf, and the
+ * reg->id is a unique ID preserved after the NULL pointer check on the pointer
+ * returned from bpf_obj_new. Each allocation receives a new reg->id.
+ */
+static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ void *ptr;
+ u32 id;
+
+ switch ((int)reg->type) {
+ case PTR_TO_MAP_VALUE:
+ ptr = reg->map_ptr;
+ break;
+ case PTR_TO_BTF_ID | MEM_ALLOC:
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED:
+ ptr = reg->btf;
+ break;
+ default:
+ verbose(env, "verifier internal error: unknown reg type for lock check\n");
+ return -EFAULT;
+ }
+ id = reg->id;
+
+ if (!env->cur_state->active_lock.ptr)
+ return -EINVAL;
+ if (env->cur_state->active_lock.ptr != ptr ||
+ env->cur_state->active_lock.id != id) {
+ verbose(env, "held lock and object are not in the same allocation\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static bool is_bpf_list_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_list_push_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_push_back] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ btf_id == special_kfunc_list[KF_bpf_list_pop_back];
+}
+
+static int process_kf_arg_ptr_to_list_head(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct btf_field *field;
+ struct btf_record *rec;
+ u32 list_head_off;
+
+ if (meta->btf != btf_vmlinux || !is_bpf_list_api_kfunc(meta->func_id)) {
+ verbose(env, "verifier internal error: bpf_list_head argument for unknown kfunc\n");
+ return -EFAULT;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_list_head has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ list_head_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, list_head_off, BPF_LIST_HEAD);
+ if (!field) {
+ verbose(env, "bpf_list_head not found at offset=%u\n", list_head_off);
+ return -EINVAL;
+ }
+
+ /* All functions require bpf_list_head to be protected using a bpf_spin_lock */
+ if (check_reg_allocation_locked(env, reg)) {
+ verbose(env, "bpf_spin_lock at off=%d must be held for bpf_list_head\n",
+ rec->spin_lock_off);
+ return -EINVAL;
+ }
+
+ if (meta->arg_list_head.field) {
+ verbose(env, "verifier internal error: repeating bpf_list_head arg\n");
+ return -EFAULT;
+ }
+ meta->arg_list_head.field = field;
+ return 0;
+}
+
+static int process_kf_arg_ptr_to_list_node(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ const struct btf_type *et, *t;
+ struct btf_field *field;
+ struct btf_record *rec;
+ u32 list_node_off;
+
+ if (meta->btf != btf_vmlinux ||
+ (meta->func_id != special_kfunc_list[KF_bpf_list_push_front] &&
+ meta->func_id != special_kfunc_list[KF_bpf_list_push_back])) {
+ verbose(env, "verifier internal error: bpf_list_node argument for unknown kfunc\n");
+ return -EFAULT;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env,
+ "R%d doesn't have constant offset. bpf_list_node has to be at the constant offset\n",
+ regno);
+ return -EINVAL;
+ }
+
+ rec = reg_btf_record(reg);
+ list_node_off = reg->off + reg->var_off.value;
+ field = btf_record_find(rec, list_node_off, BPF_LIST_NODE);
+ if (!field || field->offset != list_node_off) {
+ verbose(env, "bpf_list_node not found at offset=%u\n", list_node_off);
+ return -EINVAL;
+ }
+
+ field = meta->arg_list_head.field;
+
+ et = btf_type_by_id(field->list_head.btf, field->list_head.value_btf_id);
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, 0, field->list_head.btf,
+ field->list_head.value_btf_id, true)) {
+ verbose(env, "operation on bpf_list_head expects arg#1 bpf_list_node at offset=%d "
+ "in struct %s, but arg is at offset=%d in struct %s\n",
+ field->list_head.node_offset, btf_name_by_offset(field->list_head.btf, et->name_off),
+ list_node_off, btf_name_by_offset(reg->btf, t->name_off));
+ return -EINVAL;
+ }
+
+ if (list_node_off != field->list_head.node_offset) {
+ verbose(env, "arg#1 offset=%d, but expected bpf_list_node at offset=%d in struct %s\n",
+ list_node_off, field->list_head.node_offset,
+ btf_name_by_offset(field->list_head.btf, et->name_off));
+ return -EINVAL;
+ }
+ /* Set arg#1 for expiration after unlock */
+ return ref_set_release_on_unlock(env, reg->ref_obj_id);
+}
+
+static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_arg_meta *meta)
+{
+ const char *func_name = meta->func_name, *ref_tname;
+ const struct btf *btf = meta->btf;
+ const struct btf_param *args;
+ u32 i, nargs;
+ int ret;
+
+ args = (const struct btf_param *)(meta->func_proto + 1);
+ nargs = btf_type_vlen(meta->func_proto);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ verbose(env, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
+ }
+
+ /* Check that BTF function arguments match actual types that the
+ * verifier sees.
+ */
+ for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[i + 1];
+ const struct btf_type *t, *ref_t, *resolve_ret;
+ enum bpf_arg_type arg_type = ARG_DONTCARE;
+ u32 regno = i + 1, ref_id, type_size;
+ bool is_ret_buf_sz = false;
+ int kf_arg_type;
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+
+ if (is_kfunc_arg_ignore(btf, &args[i]))
+ continue;
+
+ if (btf_type_is_scalar(t)) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "R%d is not a scalar\n", regno);
+ return -EINVAL;
+ }
+
+ if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+ if (meta->arg_constant.found) {
+ verbose(env, "verifier internal error: only one constant argument permitted\n");
+ return -EFAULT;
+ }
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno);
+ return -EINVAL;
+ }
+ ret = mark_chain_precision(env, regno);
+ if (ret < 0)
+ return ret;
+ meta->arg_constant.found = true;
+ meta->arg_constant.value = reg->var_off.value;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdonly_buf_size")) {
+ meta->r0_rdonly = true;
+ is_ret_buf_sz = true;
+ } else if (is_kfunc_arg_scalar_with_name(btf, &args[i], "rdwr_buf_size")) {
+ is_ret_buf_sz = true;
+ }
+
+ if (is_ret_buf_sz) {
+ if (meta->r0_size) {
+ verbose(env, "2 or more rdonly/rdwr_buf_size parameters for kfunc");
+ return -EINVAL;
+ }
+
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d is not a const\n", regno);
+ return -EINVAL;
+ }
+
+ meta->r0_size = reg->var_off.value;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ }
+ continue;
+ }
+
+ if (!btf_type_is_ptr(t)) {
+ verbose(env, "Unrecognized arg#%d type %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if (reg->ref_obj_id) {
+ if (is_kfunc_release(meta) && meta->ref_obj_id) {
+ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id,
+ meta->ref_obj_id);
+ return -EFAULT;
+ }
+ meta->ref_obj_id = reg->ref_obj_id;
+ if (is_kfunc_release(meta))
+ meta->release_regno = regno;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ kf_arg_type = get_kfunc_ptr_arg_type(env, meta, t, ref_t, ref_tname, args, i, nargs);
+ if (kf_arg_type < 0)
+ return kf_arg_type;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ case KF_ARG_PTR_TO_BTF_ID:
+ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta))
+ break;
+
+ if (!is_trusted_reg(reg)) {
+ if (!is_kfunc_rcu(meta)) {
+ verbose(env, "R%d must be referenced or trusted\n", regno);
+ return -EINVAL;
+ }
+ if (!is_rcu_reg(reg)) {
+ verbose(env, "R%d must be a rcu pointer\n", regno);
+ return -EINVAL;
+ }
+ }
+
+ fallthrough;
+ case KF_ARG_PTR_TO_CTX:
+ /* Trusted arguments have the same offset checks as release arguments */
+ arg_type |= OBJ_RELEASE;
+ break;
+ case KF_ARG_PTR_TO_KPTR:
+ case KF_ARG_PTR_TO_DYNPTR:
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ case KF_ARG_PTR_TO_LIST_NODE:
+ case KF_ARG_PTR_TO_MEM:
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ /* Trusted by default */
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return -EFAULT;
+ }
+
+ if (is_kfunc_release(meta) && reg->ref_obj_id)
+ arg_type |= OBJ_RELEASE;
+ ret = check_func_arg_reg_off(env, reg, regno, arg_type);
+ if (ret < 0)
+ return ret;
+
+ switch (kf_arg_type) {
+ case KF_ARG_PTR_TO_CTX:
+ if (reg->type != PTR_TO_CTX) {
+ verbose(env, "arg#%d expected pointer to ctx, but got %s\n", i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ ret = get_kern_ctx_btf_id(&env->log, resolve_prog_type(env->prog));
+ if (ret < 0)
+ return -EINVAL;
+ meta->ret_btf_id = ret;
+ }
+ break;
+ case KF_ARG_PTR_TO_ALLOC_BTF_ID:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ if (meta->btf == btf_vmlinux &&
+ meta->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ meta->arg_obj_drop.btf = reg->btf;
+ meta->arg_obj_drop.btf_id = reg->btf_id;
+ }
+ break;
+ case KF_ARG_PTR_TO_KPTR:
+ if (reg->type != PTR_TO_MAP_VALUE) {
+ verbose(env, "arg#0 expected pointer to map value\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_kptr(env, reg, ref_t, ref_tname, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_DYNPTR:
+ if (reg->type != PTR_TO_STACK &&
+ reg->type != CONST_PTR_TO_DYNPTR) {
+ verbose(env, "arg#%d expected pointer to stack or dynptr_ptr\n", i);
+ return -EINVAL;
+ }
+
+ ret = process_dynptr_func(env, regno, ARG_PTR_TO_DYNPTR | MEM_RDONLY, NULL);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_HEAD:
+ if (reg->type != PTR_TO_MAP_VALUE &&
+ reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to map value or allocated object\n", i);
+ return -EINVAL;
+ }
+ if (reg->type == (PTR_TO_BTF_ID | MEM_ALLOC) && !reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_head(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_LIST_NODE:
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC)) {
+ verbose(env, "arg#%d expected pointer to allocated object\n", i);
+ return -EINVAL;
+ }
+ if (!reg->ref_obj_id) {
+ verbose(env, "allocated object must be referenced\n");
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_list_node(env, reg, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_BTF_ID:
+ /* Only base_type is checked, further checks are done here */
+ if ((base_type(reg->type) != PTR_TO_BTF_ID ||
+ (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) &&
+ !reg2btf_ids[base_type(reg->type)]) {
+ verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type));
+ verbose(env, "expected %s or socket\n",
+ reg_type_str(env, base_type(reg->type) |
+ (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS)));
+ return -EINVAL;
+ }
+ ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM:
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
+ verbose(env, "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname, PTR_ERR(resolve_ret));
+ return -EINVAL;
+ }
+ ret = check_mem_reg(env, reg, regno, type_size);
+ if (ret < 0)
+ return ret;
+ break;
+ case KF_ARG_PTR_TO_MEM_SIZE:
+ ret = check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1);
+ if (ret < 0) {
+ verbose(env, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", i, i + 1);
+ return ret;
+ }
+ /* Skip next '__sz' argument */
+ i++;
+ break;
+ }
+ }
+
+ if (is_kfunc_release(meta) && !meta->release_regno) {
+ verbose(env, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+ func_name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
const struct btf_type *t, *func, *func_proto, *ptr_type;
struct bpf_reg_state *regs = cur_regs(env);
- struct bpf_kfunc_arg_meta meta = { 0 };
const char *func_name, *ptr_type_name;
+ bool sleepable, rcu_lock, rcu_unlock;
+ struct bpf_kfunc_call_arg_meta meta;
u32 i, nargs, func_id, ptr_type_id;
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
+ const struct btf_type *ret_t;
struct btf *desc_btf;
u32 *kfunc_flags;
- bool acq;
/* skip for now, but return error when we find this in fixup_kfunc_call */
if (!insn->imm)
@@ -7687,24 +9092,68 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_name);
return -EACCES;
}
- if (*kfunc_flags & KF_DESTRUCTIVE && !capable(CAP_SYS_BOOT)) {
- verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capabilities\n");
+
+ /* Prepare kfunc call metadata */
+ memset(&meta, 0, sizeof(meta));
+ meta.btf = desc_btf;
+ meta.func_id = func_id;
+ meta.kfunc_flags = *kfunc_flags;
+ meta.func_proto = func_proto;
+ meta.func_name = func_name;
+
+ if (is_kfunc_destructive(&meta) && !capable(CAP_SYS_BOOT)) {
+ verbose(env, "destructive kfunc calls require CAP_SYS_BOOT capability\n");
+ return -EACCES;
+ }
+
+ sleepable = is_kfunc_sleepable(&meta);
+ if (sleepable && !env->prog->aux->sleepable) {
+ verbose(env, "program must be sleepable to call sleepable kfunc %s\n", func_name);
return -EACCES;
}
- acq = *kfunc_flags & KF_ACQUIRE;
+ rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
+ rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
+ if ((rcu_lock || rcu_unlock) && !env->rcu_tag_supported) {
+ verbose(env, "no vmlinux btf rcu tag support for kfunc %s\n", func_name);
+ return -EACCES;
+ }
- meta.flags = *kfunc_flags;
+ if (env->cur_state->active_rcu_lock) {
+ struct bpf_func_state *state;
+ struct bpf_reg_state *reg;
+
+ if (rcu_lock) {
+ verbose(env, "nested rcu read lock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ } else if (rcu_unlock) {
+ bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ if (reg->type & MEM_RCU) {
+ reg->type &= ~(MEM_RCU | PTR_MAYBE_NULL);
+ reg->type |= PTR_UNTRUSTED;
+ }
+ }));
+ env->cur_state->active_rcu_lock = false;
+ } else if (sleepable) {
+ verbose(env, "kernel func %s is sleepable within rcu_read_lock region\n", func_name);
+ return -EACCES;
+ }
+ } else if (rcu_lock) {
+ env->cur_state->active_rcu_lock = true;
+ } else if (rcu_unlock) {
+ verbose(env, "unmatched rcu read unlock (kernel function %s)\n", func_name);
+ return -EINVAL;
+ }
/* Check the arguments */
- err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, &meta);
+ err = check_kfunc_args(env, &meta);
if (err < 0)
return err;
/* In case of release function, we get register number of refcounted
- * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now
+ * PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
- if (err) {
- err = release_reference(env, regs[err].ref_obj_id);
+ if (meta.release_regno) {
+ err = release_reference(env, regs[meta.release_regno].ref_obj_id);
if (err) {
verbose(env, "kfunc %s#%d reference has not been acquired before\n",
func_name, func_id);
@@ -7718,18 +9167,92 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
- if (acq && !btf_type_is_struct_ptr(desc_btf, t)) {
- verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
- return -EINVAL;
+ if (is_kfunc_acquire(&meta) && !btf_type_is_struct_ptr(meta.btf, t)) {
+ /* Only exception is bpf_obj_new_impl */
+ if (meta.btf != btf_vmlinux || meta.func_id != special_kfunc_list[KF_bpf_obj_new_impl]) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
}
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
} else if (btf_type_is_ptr(t)) {
- ptr_type = btf_type_skip_modifiers(desc_btf, t->type,
- &ptr_type_id);
- if (!btf_type_is_struct(ptr_type)) {
+ ptr_type = btf_type_skip_modifiers(desc_btf, t->type, &ptr_type_id);
+
+ if (meta.btf == btf_vmlinux && btf_id_set_contains(&special_kfunc_set, meta.func_id)) {
+ if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ struct btf *ret_btf;
+ u32 ret_btf_id;
+
+ if (unlikely(!bpf_global_ma_set))
+ return -ENOMEM;
+
+ if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
+ verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
+ return -EINVAL;
+ }
+
+ ret_btf = env->prog->aux->btf;
+ ret_btf_id = meta.arg_constant.value;
+
+ /* This may be NULL due to user not supplying a BTF */
+ if (!ret_btf) {
+ verbose(env, "bpf_obj_new requires prog BTF\n");
+ return -EINVAL;
+ }
+
+ ret_t = btf_type_by_id(ret_btf, ret_btf_id);
+ if (!ret_t || !__btf_type_is_struct(ret_t)) {
+ verbose(env, "bpf_obj_new type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = ret_btf;
+ regs[BPF_REG_0].btf_id = ret_btf_id;
+
+ env->insn_aux_data[insn_idx].obj_new_size = ret_t->size;
+ env->insn_aux_data[insn_idx].kptr_struct_meta =
+ btf_find_struct_meta(ret_btf, ret_btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ env->insn_aux_data[insn_idx].kptr_struct_meta =
+ btf_find_struct_meta(meta.arg_obj_drop.btf,
+ meta.arg_obj_drop.btf_id);
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_list_pop_front] ||
+ meta.func_id == special_kfunc_list[KF_bpf_list_pop_back]) {
+ struct btf_field *field = meta.arg_list_head.field;
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | MEM_ALLOC;
+ regs[BPF_REG_0].btf = field->list_head.btf;
+ regs[BPF_REG_0].btf_id = field->list_head.value_btf_id;
+ regs[BPF_REG_0].off = field->list_head.node_offset;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_TRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+ } else if (meta.func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ ret_t = btf_type_by_id(desc_btf, meta.arg_constant.value);
+ if (!ret_t || !btf_type_is_struct(ret_t)) {
+ verbose(env,
+ "kfunc bpf_rdonly_cast type ID argument must be of a struct\n");
+ return -EINVAL;
+ }
+
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | PTR_UNTRUSTED;
+ regs[BPF_REG_0].btf = desc_btf;
+ regs[BPF_REG_0].btf_id = meta.arg_constant.value;
+ } else {
+ verbose(env, "kernel function %s unhandled dynamic return type\n",
+ meta.func_name);
+ return -EFAULT;
+ }
+ } else if (!__btf_type_is_struct(ptr_type)) {
if (!meta.r0_size) {
ptr_type_name = btf_name_by_offset(desc_btf,
ptr_type->name_off);
@@ -7757,20 +9280,24 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
}
- if (*kfunc_flags & KF_RET_NULL) {
+
+ if (is_kfunc_ret_null(&meta)) {
regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
regs[BPF_REG_0].id = ++env->id_gen;
}
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
- if (acq) {
+ if (is_kfunc_acquire(&meta)) {
int id = acquire_reference_state(env, insn_idx);
if (id < 0)
return id;
- regs[BPF_REG_0].id = id;
+ if (is_kfunc_ret_null(&meta))
+ regs[BPF_REG_0].id = id;
regs[BPF_REG_0].ref_obj_id = id;
}
+ if (reg_may_point_to_spin_lock(&regs[BPF_REG_0]) && !regs[BPF_REG_0].id)
+ regs[BPF_REG_0].id = ++env->id_gen;
} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
nargs = btf_type_vlen(func_proto);
@@ -9198,6 +10725,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
return err;
return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
+ } else if (dst_reg->precise) {
+ /* if dst_reg is precise, src_reg should be precise as well */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
}
} else {
/* Pretend the src is a reg with a known value, since we only
@@ -9937,17 +11469,20 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
bool is_null)
{
if (type_may_be_null(reg->type) && reg->id == id &&
- !WARN_ON_ONCE(!reg->id)) {
- if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
- !tnum_equals_const(reg->var_off, 0) ||
- reg->off)) {
- /* Old offset (both fixed and variable parts) should
- * have been known-zero, because we don't allow pointer
- * arithmetic on pointers that might be NULL. If we
- * see this happening, don't convert the register.
- */
+ (is_rcu_reg(reg) || !WARN_ON_ONCE(!reg->id))) {
+ /* Old offset (both fixed and variable parts) should have been
+ * known-zero, because we don't allow pointer arithmetic on
+ * pointers that might be NULL. If we see this happening, don't
+ * convert the register.
+ *
+ * But in some cases, some helpers that return local kptrs
+ * advance offset for the returned pointer. In those cases, it
+ * is fine to expect to see reg->off.
+ */
+ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0)))
+ return;
+ if (reg->type != (PTR_TO_BTF_ID | MEM_ALLOC | PTR_MAYBE_NULL) && WARN_ON_ONCE(reg->off))
return;
- }
if (is_null) {
reg->type = SCALAR_VALUE;
/* We don't need id and ref_obj_id from this point
@@ -10121,6 +11656,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *other_branch;
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
+ struct bpf_reg_state *eq_branch_regs;
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
int pred = -1;
@@ -10230,8 +11766,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
/* detect if we are comparing against a constant value so we can adjust
* our min/max values for our dst register.
* this is only legit if both are scalars (or pointers to the same
- * object, I suppose, but we don't support that right now), because
- * otherwise the different base pointers mean the offsets aren't
+ * object, I suppose, see the PTR_MAYBE_NULL related if block below),
+ * because otherwise the different base pointers mean the offsets aren't
* comparable.
*/
if (BPF_SRC(insn->code) == BPF_X) {
@@ -10280,6 +11816,36 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
}
+ /* if one pointer register is compared to another pointer
+ * register check if PTR_MAYBE_NULL could be lifted.
+ * E.g. register A - maybe null
+ * register B - not null
+ * for JNE A, B, ... - A is not null in the false branch;
+ * for JEQ A, B, ... - A is not null in the true branch.
+ */
+ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_X &&
+ __is_pointer_value(false, src_reg) && __is_pointer_value(false, dst_reg) &&
+ type_may_be_null(src_reg->type) != type_may_be_null(dst_reg->type)) {
+ eq_branch_regs = NULL;
+ switch (opcode) {
+ case BPF_JEQ:
+ eq_branch_regs = other_branch_regs;
+ break;
+ case BPF_JNE:
+ eq_branch_regs = regs;
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+ if (eq_branch_regs) {
+ if (type_may_be_null(src_reg->type))
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->src_reg]);
+ else
+ mark_ptr_not_null_reg(&eq_branch_regs[insn->dst_reg]);
+ }
+ }
+
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
* NOTE: these optimizations below are related with pointer comparison
* which will never be JMP32.
@@ -10386,8 +11952,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) {
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
- if (map_value_has_spin_lock(map))
- dst_reg->id = ++env->id_gen;
+ WARN_ON_ONCE(map->max_entries != 1);
+ /* We want reg->id to be same (0) as map_value is not distinct */
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_IDX) {
dst_reg->type = CONST_PTR_TO_MAP;
@@ -10465,11 +12031,16 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
- if (env->cur_state->active_spin_lock) {
+ if (env->cur_state->active_lock.ptr) {
verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock) {
+ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_rcu_read_lock-ed region\n");
+ return -EINVAL;
+ }
+
if (regs[ctx_reg].type != PTR_TO_CTX) {
verbose(env,
"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -10671,7 +12242,7 @@ static int check_return_code(struct bpf_verifier_env *env)
* 3 let S be a stack
* 4 S.push(v)
* 5 while S is not empty
- * 6 t <- S.pop()
+ * 6 t <- S.peek()
* 7 if t is what we're looking for:
* 8 return t
* 9 for all edges e in G.adjacentEdges(t) do
@@ -10720,11 +12291,16 @@ static struct bpf_verifier_state_list **explored_state(
return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
}
-static void init_explored_state(struct bpf_verifier_env *env, int idx)
+static void mark_prune_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].prune_point = true;
}
+static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].prune_point;
+}
+
enum {
DONE_EXPLORING = 0,
KEEP_EXPLORING = 1,
@@ -10753,9 +12329,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return -EINVAL;
}
- if (e == BRANCH)
+ if (e == BRANCH) {
/* mark branch target for state pruning */
- init_explored_state(env, w);
+ mark_prune_point(env, w);
+ mark_jmp_point(env, w);
+ }
if (insn_state[w] == 0) {
/* tree-edge */
@@ -10782,8 +12360,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
return DONE_EXPLORING;
}
-static int visit_func_call_insn(int t, int insn_cnt,
- struct bpf_insn *insns,
+static int visit_func_call_insn(int t, struct bpf_insn *insns,
struct bpf_verifier_env *env,
bool visit_callee)
{
@@ -10793,10 +12370,12 @@ static int visit_func_call_insn(int t, int insn_cnt,
if (ret)
return ret;
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + 1);
+ /* when we exit from subprog, we need to record non-linear history */
+ mark_jmp_point(env, t + 1);
+
if (visit_callee) {
- init_explored_state(env, t);
+ mark_prune_point(env, t);
ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
/* It's ok to allow recursion from CFG point of
* view. __check_func_call() will do the actual
@@ -10812,13 +12391,13 @@ static int visit_func_call_insn(int t, int insn_cnt,
* DONE_EXPLORING - the instruction was fully explored
* KEEP_EXPLORING - there is still work to be done before it is fully explored
*/
-static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+static int visit_insn(int t, struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int ret;
if (bpf_pseudo_func(insns + t))
- return visit_func_call_insn(t, insn_cnt, insns, env, true);
+ return visit_func_call_insn(t, insns, env, true);
/* All non-branch instructions have a single fall-through edge. */
if (BPF_CLASS(insns[t].code) != BPF_JMP &&
@@ -10831,13 +12410,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
case BPF_CALL:
if (insns[t].imm == BPF_FUNC_timer_set_callback)
- /* Mark this call insn to trigger is_state_visited() check
- * before call itself is processed by __check_func_call().
- * Otherwise new async state will be pushed for further
- * exploration.
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
*/
- init_explored_state(env, t);
- return visit_func_call_insn(t, insn_cnt, insns, env,
+ mark_prune_point(env, t);
+ return visit_func_call_insn(t, insns, env,
insns[t].src_reg == BPF_PSEUDO_CALL);
case BPF_JA:
@@ -10850,22 +12429,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
if (ret)
return ret;
- /* unconditional jmp is not a good pruning point,
- * but it's marked, since backtracking needs
- * to record jmp history in is_state_visited().
- */
- init_explored_state(env, t + insns[t].off + 1);
- /* tell verifier to check for equivalent states
- * after every call and jump
- */
- if (t + 1 < insn_cnt)
- init_explored_state(env, t + 1);
+ mark_prune_point(env, t + insns[t].off + 1);
+ mark_jmp_point(env, t + insns[t].off + 1);
return ret;
default:
/* conditional jump with two edges */
- init_explored_state(env, t);
+ mark_prune_point(env, t);
+
ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret)
return ret;
@@ -10901,7 +12473,7 @@ static int check_cfg(struct bpf_verifier_env *env)
while (env->cfg.cur_stack > 0) {
int t = insn_stack[env->cfg.cur_stack - 1];
- ret = visit_insn(t, insn_cnt, env);
+ ret = visit_insn(t, env);
switch (ret) {
case DONE_EXPLORING:
insn_state[t] = EXPLORED;
@@ -11492,15 +13064,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0;
- if (rold->type == PTR_TO_STACK)
- /* two stack pointers are equal only if they're pointing to
- * the same stack frame, since fp-8 in foo != fp-8 in bar
- */
- return equal && rold->frameno == rcur->frameno;
-
- if (equal)
- return true;
-
if (rold->type == NOT_INIT)
/* explored state can't have used this */
return true;
@@ -11508,10 +13071,12 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return false;
switch (base_type(rold->type)) {
case SCALAR_VALUE:
+ if (equal)
+ return true;
if (env->explore_alu_limits)
return false;
if (rcur->type == SCALAR_VALUE) {
- if (!rold->precise && !rcur->precise)
+ if (!rold->precise)
return true;
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
@@ -11554,7 +13119,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
*/
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
range_within(rold, rcur) &&
- tnum_in(rold->var_off, rcur->var_off);
+ tnum_in(rold->var_off, rcur->var_off) &&
+ check_ids(rold->id, rcur->id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
if (rcur->type != rold->type)
@@ -11578,20 +13144,14 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_CTX:
- case CONST_PTR_TO_MAP:
- case PTR_TO_PACKET_END:
- case PTR_TO_FLOW_KEYS:
- case PTR_TO_SOCKET:
- case PTR_TO_SOCK_COMMON:
- case PTR_TO_TCP_SOCK:
- case PTR_TO_XDP_SOCK:
- /* Only valid matches are exact, which memcmp() above
- * would have accepted
+ case PTR_TO_STACK:
+ /* two stack pointers are equal only if they're pointing to
+ * the same stack frame, since fp-8 in foo != fp-8 in bar
*/
+ return equal && rold->frameno == rcur->frameno;
default:
- /* Don't know what's going on, just say it's not safe */
- return false;
+ /* Only valid matches are exact, which memcmp() */
+ return equal;
}
/* Shouldn't get here; if we do, say it's not safe */
@@ -11701,7 +13261,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
{
int i;
- memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
for (i = 0; i < MAX_BPF_REG; i++)
if (!regsafe(env, &old->regs[i], &cur->regs[i],
env->idmap_scratch))
@@ -11725,13 +13284,28 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->curframe != cur->curframe)
return false;
+ memset(env->idmap_scratch, 0, sizeof(env->idmap_scratch));
+
/* Verification state from speculative execution simulation
* must never prune a non-speculative execution one.
*/
if (old->speculative && !cur->speculative)
return false;
- if (old->active_spin_lock != cur->active_spin_lock)
+ if (old->active_lock.ptr != cur->active_lock.ptr)
+ return false;
+
+ /* Old and cur active_lock's have to be either both present
+ * or both absent.
+ */
+ if (!!old->active_lock.id != !!cur->active_lock.id)
+ return false;
+
+ if (old->active_lock.id &&
+ !check_ids(old->active_lock.id, cur->active_lock.id, env->idmap_scratch))
+ return false;
+
+ if (old->active_rcu_lock != cur->active_rcu_lock)
return false;
/* for states to be equal callsites have to be the same
@@ -11834,34 +13408,36 @@ static int propagate_precision(struct bpf_verifier_env *env,
{
struct bpf_reg_state *state_reg;
struct bpf_func_state *state;
- int i, err = 0;
+ int i, err = 0, fr;
- state = old->frame[old->curframe];
- state_reg = state->regs;
- for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating r%d\n", i);
- err = mark_chain_precision(env, i);
- if (err < 0)
- return err;
- }
+ for (fr = old->curframe; fr >= 0; fr--) {
+ state = old->frame[fr];
+ state_reg = state->regs;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame %d: propagating r%d\n", i, fr);
+ err = mark_chain_precision_frame(env, fr, i);
+ if (err < 0)
+ return err;
+ }
- for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
- if (!is_spilled_reg(&state->stack[i]))
- continue;
- state_reg = &state->stack[i].spilled_ptr;
- if (state_reg->type != SCALAR_VALUE ||
- !state_reg->precise)
- continue;
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "propagating fp%d\n",
- (-i - 1) * BPF_REG_SIZE);
- err = mark_chain_precision_stack(env, i);
- if (err < 0)
- return err;
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (!is_spilled_reg(&state->stack[i]))
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame %d: propagating fp%d\n",
+ (-i - 1) * BPF_REG_SIZE, fr);
+ err = mark_chain_precision_stack_frame(env, fr, i);
+ if (err < 0)
+ return err;
+ }
}
return 0;
}
@@ -11893,13 +13469,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
int i, j, err, states_cnt = 0;
bool add_new_state = env->test_state_freq ? true : false;
- cur->last_insn_idx = env->prev_insn_idx;
- if (!env->insn_aux_data[insn_idx].prune_point)
- /* this 'insn_idx' instruction wasn't marked, so we will not
- * be doing state search here
- */
- return 0;
-
/* bpf progs typically have pruning point every 4 instructions
* http://vger.kernel.org/bpfconf2019.html#session-1
* Do not add new state for future pruning if the verifier hasn't seen
@@ -12034,10 +13603,10 @@ next:
env->max_states_per_insn = states_cnt;
if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return push_jmp_history(env, cur);
+ return 0;
if (!add_new_state)
- return push_jmp_history(env, cur);
+ return 0;
/* There were no equivalent states, remember the current one.
* Technically the current state is not proven to be safe yet,
@@ -12056,6 +13625,10 @@ next:
env->prev_jmps_processed = env->jmps_processed;
env->prev_insn_processed = env->insn_processed;
+ /* forget precise markings we inherited, see __mark_chain_precision */
+ if (env->bpf_capable)
+ mark_all_scalars_imprecise(env, cur);
+
/* add new state to the head of linked list */
new = &new_sl->state;
err = copy_verifier_state(new, cur);
@@ -12173,21 +13746,31 @@ static int do_check(struct bpf_verifier_env *env)
return -E2BIG;
}
- err = is_state_visited(env, env->insn_idx);
- if (err < 0)
- return err;
- if (err == 1) {
- /* found equivalent state, can prune the search */
- if (env->log.level & BPF_LOG_LEVEL) {
- if (do_print_state)
- verbose(env, "\nfrom %d to %d%s: safe\n",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- else
- verbose(env, "%d: safe\n", env->insn_idx);
+ state->last_insn_idx = env->prev_insn_idx;
+
+ if (is_prune_point(env, env->insn_idx)) {
+ err = is_state_visited(env, env->insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (env->log.level & BPF_LOG_LEVEL) {
+ if (do_print_state)
+ verbose(env, "\nfrom %d to %d%s: safe\n",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ else
+ verbose(env, "%d: safe\n", env->insn_idx);
+ }
+ goto process_bpf_exit;
}
- goto process_bpf_exit;
+ }
+
+ if (is_jmp_point(env, env->insn_idx)) {
+ err = push_jmp_history(env, state);
+ if (err)
+ return err;
}
if (signal_pending(current))
@@ -12370,11 +13953,14 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_spin_lock &&
- (insn->src_reg == BPF_PSEUDO_CALL ||
- insn->imm != BPF_FUNC_spin_unlock)) {
- verbose(env, "function calls are not allowed while holding a lock\n");
- return -EINVAL;
+ if (env->cur_state->active_lock.ptr) {
+ if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
+ (insn->src_reg == BPF_PSEUDO_CALL) ||
+ (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+ (insn->off != 0 || !is_bpf_list_api_kfunc(insn->imm)))) {
+ verbose(env, "function calls are not allowed while holding a lock\n");
+ return -EINVAL;
+ }
}
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
@@ -12407,11 +13993,16 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (env->cur_state->active_spin_lock) {
+ if (env->cur_state->active_lock.ptr) {
verbose(env, "bpf_spin_unlock is missing\n");
return -EINVAL;
}
+ if (env->cur_state->active_rcu_lock) {
+ verbose(env, "bpf_rcu_read_unlock is missing\n");
+ return -EINVAL;
+ }
+
/* We must do check_reference_leak here before
* prepare_func_exit to handle the case when
* state->curframe > 0, it may be a callback
@@ -12664,7 +14255,14 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
{
enum bpf_prog_type prog_type = resolve_prog_type(prog);
- if (map_value_has_spin_lock(map)) {
+ if (btf_record_has_field(map->record, BPF_LIST_HEAD)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_list_head yet\n");
+ return -EINVAL;
+ }
+ }
+
+ if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) {
if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) {
verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n");
return -EINVAL;
@@ -12681,7 +14279,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
- if (map_value_has_timer(map)) {
+ if (btf_record_has_field(map->record, BPF_TIMER)) {
if (is_tracing_prog_type(prog_type)) {
verbose(env, "tracing progs cannot use bpf_timer yet\n");
return -EINVAL;
@@ -12714,10 +14312,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_INODE_STORAGE:
case BPF_MAP_TYPE_SK_STORAGE:
case BPF_MAP_TYPE_TASK_STORAGE:
+ case BPF_MAP_TYPE_CGRP_STORAGE:
break;
default:
verbose(env,
- "Sleepable programs can only use array, hash, and ringbuf maps\n");
+ "Sleepable programs can only use array, hash, ringbuf and local storage maps\n");
return -EINVAL;
}
@@ -13373,6 +14972,10 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
if (!bpf_jit_needs_zext() && !is_cmpxchg_insn(&insn))
continue;
+ /* Zero-extension is done by the caller. */
+ if (bpf_pseudo_kfunc_call(&insn))
+ continue;
+
if (WARN_ON(load_reg == -1)) {
verbose(env, "verifier bug. zext_dst is set, but no reg is defined\n");
return -EFAULT;
@@ -13500,6 +15103,13 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
break;
case PTR_TO_BTF_ID:
case PTR_TO_BTF_ID | PTR_UNTRUSTED:
+ /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike
+ * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot
+ * be said once it is marked PTR_UNTRUSTED, hence we must handle
+ * any faults for loads into such types. BPF_WRITE is disallowed
+ * for this case.
+ */
+ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED:
if (type == BPF_READ) {
insn->code = BPF_LDX | BPF_PROBE_MEM |
BPF_SIZE((insn)->code);
@@ -13865,8 +15475,8 @@ static int fixup_call_args(struct bpf_verifier_env *env)
return err;
}
-static int fixup_kfunc_call(struct bpf_verifier_env *env,
- struct bpf_insn *insn)
+static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ struct bpf_insn *insn_buf, int insn_idx, int *cnt)
{
const struct bpf_kfunc_desc *desc;
@@ -13876,7 +15486,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
}
/* insn->imm has the btf func_id. Replace it with
- * an address (relative to __bpf_base_call).
+ * an address (relative to __bpf_call_base).
*/
desc = find_kfunc_desc(env->prog, insn->imm, insn->off);
if (!desc) {
@@ -13885,8 +15495,33 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
return -EFAULT;
}
+ *cnt = 0;
insn->imm = desc->imm;
-
+ if (insn->off)
+ return 0;
+ if (desc->func_id == special_kfunc_list[KF_bpf_obj_new_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+ u64 obj_new_size = env->insn_aux_data[insn_idx].obj_new_size;
+
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_1, obj_new_size);
+ insn_buf[1] = addr[0];
+ insn_buf[2] = addr[1];
+ insn_buf[3] = *insn;
+ *cnt = 4;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_obj_drop_impl]) {
+ struct btf_struct_meta *kptr_struct_meta = env->insn_aux_data[insn_idx].kptr_struct_meta;
+ struct bpf_insn addr[2] = { BPF_LD_IMM64(BPF_REG_2, (long)kptr_struct_meta) };
+
+ insn_buf[0] = addr[0];
+ insn_buf[1] = addr[1];
+ insn_buf[2] = *insn;
+ *cnt = 3;
+ } else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+ desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+ insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
+ *cnt = 1;
+ }
return 0;
}
@@ -14028,9 +15663,19 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
if (insn->src_reg == BPF_PSEUDO_CALL)
continue;
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
- ret = fixup_kfunc_call(env, insn);
+ ret = fixup_kfunc_call(env, insn, insn_buf, i + delta, &cnt);
if (ret)
return ret;
+ if (cnt == 0)
+ continue;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
continue;
}
@@ -14148,13 +15793,12 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
- if (insn->imm == BPF_FUNC_task_storage_get ||
- insn->imm == BPF_FUNC_sk_storage_get ||
- insn->imm == BPF_FUNC_inode_storage_get) {
- if (env->prog->aux->sleepable)
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
- else
+ if (is_storage_get_function(insn->imm)) {
+ if (!env->prog->aux->sleepable ||
+ env->insn_aux_data[i + delta].storage_get_func_atomic)
insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
insn_buf[1] = *insn;
cnt = 2;
@@ -14224,7 +15868,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_redirect,
- (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
+ (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL));
BUILD_BUG_ON(!__same_type(ops->map_for_each_callback,
(int (*)(struct bpf_map *map,
bpf_callback_t callback_fn,
@@ -14603,6 +16247,8 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
BPF_MAIN_FUNC /* callsite */,
0 /* frameno */,
subprog);
+ state->first_insn_idx = env->subprog_info[subprog].start;
+ state->last_insn_idx = -1;
regs = state->frame[state->curframe]->regs;
if (subprog || env->prog->type == BPF_PROG_TYPE_EXT) {
@@ -15012,12 +16658,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
ret = -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_TRACING:
- /* fentry/fexit/fmod_ret progs can be sleepable only if they are
+
+ /* fentry/fexit/fmod_ret progs can be sleepable if they are
* attached to ALLOW_ERROR_INJECTION and are not in denylist.
*/
if (!check_non_sleepable_error_inject(btf_id) &&
within_error_injection_list(addr))
ret = 0;
+ /* fentry/fexit/fmod_ret progs can also be sleepable if they are
+ * in the fmodret id set with the KF_SLEEPABLE flag.
+ */
+ else {
+ u32 *flags = btf_kfunc_is_modify_return(btf, btf_id);
+
+ if (flags && (*flags & KF_SLEEPABLE))
+ ret = 0;
+ }
break;
case BPF_PROG_TYPE_LSM:
/* LSM progs check that they are attached to bpf_lsm_*() funcs.
@@ -15038,7 +16694,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
bpf_log(log, "can't modify return codes of BPF programs\n");
return -EINVAL;
}
- ret = check_attach_modify_return(addr, tname);
+ ret = -EINVAL;
+ if (btf_kfunc_is_modify_return(btf, btf_id) ||
+ !check_attach_modify_return(addr, tname))
+ ret = 0;
if (ret) {
bpf_log(log, "%s() is not modifiable\n", tname);
return ret;
@@ -15227,10 +16886,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
env->allow_ptr_leaks = bpf_allow_ptr_leaks();
env->allow_uninit_stack = bpf_allow_uninit_stack();
- env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access();
env->bypass_spec_v1 = bpf_bypass_spec_v1();
env->bypass_spec_v4 = bpf_bypass_spec_v4();
env->bpf_capable = bpf_capable();
+ env->rcu_tag_supported = btf_vmlinux &&
+ btf_find_by_name_kind(btf_vmlinux, "rcu", BTF_KIND_TYPE_TAG) > 0;
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
diff --git a/kernel/capability.c b/kernel/capability.c
index 765194f5d678..860fd22117c1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -489,8 +489,8 @@ bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
struct user_namespace *mnt_userns,
const struct inode *inode)
{
- return kuid_has_mapping(ns, i_uid_into_mnt(mnt_userns, inode)) &&
- kgid_has_mapping(ns, i_gid_into_mnt(mnt_userns, inode));
+ return vfsuid_has_mapping(ns, i_uid_into_vfsuid(mnt_userns, inode)) &&
+ vfsgid_has_mapping(ns, i_gid_into_vfsgid(mnt_userns, inode));
}
/**
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index fd4020835ec6..367b0a42ada9 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -167,7 +167,6 @@ struct cgroup_mgctx {
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
-extern struct file_system_type cgroup_fs_type;
/* iterate across the hierarchies */
#define for_each_root(root) \
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 2319946715e0..c099cf3fa02d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -248,6 +248,12 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+#ifdef CONFIG_DEBUG_CGROUP_REF
+#define CGROUP_REF_FN_ATTRS noinline
+#define CGROUP_REF_EXPORT(fn) EXPORT_SYMBOL_GPL(fn);
+#include <linux/cgroup_refcnt.h>
+#endif
+
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
* @ssid: subsys ID of interest
@@ -2860,14 +2866,12 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
* take an rcu_read_lock.
*/
spin_lock_irq(&css_set_lock);
- rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_task(task, mgctx);
if (!threadgroup)
break;
} while_each_thread(leader, task);
- rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
return cgroup_migrate_execute(mgctx);
@@ -5349,6 +5353,7 @@ static void css_free_rwork_fn(struct work_struct *work)
atomic_dec(&cgrp->root->nr_cgrps);
cgroup1_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
+ bpf_cgrp_storage_free(cgrp);
if (cgroup_parent(cgrp)) {
/*
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index b474289c15b8..a29c0b13706b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -550,11 +550,15 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_rwsem held. The check can be skipped
+ * if on default hierarchy.
*/
-static void cpuset_update_task_spread_flag(struct cpuset *cs,
+static void cpuset_update_task_spread_flags(struct cpuset *cs,
struct task_struct *tsk)
{
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ return;
+
if (is_spread_page(cs))
task_set_spread_page(tsk);
else
@@ -2153,7 +2157,7 @@ static void update_tasks_flags(struct cpuset *cs)
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
css_task_iter_end(&it);
}
@@ -2509,12 +2513,28 @@ static void cpuset_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
+ bool cpus_updated, mems_updated;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
+ cpus_updated = !cpumask_equal(cs->effective_cpus,
+ oldcs->effective_cpus);
+ mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
+
+ /*
+ * In the default hierarchy, enabling cpuset in the child cgroups
+ * will trigger a number of cpuset_attach() calls with no change
+ * in effective cpus and mems. In that case, we can optimize out
+ * by skipping the task iteration and update.
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !cpus_updated && !mems_updated) {
+ cpuset_attach_nodemask_to = cs->effective_mems;
+ goto out;
+ }
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2530,14 +2550,19 @@ static void cpuset_attach(struct cgroup_taskset *tset)
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, task);
+ cpuset_update_task_spread_flags(cs, task);
}
/*
* Change mm for all threadgroup leaders. This is expensive and may
- * sleep and should be moved outside migration path proper.
+ * sleep and should be moved outside migration path proper. Skip it
+ * if there is no change in effective_mems and CS_MEMORY_MIGRATE is
+ * not set.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
+ if (!is_memory_migrate(cs) && !mems_updated)
+ goto out;
+
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);
@@ -2560,6 +2585,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
}
}
+out:
cs->old_mems_allowed = cpuset_attach_nodemask_to;
cs->attach_in_progress--;
@@ -3046,11 +3072,15 @@ static struct cftype dfl_files[] = {
};
-/*
- * cpuset_css_alloc - allocate a cpuset css
- * cgrp: control group that the new cpuset will be part of
+/**
+ * cpuset_css_alloc - Allocate a cpuset css
+ * @parent_css: Parent css of the control group that the new cpuset will be
+ * part of
+ * Return: cpuset css on success, -ENOMEM on failure.
+ *
+ * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
+ * top cpuset css otherwise.
*/
-
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -3630,11 +3660,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
return NOTIFY_OK;
}
-static struct notifier_block cpuset_track_online_nodes_nb = {
- .notifier_call = cpuset_track_online_nodes,
- .priority = 10, /* ??! */
-};
-
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -3652,7 +3677,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+ hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
index 8a44b93da0f3..c2f9c912df1c 100644
--- a/kernel/configs/tiny.config
+++ b/kernel/configs/tiny.config
@@ -7,5 +7,6 @@ CONFIG_KERNEL_XZ=y
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
# CONFIG_SLAB is not set
-# CONFIG_SLUB is not set
-CONFIG_SLOB=y
+# CONFIG_SLOB_DEPRECATED is not set
+CONFIG_SLUB=y
+CONFIG_SLUB_TINY=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bbad5e375d3b..6c0a92ca6bb5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -663,21 +663,51 @@ static bool cpuhp_next_state(bool bringup,
return true;
}
-static int cpuhp_invoke_callback_range(bool bringup,
- unsigned int cpu,
- struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
+static int __cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target,
+ bool nofail)
{
enum cpuhp_state state;
- int err = 0;
+ int ret = 0;
while (cpuhp_next_state(bringup, &state, st, target)) {
+ int err;
+
err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
- if (err)
+ if (!err)
+ continue;
+
+ if (nofail) {
+ pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
+ cpu, bringup ? "UP" : "DOWN",
+ cpuhp_get_step(st->state)->name,
+ st->state, err);
+ ret = -1;
+ } else {
+ ret = err;
break;
+ }
}
- return err;
+ return ret;
+}
+
+static inline int cpuhp_invoke_callback_range(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ return __cpuhp_invoke_callback_range(bringup, cpu, st, target, false);
+}
+
+static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
+ unsigned int cpu,
+ struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ __cpuhp_invoke_callback_range(bringup, cpu, st, target, true);
}
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
@@ -999,7 +1029,6 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
- int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -1012,13 +1041,10 @@ static int take_cpu_down(void *_param)
*/
WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
- /* Invoke the former CPU_DYING callbacks */
- ret = cpuhp_invoke_callback_range(false, cpu, st, target);
-
/*
- * DYING must not fail!
+ * Invoke the former CPU_DYING callbacks. DYING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(false, cpu, st, target);
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -1296,16 +1322,14 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
- int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
- ret = cpuhp_invoke_callback_range(true, cpu, st, target);
/*
* STARTING must not fail!
*/
- WARN_ON_ONCE(ret);
+ cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
}
/*
@@ -2326,8 +2350,10 @@ static ssize_t target_store(struct device *dev, struct device_attribute *attr,
if (st->state < target)
ret = cpu_up(dev->id, target);
- else
+ else if (st->state > target)
ret = cpu_down(dev->id, target);
+ else if (WARN_ON(st->target != target))
+ st->target = target;
out:
unlock_device_hotplug();
return ret ? ret : count;
@@ -2688,6 +2714,7 @@ void __init boot_cpu_hotplug_init(void)
cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
#endif
this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
+ this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}
/*
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index a0eb4d5cf557..87ef6096823f 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -383,6 +383,9 @@ void vmcoreinfo_append_str(const char *fmt, ...)
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
vmcoreinfo_size += r;
+
+ WARN_ONCE(vmcoreinfo_size == VMCOREINFO_BYTES,
+ "vmcoreinfo data exceeds allocated size, truncating");
}
/*
diff --git a/kernel/cred.c b/kernel/cred.c
index e10c15f51c1f..811ad654abd1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -701,9 +701,9 @@ void __init cred_init(void)
* override a task's own credentials so that work can be done on behalf of that
* task that requires a different subjective context.
*
- * @daemon is used to provide a base for the security record, but can be NULL.
- * If @daemon is supplied, then the security data will be derived from that;
- * otherwise they'll be set to 0 and no groups, full capabilities and no keys.
+ * @daemon is used to provide a base cred, with the security data derived from
+ * that; if this is "&init_task", they'll be set to 0, no groups, full
+ * capabilities, and no keys.
*
* The caller may change these controls afterwards if desired.
*
@@ -714,17 +714,16 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
const struct cred *old;
struct cred *new;
+ if (WARN_ON_ONCE(!daemon))
+ return NULL;
+
new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
if (!new)
return NULL;
kdebug("prepare_kernel_cred() alloc %p", new);
- if (daemon)
- old = get_task_cred(daemon);
- else
- old = get_cred(&init_cred);
-
+ old = get_task_cred(daemon);
validate_creds(old);
*new = *old;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 67d3c48a1522..5c7e9ba7cd6b 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -545,6 +545,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
{
struct console *c;
const char *cp;
+ int cookie;
int len;
if (msg_len == 0)
@@ -558,8 +559,20 @@ static void kdb_msg_write(const char *msg, int msg_len)
cp++;
}
- for_each_console(c) {
- if (!(c->flags & CON_ENABLED))
+ /*
+ * The console_srcu_read_lock() only provides safe console list
+ * traversal. The use of the ->write() callback relies on all other
+ * CPUs being stopped at the moment and console drivers being able to
+ * handle reentrance when @oops_in_progress is set.
+ *
+ * There is no guarantee that every console driver can handle
+ * reentrance in this way; the developer deploying the debugger
+ * is responsible for ensuring that the console drivers they
+ * have selected handle reentrance appropriately.
+ */
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if (!(console_srcu_read_flags(c) & CON_ENABLED))
continue;
if (c == dbg_io_ops->cons)
continue;
@@ -577,6 +590,7 @@ static void kdb_msg_write(const char *msg, int msg_len)
--oops_in_progress;
touch_nmi_watchdog();
}
+ console_srcu_read_unlock(cookie);
}
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 33437d620644..68106e3791f6 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -498,6 +498,14 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
WARN_ON_ONCE(!dev->coherent_dma_mask);
+ /*
+ * DMA allocations can never be turned back into a page pointer, so
+ * requesting compound pages doesn't make sense (and can't even be
+ * supported at all by various backends).
+ */
+ if (WARN_ON_ONCE(flag & __GFP_COMP))
+ return NULL;
+
if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
return cpu_addr;
@@ -552,6 +560,8 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
return NULL;
if (WARN_ON_ONCE(gfp & (__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM)))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
@@ -637,6 +647,8 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (WARN_ON_ONCE(attrs & ~DMA_ATTR_ALLOC_SINGLE_PAGES))
return NULL;
+ if (WARN_ON_ONCE(gfp & __GFP_COMP))
+ return NULL;
if (ops && ops->alloc_noncontiguous)
sgt = ops->alloc_noncontiguous(dev, size, dir, gfp, attrs);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 339a990554e7..a34c38bbe28f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -300,6 +300,37 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
return;
}
+static void *swiotlb_memblock_alloc(unsigned long nslabs, unsigned int flags,
+ int (*remap)(void *tlb, unsigned long nslabs))
+{
+ size_t bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
+ void *tlb;
+
+ /*
+ * By default allocate the bounce buffer memory from low memory, but
+ * allow to pick a location everywhere for hypervisors with guest
+ * memory encryption.
+ */
+ if (flags & SWIOTLB_ANY)
+ tlb = memblock_alloc(bytes, PAGE_SIZE);
+ else
+ tlb = memblock_alloc_low(bytes, PAGE_SIZE);
+
+ if (!tlb) {
+ pr_warn("%s: Failed to allocate %zu bytes tlb structure\n",
+ __func__, bytes);
+ return NULL;
+ }
+
+ if (remap && remap(tlb, nslabs) < 0) {
+ memblock_free(tlb, PAGE_ALIGN(bytes));
+ pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
+ return NULL;
+ }
+
+ return tlb;
+}
+
/*
* Statically reserve bounce buffer space and initialize bounce buffer data
* structures for the software IO TLB used to implement the DMA API.
@@ -310,7 +341,6 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
struct io_tlb_mem *mem = &io_tlb_default_mem;
unsigned long nslabs;
size_t alloc_size;
- size_t bytes;
void *tlb;
if (!addressing_limit && !swiotlb_force_bounce)
@@ -326,31 +356,16 @@ void __init swiotlb_init_remap(bool addressing_limit, unsigned int flags,
swiotlb_adjust_nareas(num_possible_cpus());
nslabs = default_nslabs;
- /*
- * By default allocate the bounce buffer memory from low memory, but
- * allow to pick a location everywhere for hypervisors with guest
- * memory encryption.
- */
-retry:
- bytes = PAGE_ALIGN(nslabs << IO_TLB_SHIFT);
- if (flags & SWIOTLB_ANY)
- tlb = memblock_alloc(bytes, PAGE_SIZE);
- else
- tlb = memblock_alloc_low(bytes, PAGE_SIZE);
- if (!tlb) {
- pr_warn("%s: failed to allocate tlb structure\n", __func__);
- return;
- }
-
- if (remap && remap(tlb, nslabs) < 0) {
- memblock_free(tlb, PAGE_ALIGN(bytes));
-
+ while ((tlb = swiotlb_memblock_alloc(nslabs, flags, remap)) == NULL) {
+ if (nslabs <= IO_TLB_MIN_SLABS)
+ return;
nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE);
- if (nslabs >= IO_TLB_MIN_SLABS)
- goto retry;
+ }
- pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes);
- return;
+ if (default_nslabs != nslabs) {
+ pr_info("SWIOTLB bounce buffer size adjusted %lu -> %lu slabs",
+ default_nslabs, nslabs);
+ default_nslabs = nslabs;
}
alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs));
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4ec3717003d5..eacc3702654d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -155,12 +155,6 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -184,6 +178,14 @@ static bool is_kernel_event(struct perf_event *event)
return READ_ONCE(event->owner) == TASK_TOMBSTONE;
}
+static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+struct perf_event_context *perf_cpu_task_ctx(void)
+{
+ lockdep_assert_irqs_disabled();
+ return this_cpu_ptr(&perf_cpu_context)->task_ctx;
+}
+
/*
* On task ctx scheduling...
*
@@ -217,7 +219,7 @@ static int event_function(void *info)
struct event_function_struct *efs = info;
struct perf_event *event = efs->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
@@ -314,7 +316,7 @@ again:
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
@@ -388,7 +390,6 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -448,7 +449,7 @@ static void update_perf_cpu_limits(void)
WRITE_ONCE(perf_sample_allowed_ns, tmp);
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
int perf_proc_update_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
@@ -571,12 +572,6 @@ void perf_sample_event_took(u64 sample_len_ns)
static atomic64_t perf_event_id;
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -691,13 +686,31 @@ do { \
___p; \
})
+static void perf_ctx_disable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_disable(pmu_ctx->pmu);
+}
+
+static void perf_ctx_enable(struct perf_event_context *ctx)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ perf_pmu_enable(pmu_ctx->pmu);
+}
+
+static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
+static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
perf_cgroup_match(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
/* @event doesn't care about cgroup */
if (!event->cgrp)
@@ -823,54 +836,39 @@ perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
}
}
-static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-
/*
* reschedule events based on the cgroup constraint of task.
*/
static void perf_cgroup_switch(struct task_struct *task)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_cgroup *cgrp;
- struct perf_cpu_context *cpuctx, *tmp;
- struct list_head *list;
- unsigned long flags;
-
- /*
- * Disable interrupts and preemption to avoid this CPU's
- * cgrp_cpuctx_entry to change under us.
- */
- local_irq_save(flags);
cgrp = perf_cgroup_from_task(task, NULL);
- list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
- WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
- if (READ_ONCE(cpuctx->cgrp) == cgrp)
- continue;
-
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ return;
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to update_cgrp_time_from_cpuctx() in
- * ctx_sched_out()
- */
- cpuctx->cgrp = cgrp;
- /*
- * set cgrp before ctxsw in to allow
- * perf_cgroup_set_timestamp() in ctx_sched_in()
- * to not have to pass task around
- */
- cpu_ctx_sched_in(cpuctx, EVENT_ALL);
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_ctx_disable(&cpuctx->ctx);
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
- }
+ ctx_sched_out(&cpuctx->ctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
+ */
+ cpuctx->cgrp = cgrp;
+ /*
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
+ */
+ ctx_sched_in(&cpuctx->ctx, EVENT_ALL);
- local_irq_restore(flags);
+ perf_ctx_enable(&cpuctx->ctx);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
static int perf_cgroup_ensure_storage(struct perf_event *event,
@@ -888,7 +886,7 @@ static int perf_cgroup_ensure_storage(struct perf_event *event,
heap_size++;
for_each_possible_cpu(cpu) {
- cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
if (heap_size <= cpuctx->heap_size)
continue;
@@ -972,8 +970,6 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
return;
cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
- list_add(&cpuctx->cgrp_cpuctx_entry,
- per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}
static inline void
@@ -994,7 +990,6 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
return;
cpuctx->cgrp = NULL;
- list_del(&cpuctx->cgrp_cpuctx_entry);
}
#else /* !CONFIG_CGROUP_PERF */
@@ -1069,34 +1064,30 @@ static void perf_cgroup_switch(struct task_struct *task)
*/
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_pmu_context *cpc;
bool rotations;
lockdep_assert_irqs_disabled();
- cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
- rotations = perf_rotate_context(cpuctx);
+ cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
+ rotations = perf_rotate_context(cpc);
- raw_spin_lock(&cpuctx->hrtimer_lock);
+ raw_spin_lock(&cpc->hrtimer_lock);
if (rotations)
- hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+ hrtimer_forward_now(hr, cpc->hrtimer_interval);
else
- cpuctx->hrtimer_active = 0;
- raw_spin_unlock(&cpuctx->hrtimer_lock);
+ cpc->hrtimer_active = 0;
+ raw_spin_unlock(&cpc->hrtimer_lock);
return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}
-static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
+ struct pmu *pmu = cpc->epc.pmu;
u64 interval;
- /* no multiplexing needed for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return;
-
/*
* check default is sane, if not set then force to
* default interval (1/tick)
@@ -1105,34 +1096,34 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
if (interval < 1)
interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
- raw_spin_lock_init(&cpuctx->hrtimer_lock);
+ raw_spin_lock_init(&cpc->hrtimer_lock);
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
timer->function = perf_mux_hrtimer_handler;
}
-static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
- struct hrtimer *timer = &cpuctx->hrtimer;
- struct pmu *pmu = cpuctx->ctx.pmu;
+ struct hrtimer *timer = &cpc->hrtimer;
unsigned long flags;
- /* not for SW PMU */
- if (pmu->task_ctx_nr == perf_sw_context)
- return 0;
-
- raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
- if (!cpuctx->hrtimer_active) {
- cpuctx->hrtimer_active = 1;
- hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+ raw_spin_lock_irqsave(&cpc->hrtimer_lock, flags);
+ if (!cpc->hrtimer_active) {
+ cpc->hrtimer_active = 1;
+ hrtimer_forward_now(timer, cpc->hrtimer_interval);
hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
}
- raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
+ raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, flags);
return 0;
}
+static int perf_mux_hrtimer_restart_ipi(void *arg)
+{
+ return perf_mux_hrtimer_restart(arg);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1147,32 +1138,9 @@ void perf_pmu_enable(struct pmu *pmu)
pmu->pmu_enable(pmu);
}
-static DEFINE_PER_CPU(struct list_head, active_ctx_list);
-
-/*
- * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
- * perf_event_task_tick() are fully serialized because they're strictly cpu
- * affine and perf_event_ctx{activate,deactivate} are called with IRQs
- * disabled, while perf_event_task_tick is called from IRQ context.
- */
-static void perf_event_ctx_activate(struct perf_event_context *ctx)
-{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
-
- lockdep_assert_irqs_disabled();
-
- WARN_ON(!list_empty(&ctx->active_ctx_list));
-
- list_add(&ctx->active_ctx_list, head);
-}
-
-static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
+static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- lockdep_assert_irqs_disabled();
-
- WARN_ON(list_empty(&ctx->active_ctx_list));
-
- list_del_init(&ctx->active_ctx_list);
+ WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
}
static void get_ctx(struct perf_event_context *ctx)
@@ -1199,7 +1167,6 @@ static void free_ctx(struct rcu_head *head)
struct perf_event_context *ctx;
ctx = container_of(head, struct perf_event_context, rcu_head);
- free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
kfree(ctx);
}
@@ -1384,7 +1351,7 @@ static u64 primary_event_id(struct perf_event *event)
* the context could get moved to another task.
*/
static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
struct perf_event_context *ctx;
@@ -1400,7 +1367,7 @@ retry:
*/
local_irq_save(*flags);
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (ctx) {
/*
* If this context is a clone of another, it might
@@ -1413,7 +1380,7 @@ retry:
* can't get swapped on us any more.
*/
raw_spin_lock(&ctx->lock);
- if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
+ if (ctx != rcu_dereference(task->perf_event_ctxp)) {
raw_spin_unlock(&ctx->lock);
rcu_read_unlock();
local_irq_restore(*flags);
@@ -1440,12 +1407,12 @@ retry:
* reference count so that the context can't get freed.
*/
static struct perf_event_context *
-perf_pin_task_context(struct task_struct *task, int ctxn)
+perf_pin_task_context(struct task_struct *task)
{
struct perf_event_context *ctx;
unsigned long flags;
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1593,14 +1560,22 @@ static inline struct cgroup *event_cgroup(const struct perf_event *event)
* which provides ordering when rotating groups for the same CPU.
*/
static __always_inline int
-perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
- const u64 left_group_index, const struct perf_event *right)
+perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
+ const struct cgroup *left_cgroup, const u64 left_group_index,
+ const struct perf_event *right)
{
if (left_cpu < right->cpu)
return -1;
if (left_cpu > right->cpu)
return 1;
+ if (left_pmu) {
+ if (left_pmu < right->pmu_ctx->pmu)
+ return -1;
+ if (left_pmu > right->pmu_ctx->pmu)
+ return 1;
+ }
+
#ifdef CONFIG_CGROUP_PERF
{
const struct cgroup *right_cgroup = event_cgroup(right);
@@ -1643,12 +1618,13 @@ perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
struct perf_event *e = __node_2_pe(a);
- return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
- __node_2_pe(b)) < 0;
+ return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
+ e->group_index, __node_2_pe(b)) < 0;
}
struct __group_key {
int cpu;
+ struct pmu *pmu;
struct cgroup *cgroup;
};
@@ -1657,14 +1633,25 @@ static inline int __group_cmp(const void *key, const struct rb_node *node)
const struct __group_key *a = key;
const struct perf_event *b = __node_2_pe(node);
- /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
- return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
+ /* partial/subtree match: @cpu, @pmu, @cgroup; ignore: @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, a->cgroup, b->group_index, b);
+}
+
+static inline int
+__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
+{
+ const struct __group_key *a = key;
+ const struct perf_event *b = __node_2_pe(node);
+
+ /* partial/subtree match: @cpu, @pmu, ignore: @cgroup, @group_index */
+ return perf_event_groups_cmp(a->cpu, a->pmu, event_cgroup(b),
+ b->group_index, b);
}
/*
- * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
- * key (see perf_event_groups_less). This places it last inside the CPU
- * subtree.
+ * Insert @event into @groups' tree; using
+ * {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}
+ * as key. This places it last inside the {cpu,pmu,cgroup} subtree.
*/
static void
perf_event_groups_insert(struct perf_event_groups *groups,
@@ -1714,14 +1701,15 @@ del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
}
/*
- * Get the leftmost event in the cpu/cgroup subtree.
+ * Get the leftmost event in the {cpu,pmu,cgroup} subtree.
*/
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
- struct cgroup *cgrp)
+ struct pmu *pmu, struct cgroup *cgrp)
{
struct __group_key key = {
.cpu = cpu,
+ .pmu = pmu,
.cgroup = cgrp,
};
struct rb_node *node;
@@ -1733,14 +1721,12 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu,
return NULL;
}
-/*
- * Like rb_entry_next_safe() for the @cpu subtree.
- */
static struct perf_event *
-perf_event_groups_next(struct perf_event *event)
+perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
struct __group_key key = {
.cpu = event->cpu,
+ .pmu = pmu,
.cgroup = event_cgroup(event),
};
struct rb_node *next;
@@ -1752,6 +1738,10 @@ perf_event_groups_next(struct perf_event *event)
return NULL;
}
+#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) \
+ for (event = perf_event_groups_first(groups, cpu, pmu, NULL); \
+ event; event = perf_event_groups_next(event, pmu))
+
/*
* Iterate through the whole groups tree.
*/
@@ -1796,6 +1786,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
perf_cgroup_event_enable(event, ctx);
ctx->generation++;
+ event->pmu_ctx->nr_events++;
}
/*
@@ -1941,7 +1932,8 @@ static void perf_group_attach(struct perf_event *event)
lockdep_assert_held(&event->ctx->lock);
/*
- * We can have double attach due to group movement in perf_event_open.
+ * We can have double attach due to group movement (move_group) in
+ * perf_event_open().
*/
if (event->attach_state & PERF_ATTACH_GROUP)
return;
@@ -2006,6 +1998,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
}
ctx->generation++;
+ event->pmu_ctx->nr_events--;
}
static int
@@ -2022,13 +2015,11 @@ perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx);
static void perf_put_aux_event(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *iter;
/*
@@ -2057,7 +2048,7 @@ static void perf_put_aux_event(struct perf_event *event)
* state so that we don't try to schedule it again. Note
* that perf_event_enable() will clear the ERROR status.
*/
- event_sched_out(iter, cpuctx, ctx);
+ event_sched_out(iter, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -2108,8 +2099,8 @@ static int perf_get_aux_event(struct perf_event *event,
static inline struct list_head *get_event_list(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
+ return event->attr.pinned ? &event->pmu_ctx->pinned_active :
+ &event->pmu_ctx->flexible_active;
}
/*
@@ -2120,10 +2111,7 @@ static inline struct list_head *get_event_list(struct perf_event *event)
*/
static inline void perf_remove_sibling_event(struct perf_event *event)
{
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, event->ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
@@ -2212,53 +2200,22 @@ static bool is_orphaned_event(struct perf_event *event)
return event->state == PERF_EVENT_STATE_DEAD;
}
-static inline int __pmu_filter_match(struct perf_event *event)
-{
- struct pmu *pmu = event->pmu;
- return pmu->filter_match ? pmu->filter_match(event) : 1;
-}
-
-/*
- * Check whether we should attempt to schedule an event group based on
- * PMU-specific filtering. An event group can consist of HW and SW events,
- * potentially with a SW leader, so we must check all the filters, to
- * determine whether a group is schedulable:
- */
-static inline int pmu_filter_match(struct perf_event *event)
-{
- struct perf_event *sibling;
- unsigned long flags;
- int ret = 1;
-
- if (!__pmu_filter_match(event))
- return 0;
-
- local_irq_save(flags);
- for_each_sibling_event(sibling, event) {
- if (!__pmu_filter_match(sibling)) {
- ret = 0;
- break;
- }
- }
- local_irq_restore(flags);
-
- return ret;
-}
-
static inline int
event_filter_match(struct perf_event *event)
{
return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
- perf_cgroup_match(event) && pmu_filter_match(event);
+ perf_cgroup_match(event);
}
static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
+ // XXX cpc serialization, probably per-cpu IRQ disabled
+
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
@@ -2291,6 +2248,7 @@ event_sched_out(struct perf_event *event,
!event->pending_work) {
event->pending_work = 1;
dec = false;
+ WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
task_work_add(current, &event->pending_task, TWA_RESUME);
}
if (dec)
@@ -2300,42 +2258,37 @@ event_sched_out(struct perf_event *event,
perf_event_set_state(event, state);
if (!is_software_event(event))
- cpuctx->active_oncpu--;
- if (!--ctx->nr_active)
- perf_event_ctx_deactivate(ctx);
+ cpc->active_oncpu--;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq--;
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
+ if (event->attr.exclusive || !cpc->active_oncpu)
+ cpc->exclusive = 0;
perf_pmu_enable(event->pmu);
}
static void
-group_sched_out(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event;
if (group_event->state != PERF_EVENT_STATE_ACTIVE)
return;
- perf_pmu_disable(ctx->pmu);
+ perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
/*
* Schedule out siblings (if any):
*/
for_each_sibling_event(event, group_event)
- event_sched_out(event, cpuctx, ctx);
-
- perf_pmu_enable(ctx->pmu);
+ event_sched_out(event, ctx);
}
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
+#define DETACH_DEAD 0x04UL
/*
* Cross CPU call to remove a performance event
@@ -2349,6 +2302,7 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_context *ctx,
void *info)
{
+ struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
if (ctx->is_active & EVENT_TIME) {
@@ -2356,19 +2310,38 @@ __perf_remove_from_context(struct perf_event *event,
update_cgrp_time_from_cpuctx(cpuctx, false);
}
- event_sched_out(event, cpuctx, ctx);
+ /*
+ * Ensure event_sched_out() switches to OFF, at the very least
+ * this avoids raising perf_pending_task() at this time.
+ */
+ if (flags & DETACH_DEAD)
+ event->pending_disable = 1;
+ event_sched_out(event, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
if (flags & DETACH_CHILD)
perf_child_detach(event);
list_del_event(event, ctx);
+ if (flags & DETACH_DEAD)
+ event->state = PERF_EVENT_STATE_DEAD;
+
+ if (!pmu_ctx->nr_events) {
+ pmu_ctx->rotate_necessary = 0;
+
+ if (ctx->task && ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+ }
if (!ctx->nr_events && ctx->is_active) {
if (ctx == &cpuctx->ctx)
update_cgrp_time_from_cpuctx(cpuctx, true);
ctx->is_active = 0;
- ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
@@ -2398,12 +2371,8 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
* event_function_call() user.
*/
raw_spin_lock_irq(&ctx->lock);
- /*
- * Cgroup events are per-cpu events, and must IPI because of
- * cgrp_cpuctx_list.
- */
- if (!ctx->is_active && !is_cgroup_event(event)) {
- __perf_remove_from_context(event, __get_cpu_context(ctx),
+ if (!ctx->is_active) {
+ __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
return;
@@ -2429,13 +2398,17 @@ static void __perf_event_disable(struct perf_event *event,
update_cgrp_time_from_event(event);
}
+ perf_pmu_disable(event->pmu_ctx->pmu);
+
if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
+ group_sched_out(event, ctx);
else
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_OFF);
perf_cgroup_event_disable(event, ctx);
+
+ perf_pmu_enable(event->pmu_ctx->pmu);
}
/*
@@ -2497,10 +2470,10 @@ static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);
static int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2541,14 +2514,12 @@ event_sched_in(struct perf_event *event,
}
if (!is_software_event(event))
- cpuctx->active_oncpu++;
- if (!ctx->nr_active++)
- perf_event_ctx_activate(ctx);
+ cpc->active_oncpu++;
if (event->attr.freq && event->attr.sample_freq)
ctx->nr_freq++;
if (event->attr.exclusive)
- cpuctx->exclusive = 1;
+ cpc->exclusive = 1;
out:
perf_pmu_enable(event->pmu);
@@ -2557,26 +2528,24 @@ out:
}
static int
-group_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
+group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = ctx->pmu;
+ struct pmu *pmu = group_event->pmu_ctx->pmu;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx))
+ if (event_sched_in(group_event, ctx))
goto error;
/*
* Schedule in siblings as one group (if any):
*/
for_each_sibling_event(event, group_event) {
- if (event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, ctx)) {
partial_group = event;
goto group_error;
}
@@ -2595,9 +2564,9 @@ group_error:
if (event == partial_group)
break;
- event_sched_out(event, cpuctx, ctx);
+ event_sched_out(event, ctx);
}
- event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, ctx);
error:
pmu->cancel_txn(pmu);
@@ -2607,10 +2576,11 @@ error:
/*
* Work out whether we can put this event group on the CPU now.
*/
-static int group_can_go_on(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- int can_add_hw)
+static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
+ struct perf_event_pmu_context *epc = event->pmu_ctx;
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+
/*
* Groups consisting entirely of software events can always go on.
*/
@@ -2620,7 +2590,7 @@ static int group_can_go_on(struct perf_event *event,
* If an exclusive group is already on, no other hardware
* events can go on.
*/
- if (cpuctx->exclusive)
+ if (cpc->exclusive)
return 0;
/*
* If this group is exclusive and there are already
@@ -2642,36 +2612,29 @@ static void add_event_to_ctx(struct perf_event *event,
perf_group_attach(event);
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
-
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- enum event_type_t event_type)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+ enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+
if (!cpuctx->task_ctx)
return;
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
- ctx_sched_out(ctx, cpuctx, event_type);
+ ctx_sched_out(ctx, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, EVENT_PINNED);
+ ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, EVENT_FLEXIBLE);
}
/*
@@ -2689,11 +2652,15 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
+/*
+ * XXX: ctx_resched() reschedule entire perf_event_context while adding new
+ * event to the context or enabling existing event in the context. We can
+ * probably optimize it by rescheduling only affected pmu_ctx.
+ */
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
enum event_type_t event_type)
{
- enum event_type_t ctx_event_type;
bool cpu_event = !!(event_type & EVENT_CPU);
/*
@@ -2703,11 +2670,13 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
if (event_type & EVENT_PINNED)
event_type |= EVENT_FLEXIBLE;
- ctx_event_type = event_type & EVENT_ALL;
+ event_type &= EVENT_ALL;
- perf_pmu_disable(cpuctx->ctx.pmu);
- if (task_ctx)
- task_ctx_sched_out(cpuctx, task_ctx, event_type);
+ perf_ctx_disable(&cpuctx->ctx);
+ if (task_ctx) {
+ perf_ctx_disable(task_ctx);
+ task_ctx_sched_out(task_ctx, event_type);
+ }
/*
* Decide which cpu ctx groups to schedule out based on the types
@@ -2717,17 +2686,20 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
- cpu_ctx_sched_out(cpuctx, ctx_event_type);
- else if (ctx_event_type & EVENT_PINNED)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_out(&cpuctx->ctx, event_type);
+ else if (event_type & EVENT_PINNED)
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, task_ctx);
- perf_pmu_enable(cpuctx->ctx.pmu);
+
+ perf_ctx_enable(&cpuctx->ctx);
+ if (task_ctx)
+ perf_ctx_enable(task_ctx);
}
void perf_pmu_resched(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
@@ -2745,7 +2717,7 @@ static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
bool reprogram = true;
int ret = 0;
@@ -2787,7 +2759,7 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
} else {
@@ -2820,7 +2792,7 @@ perf_install_in_context(struct perf_event_context *ctx,
WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
if (event->cpu != -1)
- event->cpu = cpu;
+ WARN_ON_ONCE(event->cpu != cpu);
/*
* Ensures that if we can observe event->ctx, both the event and ctx
@@ -2832,8 +2804,6 @@ perf_install_in_context(struct perf_event_context *ctx,
* perf_event_attr::disabled events will not run and can be initialized
* without IPI. Except when this is the first event for the context, in
* that case we need the magic of the IPI to set ctx->is_active.
- * Similarly, cgroup events for the context also needs the IPI to
- * manipulate the cgrp_cpuctx_list.
*
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
@@ -2935,7 +2905,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (ctx->is_active)
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
@@ -2944,7 +2914,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -2953,7 +2923,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
return;
}
@@ -3222,11 +3192,52 @@ out:
return err;
}
-static void ctx_sched_out(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
+ enum event_type_t event_type)
{
+ struct perf_event_context *ctx = pmu_ctx->ctx;
struct perf_event *event, *tmp;
+ struct pmu *pmu = pmu_ctx->pmu;
+
+ if (ctx->task && !ctx->is_active) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = NULL;
+ }
+
+ if (!event_type)
+ return;
+
+ perf_pmu_disable(pmu);
+ if (event_type & EVENT_PINNED) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->pinned_active,
+ active_list)
+ group_sched_out(event, ctx);
+ }
+
+ if (event_type & EVENT_FLEXIBLE) {
+ list_for_each_entry_safe(event, tmp,
+ &pmu_ctx->flexible_active,
+ active_list)
+ group_sched_out(event, ctx);
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, is will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ pmu_ctx->rotate_necessary = 0;
+ }
+ perf_pmu_enable(pmu);
+}
+
+static void
+ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
+{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
lockdep_assert_held(&ctx->lock);
@@ -3274,27 +3285,8 @@ static void ctx_sched_out(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (!ctx->nr_active || !(is_active & EVENT_ALL))
- return;
-
- perf_pmu_disable(ctx->pmu);
- if (is_active & EVENT_PINNED) {
- list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
- group_sched_out(event, cpuctx, ctx);
- }
-
- if (is_active & EVENT_FLEXIBLE) {
- list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
- group_sched_out(event, cpuctx, ctx);
-
- /*
- * Since we cleared EVENT_FLEXIBLE, also clear
- * rotate_necessary, is will be reset by
- * ctx_flexible_sched_in() when needed.
- */
- ctx->rotate_necessary = 0;
- }
- perf_pmu_enable(ctx->pmu);
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry)
+ __pmu_ctx_sched_out(pmu_ctx, is_active);
}
/*
@@ -3399,26 +3391,68 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \
+ for (pos1 = list_first_entry(head1, typeof(*pos1), member), \
+ pos2 = list_first_entry(head2, typeof(*pos2), member); \
+ !list_entry_is_head(pos1, head1, member) && \
+ !list_entry_is_head(pos2, head2, member); \
+ pos1 = list_next_entry(pos1, member), \
+ pos2 = list_next_entry(pos2, member))
+
+static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
+ struct perf_event_context *next_ctx)
{
- struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+ struct perf_event_pmu_context *prev_epc, *next_epc;
+
+ if (!prev_ctx->nr_task_data)
+ return;
+
+ double_list_for_each_entry(prev_epc, next_epc,
+ &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
+ pmu_ctx_entry) {
+
+ if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
+ continue;
+
+ /*
+ * PMU specific parts of task perf context can require
+ * additional synchronization. As an example of such
+ * synchronization see implementation details of Intel
+ * LBR call stack data profiling;
+ */
+ if (prev_epc->pmu->swap_task_ctx)
+ prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
+ else
+ swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
+ }
+}
+
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+{
+ struct perf_event_pmu_context *pmu_ctx;
+ struct perf_cpu_pmu_context *cpc;
+
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+
+ if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
+ pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+ }
+}
+
+static void
+perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
+{
+ struct perf_event_context *ctx = task->perf_event_ctxp;
struct perf_event_context *next_ctx;
struct perf_event_context *parent, *next_parent;
- struct perf_cpu_context *cpuctx;
int do_switch = 1;
- struct pmu *pmu;
if (likely(!ctx))
return;
- pmu = ctx->pmu;
- cpuctx = __get_cpu_context(ctx);
- if (!cpuctx->task_ctx)
- return;
-
rcu_read_lock();
- next_ctx = next->perf_event_ctxp[ctxn];
+ next_ctx = rcu_dereference(next->perf_event_ctxp);
if (!next_ctx)
goto unlock;
@@ -3443,7 +3477,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
@@ -3460,21 +3494,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
+ perf_ctx_sched_task_cb(ctx, false);
+ perf_event_swap_task_ctx_data(ctx, next_ctx);
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (pmu->swap_task_ctx)
- pmu->swap_task_ctx(ctx, next_ctx);
- else
- swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
-
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
/*
* RCU_INIT_POINTER here is safe because we've not
@@ -3483,8 +3506,8 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
* since those values are always verified under
* ctx->lock which we're now holding.
*/
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
- RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+ RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
do_switch = 0;
@@ -3498,38 +3521,40 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
inside_switch:
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(ctx, false);
- task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
+ perf_ctx_sched_task_cb(ctx, false);
+ task_ctx_sched_out(ctx, EVENT_ALL);
- perf_pmu_enable(pmu);
+ perf_ctx_enable(ctx);
raw_spin_unlock(&ctx->lock);
}
}
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
this_cpu_dec(perf_sched_cb_usages);
+ barrier();
- if (!--cpuctx->sched_cb_usage)
- list_del(&cpuctx->sched_cb_entry);
+ if (!--cpc->sched_cb_usage)
+ list_del(&cpc->sched_cb_entry);
}
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
- if (!cpuctx->sched_cb_usage++)
- list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ if (!cpc->sched_cb_usage++)
+ list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+ barrier();
this_cpu_inc(perf_sched_cb_usages);
}
@@ -3541,19 +3566,21 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
- pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
+ pmu = cpc->epc.pmu;
+ /* software PMUs will not have sched_task */
if (WARN_ON_ONCE(!pmu->sched_task))
return;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
- pmu->sched_task(cpuctx->task_ctx, sched_in);
+ pmu->sched_task(cpc->task_epc, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3563,26 +3590,20 @@ static void perf_pmu_sched_task(struct task_struct *prev,
struct task_struct *next,
bool sched_in)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_cpu_pmu_context *cpc;
- if (prev == next)
+ /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
+ if (prev == next || cpuctx->task_ctx)
return;
- list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
- /* will be handled in perf_event_context_sched_in/out */
- if (cpuctx->task_ctx)
- continue;
-
- __perf_pmu_sched_task(cpuctx, sched_in);
- }
+ list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
+ __perf_pmu_sched_task(cpc, sched_in);
}
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
-#define for_each_task_context_nr(ctxn) \
- for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
-
/*
* Called from scheduler to remove the events of the current task,
* with interrupts disabled.
@@ -3597,16 +3618,13 @@ static void perf_event_switch(struct task_struct *task,
void __perf_event_task_sched_out(struct task_struct *task,
struct task_struct *next)
{
- int ctxn;
-
if (__this_cpu_read(perf_sched_cb_usages))
perf_pmu_sched_task(task, next, false);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
- for_each_task_context_nr(ctxn)
- perf_event_context_sched_out(task, ctxn, next);
+ perf_event_context_sched_out(task, next);
/*
* if cgroup events exist on this CPU, then we need
@@ -3617,15 +3635,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
perf_cgroup_switch(next);
}
-/*
- * Called with IRQs disabled
- */
-static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
-{
- ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
-}
-
static bool perf_less_group_idx(const void *l, const void *r)
{
const struct perf_event *le = *(const struct perf_event **)l;
@@ -3657,21 +3666,39 @@ static void __heap_add(struct min_heap *heap, struct perf_event *event)
}
}
-static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
+static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
+{
+ struct perf_cpu_pmu_context *cpc;
+
+ if (!pmu_ctx->ctx->task)
+ return;
+
+ cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
+ cpc->task_epc = pmu_ctx;
+}
+
+static noinline int visit_groups_merge(struct perf_event_context *ctx,
struct perf_event_groups *groups, int cpu,
+ struct pmu *pmu,
int (*func)(struct perf_event *, void *),
void *data)
{
#ifdef CONFIG_CGROUP_PERF
struct cgroup_subsys_state *css = NULL;
#endif
+ struct perf_cpu_context *cpuctx = NULL;
/* Space for per CPU and/or any CPU event iterators. */
struct perf_event *itrs[2];
struct min_heap event_heap;
struct perf_event **evt;
int ret;
- if (cpuctx) {
+ if (pmu->filter && pmu->filter(pmu, cpu))
+ return 0;
+
+ if (!ctx->task) {
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
event_heap = (struct min_heap){
.data = cpuctx->heap,
.nr = 0,
@@ -3691,17 +3718,22 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
.size = ARRAY_SIZE(itrs),
};
/* Events not within a CPU context may be on any CPU. */
- __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
}
evt = event_heap.data;
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));
#ifdef CONFIG_CGROUP_PERF
for (; css; css = css->parent)
- __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
+ __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif
+ if (event_heap.nr) {
+ __link_epc((*evt)->pmu_ctx);
+ perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
+ }
+
min_heapify_all(&event_heap, &perf_min_heap);
while (event_heap.nr) {
@@ -3709,7 +3741,7 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
if (ret)
return ret;
- *evt = perf_event_groups_next(*evt);
+ *evt = perf_event_groups_next(*evt, pmu);
if (*evt)
min_heapify(&event_heap, 0, &perf_min_heap);
else
@@ -3751,7 +3783,6 @@ static inline void group_update_userpage(struct perf_event *group_event)
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int *can_add_hw = data;
if (event->state <= PERF_EVENT_STATE_OFF)
@@ -3760,8 +3791,8 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (!event_filter_match(event))
return 0;
- if (group_can_go_on(event, cpuctx, *can_add_hw)) {
- if (!group_sched_in(event, cpuctx, ctx))
+ if (group_can_go_on(event, *can_add_hw)) {
+ if (!group_sched_in(event, ctx))
list_add_tail(&event->active_list, get_event_list(event));
}
@@ -3771,8 +3802,11 @@ static int merge_sched_in(struct perf_event *event, void *data)
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
} else {
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
+ struct perf_cpu_pmu_context *cpc;
+
+ event->pmu_ctx->rotate_necessary = 1;
+ cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
+ perf_mux_hrtimer_restart(cpc);
group_update_userpage(event);
}
}
@@ -3780,39 +3814,53 @@ static int merge_sched_in(struct perf_event *event, void *data)
return 0;
}
-static void
-ctx_pinned_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_pinned_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
-
- visit_groups_merge(cpuctx, &ctx->pinned_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->pinned_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
}
-static void
-ctx_flexible_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx)
+static void ctx_flexible_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
{
+ struct perf_event_pmu_context *pmu_ctx;
int can_add_hw = 1;
- if (ctx != &cpuctx->ctx)
- cpuctx = NULL;
+ if (pmu) {
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu,
+ merge_sched_in, &can_add_hw);
+ } else {
+ list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ can_add_hw = 1;
+ visit_groups_merge(ctx, &ctx->flexible_groups,
+ smp_processor_id(), pmu_ctx->pmu,
+ merge_sched_in, &can_add_hw);
+ }
+ }
+}
- visit_groups_merge(cpuctx, &ctx->flexible_groups,
- smp_processor_id(),
- merge_sched_in, &can_add_hw);
+static void __pmu_ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu)
+{
+ ctx_flexible_sched_in(ctx, pmu);
}
static void
-ctx_sched_in(struct perf_event_context *ctx,
- struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
int is_active = ctx->is_active;
lockdep_assert_held(&ctx->lock);
@@ -3846,39 +3894,32 @@ ctx_sched_in(struct perf_event_context *ctx,
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
- ctx_pinned_sched_in(ctx, cpuctx);
+ ctx_pinned_sched_in(ctx, NULL);
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
- ctx_flexible_sched_in(ctx, cpuctx);
+ ctx_flexible_sched_in(ctx, NULL);
}
-static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+static void perf_event_context_sched_in(struct task_struct *task)
{
- struct perf_event_context *ctx = &cpuctx->ctx;
-
- ctx_sched_in(ctx, cpuctx, event_type);
-}
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
- struct task_struct *task)
-{
- struct perf_cpu_context *cpuctx;
- struct pmu *pmu;
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp);
+ if (!ctx)
+ goto rcu_unlock;
- cpuctx = __get_cpu_context(ctx);
+ if (cpuctx->task_ctx == ctx) {
+ perf_ctx_lock(cpuctx, ctx);
+ perf_ctx_disable(ctx);
- /*
- * HACK: for HETEROGENEOUS the task context might have switched to a
- * different PMU, force (re)set the context,
- */
- pmu = ctx->pmu = cpuctx->ctx.pmu;
+ perf_ctx_sched_task_cb(ctx, true);
- if (cpuctx->task_ctx == ctx) {
- if (cpuctx->sched_cb_usage)
- __perf_pmu_sched_task(cpuctx, true);
- return;
+ perf_ctx_enable(ctx);
+ perf_ctx_unlock(cpuctx, ctx);
+ goto rcu_unlock;
}
perf_ctx_lock(cpuctx, ctx);
@@ -3889,7 +3930,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
if (!ctx->nr_events)
goto unlock;
- perf_pmu_disable(pmu);
+ perf_ctx_disable(ctx);
/*
* We want to keep the following priority order:
* cpu pinned (that don't need to move), task pinned,
@@ -3898,17 +3939,24 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* However, if task's ctx is not carrying any pinned
* events, no need to flip the cpuctx's events around.
*/
- if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
+ perf_ctx_disable(&cpuctx->ctx);
+ ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
+ }
+
perf_event_sched_in(cpuctx, ctx);
- if (cpuctx->sched_cb_usage && pmu->sched_task)
- pmu->sched_task(cpuctx->task_ctx, true);
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
- perf_pmu_enable(pmu);
+ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
+ perf_ctx_enable(&cpuctx->ctx);
+
+ perf_ctx_enable(ctx);
unlock:
perf_ctx_unlock(cpuctx, ctx);
+rcu_unlock:
+ rcu_read_unlock();
}
/*
@@ -3925,16 +3973,7 @@ unlock:
void __perf_event_task_sched_in(struct task_struct *prev,
struct task_struct *task)
{
- struct perf_event_context *ctx;
- int ctxn;
-
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (likely(!ctx))
- continue;
-
- perf_event_context_sched_in(ctx, task);
- }
+ perf_event_context_sched_in(task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -4053,8 +4092,8 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
* events. At the same time, make sure, having freq events does not change
* the rate of unthrottling as that would introduce bias.
*/
-static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
- int needs_unthr)
+static void
+perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
struct perf_event *event;
struct hw_perf_event *hwc;
@@ -4066,16 +4105,16 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
* - context have events in frequency mode (needs freq adjust)
* - there are events to unthrottle on this cpu
*/
- if (!(ctx->nr_freq || needs_unthr))
+ if (!(ctx->nr_freq || unthrottle))
return;
raw_spin_lock(&ctx->lock);
- perf_pmu_disable(ctx->pmu);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
+ // XXX use visit thingy to avoid the -1,cpu match
if (!event_filter_match(event))
continue;
@@ -4116,7 +4155,6 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
perf_pmu_enable(event->pmu);
}
- perf_pmu_enable(ctx->pmu);
raw_spin_unlock(&ctx->lock);
}
@@ -4138,72 +4176,109 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_event_to_rotate(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
struct perf_event *event;
+ struct rb_node *node;
+ struct rb_root *tree;
+ struct __group_key key = {
+ .pmu = pmu_ctx->pmu,
+ };
/* pick the first active flexible event */
- event = list_first_entry_or_null(&ctx->flexible_active,
+ event = list_first_entry_or_null(&pmu_ctx->flexible_active,
struct perf_event, active_list);
+ if (event)
+ goto out;
/* if no active flexible event, pick the first event */
- if (!event) {
- event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
- typeof(*event), group_node);
+ tree = &pmu_ctx->ctx->flexible_groups.tree;
+
+ if (!pmu_ctx->ctx->task) {
+ key.cpu = smp_processor_id();
+
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+ goto out;
}
+ key.cpu = -1;
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node) {
+ event = __node_2_pe(node);
+ goto out;
+ }
+
+ key.cpu = smp_processor_id();
+ node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
+ if (node)
+ event = __node_2_pe(node);
+
+out:
/*
* Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
* finds there are unschedulable events, it will set it again.
*/
- ctx->rotate_necessary = 0;
+ pmu_ctx->rotate_necessary = 0;
return event;
}
-static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
+static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
struct perf_event *cpu_event = NULL, *task_event = NULL;
- struct perf_event_context *task_ctx = NULL;
int cpu_rotate, task_rotate;
+ struct pmu *pmu;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- cpu_rotate = cpuctx->ctx.rotate_necessary;
- task_ctx = cpuctx->task_ctx;
- task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
+ cpu_epc = &cpc->epc;
+ pmu = cpu_epc->pmu;
+ task_epc = cpc->task_epc;
+
+ cpu_rotate = cpu_epc->rotate_necessary;
+ task_rotate = task_epc ? task_epc->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
+ perf_pmu_disable(pmu);
if (task_rotate)
- task_event = ctx_event_to_rotate(task_ctx);
+ task_event = ctx_event_to_rotate(task_epc);
if (cpu_rotate)
- cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(cpu_epc);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (task_ctx && cpu_event))
- ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
- if (cpu_event)
- cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_epc && cpu_event)) {
+ update_context_time(task_epc->ctx);
+ __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
+ }
- if (task_event)
- rotate_ctx(task_ctx, task_event);
- if (cpu_event)
+ if (cpu_event) {
+ update_context_time(&cpuctx->ctx);
+ __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
+ __pmu_ctx_sched_in(&cpuctx->ctx, pmu);
+ }
- perf_event_sched_in(cpuctx, task_ctx);
+ if (task_event)
+ rotate_ctx(task_epc->ctx, task_event);
+
+ if (task_event || (task_epc && cpu_event))
+ __pmu_ctx_sched_in(task_epc->ctx, pmu);
- perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
return true;
@@ -4211,8 +4286,8 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
void perf_event_task_tick(void)
{
- struct list_head *head = this_cpu_ptr(&active_ctx_list);
- struct perf_event_context *ctx, *tmp;
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
+ struct perf_event_context *ctx;
int throttled;
lockdep_assert_irqs_disabled();
@@ -4221,8 +4296,13 @@ void perf_event_task_tick(void)
throttled = __this_cpu_xchg(perf_throttled_count, 0);
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
- list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
- perf_adjust_freq_unthr_context(ctx, throttled);
+ perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);
+
+ rcu_read_lock();
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, !!throttled);
+ rcu_read_unlock();
}
static int event_enable_on_exec(struct perf_event *event,
@@ -4244,9 +4324,9 @@ static int event_enable_on_exec(struct perf_event *event,
* Enable all of a task's events that have been marked enable-on-exec.
* This expects task == current.
*/
-static void perf_event_enable_on_exec(int ctxn)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
enum event_type_t event_type = 0;
struct perf_cpu_context *cpuctx;
struct perf_event *event;
@@ -4254,13 +4334,16 @@ static void perf_event_enable_on_exec(int ctxn)
int enabled = 0;
local_irq_save(flags);
- ctx = current->perf_event_ctxp[ctxn];
- if (!ctx || !ctx->nr_events)
+ if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
goto out;
- cpuctx = __get_cpu_context(ctx);
+ if (!ctx->nr_events)
+ goto out;
+
+ cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
+
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
event_type |= get_event_type(event);
@@ -4273,7 +4356,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_in(ctx, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4292,17 +4375,13 @@ static void perf_event_exit_event(struct perf_event *event,
* Removes all events from the current task that have been marked
* remove-on-exec, and feeds their values back to parent events.
*/
-static void perf_event_remove_on_exec(int ctxn)
+static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
- struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_event_context *clone_ctx = NULL;
struct perf_event *event, *next;
unsigned long flags;
bool modified = false;
- ctx = perf_pin_task_context(current, ctxn);
- if (!ctx)
- return;
-
mutex_lock(&ctx->mutex);
if (WARN_ON_ONCE(ctx->task != current))
@@ -4323,13 +4402,11 @@ static void perf_event_remove_on_exec(int ctxn)
raw_spin_lock_irqsave(&ctx->lock, flags);
if (modified)
clone_ctx = unclone_ctx(ctx);
- --ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
unlock:
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
if (clone_ctx)
put_ctx(clone_ctx);
}
@@ -4365,7 +4442,7 @@ static void __perf_event_read(void *info)
struct perf_read_data *data = info;
struct perf_event *sub, *event = data->event;
struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu = event->pmu;
/*
@@ -4591,17 +4668,25 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
{
raw_spin_lock_init(&ctx->lock);
mutex_init(&ctx->mutex);
- INIT_LIST_HEAD(&ctx->active_ctx_list);
+ INIT_LIST_HEAD(&ctx->pmu_ctx_list);
perf_event_groups_init(&ctx->pinned_groups);
perf_event_groups_init(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
- INIT_LIST_HEAD(&ctx->pinned_active);
- INIT_LIST_HEAD(&ctx->flexible_active);
refcount_set(&ctx->refcount, 1);
}
+static void
+__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
+{
+ epc->pmu = pmu;
+ INIT_LIST_HEAD(&epc->pmu_ctx_entry);
+ INIT_LIST_HEAD(&epc->pinned_active);
+ INIT_LIST_HEAD(&epc->flexible_active);
+ atomic_set(&epc->refcount, 1);
+}
+
static struct perf_event_context *
-alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+alloc_perf_context(struct task_struct *task)
{
struct perf_event_context *ctx;
@@ -4612,7 +4697,6 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
__perf_event_init_context(ctx);
if (task)
ctx->task = get_task_struct(task);
- ctx->pmu = pmu;
return ctx;
}
@@ -4641,15 +4725,12 @@ find_lively_task_by_vpid(pid_t vpid)
* Returns a matching context with refcount and pincount.
*/
static struct perf_event_context *
-find_get_context(struct pmu *pmu, struct task_struct *task,
- struct perf_event *event)
+find_get_context(struct task_struct *task, struct perf_event *event)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
struct perf_cpu_context *cpuctx;
- void *task_ctx_data = NULL;
unsigned long flags;
- int ctxn, err;
- int cpu = event->cpu;
+ int err;
if (!task) {
/* Must be root to operate on a CPU event: */
@@ -4657,7 +4738,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
if (err)
return ERR_PTR(err);
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
raw_spin_lock_irqsave(&ctx->lock, flags);
@@ -4668,43 +4749,22 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
}
err = -EINVAL;
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto errout;
-
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = alloc_task_ctx_data(pmu);
- if (!task_ctx_data) {
- err = -ENOMEM;
- goto errout;
- }
- }
-
retry:
- ctx = perf_lock_task_context(task, ctxn, &flags);
+ ctx = perf_lock_task_context(task, &flags);
if (ctx) {
clone_ctx = unclone_ctx(ctx);
++ctx->pin_count;
- if (task_ctx_data && !ctx->task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
raw_spin_unlock_irqrestore(&ctx->lock, flags);
if (clone_ctx)
put_ctx(clone_ctx);
} else {
- ctx = alloc_perf_context(pmu, task);
+ ctx = alloc_perf_context(task);
err = -ENOMEM;
if (!ctx)
goto errout;
- if (task_ctx_data) {
- ctx->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- }
-
err = 0;
mutex_lock(&task->perf_event_mutex);
/*
@@ -4713,12 +4773,12 @@ retry:
*/
if (task->flags & PF_EXITING)
err = -ESRCH;
- else if (task->perf_event_ctxp[ctxn])
+ else if (task->perf_event_ctxp)
err = -EAGAIN;
else {
get_ctx(ctx);
++ctx->pin_count;
- rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ rcu_assign_pointer(task->perf_event_ctxp, ctx);
}
mutex_unlock(&task->perf_event_mutex);
@@ -4731,21 +4791,146 @@ retry:
}
}
- free_task_ctx_data(pmu, task_ctx_data);
return ctx;
errout:
- free_task_ctx_data(pmu, task_ctx_data);
return ERR_PTR(err);
}
+static struct perf_event_pmu_context *
+find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
+ struct perf_event *event)
+{
+ struct perf_event_pmu_context *new = NULL, *epc;
+ void *task_ctx_data = NULL;
+
+ if (!ctx->task) {
+ struct perf_cpu_pmu_context *cpc;
+
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ epc = &cpc->epc;
+
+ if (!epc->ctx) {
+ atomic_set(&epc->refcount, 1);
+ epc->embedded = 1;
+ raw_spin_lock_irq(&ctx->lock);
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+ raw_spin_unlock_irq(&ctx->lock);
+ } else {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ }
+
+ return epc;
+ }
+
+ new = kzalloc(sizeof(*epc), GFP_KERNEL);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ task_ctx_data = alloc_task_ctx_data(pmu);
+ if (!task_ctx_data) {
+ kfree(new);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+
+ __perf_init_event_pmu_context(new, pmu);
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because perf_event_init_task() doesn't actually hold the
+ * child_ctx->mutex.
+ */
+
+ raw_spin_lock_irq(&ctx->lock);
+ list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
+ if (epc->pmu == pmu) {
+ WARN_ON_ONCE(epc->ctx != ctx);
+ atomic_inc(&epc->refcount);
+ goto found_epc;
+ }
+ }
+
+ epc = new;
+ new = NULL;
+
+ list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
+ epc->ctx = ctx;
+
+found_epc:
+ if (task_ctx_data && !epc->task_ctx_data) {
+ epc->task_ctx_data = task_ctx_data;
+ task_ctx_data = NULL;
+ ctx->nr_task_data++;
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+
+ free_task_ctx_data(pmu, task_ctx_data);
+ kfree(new);
+
+ return epc;
+}
+
+static void get_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
+}
+
+static void free_epc_rcu(struct rcu_head *head)
+{
+ struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
+
+ kfree(epc->task_ctx_data);
+ kfree(epc);
+}
+
+static void put_pmu_ctx(struct perf_event_pmu_context *epc)
+{
+ unsigned long flags;
+
+ if (!atomic_dec_and_test(&epc->refcount))
+ return;
+
+ if (epc->ctx) {
+ struct perf_event_context *ctx = epc->ctx;
+
+ /*
+ * XXX
+ *
+ * lockdep_assert_held(&ctx->mutex);
+ *
+ * can't because of the call-site in _free_event()/put_event()
+ * which isn't always called under ctx->mutex.
+ */
+
+ WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+ list_del_init(&epc->pmu_ctx_entry);
+ epc->ctx = NULL;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ }
+
+ WARN_ON_ONCE(!list_empty(&epc->pinned_active));
+ WARN_ON_ONCE(!list_empty(&epc->flexible_active));
+
+ if (epc->embedded)
+ return;
+
+ call_rcu(&epc->rcu_head, free_epc_rcu);
+}
+
static void perf_event_free_filter(struct perf_event *event);
static void free_event_rcu(struct rcu_head *head)
{
- struct perf_event *event;
+ struct perf_event *event = container_of(head, typeof(*event), rcu_head);
- event = container_of(head, struct perf_event, rcu_head);
if (event->ns)
put_pid_ns(event->ns);
perf_event_free_filter(event);
@@ -4883,7 +5068,7 @@ static void perf_sched_delayed(struct work_struct *work)
*
* 1) cpu-wide events in the presence of per-task events,
* 2) per-task events in the presence of cpu-wide events,
- * 3) two matching events on the same context.
+ * 3) two matching events on the same perf_event_context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
* _free_event()), the latter -- before the first perf_install_in_context().
@@ -5007,6 +5192,9 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
+ if (event->pmu_ctx)
+ put_pmu_ctx(event->pmu_ctx);
+
/*
* perf_event_free_task() relies on put_ctx() being 'last', in particular
* all task references must be cleaned up.
@@ -5107,8 +5295,8 @@ int perf_event_release_kernel(struct perf_event *event)
LIST_HEAD(free_list);
/*
- * If we got here through err_file: fput(event_file); we will not have
- * attached to a context yet.
+ * If we got here through err_alloc: free_event(event); we will not
+ * have attached to a context yet.
*/
if (!ctx) {
WARN_ON_ONCE(event->attach_state &
@@ -5121,9 +5309,7 @@ int perf_event_release_kernel(struct perf_event *event)
ctx = perf_event_ctx_lock(event);
WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, DETACH_GROUP);
- raw_spin_lock_irq(&ctx->lock);
/*
* Mark this event as STATE_DEAD, there is no external reference to it
* anymore.
@@ -5135,8 +5321,7 @@ int perf_event_release_kernel(struct perf_event *event)
* Thus this guarantees that we will in fact observe and kill _ALL_
* child events.
*/
- event->state = PERF_EVENT_STATE_DEAD;
- raw_spin_unlock_irq(&ctx->lock);
+ perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);
perf_event_ctx_unlock(event, ctx);
@@ -5543,7 +5728,7 @@ static void __perf_event_period(struct perf_event *event,
active = (event->state == PERF_EVENT_STATE_ACTIVE);
if (active) {
- perf_pmu_disable(ctx->pmu);
+ perf_pmu_disable(event->pmu);
/*
* We could be throttled; unthrottle now to avoid the tick
* trying to unthrottle while we already re-started the event.
@@ -5559,7 +5744,7 @@ static void __perf_event_period(struct perf_event *event,
if (active) {
event->pmu->start(event, PERF_EF_RELOAD);
- perf_pmu_enable(ctx->pmu);
+ perf_pmu_enable(event->pmu);
}
}
@@ -6577,6 +6762,8 @@ static void perf_pending_task(struct callback_head *head)
if (rctx >= 0)
perf_swevent_put_recursion_context(rctx);
preempt_enable_notrace();
+
+ put_event(event);
}
#ifdef CONFIG_GUEST_PERF_EVENTS
@@ -7306,7 +7493,7 @@ static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
return pud_leaf_size(pud);
pmdp = pmd_offset_lockless(pudp, pud, addr);
- pmd = READ_ONCE(*pmdp);
+ pmd = pmdp_get_lockless(pmdp);
if (!pmd_present(pmd))
return 0;
@@ -7720,7 +7907,6 @@ perf_iterate_sb(perf_iterate_f output, void *data,
struct perf_event_context *task_ctx)
{
struct perf_event_context *ctx;
- int ctxn;
rcu_read_lock();
preempt_disable();
@@ -7737,11 +7923,9 @@ perf_iterate_sb(perf_iterate_f output, void *data,
perf_iterate_sb_cpu(output, data);
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_iterate_ctx(ctx, output, data, false);
- }
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
+ perf_iterate_ctx(ctx, output, data, false);
done:
preempt_enable();
rcu_read_unlock();
@@ -7783,20 +7967,17 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
void perf_event_exec(void)
{
struct perf_event_context *ctx;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- perf_event_enable_on_exec(ctxn);
- perf_event_remove_on_exec(ctxn);
+ ctx = perf_pin_task_context(current);
+ if (!ctx)
+ return;
- rcu_read_lock();
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
- NULL, true);
- }
- rcu_read_unlock();
- }
+ perf_event_enable_on_exec(ctx);
+ perf_event_remove_on_exec(ctx);
+ perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);
+
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
}
struct remote_output {
@@ -7836,8 +8017,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->ctx->pmu;
- struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct remote_output ro = {
.rb = event->rb,
};
@@ -8626,7 +8806,6 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
struct perf_event_context *ctx;
- int ctxn;
/*
* Data tracing isn't supported yet and as such there is no need
@@ -8636,13 +8815,9 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
return;
rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (!ctx)
- continue;
-
+ ctx = rcu_dereference(current->perf_event_ctxp);
+ if (ctx)
perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
- }
rcu_read_unlock();
}
@@ -9030,7 +9205,7 @@ static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
PERF_RECORD_KSYMBOL_TYPE_BPF,
(u64)(unsigned long)subprog->bpf_func,
subprog->jited_len, unregister,
- prog->aux->ksym.name);
+ subprog->aux->ksym.name);
}
}
}
@@ -9273,6 +9448,19 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
+static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
+{
+ /*
+ * Due to interrupt latency (AKA "skid"), we may enter the
+ * kernel before taking an overflow, even if the PMU is only
+ * counting user events.
+ */
+ if (event->attr.exclude_kernel && !user_mode(regs))
+ return false;
+
+ return true;
+}
+
/*
* Generic event overflow handling, sampling.
*/
@@ -9307,15 +9495,38 @@ static int __perf_event_overflow(struct perf_event *event,
if (event->attr.sigtrap) {
/*
- * Should not be able to return to user space without processing
- * pending_sigtrap (kernel events can overflow multiple times).
+ * The desired behaviour of sigtrap vs invalid samples is a bit
+ * tricky; on the one hand, one should not loose the SIGTRAP if
+ * it is the first event, on the other hand, we should also not
+ * trigger the WARN or override the data address.
*/
- WARN_ON_ONCE(event->pending_sigtrap && event->attr.exclude_kernel);
+ bool valid_sample = sample_is_allowed(event, regs);
+ unsigned int pending_id = 1;
+
+ if (regs)
+ pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
if (!event->pending_sigtrap) {
- event->pending_sigtrap = 1;
+ event->pending_sigtrap = pending_id;
local_inc(&event->ctx->nr_pending);
+ } else if (event->attr.exclude_kernel && valid_sample) {
+ /*
+ * Should not be able to return to user space without
+ * consuming pending_sigtrap; with exceptions:
+ *
+ * 1. Where !exclude_kernel, events can overflow again
+ * in the kernel without returning to user space.
+ *
+ * 2. Events that can overflow again before the IRQ-
+ * work without user space progress (e.g. hrtimer).
+ * To approximate progress (with false negatives),
+ * check 32-bit hash of the current IP.
+ */
+ WARN_ON_ONCE(event->pending_sigtrap != pending_id);
}
- event->pending_addr = data->addr;
+
+ event->pending_addr = 0;
+ if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
+ event->pending_addr = data->addr;
irq_work_queue(&event->pending_irq);
}
@@ -9781,6 +9992,44 @@ static struct pmu perf_swevent = {
#ifdef CONFIG_EVENT_TRACING
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+ perf_trace_destroy(event);
+}
+
+static int perf_tp_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ return -ENOENT;
+
+ /*
+ * no branch sampling for tracepoint events
+ */
+ if (has_branch_stack(event))
+ return -EOPNOTSUPP;
+
+ err = perf_trace_init(event);
+ if (err)
+ return err;
+
+ event->destroy = tp_perf_event_destroy;
+
+ return 0;
+}
+
+static struct pmu perf_tracepoint = {
+ .task_ctx_nr = perf_sw_context,
+
+ .event_init = perf_tp_event_init,
+ .add = perf_trace_add,
+ .del = perf_trace_del,
+ .start = perf_swevent_start,
+ .stop = perf_swevent_stop,
+ .read = perf_swevent_read,
+};
+
static int perf_tp_filter_match(struct perf_event *event,
struct perf_sample_data *data)
{
@@ -9830,6 +10079,44 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
+static void __perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event *event)
+{
+ struct trace_entry *entry = record;
+
+ if (event->attr.config != entry->type)
+ return;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ return;
+ if (perf_tp_event_match(event, data, regs))
+ perf_swevent_event(event, count, data, regs);
+}
+
+static void perf_tp_event_target_task(u64 count, void *record,
+ struct pt_regs *regs,
+ struct perf_sample_data *data,
+ struct perf_event_context *ctx)
+{
+ unsigned int cpu = smp_processor_id();
+ struct pmu *pmu = &perf_tracepoint;
+ struct perf_event *event, *sibling;
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+
+ perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
+ __perf_tp_event_target_task(count, record, regs, data, event);
+ for_each_sibling_event(sibling, event)
+ __perf_tp_event_target_task(count, record, regs, data, sibling);
+ }
+}
+
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
struct task_struct *task)
@@ -9861,26 +10148,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
*/
if (task && task != current) {
struct perf_event_context *ctx;
- struct trace_entry *entry = record;
rcu_read_lock();
- ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ ctx = rcu_dereference(task->perf_event_ctxp);
if (!ctx)
goto unlock;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->cpu != smp_processor_id())
- continue;
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- continue;
- if (event->attr.config != entry->type)
- continue;
- /* Cannot deliver synchronous signal to other task. */
- if (event->attr.sigtrap)
- continue;
- if (perf_tp_event_match(event, &data, regs))
- perf_swevent_event(event, count, &data, regs);
- }
+ raw_spin_lock(&ctx->lock);
+ perf_tp_event_target_task(count, record, regs, &data, ctx);
+ raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();
}
@@ -9889,44 +10165,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_tp_event);
-static void tp_perf_event_destroy(struct perf_event *event)
-{
- perf_trace_destroy(event);
-}
-
-static int perf_tp_event_init(struct perf_event *event)
-{
- int err;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -ENOENT;
-
- /*
- * no branch sampling for tracepoint events
- */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
-
- err = perf_trace_init(event);
- if (err)
- return err;
-
- event->destroy = tp_perf_event_destroy;
-
- return 0;
-}
-
-static struct pmu perf_tracepoint = {
- .task_ctx_nr = perf_sw_context,
-
- .event_init = perf_tp_event_init,
- .add = perf_trace_add,
- .del = perf_trace_del,
- .start = perf_swevent_start,
- .stop = perf_swevent_stop,
- .read = perf_swevent_read,
-};
-
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
* Flags in config, used by dynamic PMU kprobe and uprobe
@@ -11013,36 +11251,9 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-/*
- * Ensures all contexts with the same task_ctx_nr have the same
- * pmu_cpu_context too.
- */
-static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
-{
- struct pmu *pmu;
-
- if (ctxn < 0)
- return NULL;
-
- list_for_each_entry(pmu, &pmus, entry) {
- if (pmu->task_ctx_nr == ctxn)
- return pmu->pmu_cpu_context;
- }
-
- return NULL;
-}
-
static void free_pmu_context(struct pmu *pmu)
{
- /*
- * Static contexts such as perf_sw_context have a global lifetime
- * and may be shared between different PMUs. Avoid freeing them
- * when a single PMU is going away.
- */
- if (pmu->task_ctx_nr > perf_invalid_context)
- return;
-
- free_percpu(pmu->pmu_cpu_context);
+ free_percpu(pmu->cpu_pmu_context);
}
/*
@@ -11106,12 +11317,11 @@ perf_event_mux_interval_ms_store(struct device *dev,
/* update all cpuctx for this PMU */
cpus_read_lock();
for_each_online_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+ struct perf_cpu_pmu_context *cpc;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
- cpu_function_call(cpu,
- (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
+ cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
}
cpus_read_unlock();
mutex_unlock(&mux_interval_mutex);
@@ -11148,13 +11358,15 @@ static int pmu_dev_alloc(struct pmu *pmu)
pmu->dev->groups = pmu->attr_groups;
device_initialize(pmu->dev);
- ret = dev_set_name(pmu->dev, "%s", pmu->name);
- if (ret)
- goto free_dev;
dev_set_drvdata(pmu->dev, pmu);
pmu->dev->bus = &pmu_bus;
pmu->dev->release = pmu_dev_release;
+
+ ret = dev_set_name(pmu->dev, "%s", pmu->name);
+ if (ret)
+ goto free_dev;
+
ret = device_add(pmu->dev);
if (ret)
goto free_dev;
@@ -11222,47 +11434,19 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
}
skip_type:
- if (pmu->task_ctx_nr == perf_hw_context) {
- static int hw_context_taken = 0;
-
- /*
- * Other than systems with heterogeneous CPUs, it never makes
- * sense for two PMUs to share perf_hw_context. PMUs which are
- * uncore must use perf_invalid_context.
- */
- if (WARN_ON_ONCE(hw_context_taken &&
- !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
- pmu->task_ctx_nr = perf_invalid_context;
-
- hw_context_taken = 1;
- }
-
- pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
- if (pmu->pmu_cpu_context)
- goto got_cpu_context;
-
ret = -ENOMEM;
- pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
- if (!pmu->pmu_cpu_context)
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+ if (!pmu->cpu_pmu_context)
goto free_dev;
for_each_possible_cpu(cpu) {
- struct perf_cpu_context *cpuctx;
-
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- __perf_event_init_context(&cpuctx->ctx);
- lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
- lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
- cpuctx->ctx.pmu = pmu;
- cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
-
- __perf_mux_hrtimer_init(cpuctx, cpu);
+ struct perf_cpu_pmu_context *cpc;
- cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
- cpuctx->heap = cpuctx->heap_default;
+ cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ __perf_init_event_pmu_context(&cpc->epc, pmu);
+ __perf_mux_hrtimer_init(cpc, cpu);
}
-got_cpu_context:
if (!pmu->start_txn) {
if (pmu->pmu_enable) {
/*
@@ -11741,10 +11925,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
/*
- * Disallow uncore-cgroup events, they don't make sense as the cgroup will
- * be different on other CPUs in the uncore mask.
+ * Disallow uncore-task events. Similarly, disallow uncore-cgroup
+ * events (they don't make sense as the cgroup will be different
+ * on other CPUs in the uncore mask).
*/
- if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
err = -EINVAL;
goto err_pmu;
}
@@ -12091,37 +12276,6 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
-/*
- * Variation on perf_event_ctx_lock_nested(), except we take two context
- * mutexes.
- */
-static struct perf_event_context *
-__perf_event_ctx_lock_double(struct perf_event *group_leader,
- struct perf_event_context *ctx)
-{
- struct perf_event_context *gctx;
-
-again:
- rcu_read_lock();
- gctx = READ_ONCE(group_leader->ctx);
- if (!refcount_inc_not_zero(&gctx->refcount)) {
- rcu_read_unlock();
- goto again;
- }
- rcu_read_unlock();
-
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
- if (group_leader->ctx != gctx) {
- mutex_unlock(&ctx->mutex);
- mutex_unlock(&gctx->mutex);
- put_ctx(gctx);
- goto again;
- }
-
- return gctx;
-}
-
static bool
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
@@ -12167,9 +12321,10 @@ SYSCALL_DEFINE5(perf_event_open,
pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
struct perf_event *group_leader = NULL, *output_event = NULL;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *event, *sibling;
struct perf_event_attr attr;
- struct perf_event_context *ctx, *gctx;
+ struct perf_event_context *ctx;
struct file *event_file = NULL;
struct fd group = {NULL, 0};
struct task_struct *task = NULL;
@@ -12299,42 +12454,53 @@ SYSCALL_DEFINE5(perf_event_open,
if (pmu->task_ctx_nr == perf_sw_context)
event->event_caps |= PERF_EV_CAP_SOFTWARE;
- if (group_leader) {
- if (is_software_event(event) &&
- !in_software_context(group_leader)) {
- /*
- * If the event is a sw event, but the group_leader
- * is on hw context.
- *
- * Allow the addition of software events to hw
- * groups, this is safe because software events
- * never fail to schedule.
- */
- pmu = group_leader->ctx->pmu;
- } else if (!is_software_event(event) &&
- is_software_event(group_leader) &&
- (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
- /*
- * In case the group is a pure software group, and we
- * try to add a hardware event, move the whole group to
- * the hardware context.
- */
- move_group = 1;
- }
+ if (task) {
+ err = down_read_interruptible(&task->signal->exec_update_lock);
+ if (err)
+ goto err_alloc;
+
+ /*
+ * We must hold exec_update_lock across this and any potential
+ * perf_install_in_context() call for this new event to
+ * serialize against exec() altering our credentials (and the
+ * perf_event_exit_task() that could imply).
+ */
+ err = -EACCES;
+ if (!perf_check_permission(&attr, task))
+ goto err_cred;
}
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_alloc;
+ goto err_cred;
+ }
+
+ mutex_lock(&ctx->mutex);
+
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
+ if (!task) {
+ /*
+ * Check if the @cpu we're creating an event for is online.
+ *
+ * We use the perf_cpu_context::ctx::mutex to serialize against
+ * the hotplug notifiers. See perf_event_{init,exit}_cpu().
+ */
+ struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
+
+ if (!cpuctx->online) {
+ err = -ENODEV;
+ goto err_locked;
+ }
}
- /*
- * Look up the group leader (we will attach this event to it):
- */
if (group_leader) {
err = -EINVAL;
@@ -12343,11 +12509,11 @@ SYSCALL_DEFINE5(perf_event_open,
* becoming part of another group-sibling):
*/
if (group_leader->group_leader != group_leader)
- goto err_context;
+ goto err_locked;
/* All events in a group should have the same clock */
if (group_leader->clock != event->clock)
- goto err_context;
+ goto err_locked;
/*
* Make sure we're both events for the same CPU;
@@ -12355,145 +12521,76 @@ SYSCALL_DEFINE5(perf_event_open,
* you can never concurrently schedule them anyhow.
*/
if (group_leader->cpu != event->cpu)
- goto err_context;
-
- /*
- * Make sure we're both on the same task, or both
- * per-CPU events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ goto err_locked;
/*
- * Do not allow to attach to a group in a different task
- * or CPU context. If we're moving SW events, we'll fix
- * this up later, so allow that.
- *
- * Racy, not holding group_leader->ctx->mutex, see comment with
- * perf_event_ctx_lock().
+ * Make sure we're both on the same context; either task or cpu.
*/
- if (!move_group && group_leader->ctx != ctx)
- goto err_context;
+ if (group_leader->ctx != ctx)
+ goto err_locked;
/*
* Only a group leader can be exclusive or pinned
*/
if (attr.exclusive || attr.pinned)
- goto err_context;
- }
-
- if (output_event) {
- err = perf_event_set_output(event, output_event);
- if (err)
- goto err_context;
- }
-
- event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
- f_flags);
- if (IS_ERR(event_file)) {
- err = PTR_ERR(event_file);
- event_file = NULL;
- goto err_context;
- }
-
- if (task) {
- err = down_read_interruptible(&task->signal->exec_update_lock);
- if (err)
- goto err_file;
-
- /*
- * We must hold exec_update_lock across this and any potential
- * perf_install_in_context() call for this new event to
- * serialize against exec() altering our credentials (and the
- * perf_event_exit_task() that could imply).
- */
- err = -EACCES;
- if (!perf_check_permission(&attr, task))
- goto err_cred;
- }
-
- if (move_group) {
- gctx = __perf_event_ctx_lock_double(group_leader, ctx);
-
- if (gctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
goto err_locked;
- }
- /*
- * Check if we raced against another sys_perf_event_open() call
- * moving the software group underneath us.
- */
- if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ if (is_software_event(event) &&
+ !in_software_context(group_leader)) {
/*
- * If someone moved the group out from under us, check
- * if this new event wound up on the same ctx, if so
- * its the regular !move_group case, otherwise fail.
+ * If the event is a sw event, but the group_leader
+ * is on hw context.
+ *
+ * Allow the addition of software events to hw
+ * groups, this is safe because software events
+ * never fail to schedule.
+ *
+ * Note the comment that goes with struct
+ * perf_event_pmu_context.
*/
- if (gctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- } else {
- perf_event_ctx_unlock(group_leader, gctx);
- move_group = 0;
- goto not_move_group;
+ pmu = group_leader->pmu_ctx->pmu;
+ } else if (!is_software_event(event)) {
+ if (is_software_event(group_leader) &&
+ (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * In case the group is a pure software group, and we
+ * try to add a hardware event, move the whole group to
+ * the hardware context.
+ */
+ move_group = 1;
}
- }
-
- /*
- * Failure to create exclusive events returns -EBUSY.
- */
- err = -EBUSY;
- if (!exclusive_event_installable(group_leader, ctx))
- goto err_locked;
- for_each_sibling_event(sibling, group_leader) {
- if (!exclusive_event_installable(sibling, ctx))
+ /* Don't allow group of multiple hw events from different pmus */
+ if (!in_software_context(group_leader) &&
+ group_leader->pmu_ctx->pmu != pmu)
goto err_locked;
}
- } else {
- mutex_lock(&ctx->mutex);
-
- /*
- * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx,
- * see the group_leader && !move_group test earlier.
- */
- if (group_leader && group_leader->ctx != ctx) {
- err = -EINVAL;
- goto err_locked;
- }
}
-not_move_group:
- if (ctx->task == TASK_TOMBSTONE) {
- err = -ESRCH;
+ /*
+ * Now that we're certain of the pmu; find the pmu_ctx.
+ */
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
goto err_locked;
}
+ event->pmu_ctx = pmu_ctx;
- if (!perf_event_validate_size(event)) {
- err = -E2BIG;
- goto err_locked;
+ if (output_event) {
+ err = perf_event_set_output(event, output_event);
+ if (err)
+ goto err_context;
}
- if (!task) {
- /*
- * Check if the @cpu we're creating an event for is online.
- *
- * We use the perf_cpu_context::ctx::mutex to serialize against
- * the hotplug notifiers. See perf_event_{init,exit}_cpu().
- */
- struct perf_cpu_context *cpuctx =
- container_of(ctx, struct perf_cpu_context, ctx);
-
- if (!cpuctx->online) {
- err = -ENODEV;
- goto err_locked;
- }
+ if (!perf_event_validate_size(event)) {
+ err = -E2BIG;
+ goto err_context;
}
if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
err = -EINVAL;
- goto err_locked;
+ goto err_context;
}
/*
@@ -12502,36 +12599,33 @@ not_move_group:
*/
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_locked;
+ goto err_context;
}
WARN_ON_ONCE(ctx->parent_ctx);
+ event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
+ if (IS_ERR(event_file)) {
+ err = PTR_ERR(event_file);
+ event_file = NULL;
+ goto err_context;
+ }
+
/*
* This is the point on no return; we cannot fail hereafter. This is
* where we start modifying current state.
*/
if (move_group) {
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
perf_remove_from_context(group_leader, 0);
- put_ctx(gctx);
+ put_pmu_ctx(group_leader->pmu_ctx);
for_each_sibling_event(sibling, group_leader) {
perf_remove_from_context(sibling, 0);
- put_ctx(gctx);
+ put_pmu_ctx(sibling->pmu_ctx);
}
/*
- * Wait for everybody to stop referencing the events through
- * the old lists, before installing it on new lists.
- */
- synchronize_rcu();
-
- /*
* Install the group siblings before the group leader.
*
* Because a group leader will try and install the entire group
@@ -12542,9 +12636,10 @@ not_move_group:
* reachable through the group lists.
*/
for_each_sibling_event(sibling, group_leader) {
+ sibling->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(sibling);
perf_install_in_context(ctx, sibling, sibling->cpu);
- get_ctx(ctx);
}
/*
@@ -12552,9 +12647,10 @@ not_move_group:
* event. What we want here is event in the initial
* startup state, ready to be add into new context.
*/
+ group_leader->pmu_ctx = pmu_ctx;
+ get_pmu_ctx(pmu_ctx);
perf_event__state_init(group_leader);
perf_install_in_context(ctx, group_leader, group_leader->cpu);
- get_ctx(ctx);
}
/*
@@ -12571,8 +12667,6 @@ not_move_group:
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -12594,25 +12688,17 @@ not_move_group:
fd_install(event_fd, event_file);
return event_fd;
+err_context:
+ /* event->pmu_ctx freed by free_event() */
err_locked:
- if (move_group)
- perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_cred:
if (task)
up_read(&task->signal->exec_update_lock);
-err_file:
- fput(event_file);
-err_context:
- perf_unpin_context(ctx);
- put_ctx(ctx);
err_alloc:
- /*
- * If event_file is set, the fput() above will have called ->release()
- * and that will take care of freeing the event.
- */
- if (!event_file)
- free_event(event);
+ free_event(event);
err_task:
if (task)
put_task_struct(task);
@@ -12638,8 +12724,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
perf_overflow_handler_t overflow_handler,
void *context)
{
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event_context *ctx;
struct perf_event *event;
+ struct pmu *pmu;
int err;
/*
@@ -12658,14 +12746,18 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
+ pmu = event->pmu;
+
+ if (pmu->task_ctx_nr == perf_sw_context)
+ event->event_caps |= PERF_EV_CAP_SOFTWARE;
/*
* Get the target context (task or percpu):
*/
- ctx = find_get_context(event->pmu, task, event);
+ ctx = find_get_context(task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
- goto err_free;
+ goto err_alloc;
}
WARN_ON_ONCE(ctx->parent_ctx);
@@ -12675,6 +12767,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
+ pmu_ctx = find_get_pmu_context(pmu, ctx, event);
+ if (IS_ERR(pmu_ctx)) {
+ err = PTR_ERR(pmu_ctx);
+ goto err_unlock;
+ }
+ event->pmu_ctx = pmu_ctx;
+
if (!task) {
/*
* Check if the @cpu we're creating an event for is online.
@@ -12686,13 +12785,13 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
container_of(ctx, struct perf_cpu_context, ctx);
if (!cpuctx->online) {
err = -ENODEV;
- goto err_unlock;
+ goto err_pmu_ctx;
}
}
if (!exclusive_event_installable(event, ctx)) {
err = -EBUSY;
- goto err_unlock;
+ goto err_pmu_ctx;
}
perf_install_in_context(ctx, event, event->cpu);
@@ -12701,44 +12800,61 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_pmu_ctx:
+ put_pmu_ctx(pmu_ctx);
err_unlock:
mutex_unlock(&ctx->mutex);
perf_unpin_context(ctx);
put_ctx(ctx);
-err_free:
+err_alloc:
free_event(event);
err:
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
-void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+static void __perf_pmu_remove(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu,
+ struct perf_event_groups *groups,
+ struct list_head *events)
{
- struct perf_event_context *src_ctx;
- struct perf_event_context *dst_ctx;
- struct perf_event *event, *tmp;
- LIST_HEAD(events);
-
- src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
- dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
+ struct perf_event *event, *sibling;
- /*
- * See perf_event_ctx_lock() for comments on the details
- * of swizzling perf_event::ctx.
- */
- mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
- list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
- event_entry) {
+ perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
perf_remove_from_context(event, 0);
- unaccount_event_cpu(event, src_cpu);
- put_ctx(src_ctx);
- list_add(&event->migrate_entry, &events);
+ unaccount_event_cpu(event, cpu);
+ put_pmu_ctx(event->pmu_ctx);
+ list_add(&event->migrate_entry, events);
+
+ for_each_sibling_event(sibling, event) {
+ perf_remove_from_context(sibling, 0);
+ unaccount_event_cpu(sibling, cpu);
+ put_pmu_ctx(sibling->pmu_ctx);
+ list_add(&sibling->migrate_entry, events);
+ }
}
+}
- /*
- * Wait for the events to quiesce before re-instating them.
- */
- synchronize_rcu();
+static void __perf_pmu_install_event(struct pmu *pmu,
+ struct perf_event_context *ctx,
+ int cpu, struct perf_event *event)
+{
+ struct perf_event_pmu_context *epc;
+
+ event->cpu = cpu;
+ epc = find_get_pmu_context(pmu, ctx, event);
+ event->pmu_ctx = epc;
+
+ if (event->state >= PERF_EVENT_STATE_OFF)
+ event->state = PERF_EVENT_STATE_INACTIVE;
+ account_event_cpu(event, cpu);
+ perf_install_in_context(ctx, event, cpu);
+}
+
+static void __perf_pmu_install(struct perf_event_context *ctx,
+ int cpu, struct pmu *pmu, struct list_head *events)
+{
+ struct perf_event *event, *tmp;
/*
* Re-instate events in 2 passes.
@@ -12748,30 +12864,48 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
* leader will enable its siblings, even if those are still on the old
* context.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
if (event->group_leader == event)
continue;
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
/*
* Once all the siblings are setup properly, install the group leaders
* to make it go.
*/
- list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
+ list_for_each_entry_safe(event, tmp, events, migrate_entry) {
list_del(&event->migrate_entry);
- if (event->state >= PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_INACTIVE;
- account_event_cpu(event, dst_cpu);
- perf_install_in_context(dst_ctx, event, dst_cpu);
- get_ctx(dst_ctx);
+ __perf_pmu_install_event(pmu, ctx, cpu, event);
}
+}
+
+void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
+{
+ struct perf_event_context *src_ctx, *dst_ctx;
+ LIST_HEAD(events);
+
+ src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
+ dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
+
+ /*
+ * See perf_event_ctx_lock() for comments on the details
+ * of swizzling perf_event::ctx.
+ */
+ mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
+
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
+ __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);
+
+ /*
+ * Wait for the events to quiesce before re-instating them.
+ */
+ synchronize_rcu();
+
+ __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
+
mutex_unlock(&dst_ctx->mutex);
mutex_unlock(&src_ctx->mutex);
}
@@ -12851,14 +12985,14 @@ perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
perf_event_wakeup(event);
}
-static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
+static void perf_event_exit_task_context(struct task_struct *child)
{
struct perf_event_context *child_ctx, *clone_ctx = NULL;
struct perf_event *child_event, *next;
WARN_ON_ONCE(child != current);
- child_ctx = perf_pin_task_context(child, ctxn);
+ child_ctx = perf_pin_task_context(child);
if (!child_ctx)
return;
@@ -12880,13 +13014,13 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
- task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
+ task_ctx_sched_out(child_ctx, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
* and mark the context dead.
*/
- RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+ RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
put_ctx(child_ctx); /* cannot be last */
WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
put_task_struct(current); /* cannot be last */
@@ -12921,7 +13055,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
void perf_event_exit_task(struct task_struct *child)
{
struct perf_event *event, *tmp;
- int ctxn;
mutex_lock(&child->perf_event_mutex);
list_for_each_entry_safe(event, tmp, &child->perf_event_list,
@@ -12937,8 +13070,7 @@ void perf_event_exit_task(struct task_struct *child)
}
mutex_unlock(&child->perf_event_mutex);
- for_each_task_context_nr(ctxn)
- perf_event_exit_task_context(child, ctxn);
+ perf_event_exit_task_context(child);
/*
* The perf_event_exit_task_context calls perf_event_task
@@ -12981,56 +13113,51 @@ void perf_event_free_task(struct task_struct *task)
{
struct perf_event_context *ctx;
struct perf_event *event, *tmp;
- int ctxn;
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
+ ctx = rcu_access_pointer(task->perf_event_ctxp);
+ if (!ctx)
+ return;
- mutex_lock(&ctx->mutex);
- raw_spin_lock_irq(&ctx->lock);
- /*
- * Destroy the task <-> ctx relation and mark the context dead.
- *
- * This is important because even though the task hasn't been
- * exposed yet the context has been (through child_list).
- */
- RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
- WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
- put_task_struct(task); /* cannot be last */
- raw_spin_unlock_irq(&ctx->lock);
+ mutex_lock(&ctx->mutex);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
- perf_free_event(event, ctx);
- mutex_unlock(&ctx->mutex);
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
+ perf_free_event(event, ctx);
- /*
- * perf_event_release_kernel() could've stolen some of our
- * child events and still have them on its free_list. In that
- * case we must wait for these events to have been freed (in
- * particular all their references to this task must've been
- * dropped).
- *
- * Without this copy_process() will unconditionally free this
- * task (irrespective of its reference count) and
- * _free_event()'s put_task_struct(event->hw.target) will be a
- * use-after-free.
- *
- * Wait for all events to drop their context reference.
- */
- wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
- put_ctx(ctx); /* must be last */
- }
+ mutex_unlock(&ctx->mutex);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
void perf_event_delayed_put(struct task_struct *task)
{
- int ctxn;
-
- for_each_task_context_nr(ctxn)
- WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
+ WARN_ON_ONCE(task->perf_event_ctxp);
}
struct file *perf_event_get(unsigned int fd)
@@ -13080,6 +13207,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event_context *child_ctx)
{
enum perf_event_state parent_state = parent_event->state;
+ struct perf_event_pmu_context *pmu_ctx;
struct perf_event *child_event;
unsigned long flags;
@@ -13100,17 +13228,12 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
-
- if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
- !child_ctx->task_ctx_data) {
- struct pmu *pmu = child_event->pmu;
-
- child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
- if (!child_ctx->task_ctx_data) {
- free_event(child_event);
- return ERR_PTR(-ENOMEM);
- }
+ pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
+ if (IS_ERR(pmu_ctx)) {
+ free_event(child_event);
+ return NULL;
}
+ child_event->pmu_ctx = pmu_ctx;
/*
* is_orphaned_event() and list_add_tail(&parent_event->child_list)
@@ -13233,11 +13356,11 @@ static int inherit_group(struct perf_event *parent_event,
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
struct perf_event_context *parent_ctx,
- struct task_struct *child, int ctxn,
+ struct task_struct *child,
u64 clone_flags, int *inherited_all)
{
- int ret;
struct perf_event_context *child_ctx;
+ int ret;
if (!event->attr.inherit ||
(event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
@@ -13247,7 +13370,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -13255,16 +13378,14 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* First allocate and initialize a context for the
* child.
*/
- child_ctx = alloc_perf_context(parent_ctx->pmu, child);
+ child_ctx = alloc_perf_context(child);
if (!child_ctx)
return -ENOMEM;
- child->perf_event_ctxp[ctxn] = child_ctx;
+ child->perf_event_ctxp = child_ctx;
}
- ret = inherit_group(event, parent, parent_ctx,
- child, child_ctx);
-
+ ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
if (ret)
*inherited_all = 0;
@@ -13274,8 +13395,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-static int perf_event_init_context(struct task_struct *child, int ctxn,
- u64 clone_flags)
+static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
@@ -13285,14 +13405,14 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
unsigned long flags;
int ret = 0;
- if (likely(!parent->perf_event_ctxp[ctxn]))
+ if (likely(!parent->perf_event_ctxp))
return 0;
/*
* If the parent's context is a clone, pin it so it won't get
* swapped under us.
*/
- parent_ctx = perf_pin_task_context(parent, ctxn);
+ parent_ctx = perf_pin_task_context(parent);
if (!parent_ctx)
return 0;
@@ -13315,8 +13435,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
*/
perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13332,8 +13451,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
ret = inherit_task_group(event, parent, parent_ctx,
- child, ctxn, clone_flags,
- &inherited_all);
+ child, clone_flags, &inherited_all);
if (ret)
goto out_unlock;
}
@@ -13341,7 +13459,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn,
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
parent_ctx->rotate_disable = 0;
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp;
if (child_ctx && inherited_all) {
/*
@@ -13377,18 +13495,16 @@ out_unlock:
*/
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
- int ctxn, ret;
+ int ret;
- memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+ child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
- for_each_task_context_nr(ctxn) {
- ret = perf_event_init_context(child, ctxn, clone_flags);
- if (ret) {
- perf_event_free_task(child);
- return ret;
- }
+ ret = perf_event_init_context(child, clone_flags);
+ if (ret) {
+ perf_event_free_task(child);
+ return ret;
}
return 0;
@@ -13397,6 +13513,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
static void __init perf_event_init_all_cpus(void)
{
struct swevent_htable *swhash;
+ struct perf_cpu_context *cpuctx;
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
@@ -13404,15 +13521,19 @@ static void __init perf_event_init_all_cpus(void)
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
mutex_init(&swhash->hlist_mutex);
- INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
-#ifdef CONFIG_CGROUP_PERF
- INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
-#endif
INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
+
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ __perf_event_init_context(&cpuctx->ctx);
+ lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
+ lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
+ cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
+ cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
+ cpuctx->heap = cpuctx->heap_default;
}
}
@@ -13434,12 +13555,12 @@ static void perf_swevent_init_cpu(unsigned int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
+ struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_context *ctx = __info;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event *event;
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ ctx_sched_out(ctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
@@ -13449,18 +13570,16 @@ static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
+ // XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
- cpuctx->online = 0;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ cpuctx->online = 0;
+ mutex_unlock(&ctx->mutex);
cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
@@ -13474,20 +13593,17 @@ int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
- struct pmu *pmu;
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
cpumask_set_cpu(cpu, perf_online_mask);
- list_for_each_entry(pmu, &pmus, entry) {
- cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
- ctx = &cpuctx->ctx;
+ cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
- mutex_lock(&ctx->mutex);
- cpuctx->online = 1;
- mutex_unlock(&ctx->mutex);
- }
+ mutex_lock(&ctx->mutex);
+ cpuctx->online = 1;
+ mutex_unlock(&ctx->mutex);
mutex_unlock(&pmus_lock);
return 0;
@@ -13624,9 +13740,12 @@ static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
- rcu_read_lock();
- perf_cgroup_switch(task);
- rcu_read_unlock();
+
+ preempt_disable();
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+ perf_cgroup_switch(task);
+ preempt_enable();
+
return 0;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 35e0a31a0315..15dc2ec80c46 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -67,11 +67,58 @@
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
+#include <linux/sysfs.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/mmu_context.h>
+/*
+ * The default value should be high enough to not crash a system that randomly
+ * crashes its kernel from time to time, but low enough to at least not permit
+ * overflowing 32-bit refcounts or the ldsem writer count.
+ */
+static unsigned int oops_limit = 10000;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kern_exit_table[] = {
+ {
+ .procname = "oops_limit",
+ .data = &oops_limit,
+ .maxlen = sizeof(oops_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ { }
+};
+
+static __init int kernel_exit_sysctls_init(void)
+{
+ register_sysctl_init("kernel", kern_exit_table);
+ return 0;
+}
+late_initcall(kernel_exit_sysctls_init);
+#endif
+
+static atomic_t oops_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
+}
+
+static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
+
+static __init int kernel_exit_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_exit_sysfs_init);
+#endif
+
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
@@ -884,6 +931,7 @@ void __noreturn make_task_dead(int signr)
* Then do everything else.
*/
struct task_struct *tsk = current;
+ unsigned int limit;
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
@@ -898,6 +946,20 @@ void __noreturn make_task_dead(int signr)
}
/*
+ * Every time the system oopses, if the oops happens while a reference
+ * to an object was held, the reference leaks.
+ * If the oops doesn't also leak memory, repeated oopsing can cause
+ * reference counters to wrap around (if they're not using refcount_t).
+ * This means that repeated oopsing can make unexploitable-looking bugs
+ * exploitable through repeated oopsing.
+ * To make sure this can't happen, place an upper bound on how often the
+ * kernel may oops without panic().
+ */
+ limit = READ_ONCE(oops_limit);
+ if (atomic_inc_return(&oops_count) >= limit && limit)
+ panic("Oopsed too often (kernel.oops_limit is %d)", limit);
+
+ /*
* We're taking recursive faults here in make_task_dead. Safest is to just
* leave this task alone and wait for reboot.
*/
diff --git a/kernel/fork.c b/kernel/fork.c
index 08969f5aa38d..9f7fe3541897 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -75,7 +75,6 @@
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
-#include <linux/random.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
@@ -97,6 +96,7 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/stackprotector.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -535,6 +535,9 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+#ifdef CONFIG_SECCOMP
+ WARN_ON_ONCE(tsk->seccomp.filter);
+#endif
release_user_cpus_ptr(tsk);
scs_release(tsk);
@@ -753,8 +756,13 @@ static void check_mm(struct mm_struct *mm)
"Please make sure 'struct resident_page_types[]' is updated as well");
for (i = 0; i < NR_MM_COUNTERS; i++) {
- long x = atomic_long_read(&mm->rss_stat.count[i]);
+ long x = percpu_counter_sum(&mm->rss_stat[i]);
+
+ if (likely(!x))
+ continue;
+ /* Making sure this is not due to race with CPU offlining. */
+ x = percpu_counter_sum_all(&mm->rss_stat[i]);
if (unlikely(x))
pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
mm, resident_page_types[i], x);
@@ -779,6 +787,8 @@ static void check_mm(struct mm_struct *mm)
*/
void __mmdrop(struct mm_struct *mm)
{
+ int i;
+
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
WARN_ON_ONCE(mm == current->active_mm);
@@ -788,6 +798,9 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ percpu_counter_destroy(&mm->rss_stat[i]);
free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1107,6 +1120,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
+ int i;
+
mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
atomic_set(&mm->mm_users, 1);
@@ -1148,10 +1163,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ for (i = 0; i < NR_MM_COUNTERS; i++)
+ if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
+ goto fail_pcpu;
+
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
return mm;
+fail_pcpu:
+ while (i > 0)
+ percpu_counter_destroy(&mm->rss_stat[--i]);
fail_nocontext:
mm_free_pgd(mm);
fail_nopgd:
@@ -2043,15 +2065,6 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
- /*
- * If the new process will be in a different time namespace
- * do not allow it to share VM or a thread group with the forking task.
- */
- if (clone_flags & (CLONE_THREAD | CLONE_VM)) {
- if (nsp->time_ns != nsp->time_ns_for_children)
- return ERR_PTR(-EINVAL);
- }
-
if (clone_flags & CLONE_PIDFD) {
/*
* - CLONE_DETACHED is blocked so that we can potentially
@@ -2406,12 +2419,6 @@ static __latent_entropy struct task_struct *copy_process(
spin_lock(&current->sighand->siglock);
- /*
- * Copy seccomp details explicitly here, in case they were changed
- * before holding sighand lock.
- */
- copy_seccomp(p);
-
rv_task_fork(p);
rseq_fork(p, clone_flags);
@@ -2428,6 +2435,14 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
+ /* No more failure paths after this point. */
+
+ /*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
+
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -2592,11 +2607,6 @@ struct task_struct * __init fork_idle(int cpu)
return task;
}
-struct mm_struct *copy_init_mm(void)
-{
- return dup_mm(NULL, &init_mm);
-}
-
/*
* This is like kernel_clone(), but shaved down and tailored to just
* creating io_uring workers. It returns a created task, or an error pointer.
@@ -3015,10 +3025,27 @@ static void sighand_ctor(void *data)
init_waitqueue_head(&sighand->signalfd_wqh);
}
-void __init proc_caches_init(void)
+void __init mm_cache_init(void)
{
unsigned int mm_size;
+ /*
+ * The mm_cpumask is located at the end of mm_struct, and is
+ * dynamically sized based on the maximum CPU number this system
+ * can have, taking hotplug into account (nr_cpu_ids).
+ */
+ mm_size = sizeof(struct mm_struct) + cpumask_size();
+
+ mm_cachep = kmem_cache_create_usercopy("mm_struct",
+ mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
+ offsetof(struct mm_struct, saved_auxv),
+ sizeof_field(struct mm_struct, saved_auxv),
+ NULL);
+}
+
+void __init proc_caches_init(void)
+{
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3036,19 +3063,6 @@ void __init proc_caches_init(void)
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
- /*
- * The mm_cpumask is located at the end of mm_struct, and is
- * dynamically sized based on the maximum CPU number this system
- * can have, taking hotplug into account (nr_cpu_ids).
- */
- mm_size = sizeof(struct mm_struct) + cpumask_size();
-
- mm_cachep = kmem_cache_create_usercopy("mm_struct",
- mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
- offsetof(struct mm_struct, saved_auxv),
- sizeof_field(struct mm_struct, saved_auxv),
- NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index b22ef1efe751..514e4582b863 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -638,6 +638,7 @@ static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
bool pi, bool pending_op)
{
u32 uval, nval, mval;
+ pid_t owner;
int err;
/* Futex address must be 32bit aligned */
@@ -659,6 +660,10 @@ retry:
* 2. A woken up waiter is killed before it can acquire the
* futex in user space.
*
+ * In the second case, the wake up notification could be generated
+ * by the unlock path in user space after setting the futex value
+ * to zero or by the kernel after setting the OWNER_DIED bit below.
+ *
* In both cases the TID validation below prevents a wakeup of
* potential waiters which can cause these waiters to block
* forever.
@@ -667,24 +672,27 @@ retry:
*
* 1) task->robust_list->list_op_pending != NULL
* @pending_op == true
- * 2) User space futex value == 0
+ * 2) The owner part of user space futex value == 0
* 3) Regular futex: @pi == false
*
* If these conditions are met, it is safe to attempt waking up a
* potential waiter without touching the user space futex value and
- * trying to set the OWNER_DIED bit. The user space futex value is
- * uncontended and the rest of the user space mutex state is
- * consistent, so a woken waiter will just take over the
- * uncontended futex. Setting the OWNER_DIED bit would create
- * inconsistent state and malfunction of the user space owner died
- * handling.
+ * trying to set the OWNER_DIED bit. If the futex value is zero,
+ * the rest of the user space mutex state is consistent, so a woken
+ * waiter will just take over the uncontended futex. Setting the
+ * OWNER_DIED bit would create inconsistent state and malfunction
+ * of the user space owner died handling. Otherwise, the OWNER_DIED
+ * bit is already set, and the woken waiter is expected to deal with
+ * this.
*/
- if (pending_op && !pi && !uval) {
+ owner = uval & FUTEX_TID_MASK;
+
+ if (pending_op && !pi && !owner) {
futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
return 0;
}
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr))
+ if (owner != task_pid_vnr(curr))
return 0;
/*
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index cbb0bed958ab..7670a811a565 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -280,6 +280,8 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
for (i = 0; i < sfn_ptr->num_counters; i++)
dfn_ptr->counters[i] += sfn_ptr->counters[i];
+
+ sfn_ptr = list_next_entry(sfn_ptr, head);
}
}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 7971e989e425..74a4ef1da9ad 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -82,6 +82,7 @@ struct gcov_fn_info {
* @version: gcov version magic indicating the gcc version used for compilation
* @next: list head for a singly-linked list
* @stamp: uniquifying time stamp
+ * @checksum: unique object checksum
* @filename: name of the associated gcov data file
* @merge: merge functions (null for unused counter type)
* @n_functions: number of instrumented functions
@@ -94,6 +95,10 @@ struct gcov_info {
unsigned int version;
struct gcov_info *next;
unsigned int stamp;
+ /* Since GCC 12.1 a checksum field is added. */
+#if (__GNUC__ >= 12)
+ unsigned int checksum;
+#endif
const char *filename;
void (*merge[GCOV_COUNTERS])(gcov_type *, unsigned int);
unsigned int n_functions;
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index db3d174c53d4..b64c44ae4c25 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -86,15 +86,10 @@ config GENERIC_IRQ_IPI
depends on SMP
select IRQ_DOMAIN_HIERARCHY
-# Generic MSI interrupt support
-config GENERIC_MSI_IRQ
- bool
-
# Generic MSI hierarchical interrupt domain support
-config GENERIC_MSI_IRQ_DOMAIN
+config GENERIC_MSI_IRQ
bool
select IRQ_DOMAIN_HIERARCHY
- select GENERIC_MSI_IRQ
config IRQ_MSI_IOMMU
bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 8ac37e8e738a..49e7bc871fec 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1561,10 +1561,10 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
-static struct device *irq_get_parent_device(struct irq_data *data)
+static struct device *irq_get_pm_device(struct irq_data *data)
{
if (data->domain)
- return data->domain->dev;
+ return data->domain->pm_dev;
return NULL;
}
@@ -1578,7 +1578,7 @@ static struct device *irq_get_parent_device(struct irq_data *data)
*/
int irq_chip_pm_get(struct irq_data *data)
{
- struct device *dev = irq_get_parent_device(data);
+ struct device *dev = irq_get_pm_device(data);
int retval = 0;
if (IS_ENABLED(CONFIG_PM) && dev)
@@ -1597,7 +1597,7 @@ int irq_chip_pm_get(struct irq_data *data)
*/
int irq_chip_pm_put(struct irq_data *data)
{
- struct device *dev = irq_get_parent_device(data);
+ struct device *dev = irq_get_pm_device(data);
int retval = 0;
if (IS_ENABLED(CONFIG_PM) && dev)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index f09c60393e55..5fdc0b557579 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -52,6 +52,7 @@ enum {
* IRQS_PENDING - irq is pending and replayed later
* IRQS_SUSPENDED - irq is suspended
* IRQS_NMI - irq line is used to deliver NMIs
+ * IRQS_SYSFS - descriptor has been added to sysfs
*/
enum {
IRQS_AUTODETECT = 0x00000001,
@@ -64,6 +65,7 @@ enum {
IRQS_SUSPENDED = 0x00000800,
IRQS_TIMINGS = 0x00001000,
IRQS_NMI = 0x00002000,
+ IRQS_SYSFS = 0x00004000,
};
#include "debug.h"
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index a91f9001103c..fd0996274401 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -288,22 +288,25 @@ static void irq_sysfs_add(int irq, struct irq_desc *desc)
if (irq_kobj_base) {
/*
* Continue even in case of failure as this is nothing
- * crucial.
+ * crucial and failures in the late irq_sysfs_init()
+ * cannot be rolled back.
*/
if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq))
pr_warn("Failed to add kobject for irq %d\n", irq);
+ else
+ desc->istate |= IRQS_SYSFS;
}
}
static void irq_sysfs_del(struct irq_desc *desc)
{
/*
- * If irq_sysfs_init() has not yet been invoked (early boot), then
- * irq_kobj_base is NULL and the descriptor was never added.
- * kobject_del() complains about a object with no parent, so make
- * it conditional.
+ * Only invoke kobject_del() when kobject_add() was successfully
+ * invoked for the descriptor. This covers both early boot, where
+ * sysfs is not initialized yet, and the case of a failed
+ * kobject_add() invocation.
*/
- if (irq_kobj_base)
+ if (desc->istate & IRQS_SYSFS)
kobject_del(&desc->kobj);
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 40fe7806cc8c..5b7cf28df290 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -321,7 +321,7 @@ static int irq_try_set_affinity(struct irq_data *data,
}
static bool irq_set_affinity_deactivated(struct irq_data *data,
- const struct cpumask *mask, bool force)
+ const struct cpumask *mask)
{
struct irq_desc *desc = irq_data_to_desc(data);
@@ -354,7 +354,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
if (!chip || !chip->irq_set_affinity)
return -EINVAL;
- if (irq_set_affinity_deactivated(data, mask, force))
+ if (irq_set_affinity_deactivated(data, mask))
return 0;
if (irq_can_move_pcntxt(data) && !irqd_is_setaffinity_pending(data)) {
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index a9ee535293eb..955267bbc2be 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -19,8 +19,31 @@
#include "internals.h"
+/**
+ * struct msi_ctrl - MSI internal management control structure
+ * @domid: ID of the domain on which management operations should be done
+ * @first: First (hardware) slot index to operate on
+ * @last: Last (hardware) slot index to operate on
+ * @nirqs: The number of Linux interrupts to allocate. Can be larger
+ * than the range due to PCI/multi-MSI.
+ */
+struct msi_ctrl {
+ unsigned int domid;
+ unsigned int first;
+ unsigned int last;
+ unsigned int nirqs;
+};
+
+/* Invalid Xarray index which is outside of any searchable range */
+#define MSI_XA_MAX_INDEX (ULONG_MAX - 1)
+/* The maximum domain size */
+#define MSI_XA_DOMAIN_SIZE (MSI_MAX_INDEX + 1)
+
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl);
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid);
static inline int msi_sysfs_create_group(struct device *dev);
+
/**
* msi_alloc_desc - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
@@ -33,7 +56,7 @@ static inline int msi_sysfs_create_group(struct device *dev);
* Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
- const struct irq_affinity_desc *affinity)
+ const struct irq_affinity_desc *affinity)
{
struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
@@ -58,25 +81,56 @@ static void msi_free_desc(struct msi_desc *desc)
kfree(desc);
}
-static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index)
+static int msi_insert_desc(struct device *dev, struct msi_desc *desc,
+ unsigned int domid, unsigned int index)
{
+ struct msi_device_data *md = dev->msi.data;
+ struct xarray *xa = &md->__domains[domid].store;
+ unsigned int hwsize;
int ret;
- desc->msi_index = index;
- ret = xa_insert(&md->__store, index, desc, GFP_KERNEL);
- if (ret)
- msi_free_desc(desc);
+ hwsize = msi_domain_get_hwsize(dev, domid);
+
+ if (index == MSI_ANY_INDEX) {
+ struct xa_limit limit = { .min = 0, .max = hwsize - 1 };
+ unsigned int index;
+
+ /* Let the xarray allocate a free index within the limit */
+ ret = xa_alloc(xa, &index, desc, limit, GFP_KERNEL);
+ if (ret)
+ goto fail;
+
+ desc->msi_index = index;
+ return 0;
+ } else {
+ if (index >= hwsize) {
+ ret = -ERANGE;
+ goto fail;
+ }
+
+ desc->msi_index = index;
+ ret = xa_insert(xa, index, desc, GFP_KERNEL);
+ if (ret)
+ goto fail;
+ return 0;
+ }
+fail:
+ msi_free_desc(desc);
return ret;
}
/**
- * msi_add_msi_desc - Allocate and initialize a MSI descriptor
+ * msi_domain_insert_msi_desc - Allocate and initialize a MSI descriptor and
+ * insert it at @init_desc->msi_index
+ *
* @dev: Pointer to the device for which the descriptor is allocated
+ * @domid: The id of the interrupt domain to which the desriptor is added
* @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
*
* Return: 0 on success or an appropriate failure code.
*/
-int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
+int msi_domain_insert_msi_desc(struct device *dev, unsigned int domid,
+ struct msi_desc *init_desc)
{
struct msi_desc *desc;
@@ -88,40 +142,8 @@ int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
/* Copy type specific data to the new descriptor. */
desc->pci = init_desc->pci;
- return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index);
-}
-/**
- * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors
- * @dev: Pointer to the device for which the descriptors are allocated
- * @index: Index for the first MSI descriptor
- * @ndesc: Number of descriptors to allocate
- *
- * Return: 0 on success or an appropriate failure code.
- */
-static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc)
-{
- unsigned int idx, last = index + ndesc - 1;
- struct msi_desc *desc;
- int ret;
-
- lockdep_assert_held(&dev->msi.data->mutex);
-
- for (idx = index; idx <= last; idx++) {
- desc = msi_alloc_desc(dev, 1, NULL);
- if (!desc)
- goto fail_mem;
- ret = msi_insert_desc(dev->msi.data, desc, idx);
- if (ret)
- goto fail;
- }
- return 0;
-
-fail_mem:
- ret = -ENOMEM;
-fail:
- msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last);
- return ret;
+ return msi_insert_desc(dev, desc, domid, init_desc->msi_index);
}
static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
@@ -138,28 +160,97 @@ static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
return false;
}
+static bool msi_ctrl_valid(struct device *dev, struct msi_ctrl *ctrl)
+{
+ unsigned int hwsize;
+
+ if (WARN_ON_ONCE(ctrl->domid >= MSI_MAX_DEVICE_IRQDOMAINS ||
+ (dev->msi.domain &&
+ !dev->msi.data->__domains[ctrl->domid].domain)))
+ return false;
+
+ hwsize = msi_domain_get_hwsize(dev, ctrl->domid);
+ if (WARN_ON_ONCE(ctrl->first > ctrl->last ||
+ ctrl->first >= hwsize ||
+ ctrl->last >= hwsize))
+ return false;
+ return true;
+}
+
+static void msi_domain_free_descs(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_desc *desc;
+ struct xarray *xa;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ xa = &dev->msi.data->__domains[ctrl->domid].store;
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ xa_erase(xa, idx);
+
+ /* Leak the descriptor when it is still referenced */
+ if (WARN_ON_ONCE(msi_desc_match(desc, MSI_DESC_ASSOCIATED)))
+ continue;
+ msi_free_desc(desc);
+ }
+}
+
/**
- * msi_free_msi_descs_range - Free MSI descriptors of a device
- * @dev: Device to free the descriptors
- * @filter: Descriptor state filter
- * @first_index: Index to start freeing from
- * @last_index: Last index to be freed
+ * msi_domain_free_msi_descs_range - Free a range of MSI descriptors of a device in an irqdomain
+ * @dev: Device for which to free the descriptors
+ * @domid: Id of the domain to operate on
+ * @first: Index to start freeing from (inclusive)
+ * @last: Last index to be freed (inclusive)
*/
-void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
- unsigned int first_index, unsigned int last_index)
+void msi_domain_free_msi_descs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+
+ msi_domain_free_descs(dev, &ctrl);
+}
+
+/**
+ * msi_domain_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @ctrl: Allocation control struct
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static int msi_domain_add_simple_msi_descs(struct device *dev, struct msi_ctrl *ctrl)
{
- struct xarray *xa = &dev->msi.data->__store;
struct msi_desc *desc;
- unsigned long idx;
+ unsigned int idx;
+ int ret;
lockdep_assert_held(&dev->msi.data->mutex);
- xa_for_each_range(xa, idx, desc, first_index, last_index) {
- if (msi_desc_match(desc, filter)) {
- xa_erase(xa, idx);
- msi_free_desc(desc);
- }
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ for (idx = ctrl->first; idx <= ctrl->last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev, desc, ctrl->domid, idx);
+ if (ret)
+ goto fail;
}
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_domain_free_descs(dev, ctrl);
+ return ret;
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -178,9 +269,13 @@ EXPORT_SYMBOL_GPL(get_cached_msi_msg);
static void msi_device_data_release(struct device *dev, void *res)
{
struct msi_device_data *md = res;
+ int i;
- WARN_ON_ONCE(!xa_empty(&md->__store));
- xa_destroy(&md->__store);
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++) {
+ msi_remove_device_irq_domain(dev, i);
+ WARN_ON_ONCE(!xa_empty(&md->__domains[i].store));
+ xa_destroy(&md->__domains[i].store);
+ }
dev->msi.data = NULL;
}
@@ -197,7 +292,7 @@ static void msi_device_data_release(struct device *dev, void *res)
int msi_setup_device_data(struct device *dev)
{
struct msi_device_data *md;
- int ret;
+ int ret, i;
if (dev->msi.data)
return 0;
@@ -212,7 +307,18 @@ int msi_setup_device_data(struct device *dev)
return ret;
}
- xa_init(&md->__store);
+ for (i = 0; i < MSI_MAX_DEVICE_IRQDOMAINS; i++)
+ xa_init_flags(&md->__domains[i].store, XA_FLAGS_ALLOC);
+
+ /*
+ * If @dev::msi::domain is set and is a global MSI domain, copy the
+ * pointer into the domain array so all code can operate on domain
+ * ids. The NULL pointer check is required to keep the legacy
+ * architecture specific PCI/MSI support working.
+ */
+ if (dev->msi.domain && !irq_domain_is_msi_parent(dev->msi.domain))
+ md->__domains[MSI_DEFAULT_DOMAIN].domain = dev->msi.domain;
+
mutex_init(&md->mutex);
dev->msi.data = md;
devres_add(dev, md);
@@ -235,27 +341,30 @@ EXPORT_SYMBOL_GPL(msi_lock_descs);
*/
void msi_unlock_descs(struct device *dev)
{
- /* Invalidate the index wich was cached by the iterator */
- dev->msi.data->__iter_idx = MSI_MAX_INDEX;
+ /* Invalidate the index which was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
mutex_unlock(&dev->msi.data->mutex);
}
EXPORT_SYMBOL_GPL(msi_unlock_descs);
-static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter)
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
+ enum msi_desc_filter filter)
{
+ struct xarray *xa = &md->__domains[domid].store;
struct msi_desc *desc;
- xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) {
+ xa_for_each_start(xa, md->__iter_idx, desc, md->__iter_idx) {
if (msi_desc_match(desc, filter))
return desc;
}
- md->__iter_idx = MSI_MAX_INDEX;
+ md->__iter_idx = MSI_XA_MAX_INDEX;
return NULL;
}
/**
- * msi_first_desc - Get the first MSI descriptor of a device
+ * msi_domain_first_desc - Get the first MSI descriptor of an irqdomain associated to a device
* @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
* @filter: Descriptor state filter
*
* Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
@@ -264,23 +373,26 @@ static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_
* Return: Pointer to the first MSI descriptor matching the search
* criteria, NULL if none found.
*/
-struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
+struct msi_desc *msi_domain_first_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
- if (WARN_ON_ONCE(!md))
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
return NULL;
lockdep_assert_held(&md->mutex);
md->__iter_idx = 0;
- return msi_find_desc(md, filter);
+ return msi_find_desc(md, domid, filter);
}
-EXPORT_SYMBOL_GPL(msi_first_desc);
+EXPORT_SYMBOL_GPL(msi_domain_first_desc);
/**
* msi_next_desc - Get the next MSI descriptor of a device
* @dev: Device to operate on
+ * @domid: The id of the interrupt domain which should be walked.
+ * @filter: Descriptor state filter
*
* The first invocation of msi_next_desc() has to be preceeded by a
* successful invocation of __msi_first_desc(). Consecutive invocations are
@@ -290,11 +402,12 @@ EXPORT_SYMBOL_GPL(msi_first_desc);
* Return: Pointer to the next MSI descriptor matching the search
* criteria, NULL if none found.
*/
-struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
+struct msi_desc *msi_next_desc(struct device *dev, unsigned int domid,
+ enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
- if (WARN_ON_ONCE(!md))
+ if (WARN_ON_ONCE(!md || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
return NULL;
lockdep_assert_held(&md->mutex);
@@ -303,30 +416,38 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
return NULL;
md->__iter_idx++;
- return msi_find_desc(md, filter);
+ return msi_find_desc(md, domid, filter);
}
EXPORT_SYMBOL_GPL(msi_next_desc);
/**
- * msi_get_virq - Return Linux interrupt number of a MSI interrupt
+ * msi_domain_get_virq - Lookup the Linux interrupt number for a MSI index on a interrupt domain
* @dev: Device to operate on
+ * @domid: Domain ID of the interrupt domain associated to the device
* @index: MSI interrupt index to look for (0-based)
*
* Return: The Linux interrupt number on success (> 0), 0 if not found
*/
-unsigned int msi_get_virq(struct device *dev, unsigned int index)
+unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
{
struct msi_desc *desc;
unsigned int ret = 0;
- bool pcimsi;
+ bool pcimsi = false;
+ struct xarray *xa;
if (!dev->msi.data)
return 0;
- pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
+ if (WARN_ON_ONCE(index > MSI_MAX_INDEX || domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return 0;
+
+ /* This check is only valid for the PCI default MSI domain */
+ if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
+ pcimsi = to_pci_dev(dev)->msi_enabled;
msi_lock_descs(dev);
- desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
+ xa = &dev->msi.data->__domains[domid].store;
+ desc = xa_load(xa, pcimsi ? 0 : index);
if (desc && desc->irq) {
/*
* PCI-MSI has only one descriptor for multiple interrupts.
@@ -340,10 +461,11 @@ unsigned int msi_get_virq(struct device *dev, unsigned int index)
ret = desc->irq;
}
}
+
msi_unlock_descs(dev);
return ret;
}
-EXPORT_SYMBOL_GPL(msi_get_virq);
+EXPORT_SYMBOL_GPL(msi_domain_get_virq);
#ifdef CONFIG_SYSFS
static struct attribute *msi_dev_attrs[] = {
@@ -459,7 +581,39 @@ static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *d
static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
#endif /* !CONFIG_SYSFS */
-#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
+static struct irq_domain *msi_get_device_domain(struct device *dev, unsigned int domid)
+{
+ struct irq_domain *domain;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ if (WARN_ON_ONCE(domid >= MSI_MAX_DEVICE_IRQDOMAINS))
+ return NULL;
+
+ domain = dev->msi.data->__domains[domid].domain;
+ if (!domain)
+ return NULL;
+
+ if (WARN_ON_ONCE(irq_domain_is_msi_parent(domain)))
+ return NULL;
+
+ return domain;
+}
+
+static unsigned int msi_domain_get_hwsize(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+
+ domain = msi_get_device_domain(dev, domid);
+ if (domain) {
+ info = domain->host_data;
+ return info->hwsize;
+ }
+ /* No domain, default to MSI_XA_DOMAIN_SIZE */
+ return MSI_XA_DOMAIN_SIZE;
+}
+
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
{
@@ -613,21 +767,11 @@ static int msi_domain_ops_init(struct irq_domain *domain,
return 0;
}
-static int msi_domain_ops_check(struct irq_domain *domain,
- struct msi_domain_info *info,
- struct device *dev)
-{
- return 0;
-}
-
static struct msi_domain_ops msi_domain_ops_default = {
.get_hwirq = msi_domain_ops_get_hwirq,
.msi_init = msi_domain_ops_init,
- .msi_check = msi_domain_ops_check,
.msi_prepare = msi_domain_ops_prepare,
.set_desc = msi_domain_ops_set_desc,
- .domain_alloc_irqs = __msi_domain_alloc_irqs,
- .domain_free_irqs = __msi_domain_free_irqs,
};
static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -639,11 +783,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
return;
}
- if (ops->domain_alloc_irqs == NULL)
- ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs;
- if (ops->domain_free_irqs == NULL)
- ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs;
-
if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
return;
@@ -651,8 +790,6 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
ops->get_hwirq = msi_domain_ops_default.get_hwirq;
if (ops->msi_init == NULL)
ops->msi_init = msi_domain_ops_default.msi_init;
- if (ops->msi_check == NULL)
- ops->msi_check = msi_domain_ops_default.msi_check;
if (ops->msi_prepare == NULL)
ops->msi_prepare = msi_domain_ops_default.msi_prepare;
if (ops->set_desc == NULL)
@@ -668,6 +805,40 @@ static void msi_domain_update_chip_ops(struct msi_domain_info *info)
chip->irq_set_affinity = msi_domain_set_affinity;
}
+static struct irq_domain *__msi_create_irq_domain(struct fwnode_handle *fwnode,
+ struct msi_domain_info *info,
+ unsigned int flags,
+ struct irq_domain *parent)
+{
+ struct irq_domain *domain;
+
+ if (info->hwsize > MSI_XA_DOMAIN_SIZE)
+ return NULL;
+
+ /*
+ * Hardware size 0 is valid for backwards compatibility and for
+ * domains which are not backed by a hardware table. Grant the
+ * maximum index space.
+ */
+ if (!info->hwsize)
+ info->hwsize = MSI_XA_DOMAIN_SIZE;
+
+ msi_domain_update_dom_ops(info);
+ if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
+ msi_domain_update_chip_ops(info);
+
+ domain = irq_domain_create_hierarchy(parent, flags | IRQ_DOMAIN_FLAG_MSI, 0,
+ fwnode, &msi_domain_ops, info);
+
+ if (domain) {
+ if (!domain->name && info->chip)
+ domain->name = info->chip->name;
+ irq_domain_update_bus_token(domain, info->bus_token);
+ }
+
+ return domain;
+}
+
/**
* msi_create_irq_domain - Create an MSI interrupt domain
* @fwnode: Optional fwnode of the interrupt controller
@@ -680,19 +851,210 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
struct msi_domain_info *info,
struct irq_domain *parent)
{
+ return __msi_create_irq_domain(fwnode, info, 0, parent);
+}
+
+/**
+ * msi_parent_init_dev_msi_info - Delegate initialization of device MSI info down
+ * in the domain hierarchy
+ * @dev: The device for which the domain should be created
+ * @domain: The domain in the hierarchy this op is being called on
+ * @msi_parent_domain: The IRQ_DOMAIN_FLAG_MSI_PARENT domain for the child to
+ * be created
+ * @msi_child_info: The MSI domain info of the IRQ_DOMAIN_FLAG_MSI_DEVICE
+ * domain to be created
+ *
+ * Return: true on success, false otherwise
+ *
+ * This is the most complex problem of per device MSI domains and the
+ * underlying interrupt domain hierarchy:
+ *
+ * The device domain to be initialized requests the broadest feature set
+ * possible and the underlying domain hierarchy puts restrictions on it.
+ *
+ * That's trivial for a simple parent->child relationship, but it gets
+ * interesting with an intermediate domain: root->parent->child. The
+ * intermediate 'parent' can expand the capabilities which the 'root'
+ * domain is providing. So that creates a classic hen and egg problem:
+ * Which entity is doing the restrictions/expansions?
+ *
+ * One solution is to let the root domain handle the initialization that's
+ * why there is the @domain and the @msi_parent_domain pointer.
+ */
+bool msi_parent_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+ struct irq_domain *msi_parent_domain,
+ struct msi_domain_info *msi_child_info)
+{
+ struct irq_domain *parent = domain->parent;
+
+ if (WARN_ON_ONCE(!parent || !parent->msi_parent_ops ||
+ !parent->msi_parent_ops->init_dev_msi_info))
+ return false;
+
+ return parent->msi_parent_ops->init_dev_msi_info(dev, parent, msi_parent_domain,
+ msi_child_info);
+}
+
+/**
+ * msi_create_device_irq_domain - Create a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @template: MSI domain info bundle used as template
+ * @hwsize: Maximum number of MSI table entries (0 if unknown or unlimited)
+ * @domain_data: Optional pointer to domain specific data which is set in
+ * msi_domain_info::data
+ * @chip_data: Optional pointer to chip specific data which is set in
+ * msi_domain_info::chip_data
+ *
+ * Return: True on success, false otherwise
+ *
+ * There is no firmware node required for this interface because the per
+ * device domains are software constructs which are actually closer to the
+ * hardware reality than any firmware can describe them.
+ *
+ * The domain name and the irq chip name for a MSI device domain are
+ * composed by: "$(PREFIX)$(CHIPNAME)-$(DEVNAME)"
+ *
+ * $PREFIX: Optional prefix provided by the underlying MSI parent domain
+ * via msi_parent_ops::prefix. If that pointer is NULL the prefix
+ * is empty.
+ * $CHIPNAME: The name of the irq_chip in @template
+ * $DEVNAME: The name of the device
+ *
+ * This results in understandable chip names and hardware interrupt numbers
+ * in e.g. /proc/interrupts
+ *
+ * PCI-MSI-0000:00:1c.0 0-edge Parent domain has no prefix
+ * IR-PCI-MSI-0000:00:1c.4 0-edge Same with interrupt remapping prefix 'IR-'
+ *
+ * IR-PCI-MSIX-0000:3d:00.0 0-edge Hardware interrupt numbers reflect
+ * IR-PCI-MSIX-0000:3d:00.0 1-edge the real MSI-X index on that device
+ * IR-PCI-MSIX-0000:3d:00.0 2-edge
+ *
+ * On IMS domains the hardware interrupt number is either a table entry
+ * index or a purely software managed index but it is guaranteed to be
+ * unique.
+ *
+ * The domain pointer is stored in @dev::msi::data::__irqdomains[]. All
+ * subsequent operations on the domain depend on the domain id.
+ *
+ * The domain is automatically freed when the device is removed via devres
+ * in the context of @dev::msi::data freeing, but it can also be
+ * independently removed via @msi_remove_device_irq_domain().
+ */
+bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
+ const struct msi_domain_template *template,
+ unsigned int hwsize, void *domain_data,
+ void *chip_data)
+{
+ struct irq_domain *domain, *parent = dev->msi.domain;
+ const struct msi_parent_ops *pops;
+ struct msi_domain_template *bundle;
+ struct fwnode_handle *fwnode;
+
+ if (!irq_domain_is_msi_parent(parent))
+ return false;
+
+ if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
+ return false;
+
+ bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ if (!bundle)
+ return false;
+
+ bundle->info.hwsize = hwsize;
+ bundle->info.chip = &bundle->chip;
+ bundle->info.ops = &bundle->ops;
+ bundle->info.data = domain_data;
+ bundle->info.chip_data = chip_data;
+
+ pops = parent->msi_parent_ops;
+ snprintf(bundle->name, sizeof(bundle->name), "%s%s-%s",
+ pops->prefix ? : "", bundle->chip.name, dev_name(dev));
+ bundle->chip.name = bundle->name;
+
+ fwnode = irq_domain_alloc_named_fwnode(bundle->name);
+ if (!fwnode)
+ goto free_bundle;
+
+ if (msi_setup_device_data(dev))
+ goto free_fwnode;
+
+ msi_lock_descs(dev);
+
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
+ goto fail;
+
+ if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
+ goto fail;
+
+ domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
+ if (!domain)
+ goto fail;
+
+ domain->dev = dev;
+ dev->msi.data->__domains[domid].domain = domain;
+ msi_unlock_descs(dev);
+ return true;
+
+fail:
+ msi_unlock_descs(dev);
+free_fwnode:
+ kfree(fwnode);
+free_bundle:
+ kfree(bundle);
+ return false;
+}
+
+/**
+ * msi_remove_device_irq_domain - Free a device MSI interrupt domain
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ */
+void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
+{
+ struct msi_domain_info *info;
struct irq_domain *domain;
- msi_domain_update_dom_ops(info);
- if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
- msi_domain_update_chip_ops(info);
+ msi_lock_descs(dev);
- domain = irq_domain_create_hierarchy(parent, IRQ_DOMAIN_FLAG_MSI, 0,
- fwnode, &msi_domain_ops, info);
+ domain = msi_get_device_domain(dev, domid);
- if (domain && !domain->name && info->chip)
- domain->name = info->chip->name;
+ if (!domain || !irq_domain_is_msi_device(domain))
+ goto unlock;
- return domain;
+ dev->msi.data->__domains[domid].domain = NULL;
+ info = domain->host_data;
+ irq_domain_remove(domain);
+ kfree(container_of(info, struct msi_domain_template, info));
+
+unlock:
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_match_device_irq_domain - Match a device irq domain against a bus token
+ * @dev: Pointer to the device
+ * @domid: Domain id
+ * @bus_token: Bus token to match against the domain bus token
+ *
+ * Return: True if device domain exists and bus tokens match.
+ */
+bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
+ enum irq_domain_bus_token bus_token)
+{
+ struct msi_domain_info *info;
+ struct irq_domain *domain;
+ bool ret = false;
+
+ msi_lock_descs(dev);
+ domain = msi_get_device_domain(dev, domid);
+ if (domain && irq_domain_is_msi_device(domain)) {
+ info = domain->host_data;
+ ret = info->bus_token == bus_token;
+ }
+ msi_unlock_descs(dev);
+ return ret;
}
int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
@@ -700,13 +1062,8 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- int ret;
-
- ret = ops->msi_check(domain, info, dev);
- if (ret == 0)
- ret = ops->msi_prepare(domain, dev, nvec, arg);
- return ret;
+ return ops->msi_prepare(domain, dev, nvec, arg);
}
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
@@ -714,16 +1071,27 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
+ struct msi_ctrl ctrl = {
+ .domid = MSI_DEFAULT_DOMAIN,
+ .first = virq_base,
+ .last = virq_base + nvec - 1,
+ };
struct msi_desc *desc;
+ struct xarray *xa;
int ret, virq;
+ if (!msi_ctrl_valid(dev, &ctrl))
+ return -EINVAL;
+
msi_lock_descs(dev);
- ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
+ ret = msi_domain_add_simple_msi_descs(dev, &ctrl);
if (ret)
goto unlock;
+ xa = &dev->msi.data->__domains[ctrl.domid].store;
+
for (virq = virq_base; virq < virq_base + nvec; virq++) {
- desc = xa_load(&dev->msi.data->__store, virq);
+ desc = xa_load(xa, virq);
desc->irq = virq;
ops->set_desc(arg, desc);
@@ -739,7 +1107,7 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
fail:
for (--virq; virq >= virq_base; virq--)
irq_domain_free_irqs_common(domain, virq, 1);
- msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
+ msi_domain_free_descs(dev, &ctrl);
unlock:
msi_unlock_descs(dev);
return ret;
@@ -764,6 +1132,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
case DOMAIN_BUS_VMD_MSI:
break;
default:
@@ -789,6 +1159,8 @@ static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
{
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSI:
+ case DOMAIN_BUS_PCI_DEVICE_MSIX:
case DOMAIN_BUS_VMD_MSI:
if (IS_ENABLED(CONFIG_PCI_MSI))
break;
@@ -850,18 +1222,19 @@ static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflag
return 0;
}
-int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+static int __msi_domain_alloc_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
+ unsigned int vflags = 0, allocated = 0;
msi_alloc_info_t arg = { };
- unsigned int vflags = 0;
struct msi_desc *desc;
- int allocated = 0;
+ unsigned long idx;
int i, ret, virq;
- ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
+ ret = msi_domain_prepare_irqs(domain, dev, ctrl->nirqs, &arg);
if (ret)
return ret;
@@ -883,11 +1256,21 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
* MSI affinity setting requires a special quirk (X86) when
* reservation mode is active.
*/
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ if (info->flags & MSI_FLAG_NOMASK_QUIRK)
vflags |= VIRQ_NOMASK_QUIRK;
}
- msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ if (!msi_desc_match(desc, MSI_DESC_NOTASSOCIATED))
+ continue;
+
+ /* This should return -ECONFUSED... */
+ if (WARN_ON_ONCE(allocated >= ctrl->nirqs))
+ return -EINVAL;
+
+ if (ops->prepare_desc)
+ ops->prepare_desc(domain, &arg, desc);
+
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
@@ -913,76 +1296,213 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
return 0;
}
-static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info,
- struct device *dev,
- unsigned int num_descs)
+static int msi_domain_alloc_simple_msi_descs(struct device *dev,
+ struct msi_domain_info *info,
+ struct msi_ctrl *ctrl)
{
if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
return 0;
- return msi_add_simple_msi_descs(dev, 0, num_descs);
+ return msi_domain_add_simple_msi_descs(dev, ctrl);
+}
+
+static int __msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+ int ret;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return -EINVAL;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return -ENODEV;
+
+ info = domain->host_data;
+
+ ret = msi_domain_alloc_simple_msi_descs(dev, info, ctrl);
+ if (ret)
+ return ret;
+
+ ops = info->ops;
+ if (ops->domain_alloc_irqs)
+ return ops->domain_alloc_irqs(domain, dev, ctrl->nirqs);
+
+ return __msi_domain_alloc_irqs(dev, domain, ctrl);
+}
+
+static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
+{
+ int ret = __msi_domain_alloc_locked(dev, ctrl);
+
+ if (ret)
+ msi_domain_free_locked(dev, ctrl);
+ return ret;
}
/**
- * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
+ * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
- * @nvec: The number of interrupts to allocate
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
- * pair. Use this for MSI irqdomains which implement their own vector
+ * pair. Use this for MSI irqdomains which implement their own descriptor
* allocation/free.
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
- int nvec)
+int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
- int ret;
-
- lockdep_assert_held(&dev->msi.data->mutex);
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ .nirqs = last + 1 - first,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
- ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
- if (ret)
- return ret;
+/**
+ * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to allocate (inclusive)
+ * @last: Last index to allocate (inclusive)
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ int ret;
- ret = ops->domain_alloc_irqs(domain, dev, nvec);
- if (ret)
- msi_domain_free_irqs_descs_locked(domain, dev);
+ msi_lock_descs(dev);
+ ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
return ret;
}
/**
- * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
+ * msi_domain_alloc_irqs_all_locked - Allocate all interrupts from a MSI interrupt domain
+ *
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
- * @nvec: The number of interrupts to allocate
+ * @domid: Id of the interrupt domain to operate on
+ * @nirqs: The number of interrupts to allocate
+ *
+ * This function scans all MSI descriptors of the MSI domain and allocates interrupts
+ * for all unassigned ones. That function is to be used for MSI domain usage where
+ * the descriptor allocation is handled at the call site, e.g. PCI/MSI[X].
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec)
+int msi_domain_alloc_irqs_all_locked(struct device *dev, unsigned int domid, int nirqs)
{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = 0,
+ .last = msi_domain_get_hwsize(dev, domid) - 1,
+ .nirqs = nirqs,
+ };
+
+ return msi_domain_alloc_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_alloc_irq_at - Allocate an interrupt from a MSI interrupt domain at
+ * a given index - or at the next free index
+ *
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @domid: Id of the interrupt domain to operate on
+ * @index: Index for allocation. If @index == %MSI_ANY_INDEX the allocation
+ * uses the next free index.
+ * @affdesc: Optional pointer to an interrupt affinity descriptor structure
+ * @icookie: Optional pointer to a domain specific per instance cookie. If
+ * non-NULL the content of the cookie is stored in msi_desc::data.
+ * Must be NULL for MSI-X allocations
+ *
+ * This requires a MSI interrupt domain which lets the core code manage the
+ * MSI descriptors.
+ *
+ * Return: struct msi_map
+ *
+ * On success msi_map::index contains the allocated index number and
+ * msi_map::virq the corresponding Linux interrupt number
+ *
+ * On failure msi_map::index contains the error code and msi_map::virq
+ * is %0.
+ */
+struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, unsigned int index,
+ const struct irq_affinity_desc *affdesc,
+ union msi_instance_cookie *icookie)
+{
+ struct msi_ctrl ctrl = { .domid = domid, .nirqs = 1, };
+ struct irq_domain *domain;
+ struct msi_map map = { };
+ struct msi_desc *desc;
int ret;
msi_lock_descs(dev);
- ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
+ domain = msi_get_device_domain(dev, domid);
+ if (!domain) {
+ map.index = -ENODEV;
+ goto unlock;
+ }
+
+ desc = msi_alloc_desc(dev, 1, affdesc);
+ if (!desc) {
+ map.index = -ENOMEM;
+ goto unlock;
+ }
+
+ if (icookie)
+ desc->data.icookie = *icookie;
+
+ ret = msi_insert_desc(dev, desc, domid, index);
+ if (ret) {
+ map.index = ret;
+ goto unlock;
+ }
+
+ ctrl.first = ctrl.last = desc->msi_index;
+
+ ret = __msi_domain_alloc_irqs(dev, domain, &ctrl);
+ if (ret) {
+ map.index = ret;
+ msi_domain_free_locked(dev, &ctrl);
+ } else {
+ map.index = desc->msi_index;
+ map.virq = desc->irq;
+ }
+unlock:
msi_unlock_descs(dev);
- return ret;
+ return map;
}
-void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+static void __msi_domain_free_irqs(struct device *dev, struct irq_domain *domain,
+ struct msi_ctrl *ctrl)
{
+ struct xarray *xa = &dev->msi.data->__domains[ctrl->domid].store;
struct msi_domain_info *info = domain->host_data;
struct irq_data *irqd;
struct msi_desc *desc;
+ unsigned long idx;
int i;
- /* Only handle MSI entries which have an interrupt associated */
- msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ xa_for_each_range(xa, idx, desc, ctrl->first, ctrl->last) {
+ /* Only handle MSI entries which have an interrupt associated */
+ if (!msi_desc_match(desc, MSI_DESC_ASSOCIATED))
+ continue;
+
/* Make sure all interrupts are deactivated */
for (i = 0; i < desc->nvec_used; i++) {
irqd = irq_domain_get_irq_data(domain, desc->irq + i);
@@ -997,44 +1517,99 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
}
}
-static void msi_domain_free_msi_descs(struct msi_domain_info *info,
- struct device *dev)
+static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
{
+ struct msi_domain_info *info;
+ struct msi_domain_ops *ops;
+ struct irq_domain *domain;
+
+ if (!msi_ctrl_valid(dev, ctrl))
+ return;
+
+ domain = msi_get_device_domain(dev, ctrl->domid);
+ if (!domain)
+ return;
+
+ info = domain->host_data;
+ ops = info->ops;
+
+ if (ops->domain_free_irqs)
+ ops->domain_free_irqs(domain, dev);
+ else
+ __msi_domain_free_irqs(dev, domain, ctrl);
+
+ if (ops->msi_post_free)
+ ops->msi_post_free(domain, dev);
+
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
- msi_free_msi_descs(dev);
+ msi_domain_free_descs(dev, ctrl);
}
/**
- * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_free_irqs_range_locked - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev with msi_lock held
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ struct msi_ctrl ctrl = {
+ .domid = domid,
+ .first = first,
+ .last = last,
+ };
+ msi_domain_free_locked(dev, &ctrl);
+}
+
+/**
+ * msi_domain_free_irqs_range - Free a range of interrupts from a MSI interrupt domain
+ * associated to @dev
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: Id of the interrupt domain to operate on
+ * @first: First index to free (inclusive)
+ * @last: Last index to free (inclusive)
+ */
+void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_range_locked(dev, domid, first, last);
+ msi_unlock_descs(dev);
+}
+
+/**
+ * msi_domain_free_irqs_all_locked - Free all interrupts from a MSI interrupt domain
+ * associated to a device
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ * @domid: The id of the domain to operate on
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
* pair. Use this for MSI irqdomains which implement their own vector
* allocation.
*/
-void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
-
- lockdep_assert_held(&dev->msi.data->mutex);
-
- ops->domain_free_irqs(domain, dev);
- msi_domain_free_msi_descs(info, dev);
+ msi_domain_free_irqs_range_locked(dev, domid, 0,
+ msi_domain_get_hwsize(dev, domid) - 1);
}
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_free_irqs_all - Free all interrupts from a MSI interrupt domain
+ * associated to a device
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are freed
+ * @domid: The id of the domain to operate on
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
{
msi_lock_descs(dev);
- msi_domain_free_irqs_descs_locked(domain, dev);
+ msi_domain_free_irqs_all_locked(dev, domid);
msi_unlock_descs(dev);
}
@@ -1048,5 +1623,3 @@ struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain)
{
return (struct msi_domain_info *)domain->host_data;
}
-
-#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 714ac4c3b556..d9c822bbffb8 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -113,11 +113,40 @@ int static_key_count(struct static_key *key)
}
EXPORT_SYMBOL_GPL(static_key_count);
-void static_key_slow_inc_cpuslocked(struct static_key *key)
+/*
+ * static_key_fast_inc_not_disabled - adds a user for a static key
+ * @key: static key that must be already enabled
+ *
+ * The caller must make sure that the static key can't get disabled while
+ * in this function. It doesn't patch jump labels, only adds a user to
+ * an already enabled static key.
+ *
+ * Returns true if the increment was done. Unlike refcount_t the ref counter
+ * is not saturated, but will fail to increment on overflow.
+ */
+bool static_key_fast_inc_not_disabled(struct static_key *key)
{
- int v, v1;
+ int v;
STATIC_KEY_CHECK_USE(key);
+ /*
+ * Negative key->enabled has a special meaning: it sends
+ * static_key_slow_inc() down the slow path, and it is non-zero
+ * so it counts as "enabled" in jump_label_update(). Note that
+ * atomic_inc_unless_negative() checks >= 0, so roll our own.
+ */
+ v = atomic_read(&key->enabled);
+ do {
+ if (v <= 0 || (v + 1) < 0)
+ return false;
+ } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled);
+
+bool static_key_slow_inc_cpuslocked(struct static_key *key)
+{
lockdep_assert_cpus_held();
/*
@@ -126,17 +155,9 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
* jump_label_update() process. At the same time, however,
* the jump_label_update() call below wants to see
* static_key_enabled(&key) for jumps to be updated properly.
- *
- * So give a special meaning to negative key->enabled: it sends
- * static_key_slow_inc() down the slow path, and it is non-zero
- * so it counts as "enabled" in jump_label_update(). Note that
- * atomic_inc_unless_negative() checks >= 0, so roll our own.
*/
- for (v = atomic_read(&key->enabled); v > 0; v = v1) {
- v1 = atomic_cmpxchg(&key->enabled, v, v + 1);
- if (likely(v1 == v))
- return;
- }
+ if (static_key_fast_inc_not_disabled(key))
+ return true;
jump_label_lock();
if (atomic_read(&key->enabled) == 0) {
@@ -148,16 +169,23 @@ void static_key_slow_inc_cpuslocked(struct static_key *key)
*/
atomic_set_release(&key->enabled, 1);
} else {
- atomic_inc(&key->enabled);
+ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) {
+ jump_label_unlock();
+ return false;
+ }
}
jump_label_unlock();
+ return true;
}
-void static_key_slow_inc(struct static_key *key)
+bool static_key_slow_inc(struct static_key *key)
{
+ bool ret;
+
cpus_read_lock();
- static_key_slow_inc_cpuslocked(key);
+ ret = static_key_slow_inc_cpuslocked(key);
cpus_read_unlock();
+ return ret;
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 60c20f301a6b..83f499182c9a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -146,7 +146,7 @@ static unsigned int get_symbol_offset(unsigned long pos)
return name - kallsyms_names;
}
-static unsigned long kallsyms_sym_address(int idx)
+unsigned long kallsyms_sym_address(int idx)
{
if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE))
return kallsyms_addresses[idx];
@@ -187,26 +187,100 @@ static bool cleanup_symbol_name(char *s)
return false;
}
+static int compare_symbol_name(const char *name, char *namebuf)
+{
+ int ret;
+
+ ret = strcmp(name, namebuf);
+ if (!ret)
+ return ret;
+
+ if (cleanup_symbol_name(namebuf) && !strcmp(name, namebuf))
+ return 0;
+
+ return ret;
+}
+
+static unsigned int get_symbol_seq(int index)
+{
+ unsigned int i, seq = 0;
+
+ for (i = 0; i < 3; i++)
+ seq = (seq << 8) | kallsyms_seqs_of_names[3 * index + i];
+
+ return seq;
+}
+
+static int kallsyms_lookup_names(const char *name,
+ unsigned int *start,
+ unsigned int *end)
+{
+ int ret;
+ int low, mid, high;
+ unsigned int seq, off;
+ char namebuf[KSYM_NAME_LEN];
+
+ low = 0;
+ high = kallsyms_num_syms - 1;
+
+ while (low <= high) {
+ mid = low + (high - low) / 2;
+ seq = get_symbol_seq(mid);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ ret = compare_symbol_name(name, namebuf);
+ if (ret > 0)
+ low = mid + 1;
+ else if (ret < 0)
+ high = mid - 1;
+ else
+ break;
+ }
+
+ if (low > high)
+ return -ESRCH;
+
+ low = mid;
+ while (low) {
+ seq = get_symbol_seq(low - 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ low--;
+ }
+ *start = low;
+
+ if (end) {
+ high = mid;
+ while (high < kallsyms_num_syms - 1) {
+ seq = get_symbol_seq(high + 1);
+ off = get_symbol_offset(seq);
+ kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
+ if (compare_symbol_name(name, namebuf))
+ break;
+ high++;
+ }
+ *end = high;
+ }
+
+ return 0;
+}
+
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
{
- char namebuf[KSYM_NAME_LEN];
- unsigned long i;
- unsigned int off;
+ int ret;
+ unsigned int i;
/* Skip the search for empty string. */
if (!*name)
return 0;
- for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
-
- if (strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
+ ret = kallsyms_lookup_names(name, &i, NULL);
+ if (!ret)
+ return kallsyms_sym_address(get_symbol_seq(i));
- if (cleanup_symbol_name(namebuf) && strcmp(namebuf, name) == 0)
- return kallsyms_sym_address(i);
- }
return module_kallsyms_lookup_name(name);
}
@@ -233,6 +307,24 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
return 0;
}
+int kallsyms_on_each_match_symbol(int (*fn)(void *, unsigned long),
+ const char *name, void *data)
+{
+ int ret;
+ unsigned int i, start, end;
+
+ ret = kallsyms_lookup_names(name, &start, &end);
+ if (ret)
+ return 0;
+
+ for (i = start; !ret && i <= end; i++) {
+ ret = fn(data, kallsyms_sym_address(get_symbol_seq(i)));
+ cond_resched();
+ }
+
+ return ret;
+}
+
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset)
diff --git a/kernel/kallsyms_internal.h b/kernel/kallsyms_internal.h
index 2d0c6f2f0243..27fabdcc40f5 100644
--- a/kernel/kallsyms_internal.h
+++ b/kernel/kallsyms_internal.h
@@ -26,5 +26,6 @@ extern const char kallsyms_token_table[] __weak;
extern const u16 kallsyms_token_index[] __weak;
extern const unsigned int kallsyms_markers[] __weak;
+extern const u8 kallsyms_seqs_of_names[] __weak;
#endif // LINUX_KALLSYMS_INTERNAL_H_
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
new file mode 100644
index 000000000000..f35d9cc1aab1
--- /dev/null
+++ b/kernel/kallsyms_selftest.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test the function and performance of kallsyms
+ *
+ * Copyright (C) Huawei Technologies Co., Ltd., 2022
+ *
+ * Authors: Zhen Lei <thunder.leizhen@huawei.com> Huawei
+ */
+
+#define pr_fmt(fmt) "kallsyms_selftest: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/random.h>
+#include <linux/sched/clock.h>
+#include <linux/kthread.h>
+#include <linux/vmalloc.h>
+
+#include "kallsyms_internal.h"
+#include "kallsyms_selftest.h"
+
+
+#define MAX_NUM_OF_RECORDS 64
+
+struct test_stat {
+ int min;
+ int max;
+ int save_cnt;
+ int real_cnt;
+ int perf;
+ u64 sum;
+ char *name;
+ unsigned long addr;
+ unsigned long addrs[MAX_NUM_OF_RECORDS];
+};
+
+struct test_item {
+ char *name;
+ unsigned long addr;
+};
+
+#define ITEM_FUNC(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)s, \
+ }
+
+#define ITEM_DATA(s) \
+ { \
+ .name = #s, \
+ .addr = (unsigned long)&s, \
+ }
+
+
+static int kallsyms_test_var_bss_static;
+static int kallsyms_test_var_data_static = 1;
+int kallsyms_test_var_bss;
+int kallsyms_test_var_data = 1;
+
+static int kallsyms_test_func_static(void)
+{
+ kallsyms_test_var_bss_static++;
+ kallsyms_test_var_data_static++;
+
+ return 0;
+}
+
+int kallsyms_test_func(void)
+{
+ return kallsyms_test_func_static();
+}
+
+__weak int kallsyms_test_func_weak(void)
+{
+ kallsyms_test_var_bss++;
+ kallsyms_test_var_data++;
+ return 0;
+}
+
+static struct test_item test_items[] = {
+ ITEM_FUNC(kallsyms_test_func_static),
+ ITEM_FUNC(kallsyms_test_func),
+ ITEM_FUNC(kallsyms_test_func_weak),
+ ITEM_FUNC(vmalloc),
+ ITEM_FUNC(vfree),
+#ifdef CONFIG_KALLSYMS_ALL
+ ITEM_DATA(kallsyms_test_var_bss_static),
+ ITEM_DATA(kallsyms_test_var_data_static),
+ ITEM_DATA(kallsyms_test_var_bss),
+ ITEM_DATA(kallsyms_test_var_data),
+ ITEM_DATA(vmap_area_list),
+#endif
+};
+
+static char stub_name[KSYM_NAME_LEN];
+
+static int stat_symbol_len(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+ *(u32 *)data += strlen(name);
+
+ return 0;
+}
+
+static void test_kallsyms_compression_ratio(void)
+{
+ u32 pos, off, len, num;
+ u32 ratio, total_size, total_len = 0;
+
+ kallsyms_on_each_symbol(stat_symbol_len, &total_len);
+
+ /*
+ * A symbol name cannot start with a number. This stub name helps us
+ * traverse the entire symbol table without finding a match. It's used
+ * for subsequent performance tests, and its length is the average
+ * length of all symbol names.
+ */
+ memset(stub_name, '4', sizeof(stub_name));
+ pos = total_len / kallsyms_num_syms;
+ stub_name[pos] = 0;
+
+ pos = 0;
+ num = 0;
+ off = 0;
+ while (pos < kallsyms_num_syms) {
+ len = kallsyms_names[off];
+ num++;
+ off++;
+ pos++;
+ if ((len & 0x80) != 0) {
+ len = (len & 0x7f) | (kallsyms_names[off] << 7);
+ num++;
+ off++;
+ }
+ off += len;
+ }
+
+ /*
+ * 1. The length fields is not counted
+ * 2. The memory occupied by array kallsyms_token_table[] and
+ * kallsyms_token_index[] needs to be counted.
+ */
+ total_size = off - num;
+ pos = kallsyms_token_index[0xff];
+ total_size += pos + strlen(&kallsyms_token_table[pos]) + 1;
+ total_size += 0x100 * sizeof(u16);
+
+ pr_info(" ---------------------------------------------------------\n");
+ pr_info("| nr_symbols | compressed size | original size | ratio(%%) |\n");
+ pr_info("|---------------------------------------------------------|\n");
+ ratio = (u32)div_u64(10000ULL * total_size, total_len);
+ pr_info("| %10d | %10d | %10d | %2d.%-2d |\n",
+ kallsyms_num_syms, total_size, total_len, ratio / 100, ratio % 100);
+ pr_info(" ---------------------------------------------------------\n");
+}
+
+static int lookup_name(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+ u64 t0, t1, t;
+ unsigned long flags;
+ struct test_stat *stat = (struct test_stat *)data;
+
+ local_irq_save(flags);
+ t0 = sched_clock();
+ (void)kallsyms_lookup_name(name);
+ t1 = sched_clock();
+ local_irq_restore(flags);
+
+ t = t1 - t0;
+ if (t < stat->min)
+ stat->min = t;
+
+ if (t > stat->max)
+ stat->max = t;
+
+ stat->real_cnt++;
+ stat->sum += t;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_lookup_name(void)
+{
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.min = INT_MAX;
+ kallsyms_on_each_symbol(lookup_name, &stat);
+ pr_info("kallsyms_lookup_name() looked up %d symbols\n", stat.real_cnt);
+ pr_info("The time spent on each symbol is (ns): min=%d, max=%d, avg=%lld\n",
+ stat.min, stat.max, div_u64(stat.sum, stat.real_cnt));
+}
+
+static bool match_cleanup_name(const char *s, const char *name)
+{
+ char *p;
+ int len;
+
+ if (!IS_ENABLED(CONFIG_LTO_CLANG))
+ return false;
+
+ p = strchr(s, '.');
+ if (!p)
+ return false;
+
+ len = strlen(name);
+ if (p - s != len)
+ return false;
+
+ return !strncmp(s, name, len);
+}
+
+static int find_symbol(void *data, const char *name, struct module *mod, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ if (strcmp(name, stat->name) == 0 ||
+ (!stat->perf && match_cleanup_name(name, stat->name))) {
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+ }
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_symbol(void)
+{
+ u64 t0, t1;
+ unsigned long flags;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ stat.perf = 1;
+ local_irq_save(flags);
+ t0 = sched_clock();
+ kallsyms_on_each_symbol(find_symbol, &stat);
+ t1 = sched_clock();
+ local_irq_restore(flags);
+ pr_info("kallsyms_on_each_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int match_symbol(void *data, unsigned long addr)
+{
+ struct test_stat *stat = (struct test_stat *)data;
+
+ stat->real_cnt++;
+ stat->addr = addr;
+
+ if (stat->save_cnt < MAX_NUM_OF_RECORDS) {
+ stat->addrs[stat->save_cnt] = addr;
+ stat->save_cnt++;
+ }
+
+ if (stat->real_cnt == stat->max)
+ return 1;
+
+ return 0;
+}
+
+static void test_perf_kallsyms_on_each_match_symbol(void)
+{
+ u64 t0, t1;
+ unsigned long flags;
+ struct test_stat stat;
+
+ memset(&stat, 0, sizeof(stat));
+ stat.max = INT_MAX;
+ stat.name = stub_name;
+ local_irq_save(flags);
+ t0 = sched_clock();
+ kallsyms_on_each_match_symbol(match_symbol, stat.name, &stat);
+ t1 = sched_clock();
+ local_irq_restore(flags);
+ pr_info("kallsyms_on_each_match_symbol() traverse all: %lld ns\n", t1 - t0);
+}
+
+static int test_kallsyms_basic_function(void)
+{
+ int i, j, ret;
+ int next = 0, nr_failed = 0;
+ char *prefix;
+ unsigned short rand;
+ unsigned long addr, lookup_addr;
+ char namebuf[KSYM_NAME_LEN];
+ struct test_stat *stat, *stat2;
+
+ stat = kmalloc(sizeof(*stat) * 2, GFP_KERNEL);
+ if (!stat)
+ return -ENOMEM;
+ stat2 = stat + 1;
+
+ prefix = "kallsyms_lookup_name() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ addr = kallsyms_lookup_name(test_items[i].name);
+ if (addr != test_items[i].addr) {
+ nr_failed++;
+ pr_info("%s %s failed: addr=%lx, expect %lx\n",
+ prefix, test_items[i].name, addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_symbol(find_symbol, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ prefix = "kallsyms_on_each_match_symbol() for";
+ for (i = 0; i < ARRAY_SIZE(test_items); i++) {
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ stat->name = test_items[i].name;
+ kallsyms_on_each_match_symbol(match_symbol, test_items[i].name, stat);
+ if (stat->addr != test_items[i].addr || stat->real_cnt != 1) {
+ nr_failed++;
+ pr_info("%s %s failed: count=%d, addr=%lx, expect %lx\n",
+ prefix, test_items[i].name,
+ stat->real_cnt, stat->addr, test_items[i].addr);
+ }
+ }
+
+ if (nr_failed) {
+ kfree(stat);
+ return -ESRCH;
+ }
+
+ for (i = 0; i < kallsyms_num_syms; i++) {
+ addr = kallsyms_sym_address(i);
+ if (!is_ksym_addr(addr))
+ continue;
+
+ ret = lookup_symbol_name(addr, namebuf);
+ if (unlikely(ret)) {
+ namebuf[0] = 0;
+ goto failed;
+ }
+
+ /*
+ * The first '.' may be the initial letter, in which case the
+ * entire symbol name will be truncated to an empty string in
+ * cleanup_symbol_name(). Do not test these symbols.
+ *
+ * For example:
+ * cat /proc/kallsyms | awk '{print $3}' | grep -E "^\." | head
+ * .E_read_words
+ * .E_leading_bytes
+ * .E_trailing_bytes
+ * .E_write_words
+ * .E_copy
+ * .str.292.llvm.12122243386960820698
+ * .str.24.llvm.12122243386960820698
+ * .str.29.llvm.12122243386960820698
+ * .str.75.llvm.12122243386960820698
+ * .str.99.llvm.12122243386960820698
+ */
+ if (IS_ENABLED(CONFIG_LTO_CLANG) && !namebuf[0])
+ continue;
+
+ lookup_addr = kallsyms_lookup_name(namebuf);
+
+ memset(stat, 0, sizeof(*stat));
+ stat->max = INT_MAX;
+ kallsyms_on_each_match_symbol(match_symbol, namebuf, stat);
+
+ /*
+ * kallsyms_on_each_symbol() is too slow, randomly select some
+ * symbols for test.
+ */
+ if (i >= next) {
+ memset(stat2, 0, sizeof(*stat2));
+ stat2->max = INT_MAX;
+ stat2->name = namebuf;
+ kallsyms_on_each_symbol(find_symbol, stat2);
+
+ /*
+ * kallsyms_on_each_symbol() and kallsyms_on_each_match_symbol()
+ * need to get the same traversal result.
+ */
+ if (stat->addr != stat2->addr ||
+ stat->real_cnt != stat2->real_cnt ||
+ memcmp(stat->addrs, stat2->addrs,
+ stat->save_cnt * sizeof(stat->addrs[0])))
+ goto failed;
+
+ /*
+ * The average of random increments is 128, that is, one of
+ * them is tested every 128 symbols.
+ */
+ get_random_bytes(&rand, sizeof(rand));
+ next = i + (rand & 0xff) + 1;
+ }
+
+ /* Need to be found at least once */
+ if (!stat->real_cnt)
+ goto failed;
+
+ /*
+ * kallsyms_lookup_name() returns the address of the first
+ * symbol found and cannot be NULL.
+ */
+ if (!lookup_addr || lookup_addr != stat->addrs[0])
+ goto failed;
+
+ /*
+ * If the addresses of all matching symbols are recorded, the
+ * target address needs to be exist.
+ */
+ if (stat->real_cnt <= MAX_NUM_OF_RECORDS) {
+ for (j = 0; j < stat->save_cnt; j++) {
+ if (stat->addrs[j] == addr)
+ break;
+ }
+
+ if (j == stat->save_cnt)
+ goto failed;
+ }
+ }
+
+ kfree(stat);
+
+ return 0;
+
+failed:
+ pr_info("Test for %dth symbol failed: (%s) addr=%lx", i, namebuf, addr);
+ kfree(stat);
+ return -ESRCH;
+}
+
+static int test_entry(void *p)
+{
+ int ret;
+
+ do {
+ schedule_timeout(5 * HZ);
+ } while (system_state != SYSTEM_RUNNING);
+
+ pr_info("start\n");
+ ret = test_kallsyms_basic_function();
+ if (ret) {
+ pr_info("abort\n");
+ return 0;
+ }
+
+ test_kallsyms_compression_ratio();
+ test_perf_kallsyms_lookup_name();
+ test_perf_kallsyms_on_each_symbol();
+ test_perf_kallsyms_on_each_match_symbol();
+ pr_info("finish\n");
+
+ return 0;
+}
+
+static int __init kallsyms_test_init(void)
+{
+ struct task_struct *t;
+
+ t = kthread_create(test_entry, NULL, "kallsyms_test");
+ if (IS_ERR(t)) {
+ pr_info("Create kallsyms selftest task failed\n");
+ return PTR_ERR(t);
+ }
+ kthread_bind(t, 0);
+ wake_up_process(t);
+
+ return 0;
+}
+late_initcall(kallsyms_test_init);
diff --git a/kernel/kallsyms_selftest.h b/kernel/kallsyms_selftest.h
new file mode 100644
index 000000000000..c0ca548e2a22
--- /dev/null
+++ b/kernel/kallsyms_selftest.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef LINUX_KALLSYMS_SELFTEST_H_
+#define LINUX_KALLSYMS_SELFTEST_H_
+
+#include <linux/types.h>
+
+extern int kallsyms_test_var_bss;
+extern int kallsyms_test_var_data;
+
+extern int kallsyms_test_func(void);
+extern int kallsyms_test_func_weak(void);
+
+#endif // LINUX_KALLSYMS_SELFTEST_H_
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 4f35d1bced6a..8cf70f068d92 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -17,4 +17,5 @@ KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+CFLAGS_kcsan_test.o += $(DISABLE_STRUCTLEAK_PLUGIN)
obj-$(CONFIG_KCSAN_KUNIT_TEST) += kcsan_test.o
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index fe12dfe254ec..54d077e1a2dc 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -14,10 +14,12 @@
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/moduleparam.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/sched.h>
+#include <linux/string.h>
#include <linux/uaccess.h>
#include "encoding.h"
@@ -1308,3 +1310,51 @@ noinline void __tsan_atomic_signal_fence(int memorder)
}
}
EXPORT_SYMBOL(__tsan_atomic_signal_fence);
+
+#ifdef __HAVE_ARCH_MEMSET
+void *__tsan_memset(void *s, int c, size_t count);
+noinline void *__tsan_memset(void *s, int c, size_t count)
+{
+ /*
+ * Instead of not setting up watchpoints where accessed size is greater
+ * than MAX_ENCODABLE_SIZE, truncate checked size to MAX_ENCODABLE_SIZE.
+ */
+ size_t check_len = min_t(size_t, count, MAX_ENCODABLE_SIZE);
+
+ check_access(s, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ return memset(s, c, count);
+}
+#else
+void *__tsan_memset(void *s, int c, size_t count) __alias(memset);
+#endif
+EXPORT_SYMBOL(__tsan_memset);
+
+#ifdef __HAVE_ARCH_MEMMOVE
+void *__tsan_memmove(void *dst, const void *src, size_t len);
+noinline void *__tsan_memmove(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memmove(dst, src, len);
+}
+#else
+void *__tsan_memmove(void *dst, const void *src, size_t len) __alias(memmove);
+#endif
+EXPORT_SYMBOL(__tsan_memmove);
+
+#ifdef __HAVE_ARCH_MEMCPY
+void *__tsan_memcpy(void *dst, const void *src, size_t len);
+noinline void *__tsan_memcpy(void *dst, const void *src, size_t len)
+{
+ size_t check_len = min_t(size_t, len, MAX_ENCODABLE_SIZE);
+
+ check_access(dst, check_len, KCSAN_ACCESS_WRITE, _RET_IP_);
+ check_access(src, check_len, 0, _RET_IP_);
+ return memcpy(dst, src, len);
+}
+#else
+void *__tsan_memcpy(void *dst, const void *src, size_t len) __alias(memcpy);
+#endif
+EXPORT_SYMBOL(__tsan_memcpy);
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 67794404042a..e95ce7d7a76e 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -492,8 +492,7 @@ static void print_report(enum kcsan_value_change value_change,
dump_stack_print_info(KERN_DEFAULT);
pr_err("==================================================================\n");
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ check_panic_on_warn("KCSAN");
}
static void release_report(unsigned long *flags, struct other_info *other_info)
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 00cdf8fa5693..8679322450f2 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -22,13 +22,6 @@
#define ITERS_PER_TEST 2000
-/* Test requirements. */
-static bool __init test_requires(void)
-{
- /* random should be initialized for the below tests */
- return get_random_u32() + get_random_u32() != 0;
-}
-
/*
* Test watchpoint encode and decode: check that encoding some access's info,
* and then subsequent decode preserves the access's info.
@@ -38,8 +31,8 @@ static bool __init test_encode_decode(void)
int i;
for (i = 0; i < ITERS_PER_TEST; ++i) {
- size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
- bool is_write = !!prandom_u32_max(2);
+ size_t size = get_random_u32_inclusive(1, MAX_ENCODABLE_SIZE);
+ bool is_write = !!get_random_u32_below(2);
unsigned long verif_masked_addr;
long encoded_watchpoint;
bool verif_is_write;
@@ -259,7 +252,6 @@ static int __init kcsan_selftest(void)
pr_err("selftest: " #do_test " failed"); \
} while (0)
- RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
RUN_TEST(test_barrier);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index ca2743f9c634..969e8f52f7da 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -561,23 +561,17 @@ static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
static int kimage_set_destination(struct kimage *image,
unsigned long destination)
{
- int result;
-
destination &= PAGE_MASK;
- result = kimage_add_entry(image, destination | IND_DESTINATION);
- return result;
+ return kimage_add_entry(image, destination | IND_DESTINATION);
}
static int kimage_add_page(struct kimage *image, unsigned long page)
{
- int result;
-
page &= PAGE_MASK;
- result = kimage_add_entry(image, page | IND_SOURCE);
- return result;
+ return kimage_add_entry(image, page | IND_SOURCE);
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 45637511e0de..dd5983010b7b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1141,7 +1141,7 @@ int crash_exclude_mem_range(struct crash_mem *mem,
{
int i, j;
unsigned long long start, end, p_start, p_end;
- struct crash_mem_range temp_range = {0, 0};
+ struct range temp_range = {0, 0};
for (i = 0; i < mem->nr_ranges; i++) {
start = mem->ranges[i].start;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index cd9f5a66a690..1c18ecf9f98b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1766,7 +1766,13 @@ static int __unregister_kprobe_top(struct kprobe *p)
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
- ap->post_handler = NULL;
+ /*
+ * For the kprobe-on-ftrace case, we keep the
+ * post_handler setting to identify this aggrprobe
+ * armed with kprobe_ipmodify_ops.
+ */
+ if (!kprobe_ftrace(ap))
+ ap->post_handler = NULL;
}
noclean:
/*
@@ -2207,13 +2213,9 @@ int register_kretprobe(struct kretprobe *rp)
rp->kp.post_handler = NULL;
/* Pre-allocate memory for max kretprobe instances */
- if (rp->maxactive <= 0) {
-#ifdef CONFIG_PREEMPTION
+ if (rp->maxactive <= 0)
rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
-#else
- rp->maxactive = num_possible_cpus();
-#endif
- }
+
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
if (!rp->rh)
@@ -2358,6 +2360,14 @@ static void kill_kprobe(struct kprobe *p)
lockdep_assert_held(&kprobe_mutex);
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace, because ftrace framework is still available at
+ * 'MODULE_STATE_GOING' notification.
+ */
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
+ disarm_kprobe_ftrace(p);
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2374,14 +2384,6 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
-
- /*
- * The module is going away. We should disarm the kprobe which
- * is using ftrace, because ftrace framework is still available at
- * 'MODULE_STATE_GOING' notification.
- */
- if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
- disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 65dba9076f31..2df00b789b90 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -6,6 +6,7 @@
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
*/
+#include <asm/byteorder.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/sysfs.h>
@@ -20,6 +21,14 @@
#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */
+#if defined(__LITTLE_ENDIAN)
+#define CPU_BYTEORDER_STRING "little"
+#elif defined(__BIG_ENDIAN)
+#define CPU_BYTEORDER_STRING "big"
+#else
+#error Unknown byteorder
+#endif
+
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -34,6 +43,14 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj,
}
KERNEL_ATTR_RO(uevent_seqnum);
+/* cpu byteorder */
+static ssize_t cpu_byteorder_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n", CPU_BYTEORDER_STRING);
+}
+KERNEL_ATTR_RO(cpu_byteorder);
+
#ifdef CONFIG_UEVENT_HELPER
/* uevent helper program, used during early boot */
static ssize_t uevent_helper_show(struct kobject *kobj,
@@ -215,6 +232,7 @@ EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
&fscaps_attr.attr,
&uevent_seqnum_attr.attr,
+ &cpu_byteorder_attr.attr,
#ifdef CONFIG_UEVENT_HELPER
&uevent_helper_attr.attr,
#endif
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 9ada0bc5247b..201f0c0482fb 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -125,20 +125,10 @@ struct klp_find_arg {
unsigned long pos;
};
-static int klp_find_callback(void *data, const char *name,
- struct module *mod, unsigned long addr)
+static int klp_match_callback(void *data, unsigned long addr)
{
struct klp_find_arg *args = data;
- if ((mod && !args->objname) || (!mod && args->objname))
- return 0;
-
- if (strcmp(args->name, name))
- return 0;
-
- if (args->objname && strcmp(args->objname, mod->name))
- return 0;
-
args->addr = addr;
args->count++;
@@ -153,6 +143,23 @@ static int klp_find_callback(void *data, const char *name,
return 0;
}
+static int klp_find_callback(void *data, const char *name,
+ struct module *mod, unsigned long addr)
+{
+ struct klp_find_arg *args = data;
+
+ if ((mod && !args->objname) || (!mod && args->objname))
+ return 0;
+
+ if (strcmp(args->name, name))
+ return 0;
+
+ if (args->objname && strcmp(args->objname, mod->name))
+ return 0;
+
+ return klp_match_callback(data, addr);
+}
+
static int klp_find_object_symbol(const char *objname, const char *name,
unsigned long sympos, unsigned long *addr)
{
@@ -167,7 +174,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
if (objname)
module_kallsyms_on_each_symbol(klp_find_callback, &args);
else
- kallsyms_on_each_symbol(klp_find_callback, &args);
+ kallsyms_on_each_match_symbol(klp_match_callback, name, &args);
/*
* Ensure an address was found. If sympos is 0, ensure symbol is unique;
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 4c4f5a776d80..4152c71507e2 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -118,7 +118,7 @@ static void notrace klp_ftrace_handler(unsigned long ip,
if (func->nop)
goto unlock;
- ftrace_instruction_pointer_set(fregs, (unsigned long)func->new_func);
+ ftrace_regs_set_instruction_pointer(fregs, (unsigned long)func->new_func);
unlock:
ftrace_test_recursion_unlock(bit);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 30187b1d8275..f1b25ec581e0 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -196,36 +196,36 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
struct klp_ops *ops;
int i;
- for (i = 0; i < nr_entries; i++) {
- address = entries[i];
+ if (klp_target_state == KLP_UNPATCHED) {
+ /*
+ * Check for the to-be-unpatched function
+ * (the func itself).
+ */
+ func_addr = (unsigned long)func->new_func;
+ func_size = func->new_size;
+ } else {
+ /*
+ * Check for the to-be-patched function
+ * (the previous func).
+ */
+ ops = klp_find_ops(func->old_func);
- if (klp_target_state == KLP_UNPATCHED) {
- /*
- * Check for the to-be-unpatched function
- * (the func itself).
- */
- func_addr = (unsigned long)func->new_func;
- func_size = func->new_size;
+ if (list_is_singular(&ops->func_stack)) {
+ /* original function */
+ func_addr = (unsigned long)func->old_func;
+ func_size = func->old_size;
} else {
- /*
- * Check for the to-be-patched function
- * (the previous func).
- */
- ops = klp_find_ops(func->old_func);
-
- if (list_is_singular(&ops->func_stack)) {
- /* original function */
- func_addr = (unsigned long)func->old_func;
- func_size = func->old_size;
- } else {
- /* previously patched function */
- struct klp_func *prev;
-
- prev = list_next_entry(func, stack_node);
- func_addr = (unsigned long)prev->new_func;
- func_size = prev->new_size;
- }
+ /* previously patched function */
+ struct klp_func *prev;
+
+ prev = list_next_entry(func, stack_node);
+ func_addr = (unsigned long)prev->new_func;
+ func_size = prev->new_size;
}
+ }
+
+ for (i = 0; i < nr_entries; i++) {
+ address = entries[i];
if (address >= func_addr && address < func_addr + func_size)
return -EAGAIN;
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index ea925731fa40..0db4093d17b8 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -7,7 +7,6 @@ obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
KCSAN_SANITIZE_lockdep.o := n
-KMSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 43efb2a04160..29dc253d03af 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -399,7 +399,7 @@ static int *get_random_order(int count)
order[n] = n;
for (n = count - 1; n > 1; n--) {
- r = prandom_u32_max(n + 1);
+ r = get_random_u32_below(n + 1);
if (r != n) {
tmp = order[n];
order[n] = order[r];
@@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work)
{
struct stress *stress = container_of(work, typeof(*stress), work);
const int nlocks = stress->nlocks;
- struct ww_mutex *lock = stress->locks + prandom_u32_max(nlocks);
+ struct ww_mutex *lock = stress->locks + get_random_u32_below(nlocks);
int err;
do {
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 26ea5d04f56c..424b3bc58f3f 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -221,9 +221,10 @@ endchoice
config MODULE_DECOMPRESS
bool "Support in-kernel module decompression"
- depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ
+ depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ || MODULE_COMPRESS_ZSTD
select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
select XZ_DEC if MODULE_COMPRESS_XZ
+ select ZSTD_DECOMPRESS if MODULE_COMPRESS_ZSTD
help
Support for decompressing kernel modules by the kernel itself
diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c
index c033572d83f0..bb79ac1a6d8f 100644
--- a/kernel/module/decompress.c
+++ b/kernel/module/decompress.c
@@ -50,7 +50,7 @@ static struct page *module_get_next_page(struct load_info *info)
return page;
}
-#ifdef CONFIG_MODULE_COMPRESS_GZIP
+#if defined(CONFIG_MODULE_COMPRESS_GZIP)
#include <linux/zlib.h>
#define MODULE_COMPRESSION gzip
#define MODULE_DECOMPRESS_FN module_gzip_decompress
@@ -114,8 +114,8 @@ static ssize_t module_gzip_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
- if (!page) {
- retval = -ENOMEM;
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
goto out_inflate_end;
}
@@ -141,7 +141,7 @@ out:
kfree(s.workspace);
return retval;
}
-#elif CONFIG_MODULE_COMPRESS_XZ
+#elif defined(CONFIG_MODULE_COMPRESS_XZ)
#include <linux/xz.h>
#define MODULE_COMPRESSION xz
#define MODULE_DECOMPRESS_FN module_xz_decompress
@@ -173,8 +173,8 @@ static ssize_t module_xz_decompress(struct load_info *info,
do {
struct page *page = module_get_next_page(info);
- if (!page) {
- retval = -ENOMEM;
+ if (IS_ERR(page)) {
+ retval = PTR_ERR(page);
goto out;
}
@@ -199,6 +199,94 @@ static ssize_t module_xz_decompress(struct load_info *info,
xz_dec_end(xz_dec);
return retval;
}
+#elif defined(CONFIG_MODULE_COMPRESS_ZSTD)
+#include <linux/zstd.h>
+#define MODULE_COMPRESSION zstd
+#define MODULE_DECOMPRESS_FN module_zstd_decompress
+
+static ssize_t module_zstd_decompress(struct load_info *info,
+ const void *buf, size_t size)
+{
+ static const u8 signature[] = { 0x28, 0xb5, 0x2f, 0xfd };
+ ZSTD_outBuffer zstd_dec;
+ ZSTD_inBuffer zstd_buf;
+ zstd_frame_header header;
+ size_t wksp_size;
+ void *wksp = NULL;
+ ZSTD_DStream *dstream;
+ size_t ret;
+ size_t new_size = 0;
+ int retval;
+
+ if (size < sizeof(signature) ||
+ memcmp(buf, signature, sizeof(signature))) {
+ pr_err("not a zstd compressed module\n");
+ return -EINVAL;
+ }
+
+ zstd_buf.src = buf;
+ zstd_buf.pos = 0;
+ zstd_buf.size = size;
+
+ ret = zstd_get_frame_header(&header, zstd_buf.src, zstd_buf.size);
+ if (ret != 0) {
+ pr_err("ZSTD-compressed data has an incomplete frame header\n");
+ retval = -EINVAL;
+ goto out;
+ }
+ if (header.windowSize > (1 << ZSTD_WINDOWLOG_MAX)) {
+ pr_err("ZSTD-compressed data has too large a window size\n");
+ retval = -EINVAL;
+ goto out;
+ }
+
+ wksp_size = zstd_dstream_workspace_bound(header.windowSize);
+ wksp = kmalloc(wksp_size, GFP_KERNEL);
+ if (!wksp) {
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ dstream = zstd_init_dstream(header.windowSize, wksp, wksp_size);
+ if (!dstream) {
+ pr_err("Can't initialize ZSTD stream\n");
+ retval = -ENOMEM;
+ goto out;
+ }
+
+ do {
+ struct page *page = module_get_next_page(info);
+
+ if (!IS_ERR(page)) {
+ retval = PTR_ERR(page);
+ goto out;
+ }
+
+ zstd_dec.dst = kmap_local_page(page);
+ zstd_dec.pos = 0;
+ zstd_dec.size = PAGE_SIZE;
+
+ ret = zstd_decompress_stream(dstream, &zstd_dec, &zstd_buf);
+ kunmap(page);
+ retval = zstd_get_error_code(ret);
+ if (retval)
+ break;
+
+ new_size += zstd_dec.pos;
+ } while (zstd_dec.pos == PAGE_SIZE && ret != 0);
+
+ if (retval) {
+ pr_err("ZSTD-decompression failed with status %d\n", retval);
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = new_size;
+
+ out:
+ kfree(wksp);
+ return retval;
+}
#else
#error "Unexpected configuration for CONFIG_MODULE_DECOMPRESS"
#endif
diff --git a/kernel/module/kallsyms.c b/kernel/module/kallsyms.c
index f5c5c9175333..4523f99b0358 100644
--- a/kernel/module/kallsyms.c
+++ b/kernel/module/kallsyms.c
@@ -494,7 +494,6 @@ unsigned long module_kallsyms_lookup_name(const char *name)
return ret;
}
-#ifdef CONFIG_LIVEPATCH
int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
struct module *, unsigned long),
void *data)
@@ -531,4 +530,3 @@ out:
mutex_unlock(&module_mutex);
return ret;
}
-#endif /* CONFIG_LIVEPATCH */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d02d39c7174e..48568a0f5651 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -85,9 +85,6 @@ struct mod_tree_root mod_data_tree __cacheline_aligned = {
};
#endif
-#define module_addr_min mod_tree.addr_min
-#define module_addr_max mod_tree.addr_max
-
struct symsearch {
const struct kernel_symbol *start, *stop;
const s32 *crcs;
@@ -1674,6 +1671,11 @@ static int elf_validity_check(struct load_info *info)
info->hdr->e_machine);
goto no_exec;
}
+ if (!module_elf_check_arch(info->hdr)) {
+ pr_err("Invalid module architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ goto no_exec;
+ }
if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
pr_err("Invalid ELF section header size\n");
goto no_exec;
@@ -2247,6 +2249,11 @@ static void flush_module_icache(const struct module *mod)
(unsigned long)mod->core_layout.base + mod->core_layout.size);
}
+bool __weak module_elf_check_arch(Elf_Ehdr *hdr)
+{
+ return true;
+}
+
int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
char *secstrings,
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index ce68f821dcd1..c921bf044050 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -340,7 +340,7 @@ static int mod_sysfs_init(struct module *mod)
int err;
struct kobject *kobj;
- if (!module_sysfs_initialized) {
+ if (!module_kset) {
pr_err("%s: module sysfs not initialized\n", mod->name);
err = -EINVAL;
goto out;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 0d5bd62c480e..ab75637fd904 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -62,7 +62,7 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* value of this parameter is -1.
* @nr_calls: Records the number of notifications sent. Don't care
* value of this field is NULL.
- * @returns: notifier_call_chain returns the value returned by the
+ * Return: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
static int notifier_call_chain(struct notifier_block **nl,
@@ -105,13 +105,13 @@ NOKPROBE_SYMBOL(notifier_call_chain);
* @val_up: Value passed unmodified to the notifier function
* @val_down: Value passed unmodified to the notifier function when recovering
* from an error on @val_up
- * @v Pointer passed unmodified to the notifier function
+ * @v: Pointer passed unmodified to the notifier function
*
* NOTE: It is important the @nl chain doesn't change between the two
* invocations of notifier_call_chain() such that we visit the
* exact same notifier callbacks; this rules out any RCU usage.
*
- * Returns: the return value of the @val_up call.
+ * Return: the return value of the @val_up call.
*/
static int notifier_call_chain_robust(struct notifier_block **nl,
unsigned long val_up, unsigned long val_down,
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index eec72ca962e2..a487ff24129b 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -157,7 +157,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
- if (likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
+ if ((flags & CLONE_VM) ||
+ likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
get_nsproxy(old_ns);
return 0;
}
@@ -179,7 +180,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
if (IS_ERR(new_ns))
return PTR_ERR(new_ns);
- timens_on_fork(new_ns, tsk);
+ if ((flags & CLONE_VM) == 0)
+ timens_on_fork(new_ns, tsk);
tsk->nsproxy = new_ns;
return 0;
@@ -254,6 +256,23 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+int exec_task_namespaces(void)
+{
+ struct task_struct *tsk = current;
+ struct nsproxy *new;
+
+ if (tsk->nsproxy->time_ns_for_children == tsk->nsproxy->time_ns)
+ return 0;
+
+ new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ timens_on_fork(new, tsk);
+ switch_task_namespaces(tsk, new);
+ return 0;
+}
+
static int check_setns_flags(unsigned long flags)
{
if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
diff --git a/kernel/padata.c b/kernel/padata.c
index e5819bb8bd1d..e007b8a4b738 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -83,8 +83,16 @@ static struct padata_work *padata_work_alloc(void)
return pw;
}
-static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
- void *data, int flags)
+/*
+ * This function is marked __ref because this function may be optimized in such
+ * a way that it directly refers to work_fn's address, which causes modpost to
+ * complain when work_fn is marked __init. This scenario was observed with clang
+ * LTO, where padata_work_init() was optimized to refer directly to
+ * padata_mt_helper() because the calls to padata_work_init() with other work_fn
+ * values were eliminated or inlined.
+ */
+static void __ref padata_work_init(struct padata_work *pw, work_func_t work_fn,
+ void *data, int flags)
{
if (flags & PADATA_WORK_ONSTACK)
INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
@@ -207,14 +215,16 @@ int padata_do_parallel(struct padata_shell *ps,
pw = padata_work_alloc();
spin_unlock(&padata_works_lock);
+ if (!pw) {
+ /* Maximum works limit exceeded, run in the current task. */
+ padata->parallel(padata);
+ }
+
rcu_read_unlock_bh();
if (pw) {
padata_work_init(pw, padata_parallel_worker, padata, 0);
queue_work(pinst->parallel_wq, &pw->pw_work);
- } else {
- /* Maximum works limit exceeded, run in the current task. */
- padata->parallel(padata);
}
return 0;
@@ -388,13 +398,16 @@ void padata_do_serial(struct padata_priv *padata)
int hashed_cpu = padata_cpu_hash(pd, padata->seq_nr);
struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
+ struct list_head *pos;
spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
- list_for_each_entry_reverse(cur, &reorder->list, list)
+ list_for_each_prev(pos, &reorder->list) {
+ cur = list_entry(pos, struct padata_priv, list);
if (cur->seq_nr < padata->seq_nr)
break;
- list_add(&padata->list, &cur->list);
+ }
+ list_add(&padata->list, pos);
spin_unlock(&reorder->lock);
/*
diff --git a/kernel/panic.c b/kernel/panic.c
index da323209f583..463c9295bc28 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -25,6 +25,7 @@
#include <linux/kexec.h>
#include <linux/panic_notifier.h>
#include <linux/sched.h>
+#include <linux/string_helpers.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
@@ -32,6 +33,7 @@
#include <linux/bug.h>
#include <linux/ratelimit.h>
#include <linux/debugfs.h>
+#include <linux/sysfs.h>
#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -58,6 +60,7 @@ bool crash_kexec_post_notifiers;
int panic_on_warn __read_mostly;
unsigned long panic_on_taint;
bool panic_on_taint_nousertaint = false;
+static unsigned int warn_limit __read_mostly;
int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
@@ -75,8 +78,9 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
-#if defined(CONFIG_SMP) && defined(CONFIG_SYSCTL)
+#ifdef CONFIG_SYSCTL
static struct ctl_table kern_panic_table[] = {
+#ifdef CONFIG_SMP
{
.procname = "oops_all_cpu_backtrace",
.data = &sysctl_oops_all_cpu_backtrace,
@@ -86,6 +90,14 @@ static struct ctl_table kern_panic_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
+#endif
+ {
+ .procname = "warn_limit",
+ .data = &warn_limit,
+ .maxlen = sizeof(warn_limit),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
{ }
};
@@ -97,6 +109,25 @@ static __init int kernel_panic_sysctls_init(void)
late_initcall(kernel_panic_sysctls_init);
#endif
+static atomic_t warn_count = ATOMIC_INIT(0);
+
+#ifdef CONFIG_SYSFS
+static ssize_t warn_count_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *page)
+{
+ return sysfs_emit(page, "%d\n", atomic_read(&warn_count));
+}
+
+static struct kobj_attribute warn_count_attr = __ATTR_RO(warn_count);
+
+static __init int kernel_panic_sysfs_init(void)
+{
+ sysfs_add_file_to_group(kernel_kobj, &warn_count_attr.attr, NULL);
+ return 0;
+}
+late_initcall(kernel_panic_sysfs_init);
+#endif
+
static long no_blink(int state)
{
return 0;
@@ -199,6 +230,19 @@ static void panic_print_sys_info(bool console_flush)
ftrace_dump(DUMP_ALL);
}
+void check_panic_on_warn(const char *origin)
+{
+ unsigned int limit;
+
+ if (panic_on_warn)
+ panic("%s: panic_on_warn set ...\n", origin);
+
+ limit = READ_ONCE(warn_limit);
+ if (atomic_inc_return(&warn_count) >= limit && limit)
+ panic("%s: system warned too often (kernel.warn_limit is %d)",
+ origin, limit);
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -617,8 +661,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (regs)
show_regs(regs);
- if (panic_on_warn)
- panic("panic_on_warn set ...\n");
+ check_panic_on_warn("kernel");
if (!regs)
dump_stack();
@@ -744,8 +787,8 @@ static int __init panic_on_taint_setup(char *s)
if (s && !strcmp(s, "nousertaint"))
panic_on_taint_nousertaint = true;
- pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n",
- panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis");
+ pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%s\n",
+ panic_on_taint, str_enabled_disabled(panic_on_taint_nousertaint));
return 0;
}
diff --git a/kernel/params.c b/kernel/params.c
index 5b92310425c5..14d66070757b 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -926,7 +926,7 @@ static const struct sysfs_ops module_sysfs_ops = {
.store = module_attr_store,
};
-static int uevent_filter(struct kobject *kobj)
+static int uevent_filter(const struct kobject *kobj)
{
const struct kobj_type *ktype = get_ktype(kobj);
@@ -940,7 +940,6 @@ static const struct kset_uevent_ops module_uevent_ops = {
};
struct kset *module_kset;
-int module_sysfs_initialized;
static void module_kobj_release(struct kobject *kobj)
{
@@ -954,7 +953,11 @@ struct kobj_type module_ktype = {
};
/*
- * param_sysfs_init - wrapper for built-in params support
+ * param_sysfs_init - create "module" kset
+ *
+ * This must be done before the initramfs is unpacked and
+ * request_module() thus becomes possible, because otherwise the
+ * module load would fail in mod_sysfs_init.
*/
static int __init param_sysfs_init(void)
{
@@ -964,13 +967,25 @@ static int __init param_sysfs_init(void)
__FILE__, __LINE__);
return -ENOMEM;
}
- module_sysfs_initialized = 1;
+
+ return 0;
+}
+subsys_initcall(param_sysfs_init);
+
+/*
+ * param_sysfs_builtin_init - add sysfs version and parameter
+ * attributes for built-in modules
+ */
+static int __init param_sysfs_builtin_init(void)
+{
+ if (!module_kset)
+ return -ENOMEM;
version_sysfs_builtin();
param_sysfs_builtin();
return 0;
}
-subsys_initcall(param_sysfs_init);
+late_initcall(param_sysfs_builtin_init);
#endif /* CONFIG_SYSFS */
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ddd9988327fe..6c1c7e566d35 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -27,6 +27,8 @@ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
static int try_to_freeze_tasks(bool user_only)
{
+ const char *what = user_only ? "user space processes" :
+ "remaining freezable tasks";
struct task_struct *g, *p;
unsigned long end_time;
unsigned int todo;
@@ -36,6 +38,8 @@ static int try_to_freeze_tasks(bool user_only)
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+ pr_info("Freezing %s\n", what);
+
start = ktime_get_boottime();
end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs);
@@ -82,9 +86,8 @@ static int try_to_freeze_tasks(bool user_only)
elapsed_msecs = ktime_to_ms(elapsed);
if (todo) {
- pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
+ pr_err("Freezing %s %s after %d.%03d seconds "
+ "(%d tasks refusing to freeze, wq_busy=%d):\n", what,
wakeup ? "aborted" : "failed",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
@@ -101,8 +104,8 @@ static int try_to_freeze_tasks(bool user_only)
read_unlock(&tasklist_lock);
}
} else {
- pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
- elapsed_msecs % 1000);
+ pr_info("Freezing %s completed (elapsed %d.%03d seconds)\n",
+ what, elapsed_msecs / 1000, elapsed_msecs % 1000);
}
return todo ? -EBUSY : 0;
@@ -130,14 +133,11 @@ int freeze_processes(void)
static_branch_inc(&freezer_active);
pm_wakeup_clear(0);
- pr_info("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
- if (!error) {
+ if (!error)
__usermodehelper_set_disable_depth(UMH_DISABLED);
- pr_cont("done.");
- }
- pr_cont("\n");
+
BUG_ON(in_atomic());
/*
@@ -166,14 +166,9 @@ int freeze_kernel_threads(void)
{
int error;
- pr_info("Freezing remaining freezable tasks ... ");
-
pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
- if (!error)
- pr_cont("done.");
- pr_cont("\n");
BUG_ON(in_atomic());
if (error)
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 2a406753af90..cd8b7b35f1e8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1723,8 +1723,8 @@ static unsigned long minimum_image_size(unsigned long saveable)
* /sys/power/reserved_size, respectively). To make this happen, we compute the
* total number of available page frames and allocate at least
*
- * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
- * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
+ * ([page frames total] - PAGES_FOR_IO - [metadata pages]) / 2
+ * - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
*
* of them, which corresponds to the maximum size of a hibernation image.
*
@@ -2259,10 +2259,14 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
- if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j]))
+ if (pfn_valid(buf[j]) && memory_bm_pfn_present(bm, buf[j])) {
memory_bm_set_bit(bm, buf[j]);
- else
+ } else {
+ if (!pfn_valid(buf[j]))
+ pr_err(FW_BUG "Memory map mismatch at 0x%llx after hibernation\n",
+ (unsigned long long)PFN_PHYS(buf[j]));
return -EFAULT;
+ }
}
return 0;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e4f1e7478b52..7decf1e9c486 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -79,13 +79,20 @@ int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);
/*
- * console_sem protects the console_drivers list, and also
- * provides serialisation for access to the entire console
- * driver system.
+ * console_mutex protects console_list updates and console->flags updates.
+ * The flags are synchronized only for consoles that are registered, i.e.
+ * accessible via the console list.
+ */
+static DEFINE_MUTEX(console_mutex);
+
+/*
+ * console_sem protects updates to console->seq and console_suspended,
+ * and also provides serialization for console printing.
*/
static DEFINE_SEMAPHORE(console_sem);
-struct console *console_drivers;
-EXPORT_SYMBOL_GPL(console_drivers);
+HLIST_HEAD(console_list);
+EXPORT_SYMBOL_GPL(console_list);
+DEFINE_STATIC_SRCU(console_srcu);
/*
* System may need to suppress printk message under certain
@@ -103,6 +110,19 @@ static int __read_mostly suppress_panic_printk;
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
};
+
+void lockdep_assert_console_list_lock_held(void)
+{
+ lockdep_assert_held(&console_mutex);
+}
+EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+bool console_srcu_read_lock_is_held(void)
+{
+ return srcu_read_lock_held(&console_srcu);
+}
#endif
enum devkmsg_log_bits {
@@ -220,6 +240,69 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
}
#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */
+/**
+ * console_list_lock - Lock the console list
+ *
+ * For console list or console->flags updates
+ */
+void console_list_lock(void)
+{
+ /*
+ * In unregister_console() and console_force_preferred_locked(),
+ * synchronize_srcu() is called with the console_list_lock held.
+ * Therefore it is not allowed that the console_list_lock is taken
+ * with the srcu_lock held.
+ *
+ * Detecting if this context is really in the read-side critical
+ * section is only possible if the appropriate debug options are
+ * enabled.
+ */
+ WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
+ srcu_read_lock_held(&console_srcu));
+
+ mutex_lock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_lock);
+
+/**
+ * console_list_unlock - Unlock the console list
+ *
+ * Counterpart to console_list_lock()
+ */
+void console_list_unlock(void)
+{
+ mutex_unlock(&console_mutex);
+}
+EXPORT_SYMBOL(console_list_unlock);
+
+/**
+ * console_srcu_read_lock - Register a new reader for the
+ * SRCU-protected console list
+ *
+ * Use for_each_console_srcu() to iterate the console list
+ *
+ * Context: Any context.
+ * Return: A cookie to pass to console_srcu_read_unlock().
+ */
+int console_srcu_read_lock(void)
+{
+ return srcu_read_lock_nmisafe(&console_srcu);
+}
+EXPORT_SYMBOL(console_srcu_read_lock);
+
+/**
+ * console_srcu_read_unlock - Unregister an old reader from
+ * the SRCU-protected console list
+ * @cookie: cookie returned from console_srcu_read_lock()
+ *
+ * Counterpart to console_srcu_read_lock()
+ */
+void console_srcu_read_unlock(int cookie)
+{
+ srcu_read_unlock_nmisafe(&console_srcu, cookie);
+}
+EXPORT_SYMBOL(console_srcu_read_unlock);
+
/*
* Helper macros to handle lockdep when locking/unlocking console_sem. We use
* macros instead of functions so that _RET_IP_ contains useful information.
@@ -1814,13 +1897,13 @@ static void console_lock_spinning_enable(void)
* safe to start busy waiting for the lock. Second, it checks if
* there is a busy waiter and passes the lock rights to her.
*
- * Important: Callers lose the lock if there was a busy waiter.
- * They must not touch items synchronized by console_lock
- * in this case.
+ * Important: Callers lose both the console_lock and the SRCU read lock if
+ * there was a busy waiter. They must not touch items synchronized by
+ * console_lock or SRCU read lock in this case.
*
* Return: 1 if the lock rights were passed, 0 otherwise.
*/
-static int console_lock_spinning_disable_and_check(void)
+static int console_lock_spinning_disable_and_check(int cookie)
{
int waiter;
@@ -1840,6 +1923,12 @@ static int console_lock_spinning_disable_and_check(void)
spin_release(&console_owner_dep_map, _THIS_IP_);
/*
+ * Preserve lockdep lock ordering. Release the SRCU read lock before
+ * releasing the console_lock.
+ */
+ console_srcu_read_unlock(cookie);
+
+ /*
* Hand off console_lock to waiter. The waiter will perform
* the up(). After this, the waiter is the console_lock owner.
*/
@@ -2322,7 +2411,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
char *text, size_t text_len,
struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
-static int console_lock_spinning_disable_and_check(void) { return 0; }
+static int console_lock_spinning_disable_and_check(int cookie) { return 0; }
static void call_console_driver(struct console *con, const char *text, size_t len,
char *dropped_text)
{
@@ -2391,7 +2480,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
return -E2BIG;
if (!brl_options)
preferred_console = i;
- strlcpy(c->name, name, sizeof(c->name));
+ strscpy(c->name, name, sizeof(c->name));
c->options = options;
set_user_specified(c, user_specified);
braille_set_options(c, brl_options);
@@ -2553,10 +2642,10 @@ static int console_cpu_notify(unsigned int cpu)
}
/**
- * console_lock - lock the console system for exclusive use.
+ * console_lock - block the console subsystem from printing
*
- * Acquires a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Acquires a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* Can sleep, returns nothing.
*/
@@ -2573,10 +2662,10 @@ void console_lock(void)
EXPORT_SYMBOL(console_lock);
/**
- * console_trylock - try to lock the console system for exclusive use.
+ * console_trylock - try to block the console subsystem from printing
*
- * Try to acquire a lock which guarantees that the caller has exclusive
- * access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that no consoles will
+ * be in or enter their write() callback.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
@@ -2623,11 +2712,13 @@ static bool abandon_console_lock_in_panic(void)
* Check if the given console is currently capable and allowed to print
* records.
*
- * Requires the console_lock.
+ * Requires the console_srcu_read_lock.
*/
static inline bool console_is_usable(struct console *con)
{
- if (!(con->flags & CON_ENABLED))
+ short flags = console_srcu_read_flags(con);
+
+ if (!(flags & CON_ENABLED))
return false;
if (!con->write)
@@ -2638,8 +2729,7 @@ static inline bool console_is_usable(struct console *con)
* allocated. So unless they're explicitly marked as being able to
* cope (CON_ANYTIME) don't call them until this CPU is officially up.
*/
- if (!cpu_online(raw_smp_processor_id()) &&
- !(con->flags & CON_ANYTIME))
+ if (!cpu_online(raw_smp_processor_id()) && !(flags & CON_ANYTIME))
return false;
return true;
@@ -2664,16 +2754,18 @@ static void __console_unlock(void)
* DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
*
* @handover will be set to true if a printk waiter has taken over the
- * console_lock, in which case the caller is no longer holding the
- * console_lock. Otherwise it is set to false.
+ * console_lock, in which case the caller is no longer holding both the
+ * console_lock and the SRCU read lock. Otherwise it is set to false.
+ *
+ * @cookie is the cookie from the SRCU read lock.
*
* Returns false if the given console has no next record to print, otherwise
* true.
*
- * Requires the console_lock.
+ * Requires the console_lock and the SRCU read lock.
*/
static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
- char *dropped_text, bool *handover)
+ char *dropped_text, bool *handover, int cookie)
{
static int panic_console_dropped;
struct printk_info info;
@@ -2733,7 +2825,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
con->seq++;
- *handover = console_lock_spinning_disable_and_check();
+ *handover = console_lock_spinning_disable_and_check(cookie);
printk_safe_exit_irqrestore(flags);
skip:
return true;
@@ -2770,6 +2862,7 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
bool any_usable = false;
struct console *con;
bool any_progress;
+ int cookie;
*next_seq = 0;
*handover = false;
@@ -2777,23 +2870,29 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
do {
any_progress = false;
- for_each_console(con) {
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(con) {
bool progress;
if (!console_is_usable(con))
continue;
any_usable = true;
- if (con->flags & CON_EXTENDED) {
+ if (console_srcu_read_flags(con) & CON_EXTENDED) {
/* Extended consoles do not print "dropped messages". */
progress = console_emit_next_record(con, &text[0],
&ext_text[0], NULL,
- handover);
+ handover, cookie);
} else {
progress = console_emit_next_record(con, &text[0],
NULL, &dropped_text[0],
- handover);
+ handover, cookie);
}
+
+ /*
+ * If a handover has occurred, the SRCU read lock
+ * is already released.
+ */
if (*handover)
return false;
@@ -2807,21 +2906,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
/* Allow panic_cpu to take over the consoles safely. */
if (abandon_console_lock_in_panic())
- return false;
+ goto abandon;
if (do_cond_resched)
cond_resched();
}
+ console_srcu_read_unlock(cookie);
} while (any_progress);
return any_usable;
+
+abandon:
+ console_srcu_read_unlock(cookie);
+ return false;
}
/**
- * console_unlock - unlock the console system
+ * console_unlock - unblock the console subsystem from printing
*
- * Releases the console_lock which the caller holds on the console system
- * and the console driver list.
+ * Releases the console_lock which the caller holds to block printing of
+ * the console subsystem.
*
* While the console_lock was held, console output may have been buffered
* by printk(). If this is the case, console_unlock(); emits
@@ -2899,10 +3003,14 @@ EXPORT_SYMBOL(console_conditional_schedule);
void console_unblank(void)
{
struct console *c;
+ int cookie;
/*
- * console_unblank can no longer be called in interrupt context unless
- * oops_in_progress is set to 1..
+ * Stop console printing because the unblank() callback may
+ * assume the console is not within its write() callback.
+ *
+ * If @oops_in_progress is set, this may be an atomic context.
+ * In that case, attempt a trylock as best-effort.
*/
if (oops_in_progress) {
if (down_trylock_console_sem() != 0)
@@ -2912,9 +3020,14 @@ void console_unblank(void)
console_locked = 1;
console_may_schedule = 0;
- for_each_console(c)
- if ((c->flags & CON_ENABLED) && c->unblank)
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ if ((console_srcu_read_flags(c) & CON_ENABLED) && c->unblank)
c->unblank();
+ }
+ console_srcu_read_unlock(cookie);
+
console_unlock();
if (!oops_in_progress)
@@ -2941,11 +3054,21 @@ void console_flush_on_panic(enum con_flush_mode mode)
if (mode == CONSOLE_REPLAY_ALL) {
struct console *c;
+ int cookie;
u64 seq;
seq = prb_first_valid_seq(prb);
- for_each_console(c)
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
+ /*
+ * If the above console_trylock() failed, this is an
+ * unsynchronized assignment. But in that case, the
+ * kernel is in "hope and pray" mode anyway.
+ */
c->seq = seq;
+ }
+ console_srcu_read_unlock(cookie);
}
console_unlock();
}
@@ -2957,15 +3080,25 @@ struct tty_driver *console_device(int *index)
{
struct console *c;
struct tty_driver *driver = NULL;
+ int cookie;
+ /*
+ * Take console_lock to serialize device() callback with
+ * other console operations. For example, fg_console is
+ * modified under console_lock when switching vt.
+ */
console_lock();
- for_each_console(c) {
+
+ cookie = console_srcu_read_lock();
+ for_each_console_srcu(c) {
if (!c->device)
continue;
driver = c->device(c, index);
if (driver)
break;
}
+ console_srcu_read_unlock(cookie);
+
console_unlock();
return driver;
}
@@ -2978,17 +3111,25 @@ struct tty_driver *console_device(int *index)
void console_stop(struct console *console)
{
__pr_flush(console, 1000, true);
- console_lock();
- console->flags &= ~CON_ENABLED;
- console_unlock();
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
+ console_list_unlock();
+
+ /*
+ * Ensure that all SRCU list walks have completed. All contexts must
+ * be able to see that this console is disabled so that (for example)
+ * the caller can suspend the port without risk of another context
+ * using the port.
+ */
+ synchronize_srcu(&console_srcu);
}
EXPORT_SYMBOL(console_stop);
void console_start(struct console *console)
{
- console_lock();
- console->flags |= CON_ENABLED;
- console_unlock();
+ console_list_lock();
+ console_srcu_write_flags(console, console->flags | CON_ENABLED);
+ console_list_unlock();
__pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_start);
@@ -3081,6 +3222,72 @@ static void try_enable_default_console(struct console *newcon)
(con->flags & CON_BOOT) ? "boot" : "", \
con->name, con->index, ##__VA_ARGS__)
+static void console_init_seq(struct console *newcon, bool bootcon_registered)
+{
+ struct console *con;
+ bool handover;
+
+ if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
+ /* Get a consistent copy of @syslog_seq. */