aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt3
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/async.c3
-rw-r--r--kernel/audit.c109
-rw-r--r--kernel/audit.h4
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/auditfilter.c7
-rw-r--r--kernel/auditsc.c100
-rw-r--r--kernel/bpf/Kconfig5
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/arraymap.c4
-rw-r--r--kernel/bpf/bloom_filter.c6
-rw-r--r--kernel/bpf/bpf_inode_storage.c15
-rw-r--r--kernel/bpf/bpf_iter.c55
-rw-r--r--kernel/bpf/bpf_local_storage.c110
-rw-r--r--kernel/bpf/bpf_lsm.c23
-rw-r--r--kernel/bpf/bpf_struct_ops.c6
-rw-r--r--kernel/bpf/bpf_task_storage.c20
-rw-r--r--kernel/bpf/btf.c1210
-rw-r--r--kernel/bpf/cgroup.c191
-rw-r--r--kernel/bpf/core.c370
-rw-r--r--kernel/bpf/cpumap.c20
-rw-r--r--kernel/bpf/devmap.c39
-rw-r--r--kernel/bpf/hashtab.c2
-rw-r--r--kernel/bpf/helpers.c80
-rw-r--r--kernel/bpf/inode.c53
-rw-r--r--kernel/bpf/local_storage.c5
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/map_iter.c4
-rw-r--r--kernel/bpf/mmap_unlock_work.h65
-rw-r--r--kernel/bpf/net_namespace.c1
-rw-r--r--kernel/bpf/preload/Kconfig7
-rw-r--r--kernel/bpf/preload/Makefile41
-rw-r--r--kernel/bpf/preload/bpf_preload.h8
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c126
-rw-r--r--kernel/bpf/preload/bpf_preload_umd_blob.S7
-rw-r--r--kernel/bpf/preload/iterators/Makefile6
-rw-r--r--kernel/bpf/preload/iterators/bpf_preload_common.h13
-rw-r--r--kernel/bpf/preload/iterators/iterators.c94
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel.h425
-rw-r--r--kernel/bpf/preload/iterators/iterators.skel.h412
-rw-r--r--kernel/bpf/reuseport_array.c8
-rw-r--r--kernel/bpf/ringbuf.c4
-rw-r--r--kernel/bpf/stackmap.c157
-rw-r--r--kernel/bpf/syscall.c164
-rw-r--r--kernel/bpf/task_iter.c82
-rw-r--r--kernel/bpf/trampoline.c41
-rw-r--r--kernel/bpf/verifier.c1446
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/cgroup/cgroup-internal.h19
-rw-r--r--kernel/cgroup/cgroup-v1.c49
-rw-r--r--kernel/cgroup/cgroup.c122
-rw-r--r--kernel/cgroup/cpuset.c93
-rw-r--r--kernel/cgroup/freezer.c2
-rw-r--r--kernel/cgroup/rstat.c69
-rw-r--r--kernel/configs/android-recommended.config2
-rw-r--r--kernel/configs/debug.config106
-rw-r--r--kernel/cpu.c58
-rw-r--r--kernel/crash_core.c11
-rw-r--r--kernel/cred.c11
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/delayacct.c49
-rw-r--r--kernel/dma/Kconfig7
-rw-r--r--kernel/dma/Makefile2
-rw-r--r--kernel/dma/contiguous.c4
-rw-r--r--kernel/dma/debug.c4
-rw-r--r--kernel/dma/direct.c246
-rw-r--r--kernel/dma/direct.h3
-rw-r--r--kernel/dma/map_benchmark.c25
-rw-r--r--kernel/dma/mapping.c4
-rw-r--r--kernel/dma/pool.c4
-rw-r--r--kernel/dma/swiotlb.c131
-rw-r--r--kernel/entry/common.c46
-rw-r--r--kernel/entry/kvm.c13
-rw-r--r--kernel/entry/syscall_user_dispatch.c4
-rw-r--r--kernel/events/callchain.c4
-rw-r--r--kernel/events/core.c549
-rw-r--r--kernel/events/uprobes.c13
-rw-r--r--kernel/exit.c109
-rw-r--r--kernel/extable.c24
-rw-r--r--kernel/fork.c369
-rw-r--r--kernel/futex/core.c39
-rw-r--r--kernel/futex/futex.h6
-rw-r--r--kernel/futex/syscalls.c22
-rw-r--r--kernel/gcov/Kconfig1
-rw-r--r--kernel/hung_task.c81
-rw-r--r--kernel/irq/affinity.c5
-rw-r--r--kernel/irq/chip.c33
-rw-r--r--kernel/irq/cpuhotplug.c4
-rw-r--r--kernel/irq/debugfs.c8
-rw-r--r--kernel/irq/generic-chip.c2
-rw-r--r--kernel/irq/handle.c11
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/irqdesc.c27
-rw-r--r--kernel/irq/irqdomain.c9
-rw-r--r--kernel/irq/manage.c12
-rw-r--r--kernel/irq/msi.c792
-rw-r--r--kernel/irq/proc.c8
-rw-r--r--kernel/irq_work.c2
-rw-r--r--kernel/kallsyms.c5
-rw-r--r--kernel/kcov.c98
-rw-r--r--kernel/kcsan/Makefile3
-rw-r--r--kernel/kcsan/core.c347
-rw-r--r--kernel/kcsan/kcsan_test.c426
-rw-r--r--kernel/kcsan/report.c51
-rw-r--r--kernel/kcsan/selftest.c143
-rw-r--r--kernel/kexec_core.c2
-rw-r--r--kernel/kprobes.c225
-rw-r--r--kernel/ksysfs.c3
-rw-r--r--kernel/kthread.c130
-rw-r--r--kernel/livepatch/core.c33
-rw-r--r--kernel/livepatch/patch.c19
-rw-r--r--kernel/livepatch/shadow.c6
-rw-r--r--kernel/livepatch/transition.c12
-rw-r--r--kernel/locking/lockdep.c49
-rw-r--r--kernel/locking/lockdep_internals.h6
-rw-r--r--kernel/locking/lockdep_proc.c51
-rw-r--r--kernel/locking/locktorture.c4
-rw-r--r--kernel/locking/mutex.c11
-rw-r--r--kernel/locking/percpu-rwsem.c5
-rw-r--r--kernel/locking/rtmutex.c10
-rw-r--r--kernel/locking/rtmutex_api.c30
-rw-r--r--kernel/locking/rwsem.c193
-rw-r--r--kernel/locking/spinlock.c10
-rw-r--r--kernel/locking/spinlock_rt.c18
-rw-r--r--kernel/locking/ww_rt_mutex.c2
-rw-r--r--kernel/module-internal.h19
-rw-r--r--kernel/module.c80
-rw-r--r--kernel/module_decompress.c273
-rw-r--r--kernel/notifier.c15
-rw-r--r--kernel/padata.c2
-rw-r--r--kernel/panic.c58
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/power/hibernate.c13
-rw-r--r--kernel/power/main.c5
-rw-r--r--kernel/power/power.h1
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/snapshot.c21
-rw-r--r--kernel/power/suspend.c2
-rw-r--r--kernel/power/suspend_test.c8
-rw-r--r--kernel/power/swap.c30
-rw-r--r--kernel/power/user.c2
-rw-r--r--kernel/power/wakelock.c11
-rw-r--r--kernel/printk/Makefile5
-rw-r--r--kernel/printk/internal.h8
-rw-r--r--kernel/printk/printk.c198
-rw-r--r--kernel/printk/printk_ringbuffer.c52
-rw-r--r--kernel/printk/printk_ringbuffer.h2
-rw-r--r--kernel/printk/sysctl.c85
-rw-r--r--kernel/profile.c73
-rw-r--r--kernel/ptrace.c49
-rw-r--r--kernel/rcu/Kconfig20
-rw-r--r--kernel/rcu/rcu_segcblist.c10
-rw-r--r--kernel/rcu/rcu_segcblist.h16
-rw-r--r--kernel/rcu/rcuscale.c14
-rw-r--r--kernel/rcu/rcutorture.c276
-rw-r--r--kernel/rcu/refscale.c50
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/tasks.h482
-rw-r--r--kernel/rcu/tree.c456
-rw-r--r--kernel/rcu/tree.h49
-rw-r--r--kernel/rcu/tree_exp.h19
-rw-r--r--kernel/rcu/tree_nocb.h178
-rw-r--r--kernel/rcu/tree_plugin.h283
-rw-r--r--kernel/rcu/tree_stall.h62
-rw-r--r--kernel/rcu/update.c7
-rw-r--r--kernel/resource.c45
-rw-r--r--kernel/rseq.c8
-rw-r--r--kernel/scftorture.c16
-rw-r--r--kernel/sched/Makefile35
-rw-r--r--kernel/sched/autogroup.c26
-rw-r--r--kernel/sched/autogroup.h6
-rw-r--r--kernel/sched/build_policy.c52
-rw-r--r--kernel/sched/build_utility.c109
-rw-r--r--kernel/sched/clock.c44
-rw-r--r--kernel/sched/completion.c2
-rw-r--r--kernel/sched/core.c655
-rw-r--r--kernel/sched/core_sched.c69
-rw-r--r--kernel/sched/cpuacct.c117
-rw-r--r--kernel/sched/cpudeadline.c3
-rw-r--r--kernel/sched/cpufreq.c3
-rw-r--r--kernel/sched/cpufreq_schedutil.c22
-rw-r--r--kernel/sched/cpupri.c1
-rw-r--r--kernel/sched/cputime.c17
-rw-r--r--kernel/sched/deadline.c155
-rw-r--r--kernel/sched/debug.c15
-rw-r--r--kernel/sched/fair.c319
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/sched/isolation.c163
-rw-r--r--kernel/sched/loadavg.c1
-rw-r--r--kernel/sched/membarrier.c10
-rw-r--r--kernel/sched/pelt.c4
-rw-r--r--kernel/sched/pelt.h4
-rw-r--r--kernel/sched/psi.c253
-rw-r--r--kernel/sched/rt.c74
-rw-r--r--kernel/sched/sched.h386
-rw-r--r--kernel/sched/stats.c1
-rw-r--r--kernel/sched/stats.h9
-rw-r--r--kernel/sched/stop_task.c1
-rw-r--r--kernel/sched/swait.c1
-rw-r--r--kernel/sched/topology.c268
-rw-r--r--kernel/sched/wait.c8
-rw-r--r--kernel/sched/wait_bit.c2
-rw-r--r--kernel/scs.c12
-rw-r--r--kernel/seccomp.c11
-rw-r--r--kernel/signal.c173
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c13
-rw-r--r--kernel/stackleak.c31
-rw-r--r--kernel/stacktrace.c3
-rw-r--r--kernel/static_call.c542
-rw-r--r--kernel/static_call_inline.c543
-rw-r--r--kernel/sys.c223
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c742
-rw-r--r--kernel/task_work.c4
-rw-r--r--kernel/taskstats.c5
-rw-r--r--kernel/time/Kconfig9
-rw-r--r--kernel/time/clocksource.c62
-rw-r--r--kernel/time/posix-cpu-timers.c13
-rw-r--r--kernel/time/tick-sched.c90
-rw-r--r--kernel/time/tick-sched.h4
-rw-r--r--kernel/time/timekeeping.c3
-rw-r--r--kernel/time/timer.c27
-rw-r--r--kernel/torture.c10
-rw-r--r--kernel/trace/Kconfig68
-rw-r--r--kernel/trace/Makefile3
-rw-r--r--kernel/trace/blktrace.c47
-rw-r--r--kernel/trace/bpf_trace.c452
-rw-r--r--kernel/trace/fgraph.c21
-rw-r--r--kernel/trace/fprobe.c332
-rw-r--r--kernel/trace/ftrace.c159
-rw-r--r--kernel/trace/rethook.c317
-rw-r--r--kernel/trace/ring_buffer.c7
-rw-r--r--kernel/trace/trace.c209
-rw-r--r--kernel/trace/trace.h110
-rw-r--r--kernel/trace/trace_eprobe.c54
-rw-r--r--kernel/trace/trace_events.c128
-rw-r--r--kernel/trace/trace_events_filter.c139
-rw-r--r--kernel/trace/trace_events_hist.c170
-rw-r--r--kernel/trace/trace_events_inject.c11
-rw-r--r--kernel/trace/trace_events_synth.c40
-rw-r--r--kernel/trace/trace_events_trigger.c497
-rw-r--r--kernel/trace/trace_events_user.c1628
-rw-r--r--kernel/trace/trace_hwlat.c6
-rw-r--r--kernel/trace/trace_kprobe.c50
-rw-r--r--kernel/trace/trace_osnoise.c114
-rw-r--r--kernel/trace/trace_output.c4
-rw-r--r--kernel/trace/trace_probe.c15
-rw-r--r--kernel/trace/trace_probe.h1
-rw-r--r--kernel/trace/trace_sched_switch.c3
-rw-r--r--kernel/trace/trace_sched_wakeup.c1
-rw-r--r--kernel/trace/trace_selftest.c6
-rw-r--r--kernel/trace/trace_syscalls.c6
-rw-r--r--kernel/trace/trace_uprobe.c39
-rw-r--r--kernel/trace/tracing_map.c3
-rw-r--r--kernel/tsacct.c7
-rw-r--r--kernel/ucount.c20
-rw-r--r--kernel/user_namespace.c14
-rw-r--r--kernel/watch_queue.c26
-rw-r--r--kernel/watchdog.c103
-rw-r--r--kernel/workqueue.c151
262 files changed, 16324 insertions, 7685 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index ce77f0265660..c2f1fd95a821 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -96,8 +96,9 @@ config PREEMPTION
config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot"
depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
select PREEMPT_BUILD
- default y
+ default y if HAVE_PREEMPT_DYNAMIC_CALL
help
This option allows to define the preemption model on the kernel
command line parameter and thus override the default preemption
diff --git a/kernel/Makefile b/kernel/Makefile
index 186c49582f45..847a82bfe0e3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -67,6 +67,7 @@ obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
+obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -107,12 +108,14 @@ obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_RETHOOK) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
-obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o
obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/async.c b/kernel/async.c
index b8d7a663497f..b2c4ba5686ee 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* mark that this task has queued an async job, used by module init */
- current->flags |= PF_USED_ASYNC;
-
/* schedule for execution */
queue_work_node(node, system_unbound_wq, &entry->work);
diff --git a/kernel/audit.c b/kernel/audit.c
index 121d37e700a6..7690c29d4ee4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb)
/**
* kauditd_rehold_skb - Handle a audit record send failure in the hold queue
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* This should only be used by the kauditd_thread when it fails to flush the
* hold queue.
*/
-static void kauditd_rehold_skb(struct sk_buff *skb)
+static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
- /* put the record back in the queue at the same place */
- skb_queue_head(&audit_hold_queue, skb);
+ /* put the record back in the queue */
+ skb_queue_tail(&audit_hold_queue, skb);
}
/**
* kauditd_hold_skb - Queue an audit record, waiting for auditd
* @skb: audit record
+ * @error: error code
*
* Description:
* Queue the audit record, waiting for an instance of auditd. When this
@@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
* and queue it, if we have room. If we want to hold on to the record, but we
* don't have room, record a record lost message.
*/
-static void kauditd_hold_skb(struct sk_buff *skb)
+static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
/* at this point it is uncertain if we will ever send this to auditd so
* try to send the message via printk before we go any further */
kauditd_printk_skb(skb);
/* can we just silently drop the message? */
- if (!audit_default) {
- kfree_skb(skb);
- return;
+ if (!audit_default)
+ goto drop;
+
+ /* the hold queue is only for when the daemon goes away completely,
+ * not -EAGAIN failures; if we are in a -EAGAIN state requeue the
+ * record on the retry queue unless it's full, in which case drop it
+ */
+ if (error == -EAGAIN) {
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+ audit_log_lost("kauditd retry queue overflow");
+ goto drop;
}
- /* if we have room, queue the message */
+ /* if we have room in the hold queue, queue the message */
if (!audit_backlog_limit ||
skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_hold_queue, skb);
@@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb)
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
+drop:
kfree_skb(skb);
}
/**
* kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* Not as serious as kauditd_hold_skb() as we still have a connected auditd,
* but for some reason we are having problems sending it audit records so
* queue the given record and attempt to resend.
*/
-static void kauditd_retry_skb(struct sk_buff *skb)
+static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
- /* NOTE: because records should only live in the retry queue for a
- * short period of time, before either being sent or moved to the hold
- * queue, we don't currently enforce a limit on this queue */
- skb_queue_tail(&audit_retry_queue, skb);
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+
+ /* we have to drop the record, send it via printk as a last effort */
+ kauditd_printk_skb(skb);
+ audit_log_lost("kauditd retry queue overflow");
+ kfree_skb(skb);
}
/**
@@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac)
/* flush the retry queue to the hold queue, but don't touch the main
* queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
- kauditd_hold_skb(skb);
+ kauditd_hold_skb(skb, -ECONNREFUSED);
}
/**
@@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
struct sk_buff_head *queue,
unsigned int retry_limit,
void (*skb_hook)(struct sk_buff *skb),
- void (*err_hook)(struct sk_buff *skb))
+ void (*err_hook)(struct sk_buff *skb, int error))
{
int rc = 0;
- struct sk_buff *skb;
- static unsigned int failed = 0;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *skb_tail;
+ unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
- while ((skb = skb_dequeue(queue))) {
+ skb_tail = skb_peek_tail(queue);
+ while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
/* call the skb_hook for each skb we touch */
if (skb_hook)
(*skb_hook)(skb);
@@ -731,36 +755,34 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
/* can we send to anyone via unicast? */
if (!sk) {
if (err_hook)
- (*err_hook)(skb);
+ (*err_hook)(skb, -ECONNREFUSED);
continue;
}
+retry:
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
- /* fatal failure for our queue flush attempt? */
+ /* send failed - try a few times unless fatal error */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
- /* yes - error processing for the queue */
sk = NULL;
if (err_hook)
- (*err_hook)(skb);
- if (!skb_hook)
- goto out;
- /* keep processing with the skb_hook */
+ (*err_hook)(skb, rc);
+ if (rc == -EAGAIN)
+ rc = 0;
+ /* continue to drain the queue */
continue;
} else
- /* no - requeue to preserve ordering */
- skb_queue_head(queue, skb);
+ goto retry;
} else {
- /* it worked - drop the extra reference and continue */
+ /* skb sent - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
-out:
return (rc >= 0 ? 0 : rc);
}
@@ -1446,7 +1468,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
}
- sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid)
security_release_secctx(ctx, len);
@@ -1459,7 +1481,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
- sig_data, sizeof(*sig_data) + len);
+ sig_data, struct_size(sig_data, ctx, len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -1542,6 +1564,20 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_next(nlh, &len);
}
audit_ctl_unlock();
+
+ /* can't block with the ctrl lock, so penalize the sender now */
+ if (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
+
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(audit_backlog_wait_time);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ }
}
/* Log information about who is connecting to the audit multicast socket */
@@ -1609,7 +1645,8 @@ static int __net_init audit_net_init(struct net *net)
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
- aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ /* limit the timeout in case auditd is blocked/stopped */
+ aunet->sk->sk_sndtimeo = HZ / 10;
return 0;
}
@@ -1825,7 +1862,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
- * while holding the mutex */
+ * while holding the mutex, although we do penalize the sender
+ * later in audit_receive() when it is safe to block
+ */
if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
@@ -2132,7 +2171,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;
- security_task_getsecid_subj(current, &sid);
+ security_current_getsecid_subj(&sid);
if (!sid)
return 0;
@@ -2353,7 +2392,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid_subj(current, &audit_sig_sid);
+ security_current_getsecid_subj(&audit_sig_sid);
}
return audit_signal_info_syscall(t);
diff --git a/kernel/audit.h b/kernel/audit.h
index c4498090a5bd..58b66543b4d5 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -201,6 +201,10 @@ struct audit_context {
struct {
char *name;
} module;
+ struct {
+ struct audit_ntp_data ntp_data;
+ struct timespec64 tk_injoffset;
+ } time;
};
int fds[2];
struct audit_proctitle proctitle;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 72324afcffef..e7315d487163 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
- tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
+ tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index d75acb014ccd..42d99896e7a6 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -637,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;
- data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
memset(data, 0, sizeof(*data));
@@ -1092,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
break;
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
data,
- sizeof(*data) + data->buflen);
+ struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
@@ -1368,8 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_task_getsecid_subj(current,
- &sid);
+ security_current_getsecid_subj(&sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b517947bfa48..ea2ee1181921 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -185,7 +185,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
case AUDITSC_OPENAT2:
- return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags);
+ return mask & ACC_MODE((u32)ctx->openat2.flags);
default:
return 0;
}
@@ -666,7 +666,16 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
- security_task_getsecid_subj(tsk, &sid);
+ /* @tsk should always be equal to
+ * @current with the exception of
+ * fork()/copy_process() in which case
+ * the new @tsk creds are still a dup
+ * of @current's creds so we can still
+ * use security_current_getsecid_subj()
+ * here even though it always refs
+ * @current's creds
+ */
+ security_current_getsecid_subj(&sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
@@ -1331,6 +1340,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
from_kuid(&init_user_ns, name->fcap.rootid));
}
+static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
+{
+ const struct audit_ntp_data *ntp = &context->time.ntp_data;
+ const struct timespec64 *tk = &context->time.tk_injoffset;
+ static const char * const ntp_name[] = {
+ "offset",
+ "freq",
+ "status",
+ "tai",
+ "tick",
+ "adjust",
+ };
+ int type;
+
+ if (context->type == AUDIT_TIME_ADJNTPVAL) {
+ for (type = 0; type < AUDIT_NTP_NVALS; type++) {
+ if (ntp->vals[type].newval != ntp->vals[type].oldval) {
+ if (!*ab) {
+ *ab = audit_log_start(context,
+ GFP_KERNEL,
+ AUDIT_TIME_ADJNTPVAL);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "op=%s old=%lli new=%lli",
+ ntp_name[type],
+ ntp->vals[type].oldval,
+ ntp->vals[type].newval);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+ }
+ }
+ if (tk->tv_sec != 0 || tk->tv_nsec != 0) {
+ if (!*ab) {
+ *ab = audit_log_start(context, GFP_KERNEL,
+ AUDIT_TIME_INJOFFSET);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "sec=%lli nsec=%li",
+ (long long)tk->tv_sec, tk->tv_nsec);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1445,6 +1501,11 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "(null)");
break;
+ case AUDIT_TIME_ADJNTPVAL:
+ case AUDIT_TIME_INJOFFSET:
+ /* this call deviates from the rest, eating the buffer */
+ audit_log_time(context, &ab);
+ break;
}
audit_log_end(ab);
}
@@ -2840,31 +2901,26 @@ void __audit_fanotify(unsigned int response)
void __audit_tk_injoffset(struct timespec64 offset)
{
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
- "sec=%lli nsec=%li",
- (long long)offset.tv_sec, offset.tv_nsec);
-}
-
-static void audit_log_ntp_val(const struct audit_ntp_data *ad,
- const char *op, enum audit_ntp_type type)
-{
- const struct audit_ntp_val *val = &ad->vals[type];
-
- if (val->newval == val->oldval)
- return;
+ struct audit_context *context = audit_context();
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
- "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+ /* only set type if not already set by NTP */
+ if (!context->type)
+ context->type = AUDIT_TIME_INJOFFSET;
+ memcpy(&context->time.tk_injoffset, &offset, sizeof(offset));
}
void __audit_ntp_log(const struct audit_ntp_data *ad)
{
- audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
- audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
- audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
- audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
- audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
- audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
+ struct audit_context *context = audit_context();
+ int type;
+
+ for (type = 0; type < AUDIT_NTP_NVALS; type++)
+ if (ad->vals[type].newval != ad->vals[type].oldval) {
+ /* unconditionally set type, overwriting TK */
+ context->type = AUDIT_TIME_ADJNTPVAL;
+ memcpy(&context->time.ntp_data, ad, sizeof(*ad));
+ break;
+ }
}
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index d24d518ddd63..d56ee177d5f8 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -30,6 +30,7 @@ config BPF_SYSCALL
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
+ select PAGE_POOL if NET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
@@ -58,6 +59,10 @@ config BPF_JIT_ALWAYS_ON
Enables BPF JIT and removes BPF interpreter to avoid speculative
execution of BPF instructions by the interpreter.
+ When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable
+ is permanently set to 1 and setting any other value than that will
+ return failure.
+
config BPF_JIT_DEFAULT_ON
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
depends on HAVE_EBPF_JIT && BPF_JIT
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index cf6ca339f3cd..c1a9be6a4b9f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
+
+obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
+$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
+ $(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c7a5be3bf8be..7f145aefbff8 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -837,13 +837,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
if (IS_ERR(prog))
return prog;
- if (!bpf_prog_array_compatible(array, prog)) {
+ if (!bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
@@ -1071,7 +1070,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
- spin_lock_init(&aux->owner.lock);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
index 277a05e9c984..b141a1346f72 100644
--- a/kernel/bpf/bloom_filter.c
+++ b/kernel/bpf/bloom_filter.c
@@ -82,6 +82,11 @@ static int bloom_map_delete_elem(struct bpf_map *map, void *value)
return -EOPNOTSUPP;
}
+static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
{
u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
@@ -192,6 +197,7 @@ const struct bpf_map_ops bloom_filter_map_ops = {
.map_meta_equal = bpf_map_meta_equal,
.map_alloc = bloom_map_alloc,
.map_free = bloom_map_free,
+ .map_get_next_key = bloom_map_get_next_key,
.map_push_elem = bloom_map_push_elem,
.map_peek_elem = bloom_map_peek_elem,
.map_pop_elem = bloom_map_pop_elem,
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index 96ceed0e0fb5..96be8d518885 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -17,6 +17,7 @@
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
@@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
if (!bsb)
return NULL;
- inode_storage = rcu_dereference(bsb->storage);
+ inode_storage =
+ rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
if (!inode_storage)
return NULL;
@@ -134,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(f->f_inode,
(struct bpf_local_storage_map *)map,
- value, map_flags);
+ value, map_flags, GFP_ATOMIC);
fput(f);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -167,11 +169,13 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
-BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
- void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
@@ -193,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, gfp_flags);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
@@ -204,6 +208,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
BPF_CALL_2(bpf_inode_storage_delete,
struct bpf_map *, map, struct inode *, inode)
{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!inode)
return -EINVAL;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b2ee45064e06..110029ede71e 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -5,6 +5,7 @@
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
struct bpf_iter_target_info {
struct list_head list;
@@ -684,11 +685,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
int ret;
- rcu_read_lock();
- migrate_disable();
- ret = bpf_prog_run(prog, ctx);
- migrate_enable();
- rcu_read_unlock();
+ if (prog->aux->sleepable) {
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+ ret = bpf_prog_run(prog, ctx);
+ migrate_enable();
+ rcu_read_unlock_trace();
+ } else {
+ rcu_read_lock();
+ migrate_disable();
+ ret = bpf_prog_run(prog, ctx);
+ migrate_enable();
+ rcu_read_unlock();
+ }
/* bpf program can only return 0 or 1:
* 0 : okay
@@ -714,3 +724,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
+
+/* maximum number of loops */
+#define MAX_LOOPS BIT(23)
+
+BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
+ u64, flags)
+{
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 ret;
+ u32 i;
+
+ if (flags)
+ return -EINVAL;
+ if (nr_loops > MAX_LOOPS)
+ return -E2BIG;
+
+ for (i = 0; i < nr_loops; i++) {
+ ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ return i + 1;
+ }
+
+ return i;
+}
+
+const struct bpf_func_proto bpf_loop_proto = {
+ .func = bpf_loop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index b305270b7a4b..01aa2b51ec4d 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -11,6 +11,9 @@
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
@@ -60,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem)
+ void *value, bool charge_mem, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
@@ -68,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- GFP_ATOMIC | __GFP_NOWARN);
+ gfp_flags | __GFP_NOWARN);
if (selem) {
if (value)
memcpy(SDATA(selem)->data, value, smap->map.value_size);
@@ -81,6 +84,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
+void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ kfree_rcu(local_storage, rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ kfree_rcu(selem, rcu);
+}
+
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
@@ -93,7 +112,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
bool free_local_storage;
void *owner;
- smap = rcu_dereference(SDATA(selem)->smap);
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
owner = local_storage->owner;
/* All uncharging on the owner must be done first.
@@ -117,13 +136,13 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
* will be done by the caller.
*
* Although the unlock will be done under
- * rcu_read_lock(), it is more intutivie to
- * read if kfree_rcu(local_storage, rcu) is done
+ * rcu_read_lock(), it is more intuitive to
+ * read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
* Hence, a "bool free_local_storage" is returned
- * to the caller which then calls the kfree_rcu()
- * after unlock.
+ * to the caller which then calls then frees the storage after
+ * all the RCU grace periods have expired.
*/
}
hlist_del_init_rcu(&selem->snode);
@@ -131,8 +150,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
- kfree_rcu(selem, rcu);
-
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
return free_local_storage;
}
@@ -146,7 +164,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
/* selem has already been unlinked from sk */
return;
- local_storage = rcu_dereference(selem->local_storage);
+ local_storage = rcu_dereference_check(selem->local_storage,
+ bpf_rcu_lock_held());
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
@@ -154,7 +173,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_local_storage)
- kfree_rcu(local_storage, rcu);
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_rcu);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@@ -174,7 +194,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
/* selem has already be unlinked from smap */
return;
- smap = rcu_dereference(SDATA(selem)->smap);
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
b = select_bucket(smap, selem);
raw_spin_lock_irqsave(&b->lock, flags);
if (likely(selem_linked_to_map(selem)))
@@ -213,12 +233,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem;
/* Fast path (cache hit) */
- sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
+ sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
+ bpf_rcu_lock_held());
if (sdata && rcu_access_pointer(sdata->smap) == smap)
return sdata;
/* Slow path (cache miss) */
- hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
+ rcu_read_lock_trace_held())
if (rcu_access_pointer(SDATA(selem)->smap) == smap)
break;
@@ -260,7 +282,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem)
+ struct bpf_local_storage_elem *first_selem,
+ gfp_t gfp_flags)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
@@ -271,7 +294,7 @@ int bpf_local_storage_alloc(void *owner,
return err;
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- GFP_ATOMIC | __GFP_NOWARN);
+ gfp_flags | __GFP_NOWARN);
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -306,7 +329,8 @@ int bpf_local_storage_alloc(void *owner,
* bucket->list, first_selem can be freed immediately
* (instead of kfree_rcu) because
* bpf_local_storage_map_free() does a
- * synchronize_rcu() before walking the bucket->list.
+ * synchronize_rcu_mult (waiting for both sleepable and
+ * normal programs) before walking the bucket->list.
* Hence, no one is accessing selem from the
* bucket->list under rcu_read_lock().
*/
@@ -327,10 +351,10 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags)
+ void *value, u64 map_flags, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
- struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_elem *selem = NULL;
struct bpf_local_storage *local_storage;
unsigned long flags;
int err;
@@ -342,18 +366,22 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!map_value_has_spin_lock(&smap->map)))
return ERR_PTR(-EINVAL);
- local_storage = rcu_dereference(*owner_storage(smap, owner));
+ if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
+ return ERR_PTR(-EINVAL);
+
+ local_storage = rcu_dereference_check(*owner_storage(smap, owner),
+ bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
/* Very first elem for the owner */
err = check_flags(NULL, map_flags);
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true);
+ selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem);
+ err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
kfree(selem);
mem_uncharge(smap, owner, smap->elem_size);
@@ -380,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
}
}
+ if (gfp_flags == GFP_KERNEL) {
+ selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ if (!selem)
+ return ERR_PTR(-ENOMEM);
+ }
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
/* Recheck local_storage->list under local_storage->lock */
@@ -405,19 +439,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
goto unlock;
}
- /* local_storage->lock is held. Hence, we are sure
- * we can unlink and uncharge the old_sdata successfully
- * later. Hence, instead of charging the new selem now
- * and then uncharge the old selem later (which may cause
- * a potential but unnecessary charge failure), avoid taking
- * a charge at all here (the "!old_sdata" check) and the
- * old_sdata will not be uncharged later during
- * bpf_selem_unlink_storage_nolock().
- */
- selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
+ if (gfp_flags != GFP_KERNEL) {
+ /* local_storage->lock is held. Hence, we are sure
+ * we can unlink and uncharge the old_sdata successfully
+ * later. Hence, instead of charging the new selem now
+ * and then uncharge the old selem later (which may cause
+ * a potential but unnecessary charge failure), avoid taking
+ * a charge at all here (the "!old_sdata" check) and the
+ * old_sdata will not be uncharged later during
+ * bpf_selem_unlink_storage_nolock().
+ */
+ selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags);
+ if (!selem) {
+ err = -ENOMEM;
+ goto unlock_err;
+ }
}
/* First, link the new selem to the map */
@@ -439,6 +475,10 @@ unlock:
unlock_err:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ if (selem) {
+ mem_uncharge(smap, owner, smap->elem_size);
+ kfree(selem);
+ }
return ERR_PTR(err);
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 06062370c3b8..064eccba641d 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -99,6 +99,24 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.allowed = bpf_ima_inode_hash_allowed,
};
+BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
+{
+ return ima_file_hash(file, dst, size);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
+
+static const struct bpf_func_proto bpf_ima_file_hash_proto = {
+ .func = bpf_ima_file_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -121,6 +139,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+ case BPF_FUNC_ima_file_hash:
+ return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
default:
return tracing_prog_func_proto(func_id, prog);
}
@@ -167,6 +187,7 @@ BTF_ID(func, bpf_lsm_inode_setxattr)
BTF_ID(func, bpf_lsm_inode_symlink)
BTF_ID(func, bpf_lsm_inode_unlink)
BTF_ID(func, bpf_lsm_kernel_module_request)
+BTF_ID(func, bpf_lsm_kernel_read_file)
BTF_ID(func, bpf_lsm_kernfs_init_security)
#ifdef CONFIG_KEYS
@@ -207,7 +228,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_task_getsecid_subj)
+BTF_ID(func, bpf_lsm_current_getsecid_subj)
BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 8ecfe4752769..21069dbe9138 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -165,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
break;
}
- if (btf_member_bitfield_size(t, member)) {
+ if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
break;
@@ -296,7 +296,7 @@ static int check_zero_holes(const struct btf_type *t, void *data)
const struct btf_type *mtype;
for_each_member(i, t, member) {
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (moff > prev_mend &&
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
@@ -387,7 +387,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_prog *prog;
u32 moff;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index ebfa8bc90892..6638a0ecc3d2 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -17,6 +17,7 @@
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
@@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
struct bpf_local_storage *task_storage;
struct bpf_local_storage_map *smap;
- task_storage = rcu_dereference(task->bpf_storage);
+ task_storage =
+ rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
if (!task_storage)
return NULL;
@@ -172,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
- task, (struct bpf_local_storage_map *)map, value, map_flags);
+ task, (struct bpf_local_storage_map *)map, value, map_flags,
+ GFP_ATOMIC);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
@@ -224,11 +227,13 @@ out:
return err;
}
-BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
@@ -247,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, gfp_flags);
unlock:
bpf_task_storage_unlock();
@@ -260,6 +265,7 @@ BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
{
int ret;
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
@@ -323,7 +329,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &btf_task_struct_ids[0],
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
@@ -334,5 +340,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
- .arg2_btf_id = &btf_task_struct_ids[0],
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index dbc3ad07e21b..0918a39279f6 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */
#include <uapi/linux/btf.h>
@@ -25,6 +25,7 @@
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <net/sock.h>
+#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
@@ -197,6 +198,21 @@
DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
+enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_XDP,
+ BTF_KFUNC_HOOK_TC,
+ BTF_KFUNC_HOOK_STRUCT_OPS,
+ BTF_KFUNC_HOOK_MAX,
+};
+
+enum {
+ BTF_KFUNC_SET_MAX_CNT = 32,
+};
+
+struct btf_kfunc_set_tab {
+ struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -211,6 +227,7 @@ struct btf {
refcount_t refcnt;
u32 id;
struct rcu_head rcu;
+ struct btf_kfunc_set_tab *kfunc_set_tab;
/* split BTF support */
struct btf *base_btf;
@@ -282,6 +299,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = "DATASEC",
[BTF_KIND_FLOAT] = "FLOAT",
[BTF_KIND_DECL_TAG] = "DECL_TAG",
+ [BTF_KIND_TYPE_TAG] = "TYPE_TAG",
};
const char *btf_type_str(const struct btf_type *t)
@@ -401,6 +419,9 @@ static struct btf_type btf_void;
static int btf_resolve(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id);
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t);
+
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
@@ -418,6 +439,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
return true;
}
@@ -503,6 +525,50 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
return -ENOENT;
}
+static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+{
+ struct btf *btf;
+ s32 ret;
+ int id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -EINVAL;
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * positive btf_id or negative error.
+ */
+ if (ret > 0) {
+ btf_get(btf);
+ *btf_p = btf;
+ return ret;
+ }
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, btf, id) {
+ if (!btf_is_module(btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(btf, name, kind);
+ if (ret > 0) {
+ *btf_p = btf;
+ return ret;
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ return ret;
+}
+
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
{
@@ -576,6 +642,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
+ btf_type_is_func(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -834,7 +901,7 @@ static const char *btf_show_name(struct btf_show *show)
const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
const char *name = NULL, *prefix = "", *parens = "";
const struct btf_member *m = show->state.member;
- const struct btf_type *t = show->state.type;
+ const struct btf_type *t;
const struct btf_array *array;
u32 id = show->state.type_id;
const char *member = NULL;
@@ -1528,8 +1595,30 @@ static void btf_free_id(struct btf *btf)
spin_unlock_irqrestore(&btf_idr_lock, flags);
}
+static void btf_free_kfunc_set_tab(struct btf *btf)
+{
+ struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
+ int hook, type;
+
+ if (!tab)
+ return;
+ /* For module BTF, we directly assign the sets being registered, so
+ * there is nothing to free except kfunc_set_tab.
+ */
+ if (btf_is_module(btf))
+ goto free_tab;
+ for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) {
+ for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++)
+ kfree(tab->sets[hook][type]);
+ }
+free_tab:
+ kfree(tab);
+ btf->kfunc_set_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_kfunc_set_tab(btf);
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
@@ -1737,6 +1826,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
id = type->type;
type = btf_type_by_id(btf, type->type);
break;
@@ -2345,6 +2435,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
+ const char *value;
+
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@@ -2360,7 +2452,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- /* typedef type must have a valid name, and other ref types,
+ /* typedef/type_tag type must have a valid name, and other ref types,
* volatile, const, restrict, should have a null name.
*/
if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
@@ -2369,6 +2461,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
+ } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
} else {
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
@@ -2493,7 +2591,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
*
* We now need to continue from the last-resolved-ptr to
* ensure the last-resolved-ptr will not referring back to
- * the currenct ptr (t).
+ * the current ptr (t).
*/
if (btf_type_is_modifier(next_type)) {
const struct btf_type *resolved_type;
@@ -2958,7 +3056,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- offset = btf_member_bit_offset(t, member);
+ offset = __btf_member_bit_offset(t, member);
if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
@@ -3083,7 +3181,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
if (off != -ENOENT)
/* only one such field is allowed */
return -E2BIG;
- off = btf_member_bit_offset(t, member);
+ off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
@@ -3173,8 +3271,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
btf_show_start_member(show, member);
- member_offset = btf_member_bit_offset(t, member);
- bitfield_size = btf_member_bitfield_size(t, member);
+ member_offset = __btf_member_bit_offset(t, member);
+ bitfield_size = __btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
@@ -3521,9 +3619,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return 0;
}
+static int btf_func_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ int err;
+
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+ return 0;
+}
+
static struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
- .resolve = btf_df_resolve,
+ .resolve = btf_func_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_ref_type_log,
@@ -4059,6 +4172,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_DATASEC] = &datasec_ops,
[BTF_KIND_FLOAT] = &float_ops,
[BTF_KIND_DECL_TAG] = &decl_tag_ops,
+ [BTF_KIND_TYPE_TAG] = &modifier_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -4143,7 +4257,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return !btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
- if (btf_type_is_decl_tag(t))
+ if (btf_type_is_decl_tag(t) || btf_type_is_func(t))
return btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
@@ -4233,12 +4347,6 @@ static int btf_check_all_types(struct btf_verifier_env *env)
if (err)
return err;
}
-
- if (btf_type_is_func(t)) {
- err = btf_func_check(env, t);
- if (err)
- return err;
- }
}
return 0;
@@ -4374,8 +4482,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
btf = env->btf;
btf_data_size = btf->data_size;
- if (btf_data_size <
- offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) {
+ if (btf_data_size < offsetofend(struct btf_header, hdr_len)) {
btf_verifier_log(env, "hdr_len not found");
return -EINVAL;
}
@@ -4460,8 +4567,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
log->len_total = log_size;
/* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
- !log->level || !log->ubuf) {
+ if (!bpf_verifier_log_attr_valid(log)) {
err = -EINVAL;
goto errout;
}
@@ -4814,7 +4920,7 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
-static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
/* t comes in already as a pointer */
t = btf_type_by_id(btf, t->type);
@@ -4823,8 +4929,7 @@ static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
t = btf_type_by_id(btf, t->type);
- /* char, signed char, unsigned char */
- return btf_type_is_int(t) && t->size == 1;
+ return btf_type_is_int(t);
}
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
@@ -4837,6 +4942,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
+ const char *tag_value;
u32 nr_args, arg;
int i, ret;
@@ -4929,10 +5035,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+ u32 type, flag;
- if (ctx_arg_info->offset == off &&
- (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
- ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
+ type = base_type(ctx_arg_info->reg_type);
+ flag = type_flag(ctx_arg_info->reg_type);
+ if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
+ (flag & PTR_MAYBE_NULL)) {
info->reg_type = ctx_arg_info->reg_type;
return true;
}
@@ -4945,7 +5053,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
- if (is_string_ptr(btf, t))
+ if (is_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
@@ -4987,6 +5095,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->btf = btf;
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tag_value, "user") == 0)
+ info->reg_type |= MEM_USER;
+ if (strcmp(tag_value, "percpu") == 0)
+ info->reg_type |= MEM_PERCPU;
+ }
+
/* skip modifiers */
while (btf_type_is_modifier(t)) {
info->btf_id = t->type;
@@ -5013,12 +5130,12 @@ enum bpf_struct_walk_result {
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
- u32 *next_btf_id)
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
- const char *tname, *mname;
+ const char *tname, *mname, *tag_value;
u32 vlen, elem_id, mid;
again:
@@ -5048,7 +5165,7 @@ again:
if (array_elem->nelems != 0)
goto error;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off < moff)
goto error;
@@ -5071,14 +5188,14 @@ error:
for_each_member(i, t, member) {
/* offset of the field in bytes */
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
- if (btf_member_bitfield_size(t, member)) {
- u32 end_bit = btf_member_bit_offset(t, member) +
- btf_member_bitfield_size(t, member);
+ if (__btf_member_bitfield_size(t, member)) {
+ u32 end_bit = __btf_member_bit_offset(t, member) +
+ __btf_member_bitfield_size(t, member);
/* off <= moff instead of off == moff because clang
* does not generate a BTF member for anonymous
@@ -5202,7 +5319,8 @@ error:
}
if (btf_type_is_ptr(mtype)) {
- const struct btf_type *stype;
+ const struct btf_type *stype, *t;
+ enum bpf_type_flag tmp_flag = 0;
u32 id;
if (msize != size || off != moff) {
@@ -5211,9 +5329,23 @@ error:
mname, moff, tname, off, size);
return -EACCES;
}
+
+ /* check type tag */
+ t = btf_type_by_id(btf, mtype->type);
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ /* check __user tag */
+ if (strcmp(tag_value, "user") == 0)
+ tmp_flag = MEM_USER;
+ /* check __percpu tag */
+ if (strcmp(tag_value, "percpu") == 0)
+ tmp_flag = MEM_PERCPU;
+ }
+
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
+ *flag = tmp_flag;
return WALK_PTR;
}
}
@@ -5240,13 +5372,14 @@ error:
int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype __maybe_unused,
- u32 *next_btf_id)
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
+ enum bpf_type_flag tmp_flag = 0;
int err;
u32 id;
do {
- err = btf_struct_walk(log, btf, t, off, size, &id);
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
switch (err) {
case WALK_PTR:
@@ -5254,6 +5387,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
* we're done.
*/
*next_btf_id = id;
+ *flag = tmp_flag;
return PTR_TO_BTF_ID;
case WALK_SCALAR:
return SCALAR_VALUE;
@@ -5298,6 +5432,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log,
const struct btf *need_btf, u32 need_type_id)
{
const struct btf_type *type;
+ enum bpf_type_flag flag;
int err;
/* Are we already done? */
@@ -5308,7 +5443,7 @@ again:
type = btf_type_by_id(btf, id);
if (!type)
return false;
- err = btf_struct_walk(log, btf, type, off, 1, &id);
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
if (err != WALK_STRUCT)
return false;
@@ -5372,7 +5507,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
}
args = (const struct btf_param *)(func + 1);
nargs = btf_type_vlen(func);
- if (nargs >= MAX_BPF_FUNC_ARGS) {
+ if (nargs > MAX_BPF_FUNC_ARGS) {
bpf_log(log,
"The function %s has %d arguments. Too many.\n",
tname, nargs);
@@ -5563,16 +5698,85 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#endif
};
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
+{
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
+
+ if (!btf_type_is_struct(t))
+ return false;
+
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
+
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ bpf_log(log, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_type_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
+
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ int len, sfx_len = sizeof("__sz") - 1;
+ const struct btf_type *t;
+ const char *param_name;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len < sfx_len)
+ return false;
+ param_name += len - sfx_len;
+ if (strncmp(param_name, "__sz", sfx_len))
+ return false;
+
+ return true;
+}
+
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
+ u32 i, nargs, ref_id, ref_obj_id = 0;
+ bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
- u32 i, nargs, ref_id;
+ int ref_regno = 0, ret;
+ bool rel = false;
t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
@@ -5598,6 +5802,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* Only kfunc can be release func */
+ if (is_kfunc)
+ rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_RELEASE, func_id);
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
@@ -5621,7 +5829,24 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- if (btf_is_kernel(btf)) {
+
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel);
+ if (ret < 0)
+ return ret;
+
+ if (btf_get_prog_ctx_type(log, btf, t,
+ env->prog->type, i)) {
+ /* If function expects ctx type in BTF check that caller
+ * is passing PTR_TO_CTX.
+ */
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
+ }
+ } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
+ (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
const struct btf_type *reg_ref_t;
const struct btf *reg_btf;
const char *reg_ref_tname;
@@ -5637,14 +5862,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
- } else if (reg2btf_ids[reg->type]) {
- reg_btf = btf_vmlinux;
- reg_ref_id = *reg2btf_ids[reg->type];
+ /* Ensure only one argument is referenced
+ * PTR_TO_BTF_ID, check_func_arg_reg_off relies
+ * on only one referenced register being allowed
+ * for kfuncs.
+ */
+ if (reg->ref_obj_id) {
+ if (ref_obj_id) {
+ bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id, ref_obj_id);
+ return -EFAULT;
+ }
+ ref_regno = regno;
+ ref_obj_id = reg->ref_obj_id;
+ }
} else {
- bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
- func_name, i,
- btf_type_str(ref_t), ref_tname, regno);
- return -EINVAL;
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
}
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
@@ -5660,23 +5894,40 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_tname);
return -EINVAL;
}
- } else if (btf_get_prog_ctx_type(log, btf, t,
- env->prog->type, i)) {
- /* If function expects ctx type in BTF check that caller
- * is passing PTR_TO_CTX.
- */
- if (reg->type != PTR_TO_CTX) {
- bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_type_str(t));
- return -EINVAL;
- }
- if (check_ctx_reg(env, reg, regno))
- return -EINVAL;
} else if (ptr_to_mem_ok) {
const struct btf_type *resolve_ret;
u32 type_size;
+ if (is_kfunc) {
+ bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
+
+ /* Permit pointer to mem, but only when argument
+ * type is pointer to scalar, or struct composed
+ * (recursively) of scalars.
+ * When arg_mem_size is true, the pointer can be
+ * void *.
+ */
+ if (!btf_type_is_scalar(ref_t) &&
+ !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
+ bpf_log(log,
+ "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
+ return -EINVAL;
+ }
+
+ /* Check for mem, len pair */
+ if (arg_mem_size) {
+ if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
+ bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n",
+ i, i + 1);
+ return -EINVAL;
+ }
+ i++;
+ continue;
+ }
+ }
+
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
if (IS_ERR(resolve_ret)) {
bpf_log(log,
@@ -5689,11 +5940,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (check_mem_reg(env, reg, regno, type_size))
return -EINVAL;
} else {
+ bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
+ is_kfunc ? "kernel " : "", func_name, func_id);
return -EINVAL;
}
}
- return 0;
+ /* Either both are set, or neither */
+ WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno));
+ /* We already made sure ref_obj_id is set only for one argument. We do
+ * allow (!rel && ref_obj_id), so that passing such referenced
+ * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when
+ * is_kfunc is true.
+ */
+ if (rel && !ref_obj_id) {
+ bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+ func_name);
+ return -EINVAL;
+ }
+ /* returns argument register number > 0 in case of reference release kfunc */
+ return rel ? ref_regno : 0;
}
/* Compare BTF of a function with given bpf_reg_state.
@@ -5738,7 +6004,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs)
{
- return btf_check_func_arg_match(env, btf, func_id, regs, false);
+ return btf_check_func_arg_match(env, btf, func_id, regs, true);
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -5846,7 +6112,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
}
- reg->type = PTR_TO_MEM_OR_NULL;
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
reg->id = ++env->id_gen;
continue;
@@ -5939,7 +6205,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
- /* If we encontered an error, return it. */
+ /* If we encountered an error, return it. */
if (ssnprintf.show.state.status)
return ssnprintf.show.state.status;
@@ -6135,12 +6401,17 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
}
+enum {
+ BTF_MODULE_F_LIVE = (1 << 0),
+};
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
struct btf_module {
struct list_head list;
struct module *module;
struct btf *btf;
struct bin_attribute *sysfs_attr;
+ int flags;
};
static LIST_HEAD(btf_modules);
@@ -6157,6 +6428,8 @@ btf_module_read(struct file *file, struct kobject *kobj,
return len;
}
+static void purge_cand_cache(struct btf *btf);
+
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
void *module)
{
@@ -6166,7 +6439,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
int err = 0;
if (mod->btf_data_size == 0 ||
- (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE &&
+ op != MODULE_STATE_GOING))
goto out;
switch (op) {
@@ -6181,7 +6455,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
pr_warn("failed to validate module [%s] BTF: %ld\n",
mod->name, PTR_ERR(btf));
kfree(btf_mod);
- err = PTR_ERR(btf);
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ err = PTR_ERR(btf);
goto out;
}
err = btf_alloc_id(btf);
@@ -6191,6 +6466,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
goto out;
}
+ purge_cand_cache(NULL);
mutex_lock(&btf_module_mutex);
btf_mod->module = module;
btf_mod->btf = btf;
@@ -6224,6 +6500,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
}
break;
+ case MODULE_STATE_LIVE:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_mod->flags |= BTF_MODULE_F_LIVE;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
case MODULE_STATE_GOING:
mutex_lock(&btf_module_mutex);
list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
@@ -6233,6 +6520,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
list_del(&btf_mod->list);
if (btf_mod->sysfs_attr)
sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
+ purge_cand_cache(btf_mod->btf);
btf_put(btf_mod->btf);
kfree(btf_mod->sysfs_attr);
kfree(btf_mod);
@@ -6269,7 +6557,12 @@ struct module *btf_try_get_module(const struct btf *btf)
if (btf_mod->btf != btf)
continue;
- if (try_module_get(btf_mod->module))
+ /* We must only consider module whose __init routine has
+ * finished, hence we must check for BTF_MODULE_F_LIVE flag,
+ * which is set from the notifier callback for
+ * MODULE_STATE_LIVE.
+ */
+ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module))
res = btf_mod->module;
break;
@@ -6280,9 +6573,43 @@ struct module *btf_try_get_module(const struct btf *btf)
return res;
}
+/* Returns struct btf corresponding to the struct module.
+ * This function can return NULL or ERR_PTR.
+ */
+static struct btf *btf_get_module_btf(const struct module *module)
+{
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+#endif
+ struct btf *btf = NULL;
+
+ if (!module) {
+ btf = bpf_get_btf_vmlinux();
+ if (!IS_ERR_OR_NULL(btf))
+ btf_get(btf);
+ return btf;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_get(btf_mod->btf);
+ btf = btf_mod->btf;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return btf;
+}
+
BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
{
- struct btf *btf;
+ struct btf *btf = NULL;
+ int btf_obj_fd = 0;
long ret;
if (flags)
@@ -6291,44 +6618,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int
if (name_sz <= 1 || name[name_sz - 1])
return -EINVAL;
- btf = bpf_get_btf_vmlinux();
- if (IS_ERR(btf))
- return PTR_ERR(btf);
-
- ret = btf_find_by_name_kind(btf, name, kind);
- /* ret is never zero, since btf_find_by_name_kind returns
- * positive btf_id or negative error.
- */
- if (ret < 0) {
- struct btf *mod_btf;
- int id;
-
- /* If name is not found in vmlinux's BTF then search in module's BTFs */
- spin_lock_bh(&btf_idr_lock);
- idr_for_each_entry(&btf_idr, mod_btf, id) {
- if (!btf_is_module(mod_btf))
- continue;
- /* linear search could be slow hence unlock/lock
- * the IDR to avoiding holding it for too long
- */
- btf_get(mod_btf);
- spin_unlock_bh(&btf_idr_lock);
- ret = btf_find_by_name_kind(mod_btf, name, kind);
- if (ret > 0) {
- int btf_obj_fd;
-
- btf_obj_fd = __btf_new_fd(mod_btf);
- if (btf_obj_fd < 0) {
- btf_put(mod_btf);
- return btf_obj_fd;
- }
- return ret | (((u64)btf_obj_fd) << 32);
- }
- spin_lock_bh(&btf_idr_lock);
- btf_put(mod_btf);
+ ret = bpf_find_btf_id(name, kind, &btf);
+ if (ret > 0 && btf_is_module(btf)) {
+ btf_obj_fd = __btf_new_fd(btf);
+ if (btf_obj_fd < 0) {
+ btf_put(btf);
+ return btf_obj_fd;
}
- spin_unlock_bh(&btf_idr_lock);
+ return ret | (((u64)btf_obj_fd) << 32);
}
+ if (ret > 0)
+ btf_put(btf);
return ret;
}
@@ -6336,65 +6636,691 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
.func = bpf_btf_find_by_name_kind,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
-BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
+BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
+#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
+BTF_TRACING_TYPE_xxx
+#undef BTF_TRACING_TYPE
-/* BTF ID set registration API for modules */
+/* Kernel Function (kfunc) BTF ID set registration API */
-struct kfunc_btf_id_list {
- struct list_head list;
- struct mutex mutex;
+static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ enum btf_kfunc_type type,
+ struct btf_id_set *add_set, bool vmlinux_set)
+{
+ struct btf_kfunc_set_tab *tab;
+ struct btf_id_set *set;
+ u32 set_cnt;
+ int ret;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (!add_set->cnt)
+ return 0;
+
+ tab = btf->kfunc_set_tab;
+ if (!tab) {
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
+ if (!tab)
+ return -ENOMEM;
+ btf->kfunc_set_tab = tab;
+ }
+
+ set = tab->sets[hook][type];
+ /* Warn when register_btf_kfunc_id_set is called twice for the same hook
+ * for module sets.
+ */
+ if (WARN_ON_ONCE(set && !vmlinux_set)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* We don't need to allocate, concatenate, and sort module sets, because
+ * only one is allowed per hook. Hence, we can directly assign the
+ * pointer and return.
+ */
+ if (!vmlinux_set) {
+ tab->sets[hook][type] = add_set;
+ return 0;
+ }
+
+ /* In case of vmlinux sets, there may be more than one set being
+ * registered per hook. To create a unified set, we allocate a new set
+ * and concatenate all individual sets being registered. While each set
+ * is individually sorted, they may become unsorted when concatenated,
+ * hence re-sorting the final set again is required to make binary
+ * searching the set using btf_id_set_contains function work.
+ */
+ set_cnt = set ? set->cnt : 0;
+
+ if (set_cnt > U32_MAX - add_set->cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+
+ if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Grow set */
+ set = krealloc(tab->sets[hook][type],
+ offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!set) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ /* For newly allocated set, initialize set->cnt to 0 */
+ if (!tab->sets[hook][type])
+ set->cnt = 0;
+ tab->sets[hook][type] = set;
+
+ /* Concatenate the two sets */
+ memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0]));
+ set->cnt += add_set->cnt;
+
+ sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL);
+
+ return 0;
+end:
+ btf_free_kfunc_set_tab(btf);
+ return ret;
+}
+
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
+{
+ bool vmlinux_set = !btf_is_module(btf);
+ int type, ret = 0;
+
+ for (type = 0; type < ARRAY_SIZE(kset->sets); type++) {
+ if (!kset->sets[type])
+ continue;
+
+ ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static bool __btf_kfunc_id_set_contains(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ enum btf_kfunc_type type,
+ u32 kfunc_btf_id)
+{
+ struct btf_id_set *set;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX)
+ return false;
+ if (!btf->kfunc_set_tab)
+ return false;
+ set = btf->kfunc_set_tab->sets[hook][type];
+ if (!set)
+ return false;
+ return btf_id_set_contains(set, kfunc_btf_id);
+}
+
+static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
+{
+ switch (prog_type) {
+ case BPF_PROG_TYPE_XDP:
+ return BTF_KFUNC_HOOK_XDP;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ return BTF_KFUNC_HOOK_TC;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return BTF_KFUNC_HOOK_STRUCT_OPS;
+ default:
+ return BTF_KFUNC_HOOK_MAX;
+ }
+}
+
+/* Caution:
+ * Reference to the module (obtained using btf_try_get_module) corresponding to
+ * the struct btf *MUST* be held when calling this function from verifier
+ * context. This is usually true as we stash references in prog's kfunc_btf_tab;
+ * keeping the reference for the duration of the call provides the necessary
+ * protection for looking up a well-formed btf->kfunc_set_tab.
+ */
+bool btf_kfunc_id_set_contains(const struct btf *btf,
+ enum bpf_prog_type prog_type,
+ enum btf_kfunc_type type, u32 kfunc_btf_id)
+{
+ enum btf_kfunc_hook hook;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id);
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+ struct btf *btf;
+ int ret;
+
+ btf = btf_get_module_btf(kset->owner);
+ if (!btf) {
+ if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register kfuncs\n");
+ return -ENOENT;
+ }
+ if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+ pr_err("missing module BTF, cannot register kfuncs\n");
+ return -ENOENT;
+ }
+ return 0;
+ }
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ ret = btf_populate_kfunc_set(btf, hook, kset);
+ btf_put(btf);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+
+#define MAX_TYPES_ARE_COMPAT_DEPTH 2
+
+static
+int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id,
+ int level)
+{
+ const struct btf_type *local_type, *targ_type;
+ int depth = 32; /* max recursion depth */
+
+ /* caller made sure that names match (ignoring flavor suffix) */
+ local_type = btf_type_by_id(local_btf, local_id);
+ targ_type = btf_type_by_id(targ_btf, targ_id);
+ if (btf_kind(local_type) != btf_kind(targ_type))
+ return 0;
+
+recur:
+ depth--;
+ if (depth < 0)
+ return -EINVAL;
+
+ local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id);
+ targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id);
+ if (!local_type || !targ_type)
+ return -EINVAL;
+
+ if (btf_kind(local_type) != btf_kind(targ_type))
+ return 0;
+
+ switch (btf_kind(local_type)) {
+ case BTF_KIND_UNKN:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ case BTF_KIND_FWD:
+ return 1;
+ case BTF_KIND_INT:
+ /* just reject deprecated bitfield-like integers; all other
+ * integers are by default compatible between each other
+ */
+ return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0;
+ case BTF_KIND_PTR:
+ local_id = local_type->type;
+ targ_id = targ_type->type;
+ goto recur;
+ case BTF_KIND_ARRAY:
+ local_id = btf_array(local_type)->type;
+ targ_id = btf_array(targ_type)->type;
+ goto recur;
+ case BTF_KIND_FUNC_PROTO: {
+ struct btf_param *local_p = btf_params(local_type);
+ struct btf_param *targ_p = btf_params(targ_type);
+ __u16 local_vlen = btf_vlen(local_type);
+ __u16 targ_vlen = btf_vlen(targ_type);
+ int i, err;
+
+ if (local_vlen != targ_vlen)
+ return 0;
+
+ for (i = 0; i < local_vlen; i++, local_p++, targ_p++) {
+ if (level <= 0)
+ return -EINVAL;
+
+ btf_type_skip_modifiers(local_btf, local_p->type, &local_id);
+ btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id);
+ err = __bpf_core_types_are_compat(local_btf, local_id,
+ targ_btf, targ_id,
+ level - 1);
+ if (err <= 0)
+ return err;
+ }
+
+ /* tail recurse for return type check */
+ btf_type_skip_modifiers(local_btf, local_type->type, &local_id);
+ btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id);
+ goto recur;
+ }
+ default:
+ return 0;
+ }
+}
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ * kind should match for local and target types (i.e., STRUCT is not
+ * compatible with UNION);
+ * - for ENUMs, the size is ignored;
+ * - for INT, size and signedness are ignored;
+ * - for ARRAY, dimensionality is ignored, element types are checked for
+ * compatibility recursively;
+ * - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ * - FUNC_PROTOs are compatible if they have compatible signature: same
+ * number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
+int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id)
+{
+ return __bpf_core_types_are_compat(local_btf, local_id,
+ targ_btf, targ_id,
+ MAX_TYPES_ARE_COMPAT_DEPTH);
+}
+
+static bool bpf_core_is_flavor_sep(const char *s)
+{
+ /* check X___Y name pattern, where X and Y are not underscores */
+ return s[0] != '_' && /* X */
+ s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
+ s[4] != '_'; /* Y */
+}
+
+size_t bpf_core_essential_name_len(const char *name)
+{
+ size_t n = strlen(name);
+ int i;
+
+ for (i = n - 5; i >= 0; i--) {
+ if (bpf_core_is_flavor_sep(name + i))
+ return i + 1;
+ }
+ return n;
+}
+
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
};
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static void bpf_free_cands(struct bpf_cand_cache *cands)
+{
+ if (!cands->cnt)
+ /* empty candidate array was allocated on stack */
+ return;
+ kfree(cands);
+}
-void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
- struct kfunc_btf_id_set *s)
+static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
{
- mutex_lock(&l->mutex);
- list_add(&s->list, &l->list);
- mutex_unlock(&l->mutex);
+ kfree(cands->name);
+ kfree(cands);
}
-EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
-void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
- struct kfunc_btf_id_set *s)
+#define VMLINUX_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
+
+#define MODULE_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static void __print_cand_cache(struct bpf_verifier_log *log,
+ struct bpf_cand_cache **cache,
+ int cache_size)
{
- mutex_lock(&l->mutex);
- list_del_init(&s->list);
- mutex_unlock(&l->mutex);
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ bpf_log(log, "[%d]%s(", i, cc->name);
+ for (j = 0; j < cc->cnt; j++) {
+ bpf_log(log, "%d", cc->cands[j].id);
+ if (j < cc->cnt - 1)
+ bpf_log(log, " ");
+ }
+ bpf_log(log, "), ");
+ }
}
-EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
-bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
- struct module *owner)
+static void print_cand_cache(struct bpf_verifier_log *log)
{
- struct kfunc_btf_id_set *s;
+ mutex_lock(&cand_cache_mutex);
+ bpf_log(log, "vmlinux_cand_cache:");
+ __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ bpf_log(log, "\nmodule_cand_cache:");
+ __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ bpf_log(log, "\n");
+ mutex_unlock(&cand_cache_mutex);
+}
- if (!owner)
- return false;
- mutex_lock(&klist->mutex);
- list_for_each_entry(s, &klist->list, list) {
- if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
- mutex_unlock(&klist->mutex);
- return true;
+static u32 hash_cands(struct bpf_cand_cache *cands)
+{
+ return jhash(cands->name, cands->name_len, 0);
+}
+
+static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
+
+ if (cc && cc->name_len == cands->name_len &&
+ !strncmp(cc->name, cands->name, cands->name_len))
+ return cc;
+ return NULL;
+}
+
+static size_t sizeof_cands(int cnt)
+{
+ return offsetof(struct bpf_cand_cache, cands[cnt]);
+}
+
+static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
+
+ if (*cc) {
+ bpf_free_cands_from_cache(*cc);
+ *cc = NULL;
+ }
+ new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* strdup the name, since it will stay in cache.
+ * the cands->name points to strings in prog's BTF and the prog can be unloaded.
+ */
+ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
+ bpf_free_cands(cands);
+ if (!new_cands->name) {
+ kfree(new_cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ *cc = new_cands;
+ return new_cands;
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ if (!btf) {
+ /* when new module is loaded purge all of module_cand_cache,
+ * since new module might have candidates with the name
+ * that matches cached cands.
+ */
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ continue;
}
+ /* when module is unloaded purge cache entries
+ * that match module's btf
+ */
+ for (j = 0; j < cc->cnt; j++)
+ if (cc->cands[j].btf == btf) {
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ break;
+ }
}
- mutex_unlock(&klist->mutex);
- return false;
+
}
+static void purge_cand_cache(struct btf *btf)
+{
+ mutex_lock(&cand_cache_mutex);
+ __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ mutex_unlock(&cand_cache_mutex);
+}
#endif
-#define DEFINE_KFUNC_BTF_ID_LIST(name) \
- struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \
- __MUTEX_INITIALIZER(name.mutex) }; \
- EXPORT_SYMBOL_GPL(name)
+static struct bpf_cand_cache *
+bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
+ int targ_start_id)
+{
+ struct bpf_cand_cache *new_cands;
+ const struct btf_type *t;
+ const char *targ_name;
+ size_t targ_essent_len;
+ int n, i;
+
+ n = btf_nr_types(targ_btf);
+ for (i = targ_start_id; i < n; i++) {
+ t = btf_type_by_id(targ_btf, i);
+ if (btf_kind(t) != cands->kind)
+ continue;
+
+ targ_name = btf_name_by_offset(targ_btf, t->name_off);
+ if (!targ_name)
+ continue;
-DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
-DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+ /* the resched point is before strncmp to make sure that search
+ * for non-existing name will have a chance to schedule().
+ */
+ cond_resched();
+
+ if (strncmp(cands->name, targ_name, cands->name_len) != 0)
+ continue;
+
+ targ_essent_len = bpf_core_essential_name_len(targ_name);
+ if (targ_essent_len != cands->name_len)
+ continue;
+
+ /* most of the time there is only one candidate for a given kind+name pair */
+ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(new_cands, cands, sizeof_cands(cands->cnt));
+ bpf_free_cands(cands);
+ cands = new_cands;
+ cands->cands[cands->cnt].btf = targ_btf;
+ cands->cands[cands->cnt].id = i;
+ cands->cnt++;
+ }
+ return cands;
+}
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
+{
+ struct bpf_cand_cache *cands, *cc, local_cand = {};
+ const struct btf *local_btf = ctx->btf;
+ const struct btf_type *local_type;
+ const struct btf *main_btf;
+ size_t local_essent_len;
+ struct btf *mod_btf;
+ const char *name;
+ int id;
+
+ main_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(main_btf))
+ return ERR_CAST(main_btf);
+ if (!main_btf)
+ return ERR_PTR(-EINVAL);
+
+ local_type = btf_type_by_id(local_btf, local_type_id);
+ if (!local_type)
+ return ERR_PTR(-EINVAL);
+
+ name = btf_name_by_offset(local_btf, local_type->name_off);
+ if (str_is_empty(name))
+ return ERR_PTR(-EINVAL);
+ local_essent_len = bpf_core_essential_name_len(name);
+
+ cands = &local_cand;
+ cands->name = name;
+ cands->kind = btf_kind(local_type);
+ cands->name_len = local_essent_len;
+
+ cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ /* cands is a pointer to stack here */
+ if (cc) {
+ if (cc->cnt)
+ return cc;
+ goto check_modules;
+ }
+
+ /* Attempt to find target candidates in vmlinux BTF first */
+ cands = bpf_core_add_cands(cands, main_btf, 1);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
+
+ /* populate cache even when cands->cnt == 0 */
+ cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ if (IS_ERR(cc))
+ return ERR_CAST(cc);
+
+ /* if vmlinux BTF has any candidate, don't go for module BTFs */
+ if (cc->cnt)
+ return cc;
+
+check_modules:
+ /* cands is a pointer to stack here and cands->cnt == 0 */
+ cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ if (cc)
+ /* if cache has it return it even if cc->cnt == 0 */
+ return cc;
+
+ /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+ if (IS_ERR(cands)) {
+ btf_put(mod_btf);
+ return ERR_CAST(cands);
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(mod_btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0
+ * or pointer to stack if cands->cnd == 0.
+ * Copy it into the cache even when cands->cnt == 0 and
+ * return the result.
+ */
+ return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+}
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+ int relo_idx, void *insn)
+{
+ bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
+ struct bpf_core_cand_list cands = {};
+ struct bpf_core_relo_res targ_res;
+ struct bpf_core_spec *specs;
+ int err;
+
+ /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
+ * into arrays of btf_ids of struct fields and array indices.
+ */
+ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
+ if (!specs)
+ return -ENOMEM;
+
+ if (need_cands) {
+ struct bpf_cand_cache *cc;
+ int i;
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(ctx, relo->type_id);
+ if (IS_ERR(cc)) {
+ bpf_log(ctx->log, "target candidate search failed for %d\n",
+ relo->type_id);
+ err = PTR_ERR(cc);
+ goto out;
+ }
+ if (cc->cnt) {
+ cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+ if (!cands.cands) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < cc->cnt; i++) {
+ bpf_log(ctx->log,
+ "CO-RE relocating %s %s: found target candidate [%d]\n",
+ btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
+ cands.cands[i].btf = cc->cands[i].btf;
+ cands.cands[i].id = cc->cands[i].id;
+ }
+ cands.len = cc->cnt;
+ /* cand_cache_mutex needs to span the cache lookup and
+ * copy of btf pointer into bpf_core_cand_list,
+ * since module can be unloaded while bpf_core_calc_relo_insn
+ * is working with module's btf.
+ */
+ }
+
+ err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs,
+ &targ_res);
+ if (err)
+ goto out;
+
+ err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx,
+ &targ_res);
+
+out:
+ kfree(specs);
+ if (need_cands) {
+ kfree(cands.cands);
+ mutex_unlock(&cand_cache_mutex);
+ if (ctx->log->level & BPF_LOG_LEVEL2)
+ print_cand_cache(ctx->log);
+ }
+ return err;
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 2ca643af9a54..128028efda64 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1031,7 +1031,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.
@@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
* NET_XMIT_CN (2) - continue with packet output and notify TCP
* to call cwr
- * -EPERM - drop packet
+ * -err - drop packet
*
* For ingress packets, this function will return -EPERM if any
* attached program was found and if it returned != 1 during execution.
@@ -1079,8 +1079,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
} else {
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
- __bpf_prog_run_save_cb);
- ret = (ret == 1 ? 0 : -EPERM);
+ __bpf_prog_run_save_cb, 0);
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
}
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
@@ -1093,7 +1094,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* socket is passed is expected to be of type INET or INET6.
*
@@ -1107,10 +1108,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
- return ret == 1 ? 0 : -EPERM;
+ return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
+ bpf_prog_run, 0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1119,7 +1119,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).
@@ -1142,7 +1142,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
};
struct sockaddr_storage unspec;
struct cgroup *cgrp;
- int ret;
/* Check socket family since not all sockets represent network
* endpoint (e.g. AF_UNIX).
@@ -1156,10 +1155,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run, flags);
-
- return ret == 1 ? 0 : -EPERM;
+ return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0, flags);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -1169,7 +1166,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1184,11 +1181,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
- bpf_prog_run);
- return ret == 1 ? 0 : -EPERM;
+ return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
+ bpf_prog_run, 0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
@@ -1201,17 +1196,47 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
.major = major,
.minor = minor,
};
- int allow;
+ int ret;
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0);
rcu_read_unlock();
- return !allow;
+ return ret;
+}
+
+BPF_CALL_0(bpf_get_retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ return ctx->retval;
+}
+
+static const struct bpf_func_proto bpf_get_retval_proto = {
+ .func = bpf_get_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_set_retval, int, retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ ctx->retval = retval;
+ return 0;
}
+static const struct bpf_func_proto bpf_set_retval_proto = {
+ .func = bpf_set_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1224,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_retval:
+ return &bpf_get_retval_proto;
+ case BPF_FUNC_set_retval:
+ return &bpf_set_retval_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -1337,7 +1366,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0);
rcu_read_unlock();
kfree(ctx.cur_val);
@@ -1350,24 +1380,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.new_val);
}
- return ret == 1 ? 0 : -EPERM;
+ return ret;
}
#ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
- enum cgroup_bpf_attach_type attach_type)
-{
- struct bpf_prog_array *prog_array;
- bool empty;
-
- rcu_read_lock();
- prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
- empty = bpf_prog_array_is_empty(prog_array);
- rcu_read_unlock();
-
- return empty;
-}
-
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
struct bpf_sockopt_buf *buf)
{
@@ -1426,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
};
int ret, max_optlen;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
- return 0;
-
/* Allocate a bit more than the initial user buffer for
* BPF program. The canonical use case is overriding
* TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
*/
max_optlen = max_t(int, 16, *optlen);
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1452,13 +1460,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
- &ctx, bpf_prog_run);
+ &ctx, bpf_prog_run, 0);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
+ if (ret)
goto out;
- }
if (ctx.optlen == -1) {
/* optlen set to -1, bypass kernel */
@@ -1518,19 +1524,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
+ .current_task = current,
};
int ret;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
- return retval;
-
ctx.optlen = max_optlen;
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1562,27 +1560,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
+ &ctx, bpf_prog_run, retval);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
+ if (ret < 0)
goto out;
- }
if (ctx.optlen > max_optlen || ctx.optlen < 0) {
ret = -EFAULT;
goto out;
}
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval) {
- ret = -EFAULT;
- goto out;
- }
-
if (ctx.optlen != 0) {
if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
put_user(ctx.optlen, optlen)) {
@@ -1591,8 +1579,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}
}
- ret = ctx.retval;
-
out:
sockopt_free_buf(&ctx, &buf);
return ret;
@@ -1607,10 +1593,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
.optlen = *optlen,
.optval = optval,
.optval_end = optval + *optlen,
+ .current_task = current,
};
int ret;
@@ -1623,25 +1609,19 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
*/
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
- if (!ret)
- return -EPERM;
+ &ctx, bpf_prog_run, retval);
+ if (ret < 0)
+ return ret;
if (ctx.optlen > *optlen)
return -EFAULT;
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval)
- return -EFAULT;
-
/* BPF programs can shrink the buffer, export the modifications.
*/
if (ctx.optlen != 0)
*optlen = ctx.optlen;
- return ctx.retval;
+ return ret;
}
#endif
@@ -1789,7 +1769,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -1809,6 +1789,8 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_get_new_value_proto;
case BPF_FUNC_sysctl_set_new_value:
return &bpf_sysctl_set_new_value_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
default:
return cgroup_base_func_proto(func_id, prog);
}
@@ -2055,10 +2037,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
break;
case offsetof(struct bpf_sockopt, retval):
- if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
- else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ treg, treg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ }
break;
case offsetof(struct bpf_sockopt, optval):
*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 2405e39d800f..13e9dbeeedf3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -33,6 +33,7 @@
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
+#include <linux/nodemask.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@@ -105,6 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
+ fp->blinding_requested = bpf_jit_blinding_enabled(fp);
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
mutex_init(&fp->aux->used_maps_mutex);
@@ -537,13 +539,10 @@ long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
- const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
- unsigned long addr = (unsigned long)hdr;
-
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
- prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
+ prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
}
static void
@@ -808,6 +807,176 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a hole page for each
+ * program is sometime a waste. Many small bpf program also adds pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_CHUNK_SHIFT 6
+#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
+
+struct bpf_prog_pack {
+ struct list_head list;
+ void *ptr;
+ unsigned long bitmap[];
+};
+
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static size_t bpf_prog_pack_size = -1;
+static size_t bpf_prog_pack_mask = -1;
+
+static int bpf_prog_chunk_count(void)
+{
+ WARN_ON_ONCE(bpf_prog_pack_size == -1);
+ return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE;
+}
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
+ * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
+ */
+#ifdef PMD_SIZE
+#define BPF_HPAGE_SIZE PMD_SIZE
+#define BPF_HPAGE_MASK PMD_MASK
+#else
+#define BPF_HPAGE_SIZE PAGE_SIZE
+#define BPF_HPAGE_MASK PAGE_MASK
+#endif
+
+static size_t select_bpf_prog_pack_size(void)
+{
+ size_t size;
+ void *ptr;
+
+ size = BPF_HPAGE_SIZE * num_online_nodes();
+ ptr = module_alloc(size);
+
+ /* Test whether we can get huge pages. If not just use PAGE_SIZE
+ * packs.
+ */
+ if (!ptr || !is_vm_area_hugepages(ptr)) {
+ size = PAGE_SIZE;
+ bpf_prog_pack_mask = PAGE_MASK;
+ } else {
+ bpf_prog_pack_mask = BPF_HPAGE_MASK;
+ }
+
+ vfree(ptr);
+ return size;
+}
+
+static struct bpf_prog_pack *alloc_new_pack(void)
+{
+ struct bpf_prog_pack *pack;
+
+ pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())),
+ GFP_KERNEL);
+ if (!pack)
+ return NULL;
+ pack->ptr = module_alloc(bpf_prog_pack_size);
+ if (!pack->ptr) {
+ kfree(pack);
+ return NULL;
+ }
+ bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
+ list_add_tail(&pack->list, &pack_list);
+
+ set_vm_flush_reset_perms(pack->ptr);
+ set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+ set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+ return pack;
+}
+
+static void *bpf_prog_pack_alloc(u32 size)
+{
+ unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ struct bpf_prog_pack *pack;
+ unsigned long pos;
+ void *ptr = NULL;
+
+ mutex_lock(&pack_mutex);
+ if (bpf_prog_pack_size == -1)
+ bpf_prog_pack_size = select_bpf_prog_pack_size();
+
+ if (size > bpf_prog_pack_size) {
+ size = round_up(size, PAGE_SIZE);
+ ptr = module_alloc(size);
+ if (ptr) {
+ set_vm_flush_reset_perms(ptr);
+ set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
+ set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
+ }
+ goto out;
+ }
+ list_for_each_entry(pack, &pack_list, list) {
+ pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
+ nbits, 0);
+ if (pos < bpf_prog_chunk_count())
+ goto found_free_area;
+ }
+
+ pack = alloc_new_pack();
+ if (!pack)
+ goto out;
+
+ pos = 0;
+
+found_free_area:
+ bitmap_set(pack->bitmap, pos, nbits);
+ ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+ mutex_unlock(&pack_mutex);
+ return ptr;
+}
+
+static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+ struct bpf_prog_pack *pack = NULL, *tmp;
+ unsigned int nbits;
+ unsigned long pos;
+ void *pack_ptr;
+
+ mutex_lock(&pack_mutex);
+ if (hdr->size > bpf_prog_pack_size) {
+ module_memfree(hdr);
+ goto out;
+ }
+
+ pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask);
+
+ list_for_each_entry(tmp, &pack_list, list) {
+ if (tmp->ptr == pack_ptr) {
+ pack = tmp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+ goto out;
+
+ nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+ pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+ bitmap_clear(pack->bitmap, pos, nbits);
+ if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
+ bpf_prog_chunk_count(), 0) == 0) {
+ list_del(&pack->list);
+ module_memfree(pack->ptr);
+ kfree(pack);
+ }
+out:
+ mutex_unlock(&pack_mutex);
+}
+
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -833,12 +1002,11 @@ static int __init bpf_jit_charge_init(void)
}
pure_initcall(bpf_jit_charge_init);
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(pages, &bpf_jit_current) >
- (bpf_jit_limit >> PAGE_SHIFT)) {
+ if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
if (!bpf_capable()) {
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
}
}
@@ -846,9 +1014,9 @@ int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
{
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
}
void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -867,7 +1035,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- u32 size, hole, start, pages;
+ u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
@@ -877,20 +1045,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- pages = size / PAGE_SIZE;
- if (bpf_jit_charge_modmem(pages))
+ if (bpf_jit_charge_modmem(size))
return NULL;
hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
return NULL;
}
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = pages;
+ hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
@@ -903,10 +1070,117 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- u32 pages = hdr->pages;
+ u32 size = hdr->size;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
+}
+
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * as the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ struct bpf_binary_header **rw_header,
+ u8 **rw_image,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *ro_header;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
+ /* add 16 bytes for a random section of illegal instructions */
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ ro_header = bpf_prog_pack_alloc(size);
+ if (!ro_header) {
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ *rw_header = kvmalloc(size, GFP_KERNEL);
+ if (!*rw_header) {
+ bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
+ bpf_prog_pack_free(ro_header);
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(*rw_header, size);
+ (*rw_header)->size = size;
+
+ hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+ BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ start = (get_random_int() % hole) & ~(alignment - 1);
+
+ *image_ptr = &ro_header->image[start];
+ *rw_image = &(*rw_header)->image[start];
+
+ return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+ struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ void *ptr;
+
+ ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+ kvfree(rw_header);
+
+ if (IS_ERR(ptr)) {
+ bpf_prog_pack_free(ro_header);
+ return PTR_ERR(ptr);
+ }
+ prog->aux->use_bpf_prog_pack = true;
+ return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ * 1) when the program is freed after;
+ * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ *
+ * bpf_jit_binary_pack_free requires proper ro_header->size. However,
+ * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
+ * must be set with either bpf_jit_binary_pack_finalize (normal path) or
+ * bpf_arch_text_copy (when jit fails).
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ u32 size = ro_header->size;
+
+ bpf_prog_pack_free(ro_header);
+ kvfree(rw_header);
+ bpf_jit_uncharge_modmem(size);
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ if (fp->aux->use_bpf_prog_pack)
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ else
+ addr = real_start & PAGE_MASK;
+
+ return (void *)addr;
}
/* This symbol is only overridden by archs that have different
@@ -918,7 +1192,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
- bpf_jit_binary_free(hdr);
+ if (fp->aux->use_bpf_prog_pack)
+ bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
+ else
+ bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
@@ -1146,7 +1423,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
struct bpf_insn *insn;
int i, rewritten;
- if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
+ if (!prog->blinding_requested || prog->blinded)
return prog;
clone = bpf_prog_clone_create(prog, GFP_USER);
@@ -1574,7 +1851,8 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
- if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+
+ if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
@@ -1828,28 +2106,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
}
#endif
-bool bpf_prog_array_compatible(struct bpf_array *array,
- const struct bpf_prog *fp)
+bool bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
bool ret;
if (fp->kprobe_override)
return false;
- spin_lock(&array->aux->owner.lock);
-
- if (!array->aux->owner.type) {
+ spin_lock(&map->owner.lock);
+ if (!map->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->aux->owner.type = fp->type;
- array->aux->owner.jited = fp->jited;
+ map->owner.type = fp->type;
+ map->owner.jited = fp->jited;
+ map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
ret = true;
} else {
- ret = array->aux->owner.type == fp->type &&
- array->aux->owner.jited == fp->jited;
+ ret = map->owner.type == fp->type &&
+ map->owner.jited == fp->jited &&
+ map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
- spin_unlock(&array->aux->owner.lock);
+ spin_unlock(&map->owner.lock);
+
return ret;
}
@@ -1861,13 +2141,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
mutex_lock(&aux->used_maps_mutex);
for (i = 0; i < aux->used_map_cnt; i++) {
struct bpf_map *map = aux->used_maps[i];
- struct bpf_array *array;
- if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ if (!map_type_contains_progs(map))
continue;
- array = container_of(map, struct bpf_array, map);
- if (!bpf_prog_array_compatible(array, fp)) {
+ if (!bpf_prog_map_compatible(map, fp)) {
ret = -EINVAL;
goto out;
}
@@ -1891,7 +2169,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
- * @fp: bpf_prog populated with internal BPF program
+ * @fp: bpf_prog populated with BPF program
* @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
@@ -1967,18 +2245,10 @@ static struct bpf_prog_dummy {
},
};
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
- struct bpf_prog_array hdr;
- struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
.null_prog = NULL,
};
+EXPORT_SYMBOL(bpf_empty_prog_array);
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
@@ -1988,12 +2258,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
(prog_cnt + 1),
flags);
- return &empty_prog_array.hdr;
+ return &bpf_empty_prog_array.hdr;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs || progs == &empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
return;
kfree_rcu(progs, rcu);
}
@@ -2300,7 +2570,6 @@ static void bpf_prog_free_deferred(struct work_struct *work)
}
}
-/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@@ -2453,6 +2722,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return -ENOTSUPP;
}
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 585b2b77ccc4..650e5d21f90d 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -195,7 +195,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
}
return;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(skb->dev, rcpu->prog, act);
@@ -254,7 +254,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame(xdpf);
@@ -397,7 +397,8 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
+ struct bpf_map *map, int fd)
{
struct bpf_prog *prog;
@@ -405,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+ if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
+ !bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return -EINVAL;
}
@@ -457,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->map_id = map->id;
rcpu->value.qsize = value->qsize;
- if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+ if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
goto free_ptr_ring;
/* Setup kthread */
@@ -746,15 +748,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}
-int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- struct xdp_frame *xdpf;
-
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
/* Info needed when constructing SKB on remote CPU */
xdpf->dev_rx = dev_rx;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index f02d04540c0c..038f6d7a83e4 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -348,7 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
frames[nframes++] = xdpf;
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dev, xdp_prog, act);
@@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
bq->q[bq->count++] = xdpf;
}
-static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx,
struct bpf_prog *xdp_prog)
{
- struct xdp_frame *xdpf;
int err;
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
- err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+ err = xdp_ok_fwd_dev(dev, xdpf->len);
if (unlikely(err))
return err;
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
@@ -507,7 +502,7 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
__skb_push(skb, skb->mac_len);
break;
default:
- bpf_warn_invalid_xdp_action(act);
+ bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dst->dev, dst->xdp_prog, act);
@@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
return act;
}
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- return __xdp_enqueue(dev, xdp, dev_rx, NULL);
+ return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
- return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
+ return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
}
-static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp)
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
if (!obj ||
!obj->dev->netdev_ops->ndo_xdp_xmit)
return false;
- if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
+ if (xdp_ok_fwd_dev(obj->dev, xdpf->len))
return false;
return true;
@@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes)
return n;
}
-int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
+int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dst, *last_dst = NULL;
int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
- struct xdp_frame *xdpf;
int num_excluded = 0;
unsigned int i;
int err;
@@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
excluded_devices[num_excluded++] = dev_rx->ifindex;
}
- xdpf = xdp_convert_buff_to_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
for (i = 0; i < map->max_entries; i++) {
dst = rcu_dereference_check(dtab->netdev_map[i],
rcu_read_lock_bh_held());
- if (!is_valid_dst(dst, xdp))
+ if (!is_valid_dst(dst, xdpf))
continue;
if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
@@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_rcu(dst, head, index_hlist,
lockdep_is_held(&dtab->index_lock)) {
- if (!is_valid_dst(dst, xdp))
+ if (!is_valid_dst(dst, xdpf))
continue;
if (is_ifindex_excluded(excluded_devices, num_excluded,
@@ -868,7 +858,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
BPF_PROG_TYPE_XDP, false);
if (IS_ERR(prog))
goto err_put_dev;
- if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
+ !bpf_prog_map_compatible(&dtab->map, prog))
goto err_put_prog;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d29af9988f37..65877967f414 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1636,7 +1636,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
value_size = size * num_possible_cpus();
total = 0;
/* while experimenting with hash tables with sizes ranging from 10 to
- * 1000, it was observed that a bucket can have upto 5 entries.
+ * 1000, it was observed that a bucket can have up to 5 entries.
*/
bucket_size = 5;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1ffd469c217f..315053ef6a75 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2,6 +2,8 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@@ -15,6 +17,7 @@
#include <linux/pid_namespace.h>
#include <linux/proc_ns.h>
#include <linux/security.h>
+#include <linux/btf_ids.h>
#include "../../lib/kstrtox.h"
@@ -222,13 +225,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
if (unlikely(!task))
goto err_clear;
- strncpy(buf, task->comm, size);
-
- /* Verifier guarantees that size > 0. For task->comm exceeding
- * size, guarantee that buf is %NUL-terminated. Unconditionally
- * done here to save the size test.
- */
- buf[size - 1] = 0;
+ /* Verifier guarantees that size > 0 */
+ strscpy(buf, task->comm, size);
return 0;
err_clear:
memset(buf, 0, size);
@@ -530,7 +528,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
.func = bpf_strtol,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
@@ -558,13 +556,27 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.func = bpf_strtoul,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
#endif
+BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
+{
+ return strncmp(s1, s2, s1_sz);
+}
+
+const struct bpf_func_proto bpf_strncmp_proto = {
+ .func = bpf_strncmp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_PTR_TO_CONST_STR,
+};
+
BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
struct bpf_pidns_info *, nsdata, u32, size)
{
@@ -630,7 +642,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -656,6 +668,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+ const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+ int ret;
+
+ /* flags is not used yet */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(!size))
+ return 0;
+
+ ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+ if (ret == size)
+ return 0;
+
+ memset(dst, 0, size);
+ /* Return -EFAULT for partial read */
+ return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+ .func = bpf_copy_from_user_task,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_BTF_ID,
+ .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg5_type = ARG_ANYTHING
+};
+
BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
{
if (cpu >= nr_cpu_ids)
@@ -667,7 +712,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
.func = bpf_per_cpu_ptr,
.gpl_only = false,
- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
.arg2_type = ARG_ANYTHING,
};
@@ -680,7 +725,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
.func = bpf_this_cpu_ptr,
.gpl_only = false,
- .ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
+ .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
};
@@ -1011,7 +1056,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg1_type = ARG_PTR_TO_MEM_OR_NULL,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_PTR_TO_CONST_STR,
- .arg4_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -1043,7 +1088,7 @@ struct bpf_hrtimer {
struct bpf_timer_kern {
struct bpf_hrtimer *timer;
/* bpf_spin_lock is used here instead of spinlock_t to make
- * sure that it always fits into space resereved by struct bpf_timer
+ * sure that it always fits into space reserved by struct bpf_timer
* regardless of LOCKDEP and spinlock debug flags.
*/
struct bpf_spin_lock lock;
@@ -1060,6 +1105,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
void *key;
u32 idx;
+ BTF_TYPE_EMIT(struct bpf_timer);
callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
@@ -1364,8 +1410,6 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto;
- case BPF_FUNC_ktime_get_coarse_ns:
- return &bpf_ktime_get_coarse_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
@@ -1378,6 +1422,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_query_proto;
case BPF_FUNC_for_each_map_elem:
return &bpf_for_each_map_elem_proto;
+ case BPF_FUNC_loop:
+ return &bpf_loop_proto;
+ case BPF_FUNC_strncmp:
+ return &bpf_strncmp_proto;
default:
break;
}
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 80da1db47c68..4f841e16779e 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -648,12 +648,22 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
int opt;
opt = fs_parse(fc, bpf_fs_parameters, param, &result);
- if (opt < 0)
+ if (opt < 0) {
/* We might like to report bad mount options here, but
* traditionally we've ignored all mount options, so we'd
* better continue to ignore non-existing options for bpf.
*/
- return opt == -ENOPARAM ? 0 : opt;
+ if (opt == -ENOPARAM) {
+ opt = vfs_parse_fs_param_source(fc, param);
+ if (opt != -ENOPARAM)
+ return opt;
+
+ return 0;
+ }
+
+ if (opt < 0)
+ return opt;
+ }
switch (opt) {
case OPT_MODE:
@@ -700,11 +710,10 @@ static DEFINE_MUTEX(bpf_preload_lock);
static int populate_bpffs(struct dentry *parent)
{
struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
- struct bpf_link *links[BPF_PRELOAD_LINKS] = {};
int err = 0, i;
/* grab the mutex to make sure the kernel interactions with bpf_preload
- * UMD are serialized
+ * are serialized
*/
mutex_lock(&bpf_preload_lock);
@@ -712,40 +721,22 @@ static int populate_bpffs(struct dentry *parent)
if (!bpf_preload_mod_get())
goto out;
- if (!bpf_preload_ops->info.tgid) {
- /* preload() will start UMD that will load BPF iterator programs */
- err = bpf_preload_ops->preload(objs);
- if (err)
+ err = bpf_preload_ops->preload(objs);
+ if (err)
+ goto out_put;
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ bpf_link_inc(objs[i].link);
+ err = bpf_iter_link_pin_kernel(parent,
+ objs[i].link_name, objs[i].link);
+ if (err) {
+ bpf_link_put(objs[i].link);
goto out_put;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- links[i] = bpf_link_by_id(objs[i].link_id);
- if (IS_ERR(links[i])) {
- err = PTR_ERR(links[i]);
- goto out_put;
- }
- }
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- err = bpf_iter_link_pin_kernel(parent,
- objs[i].link_name, links[i]);
- if (err)
- goto out_put;
- /* do not unlink successfully pinned links even
- * if later link fails to pin
- */
- links[i] = NULL;
}
- /* finish() will tell UMD process to exit */
- err = bpf_preload_ops->finish();
- if (err)
- goto out_put;
}
out_put:
bpf_preload_mod_put();
out:
mutex_unlock(&bpf_preload_lock);
- for (i = 0; i < BPF_PRELOAD_LINKS && err; i++)
- if (!IS_ERR_OR_NULL(links[i]))
- bpf_link_put(links[i]);
return err;
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 035e9e3a7132..497916060ac7 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
#include <linux/bpf_local_storage.h>
@@ -163,8 +163,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return 0;
}
- new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) +
- map->value_size,
+ new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
map->numa_node);
if (!new)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 423549d2c52e..5763cc7ac4f1 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -412,7 +412,7 @@ static int trie_update_elem(struct bpf_map *map,
rcu_assign_pointer(im_node->child[1], node);
}
- /* Finally, assign the intermediate node to the determined spot */
+ /* Finally, assign the intermediate node to the determined slot */
rcu_assign_pointer(*slot, im_node);
out:
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 6a9542af4212..b0fa190b0979 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map_elem, key),
- PTR_TO_RDONLY_BUF_OR_NULL },
+ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
{ offsetof(struct bpf_iter__bpf_map_elem, value),
- PTR_TO_RDWR_BUF_OR_NULL },
+ PTR_TO_BUF | PTR_MAYBE_NULL },
},
};
diff --git a/kernel/bpf/mmap_unlock_work.h b/kernel/bpf/mmap_unlock_work.h
new file mode 100644
index 000000000000..5d18d7d85bef
--- /dev/null
+++ b/kernel/bpf/mmap_unlock_work.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2021 Facebook
+ */
+
+#ifndef __MMAP_UNLOCK_WORK_H__
+#define __MMAP_UNLOCK_WORK_H__
+#include <linux/irq_work.h>
+
+/* irq_work to run mmap_read_unlock() in irq_work */
+struct mmap_unlock_irq_work {
+ struct irq_work irq_work;
+ struct mm_struct *mm;
+};
+
+DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+/*
+ * We cannot do mmap_read_unlock() when the irq is disabled, because of
+ * risk to deadlock with rq_lock. To look up vma when the irqs are
+ * disabled, we need to run mmap_read_unlock() in irq_work. We use a
+ * percpu variable to do the irq_work. If the irq_work is already used
+ * by another lookup, we fall over.
+ */
+static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = false;
+
+ if (irqs_disabled()) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ work = this_cpu_ptr(&mmap_unlock_work);
+ if (irq_work_is_busy(&work->irq_work)) {
+ /* cannot queue more up_read, fallback */
+ irq_work_busy = true;
+ }
+ } else {
+ /*
+ * PREEMPT_RT does not allow to trylock mmap sem in
+ * interrupt disabled context. Force the fallback code.
+ */
+ irq_work_busy = true;
+ }
+ }
+
+ *work_ptr = work;
+ return irq_work_busy;
+}
+
+static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm)
+{
+ if (!work) {
+ mmap_read_unlock(mm);
+ } else {
+ work->mm = mm;
+
+ /* The lock will be released once we're out of interrupt
+ * context. Tell lockdep that we've released it now so
+ * it doesn't complain that we forgot to release it.
+ */
+ rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
+ irq_work_queue(&work->irq_work);
+ }
+}
+
+#endif /* __MMAP_UNLOCK_WORK_H__ */
diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c
index 542f275bf252..868cc2c43899 100644
--- a/kernel/bpf/net_namespace.c
+++ b/kernel/bpf/net_namespace.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
+#include <linux/bpf-netns.h>
#include <linux/filter.h>
#include <net/net_namespace.h>
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index 26bced262473..c9d45c9d6918 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -18,10 +18,9 @@ menuconfig BPF_PRELOAD
if BPF_PRELOAD
config BPF_PRELOAD_UMD
- tristate "bpf_preload kernel module with user mode driver"
- depends on CC_CAN_LINK
- depends on m || CC_CAN_LINK_STATIC
+ tristate "bpf_preload kernel module"
default m
help
- This builds bpf_preload kernel module with embedded user mode driver.
+ This builds bpf_preload kernel module with embedded BPF programs for
+ introspection in bpffs.
endif
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
index 1400ac58178e..20f89cc0a0a6 100644
--- a/kernel/bpf/preload/Makefile
+++ b/kernel/bpf/preload/Makefile
@@ -1,42 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
-LIBBPF_OUT = $(abspath $(obj))/libbpf
-LIBBPF_A = $(LIBBPF_OUT)/libbpf.a
-LIBBPF_DESTDIR = $(LIBBPF_OUT)
-LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
-
-# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
-# in tools/scripts/Makefile.include always succeeds when building the kernel
-# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
-$(LIBBPF_A): | $(LIBBPF_OUT)
- $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \
- DESTDIR=$(LIBBPF_DESTDIR) prefix= \
- $(LIBBPF_OUT)/libbpf.a install_headers
-
-libbpf_hdrs: $(LIBBPF_A)
-
-.PHONY: libbpf_hdrs
-
-$(LIBBPF_OUT):
- $(call msg,MKDIR,$@)
- $(Q)mkdir -p $@
-
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
- -I $(LIBBPF_INCLUDE) -Wno-unused-result
-
-userprogs := bpf_preload_umd
-
-clean-files := libbpf/
-
-$(obj)/iterators/iterators.o: | libbpf_hdrs
-
-bpf_preload_umd-objs := iterators/iterators.o
-bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
-
-$(obj)/bpf_preload_umd: $(LIBBPF_A)
-
-$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd
+LIBBPF_INCLUDE = $(srctree)/tools/lib
obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
-bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o
+CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE)
+bpf_preload-objs += bpf_preload_kern.o
diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h
index 2f9932276f2e..f065c91213a0 100644
--- a/kernel/bpf/preload/bpf_preload.h
+++ b/kernel/bpf/preload/bpf_preload.h
@@ -2,13 +2,13 @@
#ifndef _BPF_PRELOAD_H
#define _BPF_PRELOAD_H
-#include <linux/usermode_driver.h>
-#include "iterators/bpf_preload_common.h"
+struct bpf_preload_info {
+ char link_name[16];
+ struct bpf_link *link;
+};
struct bpf_preload_ops {
- struct umd_info info;
int (*preload)(struct bpf_preload_info *);
- int (*finish)(void);
struct module *owner;
};
extern struct bpf_preload_ops *bpf_preload_ops;
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 53736e52c1df..5106b5372f0c 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -2,101 +2,87 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/pid.h>
-#include <linux/fs.h>
-#include <linux/sched/signal.h>
#include "bpf_preload.h"
+#include "iterators/iterators.lskel.h"
-extern char bpf_preload_umd_start;
-extern char bpf_preload_umd_end;
+static struct bpf_link *maps_link, *progs_link;
+static struct iterators_bpf *skel;
-static int preload(struct bpf_preload_info *obj);
-static int finish(void);
+static void free_links_and_skel(void)
+{
+ if (!IS_ERR_OR_NULL(maps_link))
+ bpf_link_put(maps_link);
+ if (!IS_ERR_OR_NULL(progs_link))
+ bpf_link_put(progs_link);
+ iterators_bpf__destroy(skel);
+}
+
+static int preload(struct bpf_preload_info *obj)
+{
+ strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+ obj[0].link = maps_link;
+ strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+ obj[1].link = progs_link;
+ return 0;
+}
-static struct bpf_preload_ops umd_ops = {
- .info.driver_name = "bpf_preload",
+static struct bpf_preload_ops ops = {
.preload = preload,
- .finish = finish,
.owner = THIS_MODULE,
};
-static int preload(struct bpf_preload_info *obj)
+static int load_skel(void)
{
- int magic = BPF_PRELOAD_START;
- loff_t pos = 0;
- int i, err;
- ssize_t n;
+ int err;
- err = fork_usermode_driver(&umd_ops.info);
+ skel = iterators_bpf__open();
+ if (!skel)
+ return -ENOMEM;
+ err = iterators_bpf__load(skel);
if (err)
- return err;
-
- /* send the start magic to let UMD proceed with loading BPF progs */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- /* receive bpf_link IDs and names from UMD */
- pos = 0;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- n = kernel_read(umd_ops.info.pipe_from_umh,
- &obj[i], sizeof(*obj), &pos);
- if (n != sizeof(*obj))
- return -EPIPE;
+ goto out;
+ err = iterators_bpf__attach(skel);
+ if (err)
+ goto out;
+ maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd);
+ if (IS_ERR(maps_link)) {
+ err = PTR_ERR(maps_link);
+ goto out;
}
- return 0;
-}
-
-static int finish(void)
-{
- int magic = BPF_PRELOAD_END;
- struct pid *tgid;
- loff_t pos = 0;
- ssize_t n;
-
- /* send the last magic to UMD. It will do a normal exit. */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- tgid = umd_ops.info.tgid;
- if (tgid) {
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
+ progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd);
+ if (IS_ERR(progs_link)) {
+ err = PTR_ERR(progs_link);
+ goto out;
}
+ /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out
+ * makes skel_closenz() a no-op later in iterators_bpf__destroy().
+ */
+ close_fd(skel->links.dump_bpf_map_fd);
+ skel->links.dump_bpf_map_fd = 0;
+ close_fd(skel->links.dump_bpf_prog_fd);
+ skel->links.dump_bpf_prog_fd = 0;
return 0;
+out:
+ free_links_and_skel();
+ return err;
}
-static int __init load_umd(void)
+static int __init load(void)
{
int err;
- err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start,
- &bpf_preload_umd_end - &bpf_preload_umd_start);
+ err = load_skel();
if (err)
return err;
- bpf_preload_ops = &umd_ops;
+ bpf_preload_ops = &ops;
return err;
}
-static void __exit fini_umd(void)
+static void __exit fini(void)
{
- struct pid *tgid;
-
bpf_preload_ops = NULL;
-
- /* kill UMD in case it's still there due to earlier error */
- tgid = umd_ops.info.tgid;
- if (tgid) {
- kill_pid(tgid, SIGKILL, 1);
-
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
- }
- umd_unload_blob(&umd_ops.info);
+ free_links_and_skel();
}
-late_initcall(load_umd);
-module_exit(fini_umd);
+late_initcall(load);
+module_exit(fini);
MODULE_LICENSE("GPL");
diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S
deleted file mode 100644
index f1f40223b5c3..000000000000
--- a/kernel/bpf/preload/bpf_preload_umd_blob.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
- .section .init.rodata, "a"
- .global bpf_preload_umd_start
-bpf_preload_umd_start:
- .incbin "kernel/bpf/preload/bpf_preload_umd"
- .global bpf_preload_umd_end
-bpf_preload_umd_end:
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index b8bd60511227..bfe24f8c5a20 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -35,15 +35,15 @@ endif
.PHONY: all clean
-all: iterators.skel.h
+all: iterators.lskel.h
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) iterators
-iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
- $(Q)$(BPFTOOL) gen skeleton $< > $@
+ $(Q)$(BPFTOOL) gen skeleton -L $< > $@
$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h
deleted file mode 100644
index 8464d1a48c05..000000000000
--- a/kernel/bpf/preload/iterators/bpf_preload_common.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BPF_PRELOAD_COMMON_H
-#define _BPF_PRELOAD_COMMON_H
-
-#define BPF_PRELOAD_START 0x5555
-#define BPF_PRELOAD_END 0xAAAA
-
-struct bpf_preload_info {
- char link_name[16];
- int link_id;
-};
-
-#endif
diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
deleted file mode 100644
index 5d872a705470..000000000000
--- a/kernel/bpf/preload/iterators/iterators.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2020 Facebook */
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/resource.h>
-#include <bpf/libbpf.h>
-#include <bpf/bpf.h>
-#include <sys/mount.h>
-#include "iterators.skel.h"
-#include "bpf_preload_common.h"
-
-int to_kernel = -1;
-int from_kernel = 0;
-
-static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
-{
- struct bpf_preload_info obj = {};
- struct bpf_link_info info = {};
- __u32 info_len = sizeof(info);
- int err;
-
- err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
- if (err)
- return err;
- obj.link_id = info.id;
- if (strlen(link_name) >= sizeof(obj.link_name))
- return -E2BIG;
- strcpy(obj.link_name, link_name);
- if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj))
- return -EPIPE;
- return 0;
-}
-
-int main(int argc, char **argv)
-{
- struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY };
- struct iterators_bpf *skel;
- int err, magic;
- int debug_fd;
-
- debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC);
- if (debug_fd < 0)
- return 1;
- to_kernel = dup(1);
- close(1);
- dup(debug_fd);
- /* now stdin and stderr point to /dev/console */
-
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_START) {
- printf("bad start magic %d\n", magic);
- return 1;
- }
- setrlimit(RLIMIT_MEMLOCK, &rlim);
- /* libbpf opens BPF object and loads it into the kernel */
- skel = iterators_bpf__open_and_load();
- if (!skel) {
- /* iterators.skel.h is little endian.
- * libbpf doesn't support automatic little->big conversion
- * of BPF bytecode yet.
- * The program load will fail in such case.
- */
- printf("Failed load could be due to wrong endianness\n");
- return 1;
- }
- err = iterators_bpf__attach(skel);
- if (err)
- goto cleanup;
-
- /* send two bpf_link IDs with names to the kernel */
- err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug");
- if (err)
- goto cleanup;
- err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug");
- if (err)
- goto cleanup;
-
- /* The kernel will proceed with pinnging the links in bpffs.
- * UMD will wait on read from pipe.
- */
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_END) {
- printf("bad final magic %d\n", magic);
- err = -EINVAL;
- }
-cleanup:
- iterators_bpf__destroy(skel);
-
- return err != 0;
-}
diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h
new file mode 100644
index 000000000000..70f236a82fe1
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel.h
@@ -0,0 +1,425 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+ struct iterators_bpf__rodata {
+ } *rodata;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ skel->rodata = skel_prep_map_data((void *)"\
+\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\
+\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\
+\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98);
+ if (!skel->rodata)
+ goto cleanup;
+ skel->maps.rodata.initial_value = (__u64) (long) skel->rodata;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6056;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
+\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
+\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
+\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\
+\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\
+\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\
+\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\
+\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\
+\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\
+\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\
+\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\
+\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\
+\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\
+\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\
+\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\
+\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\
+\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\
+\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\
+\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\
+\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\
+\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\
+\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\
+\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\
+\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\
+\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\
+\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\
+\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\
+\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\
+\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\
+\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\
+\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\
+\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\
+\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\
+\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\
+\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\
+\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\
+\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\
+\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\
+\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\
+\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
+\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
+\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
+\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
+\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
+\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
+\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
+\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
+\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
+\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
+\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
+\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
+\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
+\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
+\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
+\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
+\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
+\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
+\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
+\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
+\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
+\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
+\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
+\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
+\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
+\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
+\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
+\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
+\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
+\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
+\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
+\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\
+\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\
+\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\
+\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\
+\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\
+\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\
+\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\
+\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\
+\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\
+\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\
+\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\
+\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\
+\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\
+\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\
+\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\
+\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\
+\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\
+\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\
+\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\
+\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\
+\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\
+\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\
+\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\
+\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\
+\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\
+\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\
+\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\
+\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\
+\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\
+\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\
+\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\
+\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\
+\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\
+\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\
+\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\
+\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\
+\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\
+\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\
+\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\
+\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\
+\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\
+\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\
+\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\
+\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\
+\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\
+\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\
+\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\
+\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\
+\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\
+\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\
+\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\
+\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\
+\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\
+\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\
+\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\
+\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\
+\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\
+\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\
+\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\
+\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\
+\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\
+\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\
+\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\
+\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\
+\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\
+\0\0\0\0";
+ opts.insns_sz = 2216;
+ opts.insns = (void *)"\
+\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
+\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
+\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\
+\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\
+\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\
+\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
+\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\
+\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\
+\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\
+\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\
+\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\
+\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\
+\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\
+\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\
+\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\
+\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\
+\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\
+\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\
+\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\
+\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\
+\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\
+\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\
+\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\
+\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\
+\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\
+\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\
+\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value,
+ 4096, PROT_READ, skel->maps.rodata.map_fd);
+ if (!skel->rodata)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h
deleted file mode 100644
index cf9a6a94b3a4..000000000000
--- a/kernel/bpf/preload/iterators/iterators.skel.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-
-/* THIS FILE IS AUTOGENERATED! */
-#ifndef __ITERATORS_BPF_SKEL_H__
-#define __ITERATORS_BPF_SKEL_H__
-
-#include <stdlib.h>
-#include <bpf/libbpf.h>
-
-struct iterators_bpf {
- struct bpf_object_skeleton *skeleton;
- struct bpf_object *obj;
- struct {
- struct bpf_map *rodata;
- } maps;
- struct {
- struct bpf_program *dump_bpf_map;
- struct bpf_program *dump_bpf_prog;
- } progs;
- struct {
- struct bpf_link *dump_bpf_map;
- struct bpf_link *dump_bpf_prog;
- } links;
- struct iterators_bpf__rodata {
- char dump_bpf_map____fmt[35];
- char dump_bpf_map____fmt_1[14];
- char dump_bpf_prog____fmt[32];
- char dump_bpf_prog____fmt_2[17];
- } *rodata;
-};
-
-static void
-iterators_bpf__destroy(struct iterators_bpf *obj)
-{
- if (!obj)
- return;
- if (obj->skeleton)
- bpf_object__destroy_skeleton(obj->skeleton);
- free(obj);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj);
-
-static inline struct iterators_bpf *
-iterators_bpf__open_opts(const struct bpf_object_open_opts *opts)
-{
- struct iterators_bpf *obj;
-
- obj = (struct iterators_bpf *)calloc(1, sizeof(*obj));
- if (!obj)
- return NULL;
- if (iterators_bpf__create_skeleton(obj))
- goto err;
- if (bpf_object__open_skeleton(obj->skeleton, opts))
- goto err;
-
- return obj;
-err:
- iterators_bpf__destroy(obj);
- return NULL;
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open(void)
-{
- return iterators_bpf__open_opts(NULL);
-}
-
-static inline int
-iterators_bpf__load(struct iterators_bpf *obj)
-{
- return bpf_object__load_skeleton(obj->skeleton);
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open_and_load(void)
-{
- struct iterators_bpf *obj;
-
- obj = iterators_bpf__open();
- if (!obj)
- return NULL;
- if (iterators_bpf__load(obj)) {
- iterators_bpf__destroy(obj);
- return NULL;
- }
- return obj;
-}
-
-static inline int
-iterators_bpf__attach(struct iterators_bpf *obj)
-{
- return bpf_object__attach_skeleton(obj->skeleton);
-}
-
-static inline void
-iterators_bpf__detach(struct iterators_bpf *obj)
-{
- return bpf_object__detach_skeleton(obj->skeleton);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj)
-{
- struct bpf_object_skeleton *s;
-
- s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
- if (!s)
- return -1;
- obj->skeleton = s;
-
- s->sz = sizeof(*s);
- s->name = "iterators_bpf";
- s->obj = &obj->obj;
-
- /* maps */
- s->map_cnt = 1;
- s->map_skel_sz = sizeof(*s->maps);
- s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);
- if (!s->maps)
- goto err;
-
- s->maps[0].name = "iterator.rodata";
- s->maps[0].map = &obj->maps.rodata;
- s->maps[0].mmaped = (void **)&obj->rodata;
-
- /* programs */
- s->prog_cnt = 2;
- s->prog_skel_sz = sizeof(*s->progs);
- s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
- if (!s->progs)
- goto err;
-
- s->progs[0].name = "dump_bpf_map";
- s->progs[0].prog = &obj->progs.dump_bpf_map;
- s->progs[0].link = &obj->links.dump_bpf_map;
-
- s->progs[1].name = "dump_bpf_prog";
- s->progs[1].prog = &obj->progs.dump_bpf_prog;
- s->progs[1].link = &obj->links.dump_bpf_prog;
-
- s->data_sz = 7176;
- s->data = (void *)"\
-\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\
-\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\
-\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
-\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\
-\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\
-\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\
-\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\
-\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\
-\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\
-\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\
-\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\
-\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\
-\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\
-\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\
-\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\
-\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\
-\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\
-\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\
-\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\
-\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\
-\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\
-\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\
-\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\
-\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\
-\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\
-\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\
-\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\
-\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\
-\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\
-\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\
-\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\
-\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\
-\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\
-\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\
-\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\
-\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\
-\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\
-\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\
-\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\
-\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\
-\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
-\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\
-\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\
-\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\
-\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\
-\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
-\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\
-\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\
-\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\
-\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\
-\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\
-\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\
-\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\
-\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\
-\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
-\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\
-\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\
-\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\
-\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\
-\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\
-\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\
-\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\
-\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\
-\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\
-\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\
-\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\
-\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\
-\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\
-\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\
-\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\
-\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\
-\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\
-\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\
-\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
-\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\
-\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\
-\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
-\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
-\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
-\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
-\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
-\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
-\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
-\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
-\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
-\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
-\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
-\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
-\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
-\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
-\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
-\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
-\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
-\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
-\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
-\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
-\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
-\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
-\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
-\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
-\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
-\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
-\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
-\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
-\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
-\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\
-\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\
-\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\
-\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\
-\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\
-\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\
-\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\
-\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\
-\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\
-\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\
-\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\
-\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\
-\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\
-\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\
-\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\
-\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\
-\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\
-\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\
-\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\
-\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\
-\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\
-\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\
-\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\
-\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\
-\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\
-\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\
-\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\
-\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\
-\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\
-\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\
-\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\
-\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\
-\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\
-\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\
-\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\
-\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\
-\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\
-\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\
-\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\
-\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\
-\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\
-\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\
-\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\
-\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\
-\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\
-\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\
-\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\
-\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\
-\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\
-\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\
-\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\
-\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\
-\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\
-\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\
-\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\
-\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\
-\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\
-\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\
-\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\
-\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\
-\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
-\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\
-\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\
-\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\
-\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\
-\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\
-\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\
-\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\
-\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\
-\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\
-\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\
-\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\
-\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\
-\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\
-\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\
-\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\
-\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\
-\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\
-\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
-
- return 0;
-err:
- bpf_object__destroy_skeleton(s);
- return -1;
-}
-
-#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 93a55391791a..8251243022a2 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -143,7 +143,7 @@ static void reuseport_array_free(struct bpf_map *map)
/*
* Once reaching here, all sk->sk_user_data is not
- * referenceing this "array". "array" can be freed now.
+ * referencing this "array". "array" can be freed now.
*/
bpf_map_area_free(array);
}
@@ -152,16 +152,12 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- u64 array_size;
if (!bpf_capable())
return ERR_PTR(-EPERM);
- array_size = sizeof(*array);
- array_size += (u64)attr->max_entries * sizeof(struct sock *);
-
/* allocate all map elements and zero-initialize them */
- array = bpf_map_area_alloc(array_size, numa_node);
+ array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 9e0c10c6892a..710ba9de12ce 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -104,7 +104,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
}
rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
- VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
+ VM_MAP | VM_USERMAP, PAGE_KERNEL);
if (rb) {
kmemleak_not_leak(pages);
rb->pages = pages;
@@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
.func = bpf_ringbuf_output,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6e75bbee39f0..34725bfa1e97 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,10 +7,10 @@
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
-#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include <linux/buildid.h>
#include "percpu_freelist.h"
+#include "mmap_unlock_work.h"
#define STACK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
@@ -31,25 +31,6 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
-/* irq_work to run up_read() for build_id lookup in nmi context */
-struct stack_map_irq_work {
- struct irq_work irq_work;
- struct mm_struct *mm;
-};
-
-static void do_up_read(struct irq_work *entry)
-{
- struct stack_map_irq_work *work;
-
- if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
- return;
-
- work = container_of(entry, struct stack_map_irq_work, irq_work);
- mmap_read_unlock_non_owner(work->mm);
-}
-
-static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
-
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -149,35 +130,14 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u64 *ips, u32 trace_nr, bool user)
{
int i;
- struct vm_area_struct *vma;
- bool irq_work_busy = false;
- struct stack_map_irq_work *work = NULL;
-
- if (irqs_disabled()) {
- if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
- work = this_cpu_ptr(&up_read_work);
- if (irq_work_is_busy(&work->irq_work)) {
- /* cannot queue more up_read, fallback */
- irq_work_busy = true;
- }
- } else {
- /*
- * PREEMPT_RT does not allow to trylock mmap sem in
- * interrupt disabled context. Force the fallback code.
- */
- irq_work_busy = true;
- }
- }
-
- /*
- * We cannot do up_read() when the irq is disabled, because of
- * risk to deadlock with rq_lock. To do build_id lookup when the
- * irqs are disabled, we need to run up_read() in irq_work. We use
- * a percpu variable to do the irq_work. If the irq_work is
- * already used by another lookup, we fall back to report ips.
- *
- * Same fallback is used for kernel stack (!user) on a stackmap
- * with build_id.
+ struct mmap_unlock_irq_work *work = NULL;
+ bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+ struct vm_area_struct *vma, *prev_vma = NULL;
+ const char *prev_build_id;
+
+ /* If the irq_work is in use, fall back to report ips. Same
+ * fallback is used for kernel stack (!user) on a stackmap with
+ * build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
!mmap_read_trylock(current->mm)) {
@@ -191,6 +151,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
for (i = 0; i < trace_nr; i++) {
+ if (range_in_vma(prev_vma, ips[i], ips[i])) {
+ vma = prev_vma;
+ memcpy(id_offs[i].build_id, prev_build_id,
+ BUILD_ID_SIZE_MAX);
+ goto build_id_valid;
+ }
vma = find_vma(current->mm, ips[i]);
if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
/* per entry fall back to ips */
@@ -199,27 +165,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
+build_id_valid:
id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ prev_vma = vma;
+ prev_build_id = id_offs[i].build_id;
}
-
- if (!work) {
- mmap_read_unlock(current->mm);
- } else {
- work->mm = current->mm;
-
- /* The lock will be released once we're out of interrupt
- * context. Tell lockdep that we've released it now so
- * it doesn't complain that we forgot to release it.
- */
- rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
- irq_work_queue(&work->irq_work);
- }
+ bpf_mmap_unlock_mm(work, current->mm);
}
static struct perf_callchain_entry *
-get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
{
#ifdef CONFIG_STACKTRACE
struct perf_callchain_entry *entry;
@@ -230,9 +187,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
if (!entry)
return NULL;
- entry->nr = init_nr +
- stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
- sysctl_perf_event_max_stack - init_nr, 0);
+ entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+ max_depth, 0);
/* stack_trace_save_tsk() works on unsigned long array, while
* perf_callchain_entry uses u64 array. For 32-bit systems, it is
@@ -244,7 +200,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
int i;
/* copy data from the end to avoid using extra buffer */
- for (i = entry->nr - 1; i >= (int)init_nr; i--)
+ for (i = entry->nr - 1; i >= 0; i--)
to[i] = (u64)(from[i]);
}
@@ -261,27 +217,19 @@ static long __bpf_get_stackid(struct bpf_map *map,
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
- u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
- /* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
- */
- trace_nr = trace->nr - init_nr;
-
- if (trace_nr <= skip)
+ if (trace->nr <= skip)
/* skipping more than usable stack trace */
return -EFAULT;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_len = trace_nr * sizeof(u64);
- ips = trace->ip + skip + init_nr;
+ ips = trace->ip + skip;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
id = hash & (smap->n_buckets - 1);
bucket = READ_ONCE(smap->buckets[id]);
@@ -338,8 +286,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
bool kernel = !user;
@@ -348,8 +295,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
return -EINVAL;
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
+ max_depth += skip;
+ if (max_depth > sysctl_perf_event_max_stack)
+ max_depth = sysctl_perf_event_max_stack;
+
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ false, false);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -440,7 +391,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
void *buf, u32 size, u64 flags)
{
- u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
@@ -465,30 +416,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
goto err_fault;
num_elem = size / elem_size;
- if (sysctl_perf_event_max_stack < num_elem)
- init_nr = 0;
- else
- init_nr = sysctl_perf_event_max_stack - num_elem;
+ max_depth = num_elem + skip;
+ if (sysctl_perf_event_max_stack < max_depth)
+ max_depth = sysctl_perf_event_max_stack;
if (trace_in)
trace = trace_in;
else if (kernel && task)
- trace = get_callchain_entry_for_task(task, init_nr);
+ trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack,
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
goto err_fault;
- trace_nr = trace->nr - init_nr;
- if (trace_nr < skip)
+ if (trace->nr < skip)
goto err_fault;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
- ips = trace->ip + skip + init_nr;
+
+ ips = trace->ip + skip;
if (user && user_build_id)
stack_map_get_build_id_offset(buf, ips, trace_nr, user);
else
@@ -525,13 +474,14 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
{
struct pt_regs *regs;
- long res;
+ long res = -EINVAL;
if (!try_get_task_stack(task))
return -EFAULT;
regs = task_pt_regs(task);
- res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ if (regs)
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
put_task_stack(task);
return res;
@@ -542,7 +492,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
- .arg1_btf_id = &btf_task_struct_ids[0],
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
@@ -719,16 +669,3 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_btf_name = "bpf_stack_map",
.map_btf_id = &stack_trace_map_btf_id,
};
-
-static int __init stack_map_init(void)
-{
- int cpu;
- struct stack_map_irq_work *work;
-
- for_each_possible_cpu(cpu) {
- work = per_cpu_ptr(&up_read_work, cpu);
- init_irq_work(&work->irq_work, do_up_read);
- }
- return 0;
-}
-subsys_initcall(stack_map_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 50f96ea4452a..cdaa1152436a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
@@ -31,6 +32,7 @@
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
+#include <linux/trace_events.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -132,6 +134,21 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
return map;
}
+static void bpf_map_write_active_inc(struct bpf_map *map)
+{
+ atomic64_inc(&map->writecnt);
+}
+
+static void bpf_map_write_active_dec(struct bpf_map *map)
+{
+ atomic64_dec(&map->writecnt);
+}
+
+bool bpf_map_write_active(const struct bpf_map *map)
+{
+ return atomic64_read(&map->writecnt) != 0;
+}
+
static u32 bpf_map_value_size(const struct bpf_map *map)
{
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
@@ -540,16 +557,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
- const struct bpf_map *map = filp->private_data;
- const struct bpf_array *array;
+ struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
- array = container_of(map, struct bpf_array, map);
- spin_lock(&array->aux->owner.lock);
- type = array->aux->owner.type;
- jited = array->aux->owner.jited;
- spin_unlock(&array->aux->owner.lock);
+ if (map_type_contains_progs(map)) {
+ spin_lock(&map->owner.lock);
+ type = map->owner.type;
+ jited = map->owner.jited;
+ spin_unlock(&map->owner.lock);
}
seq_printf(m,
@@ -601,11 +616,8 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
- if (vma->vm_flags & VM_MAYWRITE) {
- mutex_lock(&map->freeze_mutex);
- map->writecnt++;
- mutex_unlock(&map->freeze_mutex);
- }
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_inc(map);
}
/* called for all unmapped memory region (including initial) */
@@ -613,11 +625,8 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma)
{
struct bpf_map *map = vma->vm_file->private_data;
- if (vma->vm_flags & VM_MAYWRITE) {
- mutex_lock(&map->freeze_mutex);
- map->writecnt--;
- mutex_unlock(&map->freeze_mutex);
- }
+ if (vma->vm_flags & VM_MAYWRITE)
+ bpf_map_write_active_dec(map);
}
static const struct vm_operations_struct bpf_map_default_vmops = {
@@ -668,7 +677,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
goto out;
if (vma->vm_flags & VM_MAYWRITE)
- map->writecnt++;
+ bpf_map_write_active_inc(map);
out:
mutex_unlock(&map->freeze_mutex);
return err;
@@ -864,6 +873,7 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
+ spin_lock_init(&map->owner.lock);
map->spin_lock_off = -EINVAL;
map->timer_off = -EINVAL;
@@ -976,6 +986,7 @@ struct bpf_map *bpf_map_get(u32 ufd)
return map;
}
+EXPORT_SYMBOL(bpf_map_get);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
@@ -1139,6 +1150,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
@@ -1174,6 +1186,7 @@ free_value:
free_key:
kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1196,6 +1209,7 @@ static int map_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
@@ -1226,6 +1240,7 @@ static int map_delete_elem(union bpf_attr *attr)
out:
kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1338,14 +1353,16 @@ int generic_map_delete_batch(struct bpf_map *map,
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
kvfree(key);
+
+ maybe_wait_bpf_programs(map);
return err;
}
@@ -1398,6 +1415,7 @@ int generic_map_update_batch(struct bpf_map *map,
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
@@ -1495,6 +1513,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
swap(prev_key, key);
retry = MAP_LOOKUP_RETRIES;
cp++;
+ cond_resched();
}
if (err == -EFAULT)
@@ -1533,6 +1552,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
+ bpf_map_write_active_inc(map);
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) ||
!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
@@ -1597,6 +1617,7 @@ free_value:
free_key:
kvfree(key);
err_put:
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -1624,8 +1645,7 @@ static int map_freeze(const union bpf_attr *attr)
}
mutex_lock(&map->freeze_mutex);
-
- if (map->writecnt) {
+ if (bpf_map_write_active(map)) {
err = -EBUSY;
goto err_put;
}
@@ -2184,7 +2204,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD fd_array
+#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
{
@@ -2202,7 +2222,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
- BPF_F_TEST_RND_HI32))
+ BPF_F_TEST_RND_HI32 |
+ BPF_F_XDP_HAS_FRAGS))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2288,6 +2309,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->aux->dst_prog = dst_prog;
prog->aux->offload_requested = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
+ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
err = security_bpf_prog_alloc(prog->aux);
if (err)
@@ -2473,6 +2495,7 @@ void bpf_link_put(struct bpf_link *link)
bpf_link_free(link);
}
}
+EXPORT_SYMBOL(bpf_link_put);
static int bpf_link_release(struct inode *inode, struct file *filp)
{
@@ -2544,7 +2567,7 @@ static int bpf_link_alloc_id(struct bpf_link *link)
* pre-allocated resources are to be freed with bpf_cleanup() call. All the
* transient state is passed around in struct bpf_link_primer.
* This is preferred way to create and initialize bpf_link, especially when
- * there are complicated and expensive operations inbetween creating bpf_link
+ * there are complicated and expensive operations in between creating bpf_link
* itself and attaching it to BPF hook. By using bpf_link_prime() and
* bpf_link_settle() kernel code using bpf_link doesn't have to perform
* expensive (and potentially failing) roll back operations in a rare case
@@ -2615,6 +2638,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
return link;
}
+EXPORT_SYMBOL(bpf_link_get_from_fd);
struct bpf_tracing_link {
struct bpf_link link;
@@ -2999,6 +3023,11 @@ out_put_file:
fput(perf_file);
return err;
}
+#else
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_PERF_EVENTS */
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
@@ -3303,12 +3332,17 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_FLOW_DISSECTOR:
case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_MSG_VERDICT:
+ case BPF_SK_SKB_VERDICT:
+ return sock_map_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
}
}
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -4171,6 +4205,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
union bpf_attr __user *uattr,
int cmd)
{
+ bool has_read = cmd == BPF_MAP_LOOKUP_BATCH ||
+ cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH;
+ bool has_write = cmd != BPF_MAP_LOOKUP_BATCH;
struct bpf_map *map;
int err, ufd;
struct fd f;
@@ -4183,16 +4220,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if ((cmd == BPF_MAP_LOOKUP_BATCH ||
- cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) &&
- !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
+ if (has_write)
+ bpf_map_write_active_inc(map);
+ if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
-
- if (cmd != BPF_MAP_LOOKUP_BATCH &&
- !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
+ if (has_write && !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -4205,8 +4239,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr,
BPF_DO_BATCH(map->ops->map_update_batch);
else
BPF_DO_BATCH(map->ops->map_delete_batch);
-
err_put:
+ if (has_write)
+ bpf_map_write_active_dec(map);
fdput(f);
return err;
}
@@ -4226,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
return -EINVAL;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
+#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type ptype;
@@ -4250,7 +4285,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = tracing_bpf_link_attach(attr, uattr, prog);
goto out;
case BPF_PROG_TYPE_PERF_EVENT:
- case BPF_PROG_TYPE_KPROBE:
case BPF_PROG_TYPE_TRACEPOINT:
if (attr->link_create.attach_type != BPF_PERF_EVENT) {
ret = -EINVAL;
@@ -4258,6 +4292,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
}
ptype = prog->type;
break;
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type != BPF_PERF_EVENT &&
+ attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ptype = prog->type;
+ break;
default:
ptype = attach_type_to_prog_type(attr->link_create.attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
@@ -4289,13 +4331,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_xdp_link_attach(attr, prog);
break;
#endif
-#ifdef CONFIG_PERF_EVENTS
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
- case BPF_PROG_TYPE_KPROBE:
ret = bpf_perf_link_attach(attr, prog);
break;
-#endif
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type == BPF_PERF_EVENT)
+ ret = bpf_perf_link_attach(attr, prog);
+ else
+ ret = bpf_kprobe_multi_link_attach(attr, prog);
+ break;
default:
ret = -EINVAL;
}
@@ -4734,30 +4779,59 @@ static bool syscall_prog_is_valid_access(int off, int size,
return true;
}
-BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
+BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
+ struct bpf_prog * __maybe_unused prog;
+
switch (cmd) {
case BPF_MAP_CREATE:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
+ case BPF_LINK_CREATE:
+ case BPF_RAW_TRACEPOINT_OPEN:
break;
- /* case BPF_PROG_TEST_RUN:
- * is not part of this list to prevent recursive test_run
- */
+#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
+ case BPF_PROG_TEST_RUN:
+ if (attr->test.data_in || attr->test.data_out ||
+ attr->test.ctx_out || attr->test.duration ||
+ attr->test.repeat || attr->test.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
+ attr->test.ctx_size_in > U16_MAX) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ if (!__bpf_prog_enter_sleepable(prog)) {
+ /* recursion detected */
+ bpf_prog_put(prog);
+ return -EBUSY;
+ }
+ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
+ __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */);
+ bpf_prog_put(prog);
+ return 0;
+#endif
default:
return -EINVAL;
}
return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
}
+EXPORT_SYMBOL(bpf_sys_bpf);
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -4804,7 +4878,7 @@ const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
- .arg2_type = ARG_CONST_SIZE,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index b48750bfba5a..d94696198ef8 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -8,6 +8,7 @@
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
+#include "mmap_unlock_work.h"
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
@@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
.show = task_vma_seq_show,
};
-BTF_ID_LIST(btf_task_file_ids)
-BTF_ID(struct, file)
-BTF_ID(struct, vm_area_struct)
-
static const struct bpf_iter_seq_info task_seq_info = {
.seq_ops = &task_seq_ops,
.init_seq_private = init_seq_pidns,
@@ -586,23 +583,88 @@ static struct bpf_iter_reg task_vma_reg_info = {
.seq_info = &task_vma_seq_info,
};
+BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
+ bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
+{
+ struct mmap_unlock_irq_work *work = NULL;
+ struct vm_area_struct *vma;
+ bool irq_work_busy = false;
+ struct mm_struct *mm;
+ int ret = -ENOENT;
+
+ if (flags)
+ return -EINVAL;
+
+ if (!task)
+ return -ENOENT;
+
+ mm = task->mm;
+ if (!mm)
+ return -ENOENT;
+
+ irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
+
+ if (irq_work_busy || !mmap_read_trylock(mm))
+ return -EBUSY;
+
+ vma = find_vma(mm, start);
+
+ if (vma && vma->vm_start <= start && vma->vm_end > start) {
+ callback_fn((u64)(long)task, (u64)(long)vma,
+ (u64)(long)callback_ctx, 0, 0);
+ ret = 0;
+ }
+ bpf_mmap_unlock_mm(work, mm);
+ return ret;
+}
+
+const struct bpf_func_proto bpf_find_vma_proto = {
+ .func = bpf_find_vma,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FUNC,
+ .arg4_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg5_type = ARG_ANYTHING,
+};
+
+DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
+
+static void do_mmap_read_unlock(struct irq_work *entry)
+{
+ struct mmap_unlock_irq_work *work;
+
+ if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
+ return;
+
+ work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
+ mmap_read_unlock_non_owner(work->mm);
+}
+
static int __init task_iter_init(void)
{
- int ret;
+ struct mmap_unlock_irq_work *work;
+ int ret, cpu;
+
+ for_each_possible_cpu(cpu) {
+ work = per_cpu_ptr(&mmap_unlock_work, cpu);
+ init_irq_work(&work->irq_work, do_mmap_read_unlock);
+ }
- task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
+ task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
- task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
- task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
+ task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
ret = bpf_iter_reg_target(&task_file_reg_info);
if (ret)
return ret;
- task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
- task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
+ task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
+ task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index e98de5e73ba5..ada97751ae1b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -27,6 +27,14 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
+bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
+{
+ enum bpf_attach_type eatype = prog->expected_attach_type;
+
+ return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN;
+}
+
void *bpf_jit_alloc_exec_page(void)
{
void *image;
@@ -37,7 +45,7 @@ void *bpf_jit_alloc_exec_page(void)
set_vm_flush_reset_perms(image);
/* Keep image as writeable. The alternative is to keep flipping ro/rw
- * everytime new program is attached or detached.
+ * every time new program is attached or detached.
*/
set_memory_x((long)image, 1);
return image;
@@ -109,18 +117,6 @@ static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
tr->mod = NULL;
}
-static int is_ftrace_location(void *ip)
-{
- long addr;
-
- addr = ftrace_location((long)ip);
- if (!addr)
- return 0;
- if (WARN_ON_ONCE(addr != (long)ip))
- return -EFAULT;
- return 1;
-}
-
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
void *ip = tr->func.addr;
@@ -152,12 +148,12 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
void *ip = tr->func.addr;
+ unsigned long faddr;
int ret;
- ret = is_ftrace_location(ip);
- if (ret < 0)
- return ret;
- tr->func.ftrace_managed = ret;
+ faddr = ftrace_location((unsigned long)ip);
+ if (faddr)
+ tr->func.ftrace_managed = true;
if (bpf_trampoline_module_get(tr))
return -ENOENT;
@@ -205,7 +201,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
im = container_of(work, struct bpf_tramp_image, work);
bpf_image_ksym_del(&im->ksym);
bpf_jit_free_exec(im->image);
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
percpu_ref_exit(&im->pcref);
kfree_rcu(im, rcu);
}
@@ -302,7 +298,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
if (!im)
goto out;
- err = bpf_jit_charge_modmem(1);
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
if (err)
goto out_free_im;
@@ -324,7 +320,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
out_free_image:
bpf_jit_free_exec(im->image);
out_uncharge:
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
out_free_im:
kfree(im);
out:
@@ -542,11 +538,12 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
static void notrace inc_misses_counter(struct bpf_prog *prog)
{
struct bpf_prog_stats *stats;
+ unsigned int flags;
stats = this_cpu_ptr(prog->stats);
- u64_stats_update_begin(&stats->syncp);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->misses);
- u64_stats_update_end(&stats->syncp);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
/* The logic is similar to bpf_prog_run(), but with an explicit
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 890b3ec375a3..d175b70067b3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4,6 +4,7 @@
* Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
*/
#include <uapi/linux/btf.h>
+#include <linux/bpf-cgroup.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
@@ -293,13 +294,15 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
"verifier log line truncated - local buffer too short\n");
- n = min(log->len_total - log->len_used - 1, n);
- log->kbuf[n] = '\0';
-
if (log->level == BPF_LOG_KERNEL) {
- pr_err("BPF:%s\n", log->kbuf);
+ bool newline = n > 0 && log->kbuf[n - 1] == '\n';
+
+ pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n");
return;
}
+
+ n = min(log->len_total - log->len_used - 1, n);
+ log->kbuf[n] = '\0';
if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
log->len_used += n;
else
@@ -439,18 +442,6 @@ static bool reg_type_not_null(enum bpf_reg_type type)
type == PTR_TO_SOCK_COMMON;
}
-static bool reg_type_may_be_null(enum bpf_reg_type type)
-{
- return type == PTR_TO_MAP_VALUE_OR_NULL ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_SOCK_COMMON_OR_NULL ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_BTF_ID_OR_NULL ||
- type == PTR_TO_MEM_OR_NULL ||
- type == PTR_TO_RDONLY_BUF_OR_NULL ||
- type == PTR_TO_RDWR_BUF_OR_NULL;
-}
-
static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
{
return reg->type == PTR_TO_MAP_VALUE &&
@@ -459,12 +450,15 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
{
- return type == PTR_TO_SOCKET ||
- type == PTR_TO_SOCKET_OR_NULL ||
- type == PTR_TO_TCP_SOCK ||
- type == PTR_TO_TCP_SOCK_OR_NULL ||
- type == PTR_TO_MEM ||
- type == PTR_TO_MEM_OR_NULL;
+ return base_type(type) == PTR_TO_SOCKET ||
+ base_type(type) == PTR_TO_TCP_SOCK ||
+ base_type(type) == PTR_TO_MEM ||
+ base_type(type) == PTR_TO_BTF_ID;
+}
+
+static bool type_is_rdonly_mem(u32 type)
+{
+ return type & MEM_RDONLY;
}
static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
@@ -472,14 +466,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
return type == ARG_PTR_TO_SOCK_COMMON;
}
-static bool arg_type_may_be_null(enum bpf_arg_type type)
+static bool type_may_be_null(u32 type)
{
- return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
- type == ARG_PTR_TO_MEM_OR_NULL ||
- type == ARG_PTR_TO_CTX_OR_NULL ||
- type == ARG_PTR_TO_SOCKET_OR_NULL ||
- type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
- type == ARG_PTR_TO_STACK_OR_NULL;
+ return type & PTR_MAYBE_NULL;
}
/* Determine whether the function releases some resources allocated by another
@@ -539,39 +528,58 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
insn->imm == BPF_CMPXCHG;
}
-/* string representation of 'enum bpf_reg_type' */
-static const char * const reg_type_str[] = {
- [NOT_INIT] = "?",
- [SCALAR_VALUE] = "inv",
- [PTR_TO_CTX] = "ctx",
- [CONST_PTR_TO_MAP] = "map_ptr",
- [PTR_TO_MAP_VALUE] = "map_value",
- [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
- [PTR_TO_STACK] = "fp",
- [PTR_TO_PACKET] = "pkt",
- [PTR_TO_PACKET_META] = "pkt_meta",
- [PTR_TO_PACKET_END] = "pkt_end",
- [PTR_TO_FLOW_KEYS] = "flow_keys",
- [PTR_TO_SOCKET] = "sock",
- [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
- [PTR_TO_SOCK_COMMON] = "sock_common",
- [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
- [PTR_TO_TCP_SOCK] = "tcp_sock",
- [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
- [PTR_TO_TP_BUFFER] = "tp_buffer",
- [PTR_TO_XDP_SOCK] = "xdp_sock",
- [PTR_TO_BTF_ID] = "ptr_",
- [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
- [PTR_TO_MEM] = "mem",
- [PTR_TO_MEM_OR_NULL] = "mem_or_null",
- [PTR_TO_RDONLY_BUF] = "rdonly_buf",
- [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
- [PTR_TO_RDWR_BUF] = "rdwr_buf",
- [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
- [PTR_TO_FUNC] = "func",
- [PTR_TO_MAP_KEY] = "map_key",
-};
+/* string representation of 'enum bpf_reg_type'
+ *
+ * Note that reg_type_str() can not appear more than once in a single verbose()
+ * statement.
+ */
+static const char *reg_type_str(struct bpf_verifier_env *env,
+ enum bpf_reg_type type)
+{
+ char postfix[16] = {0}, prefix[32] = {0};
+ static const char * const str[] = {
+ [NOT_INIT] = "?",
+ [SCALAR_VALUE] = "scalar",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_STACK] = "fp",
+ [PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
+ [PTR_TO_PACKET_END] = "pkt_end",
+ [PTR_TO_FLOW_KEYS] = "flow_keys",
+ [PTR_TO_SOCKET] = "sock",
+ [PTR_TO_SOCK_COMMON] = "sock_common",
+ [PTR_TO_TCP_SOCK] = "tcp_sock",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
+ [PTR_TO_BTF_ID] = "ptr_",
+ [PTR_TO_MEM] = "mem",
+ [PTR_TO_BUF] = "buf",
+ [PTR_TO_FUNC] = "func",
+ [PTR_TO_MAP_KEY] = "map_key",
+ };
+
+ if (type & PTR_MAYBE_NULL) {
+ if (base_type(type) == PTR_TO_BTF_ID)
+ strncpy(postfix, "or_null_", 16);
+ else
+ strncpy(postfix, "_or_null", 16);
+ }
+
+ if (type & MEM_RDONLY)
+ strncpy(prefix, "rdonly_", 32);
+ if (type & MEM_ALLOC)
+ strncpy(prefix, "alloc_", 32);
+ if (type & MEM_USER)
+ strncpy(prefix, "user_", 32);
+ if (type & MEM_PERCPU)
+ strncpy(prefix, "percpu_", 32);
+
+ snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
+ prefix, str[base_type(type)], postfix);
+ return env->type_str_buf;
+}
static char slot_type_char[] = {
[STACK_INVALID] = '?',
@@ -606,6 +614,44 @@ static const char *kernel_type_name(const struct btf* btf, u32 id)
return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off);
}
+static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno)
+{
+ env->scratched_regs |= 1U << regno;
+}
+
+static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi)
+{
+ env->scratched_stack_slots |= 1ULL << spi;
+}
+
+static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno)
+{
+ return (env->scratched_regs >> regno) & 1;
+}
+
+static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno)
+{
+ return (env->scratched_stack_slots >> regno) & 1;
+}
+
+static bool verifier_state_scratched(const struct bpf_verifier_env *env)
+{
+ return env->scratched_regs || env->scratched_stack_slots;
+}
+
+static void mark_verifier_state_clean(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = 0U;
+ env->scratched_stack_slots = 0ULL;
+}
+
+/* Used for printing the entire verifier state. */
+static void mark_verifier_state_scratched(struct bpf_verifier_env *env)
+{
+ env->scratched_regs = ~0U;
+ env->scratched_stack_slots = ~0ULL;
+}
+
/* The reg state of a pointer or a bounded scalar was saved when
* it was spilled to the stack.
*/
@@ -621,7 +667,8 @@ static void scrub_spilled_slot(u8 *stype)
}
static void print_verifier_state(struct bpf_verifier_env *env,
- const struct bpf_func_state *state)
+ const struct bpf_func_state *state,
+ bool print_all)
{
const struct bpf_reg_state *reg;
enum bpf_reg_type t;
@@ -634,78 +681,83 @@ static void print_verifier_state(struct bpf_verifier_env *env,
t = reg->type;
if (t == NOT_INIT)
continue;
+ if (!print_all && !reg_scratched(env, i))
+ continue;
verbose(env, " R%d", i);
print_liveness(env, reg->live);
- verbose(env, "=%s", reg_type_str[t]);
+ verbose(env, "=");
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
+ verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- if (t == PTR_TO_BTF_ID ||
- t == PTR_TO_BTF_ID_OR_NULL ||
- t == PTR_TO_PERCPU_BTF_ID)
+ const char *sep = "";
+
+ verbose(env, "%s", reg_type_str(env, t));
+ if (base_type(t) == PTR_TO_BTF_ID)
verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
- verbose(env, "(id=%d", reg->id);
- if (reg_type_may_be_refcounted_or_null(t))
- verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
+ verbose(env, "(");
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
+
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id)
+ verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (t != SCALAR_VALUE)
- verbose(env, ",off=%d", reg->off);
+ verbose_a("off=%d", reg->off);
if (type_is_pkt_pointer(t))
- verbose(env, ",r=%d", reg->range);
- else if (t == CONST_PTR_TO_MAP ||
- t == PTR_TO_MAP_KEY ||
- t == PTR_TO_MAP_VALUE ||
- t == PTR_TO_MAP_VALUE_OR_NULL)
- verbose(env, ",ks=%d,vs=%d",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size);
+ verbose_a("r=%d", reg->range);
+ else if (base_type(t) == CONST_PTR_TO_MAP ||
+ base_type(t) == PTR_TO_MAP_KEY ||
+ base_type(t) == PTR_TO_MAP_VALUE)
+ verbose_a("ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
if (tnum_is_const(reg->var_off)) {
/* Typically an immediate SCALAR_VALUE, but
* could be a pointer whose offset is too big
* for reg->off
*/
- verbose(env, ",imm=%llx", reg->var_off.value);
+ verbose_a("imm=%llx", reg->var_off.value);
} else {
if (reg->smin_value != reg->umin_value &&
reg->smin_value != S64_MIN)
- verbose(env, ",smin_value=%lld",
- (long long)reg->smin_value);
+ verbose_a("smin=%lld", (long long)reg->smin_value);
if (reg->smax_value != reg->umax_value &&
reg->smax_value != S64_MAX)
- verbose(env, ",smax_value=%lld",
- (long long)reg->smax_value);
+ verbose_a("smax=%lld", (long long)reg->smax_value);
if (reg->umin_value != 0)
- verbose(env, ",umin_value=%llu",
- (unsigned long long)reg->umin_value);
+ verbose_a("umin=%llu", (unsigned long long)reg->umin_value);
if (reg->umax_value != U64_MAX)
- verbose(env, ",umax_value=%llu",
- (unsigned long long)reg->umax_value);
+ verbose_a("umax=%llu", (unsigned long long)reg->umax_value);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, ",var_off=%s", tn_buf);
+ verbose_a("var_off=%s", tn_buf);
}
if (reg->s32_min_value != reg->smin_value &&
reg->s32_min_value != S32_MIN)
- verbose(env, ",s32_min_value=%d",
- (int)(reg->s32_min_value));
+ verbose_a("s32_min=%d", (int)(reg->s32_min_value));
if (reg->s32_max_value != reg->smax_value &&
reg->s32_max_value != S32_MAX)
- verbose(env, ",s32_max_value=%d",
- (int)(reg->s32_max_value));
+ verbose_a("s32_max=%d", (int)(reg->s32_max_value));
if (reg->u32_min_value != reg->umin_value &&
reg->u32_min_value != U32_MIN)
- verbose(env, ",u32_min_value=%d",
- (int)(reg->u32_min_value));
+ verbose_a("u32_min=%d", (int)(reg->u32_min_value));
if (reg->u32_max_value != reg->umax_value &&
reg->u32_max_value != U32_MAX)
- verbose(env, ",u32_max_value=%d",
- (int)(reg->u32_max_value));
+ verbose_a("u32_max=%d", (int)(reg->u32_max_value));
}
+#undef verbose_a
+
verbose(env, ")");
}
}
@@ -723,12 +775,14 @@ static void print_verifier_state(struct bpf_verifier_env *env,
types_buf[BPF_REG_SIZE] = 0;
if (!valid)
continue;
+ if (!print_all && !stack_slot_scratched(env, i))
+ continue;
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, state->stack[i].spilled_ptr.live);
if (is_spilled_reg(&state->stack[i])) {
reg = &state->stack[i].spilled_ptr;
t = reg->type;
- verbose(env, "=%s", reg_type_str[t]);
+ verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
@@ -748,6 +802,26 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (state->in_async_callback_fn)
verbose(env, " async_cb");
verbose(env, "\n");
+ mark_verifier_state_clean(env);
+}
+
+static inline u32 vlog_alignment(u32 pos)
+{
+ return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT),
+ BPF_LOG_MIN_ALIGNMENT) - pos - 1;
+}
+
+static void print_insn_state(struct bpf_verifier_env *env,
+ const struct bpf_func_state *state)
+{
+ if (env->prev_log_len && env->prev_log_len == env->log.len_used) {
+ /* remove new line character */
+ bpf_vlog_reset(&env->log, env->prev_log_len - 1);
+ verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' ');
+ } else {
+ verbose(env, "%d:", env->insn_idx);
+ }
+ print_verifier_state(env, state, false);
}
/* copy array src of length n * size bytes to dst. dst is reallocated if it's too
@@ -1141,8 +1215,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
{
- switch (reg->type) {
- case PTR_TO_MAP_VALUE_OR_NULL: {
+ if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
const struct bpf_map *map = reg->map_ptr;
if (map->inner_map_meta) {
@@ -1151,7 +1224,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
/* transfer reg's id which is unique for every map_lookup_elem
* as UID of the inner map.
*/
- reg->map_uid = reg->id;
+ if (map_value_has_timer(map->inner_map_meta))
+ reg->map_uid = reg->id;
} else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
reg->type = PTR_TO_XDP_SOCK;
} else if (map->map_type == BPF_MAP_TYPE_SOCKMAP ||
@@ -1160,32 +1234,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
} else {
reg->type = PTR_TO_MAP_VALUE;
}
- break;
- }
- case PTR_TO_SOCKET_OR_NULL:
- reg->type = PTR_TO_SOCKET;
- break;
- case PTR_TO_SOCK_COMMON_OR_NULL:
- reg->type = PTR_TO_SOCK_COMMON;
- break;
- case PTR_TO_TCP_SOCK_OR_NULL:
- reg->type = PTR_TO_TCP_SOCK;
- break;
- case PTR_TO_BTF_ID_OR_NULL:
- reg->type = PTR_TO_BTF_ID;
- break;
- case PTR_TO_MEM_OR_NULL:
- reg->type = PTR_TO_MEM;
- break;
- case PTR_TO_RDONLY_BUF_OR_NULL:
- reg->type = PTR_TO_RDONLY_BUF;
- break;
- case PTR_TO_RDWR_BUF_OR_NULL:
- reg->type = PTR_TO_RDWR_BUF;
- break;
- default:
- WARN_ONCE(1, "unknown nullable register type");
+ return;
}
+
+ reg->type &= ~PTR_MAYBE_NULL;
}
static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
@@ -1365,22 +1417,28 @@ static void __reg_bound_offset(struct bpf_reg_state *reg)
reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off);
}
+static bool __reg32_bound_s64(s32 a)
+{
+ return a >= 0 && a <= S32_MAX;
+}
+
static void __reg_assign_32_into_64(struct bpf_reg_state *reg)
{
reg->umin_value = reg->u32_min_value;
reg->umax_value = reg->u32_max_value;
- /* Attempt to pull 32-bit signed bounds into 64-bit bounds
- * but must be positive otherwise set to worse case bounds
- * and refine later from tnum.
+
+ /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must
+ * be positive otherwise set to worse case bounds and refine later
+ * from tnum.
*/
- if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0)
- reg->smax_value = reg->s32_max_value;
- else
- reg->smax_value = U32_MAX;
- if (reg->s32_min_value >= 0)
+ if (__reg32_bound_s64(reg->s32_min_value) &&
+ __reg32_bound_s64(reg->s32_max_value)) {
reg->smin_value = reg->s32_min_value;
- else
+ reg->smax_value = reg->s32_max_value;
+ } else {
reg->smin_value = 0;
+ reg->smax_value = U32_MAX;
+ }
}
static void __reg_combine_32_into_64(struct bpf_reg_state *reg)
@@ -1496,14 +1554,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
static void mark_btf_ld_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno,
enum bpf_reg_type reg_type,
- struct btf *btf, u32 btf_id)
+ struct btf *btf, u32 btf_id,
+ enum bpf_type_flag flag)
{
if (reg_type == SCALAR_VALUE) {
mark_reg_unknown(env, regs, regno);
return;
}
mark_reg_known_zero(env, regs, regno);
- regs[regno].type = PTR_TO_BTF_ID;
+ regs[regno].type = PTR_TO_BTF_ID | flag;
regs[regno].btf = btf;
regs[regno].btf_id = btf_id;
}
@@ -1537,6 +1596,7 @@ static void init_func_state(struct bpf_verifier_env *env,
state->frameno = frameno;
state->subprogno = subprogno;
init_reg_state(env, state);
+ mark_verifier_state_scratched(env);
}
/* Similar to push_stack(), but for async callbacks */
@@ -1692,7 +1752,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
}
static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
- s16 offset, struct module **btf_modp)
+ s16 offset)
{
struct bpf_kfunc_btf kf_btf = { .offset = offset };
struct bpf_kfunc_btf_tab *tab;
@@ -1746,8 +1806,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
kfunc_btf_cmp_by_off, NULL);
}
- if (btf_modp)
- *btf_modp = b->module;
return b->btf;
}
@@ -1764,8 +1822,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
}
static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
- u32 func_id, s16 offset,
- struct module **btf_modp)
+ u32 func_id, s16 offset)
{
if (offset) {
if (offset < 0) {
@@ -1776,7 +1833,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
return ERR_PTR(-EINVAL);
}
- return __find_kfunc_desc_btf(env, offset, btf_modp);
+ return __find_kfunc_desc_btf(env, offset);
}
return btf_vmlinux ?: ERR_PTR(-ENOENT);
}
@@ -1790,6 +1847,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
struct bpf_kfunc_desc *desc;
const char *func_name;
struct btf *desc_btf;
+ unsigned long call_imm;
unsigned long addr;
int err;
@@ -1839,7 +1897,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
prog_aux->kfunc_btf_tab = btf_tab;
}
- desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL);
+ desc_btf = find_kfunc_desc_btf(env, func_id, offset);
if (IS_ERR(desc_btf)) {
verbose(env, "failed to find BTF for kernel function\n");
return PTR_ERR(desc_btf);
@@ -1874,9 +1932,17 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
+ call_imm = BPF_CALL_IMM(addr);
+ /* Check whether or not the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel function %s is out of range\n",
+ func_name);
+ return -EINVAL;
+ }
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
- desc->imm = BPF_CALL_IMM(addr);
+ desc->imm = call_imm;
desc->offset = offset;
err = btf_distill_func_proto(&env->log, desc_btf,
func_proto, func_name,
@@ -2040,7 +2106,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,
break;
if (parent->live & REG_LIVE_DONE) {
verbose(env, "verifier BUG type %s var_off %lld off %d\n",
- reg_type_str[parent->type],
+ reg_type_str(env, parent->type),
parent->var_off.value, parent->off);
return -EFAULT;
}
@@ -2224,6 +2290,8 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return -EINVAL;
}
+ mark_reg_scratched(env, regno);
+
reg = &regs[regno];
rw64 = is_reg64(env, insn, regno, reg, t);
if (t == SRC_OP) {
@@ -2298,7 +2366,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
return NULL;
- desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL);
+ desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off);
if (IS_ERR(desc_btf))
return "<error>";
@@ -2328,7 +2396,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
if (insn->code == 0)
return 0;
- if (env->log.level & BPF_LOG_LEVEL) {
+ if (env->log.level & BPF_LOG_LEVEL2) {
verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
verbose(env, "%d: ", idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
@@ -2378,8 +2446,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
*/
if (insn->src_reg != BPF_REG_FP)
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
- return 0;
/* dreg = *(u64 *)[fp - off] was a fill from the stack.
* that [fp - off] slot contains scalar that needs to be
@@ -2402,8 +2468,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx,
/* scalars can only be spilled into stack */
if (insn->dst_reg != BPF_REG_FP)
return 0;
- if (BPF_SIZE(insn->code) != BPF_DW)
- return 0;
spi = (-insn->off - 1) / BPF_REG_SIZE;
if (spi >= 64) {
verbose(env, "BUG spi %d\n", spi);
@@ -2586,7 +2650,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
DECLARE_BITMAP(mask, 64);
u32 history = st->jmp_history_cnt;
- if (env->log.level & BPF_LOG_LEVEL)
+ if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
for (i = last_idx;;) {
if (skip_first) {
@@ -2673,11 +2737,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
new_marks = true;
reg->precise = true;
}
- if (env->log.level & BPF_LOG_LEVEL) {
- print_verifier_state(env, func);
- verbose(env, "parent %s regs=%x stack=%llx marks\n",
+ if (env->log.level & BPF_LOG_LEVEL2) {
+ verbose(env, "parent %s regs=%x stack=%llx marks:",
new_marks ? "didn't have" : "already had",
reg_mask, stack_mask);
+ print_verifier_state(env, func, true);
}
if (!reg_mask && !stack_mask)
@@ -2703,9 +2767,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
static bool is_spillable_regtype(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_MAP_VALUE:
- case PTR_TO_MAP_VALUE_OR_NULL:
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
@@ -2714,21 +2777,12 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_FLOW_KEYS:
case CONST_PTR_TO_MAP:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
- case PTR_TO_RDONLY_BUF:
- case PTR_TO_RDONLY_BUF_OR_NULL:
- case PTR_TO_RDWR_BUF:
- case PTR_TO_RDWR_BUF_OR_NULL:
- case PTR_TO_PERCPU_BTF_ID:
+ case PTR_TO_BUF:
case PTR_TO_MEM:
- case PTR_TO_MEM_OR_NULL:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
return true;
@@ -2833,6 +2887,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env,
env->insn_aux_data[insn_idx].sanitize_stack_spill = true;
}
+ mark_stack_slot_scratched(env, spi);
if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
@@ -2954,6 +3009,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env,
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE];
+ mark_stack_slot_scratched(env, spi);
if (!env->allow_ptr_leaks
&& *stype != NOT_INIT
@@ -3370,11 +3426,8 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
/* We may have adjusted the register pointing to memory region, so we
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
- */
- if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, state);
-
- /* The minimum value is only important with signed
+ *
+ * The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
@@ -3459,11 +3512,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
#define MAX_PACKET_OFF 0xffff
-static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
-{
- return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
-}
-
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_access_type t)
@@ -3569,7 +3617,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
*/
*reg_type = info.reg_type;
- if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
+ if (base_type(*reg_type) == PTR_TO_BTF_ID) {
*btf = info.btf;
*btf_id = info.btf_id;
} else {
@@ -3637,7 +3685,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
}
verbose(env, "R%d invalid %s access off=%d size=%d\n",
- regno, reg_type_str[reg->type], off, size);
+ regno, reg_type_str(env, reg->type), off, size);
return -EACCES;
}
@@ -3932,16 +3980,23 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env,
}
#endif
-int check_ctx_reg(struct bpf_verifier_env *env,
- const struct bpf_reg_state *reg, int regno)
+static int __check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ bool fixed_off_ok)
{
- /* Access to ctx or passing it to a helper is only allowed in
- * its original, unmodified form.
+ /* Access to this pointer-typed register or passing it to a helper
+ * is only allowed in its original, unmodified form.
*/
- if (reg->off) {
- verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n",
- regno, reg->off);
+ if (reg->off < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
+ if (!fixed_off_ok && reg->off) {
+ verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
return -EACCES;
}
@@ -3949,13 +4004,20 @@ int check_ctx_reg(struct bpf_verifier_env *env,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf);
+ verbose(env, "variable %s access var_off=%s disallowed\n",
+ reg_type_str(env, reg->type), tn_buf);
return -EACCES;
}
return 0;
}
+int check_ptr_off_reg(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno)
+{
+ return __check_ptr_off_reg(env, reg, regno, false);
+}
+
static int __check_buffer_access(struct bpf_verifier_env *env,
const char *buf_info,
const struct bpf_reg_state *reg,
@@ -4000,9 +4062,9 @@ static int check_buffer_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int regno, int off, int size,
bool zero_size_allowed,
- const char *buf_info,
u32 *max_access)
{
+ const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
int err;
err = __check_buffer_access(env, buf_info, reg, regno, off, size);
@@ -4055,7 +4117,22 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size)
static bool bpf_map_is_rdonly(const struct bpf_map *map)
{
- return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen;
+ /* A map is considered read-only if the following condition are true:
+ *
+ * 1) BPF program side cannot change any of the map content. The
+ * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map
+ * and was set at map creation time.
+ * 2) The map value(s) have been initialized from user space by a
+ * loader and then "frozen", such that no new map update/delete
+ * operations from syscall side are possible for the rest of
+ * the map's lifetime from that point onwards.
+ * 3) Any parallel/pending map update/delete operations from syscall
+ * side have been completed. Only after that point, it's safe to
+ * assume that map value(s) are immutable.
+ */
+ return (map->map_flags & BPF_F_RDONLY_PROG) &&
+ READ_ONCE(map->frozen) &&
+ !bpf_map_write_active(map);
}
static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val)
@@ -4097,6 +4174,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+ enum bpf_type_flag flag = 0;
u32 btf_id;
int ret;
@@ -4116,9 +4194,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
+ if (reg->type & MEM_USER) {
+ verbose(env,
+ "R%d is ptr_%s access user memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (reg->type & MEM_PERCPU) {
+ verbose(env,
+ "R%d is ptr_%s access percpu memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
if (env->ops->btf_struct_access) {
ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
- off, size, atype, &btf_id);
+ off, size, atype, &btf_id, &flag);
} else {
if (atype != BPF_READ) {
verbose(env, "only read is supported\n");
@@ -4126,14 +4218,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
}
ret = btf_struct_access(&env->log, reg->btf, t, off, size,
- atype, &btf_id);
+ atype, &btf_id, &flag);
}
if (ret < 0)
return ret;
if (atype == BPF_READ && value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id);
+ mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
return 0;
}
@@ -4146,6 +4238,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
{
struct bpf_reg_state *reg = regs + regno;
struct bpf_map *map = reg->map_ptr;
+ enum bpf_type_flag flag = 0;
const struct btf_type *t;
const char *tname;
u32 btf_id;
@@ -4183,12 +4276,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id);
+ ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag);
if (ret < 0)
return ret;
if (value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id);
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
return 0;
}
@@ -4349,15 +4442,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
mark_reg_unknown(env, regs, value_regno);
}
}
- } else if (reg->type == PTR_TO_MEM) {
+ } else if (base_type(reg->type) == PTR_TO_MEM) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+
+ if (type_may_be_null(reg->type)) {
+ verbose(env, "R%d invalid mem access '%s'\n", regno,
+ reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
+ if (t == BPF_WRITE && rdonly_mem) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+
if (t == BPF_WRITE && value_regno >= 0 &&
is_pointer_value(env, value_regno)) {
verbose(env, "R%d leaks addr into mem\n", value_regno);
return -EACCES;
}
+
err = check_mem_region_access(env, regno, off, size,
reg->mem_size, false);
- if (!err && t == BPF_READ && value_regno >= 0)
+ if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
mark_reg_unknown(env, regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
@@ -4370,11 +4478,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
return -EACCES;
}
- err = check_ctx_reg(env, reg, regno);
+ err = check_ptr_off_reg(env, reg, regno);
if (err < 0)
return err;
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id);
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
+ &btf_id);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -4387,7 +4496,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
mark_reg_known_zero(env, regs,
value_regno);
- if (reg_type_may_be_null(reg_type))
+ if (type_may_be_null(reg_type))
regs[value_regno].id = ++env->id_gen;
/* A load of ctx field could have different
* actual load size with the one encoded in the
@@ -4395,8 +4504,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* a sub-register.
*/
regs[value_regno].subreg_def = DEF_NOT_SUBREG;
- if (reg_type == PTR_TO_BTF_ID ||
- reg_type == PTR_TO_BTF_ID_OR_NULL) {
+ if (base_type(reg_type) == PTR_TO_BTF_ID) {
regs[value_regno].btf = btf;
regs[value_regno].btf_id = btf_id;
}
@@ -4449,7 +4557,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (type_is_sk_pointer(reg->type)) {
if (t == BPF_WRITE) {
verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str[reg->type]);
+ regno, reg_type_str(env, reg->type));
return -EACCES;
}
err = check_sock_access(env, insn_idx, regno, off, size, t);
@@ -4459,32 +4567,36 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_tp_buffer_access(env, reg, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_BTF_ID) {
+ } else if (base_type(reg->type) == PTR_TO_BTF_ID &&
+ !type_may_be_null(reg->type)) {
err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
value_regno);
} else if (reg->type == CONST_PTR_TO_MAP) {
err = check_ptr_to_map_access(env, regs, regno, off, size, t,
value_regno);
- } else if (reg->type == PTR_TO_RDONLY_BUF) {
- if (t == BPF_WRITE) {
- verbose(env, "R%d cannot write into %s\n",
- regno, reg_type_str[reg->type]);
- return -EACCES;
+ } else if (base_type(reg->type) == PTR_TO_BUF) {
+ bool rdonly_mem = type_is_rdonly_mem(reg->type);
+ u32 *max_access;
+
+ if (rdonly_mem) {
+ if (t == BPF_WRITE) {
+ verbose(env, "R%d cannot write into %s\n",
+ regno, reg_type_str(env, reg->type));
+ return -EACCES;
+ }
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ max_access = &env->prog->aux->max_rdwr_access;
}
+
err = check_buffer_access(env, reg, regno, off, size, false,
- "rdonly",
- &env->prog->aux->max_rdonly_access);
- if (!err && value_regno >= 0)
- mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_RDWR_BUF) {
- err = check_buffer_access(env, reg, regno, off, size, false,
- "rdwr",
- &env->prog->aux->max_rdwr_access);
- if (!err && t == BPF_READ && value_regno >= 0)
+ max_access);
+
+ if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EACCES;
}
@@ -4535,9 +4647,16 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
if (insn->imm == BPF_CMPXCHG) {
/* Check comparison of R0 with memory location */
- err = check_reg_arg(env, BPF_REG_0, SRC_OP);
+ const u32 aux_reg = BPF_REG_0;
+
+ err = check_reg_arg(env, aux_reg, SRC_OP);
if (err)
return err;
+
+ if (is_pointer_value(env, aux_reg)) {
+ verbose(env, "R%d leaks addr into mem\n", aux_reg);
+ return -EACCES;
+ }
}
if (is_pointer_value(env, insn->src_reg)) {
@@ -4551,7 +4670,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
is_sk_reg(env, insn->dst_reg)) {
verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
return -EACCES;
}
@@ -4572,13 +4691,19 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
load_reg = -1;
}
- /* check whether we can read the memory */
+ /* Check whether we can read the memory, with second call for fetch
+ * case to simulate the register fill.
+ */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ, load_reg, true);
+ BPF_SIZE(insn->code), BPF_READ, -1, true);
+ if (!err && load_reg >= 0)
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, load_reg,
+ true);
if (err)
return err;
- /* check whether we can write into the same memory */
+ /* Check whether we can write into the same memory. */
err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1, true);
if (err)
@@ -4686,7 +4811,7 @@ static int check_stack_range_initialized(
}
if (is_spilled_reg(&state->stack[spi]) &&
- state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
+ base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID)
goto mark;
if (is_spilled_reg(&state->stack[spi]) &&
@@ -4728,8 +4853,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ u32 *max_access;
- switch (reg->type) {
+ switch (base_type(reg->type)) {
case PTR_TO_PACKET:
case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size,
@@ -4748,18 +4874,18 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_mem_region_access(env, regno, reg->off,
access_size, reg->mem_size,
zero_size_allowed);
- case PTR_TO_RDONLY_BUF:
- if (meta && meta->raw_mode)
- return -EACCES;
- return check_buffer_access(env, reg, regno, reg->off,
- access_size, zero_size_allowed,
- "rdonly",
- &env->prog->aux->max_rdonly_access);
- case PTR_TO_RDWR_BUF:
+ case PTR_TO_BUF:
+ if (type_is_rdonly_mem(reg->type)) {
+ if (meta && meta->raw_mode)
+ return -EACCES;
+
+ max_access = &env->prog->aux->max_rdonly_access;
+ } else {
+ max_access = &env->prog->aux->max_rdwr_access;
+ }
return check_buffer_access(env, reg, regno, reg->off,
access_size, zero_size_allowed,
- "rdwr",
- &env->prog->aux->max_rdwr_access);
+ max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
env,
@@ -4771,20 +4897,76 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
register_is_null(reg))
return 0;
- verbose(env, "R%d type=%s expected=%s\n", regno,
- reg_type_str[reg->type],
- reg_type_str[PTR_TO_STACK]);
+ verbose(env, "R%d type=%s ", regno,
+ reg_type_str(env, reg->type));
+ verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
return -EACCES;
}
}
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
+{
+ int err;
+
+ /* This is used to refine r0 return value bounds for helpers
+ * that enforce this value as an upper bound on return values.
+ * See do_refine_retval_range() for helpers that can refine
+ * the return value. C type of helper is u32 so we pull register
+ * bound from umax_value however, if negative verifier errors
+ * out. Only upper bounds can be learned because retval is an
+ * int type and negative retvals are allowed.
+ */
+ if (meta)
+ meta->msize_max_value = reg->umax_value;
+
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
+ */
+ if (!tnum_is_const(reg->var_off))
+ /* For unprivileged variable accesses, disable raw
+ * mode so that the program is required to
+ * initialize all the memory that the helper could
+ * just partially fill up.
+ */
+ meta = NULL;
+
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (reg->umin_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
+ if (err)
+ return err;
+ }
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
+ if (!err)
+ err = mark_chain_precision(env, regno);
+ return err;
+}
+
int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
u32 regno, u32 mem_size)
{
if (register_is_null(reg))
return 0;
- if (reg_type_may_be_null(reg->type)) {
+ if (type_may_be_null(reg->type)) {
/* Assuming that the register contains a value check if the memory
* access is safe. Temporarily save and restore the register's state as
* the conversion shouldn't be visible to a caller.
@@ -4801,6 +4983,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return check_helper_mem_access(env, regno, mem_size, true, NULL);
}
+int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
+{
+ struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+ bool may_be_null = type_may_be_null(mem_reg->type);
+ struct bpf_reg_state saved_reg;
+ int err;
+
+ WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+ if (may_be_null) {
+ saved_reg = *mem_reg;
+ mark_ptr_not_null_reg(mem_reg);
+ }
+
+ err = check_mem_size_reg(env, reg, regno, true, NULL);
+
+ if (may_be_null)
+ *mem_reg = saved_reg;
+ return err;
+}
+
/* Implementation details:
* bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
* Two bpf_map_lookups (even with the same key) will have different reg->id.
@@ -4932,9 +5136,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
{
- return type == ARG_PTR_TO_MEM ||
- type == ARG_PTR_TO_MEM_OR_NULL ||
- type == ARG_PTR_TO_UNINIT_MEM;
+ return base_type(type) == ARG_PTR_TO_MEM ||
+ base_type(type) == ARG_PTR_TO_UNINIT_MEM;
}
static bool arg_type_is_mem_size(enum bpf_arg_type type)
@@ -5039,8 +5242,8 @@ static const struct bpf_reg_types mem_types = {
PTR_TO_MAP_KEY,
PTR_TO_MAP_VALUE,
PTR_TO_MEM,
- PTR_TO_RDONLY_BUF,
- PTR_TO_RDWR_BUF,
+ PTR_TO_MEM | MEM_ALLOC,
+ PTR_TO_BUF,
},
};
@@ -5057,11 +5260,11 @@ static const struct bpf_reg_types int_ptr_types = {
static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } };
static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } };
static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } };
-static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } };
+static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } };
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } };
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
@@ -5071,31 +5274,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
[ARG_PTR_TO_MAP_VALUE] = &map_key_value_types,
[ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types,
- [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types,
[ARG_CONST_SIZE] = &scalar_types,
[ARG_CONST_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types,
[ARG_CONST_MAP_PTR] = &const_map_ptr_types,
[ARG_PTR_TO_CTX] = &context_types,
- [ARG_PTR_TO_CTX_OR_NULL] = &context_types,
[ARG_PTR_TO_SOCK_COMMON] = &sock_types,
#ifdef CONFIG_NET
[ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
#endif
[ARG_PTR_TO_SOCKET] = &fullsock_types,
- [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
[ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types,
[ARG_PTR_TO_MEM] = &mem_types,
- [ARG_PTR_TO_MEM_OR_NULL] = &mem_types,
[ARG_PTR_TO_UNINIT_MEM] = &mem_types,
[ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types,
- [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
[ARG_PTR_TO_FUNC] = &func_ptr_types,
- [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types,
+ [ARG_PTR_TO_STACK] = &stack_ptr_types,
[ARG_PTR_TO_CONST_STR] = &const_str_ptr_types,
[ARG_PTR_TO_TIMER] = &timer_types,
};
@@ -5109,12 +5307,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
const struct bpf_reg_types *compatible;
int i, j;
- compatible = compatible_reg_types[arg_type];
+ compatible = compatible_reg_types[base_type(arg_type)];
if (!compatible) {
verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
return -EFAULT;
}
+ /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
+ *
+ * Same for MAYBE_NULL:
+ *
+ * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
+ * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
+ *
+ * Therefore we fold these flags depending on the arg_type before comparison.
+ */
+ if (arg_type & MEM_RDONLY)
+ type &= ~MEM_RDONLY;
+ if (arg_type & PTR_MAYBE_NULL)
+ type &= ~PTR_MAYBE_NULL;
+
for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
expected = compatible->types[i];
if (expected == NOT_INIT)
@@ -5124,14 +5337,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
goto found;
}
- verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
+ verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
for (j = 0; j + 1 < i; j++)
- verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
- verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
+ verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
+ verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
return -EACCES;
found:
- if (type == PTR_TO_BTF_ID) {
+ if (reg->type == PTR_TO_BTF_ID) {
if (!arg_btf_id) {
if (!compatible->btf_id) {
verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
@@ -5147,15 +5360,63 @@ found:
kernel_type_name(btf_vmlinux, *arg_btf_id));
return -EACCES;
}
+ }
+
+ return 0;
+}
- if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
- verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n",
+int check_func_arg_reg_off(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ enum bpf_arg_type arg_type,
+ bool is_release_func)
+{
+ bool fixed_off_ok = false, release_reg;
+ enum bpf_reg_type type = reg->type;
+
+ switch ((u32)type) {
+ case SCALAR_VALUE:
+ /* Pointer types where reg offset is explicitly allowed: */
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_MEM | MEM_RDONLY:
+ case PTR_TO_MEM | MEM_ALLOC:
+ case PTR_TO_BUF:
+ case PTR_TO_BUF | MEM_RDONLY:
+ case PTR_TO_STACK:
+ /* Some of the argument types nevertheless require a
+ * zero register offset.
+ */
+ if (arg_type != ARG_PTR_TO_ALLOC_MEM)
+ return 0;
+ break;
+ /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
+ * fixed offset.
+ */
+ case PTR_TO_BTF_ID:
+ /* When referenced PTR_TO_BTF_ID is passed to release function,
+ * it's fixed offset must be 0. We rely on the property that
+ * only one referenced register can be passed to BPF helpers and
+ * kfuncs. In the other cases, fixed offset can be non-zero.
+ */
+ release_reg = is_release_func && reg->ref_obj_id;
+ if (release_reg && reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func\n",
regno);
- return -EACCES;
+ return -EINVAL;
}
+ /* For release_reg == true, fixed_off_ok must be false, but we
+ * already checked and rejected reg->off != 0 above, so set to
+ * true to allow fixed offset for all other cases.
+ */
+ fixed_off_ok = true;
+ break;
+ default:
+ break;
}
-
- return 0;
+ return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
}
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
@@ -5190,15 +5451,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
return -EACCES;
}
- if (arg_type == ARG_PTR_TO_MAP_VALUE ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
- arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
+ if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
+ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
err = resolve_map_arg_type(env, meta, &arg_type);
if (err)
return err;
}
- if (register_is_null(reg) && arg_type_may_be_null(arg_type))
+ if (register_is_null(reg) && type_may_be_null(arg_type))
/* A NULL register has a SCALAR_VALUE type, so skip
* type checking.
*/
@@ -5208,13 +5468,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
if (err)
return err;
- if (type == PTR_TO_CTX) {
- err = check_ctx_reg(env, reg, regno);
- if (err < 0)
- return err;
- }
+ err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id));
+ if (err)
+ return err;
skip_type_check:
+ /* check_func_arg_reg_off relies on only one referenced register being
+ * allowed for BPF helpers.
+ */
if (reg->ref_obj_id) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -5267,10 +5528,11 @@ skip_type_check:
err = check_helper_mem_access(env, regno,
meta->map_ptr->key_size, false,
NULL);
- } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
- (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
- !register_is_null(reg)) ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
+ base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ if (type_may_be_null(arg_type) && register_is_null(reg))
+ return 0;
+
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
*/
@@ -5314,51 +5576,7 @@ skip_type_check:
} else if (arg_type_is_mem_size(arg_type)) {
bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
- /* This is used to refine r0 return value bounds for helpers
- * that enforce this value as an upper bound on return values.
- * See do_refine_retval_range() for helpers that can refine
- * the return value. C type of helper is u32 so we pull register
- * bound from umax_value however, if negative verifier errors
- * out. Only upper bounds can be learned because retval is an
- * int type and negative retvals are allowed.
- */
- meta->msize_max_value = reg->umax_value;
-
- /* The register is SCALAR_VALUE; the access check
- * happens using its boundaries.
- */
- if (!tnum_is_const(reg->var_off))
- /* For unprivileged variable accesses, disable raw
- * mode so that the program is required to
- * initialize all the memory that the helper could
- * just partially fill up.
- */
- meta = NULL;
-
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->umin_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
- if (err)
- return err;
- }
-
- if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
- }
- err = check_helper_mem_access(env, regno - 1,
- reg->umax_value,
- zero_size_allowed, meta);
- if (!err)
- err = mark_chain_precision(env, regno);
+ err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
} else if (arg_type_is_alloc_size(arg_type)) {
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a known constant'\n",
@@ -5934,6 +6152,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
if (insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
insn->imm == BPF_FUNC_timer_set_callback) {
struct bpf_verifier_state *async_cb;
@@ -5992,9 +6211,9 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
verbose(env, "callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, callee, true);
}
return 0;
}
@@ -6085,6 +6304,27 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_loop_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx,
+ * u64 flags);
+ * callback_fn(u32 index, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1].type = SCALAR_VALUE;
+ callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_3]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int set_timer_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee,
@@ -6114,6 +6354,33 @@ static int set_timer_callback_state(struct bpf_verifier_env *env,
return 0;
}
+static int set_find_vma_callback_state(struct bpf_verifier_env *env,
+ struct bpf_func_state *caller,
+ struct bpf_func_state *callee,
+ int insn_idx)
+{
+ /* bpf_find_vma(struct task_struct *task, u64 addr,
+ * void *callback_fn, void *callback_ctx, u64 flags)
+ * (callback_fn)(struct task_struct *task,
+ * struct vm_area_struct *vma, void *callback_ctx);
+ */
+ callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
+
+ callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID;
+ __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
+ callee->regs[BPF_REG_2].btf = btf_vmlinux;
+ callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA],
+
+ /* pointer to stack or null */
+ callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4];
+
+ /* unused */
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_4]);
+ __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
+ callee->in_callback_fn = true;
+ return 0;
+}
+
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
struct bpf_verifier_state *state = env->cur_state;
@@ -6161,9 +6428,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
*insn_idx = callee->callsite + 1;
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
- print_verifier_state(env, callee);
+ print_verifier_state(env, callee, true);
verbose(env, "to caller at %d:\n", *insn_idx);
- print_verifier_state(env, caller);
+ print_verifier_state(env, caller, true);
}
/* clear everything in the callee */
free_func_state(callee);
@@ -6329,13 +6596,11 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env,
static int check_get_func_ip(struct bpf_verifier_env *env)
{
- enum bpf_attach_type eatype = env->prog->expected_attach_type;
enum bpf_prog_type type = resolve_prog_type(env->prog);
int func_id = BPF_FUNC_get_func_ip;
if (type == BPF_PROG_TYPE_TRACING) {
- if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT &&
- eatype != BPF_MODIFY_RETURN) {
+ if (!bpf_prog_has_trampoline(env->prog)) {
verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n",
func_id_name(func_id), func_id);
return -ENOTSUPP;
@@ -6354,6 +6619,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
int *insn_idx_p)
{
const struct bpf_func_proto *fn = NULL;
+ enum bpf_return_type ret_type;
+ enum bpf_type_flag ret_flag;
struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
int insn_idx = *insn_idx_p;
@@ -6431,13 +6698,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return err;
}
- if (func_id == BPF_FUNC_tail_call) {
- err = check_reference_leak(env);
- if (err) {
- verbose(env, "tail_call would lead to reference leak\n");
- return err;
- }
- } else if (is_release_function(func_id)) {
+ if (is_release_function(func_id)) {
err = release_reference(env, meta.ref_obj_id);
if (err) {
verbose(env, "func %s#%d reference has not been acquired before\n",
@@ -6448,35 +6709,47 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs = cur_regs(env);
- /* check that flags argument in get_local_storage(map, flags) is 0,
- * this is required because get_local_storage() can't return an error.
- */
- if (func_id == BPF_FUNC_get_local_storage &&
- !register_is_null(&regs[BPF_REG_2])) {
- verbose(env, "get_local_storage() doesn't support non-zero flags\n");
- return -EINVAL;
- }
-
- if (func_id == BPF_FUNC_for_each_map_elem) {
+ switch (func_id) {
+ case BPF_FUNC_tail_call:
+ err = check_reference_leak(env);
+ if (err) {
+ verbose(env, "tail_call would lead to reference leak\n");
+ return err;
+ }
+ break;
+ case BPF_FUNC_get_local_storage:
+ /* check that flags argument in get_local_storage(map, flags) is 0,
+ * this is required because get_local_storage() can't return an error.
+ */
+ if (!register_is_null(&regs[BPF_REG_2])) {
+ verbose(env, "get_local_storage() doesn't support non-zero flags\n");
+ return -EINVAL;
+ }
+ break;
+ case BPF_FUNC_for_each_map_elem:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_map_elem_callback_state);
- if (err < 0)
- return -EINVAL;
- }
-
- if (func_id == BPF_FUNC_timer_set_callback) {
+ break;
+ case BPF_FUNC_timer_set_callback:
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_timer_callback_state);
- if (err < 0)
- return -EINVAL;
- }
-
- if (func_id == BPF_FUNC_snprintf) {
+ break;
+ case BPF_FUNC_find_vma:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_find_vma_callback_state);
+ break;
+ case BPF_FUNC_snprintf:
err = check_bpf_snprintf_call(env, regs);
- if (err < 0)
- return err;
+ break;
+ case BPF_FUNC_loop:
+ err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
+ set_loop_callback_state);
+ break;
}
+ if (err)
+ return err;
+
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
@@ -6487,13 +6760,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
/* update return register (already marked as written above) */
- if (fn->ret_type == RET_INTEGER) {
+ ret_type = fn->ret_type;
+ ret_flag = type_flag(fn->ret_type);
+ if (ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
mark_reg_unknown(env, regs, BPF_REG_0);
- } else if (fn->ret_type == RET_VOID) {
+ } else if (ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
- fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+ } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
/* remember map_ptr, so that check_map_access()
@@ -6507,28 +6781,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].map_uid = meta.map_uid;
- if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
- if (map_value_has_spin_lock(meta.map_ptr))
- regs[BPF_REG_0].id = ++env->id_gen;
- } else {
- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
+ if (!type_may_be_null(ret_type) &&
+ map_value_has_spin_lock(meta.map_ptr)) {
+ regs[BPF_REG_0].id = ++env->id_gen;
}
- } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
+ } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
+ } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
+ } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
- } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
+ } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = meta.mem_size;
- } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
+ } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -6546,29 +6817,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
tname, PTR_ERR(ret));
return -EINVAL;
}
- regs[BPF_REG_0].type =
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
- PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
regs[BPF_REG_0].mem_size = tsize;
} else {
- regs[BPF_REG_0].type =
- fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
- PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
+ /* MEM_RDONLY may be carried from ret_flag, but it
+ * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
+ * it will confuse the check of PTR_TO_BTF_ID in
+ * check_mem_access().
+ */
+ ret_flag &= ~MEM_RDONLY;
+
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
regs[BPF_REG_0].btf = meta.ret_btf;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
- } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
- fn->ret_type == RET_PTR_TO_BTF_ID) {
+ } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
int ret_btf_id;
mark_reg_known_zero(env, regs, BPF_REG_0);
- regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ?
- PTR_TO_BTF_ID :
- PTR_TO_BTF_ID_OR_NULL;
+ regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
ret_btf_id = *fn->ret_btf_id;
if (ret_btf_id == 0) {
- verbose(env, "invalid return type %d of func %s#%d\n",
- fn->ret_type, func_id_name(func_id), func_id);
+ verbose(env, "invalid return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id),
+ func_id);
return -EINVAL;
}
/* current BPF helper definitions are only coming from
@@ -6577,12 +6849,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
regs[BPF_REG_0].btf = btf_vmlinux;
regs[BPF_REG_0].btf_id = ret_btf_id;
} else {
- verbose(env, "unknown return type %d of func %s#%d\n",
- fn->ret_type, func_id_name(func_id), func_id);
+ verbose(env, "unknown return type %u of func %s#%d\n",
+ base_type(ret_type), func_id_name(func_id), func_id);
return -EINVAL;
}
- if (reg_type_may_be_null(regs[BPF_REG_0].type))
+ if (type_may_be_null(regs[BPF_REG_0].type))
regs[BPF_REG_0].id = ++env->id_gen;
if (is_ptr_cast_function(func_id)) {
@@ -6663,22 +6935,23 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
{
const struct btf_type *t, *func, *func_proto, *ptr_type;
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
u32 i, nargs, func_id, ptr_type_id;
- struct module *btf_mod = NULL;
+ int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
struct btf *desc_btf;
- int err;
+ bool acq;
/* skip for now, but return error when we find this in fixup_kfunc_call */
if (!insn->imm)
return 0;
- desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod);
+ desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off);
if (IS_ERR(desc_btf))
return PTR_ERR(desc_btf);
@@ -6687,23 +6960,43 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
func_name = btf_name_by_offset(desc_btf, func->name_off);
func_proto = btf_type_by_id(desc_btf, func->type);
- if (!env->ops->check_kfunc_call ||
- !env->ops->check_kfunc_call(func_id, btf_mod)) {
+ if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_CHECK, func_id)) {
verbose(env, "calling kernel function %s is not allowed\n",
func_name);
return -EACCES;
}
+ acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_ACQUIRE, func_id);
+
/* Check the arguments */
err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
- if (err)
+ if (err < 0)
return err;
+ /* In case of release function, we get register number of refcounted
+ * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now
+ */
+ if (err) {
+ err = release_reference(env, regs[err].ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, func_id);
+ return err;
+ }
+ }
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+
+ if (acq && !btf_type_is_ptr(t)) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
+
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
@@ -6722,7 +7015,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
+ if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_RET_NULL, func_id)) {
+ regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
+ /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+ if (acq) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ regs[BPF_REG_0].id = id;
+ regs[BPF_REG_0].ref_obj_id = id;
+ }
} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
nargs = btf_type_vlen(func_proto);
@@ -6791,25 +7098,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
verbose(env, "math between %s pointer and %lld is not allowed\n",
- reg_type_str[type], val);
+ reg_type_str(env, type), val);
return false;
}
if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
verbose(env, "%s pointer offset %d is not allowed\n",
- reg_type_str[type], reg->off);
+ reg_type_str(env, type), reg->off);
return false;
}
if (smin == S64_MIN) {
verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
- reg_type_str[type]);
+ reg_type_str(env, type));
return false;
}
if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
verbose(env, "value %lld makes %s pointer be out of bounds\n",
- smin, reg_type_str[type]);
+ smin, reg_type_str(env, type));
return false;
}
@@ -7186,11 +7493,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
return -EACCES;
}
- switch (ptr_reg->type) {
- case PTR_TO_MAP_VALUE_OR_NULL:
+ if (ptr_reg->type & PTR_MAYBE_NULL) {
verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
+ }
+
+ switch (base_type(ptr_reg->type)) {
case CONST_PTR_TO_MAP:
/* smin_val represents the known value */
if (known && smin_val == 0 && opcode == BPF_ADD)
@@ -7198,14 +7507,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
fallthrough;
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
- dst, reg_type_str[ptr_reg->type]);
+ dst, reg_type_str(env, ptr_reg->type));
return -EACCES;
default:
break;
@@ -8178,12 +8484,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, state);
+ print_verifier_state(env, state, true);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
@@ -8292,6 +8598,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->dst_reg);
}
zext_32_to_64(dst_reg);
+
+ __update_reg_bounds(dst_reg);
+ __reg_deduce_bounds(dst_reg);
+ __reg_bound_offset(dst_reg);
}
} else {
/* case: R = imm
@@ -8406,7 +8716,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
new_range = dst_reg->off;
if (range_right_open)
- new_range--;
+ new_range++;
/* Examples for register markings:
*
@@ -8924,17 +9234,17 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
struct bpf_reg_state *reg, u32 id,
bool is_null)
{
- if (reg_type_may_be_null(reg->type) && reg->id == id &&
+ if (type_may_be_null(reg->type) && reg->id == id &&
!WARN_ON_ONCE(!reg->id)) {
- /* Old offset (both fixed and variable parts) should
- * have been known-zero, because we don't allow pointer
- * arithmetic on pointers that might be NULL.
- */
if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
!tnum_equals_const(reg->var_off, 0) ||
reg->off)) {
- __mark_reg_known_zero(reg);
- reg->off = 0;
+ /* Old offset (both fixed and variable parts) should
+ * have been known-zero, because we don't allow pointer
+ * arithmetic on pointers that might be NULL. If we
+ * see this happening, don't convert the register.
+ */
+ return;
}
if (is_null) {
reg->type = SCALAR_VALUE;
@@ -9302,7 +9612,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*/
if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
- reg_type_may_be_null(dst_reg->type)) {
+ type_may_be_null(dst_reg->type)) {
/* Mark all identical registers in each branch as either
* safe or unknown depending R == 0 or R != 0 conditional.
*/
@@ -9318,7 +9628,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
- print_verifier_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
@@ -9353,16 +9663,19 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
- if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
- mark_reg_known_zero(env, regs, insn->dst_reg);
+ /* All special src_reg cases are listed below. From this point onwards
+ * we either succeed and assign a corresponding dst_reg->type after
+ * zeroing the offset, or fail and reject the program.
+ */
+ mark_reg_known_zero(env, regs, insn->dst_reg);
+ if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
dst_reg->type = aux->btf_var.reg_type;
- switch (dst_reg->type) {
+ switch (base_type(dst_reg->type)) {
case PTR_TO_MEM:
dst_reg->mem_size = aux->btf_var.mem_size;
break;
case PTR_TO_BTF_ID:
- case PTR_TO_PERCPU_BTF_ID:
dst_reg->btf = aux->btf_var.btf;
dst_reg->btf_id = aux->btf_var.btf_id;
break;
@@ -9393,7 +9706,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
map = env->used_maps[aux->map_index];
- mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->map_ptr = map;
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE ||
@@ -9497,7 +9809,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return err;
}
- err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg);
+ err = check_ptr_off_reg(env, &regs[ctx_reg], ctx_reg);
if (err < 0)
return err;
@@ -9556,7 +9868,7 @@ static int check_return_code(struct bpf_verifier_env *env)
/* enforce return zero from async callbacks like timer */
if (reg->type != SCALAR_VALUE) {
verbose(env, "In async callback the register R0 is not a known value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -9570,7 +9882,7 @@ static int check_return_code(struct bpf_verifier_env *env)
if (is_subprog) {
if (reg->type != SCALAR_VALUE) {
verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
return 0;
@@ -9634,7 +9946,7 @@ static int check_return_code(struct bpf_verifier_env *env)
if (reg->type != SCALAR_VALUE) {
verbose(env, "At program exit the register R0 is not a known value (%s)\n",
- reg_type_str[reg->type]);
+ reg_type_str(env, reg->type));
return -EINVAL;
}
@@ -10088,8 +10400,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
-#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
- sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
static int check_btf_line(struct bpf_verifier_env *env,
@@ -10217,6 +10528,78 @@ err_free:
return err;
}
+#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo)
+#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE
+
+static int check_core_relo(struct bpf_verifier_env *env,
+ const union bpf_attr *attr,
+ bpfptr_t uattr)
+{
+ u32 i, nr_core_relo, ncopy, expected_size, rec_size;
+ struct bpf_core_relo core_relo = {};
+ struct bpf_prog *prog = env->prog;
+ const struct btf *btf = prog->aux->btf;
+ struct bpf_core_ctx ctx = {
+ .log = &env->log,
+ .btf = btf,
+ };
+ bpfptr_t u_core_relo;
+ int err;
+
+ nr_core_relo = attr->core_relo_cnt;
+ if (!nr_core_relo)
+ return 0;
+ if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo))
+ return -EINVAL;
+
+ rec_size = attr->core_relo_rec_size;
+ if (rec_size < MIN_CORE_RELO_SIZE ||
+ rec_size > MAX_CORE_RELO_SIZE ||
+ rec_size % sizeof(u32))
+ return -EINVAL;
+
+ u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel);
+ expected_size = sizeof(struct bpf_core_relo);
+ ncopy = min_t(u32, expected_size, rec_size);
+
+ /* Unlike func_info and line_info, copy and apply each CO-RE
+ * relocation record one at a time.
+ */
+ for (i = 0; i < nr_core_relo; i++) {
+ /* future proofing when sizeof(bpf_core_relo) changes */
+ err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size);
+ if (err) {
+ if (err == -E2BIG) {
+ verbose(env, "nonzero tailing record in core_relo");
+ if (copy_to_bpfptr_offset(uattr,
+ offsetof(union bpf_attr, core_relo_rec_size),
+ &expected_size, sizeof(expected_size)))
+ err = -EFAULT;
+ }
+ break;
+ }
+
+ if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) {
+ err = -EFAULT;
+ break;
+ }
+
+ if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) {
+ verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n",
+ i, core_relo.insn_off, prog->len);
+ err = -EINVAL;
+ break;
+ }
+
+ err = bpf_core_apply(&ctx, &core_relo, i,
+ &prog->insnsi[core_relo.insn_off / 8]);
+ if (err)
+ break;
+ bpfptr_add(&u_core_relo, rec_size);
+ }
+ return err;
+}
+
static int check_btf_info(struct bpf_verifier_env *env,
const union bpf_attr *attr,
bpfptr_t uattr)
@@ -10247,6 +10630,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
if (err)
return err;
+ err = check_core_relo(env, attr, uattr);
+ if (err)
+ return err;
+
return 0;
}
@@ -10415,7 +10802,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return true;
if (rcur->type == NOT_INIT)
return false;
- switch (rold->type) {
+ switch (base_type(rold->type)) {
case SCALAR_VALUE:
if (env->explore_alu_limits)
return false;
@@ -10437,6 +10824,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
}
case PTR_TO_MAP_KEY:
case PTR_TO_MAP_VALUE:
+ /* a PTR_TO_MAP_VALUE could be safe to use as a
+ * PTR_TO_MAP_VALUE_OR_NULL into the same map.
+ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
+ * checked, doing so could have affected others with the same
+ * id, and we can't check for that because we lost the id when
+ * we converted to a PTR_TO_MAP_VALUE.
+ */
+ if (type_may_be_null(rold->type)) {
+ if (!type_may_be_null(rcur->type))
+ return false;
+ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
+ return false;
+ /* Check our ids match any regs they're supposed to */
+ return check_ids(rold->id, rcur->id, idmap);
+ }
+
/* If the new min/max/var_off satisfy the old ones and
* everything else matches, we are OK.
* 'id' is not compared, since it's only used for maps with
@@ -10448,20 +10851,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
- case PTR_TO_MAP_VALUE_OR_NULL:
- /* a PTR_TO_MAP_VALUE could be safe to use as a
- * PTR_TO_MAP_VALUE_OR_NULL into the same map.
- * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
- * checked, doing so could have affected others with the same
- * id, and we can't check for that because we lost the id when
- * we converted to a PTR_TO_MAP_VALUE.
- */
- if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
- return false;
- if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
- return false;
- /* Check our ids match any regs they're supposed to */
- return check_ids(rold->id, rcur->id, idmap);
case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
if (rcur->type != rold->type)
@@ -10490,11 +10879,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
case PTR_TO_PACKET_END:
case PTR_TO_FLOW_KEYS:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
/* Only valid matches are exact, which memcmp() above
* would have accepted
@@ -11020,17 +11406,13 @@ next:
/* Return true if it's OK to have the same insn return a different type. */
static bool reg_type_mismatch_ok(enum bpf_reg_type type)
{
- switch (type) {
+ switch (base_type(type)) {
case PTR_TO_CTX:
case PTR_TO_SOCKET:
- case PTR_TO_SOCKET_OR_NULL:
case PTR_TO_SOCK_COMMON:
- case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
- case PTR_TO_TCP_SOCK_OR_NULL:
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
- case PTR_TO_BTF_ID_OR_NULL:
return false;
default:
return true;
@@ -11110,16 +11492,12 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (env->log.level & BPF_LOG_LEVEL2 ||
- (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
- if (env->log.level & BPF_LOG_LEVEL2)
- verbose(env, "%d:", env->insn_idx);
- else
- verbose(env, "\nfrom %d to %d%s:",
- env->prev_insn_idx, env->insn_idx,
- env->cur_state->speculative ?
- " (speculative execution)" : "");
- print_verifier_state(env, state->frame[state->curframe]);
+ if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) {
+ verbose(env, "\nfrom %d to %d%s:",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ print_verifier_state(env, state->frame[state->curframe], true);
do_print_state = false;
}
@@ -11130,9 +11508,15 @@ static int do_check(struct bpf_verifier_env *env)
.private_data = env,
};
+ if (verifier_state_scratched(env))
+ print_insn_state(env, state->frame[state->curframe]);
+
verbose_linfo(env, env->insn_idx, "; ");
+ env->prev_log_len = env->log.len_used;
verbose(env, "%d: ", env->insn_idx);
print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ env->prev_insn_print_len = env->log.len_used - env->prev_log_len;
+ env->prev_log_len = env->log.len_used;
}
if (bpf_prog_is_dev_bound(env->prog->aux)) {
@@ -11254,7 +11638,7 @@ static int do_check(struct bpf_verifier_env *env)
if (is_ctx_reg(env, insn->dst_reg)) {
verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
insn->dst_reg,
- reg_type_str[reg_state(env, insn->dst_reg)->type]);
+ reg_type_str(env, reg_state(env, insn->dst_reg)->type));
return -EACCES;
}
@@ -11291,7 +11675,7 @@ static int do_check(struct bpf_verifier_env *env)
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
- err = check_kfunc_call(env, insn);
+ err = check_kfunc_call(env, insn, &env->insn_idx);
else
err = check_helper_call(env, insn, &env->insn_idx);
if (err)
@@ -11341,6 +11725,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
process_bpf_exit:
+ mark_verifier_state_scratched(env);
update_branch_counts(env, env->cur_state);
err = pop_stack(env, &prev_insn_idx,
&env->insn_idx, pop_log);
@@ -11489,7 +11874,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
type = t->type;
t = btf_type_skip_modifiers(btf, type, NULL);
if (percpu) {
- aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
+ aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
} else if (!btf_type_is_struct(t)) {
@@ -11506,7 +11891,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
err = -EINVAL;
goto err_put;
}
- aux->btf_var.reg_type = PTR_TO_MEM;
+ aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = tsize;
} else {
aux->btf_var.reg_type = PTR_TO_BTF_ID;
@@ -11631,6 +12016,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
}
+ if (map_value_has_timer(map)) {
+ if (is_tracing_prog_type(prog_type)) {
+ verbose(env, "tracing progs cannot use bpf_timer yet\n");
+ return -EINVAL;
+ }
+ }
+
if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
!bpf_offload_prog_map_match(prog, map)) {
verbose(env, "offload device mismatch between prog and map\n");
@@ -11659,6 +12051,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
}
break;
case BPF_MAP_TYPE_RINGBUF:
+ case BPF_MAP_TYPE_INODE_STORAGE:
+ case BPF_MAP_TYPE_SK_STORAGE:
+ case BPF_MAP_TYPE_TASK_STORAGE:
break;
default:
verbose(env,
@@ -12628,6 +13023,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
func[i]->aux->linfo = prog->aux->linfo;
@@ -12723,6 +13119,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
+ prog->jited_len = func[0]->jited_len;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
bpf_prog_jit_attempt_done(prog);
@@ -12750,6 +13147,7 @@ out_free:
out_undo_insn:
/* cleanup main prog to be interpreted */
prog->jit_requested = 0;
+ prog->blinding_requested = 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (!bpf_pseudo_call(insn))
continue;
@@ -12842,7 +13240,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env,
static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
- bool expect_blinding = bpf_jit_blinding_enabled(prog);
+ enum bpf_attach_type eatype = prog->expected_attach_type;
enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
@@ -13006,7 +13404,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (env->bpf_capable && !expect_blinding &&
+ if (env->bpf_capable && !prog->blinding_requested &&
prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
@@ -13094,6 +13492,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ if (insn->imm == BPF_FUNC_task_storage_get ||
+ insn->imm == BPF_FUNC_sk_storage_get ||
+ insn->imm == BPF_FUNC_inode_storage_get) {
+ if (env->prog->aux->sleepable)
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
@@ -13212,11 +13630,79 @@ patch_map_ops_generic:
continue;
}
+ /* Implement bpf_get_func_arg inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6);
+ insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3);
+ insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1);
+ insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0);
+ insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ insn_buf[7] = BPF_JMP_A(1);
+ insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL);
+ cnt = 9;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement bpf_get_func_ret inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_ret) {
+ if (eatype == BPF_TRACE_FEXIT ||
+ eatype == BPF_MODIFY_RETURN) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
+ insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
+ insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0);
+ insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0);
+ insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0);
+ cnt = 6;
+ } else {
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP);
+ cnt = 1;
+ }
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
+ /* Implement get_func_arg_cnt inline. */
+ if (prog_type == BPF_PROG_TYPE_TRACING &&
+ insn->imm == BPF_FUNC_get_func_arg_cnt) {
+ /* Load nr_args from ctx - 8 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
+ if (!new_prog)
+ return -ENOMEM;
+
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
/* Implement bpf_get_func_ip inline. */
if (prog_type == BPF_PROG_TYPE_TRACING &&
insn->imm == BPF_FUNC_get_func_ip) {
- /* Load IP address from ctx - 8 */
- insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8);
+ /* Load IP address from ctx - 16 */
+ insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16);
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1);
if (!new_prog)
@@ -13330,7 +13816,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
mark_reg_known_zero(env, regs, i);
else if (regs[i].type == SCALAR_VALUE)
mark_reg_unknown(env, regs, i);
- else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
+ else if (base_type(regs[i].type) == PTR_TO_MEM) {
const u32 mem_size = regs[i].mem_size;
mark_reg_known_zero(env, regs, i);
@@ -13918,13 +14404,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr)
log->ubuf = (char __user *) (unsigned long) attr->log_buf;
log->len_total = attr->log_size;
- ret = -EINVAL;
/* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
- !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
+ if (!bpf_verifier_log_attr_valid(log)) {
+ ret = -EINVAL;
goto err_unlock;
+ }
}
+ mark_verifier_state_clean(env);
+
if (IS_ERR(btf_vmlinux)) {
/* Either gcc or pahole or kernel are broken. */
verbose(env, "in-kernel BTF is malformed\n");
diff --git a/kernel/capability.c b/kernel/capability.c
index 46a361dde042..765194f5d678 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -360,6 +360,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability_noaudit);
static bool ns_capable_common(struct user_namespace *ns,
int cap,
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..6e36e854b512 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -65,6 +65,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
return container_of(kfc, struct cgroup_fs_context, kfc);
}
+struct cgroup_pidlist;
+
+struct cgroup_file_ctx {
+ struct cgroup_namespace *ns;
+
+ struct {
+ void *trigger;
+ } psi;
+
+ struct {
+ bool started;
+ struct css_task_iter iter;
+ } procs;
+
+ struct {
+ struct cgroup_pidlist *pidlist;
+ } procs1;
+};
+
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 81c9e0685948..afc6c0e9c966 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -394,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* next pid to display, if any
*/
struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
@@ -403,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
mutex_lock(&cgrp->pidlist_mutex);
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
+ * start() after open. If the matching pidlist is around, we can use
+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
+ * directly. It could already have been destroyed.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
+ if (ctx->procs1.pidlist)
+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
+ if (!ctx->procs1.pidlist) {
+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
if (ret)
return ERR_PTR(ret);
}
- l = of->priv;
+ l = ctx->procs1.pidlist;
if (pid) {
int end = l->length;
@@ -449,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
@@ -460,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -504,10 +506,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
goto out_unlock;
/*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
+ * Even if we're attaching all tasks in the thread group, we only need
+ * to check permissions on one of them. Check permissions using the
+ * credentials from file open to protect against inherited fd attacks.
*/
- cred = current_cred();
+ cred = of->file->f_cred;
tcred = get_task_cred(task);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
@@ -543,9 +546,19 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
+ struct cgroup_file_ctx *ctx;
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ ctx = of->priv;
+ if ((ctx->ns->user_ns != &init_user_ns) ||
+ !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
@@ -951,6 +964,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return invalfc(fc, "release_agent respecified");
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
+ return invalfc(fc, "Setting release_agent not allowed");
ctx->release_agent = param->string;
param->string = NULL;
break;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 919194de39c8..adb820e98f24 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -30,6 +30,7 @@
#include "cgroup-internal.h"
+#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@@ -1301,7 +1302,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct cgroup *root_cgrp = kf_root->kn->priv;
+ struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
return root_cgrp->root;
}
@@ -2024,7 +2025,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
- root_cgrp->kn = root->kf_root->kn;
+ root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
@@ -2650,11 +2651,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
-
if (!list_empty(&src_cset->mg_preload_node))
return;
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
@@ -3630,6 +3631,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
@@ -3641,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
@@ -3648,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return PTR_ERR(new);
}
- psi_trigger_replace(&of->priv, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3679,12 +3686,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
- return psi_trigger_poll(&of->priv, of->file, pt);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
- psi_trigger_replace(&of->priv, NULL);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_destroy(ctx->psi.trigger);
}
bool cgroup_psi_enabled(void)
@@ -3811,24 +3822,43 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx;
+ int ret;
- if (cft->open)
- return cft->open(of);
- return 0;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
@@ -3845,7 +3875,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
@@ -4751,21 +4781,21 @@ void css_task_iter_end(struct css_task_iter *it)
static void cgroup_procs_release(struct kernfs_open_file *of)
{
- if (of->priv) {
- css_task_iter_end(of->priv);
- kfree(of->priv);
- }
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
- return css_task_iter_next(it);
+ return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4773,21 +4803,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (!it) {
+ if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
-
- it = kzalloc(sizeof(*it), GFP_KERNEL);
- if (!it)
- return ERR_PTR(-ENOMEM);
- of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
@@ -4838,9 +4865,9 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb)
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
int ret;
@@ -4869,11 +4896,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
static int cgroup_attach_permissions(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb, bool threadgroup)
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
{
int ret = 0;
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
if (ret)
return ret;
@@ -4890,8 +4918,10 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp,
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
bool threadgroup)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
bool locked;
@@ -4909,9 +4939,16 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- /* process and thread migrations follow same delegation rule */
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb, threadgroup);
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
@@ -5711,7 +5748,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
- css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
+ css = ss->css_alloc(NULL);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
@@ -6129,8 +6166,23 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
if (ret)
goto err;
+ /*
+ * Spawning a task directly into a cgroup works by passing a file
+ * descriptor to the target cgroup directory. This can even be an O_PATH
+ * file descriptor. But it can never be a cgroup.procs file descriptor.
+ * This was done on purpose so spawning into a cgroup could be
+ * conceptualized as an atomic
+ *
+ * fd = openat(dfd_cgroup, "cgroup.procs", ...);
+ * write(fd, <child-pid>, ...);
+ *
+ * sequence, i.e. it's a shorthand for the caller opening and writing
+ * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
+ * to always use the caller's credentials.
+ */
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
- !(kargs->flags & CLONE_THREAD));
+ !(kargs->flags & CLONE_THREAD),
+ current->nsproxy->cgroup_ns);
if (ret)
goto err;
@@ -6191,6 +6243,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
@@ -6253,6 +6306,7 @@ void cgroup_cancel_fork(struct task_struct *child,
/**
* cgroup_post_fork - finalize cgroup setup for the child process
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* Attach the child process to its css_set calling the subsystem fork()
* callbacks.
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index d0e163a02099..9390bfd9f1cd 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/*
* There could be abnormal cpuset configurations for cpu or memory
- * node binding, add this key to provide a quick low-cost judgement
+ * node binding, add this key to provide a quick low-cost judgment
* of the situation.
*/
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
@@ -591,6 +591,35 @@ static inline void free_cpuset(struct cpuset *cs)
}
/*
+ * validate_change_legacy() - Validate conditions specific to legacy (v1)
+ * behavior.
+ */
+static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ par = parent_cs(cur);
+ if (par && !is_cpuset_subset(trial, par))
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
*
@@ -614,28 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
- int ret;
+ int ret = 0;
rcu_read_lock();
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
+ if (!is_in_v2_mode())
+ ret = validate_change_legacy(cur, trial);
+ if (ret)
+ goto out;
/* Remaining checks don't apply to root cpuset */
- ret = 0;
if (cur == &top_cpuset)
goto out;
par = parent_cs(cur);
- /* On legacy hierarchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
- goto out;
-
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
@@ -811,7 +833,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
goto done;
}
@@ -841,7 +863,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_FLAG_DOMAIN))))
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
if (root_load_balance &&
@@ -930,7 +952,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -1159,7 +1181,7 @@ enum subparts_cmd {
* effective_cpus. The function will return 0 if all the CPUs listed in
* cpus_allowed can be granted or an error code will be returned.
*
- * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
* into parent's effective_cpus. 0 should always be returned.
@@ -1183,9 +1205,7 @@ enum subparts_cmd {
*
* Because of the implicit cpu exclusive nature of a partition root,
* cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change(). The validate_change()
- * function will also prevent any changes to the cpu list if it is not
- * a superset of children's cpu lists.
+ * permitted when checked by validate_change().
*/
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
struct cpumask *newmask,
@@ -1530,10 +1550,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
/*
* Check all its siblings and call update_cpumasks_hier()
* if their use_parent_ecpus flag is set in order for them
* to use the right effective_cpus value.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+ * release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1541,8 +1566,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
if (!sibling->use_parent_ecpus)
continue;
+ if (!css_tryget_online(&sibling->css))
+ continue;
+ rcu_read_unlock();
update_cpumasks_hier(sibling, tmp);
+ rcu_read_lock();
+ css_put(&sibling->css);
}
rcu_read_unlock();
}
@@ -1615,8 +1645,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Make sure that subparts_cpus is a subset of cpus_allowed.
*/
if (cs->nr_subparts_cpus) {
- cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
- cs->cpus_allowed);
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
}
spin_unlock_irq(&callback_lock);
@@ -1998,7 +2027,7 @@ out:
}
/*
- * update_prstate - update partititon_root_state
+ * update_prstate - update partition_root_state
* cs: the cpuset to update
* new_prs: new partition root state
*
@@ -2260,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2313,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
@@ -2848,7 +2879,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
+ * historical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
@@ -3045,7 +3076,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
+ * as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
@@ -3493,8 +3524,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
return cs;
}
-/**
- * cpuset_node_allowed - Can we allocate on a memory node?
+/*
+ * __cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -3536,7 +3567,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
- int allowed; /* is allocation in zone z allowed? */
+ bool allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
@@ -3665,8 +3696,8 @@ void cpuset_print_current_mems_allowed(void)
int cpuset_memory_pressure_enabled __read_mostly;
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
*
* Keep a running average of the rate of synchronous (direct)
* page reclaim efforts initiated by tasks in each cpuset.
@@ -3681,7 +3712,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
* "memory_pressure". Value displayed is an integer
* representing the recent rate of entry into the synchronous
* (direct) page reclaim by any task attached to the cpuset.
- **/
+ */
void __cpuset_memory_pressure_bump(void)
{
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 3984dd6b8ddb..617861a54793 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 1486768f2318..24b5c2ab5598 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
*/
- if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+ if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
raw_spin_lock_irqsave(cpu_lock, flags);
@@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
+ struct cgroup *parent;
if (pos == root)
return NULL;
@@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* We're gonna walk down to the first leaf and visit/remove it. We
* can pick whatever unvisited node as the starting point.
*/
- if (!pos)
+ if (!pos) {
pos = root;
- else
+ /* return NULL if this subtree is not on-list */
+ if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
+ return NULL;
+ } else {
pos = cgroup_parent(pos);
+ }
/* walk down to the first leaf */
while (true) {
@@ -115,33 +120,25 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- if (rstatc->updated_next) {
- struct cgroup *parent = cgroup_parent(pos);
-
- if (parent) {
- struct cgroup_rstat_cpu *prstatc;
- struct cgroup **nextp;
-
- prstatc = cgroup_rstat_cpu(parent, cpu);
- nextp = &prstatc->updated_children;
- while (true) {
- struct cgroup_rstat_cpu *nrstatc;
-
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
- if (*nextp == pos)
- break;
- WARN_ON_ONCE(*nextp == parent);
- nextp = &nrstatc->updated_next;
- }
- *nextp = rstatc->updated_next;
- }
+ parent = cgroup_parent(pos);
+ if (parent) {
+ struct cgroup_rstat_cpu *prstatc;
+ struct cgroup **nextp;
- rstatc->updated_next = NULL;
- return pos;
+ prstatc = cgroup_rstat_cpu(parent, cpu);
+ nextp = &prstatc->updated_children;
+ while (*nextp != pos) {
+ struct cgroup_rstat_cpu *nrstatc;
+
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &nrstatc->updated_next;
+ }
+ *nextp = rstatc->updated_next;
}
- /* only happens for @root */
- return NULL;
+ rstatc->updated_next = NULL;
+ return pos;
}
/* see cgroup_rstat_flush() */
@@ -156,8 +153,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ /*
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
+ */
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
@@ -169,7 +175,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||
@@ -318,7 +324,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_base_stat cur, delta;
+ struct cgroup_base_stat delta;
unsigned seq;
/* Root-level stats are sourced from system-wide CPU stats */
@@ -328,11 +334,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cur.cputime = rstatc->bstat.cputime;
+ delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* propagate percpu delta to global */
- delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index eb0029c9a6a6..e400fbbc8aba 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -1,5 +1,5 @@
# KEEP ALPHABETICALLY SORTED
-# CONFIG_AIO is not set
+# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_LEGACY_PTYS is not set
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
new file mode 100644
index 000000000000..e8db8d938661
--- /dev/null
+++ b/kernel/configs/debug.config
@@ -0,0 +1,106 @@
+# The config is based on running daily CI for enterprise Linux distros to
+# seek regressions on linux-next builds on different bare-metal and virtual
+# platforms. It can be used for example,
+#
+# $ make ARCH=arm64 defconfig debug.config
+#
+# Keep alphabetically sorted inside each section.
+#
+# printk and dmesg options
+#
+CONFIG_DEBUG_BUGVERBOSE=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_PRINTK_CALLER=y
+CONFIG_PRINTK_TIME=y
+CONFIG_SYMBOLIC_ERRNAME=y
+#
+# Compile-time checks and compiler options
+#
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
+CONFIG_DEBUG_SECTION_MISMATCH=y
+CONFIG_FRAME_WARN=2048
+CONFIG_SECTION_MISMATCH_WARN_ONLY=y
+#
+# Generic Kernel Debugging Instruments
+#
+# CONFIG_UBSAN_ALIGNMENT is not set
+# CONFIG_UBSAN_DIV_ZERO is not set
+# CONFIG_UBSAN_TRAP is not set
+# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
+CONFIG_DEBUG_FS=y
+CONFIG_DEBUG_FS_ALLOW_ALL=y
+CONFIG_DEBUG_IRQFLAGS=y
+CONFIG_UBSAN=y
+CONFIG_UBSAN_BOOL=y
+CONFIG_UBSAN_BOUNDS=y
+CONFIG_UBSAN_ENUM=y
+CONFIG_UBSAN_SHIFT=y
+CONFIG_UBSAN_UNREACHABLE=y
+#
+# Memory Debugging
+#
+# CONFIG_DEBUG_PAGEALLOC is not set
+# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set
+# CONFIG_DEBUG_RODATA_TEST is not set
+# CONFIG_DEBUG_WX is not set
+# CONFIG_KFENCE is not set
+# CONFIG_PAGE_POISONING is not set
+# CONFIG_SLUB_STATS is not set
+CONFIG_PAGE_EXTENSION=y
+CONFIG_PAGE_OWNER=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_DEBUG_OBJECTS_FREE=y
+CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
+CONFIG_DEBUG_OBJECTS_TIMERS=y
+CONFIG_DEBUG_OBJECTS_WORK=y
+CONFIG_DEBUG_PER_CPU_MAPS=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_VIRTUAL=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_VM_PGFLAGS=y
+CONFIG_DEBUG_VM_RB=y
+CONFIG_DEBUG_VM_VMACACHE=y
+CONFIG_GENERIC_PTDUMP=y
+CONFIG_KASAN=y
+CONFIG_KASAN_GENERIC=y
+CONFIG_KASAN_INLINE=y
+CONFIG_KASAN_VMALLOC=y
+CONFIG_PTDUMP_DEBUGFS=y
+CONFIG_SCHED_STACK_END_CHECK=y
+CONFIG_SLUB_DEBUG_ON=y
+#
+# Debug Oops, Lockups and Hangs
+#
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_PANIC_ON_OOPS=y
+CONFIG_PANIC_TIMEOUT=0
+CONFIG_SOFTLOCKUP_DETECTOR=y
+#
+# Lock Debugging (spinlocks, mutexes, etc...)
+#
+# CONFIG_PROVE_RAW_LOCK_NESTING is not set
+CONFIG_PROVE_LOCKING=y
+#
+# Debug kernel data structures
+#
+CONFIG_BUG_ON_DATA_CORRUPTION=y
+#
+# RCU Debugging
+#
+CONFIG_PROVE_RCU=y
+CONFIG_PROVE_RCU_LIST=y
+#
+# Tracers
+#
+CONFIG_BRANCH_PROFILE_NONE=y
+CONFIG_DYNAMIC_FTRACE=y
+CONFIG_FTRACE=y
+CONFIG_FUNCTION_TRACER=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 192e43a87407..d0a9aa0b42e8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -31,8 +31,10 @@
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
+#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
+#include <linux/random.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -69,7 +71,6 @@ struct cpuhp_cpu_state {
bool rollback;
bool single;
bool bringup;
- int cpu;
struct hlist_node *node;
struct hlist_node *last;
enum cpuhp_state cb_state;
@@ -473,7 +474,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
#endif
static inline enum cpuhp_state
-cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
bool bringup = st->state < target;
@@ -484,14 +485,15 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
st->target = target;
st->single = false;
st->bringup = bringup;
- if (cpu_dying(st->cpu) != !bringup)
- set_cpu_dying(st->cpu, !bringup);
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
return prev_state;
}
static inline void
-cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state prev_state)
{
bool bringup = !st->bringup;
@@ -518,8 +520,8 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
}
st->bringup = bringup;
- if (cpu_dying(st->cpu) != !bringup)
- set_cpu_dying(st->cpu, !bringup);
+ if (cpu_dying(cpu) != !bringup)
+ set_cpu_dying(cpu, !bringup);
}
/* Regular hotplug invocation of the AP hotplug thread */
@@ -539,15 +541,16 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
wait_for_ap_thread(st, st->bringup);
}
-static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
{
enum cpuhp_state prev_state;
int ret;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
__cpuhp_kick_ap(st);
if ((ret = st->result)) {
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
}
@@ -579,7 +582,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
if (st->target <= CPUHP_AP_ONLINE_IDLE)
return 0;
- return cpuhp_kick_ap(st, st->target);
+ return cpuhp_kick_ap(cpu, st, st->target);
}
static int bringup_cpu(unsigned int cpu)
@@ -588,6 +591,12 @@ static int bringup_cpu(unsigned int cpu)
int ret;
/*
+ * Reset stale stack state from the last time this CPU was online.
+ */
+ scs_task_reset(idle);
+ kasan_unpoison_task_stack(idle);
+
+ /*
* Some architectures have to walk the irq descriptors to
* setup the vector space for the cpu which comes online.
* Prevent irq alloc/free across the bringup.
@@ -696,7 +705,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret, cpu, cpuhp_get_step(st->state)->name,
st->state);
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
if (can_rollback_cpu(st))
WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
prev_state));
@@ -713,7 +722,6 @@ static void cpuhp_create(unsigned int cpu)
init_completion(&st->done_up);
init_completion(&st->done_down);
- st->cpu = cpu;
}
static int cpuhp_should_run(unsigned int cpu)
@@ -867,7 +875,7 @@ static int cpuhp_kick_ap_work(unsigned int cpu)
cpuhp_lock_release(true);
trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
- ret = cpuhp_kick_ap(st, st->target);
+ ret = cpuhp_kick_ap(cpu, st, st->target);
trace_cpuhp_exit(cpu, st->state, prev_state, ret);
return ret;
@@ -1099,7 +1107,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret, cpu, cpuhp_get_step(st->state)->name,
st->state);
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
if (st->state < prev_state)
WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
@@ -1126,7 +1134,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = cpuhp_set_state(st, target);
+ prev_state = cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
@@ -1157,7 +1165,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state < prev_state) {
if (st->state == CPUHP_TEARDOWN_CPU) {
- cpuhp_reset_state(st, prev_state);
+ cpuhp_reset_state(cpu, st, prev_state);
__cpuhp_kick_ap(st);
} else {
WARN(1, "DEAD callback error for CPU%d", cpu);
@@ -1344,7 +1352,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
cpuhp_tasks_frozen = tasks_frozen;
- cpuhp_set_state(st, target);
+ cpuhp_set_state(cpu, st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
@@ -1481,8 +1489,8 @@ int freeze_secondary_cpus(int primary)
cpu_maps_update_begin();
if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
- if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
- primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
+ primary = housekeeping_any_cpu(HK_TYPE_TIMER);
} else {
if (!cpu_online(primary))
primary = cpumask_first(cpu_online_mask);
@@ -1652,6 +1660,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
+ [CPUHP_RANDOM_PREPARE] = {
+ .name = "random:prepare",
+ .startup.single = random_prepare_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_WORKQUEUE_PREP] = {
.name = "workqueue:prepare",
.startup.single = workqueue_prepare_cpu,
@@ -1775,6 +1788,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = workqueue_online_cpu,
.teardown.single = workqueue_offline_cpu,
},
+ [CPUHP_AP_RANDOM_ONLINE] = {
+ .name = "random:online",
+ .startup.single = random_online_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_AP_RCUTREE_ONLINE] = {
.name = "RCU/tree:online",
.startup.single = rcutree_online_cpu,
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index eb53f5ec62c9..256cf6db573c 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -6,6 +6,7 @@
#include <linux/buildid.h>
#include <linux/crash_core.h>
+#include <linux/init.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
@@ -295,6 +296,16 @@ int __init parse_crashkernel_low(char *cmdline,
"crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
+/*
+ * Add a dummy early_param handler to mark crashkernel= as a known command line
+ * parameter and suppress incorrect warnings in init/main.c.
+ */
+static int __init parse_crashkernel_dummy(char *arg)
+{
+ return 0;
+}
+early_param("crashkernel", parse_crashkernel_dummy);
+
Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type,
void *data, size_t data_len)
{
diff --git a/kernel/cred.c b/kernel/cred.c
index 473d17c431f3..e10c15f51c1f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -665,21 +665,16 @@ EXPORT_SYMBOL(cred_fscmp);
int set_cred_ucounts(struct cred *new)
{
- struct task_struct *task = current;
- const struct cred *old = task->real_cred;
struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
- if (new->user == old->user && new->user_ns == old->user_ns)
- return 0;
-
/*
* This optimization is needed because alloc_ucounts() uses locks
* for table lookups.
*/
- if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid))
return 0;
- if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid)))
+ if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid)))
return -EAGAIN;
new->ucounts = new_ucounts;
@@ -875,7 +870,7 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
/*
* report use of invalid credentials
*/
-void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
printk(KERN_ERR "CRED: Invalid credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index df2bface866e..85cb51c4a17e 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -291,7 +291,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
- int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
+ int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_func_printf("Bad address 0x%lx\n", addr);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 51530d5b15a8..c5e8cea9e05f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -100,19 +100,10 @@ void __delayacct_blkio_start(void)
*/
void __delayacct_blkio_end(struct task_struct *p)
{
- struct task_delay_info *delays = p->delays;
- u64 *total;
- u32 *count;
-
- if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
- total = &delays->swapin_delay;
- count = &delays->swapin_count;
- } else {
- total = &delays->blkio_delay;
- count = &delays->blkio_count;
- }
-
- delayacct_end(&delays->lock, &delays->blkio_start, total, count);
+ delayacct_end(&p->delays->lock,
+ &p->delays->blkio_start,
+ &p->delays->blkio_delay,
+ &p->delays->blkio_count);
}
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -164,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
+ tmp = d->compact_delay_total + tsk->delays->compact_delay;
+ d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
d->thrashing_count += tsk->delays->thrashing_count;
+ d->compact_count += tsk->delays->compact_count;
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@@ -179,8 +173,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
unsigned long flags;
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
- ret = nsec_to_clock_t(tsk->delays->blkio_delay +
- tsk->delays->swapin_delay);
+ ret = nsec_to_clock_t(tsk->delays->blkio_delay);
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
@@ -210,3 +203,29 @@ void __delayacct_thrashing_end(void)
&current->delays->thrashing_delay,
&current->delays->thrashing_count);
}
+
+void __delayacct_swapin_start(void)
+{
+ current->delays->swapin_start = local_clock();
+}
+
+void __delayacct_swapin_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->swapin_start,
+ &current->delays->swapin_delay,
+ &current->delays->swapin_count);
+}
+
+void __delayacct_compact_start(void)
+{
+ current->delays->compact_start = local_clock();
+}
+
+void __delayacct_compact_end(void)
+{
+ delayacct_end(&current->delays->lock,
+ &current->delays->compact_start,
+ &current->delays->compact_delay,
+ &current->delays->compact_count);
+}
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1b02179758cb..56866aaa2ae1 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -110,15 +110,10 @@ config DMA_GLOBAL_POOL
select DMA_DECLARE_COHERENT
bool
-config DMA_REMAP
- bool
- depends on MMU
- select DMA_NONCOHERENT_MMAP
-
config DMA_DIRECT_REMAP
bool
- select DMA_REMAP
select DMA_COHERENT_POOL
+ select DMA_NONCOHERENT_MMAP
config DMA_CMA
bool "DMA Contiguous Memory Allocator"
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 0dd65ec1d234..21926e46ef4f 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -8,5 +8,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
-obj-$(CONFIG_DMA_REMAP) += remap.o
+obj-$(CONFIG_MMU) += remap.o
obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 3d63d91cba5c..6ea80ae42622 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = {
static int __init rmem_cma_setup(struct reserved_mem *rmem)
{
- phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
- phys_addr_t mask = align - 1;
unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
@@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
- if ((rmem->base & mask) || (rmem->size & mask)) {
+ if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
pr_err("Reserved memory: incorrect alignment of CMA region\n");
return -EINVAL;
}
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 7a14ca29c377..f8ff598596b8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -927,7 +927,7 @@ static __init int dma_debug_cmdline(char *str)
global_disable = true;
}
- return 0;
+ return 1;
}
static __init int dma_debug_entries_cmdline(char *str)
@@ -936,7 +936,7 @@ static __init int dma_debug_entries_cmdline(char *str)
return -EINVAL;
if (!get_option(&str, &nr_prealloc_entries))
nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
- return 0;
+ return 1;
}
__setup("dma_debug=", dma_debug_cmdline);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 4c6c5e0635e3..9743c6ccce1a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,15 +75,45 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
+static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
+{
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size));
+}
+
+static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
+{
+ int ret;
+
+ if (!force_dma_unencrypted(dev))
+ return 0;
+ ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size));
+ if (ret)
+ pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
+ return ret;
+}
+
static void __dma_direct_free_pages(struct device *dev, struct page *page,
size_t size)
{
- if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
- swiotlb_free(dev, page, size))
+ if (swiotlb_free(dev, page, size))
return;
dma_free_contiguous(dev, page, size);
}
+static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
+{
+ struct page *page = swiotlb_alloc(dev, size);
+
+ if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+ swiotlb_free(dev, page, size);
+ return NULL;
+ }
+
+ return page;
+}
+
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp)
{
@@ -93,18 +123,11 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
WARN_ON_ONCE(!PAGE_ALIGNED(size));
+ if (is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_swiotlb(dev, size);
+
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_limit);
- if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
- is_swiotlb_for_alloc(dev)) {
- page = swiotlb_alloc(dev, size);
- if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
- __dma_direct_free_pages(dev, page, size);
- return NULL;
- }
- return page;
- }
-
page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
@@ -133,6 +156,15 @@ again:
return page;
}
+/*
+ * Check if a potentially blocking operations needs to dip into the atomic
+ * pools for the given device/gfp.
+ */
+static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
+{
+ return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev);
+}
+
static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp)
{
@@ -140,6 +172,9 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
u64 phys_mask;
void *ret;
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
+ return NULL;
+
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_mask);
page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
@@ -149,54 +184,81 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
return ret;
}
+static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
+ dma_addr_t *dma_handle, gfp_t gfp)
+{
+ struct page *page;
+
+ page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
+ if (!page)
+ return NULL;
+
+ /* remove any dirty cache lines on the kernel alias */
+ if (!PageHighMem(page))
+ arch_dma_prep_coherent(page, size);
+
+ /* return the page pointer as the opaque cookie */
+ *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
+ return page;
+}
+
void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
+ bool remap = false, set_uncached = false;
struct page *page;
void *ret;
- int err;
size = PAGE_ALIGN(size);
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
- !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
- page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
- if (!page)
- return NULL;
- /* remove any dirty cache lines on the kernel alias */
- if (!PageHighMem(page))
- arch_dma_prep_coherent(page, size);
- *dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
- /* return the page pointer as the opaque cookie */
- return page;
- }
+ !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
+ return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
- if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
- !dev_is_dma_coherent(dev) &&
- !is_swiotlb_for_alloc(dev))
- return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
+ if (!dev_is_dma_coherent(dev)) {
+ /*
+ * Fallback to the arch handler if it exists. This should
+ * eventually go away.
+ */
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
+ !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
+ !is_swiotlb_for_alloc(dev))
+ return arch_dma_alloc(dev, size, dma_handle, gfp,
+ attrs);
- if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
- !dev_is_dma_coherent(dev))
- return dma_alloc_from_global_coherent(dev, size, dma_handle);
+ /*
+ * If there is a global pool, always allocate from it for
+ * non-coherent devices.
+ */
+ if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL))
+ return dma_alloc_from_global_coherent(dev, size,
+ dma_handle);
+
+ /*
+ * Otherwise remap if the architecture is asking for it. But
+ * given that remapping memory is a blocking operation we'll
+ * instead have to dip into the atomic pools.
+ */
+ remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
+ if (remap) {
+ if (dma_direct_use_pool(dev, gfp))
+ return dma_direct_alloc_from_pool(dev, size,
+ dma_handle, gfp);
+ } else {
+ if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
+ return NULL;
+ set_uncached = true;
+ }
+ }
/*
- * Remapping or decrypting memory may block. If either is required and
- * we can't block, allocate the memory from the atomic pools.
- * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must
- * set up another device coherent pool by shared-dma-pool and use
- * dma_alloc_from_dev_coherent instead.
+ * Decrypting memory may block, so allocate the memory from the atomic
+ * pools if we can't block.
*/
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- !gfpflags_allow_blocking(gfp) &&
- (force_dma_unencrypted(dev) ||
- (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev))) &&
- !is_swiotlb_for_alloc(dev))
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
@@ -204,68 +266,51 @@ void *dma_direct_alloc(struct device *dev, size_t size,
if (!page)
return NULL;
- if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
- !dev_is_dma_coherent(dev)) ||
- (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
+ /*
+ * dma_alloc_contiguous can return highmem pages depending on a
+ * combination the cma= arguments and per-arch setup. These need to be
+ * remapped to return a kernel virtual address.
+ */
+ if (PageHighMem(page)) {
+ remap = true;
+ set_uncached = false;
+ }
+
+ if (remap) {
+ pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
+
+ if (force_dma_unencrypted(dev))
+ prot = pgprot_decrypted(prot);
+
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
/* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size,
- dma_pgprot(dev, PAGE_KERNEL, attrs),
+ ret = dma_common_contiguous_remap(page, size, prot,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
- goto out_free_pages;
- }
- memset(ret, 0, size);
- goto done;
- }
-
- if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here,
- * so log an error and fail.
- */
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
-
- ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
- err = set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size));
- if (err)
+ } else {
+ ret = page_address(page);
+ if (dma_set_decrypted(dev, ret, size))
goto out_free_pages;
}
memset(ret, 0, size);
- if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
- !dev_is_dma_coherent(dev)) {
+ if (set_uncached) {
arch_dma_prep_coherent(page, size);
ret = arch_dma_set_uncached(ret, size);
if (IS_ERR(ret))
goto out_encrypt_pages;
}
-done:
+
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return ret;
out_encrypt_pages:
- if (force_dma_unencrypted(dev)) {
- err = set_memory_encrypted((unsigned long)page_address(page),
- 1 << get_order(size));
- /* If memory cannot be re-encrypted, it must be leaked */
- if (err)
- return NULL;
- }
+ if (dma_set_encrypted(dev, page_address(page), size))
+ return NULL;
out_free_pages:
__dma_direct_free_pages(dev, page, size);
return NULL;
@@ -304,13 +349,14 @@ void dma_direct_free(struct device *dev, size_t size,
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
- if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
-
- if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
+ if (is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
- else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
- arch_dma_clear_uncached(cpu_addr, size);
+ } else {
+ if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
+ arch_dma_clear_uncached(cpu_addr, size);
+ if (dma_set_encrypted(dev, cpu_addr, 1 << page_order))
+ return;
+ }
__dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
}
@@ -321,9 +367,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
struct page *page;
void *ret;
- if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
- force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
- !is_swiotlb_for_alloc(dev))
+ if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
page = __dma_direct_alloc_pages(dev, size, gfp);
@@ -341,11 +385,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
}
ret = page_address(page);
- if (force_dma_unencrypted(dev)) {
- if (set_memory_decrypted((unsigned long)ret,
- 1 << get_order(size)))
- goto out_free_pages;
- }
+ if (dma_set_decrypted(dev, ret, size))
+ goto out_free_pages;
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
@@ -366,9 +407,8 @@ void dma_direct_free_pages(struct device *dev, size_t size,
dma_free_from_pool(dev, vaddr, size))
return;
- if (force_dma_unencrypted(dev))
- set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
-
+ if (dma_set_encrypted(dev, vaddr, 1 << page_order))
+ return;
__dma_direct_free_pages(dev, page, size);
}
@@ -499,6 +539,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+ if (force_dma_unencrypted(dev))
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 4632b0f4f72e..8a6cd53dbe8c 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -114,6 +114,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
if (unlikely(is_swiotlb_buffer(dev, phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
#endif /* _KERNEL_DMA_DIRECT_H */
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 9b9af1bd6be3..0520a8f4fb1d 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -11,6 +11,7 @@
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
+#include <linux/map_benchmark.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
@@ -18,30 +19,6 @@
#include <linux/slab.h>
#include <linux/timekeeping.h>
-#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
-#define DMA_MAP_MAX_THREADS 1024
-#define DMA_MAP_MAX_SECONDS 300
-#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
-
-#define DMA_MAP_BIDIRECTIONAL 0
-#define DMA_MAP_TO_DEVICE 1
-#define DMA_MAP_FROM_DEVICE 2
-
-struct map_benchmark {
- __u64 avg_map_100ns; /* average map latency in 100ns */
- __u64 map_stddev; /* standard deviation of map latency */
- __u64 avg_unmap_100ns; /* as above */
- __u64 unmap_stddev;
- __u32 threads; /* how many threads will do map/unmap in parallel */
- __u32 seconds; /* how long the test will last */
- __s32 node; /* which numa node this benchmark will run on */
- __u32 dma_bits; /* DMA addressing capability */
- __u32 dma_dir; /* DMA data direction */
- __u32 dma_trans_ns; /* time for DMA transmission in ns */
- __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */
- __u8 expansion[76]; /* For future use */
-};
-
struct map_benchmark_data {
struct map_benchmark bparam;
struct device *dev;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9478eccd1c8e..db7244291b74 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -407,8 +407,6 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs);
*/
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
- if (force_dma_unencrypted(dev))
- prot = pgprot_decrypted(prot);
if (dev_is_dma_coherent(dev))
return prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
@@ -745,7 +743,6 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
/*
@@ -761,7 +758,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
return 0;
}
EXPORT_SYMBOL(dma_set_coherent_mask);
-#endif
size_t dma_max_mapping_size(struct device *dev)
{
diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c
index 5f84e6cdb78e..4d40dcce7604 100644
--- a/kernel/dma/pool.c
+++ b/kernel/dma/pool.c
@@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void)
GFP_KERNEL);
if (!atomic_pool_kernel)
ret = -ENOMEM;
- if (IS_ENABLED(CONFIG_ZONE_DMA)) {
+ if (has_managed_dma()) {
atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
GFP_KERNEL | GFP_DMA);
if (!atomic_pool_dma)
@@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
if (prev == NULL) {
if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
return atomic_pool_dma32;
- if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
+ if (atomic_pool_dma && (gfp & GFP_DMA))
return atomic_pool_dma;
return atomic_pool_kernel;
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 8e840fbbed7c..73a41cec9e38 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,39 +21,33 @@
#define pr_fmt(fmt) "software IO TLB: " fmt
#include <linux/cache.h>
+#include <linux/cc_platform.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
-#include <linux/mm.h>
#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/iommu-helper.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/scatterlist.h>
+#include <linux/set_memory.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/swiotlb.h>
-#include <linux/pfn.h>
#include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/scatterlist.h>
-#include <linux/cc_platform.h>
-#include <linux/set_memory.h>
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#endif
#ifdef CONFIG_DMA_RESTRICTED_POOL
-#include <linux/io.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/of_reserved_mem.h>
#include <linux/slab.h>
#endif
-#include <asm/io.h>
-#include <asm/dma.h>
-
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/iommu-helper.h>
-
#define CREATE_TRACE_POINTS
#include <trace/events/swiotlb.h>
@@ -72,6 +66,8 @@ enum swiotlb_force swiotlb_force;
struct io_tlb_mem io_tlb_default_mem;
+phys_addr_t swiotlb_unencrypted_base;
+
/*
* Max segment that we can provide which (if pages are contingous) will
* not be bounced (unless SWIOTLB_FORCE is set).
@@ -156,6 +152,34 @@ static inline unsigned long nr_slots(u64 val)
}
/*
+ * Remap swioltb memory in the unencrypted physical address space
+ * when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP
+ * Isolation VMs).
+ */
+#ifdef CONFIG_HAS_IOMEM
+static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
+{
+ void *vaddr = NULL;
+
+ if (swiotlb_unencrypted_base) {
+ phys_addr_t paddr = mem->start + swiotlb_unencrypted_base;
+
+ vaddr = memremap(paddr, bytes, MEMREMAP_WB);
+ if (!vaddr)
+ pr_err("Failed to map the unencrypted memory %pa size %lx.\n",
+ &paddr, bytes);
+ }
+
+ return vaddr;
+}
+#else
+static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
+{
+ return NULL;
+}
+#endif
+
+/*
* Early SWIOTLB allocation may be too early to allow an architecture to
* perform the desired operations. This function allows the architecture to
* call SWIOTLB when the operations are possible. It needs to be called
@@ -172,7 +196,10 @@ void __init swiotlb_update_mem_attributes(void)
vaddr = phys_to_virt(mem->start);
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
- memset(vaddr, 0, bytes);
+
+ mem->vaddr = swiotlb_mem_remap(mem, bytes);
+ if (!mem->vaddr)
+ mem->vaddr = vaddr;
}
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
@@ -196,7 +223,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
}
+
+ /*
+ * If swiotlb_unencrypted_base is set, the bounce buffer memory will
+ * be remapped and cleared in swiotlb_update_mem_attributes.
+ */
+ if (swiotlb_unencrypted_base)
+ return;
+
memset(vaddr, 0, bytes);
+ mem->vaddr = vaddr;
+ return;
}
int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
@@ -371,7 +408,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
- unsigned char *vaddr = phys_to_virt(tlb_addr);
+ unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
unsigned int tlb_offset, orig_addr_offset;
if (orig_addr == INVALID_PHYS_ADDR)
@@ -581,9 +618,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
for (i = 0; i < nr_slots(alloc_size + offset); i++)
mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
tlb_addr = slot_addr(mem->start, index) + offset;
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ /*
+ * When dir == DMA_FROM_DEVICE we could omit the copy from the orig
+ * to the tlb buffer, if we knew for sure the device will
+ * overwirte the entire current content. But we don't. Thus
+ * unconditional bounce may prevent leaking swiotlb content (i.e.
+ * kernel memory) to user-space.
+ */
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}
@@ -712,47 +754,29 @@ bool is_swiotlb_active(struct device *dev)
}
EXPORT_SYMBOL_GPL(is_swiotlb_active);
-#ifdef CONFIG_DEBUG_FS
-static struct dentry *debugfs_dir;
-
-static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
{
+ mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+ if (!mem->nslabs)
+ return;
+
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
}
-static int __init swiotlb_create_default_debugfs(void)
+static int __init __maybe_unused swiotlb_create_default_debugfs(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
-
- debugfs_dir = debugfs_create_dir("swiotlb", NULL);
- if (mem->nslabs) {
- mem->debugfs = debugfs_dir;
- swiotlb_create_debugfs_files(mem);
- }
+ swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
return 0;
}
+#ifdef CONFIG_DEBUG_FS
late_initcall(swiotlb_create_default_debugfs);
-
#endif
#ifdef CONFIG_DMA_RESTRICTED_POOL
-#ifdef CONFIG_DEBUG_FS
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
- struct io_tlb_mem *mem = rmem->priv;
-
- mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir);
- swiotlb_create_debugfs_files(mem);
-}
-#else
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
-}
-#endif
-
struct page *swiotlb_alloc(struct device *dev, size_t size)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
@@ -799,8 +823,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
if (!mem)
return -ENOMEM;
- mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs),
- GFP_KERNEL);
+ mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
if (!mem->slots) {
kfree(mem);
return -ENOMEM;
@@ -814,7 +837,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
rmem->priv = mem;
- rmem_swiotlb_debugfs_init(rmem);
+ swiotlb_create_debugfs_files(mem, rmem->name);
}
dev->dma_io_tlb_mem = mem;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index d5a61d565ad5..93c3b86e781c 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,7 +2,9 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
+#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
+#include <linux/jump_label.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
@@ -58,7 +60,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Handle ptrace */
if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = arch_syscall_enter_tracehook(regs);
+ ret = ptrace_report_syscall_entry(regs);
if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
return -1L;
}
@@ -138,15 +140,7 @@ void noinstr exit_to_user_mode(void)
}
/* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
-
-static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
-{
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
-
- arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
-}
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
@@ -169,10 +163,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
klp_update_patch_state(current);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
- handle_signal_work(regs, ti_work);
+ arch_do_signal_or_restart(regs);
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(regs);
+ resume_user_mode_work(regs);
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
@@ -187,7 +181,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
}
/* Return the latest work state for arch_exit_to_user_mode() */
@@ -196,7 +190,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
- unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+ unsigned long ti_work = read_thread_flags();
lockdep_assert_irqs_disabled();
@@ -252,7 +246,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
step = report_single_step(work);
if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- arch_syscall_exit_tracehook(regs, step);
+ ptrace_report_syscall_exit(regs, step);
}
/*
@@ -380,7 +374,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
return ret;
}
-void irqentry_exit_cond_resched(void)
+void raw_irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
/* Sanity check RCU and thread stack */
@@ -392,7 +386,17 @@ void irqentry_exit_cond_resched(void)
}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+ return;
+ raw_irqentry_exit_cond_resched();
+}
+#endif
#endif
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
@@ -420,13 +424,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
}
instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION)) {
-#ifdef CONFIG_PREEMPT_DYNAMIC
- static_call(irqentry_exit_cond_resched)();
-#else
+ if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
-#endif
- }
+
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 49972ee99aff..9d09f489b60e 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,8 +8,11 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
do {
int ret;
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+ clear_notify_signal();
+ if (task_work_pending(current))
+ task_work_run();
+ }
if (ti_work & _TIF_SIGPENDING) {
kvm_handle_signal_exit(vcpu);
@@ -20,13 +23,13 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
schedule();
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(NULL);
+ resume_user_mode_work(NULL);
ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
if (ret)
return ret;
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
return 0;
}
@@ -43,7 +46,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
* disabled in the inner loop before going into guest mode. No need
* to disable interrupts here.
*/
- ti_work = READ_ONCE(current_thread_info()->flags);
+ ti_work = read_thread_flags();
if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
return 0;
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 4508201847d2..0b6379adff6b 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -48,7 +48,7 @@ bool syscall_user_dispatch(struct pt_regs *regs)
* the selector is loaded by userspace.
*/
if (unlikely(__get_user(state, sd->selector))) {
- force_fatal_sig(SIGSEGV);
+ force_exit_sig(SIGSEGV);
return true;
}
@@ -56,7 +56,7 @@ bool syscall_user_dispatch(struct pt_regs *regs)
return false;
if (state != SYSCALL_DISPATCH_FILTER_BLOCK) {
- force_fatal_sig(SIGSYS);
+ force_exit_sig(SIGSYS);
return true;
}
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 58cbe357fb2b..1273be84392c 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
}
if (regs) {
- mm_segment_t fs;
-
if (crosstask)
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
- force_uaccess_end(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 523106a506ee..23bb19716ad3 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -574,8 +574,7 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
enum event_type_t event_type);
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
+ enum event_type_t event_type);
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
@@ -674,6 +673,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
WRITE_ONCE(event->state, state);
}
+/*
+ * UP store-release, load-acquire
+ */
+
+#define __store_release(ptr, val) \
+do { \
+ barrier(); \
+ WRITE_ONCE(*(ptr), (val)); \
+} while (0)
+
+#define __load_acquire(ptr) \
+({ \
+ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
+ barrier(); \
+ ___p; \
+})
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -719,35 +735,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
return t->time;
}
-static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
- struct perf_cgroup_info *info;
- u64 now;
-
- now = perf_clock();
+ struct perf_cgroup_info *t;
- info = this_cpu_ptr(cgrp->info);
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ if (!__load_acquire(&t->active))
+ return t->time;
+ now += READ_ONCE(t->timeoffset);
+ return now;
+}
- info->time += now - info->timestamp;
+static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+ if (adv)
+ info->time += now - info->timestamp;
info->timestamp = now;
+ /*
+ * see update_context_time()
+ */
+ WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
struct perf_cgroup *cgrp = cpuctx->cgrp;
struct cgroup_subsys_state *css;
+ struct perf_cgroup_info *info;
if (cgrp) {
+ u64 now = perf_clock();
+
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
- __update_cgrp_time(cgrp);
+ info = this_cpu_ptr(cgrp->info);
+
+ __update_cgrp_time(info, now, true);
+ if (final)
+ __store_release(&info->active, 0);
}
}
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
- struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
/*
* ensure we access cgroup data only when needed and
@@ -756,19 +788,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
if (!is_cgroup_event(event))
return;
- cgrp = perf_cgroup_from_task(current, event->ctx);
+ info = this_cpu_ptr(event->cgrp->info);
/*
* Do not update time when cgroup is not active
*/
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- __update_cgrp_time(event->cgrp);
+ if (info->active)
+ __update_cgrp_time(info, perf_clock(), true);
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
- struct perf_cgroup *cgrp;
+ struct perf_event_context *ctx = &cpuctx->ctx;
+ struct perf_cgroup *cgrp = cpuctx->cgrp;
struct perf_cgroup_info *info;
struct cgroup_subsys_state *css;
@@ -777,32 +809,28 @@ perf_cgroup_set_timestamp(struct task_struct *task,
* ensure we do not access cgroup data
* unless we have the cgroup pinned (css_get)
*/
- if (!task || !ctx->nr_cgroups)
+ if (!cgrp)
return;
- cgrp = perf_cgroup_from_task(task, ctx);
+ WARN_ON_ONCE(!ctx->nr_cgroups);
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- info->timestamp = ctx->timestamp;
+ __update_cgrp_time(info, ctx->timestamp, false);
+ __store_release(&info->active, 1);
}
}
static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
-#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
-#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
-
/*
* reschedule events based on the cgroup constraint of task.
- *
- * mode SWOUT : schedule out everything
- * mode SWIN : schedule in based on cgroup for next
*/
-static void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cgroup *cgrp;
+ struct perf_cpu_context *cpuctx, *tmp;
struct list_head *list;
unsigned long flags;
@@ -812,35 +840,31 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
*/
local_irq_save(flags);
+ cgrp = perf_cgroup_from_task(task, NULL);
+
list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+ list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
+ if (READ_ONCE(cpuctx->cgrp) == cgrp)
+ continue;
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(cpuctx->ctx.pmu);
- if (mode & PERF_CGROUP_SWOUT) {
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
- /*
- * must not be done before ctxswout due
- * to event_filter_match() in event_sched_out()
- */
- cpuctx->cgrp = NULL;
- }
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to update_cgrp_time_from_cpuctx() in
+ * ctx_sched_out()
+ */
+ cpuctx->cgrp = cgrp;
+ /*
+ * set cgrp before ctxsw in to allow
+ * perf_cgroup_set_timestamp() in ctx_sched_in()
+ * to not have to pass task around
+ */
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL);
- if (mode & PERF_CGROUP_SWIN) {
- WARN_ON_ONCE(cpuctx->cgrp);
- /*
- * set cgrp before ctxsw in to allow
- * event_filter_match() to not have to pass
- * task around
- * we pass the cpuctx->ctx to perf_cgroup_from_task()
- * because cgorup events are only per-cpu
- */
- cpuctx->cgrp = perf_cgroup_from_task(task,
- &cpuctx->ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
- }
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
@@ -848,58 +872,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
local_irq_restore(flags);
}
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
-
- rcu_read_lock();
- /*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
- */
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(next, NULL);
-
- /*
- * only schedule out current cgroup events if we know
- * that we are switching to a different cgroup. Otherwise,
- * do no touch the cgroup events.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
-
- rcu_read_unlock();
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
-{
- struct perf_cgroup *cgrp1;
- struct perf_cgroup *cgrp2 = NULL;
-
- rcu_read_lock();
- /*
- * we come here when we know perf_cgroup_events > 0
- * we do not need to pass the ctx here because we know
- * we are holding the rcu lock
- */
- cgrp1 = perf_cgroup_from_task(task, NULL);
- cgrp2 = perf_cgroup_from_task(prev, NULL);
-
- /*
- * only need to schedule in cgroup events if we are changing
- * cgroup during ctxsw. Cgroup events were not scheduled
- * out of ctxsw out if that was not the case.
- */
- if (cgrp1 != cgrp2)
- perf_cgroup_switch(task, PERF_CGROUP_SWIN);
-
- rcu_read_unlock();
-}
-
static int perf_cgroup_ensure_storage(struct perf_event *event,
struct cgroup_subsys_state *css)
{
@@ -982,14 +954,6 @@ out:
}
static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
-{
- struct perf_cgroup_info *t;
- t = per_cpu_ptr(event->cgrp->info, event->cpu);
- event->shadow_ctx_time = now - t->timestamp;
-}
-
-static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
@@ -1003,22 +967,10 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct
*/
cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
- /*
- * Since setting cpuctx->cgrp is conditional on the current @cgrp
- * matching the event's cgroup, we must do this for every new event,
- * because if the first would mismatch, the second would not try again
- * and we would leave cpuctx->cgrp unset.
- */
- if (ctx->is_active && !cpuctx->cgrp) {
- struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
-
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- cpuctx->cgrp = cgrp;
- }
-
if (ctx->nr_cgroups++)
return;
+ cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
list_add(&cpuctx->cgrp_cpuctx_entry,
per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
}
@@ -1040,9 +992,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c
if (--ctx->nr_cgroups)
return;
- if (ctx->is_active && cpuctx->cgrp)
- cpuctx->cgrp = NULL;
-
+ cpuctx->cgrp = NULL;
list_del(&cpuctx->cgrp_cpuctx_entry);
}
@@ -1066,17 +1016,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
-{
-}
-
-static inline void perf_cgroup_sched_out(struct task_struct *task,
- struct task_struct *next)
-{
-}
-
-static inline void perf_cgroup_sched_in(struct task_struct *prev,
- struct task_struct *task)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
+ bool final)
{
}
@@ -1088,22 +1029,16 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
}
static inline void
-perf_cgroup_set_timestamp(struct task_struct *task,
- struct perf_event_context *ctx)
-{
-}
-
-static inline void
-perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
}
-static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
+ return 0;
}
-static inline u64 perf_cgroup_event_time(struct perf_event *event)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
return 0;
}
@@ -1117,6 +1052,10 @@ static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}
+
+static void perf_cgroup_switch(struct task_struct *task)
+{
+}
#endif
/*
@@ -1525,22 +1464,59 @@ static void perf_unpin_context(struct perf_event_context *ctx)
/*
* Update the record of the current time in a context.
*/
-static void update_context_time(struct perf_event_context *ctx)
+static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
- ctx->time += now - ctx->timestamp;
+ if (adv)
+ ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
+
+ /*
+ * The above: time' = time + (now - timestamp), can be re-arranged
+ * into: time` = now + (time - timestamp), which gives a single value
+ * offset to compute future time without locks on.
+ *
+ * See perf_event_time_now(), which can be used from NMI context where
+ * it's (obviously) not possible to acquire ctx->lock in order to read
+ * both the above values in a consistent manner.
+ */
+ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+}
+
+static void update_context_time(struct perf_event_context *ctx)
+{
+ __update_context_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+ if (unlikely(!ctx))
+ return 0;
+
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx ? ctx->time : 0;
+ return ctx->time;
+}
+
+static u64 perf_event_time_now(struct perf_event *event, u64 now)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (unlikely(!ctx))
+ return 0;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time_now(event, now);
+
+ if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
+ return ctx->time;
+
+ now += READ_ONCE(ctx->timeoffset);
+ return now;
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -1808,6 +1784,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@@ -1999,6 +1977,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
ctx->nr_events--;
+ if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+ ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -2346,7 +2326,7 @@ __perf_remove_from_context(struct perf_event *event,
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
}
event_sched_out(event, cpuctx, ctx);
@@ -2357,6 +2337,9 @@ __perf_remove_from_context(struct perf_event *event,
list_del_event(event, ctx);
if (!ctx->nr_events && ctx->is_active) {
+ if (ctx == &cpuctx->ctx)
+ update_cgrp_time_from_cpuctx(cpuctx, true);
+
ctx->is_active = 0;
ctx->rotate_necessary = 0;
if (ctx->task) {
@@ -2388,7 +2371,11 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
* event_function_call() user.
*/
raw_spin_lock_irq(&ctx->lock);
- if (!ctx->is_active) {
+ /*
+ * Cgroup events are per-cpu events, and must IPI because of
+ * cgrp_cpuctx_list.
+ */
+ if (!ctx->is_active && !is_cgroup_event(event)) {
__perf_remove_from_context(event, __get_cpu_context(ctx),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
@@ -2478,40 +2465,6 @@ void perf_event_disable_inatomic(struct perf_event *event)
irq_work_queue(&event->pending);
}
-static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- /*
- * use the correct time source for the time snapshot
- *
- * We could get by without this by leveraging the
- * fact that to get to this function, the caller
- * has most likely already called update_context_time()
- * and update_cgrp_time_xx() and thus both timestamp
- * are identical (or very close). Given that tstamp is,
- * already adjusted for cgroup, we could say that:
- * tstamp - ctx->timestamp
- * is equivalent to
- * tstamp - cgrp->timestamp.
- *
- * Then, in perf_output_read(), the calculation would
- * work with no changes because:
- * - event is guaranteed scheduled in
- * - no scheduled out in between
- * - thus the timestamp would be the same
- *
- * But this is a bit hairy.
- *
- * So instead, we have an explicit cgroup call to remain
- * within the time source all along. We believe it
- * is cleaner and simpler to understand.
- */
- if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, event->tstamp);
- else
- event->shadow_ctx_time = event->tstamp - ctx->timestamp;
-}
-
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
@@ -2552,8 +2505,6 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx);
-
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
@@ -2671,8 +2622,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task);
+ enum event_type_t event_type);
static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
@@ -2688,15 +2638,14 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx,
- struct task_struct *task)
+ struct perf_event_context *ctx)
{
- cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_PINNED);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
if (ctx)
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
}
/*
@@ -2746,7 +2695,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
else if (ctx_event_type & EVENT_PINNED)
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx);
perf_pmu_enable(cpuctx->ctx.pmu);
}
@@ -2857,11 +2806,14 @@ perf_install_in_context(struct perf_event_context *ctx,
* perf_event_attr::disabled events will not run and can be initialized
* without IPI. Except when this is the first event for the context, in
* that case we need the magic of the IPI to set ctx->is_active.
+ * Similarly, cgroup events for the context also needs the IPI to
+ * manipulate the cgrp_cpuctx_list.
*
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
*/
- if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
+ ctx->nr_events && !is_cgroup_event(event)) {
raw_spin_lock_irq(&ctx->lock);
if (ctx->task == TASK_TOMBSTONE) {
raw_spin_unlock_irq(&ctx->lock);
@@ -2966,7 +2918,7 @@ static void __perf_event_enable(struct perf_event *event,
return;
if (!event_filter_match(event)) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME);
return;
}
@@ -2975,7 +2927,7 @@ static void __perf_event_enable(struct perf_event *event,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME);
return;
}
@@ -3193,6 +3145,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
return err;
}
+/*
+ * Copy event-type-independent attributes that may be modified.
+ */
+static void perf_event_modify_copy_attr(struct perf_event_attr *to,
+ const struct perf_event_attr *from)
+{
+ to->sig_data = from->sig_data;
+}
+
static int perf_event_modify_attr(struct perf_event *event,
struct perf_event_attr *attr)
{
@@ -3215,10 +3176,17 @@ static int perf_event_modify_attr(struct perf_event *event,
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->child_mutex);
+ /*
+ * Event-type-independent attributes must be copied before event-type
+ * modification, which will validate that final attributes match the
+ * source attributes after all relevant attributes have been copied.
+ */
+ perf_event_modify_copy_attr(&event->attr, attr);
err = func(event, attr);
if (err)
goto out;
list_for_each_entry(child, &event->child_list, child_list) {
+ perf_event_modify_copy_attr(&child->attr, attr);
err = func(child, attr);
if (err)
goto out;
@@ -3247,16 +3215,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
return;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
- if (ctx->task) {
- WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
- cpuctx->task_ctx = NULL;
- }
-
/*
* Always update time if it was set; not only when it changes.
* Otherwise we can 'forget' to update time for any but the last
@@ -3270,7 +3228,22 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
+ ctx->is_active &= ~event_type;
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!ctx->is_active)
+ cpuctx->task_ctx = NULL;
}
is_active ^= ctx->is_active; /* changed bits */
@@ -3602,7 +3575,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
* cgroup event are system-wide mode only
*/
if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_out(task, next);
+ perf_cgroup_switch(next);
}
/*
@@ -3707,13 +3680,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
+/*
+ * Because the userpage is strictly per-event (there is no concept of context,
+ * so there cannot be a context indirection), every userpage must be updated
+ * when context time starts :-(
+ *
+ * IOW, we must not miss EVENT_TIME edges.
+ */
static inline bool event_update_userpage(struct perf_event *event)
{
if (likely(!atomic_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
- perf_set_shadow_time(event, event->ctx);
perf_event_update_userpage(event);
return true;
@@ -3793,17 +3772,26 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+ enum event_type_t event_type)
{
int is_active = ctx->is_active;
- u64 now;
lockdep_assert_held(&ctx->lock);
if (likely(!ctx->nr_events))
return;
+ if (is_active ^ EVENT_TIME) {
+ /* start ctx time */
+ __update_context_time(ctx, false);
+ perf_cgroup_set_timestamp(cpuctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
if (!is_active)
@@ -3814,13 +3802,6 @@ ctx_sched_in(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (is_active & EVENT_TIME) {
- /* start ctx time */
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
- }
-
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
@@ -3834,12 +3815,11 @@ ctx_sched_in(struct perf_event_context *ctx,
}
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type,
- struct task_struct *task)
+ enum event_type_t event_type)
{
struct perf_event_context *ctx = &cpuctx->ctx;
- ctx_sched_in(ctx, cpuctx, event_type, task);
+ ctx_sched_in(ctx, cpuctx, event_type);
}
static void perf_event_context_sched_in(struct perf_event_context *ctx,
@@ -3881,7 +3861,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, ctx, task);
+ perf_event_sched_in(cpuctx, ctx);
if (cpuctx->sched_cb_usage && pmu->sched_task)
pmu->sched_task(cpuctx->task_ctx, true);
@@ -3909,16 +3889,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
struct perf_event_context *ctx;
int ctxn;
- /*
- * If cgroup events exist on this CPU, then we need to check if we have
- * to switch in PMU state; cgroup event are system-wide mode only.
- *
- * Since cgroup events are CPU events, we must schedule these in before
- * we schedule in the task events.
- */
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_in(prev, task);
-
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (likely(!ctx))
@@ -4192,7 +4162,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
if (cpu_event)
rotate_ctx(&cpuctx->ctx, cpu_event);
- perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -4264,7 +4234,7 @@ static void perf_event_enable_on_exec(int ctxn)
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
- ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME);
}
perf_ctx_unlock(cpuctx, ctx);
@@ -4414,6 +4384,18 @@ static inline u64 perf_event_count(struct perf_event *event)
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
+static void calc_timer_values(struct perf_event *event,
+ u64 *now,
+ u64 *enabled,
+ u64 *running)
+{
+ u64 ctx_time;
+
+ *now = perf_clock();
+ ctx_time = perf_event_time_now(event, *now);
+ __perf_update_times(event, ctx_time, enabled, running);
+}
+
/*
* NMI-safe method to read a local event, that is an event that
* is:
@@ -4473,10 +4455,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
*value = local64_read(&event->count);
if (enabled || running) {
- u64 now = event->shadow_ctx_time + perf_clock();
- u64 __enabled, __running;
+ u64 __enabled, __running, __now;;
- __perf_update_times(event, now, &__enabled, &__running);
+ calc_timer_values(event, &__now, &__enabled, &__running);
if (enabled)
*enabled = __enabled;
if (running)
@@ -5798,18 +5779,6 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
-static void calc_timer_values(struct perf_event *event,
- u64 *now,
- u64 *enabled,
- u64 *running)
-{
- u64 ctx_time;
-
- *now = perf_clock();
- ctx_time = event->shadow_ctx_time + *now;
- __perf_update_times(event, ctx_time, enabled, running);
-}
-
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
@@ -5934,6 +5903,8 @@ static void ring_buffer_attach(struct perf_event *event,
struct perf_buffer *old_rb = NULL;
unsigned long flags;
+ WARN_ON_ONCE(event->parent);
+
if (event->rb) {
/*
* Should be impossible, we set this when removing
@@ -5991,6 +5962,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -6004,6 +5978,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -6349,7 +6326,6 @@ accounting:
ring_buffer_attach(event, rb);
perf_event_update_time(event);
- perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
@@ -6521,26 +6497,43 @@ static void perf_pending_event(struct irq_work *entry)
perf_swevent_put_recursion_context(rctx);
}
-/*
- * We assume there is only KVM supporting the callbacks.
- * Later on, we might change it to a list if there is
- * another virtualization implementation supporting the callbacks.
- */
-struct perf_guest_info_callbacks *perf_guest_cbs;
+#ifdef CONFIG_GUEST_PERF_EVENTS
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
-int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
+DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
+DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
+
+void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = cbs;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, cbs);
+ static_call_update(__perf_guest_state, cbs->state);
+ static_call_update(__perf_guest_get_ip, cbs->get_ip);
+
+ /* Implementing ->handle_intel_pt_intr is optional. */
+ if (cbs->handle_intel_pt_intr)
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
-int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
+void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = NULL;
- return 0;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
+ return;
+
+ rcu_assign_pointer(perf_guest_cbs, NULL);
+ static_call_update(__perf_guest_state, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
+ static_call_update(__perf_guest_handle_intel_pt_intr,
+ (void *)&__static_call_return0);
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+#endif
static void
perf_output_sample_regs(struct perf_output_handle *handle,
@@ -6648,7 +6641,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
unsigned long sp;
unsigned int rem;
u64 dyn_size;
- mm_segment_t fs;
/*
* We dump:
@@ -6666,9 +6658,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -6696,7 +6686,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event,
if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
goto out;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
goto out;
@@ -6762,7 +6752,7 @@ static void perf_aux_sample_output(struct perf_event *event,
if (WARN_ON_ONCE(!sampler || !data->aux_size))
return;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
return;
@@ -9759,6 +9749,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
continue;
if (event->attr.config != entry->type)
continue;
+ /* Cannot deliver synchronous signal to other task. */
+ if (event->attr.sigtrap)
+ continue;
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
}
@@ -10430,8 +10423,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
*/
if (state == IF_STATE_END) {
ret = -EINVAL;
- if (kernel && event->attr.exclude_kernel)
- goto fail;
/*
* ACTION "filter" must have a non-zero length region
@@ -10473,8 +10464,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
}
/* ready to consume more filters */
+ kfree(filename);
+ filename = NULL;
state = IF_STATE_ACTION;
filter = NULL;
+ kernel = 0;
}
}
@@ -11536,6 +11530,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->state = PERF_EVENT_STATE_INACTIVE;
+ if (parent_event)
+ event->event_caps = parent_event->event_caps;
+
if (event->attr.sigtrap)
atomic_set(&event->event_limit, 1);
@@ -13463,7 +13460,7 @@ static int __perf_cgroup_move(void *info)
{
struct task_struct *task = info;
rcu_read_lock();
- perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+ perf_cgroup_switch(task);
rcu_read_unlock();
return 0;
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6357c3580d07..6418083901d4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -155,11 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = compound_head(old_page),
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
int err;
struct mmu_notifier_range range;
@@ -173,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
return err;
}
- /* For try_to_free_swap() and munlock_vma_page() below */
+ /* For try_to_free_swap() below */
lock_page(old_page);
mmu_notifier_invalidate_range_start(&range);
@@ -201,13 +197,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(old_page, false);
+ page_remove_rmap(old_page, vma, false);
if (!page_mapped(old_page))
try_to_free_swap(old_page);
page_vma_mapped_walk_done(&pvmw);
-
- if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
- munlock_vma_page(old_page);
put_page(old_page);
err = 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index f702a6a63686..f072959fcab7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,7 +49,8 @@
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
@@ -64,6 +65,7 @@
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
+#include <linux/rethook.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -116,7 +118,7 @@ static void __exit_signal(struct task_struct *tsk)
* then notify it:
*/
if (sig->notify_count > 0 && !--sig->notify_count)
- wake_up_process(sig->group_exit_task);
+ wake_up_process(sig->group_exec_task);
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
@@ -169,6 +171,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
kprobe_flush_task(tsk);
+ rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
@@ -697,7 +700,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
- wake_up_process(tsk->signal->group_exit_task);
+ wake_up_process(tsk->signal->group_exec_task);
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
@@ -735,37 +738,8 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- /*
- * We can get here from a kernel oops, sometimes with preemption off.
- * Start by checking for critical errors.
- * Then fix up important state like USER_DS and preemption.
- * Then do everything else.
- */
-
- WARN_ON(blk_needs_flush_plug(tsk));
-
- if (unlikely(in_interrupt()))
- panic("Aiee, killing interrupt handler!");
- if (unlikely(!tsk->pid))
- panic("Attempted to kill the idle task!");
-
- /*
- * If do_exit is called because this processes oopsed, it's possible
- * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
- * continuing. Amongst other possible reasons, this is to prevent
- * mm_release()->clear_child_tid() from writing to a user-controlled
- * kernel address.
- */
- force_uaccess_begin();
+ WARN_ON(tsk->plug);
- if (unlikely(in_atomic())) {
- pr_info("note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
- preempt_count_set(PREEMPT_ENABLED);
- }
-
- profile_task_exit(tsk);
kcov_task_exit(tsk);
coredump_task_exit(tsk);
@@ -773,17 +747,6 @@ void __noreturn do_exit(long code)
validate_creds_for_do_exit(tsk);
- /*
- * We're taking recursive faults here in do_exit. Safest is to just
- * leave this task alone and wait for reboot.
- */
- if (unlikely(tsk->flags & PF_EXITING)) {
- pr_alert("Fixing recursive fault but reboot is needed!\n");
- futex_exit_recursive(tsk);
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule();
- }
-
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
@@ -871,6 +834,7 @@ void __noreturn do_exit(long code)
put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
+ exit_task_stack_account(tsk);
check_stack_usage();
preempt_disable();
@@ -882,16 +846,46 @@ void __noreturn do_exit(long code)
lockdep_free_task(tsk);
do_task_dead();
}
-EXPORT_SYMBOL_GPL(do_exit);
-void complete_and_exit(struct completion *comp, long code)
+void __noreturn make_task_dead(int signr)
{
- if (comp)
- complete(comp);
+ /*
+ * Take the task off the cpu after something catastrophic has
+ * happened.
+ *
+ * We can get here from a kernel oops, sometimes with preemption off.
+ * Start by checking for critical errors.
+ * Then fix up important state like USER_DS and preemption.
+ * Then do everything else.
+ */
+ struct task_struct *tsk = current;
+
+ if (unlikely(in_interrupt()))
+ panic("Aiee, killing interrupt handler!");
+ if (unlikely(!tsk->pid))
+ panic("Attempted to kill the idle task!");
+
+ if (unlikely(in_atomic())) {
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
+ preempt_count_set(PREEMPT_ENABLED);
+ }
- do_exit(code);
+ /*
+ * We're taking recursive faults here in make_task_dead. Safest is to just
+ * leave this task alone and wait for reboot.
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
+ futex_exit_recursive(tsk);
+ tsk->exit_state = EXIT_DEAD;
+ refcount_inc(&tsk->rcu_users);
+ do_task_dead();
+ }
+
+ do_exit(signr);
}
-EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
{
@@ -902,22 +896,24 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-void
+void __noreturn
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
- BUG_ON(exit_code & 0x80); /* core dumps don't get here */
-
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
exit_code = sig->group_exit_code;
+ else if (sig->group_exec_task)
+ exit_code = 0;
else if (!thread_group_empty(current)) {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
- if (signal_group_exit(sig))
+ if (sig->flags & SIGNAL_GROUP_EXIT)
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
+ else if (sig->group_exec_task)
+ exit_code = 0;
else {
sig->group_exit_code = exit_code;
sig->flags = SIGNAL_GROUP_EXIT;
@@ -1012,7 +1008,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
- status = p->exit_code;
+ status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+ ? p->signal->group_exit_code : p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
diff --git a/kernel/extable.c b/kernel/extable.c
index b6f330f0fe74..bda5e9761541 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -3,6 +3,7 @@
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
*/
+#include <linux/elf.h>
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/extable.h>
@@ -132,12 +133,33 @@ out:
}
/*
- * On some architectures (PPC64, IA64) function pointers
+ * On some architectures (PPC64, IA64, PARISC) function pointers
* are actually only tokens to some data that then holds the
* real function address. As a result, to find if a function
* pointer is part of the kernel text, we need to do some
* special dereferencing first.
*/
+#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
+void *dereference_function_descriptor(void *ptr)
+{
+ func_desc_t *desc = ptr;
+ void *p;
+
+ if (!get_kernel_nofault(p, (void *)&desc->addr))
+ ptr = p;
+ return ptr;
+}
+EXPORT_SYMBOL_GPL(dereference_function_descriptor);
+
+void *dereference_kernel_function_descriptor(void *ptr)
+{
+ if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
+ return ptr;
+
+ return dereference_function_descriptor(ptr);
+}
+#endif
+
int func_ptr_is_kernel_text(void *ptr)
{
unsigned long addr;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3244cc56b697..9796897560ab 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
@@ -96,6 +97,7 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -184,7 +186,7 @@ static inline void free_task_struct(struct task_struct *tsk)
*/
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-#ifdef CONFIG_VMAP_STACK
+# ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
@@ -192,6 +194,41 @@ static inline void free_task_struct(struct task_struct *tsk)
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+ unsigned int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
static int free_vm_stack_cache(unsigned int cpu)
{
struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -209,11 +246,35 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}
-#endif
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
-#ifdef CONFIG_VMAP_STACK
+ int i;
+ int ret;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ goto err;
+ }
+ return 0;
+err:
+ /*
+ * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
+ * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
+ * ignore this page.
+ */
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+ return ret;
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm;
void *stack;
int i;
@@ -225,15 +286,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
- /* Mark stack accessible for KASAN. */
+ /* Reset stack metadata. */
kasan_unpoison_range(s->addr, THREAD_SIZE);
+ stack = kasan_reset_tag(s->addr);
+
/* Clear stale pointers from reused stack. */
- memset(s->addr, 0, THREAD_SIZE);
+ memset(stack, 0, THREAD_SIZE);
+
+ if (memcg_charge_kernel_stack(s)) {
+ vfree(s->addr);
+ return -ENOMEM;
+ }
tsk->stack_vm_area = s;
- tsk->stack = s->addr;
- return s->addr;
+ tsk->stack = stack;
+ return 0;
}
/*
@@ -246,71 +314,96 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
+ if (!stack)
+ return -ENOMEM;
+ vm = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm)) {
+ vfree(stack);
+ return -ENOMEM;
+ }
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- if (stack) {
- tsk->stack_vm_area = find_vm_area(stack);
- tsk->stack = stack;
- }
- return stack;
-#else
+ tsk->stack_vm_area = vm;
+ stack = kasan_reset_tag(stack);
+ tsk->stack = stack;
+ return 0;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);
+
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+}
+
+# else /* !CONFIG_VMAP_STACK */
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
if (likely(page)) {
tsk->stack = kasan_reset_tag(page_address(page));
- return tsk->stack;
+ return 0;
}
- return NULL;
-#endif
+ return -ENOMEM;
}
-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
- int i;
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
+}
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+# endif /* CONFIG_VMAP_STACK */
+# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i],
- NULL, tsk->stack_vm_area) != NULL)
- continue;
+static struct kmem_cache *thread_stack_cache;
- return;
- }
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ kmem_cache_free(thread_stack_cache, rh);
+}
- vfree_atomic(tsk->stack);
- return;
- }
-#endif
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
- __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ call_rcu(rh, thread_stack_free_rcu);
}
-# else
-static struct kmem_cache *thread_stack_cache;
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
stack = kasan_reset_tag(stack);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}
static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, tsk->stack);
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
}
void thread_stack_cache_init(void)
@@ -320,8 +413,26 @@ void thread_stack_cache_init(void)
THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
-# endif
-#endif
+
+# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ unsigned long *stack;
+
+ stack = arch_alloc_thread_stack_node(tsk, node);
+ tsk->stack = stack;
+ return stack ? 0 : -ENOMEM;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ arch_free_thread_stack(tsk);
+ tsk->stack = NULL;
+}
+
+#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -365,61 +476,47 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
+ dup_anon_vma_name(orig, new);
}
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
+ free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
static void account_kernel_stack(struct task_struct *tsk, int account)
{
- void *stack = task_stack_page(tsk);
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm = task_stack_vm_area(tsk);
int i;
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
+ void *stack = task_stack_page(tsk);
+
/* All stack pages are in the same node. */
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
}
}
-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+void exit_task_stack_account(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
- int ret;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ account_kernel_stack(tsk, -1);
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm;
int i;
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- /*
- * If memcg_kmem_charge_page() fails, page's
- * memory cgroup pointer is NULL, and
- * memcg_kmem_uncharge_page() in free_thread_stack()
- * will ignore this page.
- */
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
- 0);
- if (ret)
- return ret;
- }
+ vm = task_stack_vm_area(tsk);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
}
-#endif
- return 0;
}
static void release_task_stack(struct task_struct *tsk)
@@ -427,12 +524,7 @@ static void release_task_stack(struct task_struct *tsk)
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
- account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
- tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = NULL;
-#endif
}
#ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -754,9 +846,7 @@ void __put_task_struct(struct task_struct *tsk)
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
sched_core_free(tsk);
-
- if (!profile_handoff_task(tsk))
- free_task(tsk);
+ free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);
@@ -873,8 +963,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- unsigned long *stack;
- struct vm_struct *stack_vm_area __maybe_unused;
int err;
if (node == NUMA_NO_NODE)
@@ -883,32 +971,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!tsk)
return NULL;
- stack = alloc_thread_stack_node(tsk, node);
- if (!stack)
+ err = arch_dup_task_struct(tsk, orig);
+ if (err)
goto free_tsk;
- if (memcg_charge_kernel_stack(tsk))
- goto free_stack;
-
- stack_vm_area = task_stack_vm_area(tsk);
-
- err = arch_dup_task_struct(tsk, orig);
+ err = alloc_thread_stack_node(tsk, node);
+ if (err)
+ goto free_tsk;
- /*
- * arch_dup_task_struct() clobbers the stack-related fields. Make
- * sure they're properly initialized before using any stack-related
- * functions again.
- */
- tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = stack_vm_area;
-#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
-
- if (err)
- goto free_stack;
+ account_kernel_stack(tsk, 1);
err = scs_prepare(tsk, node);
if (err)
@@ -950,9 +1024,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
- tsk->pf_io_worker = NULL;
-
- account_kernel_stack(tsk, 1);
+ tsk->worker_private = NULL;
kcov_task_init(tsk);
kmap_local_fork(tsk);
@@ -966,12 +1038,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->use_memdelay = 0;
#endif
+#ifdef CONFIG_IOMMU_SVA
+ tsk->pasid_activated = 0;
+#endif
+
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
return tsk;
free_stack:
+ exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
@@ -1018,13 +1095,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
-static void mm_init_pasid(struct mm_struct *mm)
-{
-#ifdef CONFIG_IOMMU_SUPPORT
- mm->pasid = INIT_PASID;
-#endif
-}
-
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
@@ -1053,7 +1123,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
- mm_init_pasid(mm);
+ mm_pasid_init(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
@@ -1120,6 +1190,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ mm_pasid_drop(mm);
mmdrop(mm);
}
@@ -1556,32 +1627,6 @@ out:
return error;
}
-static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
-{
-#ifdef CONFIG_BLOCK
- struct io_context *ioc = current->io_context;
- struct io_context *new_ioc;
-
- if (!ioc)
- return 0;
- /*
- * Share io context with parent, if CLONE_IO is set
- */
- if (clone_flags & CLONE_IO) {
- ioc_task_link(ioc);
- tsk->io_context = ioc;
- } else if (ioprio_valid(ioc->ioprio)) {
- new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
- if (unlikely(!new_ioc))
- return -ENOMEM;
-
- new_ioc->ioprio = ioc->ioprio;
- put_io_context(new_ioc);
- }
-#endif
- return 0;
-}
-
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@@ -2032,12 +2077,6 @@ static __latent_entropy struct task_struct *copy_process(
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
- /*
- * This _must_ happen before we call free_task(), i.e. before we jump
- * to any of the bad_fork_* labels. This is to avoid freeing
- * p->set_child_tid which is (ab)used as a kthread's data pointer for
- * kernel threads (PF_KTHREAD).
- */
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@@ -2052,18 +2091,18 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
+ retval = copy_creds(p, clone_flags);
+ if (retval < 0)
+ goto bad_fork_free;
+
retval = -EAGAIN;
if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
- goto bad_fork_free;
+ goto bad_fork_cleanup_count;
}
current->flags &= ~PF_NPROC_EXCEEDED;
- retval = copy_creds(p, clone_flags);
- if (retval < 0)
- goto bad_fork_free;
-
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
@@ -2118,12 +2157,16 @@ static __latent_entropy struct task_struct *copy_process(
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
+ if (p->flags & PF_KTHREAD) {
+ if (!set_kthread_struct(p))
+ goto bad_fork_cleanup_delayacct;
+ }
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
- goto bad_fork_cleanup_threadgroup_lock;
+ goto bad_fork_cleanup_delayacct;
}
#endif
#ifdef CONFIG_CPUSETS
@@ -2282,6 +2325,9 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
#endif
+#ifdef CONFIG_RETHOOK
+ p->rethooks.first = NULL;
+#endif
/*
* Ensure that the cgroup subsystem policies allow the new process to be
@@ -2294,6 +2340,17 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_put_pidfd;
/*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ sched_cgroup_fork(p, args);
+
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2350,10 +2407,6 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
- /* past the last point of failure */
- if (pidfile)
- fd_install(pidfd, pidfile);
-
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -2402,8 +2455,11 @@ static __latent_entropy struct task_struct *copy_process(
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+ if (pidfile)
+ fd_install(pidfd, pidfile);
+
proc_fork_connector(p);
- sched_post_fork(p, args);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);
@@ -2460,14 +2516,15 @@ bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
-bad_fork_cleanup_threadgroup_lock:
#endif
+bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
+ exit_task_stack_account(p);
put_task_stack(p);
delayed_free_task(p);
fork_out:
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 25d8a88b32e5..b22ef1efe751 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -41,11 +41,6 @@
#include "futex.h"
#include "../locking/rtmutex_common.h"
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-int __read_mostly futex_cmpxchg_enabled;
-#endif
-
-
/*
* The base of the bucket array and its size are always used together
* (after initialization only in futex_hash()), so ensure that they
@@ -307,7 +302,7 @@ again:
* found it, but truncated or holepunched or subjected to
* invalidate_complete_page2 before we got the page lock (also
* cases which we are happy to fail). And we hold a reference,
- * so refcount care in invalidate_complete_page's remove_mapping
+ * so refcount care in invalidate_inode_page's remove_mapping
* prevents drop_caches from setting mapping to NULL beneath us.
*
* The case we do have to guard against is when memory pressure made
@@ -776,9 +771,6 @@ static void exit_robust_list(struct task_struct *curr)
unsigned long futex_offset;
int rc;
- if (!futex_cmpxchg_enabled)
- return;
-
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
@@ -874,9 +866,6 @@ static void compat_exit_robust_list(struct task_struct *curr)
compat_long_t futex_offset;
int rc;
- if (!futex_cmpxchg_enabled)
- return;
-
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
@@ -950,8 +939,6 @@ static void exit_pi_state_list(struct task_struct *curr)
struct futex_hash_bucket *hb;
union futex_key key = FUTEX_KEY_INIT;
- if (!futex_cmpxchg_enabled)
- return;
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
@@ -1044,7 +1031,7 @@ static void futex_cleanup(struct task_struct *tsk)
* actually finished the futex cleanup. The worst case for this is that the
* waiter runs through the wait loop until the state becomes visible.
*
- * This is called from the recursive fault handling path in do_exit().
+ * This is called from the recursive fault handling path in make_task_dead().
*
* This is best effort. Either the futex exit code has run already or
* not. If the OWNER_DIED bit has been set on the futex then the waiter can
@@ -1125,26 +1112,6 @@ void futex_exit_release(struct task_struct *tsk)
futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
-static void __init futex_detect_cmpxchg(void)
-{
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
- u32 curval;
-
- /*
- * This will fail and we want it. Some arch implementations do
- * runtime detection of the futex_atomic_cmpxchg_inatomic()
- * functionality. We want to know that before we call in any
- * of the complex code paths. Also we want to prevent
- * registration of robust lists in that case. NULL is
- * guaranteed to fault and we get -EFAULT on functional
- * implementation, the non-functional ones will return
- * -ENOSYS.
- */
- if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT)
- futex_cmpxchg_enabled = 1;
-#endif
-}
-
static int __init futex_init(void)
{
unsigned int futex_shift;
@@ -1163,8 +1130,6 @@ static int __init futex_init(void)
futex_hashsize, futex_hashsize);
futex_hashsize = 1UL << futex_shift;
- futex_detect_cmpxchg();
-
for (i = 0; i < futex_hashsize; i++) {
atomic_set(&futex_queues[i].waiters, 0);
plist_head_init(&futex_queues[i].chain);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 040ae4277cb0..c264cbeab71c 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -27,12 +27,6 @@
#define FLAGS_CLOCKRT 0x02
#define FLAGS_HAS_TIMEOUT 0x04
-#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
-#define futex_cmpxchg_enabled 1
-#else
-extern int __read_mostly futex_cmpxchg_enabled;
-#endif
-
#ifdef CONFIG_FAIL_FUTEX
extern bool should_fail_futex(bool fshared);
#else
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 6f91a07a6a83..086a22d1adb7 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -29,8 +29,6 @@
SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
size_t, len)
{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
/*
* The kernel knows only one size for now:
*/
@@ -56,9 +54,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
unsigned long ret;
struct task_struct *p;
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
rcu_read_lock();
ret = -ESRCH;
@@ -104,17 +99,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
}
switch (cmd) {
- case FUTEX_LOCK_PI:
- case FUTEX_LOCK_PI2:
- case FUTEX_UNLOCK_PI:
- case FUTEX_TRYLOCK_PI:
- case FUTEX_WAIT_REQUEUE_PI:
- case FUTEX_CMP_REQUEUE_PI:
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
- }
-
- switch (cmd) {
case FUTEX_WAIT:
val3 = FUTEX_BITSET_MATCH_ANY;
fallthrough;
@@ -323,9 +307,6 @@ COMPAT_SYSCALL_DEFINE2(set_robust_list,
struct compat_robust_list_head __user *, head,
compat_size_t, len)
{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
if (unlikely(len != sizeof(*head)))
return -EINVAL;
@@ -342,9 +323,6 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
unsigned long ret;
struct task_struct *p;
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
rcu_read_lock();
ret = -ESRCH;
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 053447183ac5..04f4ebdc3cf5 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
- depends on !CC_IS_CLANG || CLANG_VERSION >= 110000
depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR
select CONSTRUCTORS
default n
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 9888e2bc8c76..52501e5f7655 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -63,7 +63,9 @@ static struct task_struct *watchdog_task;
* Should we dump all CPUs backtraces in a hung task event?
* Defaults to 0, can be changed via sysctl.
*/
-unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
+#else
+#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
/*
@@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked,
MAX_SCHEDULE_TIMEOUT;
}
+#ifdef CONFIG_SYSCTL
/*
* Process updating of timeout sysctl
*/
-int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
@@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
return ret;
}
+/*
+ * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
+ * and hung_task_check_interval_secs
+ */
+static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
+static struct ctl_table hung_task_sysctls[] = {
+#ifdef CONFIG_SMP
+ {
+ .procname = "hung_task_all_cpu_backtrace",
+ .data = &sysctl_hung_task_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif /* CONFIG_SMP */
+ {
+ .procname = "hung_task_panic",
+ .data = &sysctl_hung_task_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "hung_task_check_count",
+ .data = &sysctl_hung_task_check_count,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "hung_task_timeout_secs",
+ .data = &sysctl_hung_task_timeout_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_check_interval_secs",
+ .data = &sysctl_hung_task_check_interval_secs,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_dohung_task_timeout_secs,
+ .extra2 = (void *)&hung_task_timeout_max,
+ },
+ {
+ .procname = "hung_task_warnings",
+ .data = &sysctl_hung_task_warnings,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_NEG_ONE,
+ },
+ {}
+};
+
+static void __init hung_task_sysctl_init(void)
+{
+ register_sysctl_init("kernel", hung_task_sysctls);
+}
+#else
+#define hung_task_sysctl_init() do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+
static atomic_t reset_hung_task = ATOMIC_INIT(0);
void reset_hung_task_detector(void)
@@ -310,6 +384,7 @@ static int __init hung_task_init(void)
pm_notifier(hungtask_pm_notify, 0);
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+ hung_task_sysctl_init();
return 0;
}
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f7ff8919dc9b..fdf170404650 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -269,8 +269,9 @@ static int __irq_build_affinity_masks(unsigned int startvec,
*/
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
- node_to_cpumask[n]);
+ /* Ensure that only CPUs which are in both masks are set */
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
if (++curvec == last_affv)
curvec = firstvec;
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f895265d7548..54af0deb239b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -38,7 +38,7 @@ struct irqaction chained_action = {
* @irq: irq number
* @chip: pointer to irq chip description structure
*/
-int irq_set_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
@@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
if (!desc)
return -EINVAL;
- if (!chip)
- chip = &no_irq_chip;
-
- desc->irq_data.chip = chip;
+ desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
irq_put_desc_unlock(desc, flags);
/*
* For !CONFIG_SPARSE_IRQ make the irq show up in
@@ -575,8 +572,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq);
*/
void handle_untracked_irq(struct irq_desc *desc)
{
- unsigned int flags = 0;
-
raw_spin_lock(&desc->lock);
if (!irq_may_run(desc))
@@ -593,7 +588,7 @@ void handle_untracked_irq(struct irq_desc *desc)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
- __handle_irq_event_percpu(desc, &flags);
+ __handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -1075,7 +1070,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
void
-irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
irq_set_chip(irq, chip);
@@ -1560,6 +1555,14 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
+static struct device *irq_get_parent_device(struct irq_data *data)
+{
+ if (data->domain)
+ return data->domain->dev;
+
+ return NULL;
+}
+
/**
* irq_chip_pm_get - Enable power for an IRQ chip
* @data: Pointer to interrupt specific data
@@ -1569,12 +1572,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
*/
int irq_chip_pm_get(struct irq_data *data)
{
+ struct device *dev = irq_get_parent_device(data);
int retval;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
- retval = pm_runtime_get_sync(data->chip->parent_device);
+ if (IS_ENABLED(CONFIG_PM) && dev) {
+ retval = pm_runtime_get_sync(dev);
if (retval < 0) {
- pm_runtime_put_noidle(data->chip->parent_device);
+ pm_runtime_put_noidle(dev);
return retval;
}
}
@@ -1592,10 +1596,11 @@ int irq_chip_pm_get(struct irq_data *data)
*/
int irq_chip_pm_put(struct irq_data *data)
{
+ struct device *dev = irq_get_parent_device(data);
int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
- retval = pm_runtime_put(data->chip->parent_device);
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_put(dev);
return (retval < 0) ? retval : 0;
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 39a41c56ad4f..1ed2b1739363 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
{
const struct cpumask *hk_mask;
- if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+ if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
return false;
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
return false;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index e4cff358b437..2b43f5f5033d 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -69,8 +69,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
seq_printf(m, "chip: None\n");
return;
}
- seq_printf(m, "%*schip: %s\n", ind, "", chip->name);
- seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
+ seq_printf(m, "%*schip: ", ind, "");
+ if (chip->irq_print_chip)
+ chip->irq_print_chip(data, m);
+ else
+ seq_printf(m, "%s", chip->name);
+ seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
ARRAY_SIZE(irqchip_flags));
}
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 6f29bf4c8515..f0862eb6b506 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -451,7 +451,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
}
-struct irq_domain_ops irq_generic_chip_ops = {
+const struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
.unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27182003b879..9489f93b3db3 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -136,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
unsigned int irq = desc->irq_data.irq;
@@ -174,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
}
__irq_wake_thread(desc, action);
-
- fallthrough; /* to add to randomness */
- case IRQ_HANDLED:
- *flags |= action->flags;
break;
default:
@@ -193,11 +189,10 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval;
- unsigned int flags = 0;
- retval = __handle_irq_event_percpu(desc, &flags);
+ retval = __handle_irq_event_percpu(desc);
- add_interrupt_randomness(desc->irq_data.irq, flags);
+ add_interrupt_randomness(desc->irq_data.irq);
if (!irq_settings_no_debug(desc))
note_interrupt(desc, retval);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 54363527feea..99cbdf55a8bd 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -103,7 +103,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data,
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
-irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
+irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2267e6527db3..939d21cd55c3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -640,7 +640,7 @@ int handle_irq_desc(struct irq_desc *desc)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+ if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data)))
return -EPERM;
generic_handle_irq_desc(desc);
@@ -662,6 +662,29 @@ int generic_handle_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+/**
+ * generic_handle_irq_safe - Invoke the handler for a particular irq from any
+ * context.
+ * @irq: The irq number to handle
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process context). It
+ * will report an error if not invoked from IRQ context and the irq has been
+ * marked to enforce IRQ-context only.
+ */
+int generic_handle_irq_safe(unsigned int irq)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_to_desc(irq));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
+
#ifdef CONFIG_IRQ_DOMAIN
/**
* generic_handle_domain_irq - Invoke the handler for a HW irq belonging
@@ -676,7 +699,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq);
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
- WARN_ON_ONCE(!in_irq());
+ WARN_ON_ONCE(!in_hardirq());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index bf38c546aa25..d5ce96510549 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1319,7 +1319,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @chip_data: The associated chip data
*/
int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq,
+ const struct irq_chip *chip,
void *chip_data)
{
struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
@@ -1328,7 +1329,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
return -ENOENT;
irq_data->hwirq = hwirq;
- irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip);
irq_data->chip_data = chip_data;
return 0;
@@ -1347,7 +1348,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -1853,7 +1854,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7405e384e5ed..c03f71d5ec10 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -247,13 +247,13 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
* online.
*/
if (irqd_affinity_is_managed(data) &&
- housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
+ housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
const struct cpumask *hk_mask, *prog_mask;
static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
static struct cpumask tmp_mask;
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
raw_spin_lock(&tmp_mask_lock);
cpumask_and(&tmp_mask, mask, hk_mask);
@@ -486,7 +486,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
}
EXPORT_SYMBOL_GPL(irq_force_affinity);
-int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
+int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
+ bool setaffinity)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
@@ -495,12 +496,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
return -EINVAL;
desc->affinity_hint = m;
irq_put_desc_unlock(desc, flags);
- /* set the initial affinity to prevent every interrupt being on CPU0 */
- if (m)
+ if (m && setaffinity)
__irq_set_affinity(irq, m, false);
return 0;
}
-EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
static void irq_affinity_notify(struct work_struct *work)
{
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7f350ae59c5f..2bdfce5edafd 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -14,12 +14,15 @@
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/slab.h>
+#include <linux/sysfs.h>
#include <linux/pci.h>
#include "internals.h"
+static inline int msi_sysfs_create_group(struct device *dev);
+
/**
- * alloc_msi_entry - Allocate an initialized msi_desc
+ * msi_alloc_desc - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
* @nvec: The number of vectors used in this entry
* @affinity: Optional pointer to an affinity mask array size of @nvec
@@ -29,34 +32,134 @@
*
* Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
-struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
- const struct irq_affinity_desc *affinity)
+static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
+ const struct irq_affinity_desc *affinity)
{
- struct msi_desc *desc;
+ struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
- desc = kzalloc(sizeof(*desc), GFP_KERNEL);
if (!desc)
return NULL;
- INIT_LIST_HEAD(&desc->list);
desc->dev = dev;
desc->nvec_used = nvec;
if (affinity) {
- desc->affinity = kmemdup(affinity,
- nvec * sizeof(*desc->affinity), GFP_KERNEL);
+ desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL);
if (!desc->affinity) {
kfree(desc);
return NULL;
}
}
-
return desc;
}
-void free_msi_entry(struct msi_desc *entry)
+static void msi_free_desc(struct msi_desc *desc)
{
- kfree(entry->affinity);
- kfree(entry);
+ kfree(desc->affinity);
+ kfree(desc);
+}
+
+static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index)
+{
+ int ret;
+
+ desc->msi_index = index;
+ ret = xa_insert(&md->__store, index, desc, GFP_KERNEL);
+ if (ret)
+ msi_free_desc(desc);
+ return ret;
+}
+
+/**
+ * msi_add_msi_desc - Allocate and initialize a MSI descriptor
+ * @dev: Pointer to the device for which the descriptor is allocated
+ * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
+{
+ struct msi_desc *desc;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity);
+ if (!desc)
+ return -ENOMEM;
+
+ /* Copy type specific data to the new descriptor. */
+ desc->pci = init_desc->pci;
+ return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index);
+}
+
+/**
+ * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors
+ * @dev: Pointer to the device for which the descriptors are allocated
+ * @index: Index for the first MSI descriptor
+ * @ndesc: Number of descriptors to allocate
+ *
+ * Return: 0 on success or an appropriate failure code.
+ */
+static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc)
+{
+ unsigned int idx, last = index + ndesc - 1;
+ struct msi_desc *desc;
+ int ret;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ for (idx = index; idx <= last; idx++) {
+ desc = msi_alloc_desc(dev, 1, NULL);
+ if (!desc)
+ goto fail_mem;
+ ret = msi_insert_desc(dev->msi.data, desc, idx);
+ if (ret)
+ goto fail;
+ }
+ return 0;
+
+fail_mem:
+ ret = -ENOMEM;
+fail:
+ msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last);
+ return ret;
+}
+
+static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
+{
+ switch (filter) {
+ case MSI_DESC_ALL:
+ return true;
+ case MSI_DESC_NOTASSOCIATED:
+ return !desc->irq;
+ case MSI_DESC_ASSOCIATED:
+ return !!desc->irq;
+ }
+ WARN_ON_ONCE(1);
+ return false;
+}
+
+/**
+ * msi_free_msi_descs_range - Free MSI descriptors of a device
+ * @dev: Device to free the descriptors
+ * @filter: Descriptor state filter
+ * @first_index: Index to start freeing from
+ * @last_index: Last index to be freed
+ */
+void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
+ unsigned int first_index, unsigned int last_index)
+{
+ struct xarray *xa = &dev->msi.data->__store;
+ struct msi_desc *desc;
+ unsigned long idx;
+
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ xa_for_each_range(xa, idx, desc, first_index, last_index) {
+ if (msi_desc_match(desc, filter)) {
+ xa_erase(xa, idx);
+ msi_free_desc(desc);
+ }
+ }
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@@ -72,139 +175,290 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
-static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
- char *buf)
+static void msi_device_data_release(struct device *dev, void *res)
{
- struct msi_desc *entry;
- bool is_msix = false;
- unsigned long irq;
- int retval;
+ struct msi_device_data *md = res;
- retval = kstrtoul(attr->attr.name, 10, &irq);
- if (retval)
- return retval;
+ WARN_ON_ONCE(!xa_empty(&md->__store));
+ xa_destroy(&md->__store);
+ dev->msi.data = NULL;
+}
- entry = irq_get_msi_desc(irq);
- if (!entry)
- return -ENODEV;
+/**
+ * msi_setup_device_data - Setup MSI device data
+ * @dev: Device for which MSI device data should be set up
+ *
+ * Return: 0 on success, appropriate error code otherwise
+ *
+ * This can be called more than once for @dev. If the MSI device data is
+ * already allocated the call succeeds. The allocated memory is
+ * automatically released when the device is destroyed.
+ */
+int msi_setup_device_data(struct device *dev)
+{
+ struct msi_device_data *md;
+ int ret;
- if (dev_is_pci(dev))
- is_msix = entry->msi_attrib.is_msix;
+ if (dev->msi.data)
+ return 0;
- return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+ md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL);
+ if (!md)
+ return -ENOMEM;
+
+ ret = msi_sysfs_create_group(dev);
+ if (ret) {
+ devres_free(md);
+ return ret;
+ }
+
+ xa_init(&md->__store);
+ mutex_init(&md->mutex);
+ dev->msi.data = md;
+ devres_add(dev, md);
+ return 0;
+}
+
+/**
+ * msi_lock_descs - Lock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_lock_descs(struct device *dev)
+{
+ mutex_lock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_lock_descs);
+
+/**
+ * msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * @dev: Device to operate on
+ */
+void msi_unlock_descs(struct device *dev)
+{
+ /* Invalidate the index wich was cached by the iterator */
+ dev->msi.data->__iter_idx = MSI_MAX_INDEX;
+ mutex_unlock(&dev->msi.data->mutex);
+}
+EXPORT_SYMBOL_GPL(msi_unlock_descs);
+
+static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter)
+{
+ struct msi_desc *desc;
+
+ xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) {
+ if (msi_desc_match(desc, filter))
+ return desc;
+ }
+ md->__iter_idx = MSI_MAX_INDEX;
+ return NULL;
}
/**
- * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices
- * @dev: The device(PCI, platform etc) who will get sysfs entries
+ * msi_first_desc - Get the first MSI descriptor of a device
+ * @dev: Device to operate on
+ * @filter: Descriptor state filter
+ *
+ * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
+ * must be invoked before the call.
*
- * Return attribute_group ** so that specific bus MSI can save it to
- * somewhere during initilizing msi irqs. If devices has no MSI irq,
- * return NULL; if it fails to populate sysfs, return ERR_PTR
+ * Return: Pointer to the first MSI descriptor matching the search
+ * criteria, NULL if none found.
*/
-const struct attribute_group **msi_populate_sysfs(struct device *dev)
-{
- const struct attribute_group **msi_irq_groups;
- struct attribute **msi_attrs, *msi_attr;
- struct device_attribute *msi_dev_attr;
- struct attribute_group *msi_irq_group;
- struct msi_desc *entry;
- int ret = -ENOMEM;
- int num_msi = 0;
- int count = 0;
- int i;
+struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
- /* Determine how many msi entries we have */
- for_each_msi_entry(entry, dev)
- num_msi += entry->nvec_used;
- if (!num_msi)
+ if (WARN_ON_ONCE(!md))
return NULL;
- /* Dynamically create the MSI attributes for the device */
- msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL);
- if (!msi_attrs)
- return ERR_PTR(-ENOMEM);
-
- for_each_msi_entry(entry, dev) {
- for (i = 0; i < entry->nvec_used; i++) {
- msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
- if (!msi_dev_attr)
- goto error_attrs;
- msi_attrs[count] = &msi_dev_attr->attr;
-
- sysfs_attr_init(&msi_dev_attr->attr);
- msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
- entry->irq + i);
- if (!msi_dev_attr->attr.name)
- goto error_attrs;
- msi_dev_attr->attr.mode = 0444;
- msi_dev_attr->show = msi_mode_show;
- ++count;
+ lockdep_assert_held(&md->mutex);
+
+ md->__iter_idx = 0;
+ return msi_find_desc(md, filter);
+}
+EXPORT_SYMBOL_GPL(msi_first_desc);
+
+/**
+ * msi_next_desc - Get the next MSI descriptor of a device
+ * @dev: Device to operate on
+ *
+ * The first invocation of msi_next_desc() has to be preceeded by a
+ * successful invocation of __msi_first_desc(). Consecutive invocations are
+ * only valid if the previous one was successful. All these operations have
+ * to be done within the same MSI mutex held region.
+ *
+ * Return: Pointer to the next MSI descriptor matching the search
+ * criteria, NULL if none found.
+ */
+struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
+{
+ struct msi_device_data *md = dev->msi.data;
+
+ if (WARN_ON_ONCE(!md))
+ return NULL;
+
+ lockdep_assert_held(&md->mutex);
+
+ if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX)
+ return NULL;
+
+ md->__iter_idx++;
+ return msi_find_desc(md, filter);
+}
+EXPORT_SYMBOL_GPL(msi_next_desc);
+
+/**
+ * msi_get_virq - Return Linux interrupt number of a MSI interrupt
+ * @dev: Device to operate on
+ * @index: MSI interrupt index to look for (0-based)
+ *
+ * Return: The Linux interrupt number on success (> 0), 0 if not found
+ */
+unsigned int msi_get_virq(struct device *dev, unsigned int index)
+{
+ struct msi_desc *desc;
+ unsigned int ret = 0;
+ bool pcimsi;
+
+ if (!dev->msi.data)
+ return 0;
+
+ pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
+
+ msi_lock_descs(dev);
+ desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
+ if (desc && desc->irq) {
+ /*
+ * PCI-MSI has only one descriptor for multiple interrupts.
+ * PCI-MSIX and platform MSI use a descriptor per
+ * interrupt.
+ */
+ if (pcimsi) {
+ if (index < desc->nvec_used)
+ ret = desc->irq + index;
+ } else {
+ ret = desc->irq;
}
}
+ msi_unlock_descs(dev);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(msi_get_virq);
- msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
- if (!msi_irq_group)
- goto error_attrs;
- msi_irq_group->name = "msi_irqs";
- msi_irq_group->attrs = msi_attrs;
+#ifdef CONFIG_SYSFS
+static struct attribute *msi_dev_attrs[] = {
+ NULL
+};
- msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL);
- if (!msi_irq_groups)
- goto error_irq_group;
- msi_irq_groups[0] = msi_irq_group;
+static const struct attribute_group msi_irqs_group = {
+ .name = "msi_irqs",
+ .attrs = msi_dev_attrs,
+};
- ret = sysfs_create_groups(&dev->kobj, msi_irq_groups);
- if (ret)
- goto error_irq_groups;
-
- return msi_irq_groups;
-
-error_irq_groups:
- kfree(msi_irq_groups);
-error_irq_group:
- kfree(msi_irq_group);
-error_attrs:
- count = 0;
- msi_attr = msi_attrs[count];
- while (msi_attr) {
- msi_dev_attr = container_of(msi_attr, struct device_attribute, attr);
- kfree(msi_attr->name);
- kfree(msi_dev_attr);
- ++count;
- msi_attr = msi_attrs[count];
+static inline int msi_sysfs_create_group(struct device *dev)
+{
+ return devm_device_add_group(dev, &msi_irqs_group);
+}
+
+static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ /* MSI vs. MSIX is per device not per interrupt */
+ bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false;
+
+ return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
+}
+
+static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs = desc->sysfs_attrs;
+ int i;
+
+ if (!attrs)
+ return;
+
+ desc->sysfs_attrs = NULL;
+ for (i = 0; i < desc->nvec_used; i++) {
+ if (attrs[i].show)
+ sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ kfree(attrs[i].attr.name);
}
- kfree(msi_attrs);
- return ERR_PTR(ret);
+ kfree(attrs);
}
+static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc)
+{
+ struct device_attribute *attrs;
+ int ret, i;
+
+ attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return -ENOMEM;
+
+ desc->sysfs_attrs = attrs;
+ for (i = 0; i < desc->nvec_used; i++) {
+ sysfs_attr_init(&attrs[i].attr);
+ attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i);
+ if (!attrs[i].attr.name) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+
+ attrs[i].attr.mode = 0444;
+ attrs[i].show = msi_mode_show;
+
+ ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
+ if (ret) {
+ attrs[i].show = NULL;
+ goto fail;
+ }
+ }
+ return 0;
+
+fail:
+ msi_sysfs_remove_desc(dev, desc);
+ return ret;
+}
+
+#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
/**
- * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices
- * @dev: The device(PCI, platform etc) who will remove sysfs entries
- * @msi_irq_groups: attribute_group for device msi_irqs entries
+ * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) which will get sysfs entries
*/
-void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups)
-{
- struct device_attribute *dev_attr;
- struct attribute **msi_attrs;
- int count = 0;
-
- if (msi_irq_groups) {
- sysfs_remove_groups(&dev->kobj, msi_irq_groups);
- msi_attrs = msi_irq_groups[0]->attrs;
- while (msi_attrs[count]) {
- dev_attr = container_of(msi_attrs[count],
- struct device_attribute, attr);
- kfree(dev_attr->attr.name);
- kfree(dev_attr);
- ++count;
- }
- kfree(msi_attrs);
- kfree(msi_irq_groups[0]);
- kfree(msi_irq_groups);
+int msi_device_populate_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+ int ret;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ if (desc->sysfs_attrs)
+ continue;
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
}
+ return 0;
}
+/**
+ * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device
+ * @dev: The device (PCI, platform etc) for which to remove
+ * sysfs entries
+ */
+void msi_device_destroy_sysfs(struct device *dev)
+{
+ struct msi_desc *desc;
+
+ msi_for_each_desc(desc, dev, MSI_DESC_ALL)
+ msi_sysfs_remove_desc(dev, desc);
+}
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */
+#else /* CONFIG_SYSFS */
+static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
+static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
+static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
+#endif /* !CONFIG_SYSFS */
+
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
static inline void irq_chip_write_msi_msg(struct irq_data *data,
struct msi_msg *msg)
@@ -456,43 +710,38 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
}
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
- int virq, int nvec, msi_alloc_info_t *arg)
+ int virq_base, int nvec, msi_alloc_info_t *arg)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
struct msi_desc *desc;
- int ret = 0;
+ int ret, virq;
- for_each_msi_entry(desc, dev) {
- /* Don't even try the multi-MSI brain damage. */
- if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
- ret = -EINVAL;
- break;
- }
+ msi_lock_descs(dev);
+ ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
+ if (ret)
+ goto unlock;
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
+ for (virq = virq_base; virq < virq_base + nvec; virq++) {
+ desc = xa_load(&dev->msi.data->__store, virq);
+ desc->irq = virq;
ops->set_desc(arg, desc);
- /* Assumes the domain mutex is held! */
- ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
- arg);
+ ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (ret)
- break;
+ goto fail;
- irq_set_msi_desc_off(desc->irq, 0, desc);
- }
-
- if (ret) {
- /* Mop up the damage */
- for_each_msi_entry(desc, dev) {
- if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
- continue;
-
- irq_domain_free_irqs_common(domain, desc->irq, 1);
- }
+ irq_set_msi_desc(virq, desc);
}
+ msi_unlock_descs(dev);
+ return 0;
+fail:
+ for (--virq; virq >= virq_base; virq--)
+ irq_domain_free_irqs_common(domain, virq, 1);
+ msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
+unlock:
+ msi_unlock_descs(dev);
return ret;
}
@@ -531,8 +780,59 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
* Checking the first MSI descriptor is sufficient. MSIX supports
* masking and MSI does so when the can_mask attribute is set.
*/
- desc = first_msi_entry(dev);
- return desc->msi_attrib.is_msix || desc->msi_attrib.can_mask;
+ desc = msi_first_desc(dev, MSI_DESC_ALL);
+ return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask;
+}
+
+static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
+ int allocated)
+{
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_VMD_MSI:
+ if (IS_ENABLED(CONFIG_PCI_MSI))
+ break;
+ fallthrough;
+ default:
+ return -ENOSPC;
+ }
+
+ /* Let a failed PCI multi MSI allocation retry */
+ if (desc->nvec_used > 1)
+ return 1;
+
+ /* If there was a successful allocation let the caller know */
+ return allocated ? allocated : -ENOSPC;
+}
+
+#define VIRQ_CAN_RESERVE 0x01
+#define VIRQ_ACTIVATE 0x02
+#define VIRQ_NOMASK_QUIRK 0x04
+
+static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
+{
+ struct irq_data *irqd = irq_domain_get_irq_data(domain, virq);
+ int ret;
+
+ if (!(vflags & VIRQ_CAN_RESERVE)) {
+ irqd_clr_can_reserve(irqd);
+ if (vflags & VIRQ_NOMASK_QUIRK)
+ irqd_set_msi_nomask_quirk(irqd);
+ }
+
+ if (!(vflags & VIRQ_ACTIVATE))
+ return 0;
+
+ ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE);
+ if (ret)
+ return ret;
+ /*
+ * If the interrupt uses reservation mode, clear the activated bit
+ * so request_irq() will assign the final vector.
+ */
+ if (vflags & VIRQ_CAN_RESERVE)
+ irqd_clr_activated(irqd);
+ return 0;
}
int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
@@ -540,83 +840,103 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- struct irq_data *irq_data;
- struct msi_desc *desc;
msi_alloc_info_t arg = { };
+ unsigned int vflags = 0;
+ struct msi_desc *desc;
+ int allocated = 0;
int i, ret, virq;
- bool can_reserve;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
return ret;
- for_each_msi_entry(desc, dev) {
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY)
+ vflags |= VIRQ_ACTIVATE;
+
+ /*
+ * Interrupt can use a reserved vector and will not occupy
+ * a real device vector until the interrupt is requested.
+ */
+ if (msi_check_reservation_mode(domain, info, dev)) {
+ vflags |= VIRQ_CAN_RESERVE;
+ /*
+ * MSI affinity setting requires a special quirk (X86) when
+ * reservation mode is active.
+ */
+ if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+ vflags |= VIRQ_NOMASK_QUIRK;
+ }
+
+ msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false,
desc->affinity);
- if (virq < 0) {
- ret = -ENOSPC;
- if (ops->handle_error)
- ret = ops->handle_error(domain, desc, ret);
- if (ops->msi_finish)
- ops->msi_finish(&arg, ret);
- return ret;
- }
+ if (virq < 0)
+ return msi_handle_pci_fail(domain, desc, allocated);
for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
irq_debugfs_copy_devname(virq + i, dev);
+ ret = msi_init_virq(domain, virq + i, vflags);
+ if (ret)
+ return ret;
+ }
+ if (info->flags & MSI_FLAG_DEV_SYSFS) {
+ ret = msi_sysfs_populate_desc(dev, desc);
+ if (ret)
+ return ret;
}
+ allocated++;
}
+ return 0;
+}
- if (ops->msi_finish)
- ops->msi_finish(&arg, 0);
+static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info,
+ struct device *dev,
+ unsigned int num_descs)
+{
+ if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
+ return 0;
- can_reserve = msi_check_reservation_mode(domain, info, dev);
+ return msi_add_simple_msi_descs(dev, 0, num_descs);
+}
- /*
- * This flag is set by the PCI layer as we need to activate
- * the MSI entries before the PCI layer enables MSI in the
- * card. Otherwise the card latches a random msi message.
- */
- if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
- goto skip_activate;
-
- for_each_msi_vector(desc, i, dev) {
- if (desc->irq == i) {
- virq = desc->irq;
- dev_dbg(dev, "irq [%d-%d] for MSI\n",
- virq, virq + desc->nvec_used - 1);
- }
+/**
+ * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain
+ * @domain: The domain to allocate from
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are allocated
+ * @nvec: The number of interrupts to allocate
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation/free.
+ *
+ * Return: %0 on success or an error code.
+ */
+int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
+ int nvec)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+ int ret;
- irq_data = irq_domain_get_irq_data(domain, i);
- if (!can_reserve) {
- irqd_clr_can_reserve(irq_data);
- if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
- irqd_set_msi_nomask_quirk(irq_data);
- }
- ret = irq_domain_activate_irq(irq_data, can_reserve);
- if (ret)
- goto cleanup;
- }
+ lockdep_assert_held(&dev->msi.data->mutex);
-skip_activate:
- /*
- * If these interrupts use reservation mode, clear the activated bit
- * so request_irq() will assign the final vector.
- */
- if (can_reserve) {
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- irqd_clr_activated(irq_data);
- }
- }
- return 0;
+ ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
+ if (ret)
+ return ret;
-cleanup:
- msi_domain_free_irqs(domain, dev);
+ ret = ops->domain_alloc_irqs(domain, dev, nvec);
+ if (ret)
+ msi_domain_free_irqs_descs_locked(domain, dev);
return ret;
}
@@ -629,52 +949,78 @@ cleanup:
*
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec)
{
- struct msi_domain_info *info = domain->host_data;
- struct msi_domain_ops *ops = info->ops;
+ int ret;
- return ops->domain_alloc_irqs(domain, dev, nvec);
+ msi_lock_descs(dev);
+ ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
+ msi_unlock_descs(dev);
+ return ret;
}
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
- struct irq_data *irq_data;
+ struct msi_domain_info *info = domain->host_data;
+ struct irq_data *irqd;
struct msi_desc *desc;
int i;
- for_each_msi_vector(desc, i, dev) {
- irq_data = irq_domain_get_irq_data(domain, i);
- if (irqd_is_activated(irq_data))
- irq_domain_deactivate_irq(irq_data);
- }
-
- for_each_msi_entry(desc, dev) {
- /*
- * We might have failed to allocate an MSI early
- * enough that there is no IRQ associated to this
- * entry. If that's the case, don't do anything.
- */
- if (desc->irq) {
- irq_domain_free_irqs(desc->irq, desc->nvec_used);
- desc->irq = 0;
+ /* Only handle MSI entries which have an interrupt associated */
+ msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+ /* Make sure all interrupts are deactivated */
+ for (i = 0; i < desc->nvec_used; i++) {
+ irqd = irq_domain_get_irq_data(domain, desc->irq + i);
+ if (irqd && irqd_is_activated(irqd))
+ irq_domain_deactivate_irq(irqd);
}
+
+ irq_domain_free_irqs(desc->irq, desc->nvec_used);
+ if (info->flags & MSI_FLAG_DEV_SYSFS)
+ msi_sysfs_remove_desc(dev, desc);
+ desc->irq = 0;
}
}
+static void msi_domain_free_msi_descs(struct msi_domain_info *info,
+ struct device *dev)
+{
+ if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
+ msi_free_msi_descs(dev);
+}
+
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev
* @domain: The domain to managing the interrupts
* @dev: Pointer to device struct of the device for which the interrupts
* are free
+ *
+ * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
+ * pair. Use this for MSI irqdomains which implement their own vector
+ * allocation.
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
- return ops->domain_free_irqs(domain, dev);
+ lockdep_assert_held(&dev->msi.data->mutex);
+
+ ops->domain_free_irqs(domain, dev);
+ msi_domain_free_msi_descs(info, dev);
+}
+
+/**
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * @domain: The domain to managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are free
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ msi_lock_descs(dev);
+ msi_domain_free_irqs_descs_locked(domain, dev);
+ msi_unlock_descs(dev);
}
/**
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ee595ec09778..623b8136e9af 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -137,7 +137,7 @@ static inline int irq_select_affinity_usr(unsigned int irq)
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
+ unsigned int irq = (int)(long)pde_data(file_inode(file));
cpumask_var_t new_value;
int err;
@@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_proc_show, pde_data(inode));
}
static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
+ return single_open(file, irq_affinity_list_proc_show, pde_data(inode));
}
static const struct proc_ops irq_affinity_proc_ops = {
@@ -265,7 +265,7 @@ out:
static int default_affinity_open(struct inode *inode, struct file *file)
{
- return single_open(file, default_affinity_show, PDE_DATA(inode));
+ return single_open(file, default_affinity_show, pde_data(inode));
}
static const struct proc_ops default_affinity_proc_ops = {
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f7df715ec28e..7afa40fe5cc4 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -137,7 +137,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (!irq_work_claim(work))
return false;
- kasan_record_aux_stack(work);
+ kasan_record_aux_stack_noalloc(work);
preempt_disable();
if (cpu != smp_processor_id()) {
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3011bc33a5ba..79f2eb617a62 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -212,6 +212,10 @@ unsigned long kallsyms_lookup_name(const char *name)
unsigned long i;
unsigned int off;
+ /* Skip the search for empty string. */
+ if (!*name)
+ return 0;
+
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
@@ -243,6 +247,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
if (ret != 0)
return ret;
+ cond_resched();
}
return 0;
}
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 36ca640c4f8e..475524bd900a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t)
static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
{
int res = 0;
- void *area;
struct kcov *kcov = vma->vm_file->private_data;
unsigned long size, off;
struct page *page;
unsigned long flags;
- area = vmalloc_user(vma->vm_end - vma->vm_start);
- if (!area)
- return -ENOMEM;
-
spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+ if (kcov->area == NULL || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
}
- if (!kcov->area) {
- kcov->area = area;
- vma->vm_flags |= VM_DONTEXPAND;
- spin_unlock_irqrestore(&kcov->lock, flags);
- for (off = 0; off < size; off += PAGE_SIZE) {
- page = vmalloc_to_page(kcov->area + off);
- if (vm_insert_page(vma, vma->vm_start + off, page))
- WARN_ONCE(1, "vm_insert_page() failed");
- }
- return 0;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vma->vm_flags |= VM_DONTEXPAND;
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ if (vm_insert_page(vma, vma->vm_start + off, page))
+ WARN_ONCE(1, "vm_insert_page() failed");
}
+ return 0;
exit:
spin_unlock_irqrestore(&kcov->lock, flags);
- vfree(area);
return res;
}
@@ -564,31 +555,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
{
struct task_struct *t;
- unsigned long size, unused;
+ unsigned long flags, unused;
int mode, i;
struct kcov_remote_arg *remote_arg;
struct kcov_remote *remote;
- unsigned long flags;
switch (cmd) {
- case KCOV_INIT_TRACE:
- /*
- * Enable kcov in trace mode and setup buffer size.
- * Must happen before anything else.
- */
- if (kcov->mode != KCOV_MODE_DISABLED)
- return -EBUSY;
- /*
- * Size must be at least 2 to hold current position and one PC.
- * Later we allocate size * sizeof(unsigned long) memory,
- * that must not overflow.
- */
- size = arg;
- if (size < 2 || size > INT_MAX / sizeof(unsigned long))
- return -EINVAL;
- kcov->size = size;
- kcov->mode = KCOV_MODE_INIT;
- return 0;
case KCOV_ENABLE:
/*
* Enable coverage for the current task.
@@ -692,9 +664,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
struct kcov_remote_arg *remote_arg = NULL;
unsigned int remote_num_handles;
unsigned long remote_arg_size;
- unsigned long flags;
+ unsigned long size, flags;
+ void *area;
- if (cmd == KCOV_REMOTE_ENABLE) {
+ kcov = filep->private_data;
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ *
+ * First check the size argument - it must be at least 2
+ * to hold the current position and one PC.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ area = vmalloc_user(size * sizeof(unsigned long));
+ if (area == NULL)
+ return -ENOMEM;
+ spin_lock_irqsave(&kcov->lock, flags);
+ if (kcov->mode != KCOV_MODE_DISABLED) {
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vfree(area);
+ return -EBUSY;
+ }
+ kcov->area = area;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return 0;
+ case KCOV_REMOTE_ENABLE:
if (get_user(remote_num_handles, (unsigned __user *)(arg +
offsetof(struct kcov_remote_arg, num_handles))))
return -EFAULT;
@@ -710,16 +710,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
return -EINVAL;
}
arg = (unsigned long)remote_arg;
+ fallthrough;
+ default:
+ /*
+ * All other commands can be normally executed under a spin lock, so we
+ * obtain and release it here in order to simplify kcov_ioctl_locked().
+ */
+ spin_lock_irqsave(&kcov->lock, flags);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ kfree(remote_arg);
+ return res;
}
-
- kcov = filep->private_data;
- spin_lock_irqsave(&kcov->lock, flags);
- res = kcov_ioctl_locked(kcov, cmd, arg);
- spin_unlock_irqrestore(&kcov->lock, flags);
-
- kfree(remote_arg);
-
- return res;
}
static const struct file_operations kcov_fops = {
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index c2bb07f5bcc7..4f35d1bced6a 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -8,9 +8,12 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+ $(call cc-option,-mno-outline-atomics) \
-fno-stack-protector -DDISABLE_BRANCH_PROFILING
obj-y := core.o debugfs.o report.o
+
+KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 4b84c8e7884b..fe12dfe254ec 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -40,15 +40,17 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+static bool kcsan_weak_memory = true;
+module_param_named(weak_memory, kcsan_weak_memory, bool, 0644);
+#else
+#define kcsan_weak_memory false
+#endif
+
bool kcsan_enabled;
/* Per-CPU kcsan_ctx for interrupts */
static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
- .disable_count = 0,
- .atomic_next = 0,
- .atomic_nest_count = 0,
- .in_flat_atomic = false,
- .access_mask = 0,
.scoped_accesses = {LIST_POISON1, NULL},
};
@@ -209,15 +211,17 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip);
static noinline void kcsan_check_scoped_accesses(void)
{
struct kcsan_ctx *ctx = get_ctx();
- struct list_head *prev_save = ctx->scoped_accesses.prev;
struct kcsan_scoped_access *scoped_access;
- ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
+ if (ctx->disable_scoped)
+ return;
+
+ ctx->disable_scoped++;
list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) {
check_access(scoped_access->ptr, scoped_access->size,
scoped_access->type, scoped_access->ip);
}
- ctx->scoped_accesses.prev = prev_save;
+ ctx->disable_scoped--;
}
/* Rules for generic atomic accesses. Called from fast-path. */
@@ -325,6 +329,21 @@ static void delay_access(int type)
udelay(delay);
}
+/*
+ * Reads the instrumented memory for value change detection; value change
+ * detection is currently done for accesses up to a size of 8 bytes.
+ */
+static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
+{
+ switch (size) {
+ case 1: return READ_ONCE(*(const u8 *)ptr);
+ case 2: return READ_ONCE(*(const u16 *)ptr);
+ case 4: return READ_ONCE(*(const u32 *)ptr);
+ case 8: return READ_ONCE(*(const u64 *)ptr);
+ default: return 0; /* Ignore; we do not diff the values. */
+ }
+}
+
void kcsan_save_irqtrace(struct task_struct *task)
{
#ifdef CONFIG_TRACE_IRQFLAGS
@@ -339,6 +358,76 @@ void kcsan_restore_irqtrace(struct task_struct *task)
#endif
}
+static __always_inline int get_kcsan_stack_depth(void)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return current->kcsan_stack_depth;
+#else
+ BUILD_BUG();
+ return 0;
+#endif
+}
+
+static __always_inline void add_kcsan_stack_depth(int val)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ current->kcsan_stack_depth += val;
+#else
+ BUILD_BUG();
+#endif
+}
+
+static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx)
+{
+#ifdef CONFIG_KCSAN_WEAK_MEMORY
+ return ctx->disable_scoped ? NULL : &ctx->reorder_access;
+#else
+ return NULL;
+#endif
+}
+
+static __always_inline bool
+find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access)
+ return false;
+
+ /*
+ * Note: If accesses are repeated while reorder_access is identical,
+ * never matches the new access, because !(type & KCSAN_ACCESS_SCOPED).
+ */
+ return reorder_access->ptr == ptr && reorder_access->size == size &&
+ reorder_access->type == type && reorder_access->ip == ip;
+}
+
+static inline void
+set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
+ int type, unsigned long ip)
+{
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (!reorder_access || !kcsan_weak_memory)
+ return;
+
+ /*
+ * To avoid nested interrupts or scheduler (which share kcsan_ctx)
+ * reading an inconsistent reorder_access, ensure that the below has
+ * exclusive access to reorder_access by disallowing concurrent use.
+ */
+ ctx->disable_scoped++;
+ barrier();
+ reorder_access->ptr = ptr;
+ reorder_access->size = size;
+ reorder_access->type = type | KCSAN_ACCESS_SCOPED;
+ reorder_access->ip = ip;
+ reorder_access->stack_depth = get_kcsan_stack_depth();
+ barrier();
+ ctx->disable_scoped--;
+}
+
/*
* Pull everything together: check_access() below contains the performance
* critical operations; the fast-path (including check_access) functions should
@@ -377,8 +466,10 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
* The access_mask check relies on value-change comparison. To avoid
* reporting a race where e.g. the writer set up the watchpoint, but the
* reader has access_mask!=0, we have to ignore the found watchpoint.
+ *
+ * reorder_access is never created from an access with access_mask set.
*/
- if (ctx->access_mask)
+ if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip))
return;
/*
@@ -428,11 +519,13 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
atomic_long_t *watchpoint;
u64 old, new, diff;
- unsigned long access_mask;
enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
+ bool interrupt_watcher = kcsan_interrupt_watcher;
unsigned long ua_flags = user_access_save();
struct kcsan_ctx *ctx = get_ctx();
+ unsigned long access_mask = ctx->access_mask;
unsigned long irq_flags = 0;
+ bool is_reorder_access;
/*
* Always reset kcsan_skip counter in slow-path to avoid underflow; see
@@ -456,12 +549,32 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
}
/*
+ * The local CPU cannot observe reordering of its own accesses, and
+ * therefore we need to take care of 2 cases to avoid false positives:
+ *
+ * 1. Races of the reordered access with interrupts. To avoid, if
+ * the current access is reorder_access, disable interrupts.
+ * 2. Avoid races of scoped accesses from nested interrupts (below).
+ */
+ is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip);
+ if (is_reorder_access)
+ interrupt_watcher = false;
+ /*
+ * Avoid races of scoped accesses from nested interrupts (or scheduler).
+ * Assume setting up a watchpoint for a non-scoped (normal) access that
+ * also conflicts with a current scoped access. In a nested interrupt,
+ * which shares the context, it would check a conflicting scoped access.
+ * To avoid, disable scoped access checking.
+ */
+ ctx->disable_scoped++;
+
+ /*
* Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
* runtime is entered for every memory access, and potentially useful
* information is lost if dirtied by KCSAN.
*/
kcsan_save_irqtrace(current);
- if (!kcsan_interrupt_watcher)
+ if (!interrupt_watcher)
local_irq_save(irq_flags);
watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
@@ -482,23 +595,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
* Read the current value, to later check and infer a race if the data
* was modified via a non-instrumented access, e.g. from a device.
*/
- old = 0;
- switch (size) {
- case 1:
- old = READ_ONCE(*(const u8 *)ptr);
- break;
- case 2:
- old = READ_ONCE(*(const u16 *)ptr);
- break;
- case 4:
- old = READ_ONCE(*(const u32 *)ptr);
- break;
- case 8:
- old = READ_ONCE(*(const u64 *)ptr);
- break;
- default:
- break; /* ignore; we do not diff the values */
- }
+ old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size);
/*
* Delay this thread, to increase probability of observing a racy
@@ -510,23 +607,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
* Re-read value, and check if it is as expected; if not, we infer a
* racy access.
*/
- access_mask = ctx->access_mask;
- new = 0;
- switch (size) {
- case 1:
- new = READ_ONCE(*(const u8 *)ptr);
- break;
- case 2:
- new = READ_ONCE(*(const u16 *)ptr);
- break;
- case 4:
- new = READ_ONCE(*(const u32 *)ptr);
- break;
- case 8:
- new = READ_ONCE(*(const u64 *)ptr);
- break;
- default:
- break; /* ignore; we do not diff the values */
+ if (!is_reorder_access) {
+ new = read_instrumented_memory(ptr, size);
+ } else {
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * because the memory location may no longer be accessible and
+ * could result in a fault.
+ */
+ new = 0;
+ access_mask = 0;
}
diff = old ^ new;
@@ -596,10 +686,20 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned
*/
remove_watchpoint(watchpoint);
atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
+
out_unlock:
- if (!kcsan_interrupt_watcher)
+ if (!interrupt_watcher)
local_irq_restore(irq_flags);
kcsan_restore_irqtrace(current);
+ ctx->disable_scoped--;
+
+ /*
+ * Reordered accesses cannot be used for value change detection,
+ * therefore never consider for reordering if access_mask is set.
+ * ASSERT_EXCLUSIVE are not real accesses, ignore them as well.
+ */
+ if (!access_mask && !is_assert)
+ set_reorder_access(ctx, ptr, size, type, ip);
out:
user_access_restore(ua_flags);
}
@@ -607,7 +707,6 @@ out:
static __always_inline void
check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
- const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
atomic_long_t *watchpoint;
long encoded_watchpoint;
@@ -618,12 +717,14 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
if (unlikely(size == 0))
return;
+again:
/*
* Avoid user_access_save in fast-path: find_watchpoint is safe without
* user_access_save, as the address that ptr points to is only used to
* check if a watchpoint exists; ptr is never dereferenced.
*/
- watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write,
+ watchpoint = find_watchpoint((unsigned long)ptr, size,
+ !(type & KCSAN_ACCESS_WRITE),
&encoded_watchpoint);
/*
* It is safe to check kcsan_is_enabled() after find_watchpoint in the
@@ -637,9 +738,42 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
else {
struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
- if (unlikely(should_watch(ctx, ptr, size, type)))
+ if (unlikely(should_watch(ctx, ptr, size, type))) {
kcsan_setup_watchpoint(ptr, size, type, ip);
- else if (unlikely(ctx->scoped_accesses.prev))
+ return;
+ }
+
+ if (!(type & KCSAN_ACCESS_SCOPED)) {
+ struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
+
+ if (reorder_access) {
+ /*
+ * reorder_access check: simulates reordering of
+ * the access after subsequent operations.
+ */
+ ptr = reorder_access->ptr;
+ type = reorder_access->type;
+ ip = reorder_access->ip;
+ /*
+ * Upon a nested interrupt, this context's
+ * reorder_access can be modified (shared ctx).
+ * We know that upon return, reorder_access is
+ * always invalidated by setting size to 0 via
+ * __tsan_func_exit(). Therefore we must read
+ * and check size after the other fields.
+ */
+ barrier();
+ size = READ_ONCE(reorder_access->size);
+ if (size)
+ goto again;
+ }
+ }
+
+ /*
+ * Always checked last, right before returning from runtime;
+ * if reorder_access is valid, checked after it was checked.
+ */
+ if (unlikely(ctx->scoped_accesses.prev))
kcsan_check_scoped_accesses();
}
}
@@ -814,6 +948,22 @@ void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
}
EXPORT_SYMBOL(__kcsan_check_access);
+#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \
+ void __kcsan_##name(void) \
+ { \
+ struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \
+ if (!sa) \
+ return; \
+ if (order_before_cond) \
+ sa->size = 0; \
+ } \
+ EXPORT_SYMBOL(__kcsan_##name)
+
+DEFINE_MEMORY_BARRIER(mb, true);
+DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(release, true);
+
/*
* KCSAN uses the same instrumentation that is emitted by supported compilers
* for ThreadSanitizer (TSAN).
@@ -926,19 +1076,56 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8);
DEFINE_TSAN_VOLATILE_READ_WRITE(16);
/*
- * The below are not required by KCSAN, but can still be emitted by the
- * compiler.
+ * Function entry and exit are used to determine the validty of reorder_access.
+ * Reordering of the access ends at the end of the function scope where the
+ * access happened. This is done for two reasons:
+ *
+ * 1. Artificially limits the scope where missing barriers are detected.
+ * This minimizes false positives due to uninstrumented functions that
+ * contain the required barriers but were missed.
+ *
+ * 2. Simplifies generating the stack trace of the access.
*/
void __tsan_func_entry(void *call_pc);
-void __tsan_func_entry(void *call_pc)
+noinline void __tsan_func_entry(void *call_pc)
{
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ add_kcsan_stack_depth(1);
}
EXPORT_SYMBOL(__tsan_func_entry);
+
void __tsan_func_exit(void);
-void __tsan_func_exit(void)
+noinline void __tsan_func_exit(void)
{
+ struct kcsan_scoped_access *reorder_access;
+
+ if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+ return;
+
+ reorder_access = get_reorder_access(get_ctx());
+ if (!reorder_access)
+ goto out;
+
+ if (get_kcsan_stack_depth() <= reorder_access->stack_depth) {
+ /*
+ * Access check to catch cases where write without a barrier
+ * (supposed release) was last access in function: because
+ * instrumentation is inserted before the real access, a data
+ * race due to the write giving up a c-s would only be caught if
+ * we do the conflicting access after.
+ */
+ check_access(reorder_access->ptr, reorder_access->size,
+ reorder_access->type, reorder_access->ip);
+ reorder_access->size = 0;
+ reorder_access->stack_depth = INT_MIN;
+ }
+out:
+ add_kcsan_stack_depth(-1);
}
EXPORT_SYMBOL(__tsan_func_exit);
+
void __tsan_init(void);
void __tsan_init(void)
{
@@ -961,10 +1148,19 @@ EXPORT_SYMBOL(__tsan_init);
* functions, whose job is to also execute the operation itself.
*/
+static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
+{
+ if (memorder == __ATOMIC_RELEASE ||
+ memorder == __ATOMIC_SEQ_CST ||
+ memorder == __ATOMIC_ACQ_REL)
+ __kcsan_release();
+}
+
#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
@@ -974,6 +1170,7 @@ EXPORT_SYMBOL(__tsan_init);
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \
@@ -986,6 +1183,7 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
{ \
+ kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
@@ -1018,6 +1216,7 @@ EXPORT_SYMBOL(__tsan_init);
int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
u##bits val, int mo, int fail_mo) \
{ \
+ kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
@@ -1033,6 +1232,7 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
int mo, int fail_mo) \
{ \
+ kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
@@ -1064,10 +1264,47 @@ DEFINE_TSAN_ATOMIC_OPS(64);
void __tsan_atomic_thread_fence(int memorder);
void __tsan_atomic_thread_fence(int memorder)
{
+ kcsan_atomic_builtin_memorder(memorder);
__atomic_thread_fence(memorder);
}
EXPORT_SYMBOL(__tsan_atomic_thread_fence);
+/*
+ * In instrumented files, we emit instrumentation for barriers by mapping the
+ * kernel barriers to an __atomic_signal_fence(), which is interpreted specially
+ * and otherwise has no relation to a real __atomic_signal_fence(). No known
+ * kernel code uses __atomic_signal_fence().
+ *
+ * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which
+ * are turned into calls to __tsan_atomic_signal_fence(), such instrumentation
+ * can be disabled via the __no_kcsan function attribute (vs. an explicit call
+ * which could not). When __no_kcsan is requested, __atomic_signal_fence()
+ * generates no code.
+ *
+ * Note: The result of using __atomic_signal_fence() with KCSAN enabled is
+ * potentially limiting the compiler's ability to reorder operations; however,
+ * if barriers were instrumented with explicit calls (without LTO), the compiler
+ * couldn't optimize much anyway. The result of a hypothetical architecture
+ * using __atomic_signal_fence() in normal code would be KCSAN false negatives.
+ */
void __tsan_atomic_signal_fence(int memorder);
-void __tsan_atomic_signal_fence(int memorder) { }
+noinline void __tsan_atomic_signal_fence(int memorder)
+{
+ switch (memorder) {
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb:
+ __kcsan_mb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb:
+ __kcsan_wmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb:
+ __kcsan_rmb();
+ break;
+ case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release:
+ __kcsan_release();
+ break;
+ default:
+ break;
+ }
+}
EXPORT_SYMBOL(__tsan_atomic_signal_fence);
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 660729238588..a36fca063a73 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -16,9 +16,12 @@
#define pr_fmt(fmt) "kcsan_test: " fmt
#include <kunit/test.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
#include <linux/jiffies.h>
#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
+#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>
@@ -151,7 +154,7 @@ struct expect_report {
/* Check observed report matches information in @r. */
__no_kcsan
-static bool report_matches(const struct expect_report *r)
+static bool __report_matches(const struct expect_report *r)
{
const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
bool ret = false;
@@ -213,9 +216,9 @@ static bool report_matches(const struct expect_report *r)
const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC);
const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED);
const char *const access_type_aux =
- (is_atomic && is_scoped) ? " (marked, scoped)"
+ (is_atomic && is_scoped) ? " (marked, reordered)"
: (is_atomic ? " (marked)"
- : (is_scoped ? " (scoped)" : ""));
+ : (is_scoped ? " (reordered)" : ""));
if (i == 1) {
/* Access 2 */
@@ -253,6 +256,40 @@ out:
return ret;
}
+static __always_inline const struct expect_report *
+__report_set_scoped(struct expect_report *r, int accesses)
+{
+ BUILD_BUG_ON(accesses > 3);
+
+ if (accesses & 1)
+ r->access[0].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[0].type &= ~KCSAN_ACCESS_SCOPED;
+
+ if (accesses & 2)
+ r->access[1].type |= KCSAN_ACCESS_SCOPED;
+ else
+ r->access[1].type &= ~KCSAN_ACCESS_SCOPED;
+
+ return r;
+}
+
+__no_kcsan
+static bool report_matches_any_reordered(struct expect_report *r)
+{
+ return __report_matches(__report_set_scoped(r, 0)) ||
+ __report_matches(__report_set_scoped(r, 1)) ||
+ __report_matches(__report_set_scoped(r, 2)) ||
+ __report_matches(__report_set_scoped(r, 3));
+}
+
+#ifdef