aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore1
-rw-r--r--kernel/Kconfig.freezer1
-rw-r--r--kernel/Kconfig.hz1
-rw-r--r--kernel/Kconfig.locks1
-rw-r--r--kernel/Kconfig.preempt1
-rw-r--r--kernel/Makefile10
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/async.c10
-rw-r--r--kernel/audit.c44
-rw-r--r--kernel/audit.h25
-rw-r--r--kernel/audit_fsnotify.c13
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/audit_watch.c19
-rw-r--r--kernel/auditfilter.c97
-rw-r--r--kernel/auditsc.c161
-rw-r--r--kernel/backtracetest.c6
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/arraymap.c81
-rw-r--r--kernel/bpf/bpf_lru_list.c5
-rw-r--r--kernel/bpf/bpf_lru_list.h5
-rw-r--r--kernel/bpf/btf.c419
-rw-r--r--kernel/bpf/cgroup.c823
-rw-r--r--kernel/bpf/core.c85
-rw-r--r--kernel/bpf/cpumap.c172
-rw-r--r--kernel/bpf/devmap.c142
-rw-r--r--kernel/bpf/disasm.c15
-rw-r--r--kernel/bpf/disasm.h10
-rw-r--r--kernel/bpf/hashtab.c53
-rw-r--r--kernel/bpf/helpers.c141
-rw-r--r--kernel/bpf/inode.c17
-rw-r--r--kernel/bpf/local_storage.c19
-rw-r--r--kernel/bpf/lpm_trie.c25
-rw-r--r--kernel/bpf/map_in_map.c5
-rw-r--r--kernel/bpf/map_in_map.h5
-rw-r--r--kernel/bpf/percpu_freelist.c5
-rw-r--r--kernel/bpf/percpu_freelist.h5
-rw-r--r--kernel/bpf/queue_stack_maps.c19
-rw-r--r--kernel/bpf/reuseport_array.c17
-rw-r--r--kernel/bpf/stackmap.c33
-rw-r--r--kernel/bpf/syscall.c311
-rw-r--r--kernel/bpf/tnum.c1
-rw-r--r--kernel/bpf/verifier.c1897
-rw-r--r--kernel/bpf/xskmap.c22
-rw-r--r--kernel/cgroup/Makefile4
-rw-r--r--kernel/cgroup/cgroup-internal.h8
-rw-r--r--kernel/cgroup/cgroup-v1.c17
-rw-r--r--kernel/cgroup/cgroup.c421
-rw-r--r--kernel/cgroup/cpuset.c19
-rw-r--r--kernel/cgroup/debug.c8
-rw-r--r--kernel/cgroup/freezer.c639
-rw-r--r--kernel/cgroup/legacy_freezer.c481
-rw-r--r--kernel/cgroup/pids.c5
-rw-r--r--kernel/cgroup/rdma.c5
-rw-r--r--kernel/cgroup/rstat.c1
-rw-r--r--kernel/compat.c8
-rw-r--r--kernel/context_tracking.c1
-rw-r--r--kernel/cpu.c19
-rw-r--r--kernel/cpu_pm.c11
-rw-r--r--kernel/crash_core.c4
-rw-r--r--kernel/crash_dump.c1
-rw-r--r--kernel/cred.c28
-rw-r--r--kernel/debug/Makefile1
-rw-r--r--kernel/debug/gdbstub.c9
-rw-r--r--kernel/debug/kdb/Makefile1
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/delayacct.c11
-rw-r--r--kernel/dma/Kconfig5
-rw-r--r--kernel/dma/debug.c14
-rw-r--r--kernel/dma/direct.c2
-rw-r--r--kernel/dma/mapping.c9
-rw-r--r--kernel/dma/swiotlb.c7
-rw-r--r--kernel/events/core.c79
-rw-r--r--kernel/events/internal.h4
-rw-r--r--kernel/events/ring_buffer.c64
-rw-r--r--kernel/events/uprobes.c13
-rw-r--r--kernel/exit.c9
-rw-r--r--kernel/extable.c14
-rw-r--r--kernel/fail_function.c2
-rw-r--r--kernel/fork.c341
-rw-r--r--kernel/freezer.c1
-rw-r--r--kernel/futex.c86
-rw-r--r--kernel/gcov/Kconfig4
-rw-r--r--kernel/gcov/Makefile5
-rw-r--r--kernel/gcov/base.c86
-rw-r--r--kernel/gcov/clang.c581
-rw-r--r--kernel/gcov/gcc_3_4.c12
-rw-r--r--kernel/gcov/gcc_4_7.c12
-rw-r--r--kernel/gcov/gcc_base.c86
-rw-r--r--kernel/gcov/gcov.h5
-rwxr-xr-xkernel/gen_kheaders.sh96
-rw-r--r--kernel/hung_task.c1
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/affinity.c12
-rw-r--r--kernel/irq/autoprobe.c6
-rw-r--r--kernel/irq/chip.c37
-rw-r--r--kernel/irq/cpuhotplug.c2
-rw-r--r--kernel/irq/debugfs.c2
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/internals.h26
-rw-r--r--kernel/irq/irqdesc.c19
-rw-r--r--kernel/irq/irqdomain.c6
-rw-r--r--kernel/irq/manage.c92
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/irq/timings.c453
-rw-r--r--kernel/irq_work.c1
-rw-r--r--kernel/jump_label.c65
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c29
-rw-r--r--kernel/kheaders.c66
-rw-r--r--kernel/kprobes.c15
-rw-r--r--kernel/ksysfs.c4
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/latencytop.c14
-rw-r--r--kernel/livepatch/Kconfig1
-rw-r--r--kernel/livepatch/Makefile1
-rw-r--r--kernel/livepatch/core.c31
-rw-r--r--kernel/livepatch/patch.c14
-rw-r--r--kernel/livepatch/shadow.c14
-rw-r--r--kernel/livepatch/transition.c14
-rw-r--r--kernel/locking/Makefile2
-rw-r--r--kernel/locking/lock_events.h7
-rw-r--r--kernel/locking/lock_events_list.h12
-rw-r--r--kernel/locking/lockdep.c743
-rw-r--r--kernel/locking/lockdep_internals.h36
-rw-r--r--kernel/locking/locktorture.c2
-rw-r--r--kernel/locking/mutex.c1
-rw-r--r--kernel/locking/percpu-rwsem.c3
-rw-r--r--kernel/locking/qrwlock.c11
-rw-r--r--kernel/locking/qspinlock.c11
-rw-r--r--kernel/locking/qspinlock_stat.h10
-rw-r--r--kernel/locking/rtmutex.c1
-rw-r--r--kernel/locking/rwsem-xadd.c729
-rw-r--r--kernel/locking/rwsem.c1453
-rw-r--r--kernel/locking/rwsem.h306
-rw-r--r--kernel/locking/semaphore.c3
-rw-r--r--kernel/locking/test-ww_mutex.c15
-rw-r--r--kernel/memremap.c36
-rw-r--r--kernel/module-internal.h8
-rw-r--r--kernel/module.c45
-rw-r--r--kernel/module_signing.c6
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/nsproxy.c6
-rw-r--r--kernel/padata.c3
-rw-r--r--kernel/panic.c12
-rw-r--r--kernel/params.c14
-rw-r--r--kernel/pid.c73
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/energy_model.c2
-rw-r--r--kernel/power/hibernate.c16
-rw-r--r--kernel/power/main.c4
-rw-r--r--kernel/power/power.h2
-rw-r--r--kernel/power/poweroff.c3
-rw-r--r--kernel/power/qos.c1
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/suspend.c16
-rw-r--r--kernel/power/suspend_test.c3
-rw-r--r--kernel/power/swap.c7
-rw-r--r--kernel/power/user.c4
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h14
-rw-r--r--kernel/printk/printk.c23
-rw-r--r--kernel/printk/printk_safe.c14
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c28
-rw-r--r--kernel/rcu/Kconfig1
-rw-r--r--kernel/rcu/Kconfig.debug1
-rw-r--r--kernel/rcu/rcu.h14
-rw-r--r--kernel/rcu/rcutorture.c96
-rw-r--r--kernel/rcu/srcutree.c69
-rw-r--r--kernel/rcu/sync.c214
-rw-r--r--kernel/rcu/tree.c174
-rw-r--r--kernel/rcu/tree.h6
-rw-r--r--kernel/rcu/tree_exp.h53
-rw-r--r--kernel/rcu/tree_plugin.h195
-rw-r--r--kernel/rcu/tree_stall.h4
-rw-r--r--kernel/rcu/update.c13
-rw-r--r--kernel/reboot.c21
-rw-r--r--kernel/resource.c1
-rw-r--r--kernel/rseq.c4
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/clock.c1
-rw-r--r--kernel/sched/core.c534
-rw-r--r--kernel/sched/cpudeadline.c10
-rw-r--r--kernel/sched/cpufreq_schedutil.c29
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/cputime.c1
-rw-r--r--kernel/sched/deadline.c10
-rw-r--r--kernel/sched/debug.c48
-rw-r--r--kernel/sched/fair.c628
-rw-r--r--kernel/sched/features.h1
-rw-r--r--kernel/sched/idle.c1
-rw-r--r--kernel/sched/isolation.c1
-rw-r--r--kernel/sched/membarrier.c11
-rw-r--r--kernel/sched/pelt.c13
-rw-r--r--kernel/sched/pelt.h2
-rw-r--r--kernel/sched/psi.c617
-rw-r--r--kernel/sched/rt.c8
-rw-r--r--kernel/sched/sched-pelt.h2
-rw-r--r--kernel/sched/sched.h134
-rw-r--r--kernel/sched/topology.c18
-rw-r--r--kernel/sched/wait.c9
-rw-r--r--kernel/sched/wait_bit.c1
-rw-r--r--kernel/seccomp.c6
-rw-r--r--kernel/signal.c361
-rw-r--r--kernel/smp.c13
-rw-r--r--kernel/smpboot.c1
-rw-r--r--kernel/softirq.c5
-rw-r--r--kernel/stacktrace.c5
-rw-r--r--kernel/stop_machine.c24
-rw-r--r--kernel/sys.c64
-rw-r--r--kernel/sys_ni.c5
-rw-r--r--kernel/sysctl.c120
-rw-r--r--kernel/taskstats.c45
-rw-r--r--kernel/test_kprobes.c11
-rw-r--r--kernel/time/Kconfig1
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/alarmtimer.c1
-rw-r--r--kernel/time/clocksource.c4
-rw-r--r--kernel/time/hrtimer.c8
-rw-r--r--kernel/time/ntp.c26
-rw-r--r--kernel/time/ntp_internal.h4
-rw-r--r--kernel/time/posix-timers.c13
-rw-r--r--kernel/time/sched_clock.c2
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/time.c14
-rw-r--r--kernel/time/timekeeping.c18
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/time/timer_list.c36
-rw-r--r--kernel/time/vsyscall.c129
-rw-r--r--kernel/torture.c23
-rw-r--r--kernel/trace/Kconfig8
-rw-r--r--kernel/trace/bpf_trace.c232
-rw-r--r--kernel/trace/ftrace.c21
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/ring_buffer_benchmark.c2
-rw-r--r--kernel/trace/trace.c447
-rw-r--r--kernel/trace/trace.h32
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_events_filter.c92
-rw-r--r--kernel/trace/trace_events_hist.c279
-rw-r--r--kernel/trace/trace_events_trigger.c3
-rw-r--r--kernel/trace/trace_hwlat.c2
-rw-r--r--kernel/trace/trace_kdb.c67
-rw-r--r--kernel/trace/trace_kprobe.c77
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/trace/trace_probe.c291
-rw-r--r--kernel/trace/trace_probe.h78
-rw-r--r--kernel/trace/trace_probe_tmpl.h2
-rw-r--r--kernel/trace/trace_selftest.c5
-rw-r--r--kernel/trace/trace_uprobe.c74
-rw-r--r--kernel/tracepoint.c15
-rw-r--r--kernel/tsacct.c13
-rw-r--r--kernel/ucount.c7
-rw-r--r--kernel/umh.c1
-rw-r--r--kernel/up.c4
-rw-r--r--kernel/user-return-notifier.c1
-rw-r--r--kernel/user.c16
-rw-r--r--kernel/user_namespace.c16
-rw-r--r--kernel/utsname.c6
-rw-r--r--kernel/utsname_sysctl.c6
-rw-r--r--kernel/workqueue.c136
267 files changed, 13149 insertions, 5314 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 6e699100872f..34d1e77ee9df 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,5 +1,6 @@
#
# Generated files
#
+kheaders.md5
timeconst.h
hz.bc
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
index a3bb4cb52539..68646feefb3d 100644
--- a/kernel/Kconfig.freezer
+++ b/kernel/Kconfig.freezer
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
config FREEZER
def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..38ef6d06888e 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Timer Interrupt Frequency Configuration
#
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index bf770d7556f7..e0852dc333ac 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# The ARCH_INLINE foo is necessary because select ignores "depends on"
#
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0fee5fe6c899..dc0b682ec2d9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
choice
prompt "Preemption Model"
diff --git a/kernel/Makefile b/kernel/Makefile
index 62471e75a2b0..a8d923b5481b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -71,6 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
+obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
@@ -122,3 +123,12 @@ $(obj)/configs.o: $(obj)/config_data.gz
targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
+
+$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
+
+quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
+cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@
+$(obj)/kheaders_data.tar.xz: FORCE
+ $(call cmd,genikh)
+
+clean-files := kheaders_data.tar.xz kheaders.md5
diff --git a/kernel/acct.c b/kernel/acct.c
index addf7732fb56..81f9831a7859 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -227,7 +227,7 @@ static int acct_on(struct filename *pathname)
filp_close(file, NULL);
return PTR_ERR(internal);
}
- err = mnt_want_write(internal);
+ err = __mnt_want_write(internal);
if (err) {
mntput(internal);
kfree(acct);
@@ -252,7 +252,7 @@ static int acct_on(struct filename *pathname)
old = xchg(&ns->bacct, &acct->pin);
mutex_unlock(&acct->lock);
pin_kill(old);
- mnt_drop_write(mnt);
+ __mnt_drop_write(mnt);
mntput(mnt);
return 0;
}
diff --git a/kernel/async.c b/kernel/async.c
index f6bd0d9885e1..4f9c1d614016 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* async.c: Asynchronous function calls for boot performance
*
* (C) Copyright 2009 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
@@ -119,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
/* 1) run (and print duration) */
if (initcall_debug && system_state < SYSTEM_RUNNING) {
- pr_debug("calling %lli_%pF @ %i\n",
+ pr_debug("calling %lli_%pS @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
@@ -128,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
if (initcall_debug && system_state < SYSTEM_RUNNING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
+ pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
diff --git a/kernel/audit.c b/kernel/audit.c
index c89ea48c70a6..da8dc0db5bd3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit.c -- Auditing support
* Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
* System-call specific features have moved to auditsc.c
@@ -5,20 +6,6 @@
* Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Goals: 1) Integrate fully with Security Modules.
@@ -2220,7 +2207,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
if (!audit_enabled)
return;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN);
if (!ab)
return;
@@ -2274,6 +2261,33 @@ out:
}
/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
+int audit_signal_info(int sig, struct task_struct *t)
+{
+ kuid_t uid = current_uid(), auid;
+
+ if (auditd_test_task(t) &&
+ (sig == SIGTERM || sig == SIGHUP ||
+ sig == SIGUSR1 || sig == SIGUSR2)) {
+ audit_sig_pid = task_tgid_nr(current);
+ auid = audit_get_loginuid(current);
+ if (uid_valid(auid))
+ audit_sig_uid = auid;
+ else
+ audit_sig_uid = uid;
+ security_task_getsecid(current, &audit_sig_sid);
+ }
+
+ return audit_signal_info_syscall(t);
+}
+
+/**
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
diff --git a/kernel/audit.h b/kernel/audit.h
index 958d5b8fc1b3..6fb7160412d4 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -1,22 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/fs.h>
@@ -231,7 +218,7 @@ extern int audit_comparator(const u32 left, const u32 op, const u32 right);
extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
extern int parent_len(const char *path);
-extern int audit_compare_dname_path(const char *dname, const char *path, int plen);
+extern int audit_compare_dname_path(const struct qstr *dname, const char *path, int plen);
extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi,
const void *payload, int size);
extern void audit_panic(const char *message);
@@ -299,7 +286,7 @@ extern const char *audit_tree_path(struct audit_tree *tree);
extern void audit_put_tree(struct audit_tree *tree);
extern void audit_kill_trees(struct audit_context *context);
-extern int audit_signal_info(int sig, struct task_struct *t);
+extern int audit_signal_info_syscall(struct task_struct *t);
extern void audit_filter_inodes(struct task_struct *tsk,
struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
@@ -330,7 +317,11 @@ extern struct list_head *audit_killed_trees(void);
#define audit_tree_path(rule) "" /* never called */
#define audit_kill_trees(context) BUG()
-#define audit_signal_info(s, t) AUDIT_DISABLED
+static inline int audit_signal_info_syscall(struct task_struct *t)
+{
+ return 0;
+}
+
#define audit_filter_inodes(t, c) AUDIT_DISABLED
#endif /* CONFIG_AUDITSYSCALL */
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 37ae95cfb7f4..f0d243318452 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -1,18 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit_fsnotify.c -- tracking inodes
*
* Copyright 2003-2009,2014-2015 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/kernel.h>
@@ -164,7 +155,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
static int audit_mark_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
u32 mask, const void *data, int data_type,
- const unsigned char *dname, u32 cookie,
+ const struct qstr *dname, u32 cookie,
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index abfb112f26aa..e49c912f862d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1040,7 +1040,7 @@ static void evict_chunk(struct audit_chunk *chunk)
static int audit_tree_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
u32 mask, const void *data, int data_type,
- const unsigned char *file_name, u32 cookie,
+ const struct qstr *file_name, u32 cookie,
struct fsnotify_iter_info *iter_info)
{
return 0;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index e8d1adeb2223..1f31c2f1e6fc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* audit_watch.c -- watching inodes
*
* Copyright 2003-2009 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/file.h>
@@ -255,7 +242,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
/* Update inode info in audit rules based on filesystem event. */
static void audit_update_watch(struct audit_parent *parent,
- const char *dname, dev_t dev,
+ const struct qstr *dname, dev_t dev,
unsigned long ino, unsigned invalidating)
{
struct audit_watch *owatch, *nwatch, *nextw;
@@ -482,7 +469,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
static int audit_watch_handle_event(struct fsnotify_group *group,
struct inode *to_tell,
u32 mask, const void *data, int data_type,
- const unsigned char *dname, u32 cookie,
+ const struct qstr *dname, u32 cookie,
struct fsnotify_iter_info *iter_info)
{
struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 63f8b3f26fab..b0126e9c0743 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1,22 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* auditfilter.c -- filtering of audit events
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -335,7 +322,7 @@ static u32 audit_to_op(u32 op)
/* check if an audit field is valid */
static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
- switch(f->type) {
+ switch (f->type) {
case AUDIT_MSGTYPE:
if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
entry->rule.listnr != AUDIT_FILTER_USER)
@@ -347,7 +334,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
break;
}
- switch(entry->rule.listnr) {
+ switch (entry->rule.listnr) {
case AUDIT_FILTER_FS:
switch(f->type) {
case AUDIT_FSTYPE:
@@ -358,9 +345,16 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
}
}
- switch(f->type) {
- default:
- return -EINVAL;
+ /* Check for valid field type and op */
+ switch (f->type) {
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_PERS: /* <uapi/linux/personality.h> */
+ case AUDIT_DEVMINOR:
+ /* all ops are valid */
+ break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
@@ -373,46 +367,53 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
case AUDIT_PID:
- case AUDIT_PERS:
case AUDIT_MSGTYPE:
case AUDIT_PPID:
case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
case AUDIT_EXIT:
case AUDIT_SUCCESS:
case AUDIT_INODE:
case AUDIT_SESSIONID:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_SADDR_FAM:
/* bit ops are only useful on syscall args */
if (f->op == Audit_bitmask || f->op == Audit_bittest)
return -EINVAL;
break;
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
- case AUDIT_SUBJ_SEN:
- case AUDIT_SUBJ_CLR:
case AUDIT_OBJ_USER:
case AUDIT_OBJ_ROLE:
case AUDIT_OBJ_TYPE:
- case AUDIT_OBJ_LEV_LOW:
- case AUDIT_OBJ_LEV_HIGH:
case AUDIT_WATCH:
case AUDIT_DIR:
case AUDIT_FILTERKEY:
- break;
case AUDIT_LOGINUID_SET:
- if ((f->val != 0) && (f->val != 1))
- return -EINVAL;
- /* FALL THROUGH */
case AUDIT_ARCH:
case AUDIT_FSTYPE:
+ case AUDIT_PERM:
+ case AUDIT_FILETYPE:
+ case AUDIT_FIELD_COMPARE:
+ case AUDIT_EXE:
+ /* only equal and not equal valid ops */
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
break;
+ default:
+ /* field not recognized */
+ return -EINVAL;
+ }
+
+ /* Check for select valid field values */
+ switch (f->type) {
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ break;
case AUDIT_PERM:
if (f->val & ~15)
return -EINVAL;
@@ -425,11 +426,14 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (f->val > AUDIT_MAX_FIELD_COMPARE)
return -EINVAL;
break;
- case AUDIT_EXE:
- if (f->op != Audit_not_equal && f->op != Audit_equal)
+ case AUDIT_SADDR_FAM:
+ if (f->val >= AF_MAX)
return -EINVAL;
break;
+ default:
+ break;
}
+
return 0;
}
@@ -1114,22 +1118,24 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz)
int err = 0;
struct audit_entry *entry;
- entry = audit_data_to_entry(data, datasz);
- if (IS_ERR(entry))
- return PTR_ERR(entry);
-
switch (type) {
case AUDIT_ADD_RULE:
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
err = audit_add_rule(entry);
audit_log_rule_change("add_rule", &entry->rule, !err);
break;
case AUDIT_DEL_RULE:
+ entry = audit_data_to_entry(data, datasz);
+ if (IS_ERR(entry))
+ return PTR_ERR(entry);
err = audit_del_rule(entry);
audit_log_rule_change("remove_rule", &entry->rule, !err);
break;
default:
- err = -EINVAL;
WARN_ON(1);
+ return -EINVAL;
}
if (err || type == AUDIT_DEL_RULE) {
@@ -1201,7 +1207,6 @@ int audit_comparator(u32 left, u32 op, u32 right)
case Audit_bittest:
return ((left & right) == right);
default:
- BUG();
return 0;
}
}
@@ -1224,7 +1229,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
case Audit_bitmask:
case Audit_bittest:
default:
- BUG();
return 0;
}
}
@@ -1247,7 +1251,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
case Audit_bitmask:
case Audit_bittest:
default:
- BUG();
return 0;
}
}
@@ -1290,12 +1293,12 @@ int parent_len(const char *path)
* @parentlen: length of the parent if known. Passing in AUDIT_NAME_FULL
* here indicates that we must compute this value.
*/
-int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
+int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen)
{
int dlen, pathlen;
const char *p;
- dlen = strlen(dname);
+ dlen = dname->len;
pathlen = strlen(path);
if (pathlen < dlen)
return 1;
@@ -1306,7 +1309,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
p = path + parentlen;
- return strncmp(p, dname, dlen);
+ return strncmp(p, dname->name, dlen);
}
int audit_filter(int msgtype, unsigned int listtype)
@@ -1315,8 +1318,6 @@ int audit_filter(int msgtype, unsigned int listtype)
int ret = 1; /* Audit by default */
rcu_read_lock();
- if (list_empty(&audit_filter_list[listtype]))
- goto unlock_and_return;
list_for_each_entry_rcu(e, &audit_filter_list[listtype], list) {
int i, result = 0;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d1eab1d4a930..4effe01ebbe2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -601,12 +601,20 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_WATCH:
- if (name)
- result = audit_watch_compare(rule->watch, name->ino, name->dev);
+ if (name) {
+ result = audit_watch_compare(rule->watch,
+ name->ino,
+ name->dev);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ }
break;
case AUDIT_DIR:
- if (ctx)
+ if (ctx) {
result = match_tree_refs(ctx, rule->tree);
+ if (f->op == Audit_not_equal)
+ result = !result;
+ }
break;
case AUDIT_LOGINUID:
result = audit_uid_comparator(audit_get_loginuid(tsk),
@@ -615,6 +623,11 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
+ case AUDIT_SADDR_FAM:
+ if (ctx->sockaddr)
+ result = audit_comparator(ctx->sockaddr->ss_family,
+ f->op, f->val);
+ break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -684,9 +697,13 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_PERM:
result = audit_match_perm(ctx, f->val);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_FILETYPE:
result = audit_match_filetype(ctx, f->val);
+ if (f->op == Audit_not_equal)
+ result = !result;
break;
case AUDIT_FIELD_COMPARE:
result = audit_field_compare(tsk, cred, f, ctx, name);
@@ -771,15 +788,13 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
return AUDIT_DISABLED;
rcu_read_lock();
- if (!list_empty(list)) {
- list_for_each_entry_rcu(e, list, list) {
- if (audit_in_mask(&e->rule, ctx->major) &&
- audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return state;
- }
+ list_for_each_entry_rcu(e, list, list) {
+ if (audit_in_mask(&e->rule, ctx->major) &&
+ audit_filter_rules(tsk, &e->rule, ctx, NULL,
+ &state, false)) {
+ rcu_read_unlock();
+ ctx->current_state = state;
+ return state;
}
}
rcu_read_unlock();
@@ -798,9 +813,6 @@ static int audit_filter_inode_name(struct task_struct *tsk,
struct audit_entry *e;
enum audit_state state;
- if (list_empty(list))
- return 0;
-
list_for_each_entry_rcu(e, list, list) {
if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
@@ -808,7 +820,6 @@ static int audit_filter_inode_name(struct task_struct *tsk,
return 1;
}
}
-
return 0;
}
@@ -840,6 +851,13 @@ static inline void audit_proctitle_free(struct audit_context *context)
context->proctitle.len = 0;
}
+static inline void audit_free_module(struct audit_context *context)
+{
+ if (context->type == AUDIT_KERN_MODULE) {
+ kfree(context->module.name);
+ context->module.name = NULL;
+ }
+}
static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
@@ -923,6 +941,7 @@ int audit_alloc(struct task_struct *tsk)
static inline void audit_free_context(struct audit_context *context)
{
+ audit_free_module(context);
audit_free_names(context);
unroll_tree_refs(context, NULL, 0);
free_tree_refs(context);
@@ -1139,7 +1158,8 @@ out:
kfree(buf_head);
}
-void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+static void audit_log_cap(struct audit_buffer *ab, char *prefix,
+ kernel_cap_t *cap)
{
int i;
@@ -1266,7 +1286,6 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "name=");
if (context->module.name) {
audit_log_untrustedstring(ab, context->module.name);
- kfree(context->module.name);
} else
audit_log_format(ab, "(null)");
@@ -1628,7 +1647,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
return;
}
- context->arch = syscall_get_arch();
+ context->arch = syscall_get_arch(current);
context->major = major;
context->argv[0] = a1;
context->argv[1] = a2;
@@ -1697,6 +1716,7 @@ void __audit_syscall_exit(int success, long return_code)
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ audit_free_module(context);
audit_free_names(context);
unroll_tree_refs(context, NULL, 0);
audit_free_aux(context);
@@ -1897,8 +1917,9 @@ static inline int audit_copy_fcaps(struct audit_names *name,
}
/* Copy inode data into an audit_names. */
-void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- struct inode *inode, unsigned int flags)
+static void audit_copy_inode(struct audit_names *name,
+ const struct dentry *dentry,
+ struct inode *inode, unsigned int flags)
{
name->ino = inode->i_ino;
name->dev = inode->i_sb->s_dev;
@@ -1935,18 +1956,16 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
return;
rcu_read_lock();
- if (!list_empty(list)) {
- list_for_each_entry_rcu(e, list, list) {
- for (i = 0; i < e->rule.field_count; i++) {
- struct audit_field *f = &e->rule.fields[i];
-
- if (f->type == AUDIT_FSTYPE
- && audit_comparator(inode->i_sb->s_magic,
- f->op, f->val)
- && e->rule.action == AUDIT_NEVER) {
- rcu_read_unlock();
- return;
- }
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(inode->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
}
}
}
@@ -2045,7 +2064,7 @@ void __audit_inode_child(struct inode *parent,
{
struct audit_context *context = audit_context();
struct inode *inode = d_backing_inode(dentry);
- const char *dname = dentry->d_name.name;
+ const struct qstr *dname = &dentry->d_name;
struct audit_names *n, *found_parent = NULL, *found_child = NULL;
struct audit_entry *e;
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
@@ -2055,18 +2074,16 @@ void __audit_inode_child(struct inode *parent,
return;
rcu_read_lock();
- if (!list_empty(list)) {
- list_for_each_entry_rcu(e, list, list) {
- for (i = 0; i < e->rule.field_count; i++) {
- struct audit_field *f = &e->rule.fields[i];
-
- if (f->type == AUDIT_FSTYPE
- && audit_comparator(parent->i_sb->s_magic,
- f->op, f->val)
- && e->rule.action == AUDIT_NEVER) {
- rcu_read_unlock();
- return;
- }
+ list_for_each_entry_rcu(e, list, list) {
+ for (i = 0; i < e->rule.field_count; i++) {
+ struct audit_field *f = &e->rule.fields[i];
+
+ if (f->type == AUDIT_FSTYPE
+ && audit_comparator(parent->i_sb->s_magic,
+ f->op, f->val)
+ && e->rule.action == AUDIT_NEVER) {
+ rcu_read_unlock();
+ return;
}
}
}
@@ -2099,7 +2116,7 @@ void __audit_inode_child(struct inode *parent,
(n->type != type && n->type != AUDIT_TYPE_UNKNOWN))
continue;
- if (!strcmp(dname, n->name->name) ||
+ if (!strcmp(dname->name, n->name->name) ||
!audit_compare_dname_path(dname, n->name->name,
found_parent ?
found_parent->name_len :
@@ -2360,30 +2377,17 @@ void __audit_ptrace(struct task_struct *t)
}
/**
- * audit_signal_info - record signal info for shutting down audit subsystem
- * @sig: signal value
+ * audit_signal_info_syscall - record signal info for syscalls
* @t: task being signaled
*
* If the audit subsystem is being terminated, record the task (pid)
* and uid that is doing that.
*/
-int audit_signal_info(int sig, struct task_struct *t)
+int audit_signal_info_syscall(struct task_struct *t)
{
struct audit_aux_data_pids *axp;
struct audit_context *ctx = audit_context();
- kuid_t uid = current_uid(), auid, t_uid = task_uid(t);
-
- if (auditd_test_task(t) &&
- (sig == SIGTERM || sig == SIGHUP ||
- sig == SIGUSR1 || sig == SIGUSR2)) {
- audit_sig_pid = task_tgid_nr(current);
- auid = audit_get_loginuid(current);
- if (uid_valid(auid))
- audit_sig_uid = auid;
- else
- audit_sig_uid = uid;
- security_task_getsecid(current, &audit_sig_sid);
- }
+ kuid_t t_uid = task_uid(t);
if (!audit_signals || audit_dummy_context())
return 0;
@@ -2512,6 +2516,35 @@ void __audit_fanotify(unsigned int response)
AUDIT_FANOTIFY, "resp=%u", response);
}
+void __audit_tk_injoffset(struct timespec64 offset)
+{
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
+ "sec=%lli nsec=%li",
+ (long long)offset.tv_sec, offset.tv_nsec);
+}
+
+static void audit_log_ntp_val(const struct audit_ntp_data *ad,
+ const char *op, enum audit_ntp_type type)
+{
+ const struct audit_ntp_val *val = &ad->vals[type];
+
+ if (val->newval == val->oldval)
+ return;
+
+ audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
+ "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+}
+
+void __audit_ntp_log(const struct audit_ntp_data *ad)
+{
+ audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
+ audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
+ audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
+ audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
+ audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
+ audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
+}
+
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
@@ -2580,7 +2613,7 @@ void audit_seccomp(unsigned long syscall, long signr, int code)
return;
audit_log_task(ab);
audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
- signr, syscall_get_arch(), syscall,
+ signr, syscall_get_arch(current), syscall,
in_compat_syscall(), KSTK_EIP(current), code);
audit_log_end(ab);
}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a563c8fdad0d..a2a97fa3071b 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple stack backtrace regression test module
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#include <linux/completion.h>
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4c2fa3ac56f6..29d781061cd5 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-y := core.o
+CFLAGS_core.o += $(call cc-disable-warning, override-init)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c72e0d8e1e65..1c65ce0098a9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016,2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/btf.h>
@@ -22,7 +14,7 @@
#include "map_in_map.h"
#define ARRAY_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
static void bpf_array_free_percpu(struct bpf_array *array)
{
@@ -63,6 +55,7 @@ int array_map_alloc_check(union bpf_attr *attr)
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size == 0 ||
attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
(percpu && numa_node != NUMA_NO_NODE))
return -EINVAL;
@@ -82,6 +75,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
u32 elem_size, index_mask, max_entries;
bool unpriv = !capable(CAP_SYS_ADMIN);
u64 cost, array_size, mask64;
+ struct bpf_map_memory mem;
struct bpf_array *array;
elem_size = round_up(attr->value_size, 8);
@@ -115,32 +109,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
/* make sure there is no u32 overflow later in round_up() */
cost = array_size;
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-ENOMEM);
- if (percpu) {
+ if (percpu)
cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-ENOMEM);
- }
- cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- ret = bpf_map_precharge_memlock(cost);
+ ret = bpf_map_charge_init(&mem, cost);
if (ret < 0)
return ERR_PTR(ret);
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
- if (!array)
+ if (!array) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
+ }
array->index_mask = index_mask;
array->map.unpriv_array = unpriv;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
- array->map.pages = cost;
+ bpf_map_charge_move(&array->map.memory, &mem);
array->elem_size = elem_size;
if (percpu && bpf_array_alloc_percpu(array)) {
+ bpf_map_charge_finish(&array->map.memory);
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
@@ -160,6 +151,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + array->elem_size * (index & array->index_mask);
}
+static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
+ u32 off)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ if (map->max_entries != 1)
+ return -ENOTSUPP;
+ if (off >= map->value_size)
+ return -EINVAL;
+
+ *imm = (unsigned long)array->value;
+ return 0;
+}
+
+static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
+ u32 *off)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u64 base = (unsigned long)array->value;
+ u64 range = array->elem_size;
+
+ if (map->max_entries != 1)
+ return -ENOTSUPP;
+ if (imm < base || imm >= base + range)
+ return -ENOENT;
+
+ *off = imm - base;
+ return 0;
+}
+
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
@@ -360,7 +381,8 @@ static void array_map_seq_show_elem(struct bpf_map *map, void *key,
return;
}
- seq_printf(m, "%u: ", *(u32 *)key);
+ if (map->btf_key_type_id)
+ seq_printf(m, "%u: ", *(u32 *)key);
btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
seq_puts(m, "\n");
@@ -397,6 +419,18 @@ static int array_map_check_btf(const struct bpf_map *map,
{
u32 int_data;
+ /* One exception for keyless BTF: .bss/.data/.rodata map */
+ if (btf_type_is_void(key_type)) {
+ if (map->map_type != BPF_MAP_TYPE_ARRAY ||
+ map->max_entries != 1)
+ return -EINVAL;
+
+ if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC)
+ return -EINVAL;
+
+ return 0;
+ }
+
if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
return -EINVAL;
@@ -419,6 +453,8 @@ const struct bpf_map_ops array_map_ops = {
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
.map_gen_lookup = array_map_gen_lookup,
+ .map_direct_value_addr = array_map_direct_value_addr,
+ .map_direct_value_meta = array_map_direct_value_meta,
.map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
};
@@ -440,6 +476,9 @@ static int fd_array_map_alloc_check(union bpf_attr *attr)
/* only file descriptors can be stored in this type of map */
if (attr->value_size != sizeof(u32))
return -EINVAL;
+ /* Program read-only/write-only not supported for special maps yet. */
+ if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG))
+ return -EINVAL;
return array_map_alloc_check(attr);
}
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a138..1b6b9349cb85 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/cpumask.h>
#include <linux/spinlock.h>
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..f02504640e18 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -1,8 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#ifndef __BPF_LRU_LIST_H_
#define __BPF_LRU_LIST_H_
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index bd3921b1514b..546ebee39e2a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -185,6 +185,16 @@
i < btf_type_vlen(struct_type); \
i++, member++)
+#define for_each_vsi(i, struct_type, member) \
+ for (i = 0, member = btf_type_var_secinfo(struct_type); \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
+#define for_each_vsi_from(i, from, struct_type, member) \
+ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
static DEFINE_IDR(btf_idr);
static DEFINE_SPINLOCK(btf_idr_lock);
@@ -262,6 +272,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_RESTRICT] = "RESTRICT",
[BTF_KIND_FUNC] = "FUNC",
[BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
+ [BTF_KIND_VAR] = "VAR",
+ [BTF_KIND_DATASEC] = "DATASEC",
};
struct btf_kind_operations {
@@ -314,7 +326,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
return false;
}
-static bool btf_type_is_void(const struct btf_type *t)
+bool btf_type_is_void(const struct btf_type *t)
{
return t == &btf_void;
}
@@ -375,13 +387,36 @@ static bool btf_type_is_int(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
}
+static bool btf_type_is_var(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
+}
+
+static bool btf_type_is_datasec(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+}
+
+/* Types that act only as a source, not sink or intermediate
+ * type when resolving.
+ */
+static bool btf_type_is_resolve_source_only(const struct btf_type *t)
+{
+ return btf_type_is_var(t) ||
+ btf_type_is_datasec(t);
+}
+
/* What types need to be resolved?
*
* btf_type_is_modifier() is an obvious one.
*
* btf_type_is_struct() because its member refers to
* another type (through member->type).
-
+ *
+ * btf_type_is_var() because the variable refers to
+ * another type. btf_type_is_datasec() holds multiple
+ * btf_type_is_var() types that need resolving.
+ *
* btf_type_is_array() because its element (array->type)
* refers to another type. Array can be thought of a
* special case of struct while array just has the same
@@ -390,9 +425,11 @@ static bool btf_type_is_int(const struct btf_type *t)
static bool btf_type_needs_resolve(const struct btf_type *t)
{
return btf_type_is_modifier(t) ||
- btf_type_is_ptr(t) ||
- btf_type_is_struct(t) ||
- btf_type_is_array(t);
+ btf_type_is_ptr(t) ||
+ btf_type_is_struct(t) ||
+ btf_type_is_array(t) ||
+ btf_type_is_var(t) ||
+ btf_type_is_datasec(t);
}
/* t->size can be used */
@@ -403,6 +440,7 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
+ case BTF_KIND_DATASEC:
return true;
}
@@ -467,6 +505,16 @@ static const struct btf_enum *btf_type_enum(const struct btf_type *t)
return (const struct btf_enum *)(t + 1);
}
+static const struct btf_var *btf_type_var(const struct btf_type *t)
+{
+ return (const struct btf_var *)(t + 1);
+}
+
+static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t)
+{
+ return (const struct btf_var_secinfo *)(t + 1);
+}
+
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
@@ -478,23 +526,31 @@ static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
offset < btf->hdr.str_len;
}
-/* Only C-style identifier is permitted. This can be relaxed if
- * necessary.
- */
-static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
+{
+ if ((first ? !isalpha(c) :
+ !isalnum(c)) &&
+ c != '_' &&
+ ((c == '.' && !dot_ok) ||
+ c != '.'))
+ return false;
+ return true;
+}
+
+static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
{
/* offset must be valid */
const char *src = &btf->strings[offset];
const char *src_limit;
- if (!isalpha(*src) && *src != '_')
+ if (!__btf_name_char_ok(*src, true, dot_ok))
return false;
/* set a limit on identifier length */
src_limit = src + KSYM_NAME_LEN;
src++;
while (*src && src < src_limit) {
- if (!isalnum(*src) && *src != '_')
+ if (!__btf_name_char_ok(*src, false, dot_ok))
return false;
src++;
}
@@ -502,6 +558,19 @@ static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
return !*src;
}
+/* Only C-style identifier is permitted. This can be relaxed if
+ * necessary.
+ */
+static bool btf_name_valid_identifier(const struct btf *btf, u32 offset)
+{
+ return __btf_name_valid(btf, offset, false);
+}
+
+static bool btf_name_valid_section(const struct btf *btf, u32 offset)
+{
+ return __btf_name_valid(btf, offset, true);
+}
+
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
{
if (!offset)
@@ -697,6 +766,32 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
__btf_verifier_log(log, "\n");
}
+__printf(4, 5)
+static void btf_verifier_log_vsi(struct btf_verifier_env *env,
+ const struct btf_type *datasec_type,
+ const struct btf_var_secinfo *vsi,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+ if (env->phase != CHECK_META)
+ btf_verifier_log_type(env, datasec_type, NULL);
+
+ __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u",
+ vsi->type, vsi->offset, vsi->size);
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
static void btf_verifier_log_hdr(struct btf_verifier_env *env,
u32 btf_data_size)
{
@@ -974,7 +1069,8 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
} else if (btf_type_is_ptr(size_type)) {
size = sizeof(void *);
} else {
- if (WARN_ON_ONCE(!btf_type_is_modifier(size_type)))
+ if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) &&
+ !btf_type_is_var(size_type)))
return NULL;
size = btf->resolved_sizes[size_type_id];
@@ -1509,7 +1605,7 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
u32 next_type_size = 0;
next_type = btf_type_by_id(btf, next_type_id);
- if (!next_type) {
+ if (!next_type || btf_type_is_resolve_source_only(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
@@ -1542,6 +1638,53 @@ static int btf_modifier_resolve(struct btf_verifier_env *env,
return 0;
}
+static int btf_var_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ u32 next_type_size;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || btf_type_is_resolve_source_only(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ if (btf_type_is_modifier(next_type)) {
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id;
+
+ resolved_type_id = next_type_id;
+ resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+ if (btf_type_is_ptr(resolved_type) &&
+ !env_type_is_resolve_sink(env, resolved_type) &&
+ !env_type_is_resolved(env, resolved_type_id))
+ return env_stack_push(env, resolved_type,
+ resolved_type_id);
+ }
+
+ /* We must resolve to something concrete at this point, no
+ * forward types or similar that would resolve to size of
+ * zero is allowed.
+ */
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+ return 0;
+}
+
static int btf_ptr_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
@@ -1551,7 +1694,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
struct btf *btf = env->btf;
next_type = btf_type_by_id(btf, next_type_id);
- if (!next_type) {
+ if (!next_type || btf_type_is_resolve_source_only(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
@@ -1609,6 +1752,15 @@ static void btf_modifier_seq_show(const struct btf *btf,
btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
}
+static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ t = btf_type_id_resolve(btf, &type_id);
+
+ btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+}
+
static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offset,
struct seq_file *m)
@@ -1776,7 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->index_type */
index_type_id = array->index_type;
index_type = btf_type_by_id(btf, index_type_id);
- if (btf_type_nosize_or_null(index_type)) {
+ if (btf_type_nosize_or_null(index_type) ||
+ btf_type_is_resolve_source_only(index_type)) {
btf_verifier_log_type(env, v->t, "Invalid index");
return -EINVAL;
}
@@ -1795,7 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env,
/* Check array->type */
elem_type_id = array->type;
elem_type = btf_type_by_id(btf, elem_type_id);
- if (btf_type_nosize_or_null(elem_type)) {
+ if (btf_type_nosize_or_null(elem_type) ||
+ btf_type_is_resolve_source_only(elem_type)) {
btf_verifier_log_type(env, v->t,
"Invalid elem");
return -EINVAL;
@@ -2016,7 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
const struct btf_type *member_type = btf_type_by_id(env->btf,
member_type_id);
- if (btf_type_nosize_or_null(member_type)) {
+ if (btf_type_nosize_or_null(member_type) ||
+ btf_type_is_resolve_source_only(member_type)) {
btf_verifier_log_member(env, v->t, member,
"Invalid member");
return -EINVAL;
@@ -2411,6 +2566,222 @@ static struct btf_kind_operations func_ops = {
.seq_show = btf_df_seq_show,
};
+static s32 btf_var_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_var *var;
+ u32 meta_needed = sizeof(*var);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (!t->name_off ||
+ !__btf_name_valid(env->btf, t->name_off, true)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ /* A var cannot be in type void */
+ if (!t->type || !BTF_TYPE_ID_VALID(t->type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ var = btf_type_var(t);
+ if (var->linkage != BTF_VAR_STATIC &&
+ var->linkage != BTF_VAR_GLOBAL_ALLOCATED) {
+ btf_verifier_log_type(env, t, "Linkage not supported");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t)
+{
+ const struct btf_var *var = btf_type_var(t);
+
+ btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage);
+}
+
+static const struct btf_kind_operations var_ops = {
+ .check_meta = btf_var_check_meta,
+ .resolve = btf_var_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_var_log,
+ .seq_show = btf_var_seq_show,
+};
+
+static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_var_secinfo *vsi;
+ u64 last_vsi_end_off = 0, sum = 0;
+ u32 i, meta_needed;
+
+ meta_needed = btf_type_vlen(t) * sizeof(*vsi);
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (!btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen == 0");
+ return -EINVAL;
+ }
+
+ if (!t->size) {
+ btf_verifier_log_type(env, t, "size == 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (!t->name_off ||
+ !btf_name_valid_section(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for_each_vsi(i, t, vsi) {
+ /* A var cannot be in type void */
+ if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid offset");
+ return -EINVAL;
+ }
+
+ if (!vsi->size || vsi->size > t->size) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid size");
+ return -EINVAL;
+ }
+
+ last_vsi_end_off = vsi->offset + vsi->size;
+ if (last_vsi_end_off > t->size) {
+ btf_verifier_log_vsi(env, t, vsi,
+ "Invalid offset+size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_vsi(env, t, vsi, NULL);
+ sum += vsi->size;
+ }
+
+ if (t->size < sum) {
+ btf_verifier_log_type(env, t, "Invalid btf_info size");
+ return -EINVAL;
+ }
+
+ return meta_needed;
+}
+
+static int btf_datasec_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_var_secinfo *vsi;
+ struct btf *btf = env->btf;
+ u16 i;
+
+ for_each_vsi_from(i, v->next_member, v->t, vsi) {
+ u32 var_type_id = vsi->type, type_id, type_size = 0;
+ const struct btf_type *var_type = btf_type_by_id(env->btf,
+ var_type_id);
+ if (!var_type || !btf_type_is_var(var_type)) {
+ btf_verifier_log_vsi(env, v->t, vsi,
+ "Not a VAR kind member");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, var_type) &&
+ !env_type_is_resolved(env, var_type_id)) {
+ env_stack_set_next_member(env, i + 1);
+ return env_stack_push(env, var_type, var_type_id);
+ }
+
+ type_id = var_type->type;
+ if (!btf_type_id_size(btf, &type_id, &type_size)) {
+ btf_verifier_log_vsi(env, v->t, vsi, "Invalid type");
+ return -EINVAL;
+ }
+
+ if (vsi->size < type_size) {
+ btf_verifier_log_vsi(env, v->t, vsi, "Invalid size");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, 0, 0);
+ return 0;
+}
+
+static void btf_datasec_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static void btf_datasec_seq_show(const struct btf *btf,
+ const struct btf_type *t, u32 type_id,
+ void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const struct btf_var_secinfo *vsi;
+ const struct btf_type *var;
+ u32 i;
+
+ seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off));
+ for_each_vsi(i, t, vsi) {
+ var = btf_type_by_id(btf, vsi->type);
+ if (i)
+ seq_puts(m, ",");
+ btf_type_ops(var)->seq_show(btf, var, vsi->type,
+ data + vsi->offset, bits_offset, m);
+ }
+ seq_puts(m, "}");
+}
+
+static const struct btf_kind_operations datasec_ops = {
+ .check_meta = btf_datasec_check_meta,
+ .resolve = btf_datasec_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_datasec_log,
+ .seq_show = btf_datasec_seq_show,
+};
+
static int btf_func_proto_check(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -2542,6 +2913,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_RESTRICT] = &modifier_ops,
[BTF_KIND_FUNC] = &func_ops,
[BTF_KIND_FUNC_PROTO] = &func_proto_ops,
+ [BTF_KIND_VAR] = &var_ops,
+ [BTF_KIND_DATASEC] = &datasec_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -2622,13 +2995,17 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
if (!env_type_is_resolved(env, type_id))
return false;
- if (btf_type_is_struct(t))
+ if (btf_type_is_struct(t) || btf_type_is_datasec(t))
return !btf->resolved_ids[type_id] &&
- !btf->resolved_sizes[type_id];
+ !btf->resolved_sizes[type_id];
- if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) {
+ if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
+ btf_type_is_var(t)) {
t = btf_type_id_resolve(btf, &type_id);
- return t && !btf_type_is_modifier(t);
+ return t &&
+ !btf_type_is_modifier(t) &&
+ !btf_type_is_var(t) &&
+ !btf_type_is_datasec(t);
}
if (btf_type_is_array(t)) {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 4e807973aa80..0a00eaca6fae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1,33 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Functions to manage eBPF programs attached to cgroups
*
* Copyright (c) 2016 Daniel Mack
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
+#include <linux/filter.h>
#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>
+#include <net/bpf_sk_storage.h>
+
+#include "../cgroup/cgroup-internal.h"
DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+void cgroup_bpf_offline(struct cgroup *cgrp)
+{
+ cgroup_get(cgrp);
+ percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
/**
- * cgroup_bpf_put() - put references of all bpf programs
- * @cgrp: the cgroup to modify
+ * cgroup_bpf_release() - put references of all bpf programs and
+ * release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
*/
-void cgroup_bpf_put(struct cgroup *cgrp)
+static void cgroup_bpf_release(struct work_struct *work)
{
+ struct cgroup *cgrp = container_of(work, struct cgroup,
+ bpf.release_work);
enum bpf_cgroup_storage_type stype;
+ struct bpf_prog_array *old_array;
unsigned int type;
+ mutex_lock(&cgroup_mutex);
+
for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog_list *pl, *tmp;
@@ -42,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp)
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
- bpf_prog_array_free(cgrp->bpf.effective[type]);
+ old_array = rcu_dereference_protected(
+ cgrp->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+ bpf_prog_array_free(old_array);
}
+
+ mutex_unlock(&cgroup_mutex);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+ cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ * of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+ struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+ INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+ queue_work(system_wq, &cgrp->bpf.release_work);
}
/* count number of elements in the list.
@@ -98,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
*/
static int compute_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
- struct bpf_prog_array __rcu **array)
+ struct bpf_prog_array **array)
{
enum bpf_cgroup_storage_type stype;
struct bpf_prog_array *progs;
@@ -136,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
}
} while ((p = cgroup_parent(p)));
- rcu_assign_pointer(*array, progs);
+ *array = progs;
return 0;
}
static void activate_effective_progs(struct cgroup *cgrp,
enum bpf_attach_type type,
- struct bpf_prog_array __rcu *array)
+ struct bpf_prog_array *old_array)
{
- struct bpf_prog_array __rcu *old_array;
-
- old_array = xchg(&cgrp->bpf.effective[type], array);
+ rcu_swap_protected(cgrp->bpf.effective[type], old_array,
+ lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
*/
@@ -163,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
* that array below is variable length
*/
#define NR ARRAY_SIZE(cgrp->bpf.effective)
- struct bpf_prog_array __rcu *arrays[NR] = {};
- int i;
+ struct bpf_prog_array *arrays[NR] = {};
+ int ret, i;
+
+ ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+ GFP_KERNEL);
+ if (ret)
+ return ret;
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -180,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
cleanup:
for (i = 0; i < NR; i++)
bpf_prog_array_free(arrays[i]);
+
+ percpu_ref_exit(&cgrp->bpf.refcnt);
+
return -ENOMEM;
}
@@ -193,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
err = compute_effective_progs(desc, type, &desc->bpf.inactive);
if (err)
goto cleanup;
@@ -202,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp,
css_for_each_descendant_pre(css, &cgrp->self) {
struct cgroup *desc = container_of(css, struct cgroup, self);
+ if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
+ if (unlikely(desc->bpf.inactive)) {
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+ continue;
+ }
+
activate_effective_progs(desc, type, desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -441,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
enum bpf_attach_type type = attr->query.attach_type;
struct list_head *progs = &cgrp->bpf.progs[type];
u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog_array *effective;
int cnt, ret = 0, i;
+ effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+ lockdep_is_held(&cgroup_mutex));
+
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
- cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+ cnt = bpf_prog_array_length(effective);
else
cnt = prog_list_length(progs);
@@ -461,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
}
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
- return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
- prog_ids, cnt);
+ return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
} else {
struct bpf_prog_list *pl;
u32 id;
@@ -545,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* The program type passed in via @type must be suitable for network
* filtering. No further check is performed to assert that.
*
- * This function will return %-EPERM if any if an attached program was found
- * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ * For egress packets, this function can return:
+ * NET_XMIT_SUCCESS (0) - continue with packet output
+ * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
+ * NET_XMIT_CN (2) - continue with packet output and notify TCP
+ * to call cwr
+ * -EPERM - drop packet
+ *
+ * For ingress packets, this function will return -EPERM if any
+ * attached program was found and if it returned != 1 during execution.
+ * Otherwise 0 is returned.
*/
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
@@ -572,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
/* compute pointers for the bpf prog */
bpf_compute_and_save_data_end(skb, &saved_data_end);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- __bpf_prog_run_save_cb);
+ if (type == BPF_CGROUP_INET_EGRESS) {
+ ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
+ cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+ } else {
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ __bpf_prog_run_save_cb);
+ ret = (ret == 1 ? 0 : -EPERM);
+ }
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
skb->sk = save_sk;
- return ret == 1 ? 0 : -EPERM;
+
+ return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
@@ -701,7 +773,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
@@ -710,6 +782,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_map_update_elem_proto;
case BPF_FUNC_map_delete_elem:
return &bpf_map_delete_elem_proto;
+ case BPF_FUNC_map_push_elem:
+ return &bpf_map_push_elem_proto;
+ case BPF_FUNC_map_pop_elem:
+ return &bpf_map_pop_elem_proto;
+ case BPF_FUNC_map_peek_elem:
+ return &bpf_map_peek_elem_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
@@ -725,6 +803,12 @@ cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
}
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ return cgroup_base_func_proto(func_id, prog);
+}
+
static bool cgroup_dev_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -762,3 +846,692 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
.get_func_proto = cgroup_dev_func_proto,
.is_valid_access = cgroup_dev_is_valid_access,
};
+
+/**
+ * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
+ *
+ * @head: sysctl table header
+ * @table: sysctl table
+ * @write: sysctl is being read (= 0) or written (= 1)
+ * @buf: pointer to buffer passed by user space
+ * @pcount: value-result argument: value is size of buffer pointed to by @buf,
+ * result is size of @new_buf if program set new value, initial value
+ * otherwise
+ * @ppos: value-result argument: value is position at which read from or write
+ * to sysctl is happening, result is new position if program overrode it,
+ * initial value otherwise
+ * @new_buf: pointer to pointer to new buffer that will be allocated if program
+ * overrides new value provided by user space on sysctl write
+ * NOTE: it's caller responsibility to free *new_buf if it was set
+ * @type: type of program to be executed
+ *
+ * Program is run when sysctl is being accessed, either read or written, and
+ * can allow or deny such access.
+ *
+ * This function will return %-EPERM if an attached program is found and
+ * returned value != 1 during execution. In all other cases 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
+ struct ctl_table *table, int write,
+ void __user *buf, size_t *pcount,
+ loff_t *ppos, void **new_buf,
+ enum bpf_attach_type type)
+{
+ struct bpf_sysctl_kern ctx = {
+ .head = head,
+ .table = table,
+ .write = write,
+ .ppos = ppos,
+ .cur_val = NULL,
+ .cur_len = PAGE_SIZE,
+ .new_val = NULL,
+ .new_len = 0,
+ .new_updated = 0,
+ };
+ struct cgroup *cgrp;
+ int ret;
+
+ ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
+ if (ctx.cur_val) {
+ mm_segment_t old_fs;
+ loff_t pos = 0;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
+ &ctx.cur_len, &pos)) {
+ /* Let BPF program decide how to proceed. */
+ ctx.cur_len = 0;
+ }
+ set_fs(old_fs);
+ } else {
+ /* Let BPF program decide how to proceed. */
+ ctx.cur_len = 0;
+ }
+
+ if (write && buf && *pcount) {
+ /* BPF program should be able to override new value with a
+ * buffer bigger than provided by user.
+ */
+ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
+ ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
+ if (!ctx.new_val ||
+ copy_from_user(ctx.new_val, buf, ctx.new_len))
+ /* Let BPF program decide how to proceed. */
+ ctx.new_len = 0;
+ }
+
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+ rcu_read_unlock();
+
+ kfree(ctx.cur_val);
+
+ if (ret == 1 && ctx.new_updated) {
+ *new_buf = ctx.new_val;
+ *pcount = ctx.new_len;
+ } else {
+ kfree(ctx.new_val);
+ }
+
+ return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
+
+#ifdef CONFIG_NET
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+ enum bpf_attach_type attach_type)
+{
+ struct bpf_prog_array *prog_array;
+ bool empty;
+
+ rcu_read_lock();
+ prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+ empty = bpf_prog_array_is_empty(prog_array);
+ rcu_read_unlock();
+
+ return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+ if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+ return -EINVAL;
+
+ ctx->optval = kzalloc(max_optlen, GFP_USER);
+ if (!ctx->optval)
+ return -ENOMEM;
+
+ ctx->optval_end = ctx->optval + max_optlen;
+ ctx->optlen = max_optlen;
+
+ return 0;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+ kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+ int *optname, char __user *optval,
+ int *optlen, char **kernel_optval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = *level,
+ .optname = *optname,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+ return 0;
+
+ ret = sockopt_alloc_buf(&ctx, *optlen);
+ if (ret)
+ return ret;
+
+ if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen == -1) {
+ /* optlen set to -1, bypass kernel */
+ ret = 1;
+ } else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+ /* optlen is out of bounds */
+ ret = -EFAULT;
+ } else {
+ /* optlen within bounds, run kernel handler */
+ ret = 0;
+
+ /* export any potential modifications */
+ *level = ctx.level;
+ *optname = ctx.optname;
+ *optlen = ctx.optlen;
+ *kernel_optval = ctx.optval;
+ }
+
+out:
+ if (ret)
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
+
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+ int optname, char __user *optval,
+ int __user *optlen, int max_optlen,
+ int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .retval = retval,
+ };
+ int ret;
+
+ /* Opportunistic check to see whether we have any BPF program
+ * attached to the hook so we don't waste time allocating
+ * memory and locking the socket.
+ */
+ if (!cgroup_bpf_enabled ||
+ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+ return retval;
+
+ ret = sockopt_alloc_buf(&ctx, max_optlen);
+ if (ret)
+ return ret;
+
+ if (!retval) {
+ /* If kernel getsockopt finished successfully,
+ * copy whatever was returned to the user back
+ * into our temporary buffer. Set optlen to the
+ * one that kernel returned as well to let
+ * BPF programs inspect the value.
+ */
+
+ if (get_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen)
+ ctx.optlen = max_optlen;
+
+ if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ lock_sock(sk);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+ &ctx, BPF_PROG_RUN);
+ release_sock(sk);
+
+ if (!ret) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ if (ctx.optlen > max_optlen) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* BPF programs only allowed to set retval to 0, not some
+ * arbitrary value.
+ */
+ if (ctx.retval != 0 && ctx.retval != retval) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+ put_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret = ctx.retval;
+
+out:
+ sockopt_free_buf(&ctx);
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
+#endif
+
+static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
+ size_t *lenp)
+{
+ ssize_t tmp_ret = 0, ret;
+
+ if (dir->header.parent) {
+ tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
+ if (tmp_ret < 0)
+ return tmp_ret;
+ }
+
+ ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
+ if (ret < 0)
+ return ret;
+ *bufp += ret;
+ *lenp -= ret;
+ ret += tmp_ret;
+
+ /* Avoid leading slash. */
+ if (!ret)
+ return ret;
+
+ tmp_ret = strscpy(*bufp, "/", *lenp);
+ if (tmp_ret < 0)
+ return tmp_ret;
+ *bufp += tmp_ret;
+ *lenp -= tmp_ret;
+
+ return ret + tmp_ret;
+}
+
+BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
+ size_t, buf_len, u64, flags)
+{
+ ssize_t tmp_ret = 0, ret;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
+ if (!ctx->head)
+ return -EINVAL;
+ tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
+ if (tmp_ret < 0)
+ return tmp_ret;
+ }
+
+ ret = strscpy(buf, ctx->table->procname, buf_len);
+
+ return ret < 0 ? ret : tmp_ret + ret;
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
+ .func = bpf_sysctl_get_name,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
+ size_t src_len)
+{
+ if (!dst)
+ return -EINVAL;
+
+ if (!dst_len)
+ return -E2BIG;
+
+ if (!src || !src_len) {
+ memset(dst, 0, dst_len);
+ return -EINVAL;
+ }
+
+ memcpy(dst, src, min(dst_len, src_len));
+
+ if (dst_len > src_len) {
+ memset(dst + src_len, '\0', dst_len - src_len);
+ return src_len;
+ }
+
+ dst[dst_len - 1] = '\0';
+
+ return -E2BIG;
+}
+
+BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
+ char *, buf, size_t, buf_len)
+{
+ return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
+ .func = bpf_sysctl_get_current_value,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
+ size_t, buf_len)
+{
+ if (!ctx->write) {
+ if (buf && buf_len)
+ memset(buf, '\0', buf_len);
+ return -EINVAL;
+ }
+ return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
+ .func = bpf_sysctl_get_new_value,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
+ const char *, buf, size_t, buf_len)
+{
+ if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
+ return -EINVAL;
+
+ if (buf_len > PAGE_SIZE - 1)
+ return -E2BIG;
+
+ memcpy(ctx->new_val, buf, buf_len);
+ ctx->new_len = buf_len;
+ ctx->new_updated = 1;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
+ .func = bpf_sysctl_set_new_value,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+};
+
+static const struct bpf_func_proto *
+sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_strtol:
+ return &bpf_strtol_proto;
+ case BPF_FUNC_strtoul:
+ return &bpf_strtoul_proto;
+ case BPF_FUNC_sysctl_get_name:
+ return &bpf_sysctl_get_name_proto;
+ case BPF_FUNC_sysctl_get_current_value:
+ return &bpf_sysctl_get_current_value_proto;
+ case BPF_FUNC_sysctl_get_new_value:
+ return &bpf_sysctl_get_new_value_proto;
+ case BPF_FUNC_sysctl_set_new_value:
+ return &bpf_sysctl_set_new_value_proto;
+ default:
+ return cgroup_base_func_proto(func_id, prog);
+ }
+}
+
+static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
+ return false;
+
+ switch (off) {
+ case offsetof(struct bpf_sysctl, write):
+ if (type != BPF_READ)
+ return false;
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+ case offsetof(struct bpf_sysctl, file_pos):
+ if (type == BPF_READ) {
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size, size_default);
+ } else {
+ return size == size_default;
+ }
+ default:
+ return false;
+ }
+}
+
+static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sysctl, write):
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sysctl_kern, write,
+ FIELD_SIZEOF(struct bpf_sysctl_kern,
+ write),
+ target_size));
+ break;
+ case offsetof(struct bpf_sysctl, file_pos):
+ /* ppos is a pointer so it should be accessed via indirect
+ * loads and stores. Also for stores additional temporary
+ * register is used since neither src_reg nor dst_reg can be
+ * overridden.
+ */
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(
+ BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sysctl_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sysctl_kern, ppos));
+ *insn++ = BPF_STX_MEM(
+ BPF_SIZEOF(u32), treg, si->src_reg, 0);
+ *insn++ = BPF_LDX_MEM(
+ BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sysctl_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sysctl_kern, ppos));
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0);
+ }
+ *target_size = sizeof(u32);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
+ .get_func_proto = sysctl_func_proto,
+ .is_valid_access = sysctl_is_valid_access,
+ .convert_ctx_access = sysctl_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_sysctl_prog_ops = {
+};
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+#ifdef CONFIG_NET
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif
+ default:
+ return cgroup_base_func_proto(func_id, prog);
+ }
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
+
+ if (off < 0 || off >= sizeof(struct bpf_sockopt))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ if (type == BPF_WRITE) {
+ switch (off) {
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_GETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optname):
+ /* fallthrough */
+ case offsetof(struct bpf_sockopt, level):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type ==
+ BPF_CGROUP_SETSOCKOPT;
+ case offsetof(struct bpf_sockopt, optlen):
+ return size == size_default;
+ default:
+ return false;
+ }
+ }
+
+ switch (off) {
+ case offsetof(struct bpf_sockopt, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (size != size_default)
+ return false;
+ return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+ default:
+ if (size != size_default)
+ return false;
+ break;
+ }
+ return true;
+}
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F) \
+ T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sockopt, sk):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+ break;
+ case offsetof(struct bpf_sockopt, level):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+ break;
+ case offsetof(struct bpf_sockopt, optname):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+ break;
+ case offsetof(struct bpf_sockopt, optlen):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+ break;
+ case offsetof(struct bpf_sockopt, retval):
+ if (type == BPF_WRITE)
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+ else
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ break;
+ case offsetof(struct bpf_sockopt, optval):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+ break;
+ case offsetof(struct bpf_sockopt, optval_end):
+ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+ bool direct_write,
+ const struct bpf_prog *prog)
+{
+ /* Nothing to do for sockopt argument. The data is kzalloc'ated.
+ */
+ return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+ .get_func_proto = cg_sockopt_func_proto,
+ .is_valid_access = cg_sockopt_is_valid_access,
+ .convert_ctx_access = cg_sockopt_convert_ctx_access,
+ .gen_prologue = cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c605397c79f0..16079550db6d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux Socket Filter - Kernel level socket filtering
*
@@ -12,11 +13,6 @@
* Alexei Starovoitov <ast@plumgrid.com>
* Daniel Borkmann <dborkman@redhat.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
@@ -292,7 +288,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
dst[i] = fp->insnsi[i];
if (!was_ld_map &&
dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
- dst[i].src_reg == BPF_PSEUDO_MAP_FD) {
+ (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
+ dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
was_ld_map = true;
dst[i].imm = 0;
} else if (was_ld_map &&
@@ -337,7 +334,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
}
static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
- s32 end_new, u32 curr, const bool probe_pass)
+ s32 end_new, s32 curr, const bool probe_pass)
{
const s64 imm_min = S32_MIN, imm_max = S32_MAX;
s32 delta = end_new - end_old;
@@ -355,7 +352,7 @@ static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old,
}
static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
- s32 end_new, u32 curr, const bool probe_pass)
+ s32 end_new, s32 curr, const bool probe_pass)
{
const s32 off_min = S16_MIN, off_max = S16_MAX;
s32 delta = end_new - end_old;
@@ -438,6 +435,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
u32 insn_adj_cnt, insn_rest, insn_delta = len - 1;
const u32 cnt_max = S16_MAX;
struct bpf_prog *prog_adj;
+ int err;
/* Since our patchlet doesn't expand the image, we're done. */
if (insn_delta == 0) {
@@ -453,8 +451,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
* we afterwards may not fail anymore.
*/
if (insn_adj_cnt > cnt_max &&
- bpf_adj_branches(prog, off, off + 1, off + len, true))
- return NULL;
+ (err = bpf_adj_branches(prog, off, off + 1, off + len, true)))
+ return ERR_PTR(err);
/* Several new instructions need to be inserted. Make room
* for them. Likely, there's no need for a new allocation as
@@ -463,7 +461,7 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt),
GFP_USER);
if (!prog_adj)
- return NULL;
+ return ERR_PTR(-ENOMEM);
prog_adj->len = insn_adj_cnt;
@@ -1095,13 +1093,13 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
continue;
tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten);
- if (!tmp) {
+ if (IS_ERR(tmp)) {
/* Patching may have repointed aux->prog during
* realloc from the original one, so we need to
* fix it up here on error.
*/
bpf_jit_prog_release_other(prog, clone);
- return ERR_PTR(-ENOMEM);
+ return tmp;
}
clone = tmp;
@@ -1366,10 +1364,10 @@ select_insn:
insn++;
CONT;
ALU_ARSH_X:
- DST = (u64) (u32) ((*(s32 *) &DST) >> SRC);
+ DST = (u64) (u32) (((s32) DST) >> SRC);
CONT;
ALU_ARSH_K:
- DST = (u64) (u32) ((*(s32 *) &DST) >> IMM);
+ DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
@@ -1793,38 +1791,42 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
return &empty_prog_array.hdr;
}
-void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs ||
- progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ if (!progs || progs == &empty_prog_array.hdr)
return;
kfree_rcu(progs, rcu);
}
-int bpf_prog_array_length(struct bpf_prog_array __rcu *array)
+int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
u32 cnt = 0;
- rcu_read_lock();
- item = rcu_dereference(array)->items;
- for (; item->prog; item++)
+ for (item = array->items; item->prog; item++)
if (item->prog != &dummy_bpf_prog.prog)
cnt++;
- rcu_read_unlock();
return cnt;
}
+bool bpf_prog_array_is_empty(struct bpf_prog_array *array)
+{
+ struct bpf_prog_array_item *item;
-static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
+ for (item = array->items; item->prog; item++)
+ if (item->prog != &dummy_bpf_prog.prog)
+ return false;
+ return true;
+}
+
+static bool bpf_prog_array_copy_core(struct bpf_prog_array *array,
u32 *prog_ids,
u32 request_cnt)
{
struct bpf_prog_array_item *item;
int i = 0;
- item = rcu_dereference_check(array, 1)->items;
- for (; item->prog; item++) {
+ for (item = array->items; item->prog; item++) {
if (item->prog == &dummy_bpf_prog.prog)
continue;
prog_ids[i] = item->prog->aux->id;
@@ -1837,7 +1839,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array,
return !!(item->prog);
}
-int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_to_user(struct bpf_prog_array *array,
__u32 __user *prog_ids, u32 cnt)
{
unsigned long err = 0;
@@ -1848,18 +1850,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
* cnt = bpf_prog_array_length();
* if (cnt > 0)
* bpf_prog_array_copy_to_user(..., cnt);
- * so below kcalloc doesn't need extra cnt > 0 check, but
- * bpf_prog_array_length() releases rcu lock and
- * prog array could have been swapped with empty or larger array,
- * so always copy 'cnt' prog_ids to the user.
- * In a rare race the user will see zero prog_ids
+ * so below kcalloc doesn't need extra cnt > 0 check.
*/
ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN);
if (!ids)
return -ENOMEM;
- rcu_read_lock();
nospc = bpf_prog_array_copy_core(array, ids, cnt);
- rcu_read_unlock();
err = copy_to_user(prog_ids, ids, cnt * sizeof(u32));
kfree(ids);
if (err)
@@ -1869,19 +1865,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array,
return 0;
}
-void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array,
+void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
struct bpf_prog *old_prog)
{
- struct bpf_prog_array_item *item = array->items;
+ struct bpf_prog_array_item *item;
- for (; item->prog; item++)
+ for (item = array->items; item->prog; item++)
if (item->prog == old_prog) {
WRITE_ONCE(item->prog, &dummy_bpf_prog.prog);
break;
}
}
-int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
struct bpf_prog_array **new_array)
@@ -1945,7 +1941,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
return 0;
}
-int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
+int bpf_prog_array_copy_info(struct bpf_prog_array *array,
u32 *prog_ids, u32 request_cnt,
u32 *prog_cnt)
{
@@ -2088,6 +2084,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
return false;
}
+/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
+ * analysis code and wants explicit zero extension inserted by verifier.
+ * Otherwise, return FALSE.
+ */
+bool __weak bpf_jit_needs_zext(void)
+{
+ return false;
+}
+
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
@@ -2099,10 +2104,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
-int sysctl_bpf_stats_enabled __read_mostly;
/* All definitions of tracepoints related to BPF. */
#define CREATE_TRACE_POINTS
#include <linux/bpf_trace.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 3c18260403dd..ef49e17ae47c 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* bpf/cpumap.c
*
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
- * Released under terms in GPL version 2. See COPYING.
*/
/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
@@ -32,14 +32,19 @@
/* General idea: XDP packets getting XDP redirected to another CPU,
* will maximum be stored/queued for one driver ->poll() call. It is
- * guaranteed that setting flush bit and flush operation happen on
+ * guaranteed that queueing the frame and the flush operation happen on
* same CPU. Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
* which queue in bpf_cpu_map_entry contains packets.
*/
#define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */
+struct bpf_cpu_map_entry;
+struct bpf_cpu_map;
+
struct xdp_bulk_queue {
void *q[CPU_MAP_BULK_SIZE];
+ struct list_head flush_node;
+ struct bpf_cpu_map_entry *obj;
unsigned int count;
};
@@ -52,6 +57,8 @@ struct bpf_cpu_map_entry {
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
+ struct bpf_cpu_map *cmap;
+
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
@@ -65,23 +72,17 @@ struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
struct bpf_cpu_map_entry **cpu_map;
- unsigned long __percpu *flush_needed;
+ struct list_head __percpu *flush_list;
};
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
- struct xdp_bulk_queue *bq, bool in_napi_ctx);
-
-static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
-{
- return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
-}
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx);
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
+ int ret, cpu;
u64 cost;
- int ret;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -105,23 +106,21 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
- cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_cmap;
- cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+ cost += sizeof(struct list_head) * num_possible_cpus();
/* Notice returns -EPERM on if map size is larger than memlock limit */
- ret = bpf_map_precharge_memlock(cmap->map.pages);
+ ret = bpf_map_charge_init(&cmap->map.memory, cost);
if (ret) {
err = ret;
goto free_cmap;
}
- /* A per cpu bitfield with a bit per possible CPU in map */
- cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
- __alignof__(unsigned long));
- if (!cmap->flush_needed)
- goto free_cmap;
+ cmap->flush_list = alloc_percpu(struct list_head);
+ if (!cmap->flush_list)
+ goto free_charge;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu));
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
@@ -132,7 +131,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
return &cmap->map;
free_percpu:
- free_percpu(cmap->flush_needed);
+ free_percpu(cmap->flush_list);
+free_charge:
+ bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
kfree(cmap);
return ERR_PTR(err);
@@ -160,12 +161,12 @@ static void cpu_map_kthread_stop(struct work_struct *work)
}
static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
- struct xdp_frame *xdpf)
+ struct xdp_frame *xdpf,
+ struct sk_buff *skb)
{
unsigned int hard_start_headroom;
unsigned int frame_size;
void *pkt_data_start;
- struct sk_buff *skb;
/* Part of headroom was reserved to xdpf */
hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
@@ -191,8 +192,8 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
pkt_data_start = xdpf->data - hard_start_headroom;
- skb = build_skb(pkt_data_start, frame_size);
- if (!skb)
+ skb = build_skb_around(skb, pkt_data_start, frame_size);
+ if (unlikely(!skb))
return NULL;
skb_reserve(skb, hard_start_headroom);
@@ -209,6 +210,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
* - RX ring dev queue index (skb_record_rx_queue)
*/
+ /* Until page_pool get SKB return path, release DMA here */
+ xdp_release_frame(xdpf);
+
/* Allow SKB to reuse area used by xdp_frame */
xdp_scrub_frame(xdpf);
@@ -240,6 +244,8 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
}
}
+#define CPUMAP_BATCH 8
+
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
@@ -252,8 +258,11 @@ static int cpu_map_kthread_run(void *data)
* kthread_stop signal until queue is empty.
*/
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
- unsigned int processed = 0, drops = 0, sched = 0;
- struct xdp_frame *xdpf;
+ unsigned int drops = 0, sched = 0;
+ void *frames[CPUMAP_BATCH];
+ void *skbs[CPUMAP_BATCH];
+ gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+ int i, n, m;
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -269,18 +278,38 @@ static int cpu_map_kthread_run(void *data)
sched = cond_resched();
}
- /* Process packets in rcpu->queue */
- local_bh_disable();
/*
* The bpf_cpu_map_entry is single consumer, with this
* kthread CPU pinned. Lockless access to ptr_ring
* consume side valid as no-resize allowed of queue.
*/
- while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
- struct sk_buff *skb;
+ n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
+
+ for (i = 0; i < n; i++) {
+ void *f = frames[i];
+ struct page *page = virt_to_page(f);
+
+ /* Bring struct page memory area to curr CPU. Read by
+ * build_skb_around via page_is_pfmemalloc(), and when
+ * freed written by page_frag_free call.
+ */
+ prefetchw(page);
+ }
+
+ m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
+ if (unlikely(m == 0)) {
+ for (i = 0; i < n; i++)
+ skbs[i] = NULL; /* effect: xdp_return_frame */
+ drops = n;
+ }
+
+ local_bh_disable();
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ struct sk_buff *skb = skbs[i];
int ret;
- skb = cpu_map_build_skb(rcpu, xdpf);
+ skb = cpu_map_build_skb(rcpu, xdpf, skb);
if (!skb) {
xdp_return_frame(xdpf);
continue;
@@ -290,13 +319,9 @@ static int cpu_map_kthread_run(void *data)
ret = netif_receive_skb_core(skb);
if (ret == NET_RX_DROP)
drops++;
-
- /* Limit BH-disable period */
- if (++processed == 8)
- break;
}
/* Feedback loop via tracepoint */
- trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+ trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
local_bh_enable(); /* resched point, may call do_softirq() */
}
@@ -311,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
{
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
- int numa, err;
+ struct xdp_bulk_queue *bq;
+ int numa, err, i;
/* Have map->numa_node, but choose node of redirect target CPU */
numa = cpu_to_node(cpu);
@@ -326,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
if (!rcpu->bulkq)
goto free_rcu;
+ for_each_possible_cpu(i) {
+ bq = per_cpu_ptr(rcpu->bulkq, i);
+ bq->obj = rcpu;
+ }
+
/* Alloc queue */
rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
if (!rcpu->queue)
@@ -382,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu)
struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
/* No concurrent bq_enqueue can run at this point */
- bq_flush_to_queue(rcpu, bq, false);
+ bq_flush_to_queue(bq, false);
}
free_percpu(rcpu->bulkq);
/* Cannot kthread_stop() here, last put free rcpu resources */
@@ -465,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
if (!rcpu)
return -ENOMEM;
+ rcpu->cmap = cmap;
}
rcu_read_lock();
__cpu_map_entry_replace(cmap, key_cpu, rcpu);
@@ -491,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map)
synchronize_rcu();
/* To ensure all pending flush operations have completed wait for flush
- * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
- * Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new bits will be set.
+ * list be empty on _all_ cpus. Because the above synchronize_rcu()
+ * ensures the map is disconnected from the program we can assume no new
+ * items will be added to the list.
*/
for_each_online_cpu(cpu) {
- unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+ struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu);
- while (!bitmap_empty(bitmap, cmap->map.max_entries))
+ while (!list_empty(flush_list))
cond_resched();
}
@@ -515,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map)
/* bq flush and cleanup happens after RCU graze-period */
__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
}
- free_percpu(cmap->flush_needed);
+ free_percpu(cmap->flush_list);
bpf_map_area_free(cmap->cpu_map);
kfree(cmap);
}
@@ -567,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = {
.map_check_btf = map_check_no_btf,
};
-static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
- struct xdp_bulk_queue *bq, bool in_napi_ctx)
+static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx)
{
+ struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
const int to_cpu = rcpu->cpu;
struct ptr_ring *q;
@@ -598,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
bq->count = 0;
spin_unlock(&q->producer_lock);
+ __list_del_clearprev(&bq->flush_node);
+
/* Feedback loop via tracepoints */
trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
return 0;
@@ -608,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
*/
static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
+ struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
- bq_flush_to_queue(rcpu, bq, true);
+ bq_flush_to_queue(bq, true);
/* Notice, xdp_buff/page MUST be queued here, long enough for
* driver to code invoking us to finished, due to driver
@@ -623,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
* operation, when completing napi->poll call.
*/
bq->q[bq->count++] = xdpf;
+
+ if (!bq->flush_node.prev)
+ list_add(&bq->flush_node, flush_list);
+
return 0;
}
@@ -642,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
-void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
- struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
-
- __set_bit(bit, bitmap);
-}
-
void __cpu_map_flush(struct bpf_map *map)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
- unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
- u32 bit;
-
- /* The napi->poll softirq makes sure __cpu_map_insert_ctx()
- * and __cpu_map_flush() happen on same CPU. Thus, the percpu
- * bitmap indicate which percpu bulkq have packets.
- */
- for_each_set_bit(bit, bitmap, map->max_entries) {
- struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
- struct xdp_bulk_queue *bq;
-
- /* This is possible if entry is removed by user space
- * between xdp redirect and flush op.
- */
- if (unlikely(!rcpu))
- continue;
-
- __clear_bit(bit, bitmap);
+ struct list_head *flush_list = this_cpu_ptr(cmap->flush_list);
+ struct xdp_bulk_queue *bq, *tmp;
- /* Flush all frames in bulkq to real queue */
- bq = this_cpu_ptr(rcpu->bulkq);
- bq_flush_to_queue(rcpu, bq, true);
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
+ bq_flush_to_queue(bq, true);
/* If already running, costs spin_lock_irqsave + smb_mb */
- wake_up_process(rcpu->kthread);
+ wake_up_process(bq->obj->kthread);
}
}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..d83cf8ccc872 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
/* Devmaps primary use is as a backend map for XDP BPF helper call
@@ -25,9 +17,8 @@
* datapath always has a valid copy. However, the datapath does a "flush"
* operation that pushes any pending packets in the driver outside the RCU
* critical section. Each bpf_dtab_netdev tracks these pending operations using
- * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
- * until all bits are cleared indicating outstanding flush operations have
- * completed.
+ * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
+ * this list is empty, indicating outstanding flush operations have completed.
*
* BPF syscalls may race with BPF program calls on any of the update, delete
* or lookup operations. As noted above the xchg() operation also keep the
@@ -56,9 +47,13 @@
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
#define DEV_MAP_BULK_SIZE 16
+struct bpf_dtab_netdev;
+
struct xdp_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+ struct list_head flush_node;
struct net_device *dev_rx;
+ struct bpf_dtab_netdev *obj;
unsigned int count;
};
@@ -73,22 +68,17 @@ struct bpf_dtab_netdev {
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map;
- unsigned long __percpu *flush_needed;
+ struct list_head __percpu *flush_list;
struct list_head list;
};
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
-static u64 dev_map_bitmap_size(const union bpf_attr *attr)
-{
- return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
-}
-
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
struct bpf_dtab *dtab;
- int err = -EINVAL;
+ int err, cpu;
u64 cost;
if (!capable(CAP_NET_ADMIN))
@@ -99,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
+ /* Lookup returns a pointer straight to dev->ifindex, so make sure the
+ * verifier prevents writes from the BPF side
+ */
+ attr->map_flags |= BPF_F_RDONLY_PROG;
+
dtab = kzalloc(sizeof(*dtab), GFP_USER);
if (!dtab)
return ERR_PTR(-ENOMEM);
@@ -107,39 +102,39 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
- cost += dev_map_bitmap_size(attr) * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_dtab;
+ cost += sizeof(struct list_head) * num_possible_cpus();
- dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* if map size is larger than memlock limit, reject it early */
- err = bpf_map_precharge_memlock(dtab->map.pages);
+ /* if map size is larger than memlock limit, reject it */
+ err = bpf_map_charge_init(&dtab->map.memory, cost);
if (err)
goto free_dtab;
err = -ENOMEM;
- /* A per cpu bitfield with a bit per possible net device */
- dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
- __alignof__(unsigned long),
- GFP_KERNEL | __GFP_NOWARN);
- if (!dtab->flush_needed)
- goto free_dtab;
+ dtab->flush_list = alloc_percpu(struct list_head);
+ if (!dtab->flush_list)
+ goto free_charge;
+
+ for_each_possible_cpu(cpu)
+ INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu));
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
- goto free_dtab;
+ goto free_percpu;
spin_lock(&dev_map_lock);
list_add_tail_rcu(&dtab->list, &dev_map_list);
spin_unlock(&dev_map_lock);
return &dtab->map;
+
+free_percpu:
+ free_percpu(dtab->flush_list);
+free_charge:
+ bpf_map_charge_finish(&dtab->map.memory);
free_dtab:
- free_percpu(dtab->flush_needed);
kfree(dtab);
return ERR_PTR(err);
}
@@ -164,15 +159,18 @@ static void dev_map_free(struct bpf_map *map)
bpf_clear_redirect_map(map);
synchronize_rcu();
+ /* Make sure prior __dev_map_entry_free() have completed. */
+ rcu_barrier();
+
/* To ensure all pending flush operations have completed wait for flush
- * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * list to empty on _all_ cpus.
* Because the above synchronize_rcu() ensures the map is disconnected
- * from the program we can assume no new bits will be set.
+ * from the program we can assume no new items will be added.
*/
for_each_online_cpu(cpu) {
- unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+ struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu);
- while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ while (!list_empty(flush_list))
cond_resched();
}
@@ -183,11 +181,12 @@ static void dev_map_free(struct bpf_map *map)
if (!dev)
continue;
+ free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
- free_percpu(dtab->flush_needed);
+ free_percpu(dtab->flush_list);
bpf_map_area_free(dtab->netdev_map);
kfree(dtab);
}
@@ -209,18 +208,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
-{
- struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
-
- __set_bit(bit, bitmap);
-}
-
-static int bq_xmit_all(struct bpf_dtab_netdev *obj,
- struct xdp_bulk_queue *bq, u32 flags,
+static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags,
bool in_napi_ctx)
{
+ struct bpf_dtab_netdev *obj = bq->obj;
struct net_device *dev = obj->dev;
int sent = 0, drops = 0, err = 0;
int i;
@@ -247,6 +238,7 @@ out:
trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
sent, drops, bq->dev_rx, dev, err);
bq->dev_rx = NULL;
+ __list_del_clearprev(&bq->flush_node);
return 0;
error:
/* If ndo_xdp_xmit fails with an errno, no frames have been
@@ -269,30 +261,19 @@ error:
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the ctx bitmap
- * is zeroed before completing to ensure all flush operations have completed.
+ * net device can be torn down. On devmap tear down we ensure the flush list
+ * is empty before completing to ensure all flush operations have completed.
*/
void __dev_map_flush(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
- u32 bit;
-
- for_each_set_bit(bit, bitmap, map->max_entries) {
- struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
- struct xdp_bulk_queue *bq;
-
- /* This is possible if the dev entry is removed by user space
- * between xdp redirect and flush op.
- */
- if (unlikely(!dev))
- continue;
-
- __clear_bit(bit, bitmap);
+ struct list_head *flush_list = this_cpu_ptr(dtab->flush_list);
+ struct xdp_bulk_queue *bq, *tmp;
- bq = this_cpu_ptr(dev->bulkq);
- bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true);
- }
+ rcu_read_lock();
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+ bq_xmit_all(bq, XDP_XMIT_FLUSH, true);
+ rcu_read_unlock();
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
@@ -318,10 +299,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
+ struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
- bq_xmit_all(obj, bq, 0, true);
+ bq_xmit_all(bq, 0, true);
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
@@ -331,6 +313,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
bq->dev_rx = dev_rx;
bq->q[bq->count++] = xdpf;
+
+ if (!bq->flush_node.prev)
+ list_add(&bq->flush_node, flush_list);
+
return 0;
}
@@ -381,17 +367,14 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
if (dev->dev->netdev_ops->ndo_xdp_xmit) {
struct xdp_bulk_queue *bq;
- unsigned long *bitmap;
-
int cpu;
+ rcu_read_lock();
for_each_online_cpu(cpu) {
- bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
- __clear_bit(dev->bit, bitmap);
-
bq = per_cpu_ptr(dev->bulkq, cpu);
- bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false);
+ bq_xmit_all(bq, XDP_XMIT_FLUSH, false);
}
+ rcu_read_unlock();
}
}
@@ -436,8 +419,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
struct net *net = current->nsproxy->net_ns;
gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev, *old_dev;
- u32 i = *(u32 *)key;
u32 ifindex = *(u32 *)value;
+ struct xdp_bulk_queue *bq;
+ u32 i = *(u32 *)key;
+ int cpu;
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
@@ -460,6 +445,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
return -ENOMEM;
}
+ for_each_possible_cpu(cpu) {
+ bq = per_cpu_ptr(dev->bulkq, cpu);
+ bq->obj = dev;
+ }
+
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
free_percpu(dev->bulkq);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index de73f55e42fd..b44d8c447afd 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
@@ -205,10 +197,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
* part of the ldimm64 insn is accessible.
*/
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
- bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+ bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD ||
+ insn->src_reg == BPF_PSEUDO_MAP_VALUE;
char tmp[64];
- if (map_ptr && !allow_ptr_leaks)
+ if (is_ptr && !allow_ptr_leaks)
imm = 0;
verbose(cbs->private_data, "(%02x) r%d = %s\n",
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e1324a834a24..e546b18d27da 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,14 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#ifndef __BPF_DISASM_H__
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index fed15cf94dca..22066a62c8c9 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/btf.h>
@@ -23,7 +15,7 @@
#define HTAB_CREATE_FLAG_MASK \
(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
- BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
+ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
struct bucket {
struct hlist_nulls_head head;
@@ -262,8 +254,8 @@ static int htab_map_alloc_check(union bpf_attr *attr)
/* Guard against local DoS, and discourage production use. */
return -EPERM;
- if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
- /* reserved bits should not be used */
+ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
return -EINVAL;
if (!lru && percpu_lru)
@@ -360,14 +352,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
else
cost += (u64) htab->elem_size * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- /* make sure page count doesn't overflow */
- goto free_htab;
-
- htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- /* if map size is larger than memlock limit, reject it early */
- err = bpf_map_precharge_memlock(htab->map.pages);
+ /* if map size is larger than memlock limit, reject it */
+ err = bpf_map_charge_init(&htab->map.memory, cost);
if (err)
goto free_htab;
@@ -376,7 +362,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
sizeof(struct bucket),
htab->map.numa_node);
if (!htab->buckets)
- goto free_htab;
+ goto free_charge;
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
@@ -409,6 +395,8 @@ free_prealloc:
prealloc_destroy(htab);
free_buckets:
bpf_map_area_free(htab->buckets);
+free_charge:
+ bpf_map_charge_finish(&htab->map.memory);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -527,18 +515,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
return insn - insn_buf;
}
-static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map,
+ void *key, const bool mark)
{
struct htab_elem *l = __htab_map_lookup_elem(map, key);
if (l) {
- bpf_lru_node_set_ref(&l->lru_node);
+ if (mark)
+ bpf_lru_node_set_ref(&l->lru_node);
return l->key + round_up(map->key_size, 8);
}
return NULL;
}
+static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ return __htab_lru_map_lookup_elem(map, key, true);
+}
+
+static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
+{
+ return __htab_lru_map_lookup_elem(map, key, false);
+}
+
static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
@@ -1250,6 +1250,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_map_lookup_elem,
+ .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
.map_gen_lookup = htab_lru_map_gen_lookup,
@@ -1281,7 +1282,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
- struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l;
void __percpu *pptr;
int ret = -ENOENT;
@@ -1297,8 +1297,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
l = __htab_map_lookup_elem(map, key);
if (!l)
goto out;
- if (htab_is_lru(htab))
- bpf_lru_node_set_ref(&l->lru_node);
+ /* We do not mark LRU map element here in order to not mess up
+ * eviction heuristics when user space does a map walk.
+ */
pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off,
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index a411fc17d265..5e28718928ca 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/rcupdate.h>
@@ -18,6 +10,9 @@
#include <linux/sched.h>
#include <linux/uidgid.h>
#include <linux/filter.h>
+#include <linux/ctype.h>
+
+#include "../../lib/kstrtox.h"
/* If kernel subsystem is allowing eBPF programs to call this function,
* inside its own verifier_ops->get_func_proto() callback it should return
@@ -363,4 +358,132 @@ const struct bpf_func_proto bpf_get_local_storage_proto = {
.arg2_type = ARG_ANYTHING,
};
#endif
+
+#define BPF_STRTOX_BASE_MASK 0x1F
+
+static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags,
+ unsigned long long *res, bool *is_negative)
+{
+ unsigned int base = flags & BPF_STRTOX_BASE_MASK;
+ const char *cur_buf = buf;
+ size_t cur_len = buf_len;
+ unsigned int consumed;
+ size_t val_len;
+ char str[64];
+
+ if (!buf || !buf_len || !res || !is_negative)
+ return -EINVAL;
+
+ if (base != 0 && base != 8 && base != 10 && base != 16)
+ return -EINVAL;
+
+ if (flags & ~BPF_STRTOX_BASE_MASK)
+ return -EINVAL;
+
+ while (cur_buf < buf + buf_len && isspace(*cur_buf))
+ ++cur_buf;
+
+ *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-');
+ if (*is_negative)
+ ++cur_buf;
+
+ consumed = cur_buf - buf;
+ cur_len -= consumed;
+ if (!cur_len)
+ return -EINVAL;
+
+ cur_len = min(cur_len, sizeof(str) - 1);
+ memcpy(str, cur_buf, cur_len);
+ str[cur_len] = '\0';
+ cur_buf = str;
+
+ cur_buf = _parse_integer_fixup_radix(cur_buf, &base);
+ val_len = _parse_integer(cur_buf, base, res);
+
+ if (val_len & KSTRTOX_OVERFLOW)
+ return -ERANGE;
+
+ if (val_len == 0)
+ return -EINVAL;
+
+ cur_buf += val_len;
+ consumed += cur_buf - str;
+
+ return consumed;
+}
+
+static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags,
+ long long *res)
+{
+ unsigned long long _res;
+ bool is_negative;
+ int err;
+
+ err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
+ if (err < 0)
+ return err;
+ if (is_negative) {
+ if ((long long)-_res > 0)
+ return -ERANGE;
+ *res = -_res;
+ } else {
+ if ((long long)_res < 0)
+ return -ERANGE;
+ *res = _res;
+ }
+ return err;
+}
+
+BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags,
+ long *, res)
+{
+ long long _res;
+ int err;
+
+ err = __bpf_strtoll(buf, buf_len, flags, &_res);
+ if (err < 0)
+ return err;
+ if (_res != (long)_res)
+ return -ERANGE;
+ *res = _res;
+ return err;
+}
+
+const struct bpf_func_proto bpf_strtol_proto = {
+ .func = bpf_strtol,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_LONG,
+};
+
+BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags,
+ unsigned long *, res)
+{
+ unsigned long long _res;
+ bool is_negative;
+ int err;
+
+ err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative);
+ if (err < 0)
+ return err;
+ if (is_negative)
+ return -EINVAL;
+ if (_res != (unsigned long)_res)
+ return -ERANGE;
+ *res = _res;
+ return err;
+}
+
+const struct bpf_func_proto bpf_strtoul_proto = {
+ .func = bpf_strtoul,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_LONG,
+};
#endif
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 4a8f390a2b82..cc0d0cf114e3 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Minimal file system backend for holding eBPF maps and programs,
* used by bpf(2) object pinning.
@@ -5,10 +6,6 @@
* Authors:
*
* Daniel Borkmann <daniel@iogearbox.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/init.h>
@@ -518,7 +515,7 @@ out:
static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
{
struct bpf_prog *prog;
- int ret = inode_permission(inode, MAY_READ | MAY_WRITE);
+ int ret = inode_permission(inode, MAY_READ);
if (ret)
return ERR_PTR(ret);
@@ -566,9 +563,8 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-static void bpf_destroy_inode_deferred(struct rcu_head *head)
+static void bpf_free_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
enum bpf_type type;
if (S_ISLNK(inode->i_mode))
@@ -578,16 +574,11 @@ static void bpf_destroy_inode_deferred(struct rcu_head *head)
free_inode_nonrcu(inode);
}
-static void bpf_destroy_inode(struct inode *inode)
-{
- call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred);
-}
-
static const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = bpf_show_options,
- .destroy_inode = bpf_destroy_inode,
+ .free_inode = bpf_free_inode,
};
enum {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 6b572e2de7fb..addd6fdceec8 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -14,7 +14,7 @@ DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STO
#ifdef CONFIG_CGROUP_BPF
#define LOCAL_STORAGE_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
struct bpf_cgroup_storage_map {
struct bpf_map map;
@@ -272,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct bpf_cgroup_storage_map *map;
+ struct bpf_map_memory mem;
+ int ret;
if (attr->key_size != sizeof(struct bpf_cgroup_storage_key))
return ERR_PTR(-EINVAL);
@@ -282,21 +284,26 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
if (attr->value_size > PAGE_SIZE)
return ERR_PTR(-E2BIG);
- if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK)
- /* reserved bits should not be used */
+ if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
return ERR_PTR(-EINVAL);
if (attr->max_entries)
/* max_entries is not used and enforced to be 0 */
return ERR_PTR(-EINVAL);
+ ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map));
+ if (ret < 0)
+ return ERR_PTR(ret);
+
map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map),
__GFP_ZERO | GFP_USER, numa_node);
- if (!map)
+ if (!map) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
+ }
- map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map),
- PAGE_SIZE) >> PAGE_SHIFT;
+ bpf_map_charge_move(&map->map.memory, &mem);
/* copy mandatory map attributes */
bpf_map_init_from_attr(&map->map, attr);
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 93a5cbbde421..56e6c75d354d 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Longest prefix match list implementation
*
* Copyright (c) 2016,2017 Daniel Mack
* Copyright (c) 2016 David Herrmann
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/bpf.h>
@@ -538,7 +535,7 @@ out:
#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \
- BPF_F_RDONLY | BPF_F_WRONLY)
+ BPF_F_ACCESS_MASK)
static struct bpf_map *trie_alloc(union bpf_attr *attr)
{
@@ -553,6 +550,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
if (attr->max_entries == 0 ||
!(attr->map_flags & BPF_F_NO_PREALLOC) ||
attr->map_flags & ~LPM_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
attr->key_size < LPM_KEY_SIZE_MIN ||
attr->key_size > LPM_KEY_SIZE_MAX ||
attr->value_size < LPM_VAL_SIZE_MIN ||
@@ -572,14 +570,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
cost_per_node = sizeof(struct lpm_trie_node) +
attr->value_size + trie->data_size;
cost += (u64) attr->max_entries * cost_per_node;
- if (cost >= U32_MAX - PAGE_SIZE) {
- ret = -E2BIG;
- goto out_err;
- }
-
- trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- ret = bpf_map_precharge_memlock(trie->map.pages);
+ ret = bpf_map_charge_init(&trie->map.memory, cost);
if (ret)
goto out_err;
@@ -715,9 +707,14 @@ find_leftmost:
* have exact two children, so this function will never return NULL.
*/
for (node = search_root; node;) {
- if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+ if (node->flags & LPM_TREE_NODE_FLAG_IM) {
+ node = rcu_dereference(node->child[0]);
+ } else {
next_node = node;
- node = rcu_dereference(node->child[0]);
+ node = rcu_dereference(node->child[0]);
+ if (!node)
+ node = rcu_dereference(next_node->child[1]);
+ }
}
do_copy:
next_key->prefixlen = next_node->prefixlen;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 3dff41403583..fab4fb134547 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/slab.h>
#include <linux/bpf.h>
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 6183db9ec08c..a507bf6ef8b9 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -1,8 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#ifndef __MAP_IN_MAP_H__
#define __MAP_IN_MAP_H__
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 0c1b4ba9e90e..6e090140b924 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include "percpu_freelist.h"
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index c3960118e617..fbf8a8a28979 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -1,8 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#ifndef __PERCPU_FREELIST_H__
#define __PERCPU_FREELIST_H__
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index b384ea9f3254..f697647ceb54 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -11,8 +11,7 @@
#include "percpu_freelist.h"
#define QUEUE_STACK_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-
+ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
struct bpf_queue_stack {
struct bpf_map map;
@@ -52,7 +51,8 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 0 ||
attr->value_size == 0 ||
- attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK)
+ attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags))
return -EINVAL;
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -67,29 +67,28 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
{
int ret, numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_map_memory mem = {0};
struct bpf_queue_stack *qs;
u64 size, queue_size, cost;
size = (u64) attr->max_entries + 1;
cost = queue_size = sizeof(*qs) + size * attr->value_size;
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-E2BIG);
-
- cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- ret = bpf_map_precharge_memlock(cost);
+ ret = bpf_map_charge_init(&mem, cost);
if (ret < 0)
return ERR_PTR(ret);
qs = bpf_map_area_alloc(queue_size, numa_node);
- if (!qs)
+ if (!qs) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
+ }
memset(qs, 0, sizeof(*qs));
bpf_map_init_from_attr(&qs->map, attr);
- qs->map.pages = cost;
+ bpf_map_charge_move(&qs->map.memory, &mem);
qs->size = size;
raw_spin_lock_init(&qs->lock);
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..50c083ba978c 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -151,7 +151,8 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
int err, numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
- u64 cost, array_size;
+ struct bpf_map_memory mem;
+ u64 array_size;
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
@@ -159,24 +160,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
array_size = sizeof(*array);
array_size += (u64)attr->max_entries * sizeof(struct sock *);
- /* make sure there is no u32 overflow later in round_up() */
- cost = array_size;
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-ENOMEM);
- cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- err = bpf_map_precharge_memlock(cost);
+ err = bpf_map_charge_init(&mem, array_size);
if (err)
return ERR_PTR(err);
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
- if (!array)
+ if (!array) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
+ }
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
- array->map.pages = cost;
+ bpf_map_charge_move(&array->map.memory, &mem);
return &array->map;
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 950ab2f28922..052580c33d26 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/bpf.h>
#include <linux/jhash.h>
@@ -89,6 +86,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{
u32 value_size = attr->value_size;
struct bpf_stack_map *smap;
+ struct bpf_map_memory mem;
u64 cost, n_buckets;
int err;
@@ -116,40 +114,37 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
n_buckets = roundup_pow_of_two(attr->max_entries);
cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
- if (cost >= U32_MAX - PAGE_SIZE)
- return ERR_PTR(-E2BIG);
+ cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+ err = bpf_map_charge_init(&mem, cost);
+ if (err)
+ return ERR_PTR(err);
smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
- if (!smap)
+ if (!smap) {
+ bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
-
- err = -E2BIG;
- cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_smap;
+ }
bpf_map_init_from_attr(&smap->map, attr);
smap->map.value_size = value_size;
smap->n_buckets = n_buckets;
- smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
- err = bpf_map_precharge_memlock(smap->map.pages);
- if (err)
- goto free_smap;
err = get_callchain_buffers(sysctl_perf_event_max_stack);
if (err)
- goto free_smap;
+ goto free_charge;
err = prealloc_elems_and_freelist(smap);
if (err)
goto put_buffers;
+ bpf_map_charge_move(&smap->map.memory, &mem);
+
return &smap->map;
put_buffers:
put_callchain_buffers();
-free_smap:
+free_charge:
+ bpf_map_charge_finish(&mem);
bpf_map_area_free(smap);
return ERR_PTR(err);
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index afca36f53c49..5d141f16f6fa 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
@@ -166,29 +158,28 @@ void bpf_map_area_free(void *area)
kvfree(area);
}
+static u32 bpf_map_flags_retain_permanent(u32 flags)
+{
+ /* Some map creation flags are not tied to the map object but
+ * rather to the map fd instead, so they have no meaning upon
+ * map object inspection since multiple file descriptors with
+ * different (access) properties can exist here. Thus, given
+ * this has zero meaning for the map itself, lets clear these
+ * from here.
+ */
+ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY);
+}
+
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
{
map->map_type = attr->map_type;
map->key_size = attr->key_size;
map->value_size = attr->value_size;
map->max_entries = attr->max_entries;
- map->map_flags = attr->map_flags;
+ map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
map->numa_node = bpf_map_attr_numa_node(attr);
}
-int bpf_map_precharge_memlock(u32 pages)
-{
- struct user_struct *user = get_current_user();
- unsigned long memlock_limit, cur;
-
- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- cur = atomic_long_read(&user->locked_vm);
- free_uid(user);
- if (cur + pages > memlock_limit)
- return -EPERM;
- return 0;
-}
-
static int bpf_charge_memlock(struct user_struct *user, u32 pages)
{
unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -202,45 +193,62 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages)
static void bpf_uncharge_memlock(struct user_struct *user, u32 pages)
{
- atomic_long_sub(pages, &user->locked_vm);
+ if (user)
+ atomic_long_sub(pages, &user->locked_vm);
}
-static int bpf_map_init_memlock(struct bpf_map *map)
+int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size)
{
- struct user_struct *user = get_current_user();
+ u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT;
+ struct user_struct *user;
int ret;
- ret = bpf_charge_memlock(user, map->pages);
+ if (size >= U32_MAX - PAGE_SIZE)
+ return -E2BIG;
+
+ user = get_current_user();
+ ret = bpf_charge_memlock(user, pages);
if (ret) {
free_uid(user);
return ret;
}
- map->user = user;
- return ret;
+
+ mem->pages = pages;
+ mem->user = user;
+
+ return 0;
}
-static void bpf_map_release_memlock(struct bpf_map *map)
+void bpf_map_charge_finish(struct bpf_map_memory *mem)
{
- struct user_struct *user = map->user;
- bpf_uncharge_memlock(user, map->pages);
- free_uid(user);
+ bpf_uncharge_memlock(mem->user, mem->pages);
+ free_uid(mem->user);
+}
+
+void bpf_map_charge_move(struct bpf_map_memory *dst,
+ struct bpf_map_memory *src)
+{
+ *dst = *src;
+
+ /* Make sure src will not be used for the redundant uncharging. */
+ memset(src, 0, sizeof(struct bpf_map_memory));
}
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages)
{
int ret;
- ret = bpf_charge_memlock(map->user, pages);
+ ret = bpf_charge_memlock(map->memory.user, pages);
if (ret)
return ret;
- map->pages += pages;
+ map->memory.pages += pages;
return ret;
}
void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages)
{
- bpf_uncharge_memlock(map->user, pages);
- map->pages -= pages;
+ bpf_uncharge_memlock(map->memory.user, pages);
+ map->memory.pages -= pages;
}
static int bpf_map_alloc_id(struct bpf_map *map)
@@ -291,11 +299,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
+ struct bpf_map_memory mem;
- bpf_map_release_memlock(map);
+ bpf_map_charge_move(&mem, &map->memory);
security_bpf_map_free(map);
/* implementation dependent freeing */
map->ops->map_free(map);
+ bpf_map_charge_finish(&mem);
}
static void bpf_map_put_uref(struct bpf_map *map)
@@ -343,6 +353,18 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
return 0;
}
+static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f)
+{
+ fmode_t mode = f.file->f_mode;
+
+ /* Our file permissions may have been overridden by global
+ * map permissions facing syscall side.
+ */
+ if (READ_ONCE(map->frozen))
+ mode &= ~FMODE_CAN_WRITE;
+ return mode;
+}
+
#ifdef CONFIG_PROC_FS
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
@@ -364,14 +386,16 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
"memlock:\t%llu\n"
- "map_id:\t%u\n",
+ "map_id:\t%u\n"
+ "frozen:\t%u\n",
map->map_type,
map->key_size,
map->value_size,
map->max_entries,
map->map_flags,
- map->pages * 1ULL << PAGE_SHIFT,
- map->id);
+ map->memory.pages * 1ULL << PAGE_SHIFT,
+ map->id,
+ READ_ONCE(map->frozen));
if (owner_prog_type) {
seq_printf(m, "owner_prog_type:\t%u\n",
@@ -448,10 +472,10 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
const char *end = src + BPF_OBJ_NAME_LEN;
memset(dst, 0, BPF_OBJ_NAME_LEN);
-
- /* Copy all isalnum() and '_' char */
+ /* Copy all isalnum(), '_' and '.' chars. */
while (src < end && *src) {
- if (!isalnum(*src) && *src != '_')
+ if (!isalnum(*src) &&
+ *src != '_' && *src != '.')
return -EINVAL;
*dst++ = *src++;
}
@@ -478,9 +502,16 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
u32 key_size, value_size;
int ret = 0;
- key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
- if (!key_type || key_size != map->key_size)
- return -EINVAL;
+ /* Some maps allow key to be unspecified. */
+ if (btf_key_id) {
+ key_type = btf_type_id_size(btf, &btf_key_id, &key_size);
+ if (!key_type || key_size != map->key_size)
+ return -EINVAL;
+ } else {
+ key_type = btf_type_by_id(btf, 0);
+ if (!map->ops->map_check_btf)
+ return -EINVAL;
+ }
value_type = btf_type_id_size(btf, &btf_value_id, &value_size);
if (!value_type || value_size != map->value_size)
@@ -489,9 +520,12 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
map->spin_lock_off = btf_find_spin_lock(btf, value_type);
if (map_value_has_spin_lock(map)) {
+ if (map->map_flags & BPF_F_RDONLY_PROG)
+ return -EACCES;
if (map->map_type != BPF_MAP_TYPE_HASH &&
map->map_type != BPF_MAP_TYPE_ARRAY &&
- map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_SK_STORAGE)
return -ENOTSUPP;
if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
map->value_size) {
@@ -513,6 +547,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
static int map_create(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_map_memory mem;
struct bpf_map *map;
int f_flags;
int err;
@@ -537,7 +572,7 @@ static int map_create(union bpf_attr *attr)
err = bpf_obj_name_cpy(map->name, attr->map_name);
if (err)
- goto free_map_nouncharge;
+ goto free_map;
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
@@ -545,22 +580,22 @@ static int map_create(union bpf_attr *attr)
if (attr->btf_key_type_id || attr->btf_value_type_id) {
struct btf *btf;
- if (!attr->btf_key_type_id || !attr->btf_value_type_id) {
+ if (!attr->btf_value_type_id) {
err = -EINVAL;
- goto free_map_nouncharge;
+ goto free_map;
}
btf = btf_get_by_fd(attr->btf_fd);
if (IS_ERR(btf)) {
err = PTR_ERR(btf);
- goto free_map_nouncharge;
+ goto free_map;
}
err = map_check_btf(map, btf, attr->btf_key_type_id,
attr->btf_value_type_id);
if (err) {
btf_put(btf);
- goto free_map_nouncharge;
+ goto free_map;
}
map->btf = btf;
@@ -572,15 +607,11 @@ static int map_create(union bpf_attr *attr)
err = security_bpf_map_alloc(map);
if (err)
- goto free_map_nouncharge;
-
- err = bpf_map_init_memlock(map);
- if (err)
- goto free_map_sec;
+ goto free_map;
err = bpf_map_alloc_id(map);
if (err)
- goto free_map;
+ goto free_map_sec;
err = bpf_map_new_fd(map, f_flags);
if (err < 0) {
@@ -596,13 +627,13 @@ static int map_create(union bpf_attr *attr)
return err;
-free_map:
- bpf_map_release_memlock(map);
free_map_sec:
security_bpf_map_free(map);
-free_map_nouncharge:
+free_map:
btf_put(map->btf);
+ bpf_map_charge_move(&mem, &map->memory);
map->ops->map_free(map);
+ bpf_map_charge_finish(&mem);
return err;
}
@@ -713,8 +744,7 @@ static int map_lookup_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
@@ -773,7 +803,10 @@ static int map_lookup_elem(union bpf_attr *attr)
err = map->ops->map_peek_elem(map, value);
} else {
rcu_read_lock();
- ptr = map->ops->map_lookup_elem(map, key);
+ if (map->ops->map_lookup_elem_sys_only)
+ ptr = map->ops->map_lookup_elem_sys_only(map, key);
+ else
+ ptr = map->ops->map_lookup_elem(map, key);
if (IS_ERR(ptr)) {
err = PTR_ERR(ptr);
} else if (!ptr) {
@@ -843,8 +876,7 @@ static int map_update_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -955,8 +987,7 @@ static int map_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -1007,8 +1038,7 @@ static int map_get_next_key(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_READ)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
err = -EPERM;
goto err_put;
}
@@ -1075,8 +1105,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
-
- if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) {
err = -EPERM;
goto err_put;
}
@@ -1118,6 +1147,36 @@ err_put:
return err;
}
+#define BPF_MAP_FREEZE_LAST_FIELD map_fd
+
+static int map_freeze(const union bpf_attr *attr)
+{
+ int err = 0, ufd = attr->map_fd;
+ struct bpf_map *map;
+ struct fd f;
+
+ if (CHECK_ATTR(BPF_MAP_FREEZE))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ if (READ_ONCE(map->frozen)) {
+ err = -EBUSY;
+ goto err_put;
+ }
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
+ goto err_put;
+ }
+
+ WRITE_ONCE(map->frozen, true);
+err_put:
+ fdput(f);
+ return err;
+}
+
static const struct bpf_prog_ops * const bpf_prog_types[] = {
#define BPF_PROG_TYPE(_id, _name) \
[_id] = & _name ## _prog_ops,
@@ -1517,6 +1576,24 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type,
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ switch (expected_attach_type) {
+ case BPF_CGROUP_SETSOCKOPT:
+ case BPF_CGROUP_GETSOCKOPT:
return 0;
default:
return -EINVAL;
@@ -1540,7 +1617,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
- if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT))
+ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT |
+ BPF_F_ANY_ALIGNMENT |
+ BPF_F_TEST_RND_HI32))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -1557,7 +1636,8 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
/* eBPF programs must be GPL compatible to use GPL-ed functions */
is_gpl = license_is_gpl_compatible(license);
- if (attr->insn_cnt == 0 || attr->insn_cnt > BPF_MAXINSNS)
+ if (attr->insn_cnt == 0 ||
+ attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
return -E2BIG;
if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
type != BPF_PROG_TYPE_CGROUP_SKB &&
@@ -1609,7 +1689,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
if (err < 0)
goto free_prog;
- prog->aux->load_time = ktime_get_boot_ns();
+ prog->aux->load_time = ktime_get_boottime_ns();
err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
if (err)
goto free_prog;
@@ -1728,12 +1808,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
}
raw_tp->btp = btp;
- prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd,
- BPF_PROG_TYPE_RAW_TRACEPOINT);
+ prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
if (IS_ERR(prog)) {
err = PTR_ERR(prog);
goto out_free_tp;
}
+ if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
+ prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
+ err = -EINVAL;
+ goto out_put_prog;
+ }
err = bpf_probe_register(raw_tp->btp, prog);
if (err)
@@ -1764,7 +1848,12 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
switch (prog->type) {
case BPF_PROG_TYPE_CGROUP_SOCK:
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
return attach_type == prog->expected_attach_type ? 0 : -EINVAL;
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ return prog->enforce_expected_attach_type &&
+ prog->expected_attach_type != attach_type ?
+ -EINVAL : 0;
default:
return 0;
}
@@ -1806,6 +1895,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -1827,6 +1918,13 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_FLOW_DISSECTOR:
ptype = BPF_PROG_TYPE_FLOW_DISSECTOR;
break;
+ case BPF_CGROUP_SYSCTL:
+ ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
+ break;
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+ break;
default:
return -EINVAL;
}
@@ -1888,6 +1986,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
break;
case BPF_CGROUP_SOCK_OPS:
@@ -1905,6 +2005,13 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return lirc_prog_detach(attr);
case BPF_FLOW_DISSECTOR:
return skb_flow_dissector_bpf_prog_detach(attr);
+ case BPF_CGROUP_SYSCTL:
+ ptype = BPF_PROG_TYPE_CGROUP_SYSCTL;
+ break;
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT;
+ break;
default:
return -EINVAL;
}
@@ -1936,11 +2043,18 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_INET6_CONNECT:
case BPF_CGROUP_UDP4_SENDMSG:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
case BPF_CGROUP_SOCK_OPS:
case BPF_CGROUP_DEVICE:
+ case BPF_CGROUP_SYSCTL:
+ case BPF_CGROUP_GETSOCKOPT:
+ case BPF_CGROUP_SETSOCKOPT:
break;
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
+ case BPF_FLOW_DISSECTOR:
+ return skb_flow_dissector_prog_query(attr, uattr);
default:
return -EINVAL;
}
@@ -1948,7 +2062,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
return cgroup_bpf_prog_query(attr, uattr);
}
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1961,6 +2075,14 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
if (CHECK_ATTR(BPF_PROG_TEST_RUN))
return -EINVAL;
+ if ((attr->test.ctx_size_in && !attr->test.ctx_in) ||
+ (!attr->test.ctx_size_in && attr->test.ctx_in))
+ return -EINVAL;
+
+ if ((attr->test.ctx_size_out && !attr->test.ctx_out) ||
+ (!attr->test.ctx_size_out && attr->test.ctx_out))
+ return -EINVAL;
+
prog = bpf_prog_get(attr->test.prog_fd);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -2071,13 +2193,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
}
static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
- unsigned long addr)
+ unsigned long addr, u32 *off,
+ u32 *type)
{
+ const struct bpf_map *map;
int i;
- for (i = 0; i < prog->aux->used_map_cnt; i++)
- if (prog->aux->used_maps[i] == (void *)addr)
- return prog->aux->used_maps[i];
+ for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
+ map = prog->aux->used_maps[i];
+ if (map == (void *)addr) {
+ *type = BPF_PSEUDO_MAP_FD;
+ return map;
+ }
+ if (!map->ops->map_direct_value_meta)
+ continue;
+ if (!map->ops->map_direct_value_meta(map, addr, off)) {
+ *type = BPF_PSEUDO_MAP_VALUE;
+ return map;
+ }
+ }
+
return NULL;
}
@@ -2085,6 +2220,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
{
const struct bpf_map *map;
struct bpf_insn *insns;
+ u32 off, type;
u64 imm;
int i;
@@ -2112,11 +2248,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
continue;
imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
- map = bpf_map_from_imm(prog, imm);
+ map = bpf_map_from_imm(prog, imm, &off, &type);
if (map) {
- insns[i].src_reg = BPF_PSEUDO_MAP_FD;
+ insns[i].src_reg = type;
insns[i].imm = map->id;
- insns[i + 1].imm = 0;
+ insns[i + 1].imm = off;
continue;
}
}
@@ -2706,6 +2842,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_MAP_GET_NEXT_KEY:
err = map_get_next_key(&attr);
break;
+ case BPF_MAP_FREEZE:
+ err = map_freeze(&attr);
+ break;
case BPF_PROG_LOAD:
err = bpf_prog_load(&attr, uattr);
break;
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c
index 938d41211be7..ca52b9642943 100644
--- a/kernel/bpf/tnum.c
+++ b/kernel/bpf/tnum.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* tnum: tracked (or tristate) numbers
*
* A tnum tracks knowledge about the bits of a value. Each bit can be either
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 09d5d972c9ff..a2e763703c30 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
* Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <uapi/linux/btf.h>
#include <linux/kernel.h>
@@ -176,8 +168,7 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 131072
-#define BPF_COMPLEXITY_LIMIT_STACK 1024
+#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192
#define BPF_COMPLEXITY_LIMIT_STATES 64
#define BPF_MAP_PTR_UNPRIV 1UL
@@ -335,7 +326,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type)
{
return type == PTR_TO_SOCKET ||
type == PTR_TO_SOCK_COMMON ||
- type == PTR_TO_TCP_SOCK;
+ type == PTR_TO_TCP_SOCK ||
+ type == PTR_TO_XDP_SOCK;
}
static bool reg_type_may_be_null(enum bpf_reg_type type)
@@ -377,7 +369,8 @@ static bool is_release_function(enum bpf_func_id func_id)
static bool is_acquire_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_lookup_tcp ||
- func_id == BPF_FUNC_sk_lookup_udp;
+ func_id == BPF_FUNC_sk_lookup_udp ||
+ func_id == BPF_FUNC_skc_lookup_tcp;
}
static bool is_ptr_cast_function(enum bpf_func_id func_id)
@@ -405,6 +398,8 @@ static const char * const reg_type_str[] = {
[PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
[PTR_TO_TCP_SOCK] = "tcp_sock",
[PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
+ [PTR_TO_TP_BUFFER] = "tp_buffer",
+ [PTR_TO_XDP_SOCK] = "xdp_sock",
};
static char slot_type_char[] = {
@@ -452,12 +447,12 @@ static void print_verifier_state(struct bpf_verifier_env *env,
verbose(env, " R%d", i);
print_liveness(env, reg->live);
verbose(env, "=%s", reg_type_str[t]);
+ if (t == SCALAR_VALUE && reg->precise)
+ verbose(env, "P");
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
- if (t == PTR_TO_STACK)
- verbose(env, ",call_%d", func(env, reg)->callsite);
} else {
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
@@ -519,11 +514,17 @@ static void print_verifier_state(struct bpf_verifier_env *env,
continue;
verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE);
print_liveness(env, state->stack[i].spilled_ptr.live);
- if (state->stack[i].slot_type[0] == STACK_SPILL)
- verbose(env, "=%s",
- reg_type_str[state->stack[i].spilled_ptr.type]);
- else
+ if (state->stack[i].slot_type[0] == STACK_SPILL) {
+ reg = &state->stack[i].spilled_ptr;
+ t = reg->type;
+ verbose(env, "=%s", reg_type_str[t]);
+ if (t == SCALAR_VALUE && reg->precise)
+ verbose(env, "P");
+ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
+ verbose(env, "%lld", reg->var_off.value + reg->off);
+ } else {
verbose(env, "=%s", types_buf);
+ }
}
if (state->acquired_refs && state->refs[0].id) {
verbose(env, " refs=%d", state->refs[0].id);
@@ -672,6 +673,13 @@ static void free_func_state(struct bpf_func_state *state)
kfree(state);
}
+static void clear_jmp_history(struct bpf_verifier_state *state)
+{
+ kfree(state->jmp_history);
+ state->jmp_history = NULL;
+ state->jmp_history_cnt = 0;
+}
+
static void free_verifier_state(struct bpf_verifier_state *state,
bool free_self)
{
@@ -681,6 +689,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
free_func_state(state->frame[i]);
state->frame[i] = NULL;
}
+ clear_jmp_history(state);
if (free_self)
kfree(state);
}
@@ -708,8 +717,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
const struct bpf_verifier_state *src)
{
struct bpf_func_state *dst;
+ u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt;
int i, err;
+ if (dst_state->jmp_history_cnt < src->jmp_history_cnt) {
+ kfree(dst_state->jmp_history);
+ dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER);
+ if (!dst_state->jmp_history)
+ return -ENOMEM;
+ }
+ memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz);
+ dst_state->jmp_history_cnt = src->jmp_history_cnt;
+
/* if dst has more stack frames then src frame, free them */
for (i = src->curframe + 1; i <= dst_state->curframe; i++) {
free_func_state(dst_state->frame[i]);
@@ -718,6 +737,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->speculative = src->speculative;
dst_state->curframe = src->curframe;
dst_state->active_spin_lock = src->active_spin_lock;
+ dst_state->branches = src->branches;
+ dst_state->parent = src->parent;
+ dst_state->first_insn_idx = src->first_insn_idx;
+ dst_state->last_insn_idx = src->last_insn_idx;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
if (!dst) {
@@ -733,6 +756,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
return 0;
}
+static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st)
+{
+ while (st) {
+ u32 br = --st->branches;
+
+ /* WARN_ON(br > 1) technically makes sense here,
+ * but see comment in push_stack(), hence:
+ */
+ WARN_ONCE((int)br < 0,
+ "BUG update_branch_counts:branches_to_explore=%d\n",
+ br);
+ if (br)
+ break;
+ st = st->parent;
+ }
+}
+
static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
int *insn_idx)
{
@@ -781,10 +821,23 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
if (err)
goto err;
elem->st.speculative |= speculative;
- if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
- verbose(env, "BPF program is too complex\n");
+ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) {
+ verbose(env, "The sequence of %d jumps is too complex.\n",
+ env->stack_size);
goto err;
}
+ if (elem->st.parent) {
+ ++elem->st.parent->branches;
+ /* WARN_ON(branches > 2) technically makes sense here,
+ * but
+ * 1. speculative states will bump 'branches' for non-branch
+ * instructions
+ * 2. is_state_visited() heuristics may decide not to create
+ * a new state for a sequence of branches and all such current
+ * and cloned states will be pointing to a single parent state
+ * which might have large 'branches' count.
+ */
+ }
return &elem->st;
err:
free_verifier_state(env->cur_state, true);
@@ -932,6 +985,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
reg->smax_value = S64_MAX;
reg->umin_value = 0;
reg->umax_value = U64_MAX;
+
+ /* constant backtracking is enabled for root only for now */
+ reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
}
/* Mark a register as having a completely unknown (scalar) value. */
@@ -980,6 +1036,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
__mark_reg_not_init(regs + regno);
}
+#define DEF_NOT_SUBREG (0)
static void init_reg_state(struct bpf_verifier_env *env,
struct bpf_func_state *state)
{
@@ -990,6 +1047,7 @@ static void init_reg_state(struct bpf_verifier_env *env,
mark_reg_not_init(env, regs, i);
regs[i].live = REG_LIVE_NONE;
regs[i].parent = NULL;
+ regs[i].subreg_def = DEF_NOT_SUBREG;
}
/* frame pointer */
@@ -1091,7 +1149,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
*/
subprog[env->subprog_cnt].start = insn_cnt;
- if (env->log.level > 1)
+ if (env->log.level & BPF_LOG_LEVEL2)
for (i = 0; i < env->subprog_cnt; i++)
verbose(env, "func#%d @%d\n", i, subprog[i].start);
@@ -1135,9 +1193,10 @@ next:
*/
static int mark_reg_read(struct bpf_verifier_env *env,
const struct bpf_reg_state *state,
- struct bpf_reg_state *parent)
+ struct bpf_reg_state *parent, u8 flag)
{
bool writes = parent == state->parent; /* Observe write marks */
+ int cnt = 0;
while (parent) {
/* if read wasn't screened by an earlier write ... */
@@ -1149,50 +1208,625 @@ static int mark_reg_read(struct bpf_verifier_env *env,
parent->var_off.value, parent->off);
return -EFAULT;
}
+ /* The first condition is more likely to be true than the
+ * second, checked it first.
+ */
+ if ((parent->live & REG_LIVE_READ) == flag ||
+ parent->live & REG_LIVE_READ64)
+ /* The parentage chain never changes and
+ * this parent was already marked as LIVE_READ.
+ * There is no need to keep walking the chain again and
+ * keep re-marking all parents as LIVE_READ.
+ * This case happens when the same register is read
+ * multiple times without writes into it in-between.
+ * Also, if parent has the stronger REG_LIVE_READ64 set,
+ * then no need to set the weak REG_LIVE_READ32.
+ */
+ break;
/* ... then we depend on parent's value */
- parent->live |= REG_LIVE_READ;
+ parent->live |= flag;
+ /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */
+ if (flag == REG_LIVE_READ64)
+ parent->live &= ~REG_LIVE_READ32;
state = parent;
parent = state->parent;
writes = true;
+ cnt++;
}
+
+ if (env->longest_mark_read_walk < cnt)
+ env->longest_mark_read_walk = cnt;
return 0;
}
+/* This function is supposed to be used by the following 32-bit optimization
+ * code only. It returns TRUE if the source or destination register operates
+ * on 64-bit, otherwise return FALSE.
+ */
+static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t)
+{
+ u8 code, class, op;
+
+ code = insn->code;
+ class = BPF_CLASS(code);
+ op = BPF_OP(code);
+ if (class == BPF_JMP) {
+ /* BPF_EXIT for "main" will reach here. Return TRUE
+ * conservatively.
+ */
+ if (op == BPF_EXIT)
+ return true;
+ if (op == BPF_CALL) {
+ /* BPF to BPF call will reach here because of marking
+ * caller saved clobber with DST_OP_NO_MARK for which we
+ * don't care the register def because they are anyway
+ * marked as NOT_INIT already.
+ */
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ return false;
+ /* Helper call will reach here because of arg type
+ * check, conservatively return TRUE.
+ */
+ if (t == SRC_OP)
+ return true;
+
+ return false;
+ }
+ }
+
+ if (class == BPF_ALU64 || class == BPF_JMP ||
+ /* BPF_END always use BPF_ALU class. */
+ (class == BPF_ALU && op == BPF_END && insn->imm == 64))
+ return true;
+
+ if (class == BPF_ALU || class == BPF_JMP32)
+ return false;
+
+ if (class == BPF_LDX) {
+ if (t != SRC_OP)
+ return BPF_SIZE(code) == BPF_DW;
+ /* LDX source must be ptr. */
+ return true;
+ }
+
+ if (class == BPF_STX) {
+ if (reg->type != SCALAR_VALUE)
+ return true;
+ return BPF_SIZE(code) == BPF_DW;
+ }
+
+ if (class == BPF_LD) {
+ u8 mode = BPF_MODE(code);
+
+ /* LD_IMM64 */
+ if (mode == BPF_IMM)
+ return true;
+
+ /* Both LD_IND and LD_ABS return 32-bit data. */
+ if (t != SRC_OP)
+ return false;
+
+ /* Implicit ctx ptr. */
+ if (regno == BPF_REG_6)
+ return true;
+
+ /* Explicit source could be any width. */
+ return true;
+ }
+
+ if (class == BPF_ST)
+ /* The only source register for BPF_ST is a ptr. */
+ return true;
+
+ /* Conservatively return true at default. */
+ return true;
+}
+
+/* Return TRUE if INSN doesn't have explicit value define. */
+static bool insn_no_def(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ return (class == BPF_JMP || class == BPF_JMP32 ||
+ class == BPF_STX || class == BPF_ST);
+}
+
+/* Return TRUE if INSN has defined any 32-bit value explicitly. */
+static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn)
+{
+ if (insn_no_def(insn))
+ return false;
+
+ return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP);
+}
+
+static void mark_insn_zext(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg)
+{
+ s32 def_idx = reg->subreg_def;
+
+ if (def_idx == DEF_NOT_SUBREG)
+ return;
+
+ env->insn_aux_data[def_idx - 1].zext_dst = true;
+ /* The dst will be zero extended, so won't be sub-register anymore. */
+ reg->subreg_def = DEF_NOT_SUBREG;
+}
+
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
+ struct bpf_reg_state *reg, *regs = state->regs;
+ bool rw64;
if (regno >= MAX_BPF_REG) {
verbose(env, "R%d is invalid\n", regno);
return -EINVAL;
}
+ reg = &regs[regno];
+ rw64 = is_reg64(env, insn, regno, reg, t);
if (t == SRC_OP) {
/* check whether register used as source operand can be read */
- if (regs[regno].type == NOT_INIT) {
+ if (reg->type == NOT_INIT) {
verbose(env, "R%d !read_ok\n", regno);
return -EACCES;
}
/* We don't need to worry about FP liveness because it's read-only */
- if (regno != BPF_REG_FP)
- return mark_reg_read(env, &regs[regno],
- regs[regno].parent);
+ if (regno == BPF_REG_FP)
+ return 0;
+
+ if (rw64)
+ mark_insn_zext(env, reg);
+
+ return mark_reg_read(env, reg, reg->parent,
+ rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
verbose(env, "frame pointer is read only\n");
return -EACCES;
}
- regs[regno].live |= REG_LIVE_WRITTEN;
+ reg->live |= REG_LIVE_WRITTEN;
+ reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1;
if (t == DST_OP)
mark_reg_unknown(env, regs, regno);
}
return 0;
}
+/* for any branch, call, exit record the history of jmps in the given state */
+static int push_jmp_history(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur)
+{
+ u32 cnt = cur->jmp_history_cnt;
+ struct bpf_idx_pair *p;
+
+ cnt++;
+ p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER);
+ if (!p)
+ return -ENOMEM;
+ p[cnt - 1].idx = env->insn_idx;
+ p[cnt - 1].prev_idx = env->prev_insn_idx;
+ cur->jmp_history = p;
+ cur->jmp_history_cnt = cnt;
+ return 0;
+}
+
+/* Backtrack one insn at a time. If idx is not at the top of recorded
+ * history then previous instruction came from straight line execution.
+ */
+static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+ u32 *history)
+{
+ u32 cnt = *history;
+
+ if (cnt && st->jmp_history[cnt - 1].idx == i) {
+ i = st->jmp_history[cnt - 1].prev_idx;
+ (*history)--;
+ } else {
+ i--;
+ }
+ return i;
+}
+
+/* For given verifier state backtrack_insn() is called from the last insn to
+ * the first insn. Its purpose is to compute a bitmask of registers and
+ * stack slots that needs precision in the parent verifier state.
+ */
+static int backtrack_insn(struct bpf_verifier_env *env, int idx,
+ u32 *reg_mask, u64 *stack_mask)
+{
+ const struct bpf_insn_cbs cbs = {
+ .cb_print = verbose,
+ .private_data = env,
+ };
+ struct bpf_insn *insn = env->prog->insnsi + idx;
+ u8 class = BPF_CLASS(insn->code);
+ u8 opcode = BPF_OP(insn->code);
+ u8 mode = BPF_MODE(insn->code);
+ u32 dreg = 1u << insn->dst_reg;
+ u32 sreg = 1u << insn->src_reg;
+ u32 spi;
+
+ if (insn->code == 0)
+ return 0;
+ if (env->log.level & BPF_LOG_LEVEL) {
+ verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask);
+ verbose(env, "%d: ", idx);
+ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks);
+ }
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (!(*reg_mask & dreg))
+ return 0;
+ if (opcode == BPF_MOV) {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg = sreg
+ * dreg needs precision after this insn
+ * sreg needs precision before this insn
+ */
+ *reg_mask &= ~dreg;
+ *reg_mask |= sreg;
+ } else {
+ /* dreg = K
+ * dreg needs precision after this insn.
+ * Corresponding register is already marked
+ * as precise=true in this verifier state.
+ * No further markings in parent are necessary
+ */
+ *reg_mask &= ~dreg;
+ }
+ } else {
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* dreg += sreg
+ * both dreg and sreg need precision
+ * before this insn
+ */
+ *reg_mask |= sreg;
+ } /* else dreg += K
+ * dreg still needs precision before this insn
+ */
+ }
+ } else if (class == BPF_LDX) {
+ if (!(*reg_mask & dreg))
+ return 0;
+ *reg_mask &= ~dreg;
+
+ /* scalars can only be spilled into stack w/o losing precision.
+ * Load from any other memory can be zero extended.
+ * The desire to keep that precision is already indicated
+ * by 'precise' mark in corresponding register of this state.
+ * No further tracking necessary.
+ */
+ if (insn->src_reg != BPF_REG_FP)
+ return 0;
+ if (BPF_SIZE(insn->code) != BPF_DW)
+ return 0;
+
+ /* dreg = *(u64 *)[fp - off] was a fill from the stack.
+ * that [fp - off] slot contains scalar that needs to be
+ * tracked with precision
+ */
+ spi = (-insn->off - 1) / BPF_REG_SIZE;
+ if (spi >= 64) {
+ verbose(env, "BUG spi %d\n", spi);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ *stack_mask |= 1ull << spi;
+ } else if (class == BPF_STX) {
+ if (*reg_mask & dreg)
+ /* stx shouldn't be using _scalar_ dst_reg
+ * to access memory. It means backtracking
+ * encountered a case of pointer subtraction.
+ */
+ return -ENOTSUPP;
+ /* scalars can only be spilled into stack */
+ if (insn->dst_reg != BPF_REG_FP)
+ return 0;
+ if (BPF_SIZE(insn->code) != BPF_DW)
+ return 0;
+ spi = (-insn->off - 1) / BPF_REG_SIZE;
+ if (spi >= 64) {
+ verbose(env, "BUG spi %d\n", spi);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ if (!(*stack_mask & (1ull << spi)))
+ return 0;
+ *stack_mask &= ~(1ull << spi);
+ *reg_mask |= sreg;
+ } else if (class == BPF_JMP || class == BPF_JMP32) {
+ if (opcode == BPF_CALL) {
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ return -ENOTSUPP;
+ /* regular helper call sets R0 */
+ *reg_mask &= ~1;
+ if (*reg_mask & 0x3f) {
+ /* if backtracing was looking for registers R1-R5
+ * they should have been found already.
+ */
+ verbose(env, "BUG regs %x\n", *reg_mask);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ } else if (opcode == BPF_EXIT) {
+ return -ENOTSUPP;
+ }
+ } else if (class == BPF_LD) {
+ if (!(*reg_mask & dreg))
+ return 0;
+ *reg_mask &= ~dreg;
+ /* It's ld_imm64 or ld_abs or ld_ind.
+ * For ld_imm64 no further tracking of precision
+ * into parent is necessary
+ */
+ if (mode == BPF_IND || mode == BPF_ABS)
+ /* to be analyzed */
+ return -ENOTSUPP;
+ } else if (class == BPF_ST) {
+ if (*reg_mask & dreg)
+ /* likely pointer subtraction */
+ return -ENOTSUPP;
+ }
+ return 0;
+}
+
+/* the scalar precision tracking algorithm:
+ * . at the start all registers have precise=false.
+ * . scalar ranges are tracked as normal through alu and jmp insns.
+ * . once precise value of the scalar register is used in:
+ * . ptr + scalar alu
+ * . if (scalar cond K|scalar)
+ * . helper_call(.., scalar, ...) where ARG_CONST is expected
+ * backtrack through the verifier states and mark all registers and
+ * stack slots with spilled constants that these scalar regisers
+ * should be precise.
+ * . during state pruning two registers (or spilled stack slots)
+ * are equivalent if both are not precise.
+ *
+ * Note the verifier cannot simply walk register parentage chain,
+ * since many different registers and stack slots could have been
+ * used to compute single precise scalar.
+ *
+ * The approach of starting with precise=true for all registers and then
+ * backtrack to mark a register as not precise when the verifier detects
+ * that program doesn't care about specific value (e.g., when helper
+ * takes register as ARG_ANYTHING parameter) is not safe.
+ *
+ * It's ok to walk single parentage chain of the verifier states.
+ * It's possible that this backtracking will go all the way till 1st insn.
+ * All other branches will be explored for needing precision later.
+ *
+ * The backtracking needs to deal with cases like:
+ * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0)
+ * r9 -= r8
+ * r5 = r9
+ * if r5 > 0x79f goto pc+7
+ * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff))
+ * r5 += 1
+ * ...
+ * call bpf_perf_event_output#25
+ * where .arg5_type = ARG_CONST_SIZE_OR_ZERO
+ *
+ * and this case:
+ * r6 = 1
+ * call foo // uses callee's r6 inside to compute r0
+ * r0 += r6
+ * if r0 == 0 goto
+ *
+ * to track above reg_mask/stack_mask needs to be independent for each frame.
+ *
+ * Also if parent's curframe > frame where backtracking started,
+ * the verifier need to mark registers in both frames, otherwise callees
+ * may incorrectly prune callers. This is similar to
+ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences")
+ *
+ * For now backtracking falls back into conservative marking.
+ */
+static void mark_all_scalars_precise(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *st)
+{
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ int i, j;
+
+ /* big hammer: mark all scalars precise in this path.
+ * pop_stack may still get !precise scalars.
+ */
+ for (; st; st = st->parent)
+ for (i = 0; i <= st->curframe; i++) {
+ func = st->frame[i];
+ for (j = 0; j < BPF_REG_FP; j++) {
+ reg = &func->regs[j];
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = true;
+ }
+ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) {
+ if (func->stack[j].slot_type[0] != STACK_SPILL)
+ continue;
+ reg = &func->stack[j].spilled_ptr;
+ if (reg->type != SCALAR_VALUE)
+ continue;
+ reg->precise = true;
+ }
+ }
+}
+
+static int __mark_chain_precision(struct bpf_verifier_env *env, int regno,
+ int spi)
+{
+ struct bpf_verifier_state *st = env->cur_state;
+ int first_idx = st->first_insn_idx;
+ int last_idx = env->insn_idx;
+ struct bpf_func_state *func;
+ struct bpf_reg_state *reg;
+ u32 reg_mask = regno >= 0 ? 1u << regno : 0;
+ u64 stack_mask = spi >= 0 ? 1ull << spi : 0;
+ bool skip_first = true;
+ bool new_marks = false;
+ int i, err;
+
+ if (!env->allow_ptr_leaks)
+ /* backtracking is root only for now */
+ return 0;
+
+ func = st->frame[st->curframe];
+ if (regno >= 0) {
+ reg = &func->regs[regno];
+ if (reg->type != SCALAR_VALUE) {
+ WARN_ONCE(1, "backtracing misuse");
+ return -EFAULT;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ else
+ reg_mask = 0;
+ reg->precise = true;
+ }
+
+ while (spi >= 0) {
+ if (func->stack[spi].slot_type[0] != STACK_SPILL) {
+ stack_mask = 0;
+ break;
+ }
+ reg = &func->stack[spi].spilled_ptr;
+ if (reg->type != SCALAR_VALUE) {
+ stack_mask = 0;
+ break;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ else
+ stack_mask = 0;
+ reg->precise = true;
+ break;
+ }
+
+ if (!new_marks)
+ return 0;
+ if (!reg_mask && !stack_mask)
+ return 0;
+ for (;;) {
+ DECLARE_BITMAP(mask, 64);
+ u32 history = st->jmp_history_cnt;
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx);
+ for (i = last_idx;;) {
+ if (skip_first) {
+ err = 0;
+ skip_first = false;
+ } else {
+ err = backtrack_insn(env, i, &reg_mask, &stack_mask);
+ }
+ if (err == -ENOTSUPP) {
+ mark_all_scalars_precise(env, st);
+ return 0;
+ } else if (err) {
+ return err;
+ }
+ if (!reg_mask && !stack_mask)
+ /* Found assignment(s) into tracked register in this state.
+ * Since this state is already marked, just return.
+ * Nothing to be tracked further in the parent state.
+ */
+ return 0;
+ if (i == first_idx)
+ break;
+ i = get_prev_insn_idx(st, i, &history);
+ if (i >= env->prog->len) {
+ /* This can happen if backtracking reached insn 0
+ * and there are still reg_mask or stack_mask
+ * to backtrack.
+ * It means the backtracking missed the spot where
+ * particular register was initialized with a constant.
+ */
+ verbose(env, "BUG backtracking idx %d\n", i);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+ }
+ st = st->parent;
+ if (!st)
+ break;
+
+ new_marks = false;
+ func = st->frame[st->curframe];
+ bitmap_from_u64(mask, reg_mask);
+ for_each_set_bit(i, mask, 32) {
+ reg = &func->regs[i];
+ if (reg->type != SCALAR_VALUE) {
+ reg_mask &= ~(1u << i);
+ continue;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ reg->precise = true;
+ }
+
+ bitmap_from_u64(mask, stack_mask);
+ for_each_set_bit(i, mask, 64) {
+ if (i >= func->allocated_stack / BPF_REG_SIZE) {
+ /* This can happen if backtracking
+ * is propagating stack precision where
+ * caller has larger stack frame
+ * than callee, but backtrack_insn() should
+ * have returned -ENOTSUPP.
+ */
+ verbose(env, "BUG spi %d stack_size %d\n",
+ i, func->allocated_stack);
+ WARN_ONCE(1, "verifier backtracking bug");
+ return -EFAULT;
+ }
+
+ if (func->stack[i].slot_type[0] != STACK_SPILL) {
+ stack_mask &= ~(1ull << i);
+ continue;
+ }
+ reg = &func->stack[i].spilled_ptr;
+ if (reg->type != SCALAR_VALUE) {
+ stack_mask &= ~(1ull << i);
+ continue;
+ }
+ if (!reg->precise)
+ new_marks = true;
+ reg->precise = true;
+ }
+ if (env->log.level & BPF_LOG_LEVEL) {
+ print_verifier_state(env, func);
+ verbose(env, "parent %s regs=%x stack=%llx marks\n",
+ new_marks ? "didn't have" : "already had",
+ reg_mask, stack_mask);
+ }
+
+ if (!reg_mask && !stack_mask)
+ break;
+ if (!new_marks)
+ break;
+
+ last_idx = st->last_insn_idx;
+ first_idx = st->first_insn_idx;
+ }
+ return 0;
+}
+
+static int mark_chain_precision(struct bpf_verifier_env *env, int regno)
+{
+ return __mark_chain_precision(env, regno, -1);
+}
+
+static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+{
+ return __mark_chain_precision(env, -1, spi);
+}
+
static bool is_spillable_regtype(enum bpf_reg_type type)
{
switch (type) {
@@ -1211,6 +1845,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
+ case PTR_TO_XDP_SOCK:
return true;
default:
return false;
@@ -1223,6 +1858,23 @@ static bool register_is_null(struct bpf_reg_state *reg)
return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0);
}
+static bool register_is_const(struct bpf_reg_state *reg)
+{
+ return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
+}
+
+static void save_register_state(struct bpf_func_state *state,
+ int spi, struct bpf_reg_state *reg)
+{
+ int i;
+
+ state->stack[spi].spilled_ptr = *reg;
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ state->stack[spi].slot_type[i] = STACK_SPILL;
+}
+
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -1232,7 +1884,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
{
struct bpf_func_state *cur; /* state of the current function */
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
- enum bpf_reg_type type;
+ u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg;
+ struct bpf_reg_state *reg = NULL;
err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE),
state->acquired_refs, true);
@@ -1249,27 +1902,48 @@ static int check_stack_write(struct bpf_verifier_env *env,
}
cur = env->cur_state->frame[env->cur_state->curframe];
- if (value_regno >= 0 &&
- is_spillable_regtype((type = cur->regs[value_regno].type))) {
-
+ if (value_regno >= 0)
+ reg = &cur->regs[value_regno];
+
+ if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
+ !register_is_null(reg) && env->allow_ptr_leaks) {
+ if (dst_reg != BPF_REG_FP) {
+ /* The backtracking logic can only recognize explicit
+ * stack slot address like [fp - 8]. Other spill of
+ * scalar via different register has to be conervative.
+ * Backtrack from here and mark all registers as precise
+ * that contributed into 'reg' being a constant.
+ */
+ err = mark_chain_precision(env, value_regno);
+ if (err)
+ return err;
+ }
+ save_register_state(state, spi, reg);
+ } else if (reg && is_spillable_regtype(reg->type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
+ verbose_linfo(env, insn_idx, "; ");
verbose(env, "invalid size of register spill\n");
return -EACCES;
}
- if (state != cur && type == PTR_TO_STACK) {
+ if (state != cur && reg->type == PTR_TO_STACK) {
verbose(env, "cannot spill pointers to stack into stack frame of the caller\n");
return -EINVAL;
}
- /* save register state */
- state->stack[spi].spilled_ptr = cur->regs[value_regno];
- state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
+ if (!env->allow_ptr_leaks) {
+ bool sanitize = false;
- for (i = 0; i < BPF_REG_SIZE; i++) {
- if (state->stack[spi].slot_type[i] == STACK_MISC &&
- !env->allow_ptr_leaks) {
+ if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ register_is_const(&state->stack[spi].spilled_ptr))
+ sanitize = true;
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (state->stack[spi].slot_type[i] == STACK_MISC) {
+ sanitize = true;
+ break;
+ }
+ if (sanitize) {
int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
int soff = (-spi - 1) * BPF_REG_SIZE;
@@ -1292,8 +1966,8 @@ static int check_stack_write(struct bpf_verifier_env *env,
}
*poff = soff;
}
- state->stack[spi].slot_type[i] = STACK_SPILL;
}
+ save_register_state(state, spi, reg);
} else {
u8 type = STACK_MISC;
@@ -1316,9 +1990,13 @@ static int check_stack_write(struct bpf_verifier_env *env,
state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
/* when we zero initialize stack slots mark them as such */
- if (value_regno >= 0 &&
- register_is_null(&cur->regs[value_regno]))
+ if (reg && register_is_null(reg)) {
+ /* backtracking doesn't work for STACK_ZERO yet. */
+ err = mark_chain_precision(env, value_regno);
+ if (err)
+ return err;
type = STACK_ZERO;
+ }
/* Mark slots affected by this stack write. */
for (i = 0; i < size; i++)
@@ -1335,6 +2013,7 @@ static int check_stack_read(struct bpf_verifier_env *env,
struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state = vstate->frame[vstate->curframe];
int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+ struct bpf_reg_state *reg;
u8 *stype;
if (reg_state->allocated_stack <= slot) {
@@ -1343,11 +2022,21 @@ static int check_stack_read(struct bpf_verifier_env *env,
return -EACCES;
}
stype = reg_state->stack[spi].slot_type;
+ reg = &reg_state->stack[spi].spilled_ptr;
if (stype[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
- verbose(env, "invalid size of register spill\n");
- return -EACCES;
+ if (reg->type != SCALAR_VALUE) {
+ verbose_linfo(env, env->insn_idx, "; ");
+ verbose(env, "invalid size of register fill\n");
+ return -EACCES;
+ }
+ if (value_regno >= 0) {
+ mark_reg_unknown(env, state->regs, value_regno);
+ state->regs[value_regno].live |= REG_LIVE_WRITTEN;
+ }
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
+ return 0;
}
for (i = 1; i < BPF_REG_SIZE; i++) {
if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
@@ -1358,16 +2047,14 @@ static int check_stack_read(struct bpf_verifier_env *env,
if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] = reg_state->stack[spi].spilled_ptr;
+ state->regs[value_regno] = *reg;
/* mark reg as written since spilled pointer state likely
* has its liveness marks cleared by is_state_visited()
* which resets stack/reg liveness for state transitions
*/
state->regs[value_regno].live |= REG_LIVE_WRITTEN;
}
- mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
- reg_state->stack[spi].spilled_ptr.parent);
- return 0;
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
} else {
int zeros = 0;
@@ -1382,22 +2069,32 @@ static int check_stack_read(struct bpf_verifier_env *env,
off, i, size);
return -EACCES;
}
- mark_reg_read(env, &reg_state->stack[spi].spilled_ptr,
- reg_state->stack[spi].spilled_ptr.parent);
+ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64);
if (value_regno >= 0) {
if (zeros == size) {
/* any size read into register is zero extended,
* so the whole register == const_zero
*/
__mark_reg_const_zero(&state->regs[value_regno]);
+ /* backtracking doesn't support STACK_ZERO yet,
+ * so mark it precise here, so that later
+ * backtracking can stop here.
+ * Backtracking may not need this if this register
+ * doesn't participate in pointer adjustment.
+ * Forward propagation of precise flag is not
+ * necessary either. This mark is only to stop
+ * backtracking. Any register that contributed
+ * to const 0 was marked precise before spill.
+ */
+ state->regs[value_regno].precise = true;
} else {
/* have read misc data from the stack */
mark_reg_unknown(env, state->regs, value_regno);
}
state->regs[value_regno].live |= REG_LIVE_WRITTEN;
}
- return 0;
}
+ return 0;
}
static int check_stack_access(struct bpf_verifier_env *env,
@@ -1412,7 +2109,7 @@ static int check_stack_access(struct bpf_verifier_env *env,
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "variable stack access var_off=%s off=%d size=%d",
+ verbose(env, "variable stack access var_off=%s off=%d size=%d\n",
tn_buf, off, size);
return -EACCES;
}
@@ -1425,6 +2122,28 @@ static int check_stack_access(struct bpf_verifier_env *env,
return 0;
}
+static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
+ int off, int size, enum bpf_access_type type)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map = regs[regno].map_ptr;
+ u32 cap = bpf_map_flags_to_cap(map);
+
+ if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) {
+ verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+
+ if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) {
+ verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* check read/write into map element returned by bpf_map_lookup_elem() */
static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size, bool zero_size_allowed)
@@ -1454,7 +2173,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
* need to try adding each of min_value and max_value to off
* to make sure our theoretical access will be safe.
*/
- if (env->log.level)
+ if (env->log.level & BPF_LOG_LEVEL)
print_verifier_state(env, state);
/* The minimum value is only important with signed
@@ -1541,6 +2260,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
env->seen_direct_write = true;
return true;
+
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+ if (t == BPF_WRITE)
+ env->seen_direct_write = true;
+
+ return true;
+
default:
return false;
}
@@ -1667,6 +2393,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
case PTR_TO_TCP_SOCK:
valid = bpf_tcp_sock_is_valid_access(off, size, t, &info);
break;
+ case PTR_TO_XDP_SOCK:
+ valid = bpf_xdp_sock_is_valid_access(off, size, t, &info);
+ break;
default:
valid = false;
}
@@ -1831,6 +2560,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
case PTR_TO_TCP_SOCK:
pointer_desc = "tcp_sock ";
break;
+ case PTR_TO_XDP_SOCK:
+ pointer_desc = "xdp_sock ";
+ break;
default:
break;
}
@@ -1955,6 +2687,32 @@ static int check_ctx_reg(struct bpf_verifier_env *env,
return 0;
}
+static int check_tp_buffer_access(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int regno, int off, int size)
+{
+ if (off < 0) {
+ verbose(env,
+ "R%d invalid tracepoint buffer access: off=%d, size=%d",
+ regno, off, size);
+ return -EACCES;
+ }
+ if (!tnum_is_const(reg->var_off) || reg->var_off.value) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env,
+ "R%d invalid variable buffer offset: off=%d, var_off=%s",
+ regno, off, tn_buf);
+ return -EACCES;
+ }
+ if (off + size > env->prog->aux->max_tp_access)
+ env->prog->aux->max_tp_access = off + size;
+
+ return 0;
+}
+
+
/* truncate register to smaller size (in bytes)
* must be called with size < BPF_REG_SIZE
*/
@@ -2011,7 +2769,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
verbose(env, "R%d leaks addr into map\n", value_regno);
return -EACCES;
}
-
+ err = check_map_access_type(env, regno, off, size, t);
+ if (err)
+ return err;
err = check_map_access(env, regno, off, size, false);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
@@ -2042,6 +2802,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
value_regno);
if (reg_type_may_be_null(reg_type))
regs[value_regno].id = ++env->id_gen;
+ /* A load of ctx field could have different
+ * actual load size with the one encoded in the
+ * insn. When the dst is PTR, it is for sure not
+ * a sub-register.
+ */
+ regs[value_regno].subreg_def = DEF_NOT_SUBREG;
}
regs[value_regno].type = reg_type;
}
@@ -2097,6 +2863,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_sock_access(env, insn_idx, regno, off, size, t);
if (!err && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
+ } else if (reg->type == PTR_TO_TP_BUFFER) {
+ err = check_tp_buffer_access(env, reg, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown(env, regs, value_regno);
} else {
verbose(env, "R%d invalid mem access '%s'\n", regno,
reg_type_str[reg->type]);
@@ -2157,6 +2927,29 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
BPF_SIZE(insn->code), BPF_WRITE, -1, true);
}
+static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno,
+ int off, int access_size,
+ bool zero_size_allowed)
+{
+ struct bpf_reg_state *reg = reg_state(env, regno);
+
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+ if (tnum_is_const(reg->var_off)) {
+ verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ } else {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n",
+ regno, tn_buf, access_size);
+ }
+ return -EACCES;
+ }
+ return 0;
+}
+
/* when register 'regno' is passed into function that will read 'access_size'
* bytes from that pointer, make sure that it's within stack boundary
* and all elements of stack are initialized.
@@ -2169,7 +2962,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
- int off, i, slot, spi;
+ int err, min_off, max_off, i, j, slot, spi;
if (reg->type != PTR_TO_STACK) {
/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -2183,21 +2976,57 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
- /* Only allow fixed-offset stack reads */
- if (!tnum_is_const(reg->var_off)) {
- char tn_buf[48];
+ if (tnum_is_const(reg->var_off)) {
+ min_off = max_off = reg->var_off.value + reg->off;
+ err = __check_stack_boundary(env, regno, min_off, access_size,
+ zero_size_allowed);
+ if (err)
+ return err;
+ } else {
+ /* Variable offset is prohibited for unprivileged mode for
+ * simplicity since it requires corresponding support in
+ * Spectre masking for stack ALU.
+ * See also retrieve_ptr_limit().
+ */
+ if (!env->allow_ptr_leaks) {
+ char tn_buf[48];
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid variable stack read R%d var_off=%s\n",
- regno, tn_buf);
- return -EACCES;
- }
- off = reg->off + reg->var_off.value;
- if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
- access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
- verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
- regno, off, access_size);
- return -EACCES;
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n",
+ regno, tn_buf);
+ return -EACCES;
+ }
+ /* Only initialized buffer on stack is allowed to be accessed
+ * with variable offset. With uninitialized buffer it's hard to
+ * guarantee that whole memory is marked as initialized on
+ * helper return since specific bounds are unknown what may
+ * cause uninitialized stack leaking.
+ */
+ if (meta && meta->raw_mode)
+ meta = NULL;
+
+ if (reg->smax_value >= BPF_MAX_VAR_OFF ||
+ reg->smax_value <= -BPF_MAX_VAR_OFF) {
+ verbose(env, "R%d unbounded indirect variable offset stack access\n",
+ regno);
+ return -EACCES;
+ }
+ min_off = reg->smin_value + reg->off;
+ max_off = reg->smax_value + reg->off;
+ err = __check_stack_boundary(env, regno, min_off, access_size,
+ zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d min value is outside of stack bound\n",
+ regno);
+ return err;
+ }
+ err = __check_stack_boundary(env, regno, max_off, access_size,
+ zero_size_allowed);
+ if (err) {
+ verbose(env, "R%d max value is outside of stack bound\n",
+ regno);
+ return err;
+ }
}
if (meta && meta->raw_mode) {
@@ -2206,10 +3035,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return 0;
}
- for (i = 0; i < access_size; i++) {
+ for (i = min_off; i < max_off + access_size; i++) {
u8 *stype;
- slot = -(off + i) - 1;
+ slot = -i - 1;
spi = slot / BPF_REG_SIZE;
if (state->allocated_stack <= slot)
goto err;
@@ -2221,18 +3050,35 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
*stype = STACK_MISC;
goto mark;
}
+ if (state->stack[spi].slot_type[0] == STACK_SPILL &&
+ state->stack[spi].spilled_ptr.type == SCALAR_VALUE) {
+ __mark_reg_unknown(&state->stack[spi].spilled_ptr);
+ for (j = 0; j < BPF_REG_SIZE; j++)
+ state->stack[spi].slot_type[j] = STACK_MISC;
+ goto mark;
+ }
+
err:
- verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
- off, i, access_size);
+ if (tnum_is_const(reg->var_off)) {
+ verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
+ min_off, i - min_off, access_size);
+ } else {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n",
+ tn_buf, i - min_off, access_size);
+ }
return -EACCES;
mark:
/* reading any byte out of 8-byte 'spill_slot' will cause
* the whole slot to be marked as 'read'
*/
mark_reg_read(env, &state->stack[spi].spilled_ptr,
- state->stack[spi].spilled_ptr.parent);
+ state->stack[spi].spilled_ptr.parent,
+ REG_LIVE_READ64);
}
- return update_stack_depth(env, state, off);
+ return update_stack_depth(env, state, min_off);
}
static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
@@ -2247,6 +3093,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_packet_access(env, regno, reg->off, access_size,
zero_size_allowed);
case PTR_TO_MAP_VALUE:
+ if (check_map_access_type(env, regno, reg->off, access_size,
+ meta && meta->raw_mode ? BPF_WRITE :
+ BPF_READ))
+ return -EACCES;
return check_map_access(env, regno, reg->off, access_size,
zero_size_allowed);
default: /* scalar_value|ptr_to_stack or invalid ptr */
@@ -2353,6 +3203,22 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
+static bool arg_type_is_int_ptr(enum bpf_arg_type type)
+{
+ return type == ARG_PTR_TO_INT ||
+ type == ARG_PTR_TO_LONG;
+}
+
+static int int_ptr_type_to_size(enum bpf_arg_type type)
+{
+ if (type == ARG_PTR_TO_INT)
+ return sizeof(u32);
+ else if (type == ARG_PTR_TO_LONG)
+ return sizeof(u64);
+
+ return -EINVAL;
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
@@ -2385,10 +3251,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE ||
- arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+ arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
+ arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
expected_type = PTR_TO_STACK;
- if (!type_is_pkt_pointer(type) && type != PTR_TO_MAP_VALUE &&
- type != expected_type)
+ if (register_is_null(reg) &&
+ arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL)
+ /* final test in check_stack_boundary() */;
+ else if (!type_is_pkt_pointer(type) &&
+ type != PTR_TO_MAP_VALUE &&
+ type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -2420,6 +3291,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
}
meta->ref_obj_id = reg->ref_obj_id;
}
+ } else if (arg_type == ARG_PTR_TO_SOCKET) {
+ expected_type = PTR_TO_SOCKET;
+ if (type != expected_type)
+ goto err_type;
} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
if (meta->func_id == BPF_FUNC_spin_lock) {
if (process_spin_lock(env, regno, true))
@@ -2445,6 +3320,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
+ } else if (arg_type_is_int_ptr(arg_type)) {
+ expected_type = PTR_TO_STACK;
+ if (!type_is_pkt_pointer(type) &&
+ type != PTR_TO_MAP_VALUE &&
+ type != expected_type)
+ goto err_type;
} else {
verbose(env, "unsupported arg_type %d\n", arg_type);
return -EFAULT;
@@ -2471,6 +3352,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
meta->map_ptr->key_size, false,
NULL);
} else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
+ (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
+ !register_is_null(reg)) ||
arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
/* bpf_map_xxx(..., map_ptr, ..., value) call:
* check [value, value + map->value_size) validity
@@ -2526,6 +3409,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
err = check_helper_mem_access(env, regno - 1,
reg->umax_value,
zero_size_allowed, meta);
+ if (!err)
+ err = mark_chain_precision(env, regno);
+ } else if (arg_type_is_int_ptr(arg_type)) {
+ int size = int_ptr_type_to_size(arg_type);
+
+ err = check_helper_mem_access(env, regno, size, false, meta);
+ if (err)
+ return err;
+ err = check_ptr_alignment(env, reg, 0, size, true);
}
return err;
@@ -2567,22 +3459,23 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (func_id != BPF_FUNC_get_local_storage)
goto error;
break;
- /* devmap returns a pointer to a live net_device ifindex that we cannot
- * allow to be modified from bpf side. So do not allow lookup elements
- * for now.
- */
case BPF_MAP_TYPE_DEVMAP:
- if (func_id != BPF_FUNC_redirect_map)
+ if (func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
/* Restrict bpf side of cpumap and xskmap, open when use-cases
* appear.
*/
case BPF_MAP_TYPE_CPUMAP:
- case BPF_MAP_TYPE_XSKMAP:
if (func_id != BPF_FUNC_redirect_map)
goto error;
break;
+ case BPF_MAP_TYPE_XSKMAP:
+ if (func_id != BPF_FUNC_redirect_map &&
+ func_id != BPF_FUNC_map_lookup_elem)
+ goto error;
+ break;
case BPF_MAP_TYPE_ARRAY_OF_MAPS:
case BPF_MAP_TYPE_HASH_OF_MAPS:
if (func_id != BPF_FUNC_map_lookup_elem)
@@ -2613,6 +3506,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
func_id != BPF_FUNC_map_push_elem)
goto error;
break;
+ case BPF_MAP_TYPE_SK_STORAGE:
+ if (func_id != BPF_FUNC_sk_storage_get &&
+ func_id != BPF_FUNC_sk_storage_delete)
+ goto error;
+ break;
default:
break;
}
@@ -2676,6 +3574,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_STACK)
goto error;
break;
+ case BPF_FUNC_sk_storage_get:
+ case BPF_FUNC_sk_storage_delete:
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
+ goto error;
+ break;
default:
break;
}
@@ -2905,7 +3808,7 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
/* and go analyze first insn of the callee */
*insn_idx = target_insn;
- if (env->log.level) {
+ if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
print_verifier_state(env, caller);
verbose(env, "callee:\n");
@@ -2945,7 +3848,7 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return err;
*insn_idx = callee->callsite + 1;
- if (env->log.level) {
+ if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
print_verifier_state(env, callee);
verbose(env, "to caller at %d:\n", *insn_idx);
@@ -2979,6 +3882,7 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
int func_id, int insn_idx)
{
struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+ struct bpf_map *map = meta->map_ptr;
if (func_id != BPF_FUNC_tail_call &&
func_id != BPF_FUNC_map_lookup_elem &&
@@ -2989,11 +3893,24 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
func_id != BPF_FUNC_map_peek_elem)
return 0;
- if (meta->map_ptr == NULL) {
+ if (map == NULL) {
verbose(env, "kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
+ /* In case of read-only, some additional restrictions
+ * need to be applied in order to prevent altering the
+ * state of the map from program side.
+ */
+ if ((map->map_flags & BPF_F_RDONLY_PROG) &&
+ (func_id == BPF_FUNC_map_delete_elem ||
+ func_id == BPF_FUNC_map_update_elem ||
+ func_id == BPF_FUNC_map_push_elem ||
+ func_id == BPF_FUNC_map_pop_elem)) {
+ verbose(env, "write into map forbidden\n");
+ return -EACCES;
+ }
+
if (!BPF_MAP_PTR(aux->map_state))
bpf_map_ptr_store(aux, meta->map_ptr,
meta->map_ptr->unpriv_array);
@@ -3126,6 +4043,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
}
+ /* helper call returns 64-bit value. */
+ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
/* update return register (already marked as written above) */
if (fn->ret_type == RET_INTEGER) {
/* sets type to SCALAR_VALUE */
@@ -3157,19 +4077,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
} else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
- if (is_acquire_function(func_id)) {
- int id = acquire_reference_state(env, insn_idx);
-
- if (id < 0)
- return id;
- /* For mark_ptr_or_null_reg() */
- regs[BPF_REG_0].id = id;
- /* For release_reference() */
- regs[BPF_REG_0].ref_obj_id = id;
- } else {
- /* For mark_ptr_or_null_reg() */
- regs[BPF_REG_0].id = ++env->id_gen;
- }
+ regs[BPF_REG_0].id = ++env->id_gen;
+ } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
+ regs[BPF_REG_0].id = ++env->id_gen;
} else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
mark_reg_known_zero(env, regs, BPF_REG_0);
regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
@@ -3180,9 +4092,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
- if (is_ptr_cast_function(func_id))
+ if (is_ptr_cast_function(func_id)) {
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
+ } else if (is_acquire_function(func_id)) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ /* For mark_ptr_or_null_reg() */
+ regs[BPF_REG_0].id = id;
+ /* For release_reference() */
+ regs[BPF_REG_0].ref_obj_id = id;
+ }
do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
@@ -3282,6 +4204,9 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
switch (ptr_reg->type) {
case PTR_TO_STACK:
+ /* Indirect variable offset stack access is prohibited in
+ * unprivileged mode so it's not handled here.
+ */
off = ptr_reg->off + ptr_reg->var_off.value;
if (mask_to_left)
*ptr_limit = MAX_BPF_STACK + off;
@@ -3441,6 +4366,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
+ case PTR_TO_XDP_SOCK:
verbose(env, "R%d pointer arithmetic on %s prohibited\n",
dst, reg_type_str[ptr_reg->type]);
return -EACCES;
@@ -3918,6 +4844,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
u8 opcode = BPF_OP(insn->code);
+ int err;
dst_reg = &regs[insn->dst_reg];
src_reg = NULL;
@@ -3944,11 +4871,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* This is legal, but we have to reverse our
* src/dest handling in computing the range
*/
+ err = mark_chain_precision(env, insn->dst_reg);
+ if (err)
+ return err;
return adjust_ptr_min_max_vals(env, insn,
src_reg, dst_reg);
}
} else if (ptr_reg) {
/* pointer += scalar */
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
return adjust_ptr_min_max_vals(env, insn,
dst_reg, src_reg);
}
@@ -4052,6 +4985,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
*dst_reg = *src_reg;
dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = DEF_NOT_SUBREG;
} else {
/* R1 = (u32) R2 */
if (is_pointer_value(env, insn->src_reg)) {
@@ -4062,6 +4996,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
} else if (src_reg->type == SCALAR_VALUE) {
*dst_reg = *src_reg;
dst_reg->live |= REG_LIVE_WRITTEN;
+ dst_reg->subreg_def = env->insn_idx + 1;
} else {
mark_reg_unknown(env, regs,
insn->dst_reg);
@@ -4678,6 +5613,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
if (reg->map_ptr->inner_map_meta) {
reg->type = CONST_PTR_TO_MAP;
reg->map_ptr = reg->map_ptr->inner_map_meta;
+ } else if (reg->map_ptr->map_type ==
+ BPF_MAP_TYPE_XSKMAP) {
+ reg->type = PTR_TO_XDP_SOCK;
} else {
reg->type = PTR_TO_MAP_VALUE;
}
@@ -4849,9 +5787,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_verifier_state *this_branch = env->cur_state;
struct bpf_verifier_state *other_branch;
struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs;
- struct bpf_reg_state *dst_reg, *other_branch_regs;
+ struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL;
u8 opcode = BPF_OP(insn->code);
bool is_jmp32;
+ int pred = -1;
int err;
/* Only conditional jumps are expected to reach here. */
@@ -4876,6 +5815,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
insn->src_reg);
return -EACCES;
}
+ src_reg = &regs[insn->src_reg];
} else {
if (insn->src_reg != BPF_REG_0) {
verbose(env, "BPF_JMP/JMP32 uses reserved fields\n");
@@ -4891,20 +5831,29 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
dst_reg = &regs[insn->dst_reg];
is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32;
- if (BPF_SRC(insn->code) == BPF_K) {
- int pred = is_branch_taken(dst_reg, insn->imm, opcode,
- is_jmp32);
-
- if (pred == 1) {
- /* only follow the goto, ignore fall-through */
- *insn_idx += insn->off;
- return 0;
- } else if (pred == 0) {
- /* only follow fall-through branch, since
- * that's where the program will go
- */
- return 0;
- }
+ if (BPF_SRC(insn->code) == BPF_K)
+ pred = is_branch_taken(dst_reg, insn->imm,
+ opcode, is_jmp32);
+ else if (src_reg->type == SCALAR_VALUE &&
+ tnum_is_const(src_reg->var_off))
+ pred = is_branch_taken(dst_reg, src_reg->var_off.value,
+ opcode, is_jmp32);
+ if (pred >= 0) {
+ err = mark_chain_precision(env, insn->dst_reg);
+ if (BPF_SRC(insn->code) == BPF_X && !err)
+ err = mark_chain_precision(env, insn->src_reg);
+ if (err)
+ return err;
+ }
+ if (pred == 1) {
+ /* only follow the goto, ignore fall-through */
+ *insn_idx += insn->off;
+ return 0;
+ } else if (pred == 0) {
+ /* only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
}
other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
@@ -4982,23 +5931,17 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
insn->dst_reg);
return -EACCES;
}
- if (env->log.level)
+ if (env->log.level & BPF_LOG_LEVEL)
print_verifier_state(env, this_branch->frame[this_branch->curframe]);
return 0;
}
-/* return the map pointer stored inside BPF_LD_IMM64 instruction */
-static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
-{
- u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
-
- return (struct bpf_map *) (unsigned long) imm64;
-}
-
/* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
+ struct bpf_insn_aux_data *aux = cur_aux(env);
struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map;
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -5022,11 +5965,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
- /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
- BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+ map = env->used_maps[aux->map_index];
+ mark_reg_known_zero(env, regs, insn->dst_reg);
+ regs[insn->dst_reg].map_ptr = map;
+
+ if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].off = aux->map_off;
+ if (map_value_has_spin_lock(map))
+ regs[insn->dst_reg].id = ++env->id_gen;
+ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ } else {
+ verbose(env, "bpf verifier is misconfigured\n");
+ return -EINVAL;
+ }
- regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
- regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
return 0;
}
@@ -5136,20 +6090,32 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
* Already marked as written above.
*/
mark_reg_unknown(env, regs, BPF_REG_0);
+ /* ld_abs load up to 32-bit skb data. */
+ regs[BPF_REG_0].subreg_def = env->insn_idx + 1;
return 0;
}
static int check_return_code(struct bpf_verifier_env *env)
{
+ struct tnum enforce_attach_type_range = tnum_unknown;
struct bpf_reg_state *reg;
struct tnum range = tnum_range(0, 1);
switch (env->prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+ if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
+ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
+ range = tnum_range(1, 1);
case BPF_PROG_TYPE_CGROUP_SKB:
+ if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) {
+ range = tnum_range(0, 3);
+ enforce_attach_type_range = tnum_range(2, 3);
+ }
case BPF_PROG_TYPE_CGROUP_SOCK:
- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
case BPF_PROG_TYPE_SOCK_OPS:
case BPF_PROG_TYPE_CGROUP_DEVICE:
+ case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_CGROUP_SOCKOPT:
break;
default:
return 0;
@@ -5163,18 +6129,23 @@ static int check_return_code(struct bpf_verifier_env *env)
}
if (!tnum_in(range, reg->var_off)) {
+ char tn_buf[48];
+
verbose(env, "At program exit the register R0 ");
if (!tnum_is_unknown(reg->var_off)) {
- char tn_buf[48];
-
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
verbose(env, "has value %s", tn_buf);
} else {
verbose(env, "has unknown scalar value");
}
- verbose(env, " should have been 0 or 1\n");
+ tnum_strn(tn_buf, sizeof(tn_buf), range);
+ verbose(env, " should have been in %s\n", tn_buf);
return -EINVAL;
}
+
+ if (!tnum_is_unknown(enforce_attach_type_range) &&
+ tnum_in(enforce_attach_type_range, reg->var_off))
+ env->prog->enforce_expected_attach_type = 1;
return 0;
}
@@ -5218,19 +6189,37 @@ enum {
BRANCH = 2,
};
-#define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L)
+static u32 state_htab_size(struct bpf_verifier_env *env)
+{
+ return env->prog->len;
+}
+
+static struct bpf_verifier_state_list **explored_state(
+ struct bpf_verifier_env *env,
+ int idx)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_func_state *state = cur->frame[cur->curframe];
+
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+}
-static int *insn_stack; /* stack of insns to process */
-static int cur_stack; /* current stack index */
-static int *insn_state;
+static void init_explored_state(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].prune_point = true;
+}
/* t, w, e - match pseudo-code above:
* t - index of current instruction
* w - next instruction
* e - edge
*/
-static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
+static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
+ bool loop_ok)
{
+ int *insn_stack = env->cfg.insn_stack;
+ int *insn_state = env->cfg.insn_state;
+
if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
return 0;
@@ -5245,17 +6234,19 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
if (e == BRANCH)
/* mark branch target for state pruning */
- env->explored_states[w] = STATE_LIST_MARK;
+ init_explored_state(env, w);
if (insn_state[w] == 0) {
/* tree-edge */
insn_state[t] = DISCOVERED | e;
insn_state[w] = DISCOVERED;
- if (cur_stack >= env->prog->len)
+ if (env->cfg.cur_stack >= env->prog->len)
return -E2BIG;
- insn_stack[cur_stack++] = w;
+ insn_stack[env->cfg.cur_stack++] = w;
return 1;
} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ if (loop_ok && env->allow_ptr_leaks)
+ return 0;
verbose_linfo(env, t, "%d: ", t);
verbose_linfo(env, w, "%d: ", w);
verbose(env, "back-edge from insn %d to %d\n", t, w);
@@ -5277,27 +6268,28 @@ static int check_cfg(struct bpf_verifier_env *env)
{
struct bpf_insn *insns = env->prog->insnsi;
int insn_cnt = env->prog->len;
+ int *insn_stack, *insn_state;
int ret = 0;
int i, t;
- insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_state)
return -ENOMEM;
- insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ insn_stack = env->cfg.insn_stack = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
if (!insn_stack) {
- kfree(insn_state);
+ kvfree(insn_state);
return -ENOMEM;
}
insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
insn_stack[0] = 0; /* 0 is the first instruction */
- cur_stack = 1;
+ env->cfg.cur_stack = 1;
peek_stack:
- if (cur_stack == 0)
+ if (env->cfg.cur_stack == 0)
goto check_state;
- t = insn_stack[cur_stack - 1];
+ t = insn_stack[env->cfg.cur_stack - 1];
if (BPF_CLASS(insns[t].code) == BPF_JMP ||
BPF_CLASS(insns[t].code) == BPF_JMP32) {
@@ -5306,16 +6298,17 @@ peek_stack:
if (opcode == BPF_EXIT) {
goto mark_explored;
} else if (opcode == BPF_CALL) {
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
if (t + 1 < insn_cnt)
- env->explored_states[t + 1] = STATE_LIST_MARK;
+ init_explored_state(env, t + 1);
if (insns[t].src_reg == BPF_PSEUDO_CALL) {
- env->explored_states[t] = STATE_LIST_MARK;
- ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env);
+ init_explored_state(env, t);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
+ env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
@@ -5328,26 +6321,31 @@ peek_stack:
}
/* unconditional jump with single edge */
ret = push_insn(t, t + insns[t].off + 1,
- FALLTHROUGH, env);
+ FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
+ /* unconditional jmp is not a good pruning point,
+ * but it's marked, since backtracking needs
+ * to record jmp history in is_state_visited().
+ */
+ init_explored_state(env, t + insns[t].off + 1);
/* tell verifier to check for equivalent states
* after every call and jump
*/
if (t + 1 < insn_cnt)
- env->explored_states[t + 1] = STATE_LIST_MARK;
+ init_explored_state(env, t + 1);
} else {
/* conditional jump with two edges */
- env->explored_states[t] = STATE_LIST_MARK;
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ init_explored_state(env, t);
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
goto err_free;
- ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
@@ -5357,7 +6355,7 @@ peek_stack:
/* all other non-branch instructions with single
* fall-through edge
*/
- ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
if (ret == 1)
goto peek_stack;
else if (ret < 0)
@@ -5366,7 +6364,7 @@ peek_stack:
mark_explored:
insn_state[t] = EXPLORED;
- if (cur_stack-- <= 0) {
+ if (env->cfg.cur_stack-- <= 0) {
verbose(env, "pop stack internal bug\n");
ret = -EFAULT;
goto err_free;
@@ -5384,8 +6382,9 @@ check_state:
ret = 0; /* cfg looks good */
err_free:
- kfree(insn_state);
- kfree(insn_stack);
+ kvfree(insn_state);
+ kvfree(insn_stack);
+ env->cfg.insn_state = env->cfg.insn_stack = NULL;
return ret;
}
@@ -5787,12 +6786,12 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn,
struct bpf_verifier_state_list *sl;
int i;
- sl = env->explored_states[insn];
- if (!sl)
- return;
-
- while (sl != STATE_LIST_MARK) {
- if (sl->state.curframe != cur->curframe)
+ sl = *explored_state(env, insn);
+ while (sl) {
+ if (sl->state.branches)
+ goto next;
+ if (sl->state.insn_idx != insn ||
+ sl->state.curframe != cur->curframe)
goto next;
for (i = 0; i <= cur->curframe; i++)
if (sl->state.frame[i]->callsite != cur->frame[i]->callsite)
@@ -5832,6 +6831,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
switch (rold->type) {
case SCALAR_VALUE:
if (rcur->type == SCALAR_VALUE) {
+ if (!rold->precise && !rcur->precise)
+ return true;
/* new val must satisfy old val knowledge */
return range_within(rold, rcur) &&
tnum_in(rold->var_off, rcur->var_off);
@@ -5904,6 +6905,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
+ case PTR_TO_XDP_SOCK:
/* Only valid matches are exact, which memcmp() above
* would have accepted
*/
@@ -6074,6 +7076,35 @@ static bool states_equal(struct bpf_verifier_env *env,
return true;
}
+/* Return 0 if no propagation happened. Return negative error code if error
+ * happened. Otherwise, return the propagated bit.
+ */
+static int propagate_liveness_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg,
+ struct bpf_reg_state *parent_reg)
+{
+ u8 parent_flag = parent_reg->live & REG_LIVE_READ;
+ u8 flag = reg->live & REG_LIVE_READ;
+ int err;
+
+ /* When comes here, read flags of PARENT_REG or REG could be any of
+ * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need
+ * of propagation if PARENT_REG has strongest REG_LIVE_READ64.
+ */
+ if (parent_flag == REG_LIVE_READ64 ||
+ /* Or if there is no read flag from REG. */
+ !flag ||
+ /* Or if the read flag from REG is the same as PARENT_REG. */
+ parent_flag == flag)
+ return 0;
+
+ err = mark_reg_read(env, reg, parent_reg, flag);
+ if (err)
+ return err;
+
+ return flag;
+}
+
/* A write screens off any subsequent reads; but write marks come from the
* straight-line code between a state and its parent. When we arrive at an
* equivalent state (jump target or such) we didn't arrive by the straight-line
@@ -6085,8 +7116,9 @@ static int propagate_liveness(struct bpf_verifier_env *env,
const struct bpf_verifier_state *vstate,
struct bpf_verifier_state *vparent)
{
- int i, frame, err = 0;
+ struct bpf_reg_state *state_reg, *parent_reg;
struct bpf_func_state *state, *parent;
+ int i, frame, err = 0;
if (vparent->curframe != vstate->curframe) {
WARN(1, "propagate_live: parent frame %d current frame %d\n",
@@ -6096,53 +7128,155 @@ static int propagate_liveness(struct bpf_verifier_env *env,
/* Propagate read liveness of registers... */
BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);
for (frame = 0; frame <= vstate->curframe; frame++) {
+ parent = vparent->frame[frame];
+ state = vstate->frame[frame];
+ parent_reg = parent->regs;
+ state_reg = state->regs;
/* We don't need to worry about FP liveness, it's read-only */
for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) {
- if (vparent->frame[frame]->regs[i].live & REG_LIVE_READ)
- continue;
- if (vstate->frame[frame]->regs[i].live & REG_LIVE_READ) {
- err = mark_reg_read(env, &vstate->frame[frame]->regs[i],
- &vparent->frame[frame]->regs[i]);
- if (err)
- return err;
- }
+ err = propagate_liveness_reg(env, &state_reg[i],
+ &parent_reg[i]);
+ if (err < 0)
+ return err;
+ if (err == REG_LIVE_READ64)
+ mark_insn_zext(env, &parent_reg[i]);
}
- }
- /* ... and stack slots */
- for (frame = 0; frame <= vstate->curframe; frame++) {
- state = vstate->frame[frame];
- parent = vparent->frame[frame];
+ /* Propagate stack slots. */
for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
i < parent->allocated_stack / BPF_REG_SIZE; i++) {
- if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
- continue;
- if (state->stack[i].spilled_ptr.live & REG_LIVE_READ)
- mark_reg_read(env, &state->stack[i].spilled_ptr,
- &parent->stack[i].spilled_ptr);
+ parent_reg = &parent->stack[i].spilled_ptr;
+ state_reg = &state->stack[i].spilled_ptr;
+ err = propagate_liveness_reg(env, state_reg,
+ parent_reg);
+ if (err < 0)
+ return err;
}
}
- return err;
+ return 0;
+}
+
+/* find precise scalars in the previous equivalent state and
+ * propagate them into the current state
+ */
+static int propagate_precision(struct bpf_verifier_env *env,
+ const struct bpf_verifier_state *old)
+{
+ struct bpf_reg_state *state_reg;
+ struct bpf_func_state *state;
+ int i, err = 0;
+
+ state = old->frame[old->curframe];
+ state_reg = state->regs;
+ for (i = 0; i < BPF_REG_FP; i++, state_reg++) {
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "propagating r%d\n", i);
+ err = mark_chain_precision(env, i);
+ if (err < 0)
+ return err;
+ }
+
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
+ continue;
+ state_reg = &state->stack[i].spilled_ptr;
+ if (state_reg->type != SCALAR_VALUE ||
+ !state_reg->precise)
+ continue;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "propagating fp%d\n",
+ (-i - 1) * BPF_REG_SIZE);
+ err = mark_chain_precision_stack(env, i);
+ if (err < 0)
+ return err;
+ }
+ return 0;
+}
+
+static bool states_maybe_looping(struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur)
+{
+ struct bpf_func_state *fold, *fcur;
+ int i, fr = cur->curframe;
+
+ if (old->curframe != fr)
+ return false;
+
+ fold = old->frame[fr];
+ fcur = cur->frame[fr];
+ for (i = 0; i < MAX_BPF_REG; i++)
+ if (memcmp(&fold->regs[i], &fcur->regs[i],
+ offsetof(struct bpf_reg_state, parent)))
+ return false;
+ return true;
}
+
static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
- struct bpf_verifier_state_list *sl;
+ struct bpf_verifier_state_list *sl, **pprev;
struct bpf_verifier_state *cur = env->cur_state, *new;
int i, j, err, states_cnt = 0;
+ bool add_new_state = false;
- sl = env->explored_states[insn_idx];
- if (!sl)
+ cur->last_insn_idx = env->prev_insn_idx;
+ if (!env->insn_aux_data[insn_idx].prune_point)
/* this 'insn_idx' instruction wasn't marked, so we will not
* be doing state search here
*/
return 0;
+ /* bpf progs typically have pruning point every 4 instructions
+ * http://vger.kernel.org/bpfconf2019.html#session-1
+ * Do not add new state for future pruning if the verifier hasn't seen
+ * at least 2 jumps and at least 8 instructions.
+ * This heuristics helps decrease 'total_states' and 'peak_states' metric.
+ * In tests that amounts to up to 50% reduction into total verifier
+ * memory consumption and 20% verifier time speedup.
+ */
+ if (env->jmps_processed - env->prev_jmps_processed >= 2 &&
+ env->insn_processed - env->prev_insn_processed >= 8)
+ add_new_state = true;
+
+ pprev = explored_state(env, insn_idx);
+ sl = *pprev;
+
clean_live_states(env, insn_idx, cur);
- while (sl != STATE_LIST_MARK) {
+ while (sl) {
+ states_cnt++;
+ if (sl->state.insn_idx != insn_idx)
+ goto next;
+ if (sl->state.branches) {
+ if (states_maybe_looping(&sl->state, cur) &&
+ states_equal(env, &sl->state, cur)) {
+ verbose_linfo(env, insn_idx, "; ");
+ verbose(env, "infinite loop detected at insn %d\n", insn_idx);
+ return -EINVAL;
+ }
+ /* if the verifier is processing a loop, avoid adding new state
+ * too often, since different loop iterations have distinct
+ * states and may not help future pruning.
+ * This threshold shouldn't be too low to make sure that
+ * a loop with large bound will be rejected quickly.
+ * The most abusive loop will be:
+ * r1 += 1
+ * if r1 < 1000000 goto pc-2
+ * 1M insn_procssed limit / 100 == 10k peak states.
+ * This threshold shouldn't be too high either, since states
+ * at the end of the loop are likely to be useful in pruning.
+ */
+ if (env->jmps_processed - env->prev_jmps_processed < 20 &&
+ env->insn_processed - env->prev_insn_processed < 100)
+ add_new_state = false;
+ goto miss;
+ }
if (states_equal(env, &sl->state, cur)) {
+ sl->hit_cnt++;
/* reached equivalent register/stack state,
* prune the search.
* Registers read by the continuation are read by us.
@@ -6154,27 +7288,87 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* this state and will pop a new one.
*/
err = propagate_liveness(env, &sl->state, cur);
+
+ /* if previous state reached the exit with precision and
+ * current state is equivalent to it (except precsion marks)
+ * the precision needs to be propagated back in
+ * the current state.
+ */
+ err = err ? : push_jmp_history(env, cur);
+ err = err ? : propagate_precision(env, &sl->state);
if (err)
return err;
return 1;
}
- sl = sl->next;
- states_cnt++;
+miss:
+ /* when new state is not going to be added do not increase miss count.
+ * Otherwise several loop iterations will remove the state
+ * recorded earlier. The goal of these heuristics is to have
+ * states from some iterations of the loop (some in the beginning
+ * and some at the end) to help pruning.
+ */
+ if (add_new_state)
+ sl->miss_cnt++;
+ /* heuristic to determine whether this state is beneficial
+ * to keep checking from state equivalence point of view.
+ * Higher numbers increase max_states_per_insn and verification time,
+ * but do not meaningfully decrease insn_processed.
+ */
+ if (sl->miss_cnt > sl->hit_cnt * 3 + 3) {
+ /* the state is unlikely to be useful. Remove it to
+ * speed up verification
+ */
+ *pprev = sl->next;
+ if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) {
+ u32 br = sl->state.branches;
+
+ WARN_ONCE(br,
+ "BUG live_done but branches_to_explore %d\n",
+ br);
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ env->peak_states--;
+ } else {
+ /* cannot free this state, since parentage chain may
+ * walk it later. Add it for free_list instead to
+ * be freed at the end of verification
+ */
+ sl->next = env->free_list;
+ env->free_list = sl;
+ }
+ sl = *pprev;
+ continue;
+ }
+next:
+ pprev = &sl->next;
+ sl = *pprev;
}
+ if (env->max_states_per_insn < states_cnt)
+ env->max_states_per_insn = states_cnt;
+
if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES)
- return 0;
+ return push_jmp_history(env, cur);
+
+ if (!add_new_state)
+ return push_jmp_history(env, cur);
- /* there were no equivalent states, remember current one.
- * technically the current state is not proven to be safe yet,
+ /* There were no equivalent states, remember the current one.
+ * Technically the current state is not proven to be safe yet,
* but it will either reach outer most bpf_exit (which means it's safe)
- * or it will be rejected. Since there are no loops, we won't be
+ * or it will be rejected. When there are no loops the verifier won't be
* seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx)
- * again on the way to bpf_exit
+ * again on the way to bpf_exit.
+ * When looping the sl->state.branches will be > 0 and this state
+ * will not be considered for equivalence until branches == 0.
*/
new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
if (!new_sl)
return -ENOMEM;
+ env->total_states++;
+ env->peak_states++;
+ env->prev_jmps_processed = env->jmps_processed;
+ env->prev_insn_processed = env->insn_processed;
/* add new state to the head of linked list */
new = &new_sl->state;
@@ -6184,8 +7378,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
kfree(new_sl);
return err;
}
- new_sl->next = env->explored_states[insn_idx];
- env->explored_states[insn_idx] = new_sl;
+ new->insn_idx = insn_idx;
+ WARN_ONCE(new->branches != 1,
+ "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx);
+
+ cur->parent = new;
+ cur->first_insn_idx = insn_idx;
+ clear_jmp_history(cur);
+ new_sl->next = *explored_state(env, insn_idx);
+ *explored_state(env, insn_idx) = new_sl;
/* connect new state to parentage chain. Current frame needs all
* registers connected. Only r6 - r9 of the callers are alive (pushed
* to the stack implicitly by JITs) so in callers' frames connect just
@@ -6193,17 +7394,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* the state of the call instruction (with WRITTEN set), and r0 comes
* from callee with its full parentage chain, anyway.
*/
- for (j = 0; j <= cur->curframe; j++)
- for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
- cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
* their parent and current state never has children yet. Only
* explored_states can get read marks.)
*/
- for (i = 0; i < BPF_REG_FP; i++)
- cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE;
+ for (j = 0; j <= cur->curframe; j++) {
+ for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++)
+ cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i];
+ for (i = 0; i < BPF_REG_FP; i++)
+ cur->frame[j]->regs[i].live = REG_LIVE_NONE;
+ }
/* all stack frames are accessible from callee, clear them all */
for (j = 0; j <= cur->curframe; j++) {
@@ -6230,6 +7432,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
case PTR_TO_SOCK_COMMON_OR_NULL:
case PTR_TO_TCP_SOCK:
case PTR_TO_TCP_SOCK_OR_NULL:
+ case PTR_TO_XDP_SOCK:
return false;
default:
return true;
@@ -6259,9 +7462,9 @@ static int do_check(struct bpf_verifier_env *env)
struct bpf_verifier_state *state;
struct bpf_insn *insns = env->prog->insnsi;
struct bpf_reg_state *regs;
- int insn_cnt = env->prog->len, i;
- int insn_processed = 0;
+ int insn_cnt = env->prog->len;
bool do_print_state = false;
+ int prev_insn_idx = -1;
env->prev_linfo = NULL;
@@ -6270,6 +7473,7 @@ static int do_check(struct bpf_verifier_env *env)
return -ENOMEM;
state->curframe = 0;
state->speculative = false;
+ state->branches = 1;
state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL);
if (!state->frame[0]) {
kfree(state);
@@ -6286,6 +7490,7 @@ static int do_check(struct bpf_verifier_env *env)
u8 class;
int err;
+ env->prev_insn_idx = prev_insn_idx;
if (env->insn_idx >= insn_cnt) {
verbose(env, "invalid insn idx %d insn_cnt %d\n",
env->insn_idx, insn_cnt);
@@ -6295,10 +7500,10 @@ static int do_check(struct bpf_verifier_env *env)
insn = &insns[env->insn_idx];
class = BPF_CLASS(insn->code);
- if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
+ if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
verbose(env,
"BPF program is too large. Processed %d insn\n",
- insn_processed);
+ env->insn_processed);
return -E2BIG;
}
@@ -6307,7 +7512,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
- if (env->log.level) {
+ if (env->log.level & BPF_LOG_LEVEL) {
if (do_print_state)
verbose(env, "\nfrom %d to %d%s: safe\n",
env->prev_insn_idx, env->insn_idx,
@@ -6325,8 +7530,9 @@ static int do_check(struct bpf_verifier_env *env)
if (need_resched())
cond_resched();
- if (env->log.level > 1 || (env->log.level && do_print_state)) {
- if (env->log.level > 1)
+ if (env->log.level & BPF_LOG_LEVEL2 ||
+ (env->log.level & BPF_LOG_LEVEL && do_print_state)) {
+ if (env->log.level & BPF_LOG_LEVEL2)
verbose(env, "%d:", env->insn_idx);
else
verbose(env, "\nfrom %d to %d%s:",
@@ -6337,7 +7543,7 @@ static int do_check(struct bpf_verifier_env *env)
do_print_state = false;
}
- if (env->log.level) {
+ if (env->log.level & BPF_LOG_LEVEL) {
const struct bpf_insn_cbs cbs = {
.cb_print = verbose,
.private_data = env,
@@ -6357,6 +7563,7 @@ static int do_check(struct bpf_verifier_env *env)
regs = cur_regs(env);
env->insn_aux_data[env->insn_idx].seen = true;
+ prev_insn_idx = env->insn_idx;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
@@ -6475,6 +7682,7 @@ static int do_check(struct bpf_verifier_env *env)
} else if (class == BPF_JMP || class == BPF_JMP32) {
u8 opcode = BPF_OP(insn->code);
+ env->jmps_processed++;
if (opcode == BPF_CALL) {
if (BPF_SRC(insn->code) != BPF_K ||
insn->off != 0 ||
@@ -6529,7 +7737,6 @@ static int do_check(struct bpf_verifier_env *env)
if (state->curframe) {
/* exit from nested function */
- env->prev_insn_idx = env->insn_idx;
err = prepare_func_exit(env, &env->insn_idx);
if (err)
return err;
@@ -6560,7 +7767,8 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
process_bpf_exit:
- err = pop_stack(env, &env->prev_insn_idx,
+ update_branch_counts(env, env->cur_state);
+ err = pop_stack(env, &prev_insn_idx,
&env->insn_idx);
if (err < 0) {
if (err != -ENOENT)
@@ -6602,16 +7810,6 @@ process_bpf_exit:
env->insn_idx++;
}
- verbose(env, "processed %d insns (limit %d), stack depth ",
- insn_processed, BPF_COMPLEXITY_LIMIT_INSNS);
- for (i = 0; i < env->subprog_cnt; i++) {
- u32 depth = env->subprog_info[i].stack_depth;
-
- verbose(env, "%d", depth);
- if (i + 1 < env->subprog_cnt)
- verbose(env, "+");
- }
- verbose(env, "\n");
env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
return 0;
}
@@ -6709,8 +7907,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
}
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_insn_aux_data *aux;
struct bpf_map *map;
struct fd f;
+ u64 addr;
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
@@ -6719,13 +7919,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (insn->src_reg == 0)
+ if (insn[0].src_reg == 0)
/* valid generic load 64-bit imm */
goto next_insn;
- if (insn[0].src_reg != BPF_PSEUDO_MAP_FD ||
- insn[1].imm != 0) {
- verbose(env, "unrecognized bpf_ld_imm64 insn\n");
+ /* In final convert_pseudo_ld_imm64() step, this is
+ * converted into regular 64-bit imm load insn.
+ */
+ if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
+ insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
+ (insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
+ insn[1].imm != 0)) {
+ verbose(env,
+ "unrecognized bpf_ld_imm64 insn\n");
return -EINVAL;
}
@@ -6743,16 +7949,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
return err;
}
- /* store map pointer inside BPF_LD_IMM64 instruction */
- insn[0].imm = (u32) (unsigned long) map;
- insn[1].imm = ((u64) (unsigned long) map) >> 32;
+ aux = &env->insn_aux_data[i];
+ if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
+ addr = (unsigned long)map;
+ } else {
+ u32 off = insn[1].imm;
+
+ if (off >= BPF_MAX_VAR_OFF) {
+ verbose(env, "direct value offset of %u is not allowed\n", off);
+ fdput(f);
+ return -EINVAL;
+ }
+
+ if (!map->ops->map_direct_value_addr) {
+ verbose(env, "no direct value access support for this map type\n");
+ fdput(f);
+ return -EINVAL;
+ }
+
+ err = map->ops->map_direct_value_addr(map, &addr, off);
+ if (err) {
+ verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
+ map->value_size, off);
+ fdput(f);
+ return err;
+ }
+
+ aux->map_off = off;
+ addr += off;
+ }
+
+ insn[0].imm = (u32)addr;
+ insn[1].imm = addr >> 32;
/* check whether we recorded this map already */
- for (j = 0; j < env->used_map_cnt; j++)
+ for (j = 0; j < env->used_map_cnt; j++) {
if (env->used_maps[j] == map) {
+ aux->map_index = j;
fdput(f);
goto next_insn;
}
+ }
if (env->used_map_cnt >= MAX_USED_MAPS) {
fdput(f);
@@ -6769,6 +8006,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
fdput(f);
return PTR_ERR(map);
}
+
+ aux->map_index = env->used_map_cnt;
env->used_maps[env->used_map_cnt++] = map;
if (bpf_map_is_cgroup_storage(map) &&
@@ -6832,14 +8071,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
* insni[off, off + cnt). Adjust corresponding insn_aux_data by copying
* [0, off) and [off, end) to new locations, so the patched range stays zero
*/
-static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
- u32 off, u32 cnt)
+static int adjust_insn_aux_data(struct bpf_verifier_env *env,
+ struct bpf_prog *new_prog, u32 off, u32 cnt)
{
struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ struct bpf_insn *insn = new_prog->insnsi;
+ u32 prog_len;
int i;
+ /* aux info at OFF always needs adjustment, no matter fast path
+ * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the
+ * original insn at old prog.
+ */
+ old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1);
+
if (cnt == 1)
return 0;
+ prog_len = new_prog->len;
new_data = vzalloc(array_size(prog_len,
sizeof(struct bpf_insn_aux_data)));
if (!new_data)
@@ -6847,8 +8095,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
memcpy(new_data + off + cnt - 1, old_data + off,
sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
- for (i = off; i < off + cnt - 1; i++)
+ for (i = off; i < off + cnt - 1; i++) {
new_data[i].seen = true;
+ new_data[i].zext_dst = insn_has_def32(env, insn + i);
+ }
env->insn_aux_data = new_data;
vfree(old_data);
return 0;
@@ -6874,9 +8124,14 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
struct bpf_prog *new_prog;
new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
- if (!new_prog)
+ if (IS_ERR(new_prog)) {
+ if (PTR_ERR(new_prog) == -ERANGE)
+ verbose(env,
+ "insn %d cannot be patched due to 16-bit range\n",
+ env->insn_aux_data[off].orig_idx);
return NULL;
- if (adjust_insn_aux_data(env, new_prog->len, off, len))
+ }
+ if (adjust_insn_aux_data(env, new_prog, off, len))
return NULL;
adjust_subprog_starts(env, off, len);
return new_prog;
@@ -7140,6 +8395,84 @@ static int opt_remove_nops(struct bpf_verifier_env *env)
return 0;
}
+static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env,
+ const union bpf_attr *attr)
+{
+ struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4];
+ struct bpf_insn_aux_data *aux = env->insn_aux_data;
+ int i, patch_len, delta = 0, len = env->prog->len;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct bpf_prog *new_prog;
+ bool rnd_hi32;
+
+ rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32;
+ zext_patch[1] = BPF_ZEXT_REG(0);
+ rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0);
+ rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32);
+ rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX);
+ for (i = 0; i < len; i++) {
+ int adj_idx = i + delta;
+ struct bpf_insn insn;
+
+ insn = insns[adj_idx];
+ if (!aux[adj_idx].zext_dst) {
+ u8 code, class;
+ u32 imm_rnd;
+
+ if (!rnd_hi32)
+ continue;
+
+ code = insn.code;
+ class = BPF_CLASS(code);
+ if (insn_no_def(&insn))
+ continue;
+
+ /* NOTE: arg "reg" (the fourth one) is only used for
+ * BPF_STX which has been ruled out in above
+ * check, it is safe to pass NULL here.
+ */
+ if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) {
+ if (class == BPF_LD &&
+ BPF_MODE(code) == BPF_IMM)
+ i++;
+ continue;
+ }
+
+ /* ctx load could be transformed into wider load. */
+ if (class == BPF_LDX &&
+ aux[adj_idx].ptr_type == PTR_TO_CTX)
+ continue;
+
+ imm_rnd = get_random_int();
+ rnd_hi32_patch[0] = insn;
+ rnd_hi32_patch[1].imm = imm_rnd;
+ rnd_hi32_patch[3].dst_reg = insn.dst_reg;
+ patch = rnd_hi32_patch;
+ patch_len = 4;
+ goto apply_patch_buffer;
+ }
+
+ if (!bpf_jit_needs_zext())
+ continue;
+
+ zext_patch[0] = insn;
+ zext_patch[1].dst_reg = insn.dst_reg;
+ zext_patch[1].src_reg = insn.dst_reg;
+ patch = zext_patch;
+ patch_len = 2;
+apply_patch_buffer:
+ new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len);
+ if (!new_prog)
+ return -ENOMEM;
+ env->prog = new_prog;
+ insns = new_prog->insnsi;
+ aux = env->insn_aux_data;
+ delta += patch_len - 1;
+ }
+
+ return 0;
+}
+
/* convert load instructions that access fields of a context type into a
* sequence of instructions that access fields of the underlying structure:
* struct __sk_buff -> struct sk_buff
@@ -7238,6 +8571,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
case PTR_TO_TCP_SOCK:
convert_ctx_access = bpf_tcp_sock_convert_ctx_access;
break;
+ case PTR_TO_XDP_SOCK:
+ convert_ctx_access = bpf_xdp_sock_convert_ctx_access;
+ break;
default:
continue;
}
@@ -7296,7 +8632,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
insn->dst_reg,
shift);
insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
- (1 << size * 8) - 1);
+ (1ULL << size * 8) - 1);
}
}
@@ -7413,9 +8749,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn->src_reg != BPF_PSEUDO_CALL)
continue;
subprog = insn->off;
- insn->imm = (u64 (*)(u64, u64, u64, u64, u64))
- func[subprog]->bpf_func -
- __bpf_call_base;
+ insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) -
+ __bpf_call_base;
}
/* we use the aux data to keep a list of the start addresses
@@ -7817,27 +9152,59 @@ static void free_states(struct bpf_verifier_env *env)
struct bpf_verifier_state_list *sl, *sln;
int i;
+ sl = env->free_list;
+ while (sl) {
+ sln = sl->next;
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ sl = sln;
+ }
+
if (!env->explored_states)
return;
- for (i = 0; i < env->prog->len; i++) {
+ for (i = 0; i < state_htab_size(env); i++) {
sl = env->explored_states[i];
- if (sl)
- while (sl != STATE_LIST_MARK) {
- sln = sl->next;
- free_verifier_state(&sl->state, false);
- kfree(sl);
- sl = sln;
- }
+ while (sl) {
+ sln = sl->next;
+ free_verifier_state(&sl->state, false);
+ kfree(sl);
+ sl = sln;
+ }
}
- kfree(env->explored_states);
+ kvfree(env->explored_states);
+}
+
+static void print_verification_stats(struct bpf_verifier_env *env)
+{
+ int i;
+
+ if (env->log.level & BPF_LOG_STATS) {
+ verbose(env, "verification time %lld usec\n",
+ div_u64(env->verification_time, 1000));
+ verbose(env, "stack depth ");
+ for (i = 0; i < env->subprog_cnt; i++) {
+ u32 depth = env->subprog_info[i].stack_depth;
+
+ verbose(env, "%d", depth);
+ if (i + 1 < env->subprog_cnt)
+ verbose(env, "+");
+ }
+ verbose(env, "\n");
+ }
+ verbose(env, "processed %d insns (limit %d) max_states_per_insn %d "
+ "total_states %d peak_states %d mark_read %d\n",
+ env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS,
+ env->max_states_per_insn, env->total_states,
+ env->peak_states, env->longest_mark_read_walk);
}
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
union bpf_attr __user *uattr)
{
+ u64 start_time = ktime_get_ns();
struct bpf_verifier_env *env;
struct bpf_verifier_log *log;
int i, len, ret = -EINVAL;
@@ -7865,9 +9232,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
+ is_priv = capable(CAP_SYS_ADMIN);
/* grab the mutex to protect few globals used by verifier */
- mutex_lock(&bpf_verifier_lock);
+ if (!is_priv)
+ mutex_lock(&bpf_verifier_lock);
if (attr->log_level || attr->log_buf || attr->log_size) {
/* user requested verbose verifier output
@@ -7879,8 +9248,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
ret = -EINVAL;
/* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
- !log->level || !log->ubuf)
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 ||
+ !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK)
goto err_unlock;
}
@@ -7890,7 +9259,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (attr->prog_flags & BPF_F_ANY_ALIGNMENT)
env->strict_alignment = false;
- is_priv = capable(CAP_SYS_ADMIN);
env->allow_ptr_leaks = is_priv;
ret = replace_map_fd_with_map_ptr(env);
@@ -7903,7 +9271,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
goto skip_full_check;
}
- env->explored_states = kcalloc(env->prog->len,
+ env->explored_states = kvcalloc(state_htab_size(env),
sizeof(struct bpf_verifier_state_list *),
GFP_USER);
ret = -ENOMEM;
@@ -7958,9 +9326,21 @@ skip_full_check:
if (ret == 0)
ret = fixup_bpf_calls(env);
+ /* do 32-bit optimization after insn patching has done so those patched
+ * insns could be handled correctly.
+ */
+ if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) {
+ ret = opt_subreg_zext_lo32_rnd_hi32(env, attr);
+ env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret
+ : false;
+ }
+
if (ret == 0)
ret = fixup_call_args(env);
+ env->verification_time = ktime_get_ns() - start_time;
+ print_verification_stats(env);
+
if (log->level && bpf_verifier_log_full(log))
ret = -ENOSPC;
if (log->level && !log->ubuf) {
@@ -8000,7 +9380,8 @@ err_release_maps:
release_maps(env);
*prog = env->prog;
err_unlock:
- mutex_unlock(&bpf_verifier_lock);
+ if (!is_priv)
+ mutex_unlock(&bpf_verifier_lock);
vfree(env->insn_aux_data);
err_free_env:
kfree(env);
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..9bb96ace9fa1 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -17,8 +17,8 @@ struct xsk_map {
static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
{
- int cpu, err = -EINVAL;
struct xsk_map *m;
+ int cpu, err;
u64 cost;
if (!capable(CAP_NET_ADMIN))
@@ -37,13 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *);
cost += sizeof(struct list_head) * num_possible_cpus();
- if (cost >= U32_MAX - PAGE_SIZE)
- goto free_m;
-
- m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
/* Notice returns -EPERM on if map size is larger than memlock limit */
- err = bpf_map_precharge_memlock(m->map.pages);
+ err = bpf_map_charge_init(&m->map.memory, cost);
if (err)
goto free_m;
@@ -51,7 +47,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
m->flush_list = alloc_percpu(struct list_head);
if (!m->flush_list)
- goto free_m;
+ goto free_charge;
for_each_possible_cpu(cpu)
INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu));
@@ -65,6 +61,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
free_percpu:
free_percpu(m->flush_list);
+free_charge:
+ bpf_map_charge_finish(&m->map.memory);
free_m:
kfree(m);
return ERR_PTR(err);
@@ -147,13 +145,18 @@ void __xsk_map_flush(struct bpf_map *map)
list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
xsk_flush(xs);
- __list_del(xs->flush_node.prev, xs->flush_node.next);
- xs->flush_node.prev = NULL;
+ __list_del_clearprev(&xs->flush_node);
}
}
static void *xsk_map_lookup_elem(struct bpf_map *map, void *key)
{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return __xsk_map_lookup_elem(map, *(u32 *)key);
+}
+
+static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key)
+{
return ERR_PTR(-EOPNOTSUPP);
}
@@ -220,6 +223,7 @@ const struct bpf_map_ops xsk_map_ops = {
.map_free = xsk_map_free,
.map_get_next_key = xsk_map_get_next_key,
.map_lookup_elem = xsk_map_lookup_elem,
+ .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only,
.map_update_elem = xsk_map_update_elem,
.map_delete_elem = xsk_map_delete_elem,
.map_check_btf = map_check_no_btf,
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index bfcdae896122..5d7a76bfbbb7 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o
+obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o
-obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
+obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 30e39f3932ad..809e34a3c017 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -28,12 +28,15 @@ extern void __init enable_debug_cgroup(void);
#define TRACE_CGROUP_PATH(type, cgrp, ...) \
do { \
if (trace_cgroup_##type##_enabled()) { \
- spin_lock(&trace_cgroup_path_lock); \
+ unsigned long flags; \
+ spin_lock_irqsave(&trace_cgroup_path_lock, \
+ flags); \
cgroup_path(cgrp, trace_cgroup_path, \
TRACE_CGROUP_PATH_LEN); \
trace_cgroup_##type(cgrp, trace_cgroup_path, \
##__VA_ARGS__); \
- spin_unlock(&trace_cgroup_path_lock); \
+ spin_unlock_irqrestore(&trace_cgroup_path_lock, \
+ flags); \
} \
} while (0)
@@ -240,6 +243,7 @@ int cgroup_rmdir(struct kernfs_node *kn);
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
struct kernfs_root *kf_root);
+int __cgroup_task_count(const struct cgroup *cgrp);
int cgroup_task_count(const struct cgroup *cgrp);
/*
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index c126b34fd4ff..88006be40ea3 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
#include <linux/ctype.h>
@@ -342,22 +343,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
return l;
}
-/**
- * cgroup_task_count - count the number of tasks in a cgroup.
- * @cgrp: the cgroup in question
- */
-int cgroup_task_count(const struct cgroup *cgrp)
-{
- int count = 0;
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += link->cset->nr_tasks;
- spin_unlock_irq(&css_set_lock);
- return count;
-}
-
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3f2b4bde0f9c..300b0c416341 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
@@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[];
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
-static void css_task_iter_advance(struct css_task_iter *it);
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
@@ -593,6 +594,39 @@ static void cgroup_get_live(struct cgroup *cgrp)
css_get(&cgrp->self);
}
+/**
+ * __cgroup_task_count - count the number of tasks in a cgroup. The caller
+ * is responsible for taking the css_set_lock.
+ * @cgrp: the cgroup in question
+ */
+int __cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count = 0;
+ struct cgrp_cset_link *link;
+
+ lockdep_assert_held(&css_set_lock);
+
+ list_for_each_entry(link, &cgrp->cset_links, cset_link)
+ count += link->cset->nr_tasks;
+
+ return count;
+}
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+int cgroup_task_count(const struct cgroup *cgrp)
+{
+ int count;
+
+ spin_lock_irq(&css_set_lock);
+ count = __cgroup_task_count(cgrp);
+ spin_unlock_irq(&css_set_lock);
+
+ return count;
+}
+
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -705,6 +739,7 @@ struct css_set init_css_set = {
.dom_cset = &init_css_set,
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
+ .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -783,6 +818,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
break;
cgroup1_check_for_release(cgrp);
+ TRACE_CGROUP_PATH(notify_populated, cgrp,
+ cgroup_is_populated(cgrp));
cgroup_file_notify(&cgrp->events_file);
child = cgrp;
@@ -808,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
cgroup_update_populated(link->cgrp, populated);
}
+/*
+ * @task is leaving, advance task iterators which are pointing to it so
+ * that they can resume at the next position. Advancing an iterator might
+ * remove it from the list, use safe walk. See css_task_iter_skip() for
+ * details.
+ */
+static void css_set_skip_task_iters(struct css_set *cset,
+ struct task_struct *task)
+{
+ struct css_task_iter *it, *pos;
+
+ list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
+ css_task_iter_skip(it, task);
+}
+
/**
* css_set_move_task - move a task from one css_set to another
* @task: task being moved
@@ -833,22 +885,9 @@ static void css_set_move_task(struct task_struct *task,
css_set_update_populated(to_cset, true);
if (from_cset) {
- struct css_task_iter *it, *pos;
-
WARN_ON_ONCE(list_empty(&task->cg_list));
- /*
- * @task is leaving, advance task iterators which are
- * pointing to it so that they can resume at the next
- * position. Advancing an iterator might remove it from
- * the list, use safe walk. See css_task_iter_advance*()
- * for details.
- */
- list_for_each_entry_safe(it, pos, &from_cset->task_iters,
- iters_node)
- if (it->task_pos == &task->cg_list)
- css_task_iter_advance(it);
-
+ css_set_skip_task_iters(from_cset, task);
list_del_init(&task->cg_list);
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
@@ -1175,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
cset->dom_cset = cset;
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_tasks);
+ INIT_LIST_HEAD(&cset->dying_tasks);
INIT_LIST_HEAD(&cset->task_iters);
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
@@ -1775,11 +1815,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
+ Opt_memory_localevents,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_param_specs[] = {
- fsparam_flag ("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("memory_localevents", Opt_memory_localevents),
{}
};
@@ -1802,6 +1844,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_memory_localevents:
+ ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ return 0;
}
return -EINVAL;
}
@@ -1813,6 +1858,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+
+ if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
}
}
@@ -1820,6 +1870,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
+ seq_puts(seq, ",memory_localevents");
return 0;
}
@@ -2402,8 +2454,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
get_css_set(to_cset);
to_cset->nr_tasks++;
css_set_move_task(task, from_cset, to_cset, true);
- put_css_set_locked(from_cset);
from_cset->nr_tasks--;
+ /*
+ * If the source or destination cgroup is frozen,
+ * the task might require to change its state.
+ */
+ cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
+ to_cset->dfl_cgrp);
+ put_css_set_locked(from_cset);
+
}
}
spin_unlock_irq(&css_set_lock);
@@ -2602,7 +2661,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
if (!dst_cset)
- goto err;
+ return -ENOMEM;
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
@@ -2634,9 +2693,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
}
return 0;
-err:
- cgroup_migrate_finish(mgctx);
- return -ENOMEM;
}
/**
@@ -3447,8 +3503,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
static int cgroup_events_show(struct seq_file *seq, void *v)
{
- seq_printf(seq, "populated %d\n",
- cgroup_is_populated(seq_css(seq)->cgroup));
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
+ seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
+
return 0;
}
@@ -3498,17 +3557,118 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
#ifdef CONFIG_PSI
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+ return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+ return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
- return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
+ struct cgroup *cgroup = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
+
+ return psi_show(seq, psi, PSI_CPU);
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
+{
+ struct psi_trigger *new;
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_get(cgrp);
+ cgroup_kn_unlock(of->kn);
+
+ new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ cgroup_put(cgrp);
+ return PTR_ERR(new);
+ }
+
+ psi_trigger_replace(&of->priv, new);
+
+ cgroup_put(cgrp);
+
+ return nbytes;
+}
+
+static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+}
+
+static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+}
+
+static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
+ poll_table *pt)
+{
+ return psi_trigger_poll(&of->priv, of->file, pt);
+}
+
+static void cgroup_pressure_release(struct kernfs_open_file *of)
+{
+ psi_trigger_replace(&of->priv, NULL);
+}
+#endif /* CONFIG_PSI */
+
+static int cgroup_freeze_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+
+ seq_printf(seq, "%d\n", cgrp->freezer.freeze);
+
+ return 0;
+}
+
+static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct cgroup *cgrp;
+ ssize_t ret;
+ int freeze;
+
+ ret = kstrtoint(strstrip(buf), 0, &freeze);
+ if (ret)
+ return ret;
+
+ if (freeze < 0 || freeze > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ cgroup_freeze(cgrp, freeze);
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
-#endif
static int cgroup_file_open(struct kernfs_open_file *of)
{
@@ -4066,6 +4226,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos,
return NULL;
}
+EXPORT_SYMBOL_GPL(css_next_descendant_pre);
/**
* css_rightmost_descendant - return the rightmost descendant of a css
@@ -4253,15 +4414,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
it->task_pos = NULL;
return;
}
- } while (!css_set_populated(cset));
+ } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
if (!list_empty(&cset->tasks))
it->task_pos = cset->tasks.next;
- else
+ else if (!list_empty(&cset->mg_tasks))
it->task_pos = cset->mg_tasks.next;
+ else
+ it->task_pos = cset->dying_tasks.next;
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
+ it->dying_tasks_head = &cset->dying_tasks;
/*
* We don't keep css_sets locked across iteration steps and thus
@@ -4287,9 +4451,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
list_add(&it->iters_node, &cset->task_iters);
}
+static void css_task_iter_skip(struct css_task_iter *it,
+ struct task_struct *task)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ if (it->task_pos == &task->cg_list) {
+ it->task_pos = it->task_pos->next;
+ it->flags |= CSS_TASK_ITER_SKIPPED;
+ }
+}
+
static void css_task_iter_advance(struct css_task_iter *it)
{
- struct list_head *next;
+ struct task_struct *task;
lockdep_assert_held(&css_set_lock);
repeat:
@@ -4299,25 +4474,40 @@ repeat:
* consumed first and then ->mg_tasks. After ->mg_tasks,
* we move onto the next cset.
*/
- next = it->task_pos->next;
-
- if (next == it->tasks_head)
- next = it->mg_tasks_head->next;
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ it->flags &= ~CSS_TASK_ITER_SKIPPED;
+ else
+ it->task_pos = it->task_pos->next;
- if (next == it->mg_tasks_head)
+ if (it->task_pos == it->tasks_head)
+ it->task_pos = it->mg_tasks_head->next;
+ if (it->task_pos == it->mg_tasks_head)
+ it->task_pos = it->dying_tasks_head->next;
+ if (it->task_pos == it->dying_tasks_head)
css_task_iter_advance_css_set(it);
- else
- it->task_pos = next;
} else {
/* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
}
- /* if PROCS, skip over tasks which aren't group leaders */
- if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
- !thread_group_leader(list_entry(it->task_pos, struct task_struct,
- cg_list)))
- goto repeat;
+ if (!it->task_pos)
+ return;
+
+ task = list_entry(it->task_pos, struct task_struct, cg_list);
+
+ if (it->flags & CSS_TASK_ITER_PROCS) {
+ /* if PROCS, skip over tasks which aren't group leaders */
+ if (!thread_group_leader(task))
+ goto repeat;
+
+ /* and dying leaders w/o live member threads */
+ if (!atomic_read(&task->signal->live))
+ goto repeat;
+ } else {
+ /* skip all dying ones */
+ if (task->flags & PF_EXITING)
+ goto repeat;
+ }
}
/**
@@ -4373,6 +4563,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
spin_lock_irq(&css_set_lock);
+ /* @it may be half-advanced by skips, finish advancing */
+ if (it->flags & CSS_TASK_ITER_SKIPPED)
+ css_task_iter_advance(it);
+
if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct,
cg_list);
@@ -4654,6 +4848,12 @@ static struct cftype cgroup_base_files[] = {
.seq_show = cgroup_stat_show,
},
{
+ .name = "cgroup.freeze",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = cgroup_freeze_show,
+ .write = cgroup_freeze_write,
+ },
+ {
.name = "cpu.stat",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cpu_stat_show,
@@ -4661,20 +4861,26 @@ static struct cftype cgroup_base_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_io_pressure_show,
+ .write = cgroup_io_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "memory.pressure",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_memory_pressure_show,
+ .write = cgroup_memory_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
{
.name = "cpu.pressure",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_cpu_pressure_show,
+ .write = cgroup_cpu_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
},
-#endif
+#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -4781,9 +4987,11 @@ static void css_release_work_fn(struct work_struct *work)
if (cgroup_on_dfl(cgrp))
cgroup_rstat_flush(cgrp);
+ spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
tcgrp = cgroup_parent(tcgrp))
tcgrp->nr_dying_descendants--;
+ spin_unlock_irq(&css_set_lock);
cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
@@ -4798,8 +5006,6 @@ static void css_release_work_fn(struct work_struct *work)
if (cgrp->kn)
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
NULL);
-
- cgroup_bpf_put(cgrp);
}
mutex_unlock(&cgroup_mutex);
@@ -5001,12 +5207,31 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (ret)
goto out_psi_free;
+ /*
+ * New cgroup inherits effective freeze counter, and
+ * if the parent has to be frozen, the child has too.
+ */
+ cgrp->freezer.e_freeze = parent->freezer.e_freeze;
+ if (cgrp->freezer.e_freeze)
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+
+ spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
- if (tcgrp != cgrp)
+ if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
+
+ /*
+ * If the new cgroup is frozen, all ancestor cgroups
+ * get a new frozen descendant, but their state can't
+ * change because of this.
+ */
+ if (cgrp->freezer.e_freeze)
+ tcgrp->freezer.nr_frozen_descendants++;
+ }
}
+ spin_unlock_irq(&css_set_lock);
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -5291,13 +5516,23 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
if (parent && cgroup_is_threaded(cgrp))
parent->nr_threaded_children--;
+ spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
tcgrp->nr_descendants--;
tcgrp->nr_dying_descendants++;
+ /*
+ * If the dying cgroup is frozen, decrease frozen descendants
+ * counters of ancestor cgroups.
+ */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags))
+ tcgrp->freezer.nr_frozen_descendants--;
}
+ spin_unlock_irq(&css_set_lock);
cgroup1_check_for_release(parent);
+ cgroup_bpf_offline(cgrp);
+
/* put the base reference */
percpu_ref_kill(&cgrp->self.refcnt);
@@ -5432,7 +5667,6 @@ int __init cgroup_init(void)
int ssid;
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
- BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
@@ -5746,6 +5980,26 @@ void cgroup_post_fork(struct task_struct *child)
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
}
+
+ /*
+ * If the cgroup has to be frozen, the new task has too.
+ * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
+ * the task into the frozen state.
+ */
+ if (unlikely(cgroup_task_freeze(child))) {
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
+
+ /*
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later
+ * from do_freezer_trap(). So we avoid cgroup's
+ * transient switch from the frozen state and back.
+ */
+ }
+
spin_unlock_irq(&css_set_lock);
}
@@ -5793,7 +6047,13 @@ void cgroup_exit(struct task_struct *tsk)
if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+
+ WARN_ON_ONCE(cgroup_task_frozen(tsk));
+ if (unlikely(cgroup_task_freeze(tsk)))
+ cgroup_update_frozen(task_dfl_cgroup(tsk));
+
spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
@@ -5813,6 +6073,13 @@ void cgroup_release(struct task_struct *task)
do_each_subsys_mask(ss, ssid, have_release_callback) {
ss->release(task);
} while_each_subsys_mask();
+
+ if (use_task_css_set_links) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
}
void cgroup_free(struct task_struct *task)
@@ -5974,6 +6241,48 @@ struct cgroup *cgroup_get_from_fd(int fd)
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
+static u64 power_of_ten(int power)
+{
+ u64 v = 1;
+ while (power--)
+ v *= 10;
+ return v;
+}
+
+/**
+ * cgroup_parse_float - parse a floating number
+ * @input: input string
+ * @dec_shift: number of decimal digits to shift
+ * @v: output
+ *
+ * Parse a decimal floating point number in @input and store the result in
+ * @v with decimal point right shifted @dec_shift times. For example, if
+ * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
+ * Returns 0 on success, -errno otherwise.
+ *
+ * There's nothing cgroup specific about this function except that it's
+ * currently the only user.
+ */
+int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
+{
+ s64 whole, frac = 0;
+ int fstart = 0, fend = 0, flen;
+
+ if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
+ return -EINVAL;
+ if (frac < 0)
+ return -EINVAL;
+
+ flen = fend > fstart ? fend - fstart : 0;
+ if (flen < dec_shift)
+ frac *= power_of_ten(dec_shift - flen);
+ else
+ frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
+
+ *v = whole * power_of_ten(dec_shift) + frac;
+ return 0;
+}
+
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
@@ -6012,6 +6321,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
* Don't use cgroup_get_live().
*/
cgroup_get(sock_cgroup_ptr(skcd));
+ cgroup_bpf_get(sock_cgroup_ptr(skcd));
return;
}
@@ -6023,6 +6333,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
skcd->val = (unsigned long)cset->dfl_cgrp;
+ cgroup_bpf_get(cset->dfl_cgrp);
break;
}
cpu_relax();
@@ -6033,7 +6344,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
- cgroup_put(sock_cgroup_ptr(skcd));
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ cgroup_bpf_put(cgrp);
+ cgroup_put(cgrp);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
@@ -6116,7 +6430,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+ return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
@@ -6136,4 +6450,5 @@ static int __init cgroup_sysfs_init(void)
return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
}
subsys_initcall(cgroup_sysfs_init);
+
#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 6a1942ed781c..b3b02b9c4405 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -729,7 +729,7 @@ static inline int nr_cpusets(void)
* load balancing domains (sched domains) as specified by that partial
* partition.
*
- * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt
+ * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst
* for a background explanation of this.
*
* Does not return errors, on the theory that the callers of this
@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
if (task_css_is_root(task, cpuset_cgrp_id))
return;
- set_cpus_allowed_ptr(task, &current->cpus_allowed);
+ set_cpus_allowed_ptr(task, current->cpus_ptr);
task->mems_allowed = current->mems_allowed;
}
@@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
spin_unlock_irqrestore(&callback_lock, flags);
}
+/**
+ * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
+ * @tsk: pointer to task_struct with which the scheduler is struggling
+ *
+ * Description: In the case that the scheduler cannot find an allowed cpu in
+ * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
+ * mode however, this value is the same as task_cs(tsk)->effective_cpus,
+ * which will not contain a sane cpumask during cases such as cpu hotplugging.
+ * This is the absolute last resort for the scheduler and it is only used if
+ * _every_ other avenue has been traveled.
+ **/
+
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
rcu_read_lock();
- do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
+ do_set_cpus_allowed(tsk, is_in_v2_mode() ?
+ task_cs(tsk)->cpus_allowed : cpu_possible_mask);
rcu_read_unlock();
/*
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 5f1b87330bee..80aa3f027ac3 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v)
css = cset->subsys[ss->id];
if (!css)
continue;
- seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
- (unsigned long)css, css->id);
+ seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name,
+ css, css->id);
}
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
@@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
if (css->parent)
snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
css->parent->id);
- seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
- (unsigned long)css, css->id,
+ seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name,
+ css, css->id,
atomic_read(&css->online_cnt), pbuf);
}
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 08236798d173..8cf010680678 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,481 +1,314 @@
-/*
- * cgroup_freezer.c - control group freezer subsystem
- *
- * Copyright IBM Corporation, 2007
- *
- * Author : Cedric Le Goater <clg@fr.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- */
-
-#include <linux/export.h>
-#include <linux/slab.h>
+//SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/uaccess.h>
-#include <linux/freezer.h>
-#include <linux/seq_file.h>
-#include <linux/mutex.h>
-
-/*
- * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
- * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
- * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
- * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
- * its ancestors has FREEZING_SELF set.
- */
-enum freezer_state_flags {
- CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
- CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
- CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
- CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+#include <linux/sched.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
- /* mask for all FREEZING flags */
- CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
-};
+#include "cgroup-internal.h"
-struct freezer {
- struct cgroup_subsys_state css;
- unsigned int state;
-};
+#include <trace/events/cgroup.h>
-static DEFINE_MUTEX(freezer_mutex);
-
-static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
+/*
+ * Propagate the cgroup frozen state upwards by the cgroup tree.
+ */
+static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen)
{
- return css ? container_of(css, struct freezer, css) : NULL;
-}
+ int desc = 1;
-static inline struct freezer *task_freezer(struct task_struct *task)
-{
- return css_freezer(task_css(task, freezer_cgrp_id));
+ /*
+ * If the new state is frozen, some freezing ancestor cgroups may change
+ * their state too, depending on if all their descendants are frozen.
+ *
+ * Otherwise, all ancestor cgroups are forced into the non-frozen state.
+ */
+ while ((cgrp = cgroup_parent(cgrp))) {
+ if (frozen) {
+ cgrp->freezer.nr_frozen_descendants += desc;
+ if (!test_bit(CGRP_FROZEN, &cgrp->flags) &&
+ test_bit(CGRP_FREEZE, &cgrp->flags) &&
+ cgrp->freezer.nr_frozen_descendants ==
+ cgrp->nr_descendants) {
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ cgroup_file_notify(&cgrp->events_file);
+ TRACE_CGROUP_PATH(notify_frozen, cgrp, 1);
+ desc++;
+ }
+ } else {
+ cgrp->freezer.nr_frozen_descendants -= desc;
+ if (test_bit(CGRP_FROZEN, &cgrp->flags)) {
+ clear_bit(CGRP_FROZEN, &cgrp->flags);
+ cgroup_file_notify(&cgrp->events_file);
+ TRACE_CGROUP_PATH(notify_frozen, cgrp, 0);
+ desc++;
+ }
+ }
+ }
}
-static struct freezer *parent_freezer(struct freezer *freezer)
+/*
+ * Revisit the cgroup frozen state.
+ * Checks if the cgroup is really frozen and perform all state transitions.
+ */
+void cgroup_update_frozen(struct cgroup *cgrp)
{
- return css_freezer(freezer->css.parent);
-}
+ bool frozen;
-bool cgroup_freezing(struct task_struct *task)
-{
- bool ret;
+ lockdep_assert_held(&css_set_lock);
- rcu_read_lock();
- ret = task_freezer(task)->state & CGROUP_FREEZING;
- rcu_read_unlock();
+ /*
+ * If the cgroup has to be frozen (CGRP_FREEZE bit set),
+ * and all tasks are frozen and/or stopped, let's consider
+ * the cgroup frozen. Otherwise it's not frozen.
+ */
+ frozen = test_bit(CGRP_FREEZE, &cgrp->flags) &&
+ cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp);
- return ret;
-}
+ if (frozen) {
+ /* Already there? */
+ if (test_bit(CGRP_FROZEN, &cgrp->flags))
+ return;
-static const char *freezer_state_strs(unsigned int state)
-{
- if (state & CGROUP_FROZEN)
- return "FROZEN";
- if (state & CGROUP_FREEZING)
- return "FREEZING";
- return "THAWED";
-};
-
-static struct cgroup_subsys_state *
-freezer_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct freezer *freezer;
+ set_bit(CGRP_FROZEN, &cgrp->flags);
+ } else {
+ /* Already there? */
+ if (!test_bit(CGRP_FROZEN, &cgrp->flags))
+ return;
- freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
- if (!freezer)
- return ERR_PTR(-ENOMEM);
+ clear_bit(CGRP_FROZEN, &cgrp->flags);
+ }
+ cgroup_file_notify(&cgrp->events_file);
+ TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen);
- return &freezer->css;
+ /* Update the state of ancestor cgroups. */
+ cgroup_propagate_frozen(cgrp, frozen);
}
-/**
- * freezer_css_online - commit creation of a freezer css
- * @css: css being created
- *
- * We're committing to creation of @css. Mark it online and inherit
- * parent's freezing state while holding both parent's and our
- * freezer->lock.
+/*
+ * Increment cgroup's nr_frozen_tasks.
*/
-static int freezer_css_online(struct cgroup_subsys_state *css)
+static void cgroup_inc_frozen_cnt(struct cgroup *cgrp)
{
- struct freezer *freezer = css_freezer(css);
- struct freezer *parent = parent_freezer(freezer);
-
- mutex_lock(&freezer_mutex);
-
- freezer->state |= CGROUP_FREEZER_ONLINE;
-
- if (parent && (parent->state & CGROUP_FREEZING)) {
- freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
- }
-
- mutex_unlock(&freezer_mutex);
- return 0;
+ cgrp->freezer.nr_frozen_tasks++;
}
-/**
- * freezer_css_offline - initiate destruction of a freezer css
- * @css: css being destroyed
- *
- * @css is going away. Mark it dead and decrement system_freezing_count if
- * it was holding one.
+/*
+ * Decrement cgroup's nr_frozen_tasks.
*/
-static void freezer_css_offline(struct cgroup_subsys_state *css)
+static void cgroup_dec_frozen_cnt(struct cgroup *cgrp)
{
- struct freezer *freezer = css_freezer(css);
-
- mutex_lock(&freezer_mutex);
-
- if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
-
- freezer->state = 0;
-
- mutex_unlock(&freezer_mutex);
+ cgrp->freezer.nr_frozen_tasks--;
+ WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0);
}
-static void freezer_css_free(struct cgroup_subsys_state *css)
+/*
+ * Enter frozen/stopped state, if not yet there. Update cgroup's counters,
+ * and revisit the state of the cgroup, if necessary.
+ */
+void cgroup_enter_frozen(void)
{
- kfree(css_freezer(css));
+ struct cgroup *cgrp;
+
+ if (current->frozen)
+ return;
+
+ spin_lock_irq(&css_set_lock);
+ current->frozen = true;
+ cgrp = task_dfl_cgroup(current);
+ cgroup_inc_frozen_cnt(cgrp);
+ cgroup_update_frozen(cgrp);
+ spin_unlock_irq(&css_set_lock);
}
/*
- * Tasks can be migrated into a different freezer anytime regardless of its
- * current state. freezer_attach() is responsible for making new tasks
- * conform to the current state.
+ * Conditionally leave frozen/stopped state. Update cgroup's counters,
+ * and revisit the state of the cgroup, if necessary.
*
- * Freezer state changes and task migration are synchronized via
- * @freezer->lock. freezer_attach() makes the new tasks conform to the
- * current state and all following state changes can see the new tasks.
+ * If always_leave is not set, and the cgroup is freezing,
+ * we're racing with the cgroup freezing. In this case, we don't
+ * drop the frozen counter to avoid a transient switch to
+ * the unfrozen state.
*/
-static void freezer_attach(struct cgroup_taskset *tset)
+void cgroup_leave_frozen(bool always_leave)
{
- struct task_struct *task;
- struct cgroup_subsys_state *new_css;
-
- mutex_lock(&freezer_mutex);
-
- /*
- * Make the new tasks conform to the current state of @new_css.
- * For simplicity, when migrating any task to a FROZEN cgroup, we
- * revert it to FREEZING and let update_if_frozen() determine the
- * correct state later.
- *
- * Tasks in @tset are on @new_css but may not conform to its
- * current state before executing the following - !frozen tasks may
- * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
- */
- cgroup_taskset_for_each(task, new_css, tset) {
- struct freezer *freezer = css_freezer(new_css);
-
- if (!(freezer->state & CGROUP_FREEZING)) {
- __thaw_task(task);
- } else {
- freeze_task(task);
- /* clear FROZEN and propagate upwards */
- while (freezer && (freezer->state & CGROUP_FROZEN)) {
- freezer->state &= ~CGROUP_FROZEN;
- freezer = parent_freezer(freezer);
- }
- }
+ struct cgroup *cgrp;
+
+ spin_lock_irq(&css_set_lock);
+ cgrp = task_dfl_cgroup(current);
+ if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) {
+ cgroup_dec_frozen_cnt(cgrp);
+ cgroup_update_frozen(cgrp);
+ WARN_ON_ONCE(!current->frozen);
+ current->frozen = false;
+ } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) {
+ spin_lock(&current->sighand->siglock);
+ current->jobctl |= JOBCTL_TRAP_FREEZE;
+ set_thread_flag(TIF_SIGPENDING);
+ spin_unlock(&current->sighand->siglock);
}
-
- mutex_unlock(&freezer_mutex);
+ spin_unlock_irq(&css_set_lock);
}
-/**
- * freezer_fork - cgroup post fork callback
- * @task: a task which has just been forked
- *
- * @task has just been created and should conform to the current state of
- * the cgroup_freezer it belongs to. This function may race against
- * freezer_attach(). Losing to freezer_attach() means that we don't have
- * to do anything as freezer_attach() will put @task into the appropriate
- * state.
+/*
+ * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE
+ * jobctl bit.
*/
-static void freezer_fork(struct task_struct *task)
+static void cgroup_freeze_task(struct task_struct *task, bool freeze)
{
- struct freezer *freezer;
+ unsigned long flags;
- /*
- * The root cgroup is non-freezable, so we can skip locking the
- * freezer. This is safe regardless of race with task migration.
- * If we didn't race or won, skipping is obviously the right thing
- * to do. If we lost and root is the new cgroup, noop is still the
- * right thing to do.
- */
- if (task_css_is_root(task, freezer_cgrp_id))
+ /* If the task is about to die, don't bother with freezing it. */
+ if (!lock_task_sighand(task, &flags))
return;
- mutex_lock(&freezer_mutex);
- rcu_read_lock();
-
- freezer = task_freezer(task);
- if (freezer->state & CGROUP_FREEZING)
- freeze_task(task);
+ if (freeze) {
+ task->jobctl |= JOBCTL_TRAP_FREEZE;
+ signal_wake_up(task, false);
+ } else {
+ task->jobctl &= ~JOBCTL_TRAP_FREEZE;
+ wake_up_process(task);
+ }
- rcu_read_unlock();
- mutex_unlock(&freezer_mutex);
+ unlock_task_sighand(task, &flags);
}
-/**
- * update_if_frozen - update whether a cgroup finished freezing
- * @css: css of interest
- *
- * Once FREEZING is initiated, transition to FROZEN is lazily updated by
- * calling this function. If the current state is FREEZING but not FROZEN,
- * this function checks whether all tasks of this cgroup and the descendant
- * cgroups finished freezing and, if so, sets FROZEN.
- *
- * The caller is responsible for grabbing RCU read lock and calling
- * update_if_frozen() on all descendants prior to invoking this function.
- *
- * Task states and freezer state might disagree while tasks are being
- * migrated into or out of @css, so we can't verify task states against
- * @freezer state here. See freezer_attach() for details.
+/*
+ * Freeze or unfreeze all tasks in the given cgroup.
*/
-static void update_if_frozen(struct cgroup_subsys_state *css)
+static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze)
{
- struct freezer *freezer = css_freezer(css);
- struct cgroup_subsys_state *pos;
struct css_task_iter it;
struct task_struct *task;
- lockdep_assert_held(&freezer_mutex);
-
- if (!(freezer->state & CGROUP_FREEZING) ||
- (freezer->state & CGROUP_FROZEN))
- return;
+ lockdep_assert_held(&cgroup_mutex);
- /* are all (live) children frozen? */
- rcu_read_lock();
- css_for_each_child(pos, css) {
- struct freezer *child = css_freezer(pos);
-
- if ((child->state & CGROUP_FREEZER_ONLINE) &&
- !(child->state & CGROUP_FROZEN)) {
- rcu_read_unlock();
- return;
- }
- }
- rcu_read_unlock();
+ spin_lock_irq(&css_set_lock);
+ if (freeze)
+ set_bit(CGRP_FREEZE, &cgrp->flags);
+ else
+ clear_bit(CGRP_FREEZE, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
- /* are all tasks frozen? */
- css_task_iter_start(css, 0, &it);
+ if (freeze)
+ TRACE_CGROUP_PATH(freeze, cgrp);
+ else
+ TRACE_CGROUP_PATH(unfreeze, cgrp);
+ css_task_iter_start(&cgrp->self, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
- }
-
- freezer->state |= CGROUP_FROZEN;
-out_iter_end:
- css_task_iter_end(&it);
-}
-
-static int freezer_read(struct seq_file *m, void *v)
-{
- struct cgroup_subsys_state *css = seq_css(m), *pos;
-
- mutex_lock(&freezer_mutex);
- rcu_read_lock();
-
- /* update states bottom-up */
- css_for_each_descendant_post(pos, css) {
- if (!css_tryget_online(pos))
+ /*
+ * Ignore kernel threads here. Freezing cgroups containing
+ * kthreads isn't supported.
+ */
+ if (task->flags & PF_KTHREAD)
continue;
- rcu_read_unlock();
-
- update_if_frozen(pos);
-
- rcu_read_lock();
- css_put(pos);
+ cgroup_freeze_task(task, freeze);
}
-
- rcu_read_unlock();
- mutex_unlock(&freezer_mutex);
-
- seq_puts(m, freezer_state_strs(css_freezer(css)->state));
- seq_putc(m, '\n');
- return 0;
-}
-
-static void freeze_cgroup(struct freezer *freezer)
-{
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&freezer->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- freeze_task(task);
css_task_iter_end(&it);
-}
-static void unfreeze_cgroup(struct freezer *freezer)
-{
- struct css_task_iter it;
- struct task_struct *task;
-
- css_task_iter_start(&freezer->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- __thaw_task(task);
- css_task_iter_end(&it);
+ /*
+ * Cgroup state should be revisited here to cover empty leaf cgroups
+ * and cgroups which descendants are already in the desired state.
+ */
+ spin_lock_irq(&css_set_lock);
+ if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants)
+ cgroup_update_frozen(cgrp);
+ spin_unlock_irq(&css_set_lock);
}
-/**
- * freezer_apply_state - apply state change to a single cgroup_freezer
- * @freezer: freezer to apply state change to
- * @freeze: whether to freeze or unfreeze
- * @state: CGROUP_FREEZING_* flag to set or clear
- *
- * Set or clear @state on @cgroup according to @freeze, and perform
- * freezing or thawing as necessary.
+/*
+ * Adjust the task state (freeze or unfreeze) and revisit the state of
+ * source and destination cgroups.
*/
-static void freezer_apply_state(struct freezer *freezer, bool freeze,
- unsigned int state)
+void cgroup_freezer_migrate_task(struct task_struct *task,
+ struct cgroup *src, struct cgroup *dst)
{
- /* also synchronizes against task migration, see freezer_attach() */
- lockdep_assert_held(&freezer_mutex);
+ lockdep_assert_held(&css_set_lock);
- if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ /*
+ * Kernel threads are not supposed to be frozen at all.
+ */
+ if (task->flags & PF_KTHREAD)
return;
- if (freeze) {
- if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
- freezer->state |= state;
- freeze_cgroup(freezer);
- } else {
- bool was_freezing = freezer->state & CGROUP_FREEZING;
-
- freezer->state &= ~state;
-
- if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
- freezer->state &= ~CGROUP_FROZEN;
- unfreeze_cgroup(freezer);
- }
+ /*
+ * Adjust counters of freezing and frozen tasks.
+ * Note, that if the task is frozen, but the destination cgroup is not
+ * frozen, we bump both counters to keep them balanced.
+ */
+ if (task->frozen) {
+ cgroup_inc_frozen_cnt(dst);
+ cgroup_dec_frozen_cnt(src);
}
-}
-
-/**
- * freezer_change_state - change the freezing state of a cgroup_freezer
- * @freezer: freezer of interest
- * @freeze: whether to freeze or thaw
- *
- * Freeze or thaw @freezer according to @freeze. The operations are
- * recursive - all descendants of @freezer will be affected.
- */
-static void freezer_change_state(struct freezer *freezer, bool freeze)
-{
- struct cgroup_subsys_state *pos;
+ cgroup_update_frozen(dst);
+ cgroup_update_frozen(src);
/*
- * Update all its descendants in pre-order traversal. Each
- * descendant will try to inherit its parent's FREEZING state as
- * CGROUP_FREEZING_PARENT.
+ * Force the task to the desired state.
*/
- mutex_lock(&freezer_mutex);
- rcu_read_lock();
- css_for_each_descendant_pre(pos, &freezer->css) {
- struct freezer *pos_f = css_freezer(pos);
- struct freezer *parent = parent_freezer(pos_f);
-
- if (!css_tryget_online(pos))
- continue;
- rcu_read_unlock();
-
- if (pos_f == freezer)
- freezer_apply_state(pos_f, freeze,
- CGROUP_FREEZING_SELF);
- else
- freezer_apply_state(pos_f,
- parent->state & CGROUP_FREEZING,
- CGROUP_FREEZING_PARENT);
-
- rcu_read_lock();
- css_put(pos);
- }
- rcu_read_unlock();
- mutex_unlock(&freezer_mutex);
+ cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags));
}
-static ssize_t freezer_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+void cgroup_freeze(struct cgroup *cgrp, bool freeze)
{
- bool freeze;
+ struct cgroup_subsys_state *css;
+ struct cgroup *dsct;
+ bool applied = false;
- buf = strstrip(buf);
+ lockdep_assert_held(&cgroup_mutex);
- if (strcmp(buf, freezer_state_strs(0)) == 0)
- freeze = false;
- else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
- freeze = true;
- else
- return -EINVAL;
+ /*
+ * Nothing changed? Just exit.
+ */
+ if (cgrp->freezer.freeze == freeze)
+ return;
- freezer_change_state(css_freezer(of_css(of)), freeze);
- return nbytes;
-}
+ cgrp->freezer.freeze = freeze;
-static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- struct freezer *freezer = css_freezer(css);
+ /*
+ * Propagate changes downwards the cgroup tree.
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ dsct = css->cgroup;
- return (bool)(freezer->state & CGROUP_FREEZING_SELF);
-}
+ if (cgroup_is_dead(dsct))
+ continue;
-static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- struct freezer *freezer = css_freezer(css);
+ if (freeze) {
+ dsct->freezer.e_freeze++;
+ /*
+ * Already frozen because of ancestor's settings?
+ */
+ if (dsct->freezer.e_freeze > 1)
+ continue;
+ } else {
+ dsct->freezer.e_freeze--;
+ /*
+ * Still frozen because of ancestor's settings?
+ */
+ if (dsct->freezer.e_freeze > 0)
+ continue;
- return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
-}
+ WARN_ON_ONCE(dsct->freezer.e_freeze < 0);
+ }
+
+ /*
+ * Do change actual state: freeze or unfreeze.
+ */
+ cgroup_do_freeze(dsct, freeze);
+ applied = true;
+ }
-static struct cftype files[] = {
- {
- .name = "state",
- .flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = freezer_read,
- .write = freezer_write,
- },
- {
- .name = "self_freezing",
- .flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = freezer_self_freezing_read,
- },
- {
- .name = "parent_freezing",
- .flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = freezer_parent_freezing_read,
- },
- { } /* terminate */
-};
-
-struct cgroup_subsys freezer_cgrp_subsys = {
- .css_alloc = freezer_css_alloc,
- .css_online = freezer_css_online,
- .css_offline = freezer_css_offline,
- .css_free = freezer_css_free,
- .attach = freezer_attach,
- .fork = freezer_fork,
- .legacy_cftypes = files,
-};
+ /*
+ * Even if the actual state hasn't changed, let's notify a user.
+ * The state can be enforced by an ancestor cgroup: the cgroup
+ * can already be in the desired state or it can be locked in the
+ * opposite state, so that the transition will never happen.
+ * In both cases it's better to notify a user, that there is
+ * nothing to wait for.
+ */
+ if (!applied) {
+ TRACE_CGROUP_PATH(notify_frozen, cgrp,
+ test_bit(CGRP_FROZEN, &cgrp->flags));
+ cgroup_file_notify(&cgrp->events_file);
+ }
+}
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
new file mode 100644
index 000000000000..08236798d173
--- /dev/null
+++ b/kernel/cgroup/legacy_freezer.c
@@ -0,0 +1,481 @@
+/*
+ * cgroup_freezer.c - control group freezer subsystem
+ *
+ * Copyright IBM Corporation, 2007
+ *
+ * Author : Cedric Le Goater <clg@fr.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */
+
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/freezer.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+/*
+ * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
+ * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared
+ * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING
+ * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of
+ * its ancestors has FREEZING_SELF set.
+ */
+enum freezer_state_flags {
+ CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */
+ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */
+ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */
+ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */
+
+ /* mask for all FREEZING flags */
+ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT,
+};
+
+struct freezer {
+ struct cgroup_subsys_state css;
+ unsigned int state;
+};
+
+static DEFINE_MUTEX(freezer_mutex);
+
+static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct freezer, css) : NULL;
+}
+
+static inline struct freezer *task_freezer(struct task_struct *task)
+{
+ return css_freezer(task_css(task, freezer_cgrp_id));
+}
+
+static struct freezer *parent_freezer(struct freezer *freezer)
+{
+ return css_freezer(freezer->css.parent);
+}
+
+bool cgroup_freezing(struct task_struct *task)
+{
+ bool ret;
+
+ rcu_read_lock();
+ ret = task_freezer(task)->state & CGROUP_FREEZING;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static const char *freezer_state_strs(unsigned int state)
+{
+ if (state & CGROUP_FROZEN)
+ return "FROZEN";
+ if (state & CGROUP_FREEZING)
+ return "FREEZING";
+ return "THAWED";
+};
+
+static struct cgroup_subsys_state *
+freezer_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct freezer *freezer;
+
+ freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
+ if (!freezer)
+ return ERR_PTR(-ENOMEM);
+
+ return &freezer->css;
+}
+
+/**
+ * freezer_css_online - commit creation of a freezer css
+ * @css: css being created
+ *
+ * We're committing to creation of @css. Mark it online and inherit
+ * parent's freezing state while holding both parent's and our
+ * freezer->lock.
+ */
+static int freezer_css_online(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+ struct freezer *parent = parent_freezer(freezer);
+
+ mutex_lock(&freezer_mutex);
+
+ freezer->state |= CGROUP_FREEZER_ONLINE;
+
+ if (parent && (parent->state & CGROUP_FREEZING)) {
+ freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+ atomic_inc(&system_freezing_cnt);
+ }
+
+ mutex_unlock(&freezer_mutex);
+ return 0;
+}
+
+/**
+ * freezer_css_offline - initiate destruction of a freezer css
+ * @css: css being destroyed
+ *
+ * @css is going away. Mark it dead and decrement system_freezing_count if
+ * it was holding one.
+ */
+static void freezer_css_offline(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ mutex_lock(&freezer_mutex);
+
+ if (freezer->state & CGROUP_FREEZING)
+ atomic_dec(&system_freezing_cnt);
+
+ freezer->state = 0;
+
+ mutex_unlock(&freezer_mutex);
+}
+
+static void freezer_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_freezer(css));
+}
+
+/*
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * @freezer->lock. freezer_attach() makes the new tasks conform to the
+ * current state and all following state changes can see the new tasks.
+ */
+static void freezer_attach(struct cgroup_taskset *tset)
+{
+ struct task_struct *task;
+ struct cgroup_subsys_state *new_css;
+
+ mutex_lock(&freezer_mutex);
+
+ /*
+ * Make the new tasks conform to the current state of @new_css.
+ * For simplicity, when migrating any task to a FROZEN cgroup, we
+ * revert it to FREEZING and let update_if_frozen() determine the
+ * correct state later.
+ *
+ * Tasks in @tset are on @new_css but may not conform to its
+ * current state before executing the following - !frozen tasks may
+ * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
+ */
+ cgroup_taskset_for_each(task, new_css, tset) {
+ struct freezer *freezer = css_freezer(new_css);
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ __thaw_task(task);
+ } else {
+ freeze_task(task);
+ /* clear FROZEN and propagate upwards */
+ while (freezer && (freezer->state & CGROUP_FROZEN)) {
+ freezer->state &= ~CGROUP_FROZEN;
+ freezer = parent_freezer(freezer);
+ }
+ }
+ }
+
+ mutex_unlock(&freezer_mutex);
+}
+
+/**
+ * freezer_fork - cgroup post fork callback
+ * @task: a task which has just been forked
+ *
+ * @task has just been created and should conform to the current state of
+ * the cgroup_freezer it belongs to. This function may race against
+ * freezer_attach(). Losing to freezer_attach() means that we don't have
+ * to do anything as freezer_attach() will put @task into the appropriate
+ * state.
+ */
+static void freezer_fork(struct task_struct *task)
+{
+ struct freezer *freezer;
+
+ /*
+ * The root cgroup is non-freezable, so we can skip locking the
+ * freezer. This is safe regardless of race with task migration.
+ * If we didn't race or won, skipping is obviously the right thing
+ * to do. If we lost and root is the new cgroup, noop is still the
+ * right thing to do.
+ */
+ if (task_css_is_root(task, freezer_cgrp_id))
+ return;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
+ freeze_task(task);
+
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+}
+
+/**
+ * update_if_frozen - update whether a cgroup finished freezing
+ * @css: css of interest
+ *
+ * Once FREEZING is initiated, transition to FROZEN is lazily updated by
+ * calling this function. If the current state is FREEZING but not FROZEN,
+ * this function checks whether all tasks of this cgroup and the descendant
+ * cgroups finished freezing and, if so, sets FROZEN.
+ *
+ * The caller is responsible for grabbing RCU read lock and calling
+ * update_if_frozen() on all descendants prior to invoking this function.
+ *
+ * Task states and freezer state might disagree while tasks are being
+ * migrated into or out of @css, so we can't verify task states against
+ * @freezer state here. See freezer_attach() for details.
+ */
+static void update_if_frozen(struct cgroup_subsys_state *css)
+{
+ struct freezer *freezer = css_freezer(css);
+ struct cgroup_subsys_state *pos;
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZING) ||
+ (freezer->state & CGROUP_FROZEN))
+ return;
+
+ /* are all (live) children frozen? */
+ rcu_read_lock();
+ css_for_each_child(pos, css) {
+ struct freezer *child = css_freezer(pos);
+
+ if ((child->state & CGROUP_FREEZER_ONLINE) &&
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
+ /* are all tasks frozen? */
+ css_task_iter_start(css, 0, &it);
+
+ while ((task = css_task_iter_next(&it))) {
+ if (freezing(task)) {
+ /*
+ * freezer_should_skip() indicates that the task
+ * should be skipped when determining freezing
+ * completion. Consider it frozen in addition to
+ * the usual frozen condition.
+ */
+ if (!frozen(task) && !freezer_should_skip(task))
+ goto out_iter_end;
+ }
+ }
+
+ freezer->state |= CGROUP_FROZEN;
+out_iter_end:
+ css_task_iter_end(&it);
+}
+
+static int freezer_read(struct seq_file *m, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(m), *pos;
+
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ /* update states bottom-up */
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ update_if_frozen(pos);
+
+ rcu_read_lock();
+ css_put(pos);
+ }
+
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+
+ seq_puts(m, freezer_state_strs(css_freezer(css)->state));
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void freeze_cgroup(struct freezer *freezer)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&freezer->css, 0, &it);
+ while ((task = css_task_iter_next(&it)))
+ freeze_task(task);
+ css_task_iter_end(&it);
+}
+
+static void unfreeze_cgroup(struct freezer *freezer)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ css_task_iter_start(&freezer->css, 0, &it);
+ while ((task = css_task_iter_next(&it)))
+ __thaw_task(task);
+ css_task_iter_end(&it);
+}
+
+/**
+ * freezer_apply_state - apply state change to a single cgroup_freezer
+ * @freezer: freezer to apply state change to
+ * @freeze: whether to freeze or unfreeze
+ * @state: CGROUP_FREEZING_* flag to set or clear
+ *
+ * Set or clear @state on @cgroup according to @freeze, and perform
+ * freezing or thawing as necessary.
+ */
+static void freezer_apply_state(struct freezer *freezer, bool freeze,
+ unsigned int state)
+{
+ /* also synchronizes against task migration, see freezer_attach() */
+ lockdep_assert_held(&freezer_mutex);
+
+ if (!(freezer->state & CGROUP_FREEZER_ONLINE))
+ return;
+
+ if (freeze) {
+ if (!(freezer->state & CGROUP_FREEZING))
+ atomic_inc(&system_freezing_cnt);
+ freezer->state |= state;
+ freeze_cgroup(freezer);
+ } else {
+ bool was_freezing = freezer->state & CGROUP_FREEZING;
+
+ freezer->state &= ~state;
+
+ if (!(freezer->state & CGROUP_FREEZING)) {
+ if (was_freezing)
+ atomic_dec(&system_freezing_cnt);
+ freezer->state &= ~CGROUP_FROZEN;
+ unfreeze_cgroup(freezer);
+ }
+ }
+}
+
+/**
+ * freezer_change_state - change the freezing state of a cgroup_freezer
+ * @freezer: freezer of interest
+ * @freeze: whether to freeze or thaw
+ *
+ * Freeze or thaw @freezer according to @freeze. The operations are
+ * recursive - all descendants of @freezer will be affected.
+ */
+static void freezer_change_state(struct freezer *freezer, bool freeze)
+{
+ struct cgroup_subsys_state *pos;
+
+ /*
+ * Update all its descendants in pre-order traversal. Each
+ * descendant will try to inherit its parent's FREEZING state as
+ * CGROUP_FREEZING_PARENT.
+ */
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+ css_for_each_descendant_pre(pos, &freezer->css) {
+ struct freezer *pos_f = css_freezer(pos);
+ struct freezer *parent = parent_freezer(pos_f);
+
+ if (!css_tryget_online(pos))
+ continue;
+ rcu_read_unlock();
+
+ if (pos_f == freezer)
+ freezer_apply_state(pos_f, freeze,
+ CGROUP_FREEZING_SELF);
+ else
+ freezer_apply_state(pos_f,
+ parent->state & CGROUP_FREEZING,
+ CGROUP_FREEZING_PARENT);
+
+ rcu_read_lock();
+ css_put(pos);
+ }
+ rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
+}
+
+static ssize_t freezer_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ bool freeze;
+
+ buf = strstrip(buf);
+
+ if (strcmp(buf, freezer_state_strs(0)) == 0)
+ freeze = false;
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ freeze = true;
+ else
+ return -EINVAL;
+
+ freezer_change_state(css_freezer(of_css(of)), freeze);
+ return nbytes;
+}
+
+static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_SELF);
+}
+
+static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct freezer *freezer = css_freezer(css);
+
+ return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "state",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = freezer_read,
+ .write = freezer_write,
+ },
+ {
+ .name = "self_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_self_freezing_read,
+ },
+ {
+ .name = "parent_freezing",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_u64 = freezer_parent_freezing_read,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys freezer_cgrp_subsys = {
+ .css_alloc = freezer_css_alloc,
+ .css_online = freezer_css_online,
+ .css_offline = freezer_css_offline,
+ .css_free = freezer_css_free,
+ .attach = freezer_attach,
+ .fork = freezer_fork,
+ .legacy_cftypes = files,
+};
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index c9960baaa14f..8e513a573fe9 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Process number limiting controller for cgroups.
*
@@ -25,10 +26,6 @@
* a superset of parent/child/pids.current.
*
* Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/kernel.h>
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index 1d75ae7f1cb7..ae042c347c64 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* RDMA resource limiting controller for cgroups.
*
@@ -5,10 +6,6 @@
* additional RDMA resources after a certain limit is reached.
*
* Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
- *
- * This file is subject to the terms and conditions of version 2 of the GNU
- * General Public License. See the file COPYING in the main directory of the
- * Linux distribution for more details.
*/
#include <linux/bitops.h>
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index bb95a35e8c2d..ca19b4c8acf5 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"
#include <linux/sched/cputime.h>
diff --git a/kernel/compat.c b/kernel/compat.c
index d8a36c6ad7c9..a2bc1d6ceb57 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/compat.c
*
@@ -5,10 +6,6 @@
* on 64 bit kernels.
*
* Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/linkage.h>
@@ -346,8 +343,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
return -EFAULT;
switch (_NSIG_WORDS) {
case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+ /* fall through */
case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+ /* fall through */
case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+ /* fall through */
case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
}
#else
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 9ad37b9e44a7..be01a4d627c9 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Context tracking: Probe on high level context boundaries such as kernel
* and userspace. This includes syscalls and exceptions entry/exit.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f2ef10460698..e84c0873559e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -522,7 +522,7 @@ static int bringup_wait_for_ap(unsigned int cpu)
/*
* SMT soft disabling on X86 requires to bring the CPU out of the
* BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
- * CPU marked itself as booted_once in cpu_notify_starting() so the
+ * CPU marked itself as booted_once in notify_cpu_starting() so the
* cpu_smt_allowed() check will now return false if this is not the
* primary sibling.
*/
@@ -1221,6 +1221,13 @@ int freeze_secondary_cpus(int primary)
for_each_online_cpu(cpu) {
if (cpu == primary)
continue;
+
+ if (pm_wakeup_pending()) {
+ pr_info("Wakeup pending. Abort CPU freeze\n");
+ error = -EBUSY;
+ break;
+ }
+
trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
@@ -1964,6 +1971,9 @@ static ssize_t write_cpuhp_fail(struct device *dev,
if (ret)
return ret;
+ if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
+ return -EINVAL;
+
/*
* Cannot fail STARTING/DYING callbacks.
*/
@@ -2061,7 +2071,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu)
kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}
-static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
int cpu, ret = 0;
@@ -2093,7 +2103,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
return ret;
}
-static int cpuhp_smt_enable(void)
+int cpuhp_smt_enable(void)
{
int cpu, ret = 0;
@@ -2339,6 +2349,9 @@ static int __init mitigations_parse_cmdline(char *arg)
cpu_mitigations = CPU_MITIGATIONS_AUTO;
else if (!strcmp(arg, "auto,nosmt"))
cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
+ else
+ pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
+ arg);
return 0;
}
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 67b02e138a47..cbca6879ab7d 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -1,18 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2011 Google, Inc.
*
* Author:
* Colin Cross <ccross@android.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/kernel.h>
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 093c9f917ed0..9f1557b98468 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* crash.c - kernel crash support code.
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#include <linux/crash_core.h>
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index b64e238b553b..9c23ae074b40 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/crash_dump.h>
#include <linux/init.h>
diff --git a/kernel/cred.c b/kernel/cred.c
index 45d77284aed0..f9a0ce66c9c3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Task credentials management - see Documentation/security/credentials.rst
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
*/
#include <linux/export.h>
#include <linux/cred.h>
@@ -174,6 +170,11 @@ void exit_creds(struct task_struct *tsk)
validate_creds(cred);
alter_cred_subscribers(cred, -1);
put_cred(cred);
+
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+ key_put(current->cached_requested_key);
+ current->cached_requested_key = NULL;
+#endif
}
/**
@@ -327,6 +328,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
struct cred *new;
int ret;
+#ifdef CONFIG_KEYS_REQUEST_CACHE
+ p->cached_requested_key = NULL;
+#endif
+
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
@@ -450,14 +455,23 @@ int commit_creds(struct cred *new)
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
+ /*
+ * If a task drops privileges and becomes nondumpable,
+ * the dumpability change must become visible before
+ * the credential change; otherwise, a __ptrace_may_access()
+ * racing with this change may be able to attach to a task it
+ * shouldn't be able to attach to (as if the task had dropped
+ * privileges without becoming nondumpable).
+ * Pairs with a read barrier in __ptrace_may_access().
+ */
smp_wmb();
}
/* alter the thread keyring */
if (!uid_eq(new->fsuid, old->fsuid))
- key_fsuid_changed(task);
+ key_fsuid_changed(new);
if (!gid_eq(new->fsgid, old->fsgid))
- key_fsgid_changed(task);
+ key_fsgid_changed(new);
/* do it
* RLIMIT_NPROC limits on user->processes have already been checked
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
index a85edc339985..332ee6c6ec2c 100644
--- a/kernel/debug/Makefile
+++ b/kernel/debug/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the linux kernel debugger
#
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 7510dc687c0d..4b280fc7dd67 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks)
return DBG_PASS_EVENT;
}
#endif
+ /* Fall through */
case 'C': /* Exception passing */
tmp = gdb_cmd_exception_pass(ks);
if (tmp > 0)
goto default_handle;
if (tmp == 0)
break;
- /* Fall through on tmp < 0 */
+ /* Fall through - on tmp < 0 */
case 'c': /* Continue packet */
case 's': /* Single step packet */
if (kgdb_contthread && kgdb_contthread != current) {
@@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
break;
}
dbg_activate_sw_breakpoints();
- /* Fall through to default processing */
+ /* Fall through - to default processing */
default:
default_handle:
error = kgdb_arch_handle_exception(ks->ex_vector,
@@ -1094,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
return error;
case 's':
case 'c':
- strcpy(remcom_in_buffer, cmd);
+ strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
return 0;
case '$':
- strcpy(remcom_in_buffer, cmd);
+ strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer));
gdbstub_use_prev_in_buf = strlen(remcom_in_buffer);
gdbstub_prev_in_buf_pos = 0;
return 0;
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
index d4fc58f4b88d..efac857c5511 100644
--- a/kernel/debug/kdb/Makefile
+++ b/kernel/debug/kdb/Makefile
@@ -6,7 +6,6 @@
# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
#
-CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 6a4b41484afe..3a5184eb6977 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -446,7 +446,7 @@ poll_again:
char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt)
{
if (prompt && kdb_prompt_str != prompt)
- strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
+ strscpy(kdb_prompt_str, prompt, CMD_BUFLEN);
kdb_printf(kdb_prompt_str);
kdb_nextline = 1; /* Prompt and input resets line number */
return kdb_read(buffer, bufsize);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 82a3b32a7cfc..9ecfa37c7fbf 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv)
kdb_printf("machine %s\n", init_uts_ns.name.machine);
kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
- kdb_printf("ccversion %s\n", __stringify(CCVERSION));
now = __ktime_get_real_seconds();
time64_to_tm(now, 0, &tm);
@@ -2584,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv)
diag = kdbgetularg(argv[3], &whichcpu);
if (diag)
return diag;
- if (!cpu_online(whichcpu)) {
+ if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) {
kdb_printf("cpu %ld is not online\n", whichcpu);
return KDB_BADCPUNUM;
}
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 50bf9b119bad..b8e6306e7e13 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len)
while ((name = kdb_walk_kallsyms(&pos))) {
if (strncmp(name, prefix_name, prefix_len) == 0) {
- strcpy(ks_namebuf, name);
+ strscpy(ks_namebuf, name, sizeof(ks_namebuf));
/* Work out the longest name that matches the prefix */
if (++number == 1) {
prev_len = min_t(int, max_len-1,
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 2a12b988c717..27725754ac99 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* delayacct.c - per-task delay accounting
*
* Copyright (C) Shailabh Nagar, IBM Corp. 2006
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
*/
#include <linux/sched.h>
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index a06ba3013b3b..70f8f8d9200e 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config HAS_DMA
bool
@@ -38,6 +39,9 @@ config ARCH_HAS_SYNC_DMA_FOR_CPU
config ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
bool
+config ARCH_HAS_DMA_PREP_COHERENT
+ bool
+
config ARCH_HAS_DMA_COHERENT_TO_PFN
bool
@@ -57,6 +61,7 @@ config SWIOTLB
config DMA_REMAP
depends on MMU
+ select GENERIC_ALLOCATOR
bool
config DMA_DIRECT_REMAP
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index badd77670d00..099002d84f46 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008 Advanced Micro Devices, Inc.
*
* Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) "DMA-API: " fmt
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index fcdb23e8d2fc..2c2772e9702a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -311,7 +311,7 @@ static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
size_t size)
{
return swiotlb_force != SWIOTLB_FORCE &&
- (!dev || dma_capable(dev, dma_addr, size));
+ dma_capable(dev, dma_addr, size);
}
dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index c000906348c9..f7afdadb6770 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -238,17 +238,13 @@ u64 dma_get_required_mask(struct device *dev)
}
EXPORT_SYMBOL_GPL(dma_get_required_mask);
-#ifndef arch_dma_alloc_attrs
-#define arch_dma_alloc_attrs(dev) (true)
-#endif
-
void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t flag, unsigned long attrs)
{
const struct dma_map_ops *ops = get_dma_ops(dev);
void *cpu_addr;
- WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
+ WARN_ON_ONCE(!dev->coherent_dma_mask);
if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
return cpu_addr;
@@ -256,9 +252,6 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* let the implementation decide on the zone to allocate from: */
flag &= ~(__GFP_DMA | __GFP_DMA32 | __GFP_HIGHMEM);
- if (!arch_dma_alloc_attrs(&dev))
- return NULL;
-
if (dma_is_direct(ops))
cpu_addr = dma_direct_alloc(dev, size, dma_handle, flag, attrs);
else if (ops->alloc)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 53012db1e53c..13f0cb080a4d 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Dynamic DMA mapping support.
*
@@ -452,6 +453,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
unsigned long mask;
unsigned long offset_slots;
unsigned long max_slots;
+ unsigned long tmp_io_tlb_used;
if (no_iotlb_memory)
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
@@ -538,9 +540,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
} while (index != wrap);
not_found:
+ tmp_io_tlb_used = io_tlb_used;
+
spin_unlock_irqrestore(&io_tlb_lock, flags);
if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit())
- dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes)\n", size);
+ dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n",
+ size, io_tlb_nslabs, tmp_io_tlb_used);
return DMA_MAPPING_ERROR;
found:
io_tlb_used += nslots;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index abbd4b3b96c2..785d708f8553 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2952,6 +2952,12 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
+ /*
+ * If we had been multiplexing, no rotations are necessary, now no events
+ * are active.
+ */
+ ctx->rotate_necessary = 0;
+
perf_pmu_disable(ctx->pmu);
if (is_active & EVENT_PINNED) {
list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
@@ -3319,10 +3325,13 @@ static int flexible_sched_in(struct perf_event *event, void *data)
return 0;
if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
- else
+ int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
+ if (ret) {
sid->can_add_hw = 0;
+ sid->ctx->rotate_necessary = 1;
+ return 0;
+ }
+ list_add_tail(&event->active_list, &sid->ctx->flexible_active);
}
return 0;
@@ -3690,24 +3699,17 @@ ctx_first_active(struct perf_event_context *ctx)
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event *cpu_event = NULL, *task_event = NULL;
- bool cpu_rotate = false, task_rotate = false;
- struct perf_event_context *ctx = NULL;
+ struct perf_event_context *task_ctx = NULL;
+ int cpu_rotate, task_rotate;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- if (cpuctx->ctx.nr_events) {
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
- cpu_rotate = true;
- }
-
- ctx = cpuctx->task_ctx;
- if (ctx && ctx->nr_events) {
- if (ctx->nr_events != ctx->nr_active)
- task_rotate = true;
- }
+ cpu_rotate = cpuctx->ctx.rotate_necessary;
+ task_ctx = cpuctx->task_ctx;
+ task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
@@ -3716,7 +3718,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(ctx);
+ task_event = ctx_first_active(task_ctx);
if (cpu_rotate)
cpu_event = ctx_first_active(&cpuctx->ctx);
@@ -3724,17 +3726,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (ctx && cpu_event))
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_ctx && cpu_event))
+ ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
if (cpu_event)
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (task_event)
- rotate_ctx(ctx, task_event);
+ rotate_ctx(task_ctx, task_event);
if (cpu_event)
rotate_ctx(&cpuctx->ctx, cpu_event);
- perf_event_sched_in(cpuctx, ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -5005,6 +5007,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (perf_event_check_period(event, value))
return -EINVAL;
+ if (!event->attr.freq && (value & (1ULL << 63)))
+ return -EINVAL;
+
event_function_call(event, __perf_event_period, &value);
return 0;
@@ -5923,7 +5928,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (current->mm) {
+ } else if (!(current->flags & PF_KTHREAD)) {
perf_get_regs_user(regs_user, regs, regs_user_copy);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8532,9 +8537,9 @@ static int perf_tp_event_match(struct perf_event *event,
if (event->hw.state & PERF_HES_STOPPED)
return 0;
/*
- * All tracepoints are from kernel-space.
+ * If exclude_kernel, only trace user-space tracepoints (uprobes)
*/
- if (event->attr.exclude_kernel)
+ if (event->attr.exclude_kernel && !user_mode(regs))
return 0;
if (!perf_tp_filter_match(event, data))
@@ -9874,6 +9879,12 @@ static int pmu_dev_alloc(struct pmu *pmu)
if (ret)
goto del_dev;
+ if (pmu->attr_update)
+ ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
+
+ if (ret)
+ goto del_dev;
+
out:
return ret;
@@ -10033,6 +10044,12 @@ void perf_pmu_unregister(struct pmu *pmu)
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
+static inline bool has_extended_regs(struct perf_event *event)
+{
+ return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
+ (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
+}
+
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
struct perf_event_context *ctx = NULL;
@@ -10064,12 +10081,16 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
perf_event_ctx_unlock(event->group_leader, ctx);
if (!ret) {
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event))
+ ret = -EOPNOTSUPP;
+
if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
- event_has_any_exclude_flag(event)) {
- if (event->destroy)
- event->destroy(event);
+ event_has_any_exclude_flag(event))
ret = -EINVAL;
- }
+
+ if (ret && event->destroy)
+ event->destroy(event);
}
if (ret)
@@ -10680,11 +10701,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
break;
case CLOCK_BOOTTIME:
- event->clock = &ktime_get_boot_ns;
+ event->clock = &ktime_get_boottime_ns;
break;
case CLOCK_TAI:
- event->clock = &ktime_get_tai_ns;
+ event->clock = &ktime_get_clocktai_ns;
break;
default:
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 79c47076700a..3aef4191798c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -24,7 +24,7 @@ struct ring_buffer {
atomic_t poll; /* POLL_ for wakeups */
local_t head; /* write position */
- local_t nest; /* nested writers */
+ unsigned int nest; /* nested writers */
local_t events; /* event limit */
local_t wakeup; /* wakeup stamp */
local_t lost; /* nr records lost */
@@ -41,7 +41,7 @@ struct ring_buffer {
/* AUX area */
long aux_head;
- local_t aux_nest;
+ unsigned int aux_nest;
long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 674b35383491..ffb59a4ef4ff 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle)
struct ring_buffer *rb = handle->rb;
preempt_disable();
- local_inc(&rb->nest);
+
+ /*
+ * Avoid an explicit LOAD/STORE such that architectures with memops
+ * can use them.
+ */
+ (*(volatile unsigned int *)&rb->nest)++;
handle->wakeup = local_read(&rb->wakeup);
}
@@ -46,17 +51,35 @@ static void perf_output_put_handle(struct perf_output_handle *handle)
{
struct ring_buffer *rb = handle->rb;
unsigned long head;
+ unsigned int nest;
+
+ /*
+ * If this isn't the outermost nesting, we don't have to update
+ * @rb->user_page->data_head.
+ */
+ nest = READ_ONCE(rb->nest);
+ if (nest > 1) {
+ WRITE_ONCE(rb->nest, nest - 1);
+ goto out;
+ }
again:
+ /*
+ * In order to avoid publishing a head value that goes backwards,
+ * we must ensure the load of @rb->head happens after we've
+ * incremented @rb->nest.
+ *
+ * Otherwise we can observe a @rb->head value before one published
+ * by an IRQ/NMI happening between the load and the increment.
+ */
+ barrier();
head = local_read(&rb->head);
/*
- * IRQ/NMI can happen here, which means we can miss a head update.
+ * IRQ/NMI can happen here and advance @rb->head, causing our
+ * load above to be stale.
*/
- if (!local_dec_and_test(&rb->nest))
- goto out;
-
/*
* Since the mmap() consumer (userspace) can run on a different CPU:
*
@@ -84,14 +107,23 @@ again:
* See perf_output_begin().
*/
smp_wmb(); /* B, matches C */
- rb->user_page->data_head = head;
+ WRITE_ONCE(rb->user_page->data_head, head);
/*
- * Now check if we missed an update -- rely on previous implied
- * compiler barriers to force a re-read.
+ * We must publish the head before decrementing the nest count,
+ * otherwise an IRQ/NMI can publish a more recent head value and our
+ * write will (temporarily) publish a stale value.
*/
+ barrier();
+ WRITE_ONCE(rb->nest, 0);
+
+ /*
+ * Ensure we decrement @rb->nest before we validate the @rb->head.
+ * Otherwise we cannot be sure we caught the 'last' nested update.
+ */
+ barrier();
if (unlikely(head != local_read(&rb->head))) {
- local_inc(&rb->nest);
+ WRITE_ONCE(rb->nest, 1);
goto again;
}
@@ -330,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
struct perf_event *output_event = event;
unsigned long aux_head, aux_tail;
struct ring_buffer *rb;
+ unsigned int nest;
if (output_event->parent)
output_event = output_event->parent;
@@ -360,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!refcount_inc_not_zero(&rb->aux_refcount))
goto err;
+ nest = READ_ONCE(rb->aux_nest);
/*
* Nesting is not supported for AUX area, make sure nested
* writers are caught early
*/
- if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
+ if (WARN_ON_ONCE(nest))
goto err_put;
+ WRITE_ONCE(rb->aux_nest, nest + 1);
+
aux_head = rb->aux_head;
handle->rb = rb;
@@ -394,7 +430,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (!handle->size) { /* A, matches D */
event->pending_disable = smp_processor_id();
perf_output_wakeup(handle);
- local_set(&rb->aux_nest, 0);
+ WRITE_ONCE(rb->aux_nest, 0);
goto err_put;
}
}
@@ -471,7 +507,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
perf_event_aux_event(handle->event, aux_head, size,
handle->aux_flags);
- rb->user_page->aux_head = rb->aux_head;
+ WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb))
wakeup = true;
@@ -483,7 +519,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->event = NULL;
- local_set(&rb->aux_nest, 0);
+ WRITE_ONCE(rb->aux_nest, 0);
/* can't be last */
rb_free_aux(rb);
ring_buffer_put(rb);
@@ -503,7 +539,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
rb->aux_head += size;
- rb->user_page->aux_head = rb->aux_head;
+ WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index e6a0d6be87e3..84fa00497c49 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -46,7 +46,7 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
-static struct percpu_rw_semaphore dup_mmap_sem;
+DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
/* Have a copy of original instruction */
#define UPROBE_COPY_INSN 0
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct mmu_notifier_range range;
struct mem_cgroup *memcg;
- mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);
@@ -2028,7 +2029,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
if (uc->handler) {
rc = uc->handler(uc, regs);
WARN(rc & ~UPROBE_HANDLER_MASK,
- "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ "bad rc=0x%x from %ps()\n", rc, uc->handler);
}
if (uc->ret_handler)
@@ -2111,7 +2112,7 @@ static void handle_trampoline(struct pt_regs *regs)
sigill:
uprobe_warn(current, "handle uretprobe, sending SIGILL.");
- force_sig(SIGILL, current);
+ force_sig(SIGILL);
}
@@ -2227,7 +2228,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
if (unlikely(err)) {
uprobe_warn(current, "execute the probed insn, sending SIGILL.");
- force_sig(SIGILL, current);
+ force_sig(SIGILL);
}
}
@@ -2301,7 +2302,5 @@ void __init uprobes_init(void)
for (i = 0; i < UPROBES_HASH_SZ; i++)
mutex_init(&uprobes_mmap_mutex[i]);
- BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
-
BUG_ON(register_die_notifier(&uprobe_exception_nb));
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2166c2d92ddc..a75b6a7f458a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/exit.c
*
@@ -194,6 +195,7 @@ repeat:
rcu_read_unlock();
proc_flush_task(p);
+ cgroup_release(p);
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
@@ -219,7 +221,6 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
- cgroup_release(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
@@ -422,7 +423,7 @@ retry:
* freed task structure.
*/
if (atomic_read(&mm->mm_users) <= 1) {
- mm->owner = NULL;
+ WRITE_ONCE(mm->owner, NULL);
return;
}
@@ -462,7 +463,7 @@ retry:
* most likely racing with swapoff (try_to_unuse()) or /proc or
* ptrace or page migration (get_task_mm()). Mark owner as NULL.
*/
- mm->owner = NULL;
+ WRITE_ONCE(mm->owner, NULL);
return;
assign_new_owner:
@@ -483,7 +484,7 @@ assign_new_owner:
put_task_struct(c);
goto retry;
}
- mm->owner = c;
+ WRITE_ONCE(mm->owner, c);
task_unlock(c);
put_task_struct(c);
}
diff --git a/kernel/extable.c b/kernel/extable.c
index 6a5b61ebc66c..e23cce6e6092 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Rewritten by Rusty Russell, on the backs of many others...
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/ftrace.h>
#include <linux/memory.h>
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 17f75b545f66..feb80712b913 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -210,7 +210,7 @@ static int fei_seq_show(struct seq_file *m, void *v)
{
struct fei_attr *attr = list_entry(v, struct fei_attr, list);
- seq_printf(m, "%pf\n", attr->kp.addr);
+ seq_printf(m, "%ps\n", attr->kp.addr);
return 0;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index fbe9dfcd8680..8f3e2d97d771 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/kernel/fork.c
*
@@ -11,6 +12,7 @@
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
*/
+#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
@@ -21,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
@@ -120,7 +123,7 @@
unsigned long total_forks; /* Handle normal Linux uptimes. */
int nr_threads; /* The idle threads do not count.. */
-int max_threads; /* tunable limit on nr_threads */
+static int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
@@ -245,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
- return page ? page_address(page) : NULL;
+ if (likely(page)) {
+ tsk->stack = page_address(page);
+ return tsk->stack;
+ }
+ return NULL;
#endif
}
@@ -891,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_STACKPROTECTOR
tsk->stack_canary = get_random_canary();
#endif
+ if (orig->cpus_ptr == &orig->cpus_mask)
+ tsk->cpus_ptr = &tsk->cpus_mask;
/*
* One for us, one for whoever does the "release_task()" (usually
@@ -953,6 +962,15 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+static __always_inline void mm_clear_owner(struct mm_struct *mm,
+ struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ if (mm->owner == p)
+ WRITE_ONCE(mm->owner, NULL);
+#endif
+}
+
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
@@ -1223,7 +1241,9 @@ static int wait_for_vfork_done(struct task_struct *child,
int killed;
freezer_do_not_count();
+ cgroup_enter_frozen();
killed = wait_for_completion_killable(vfork);
+ cgroup_leave_frozen(false);
freezer_count();
if (killed) {
@@ -1339,6 +1359,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
free_pt:
/* don't put binfmt in mmput, we haven't got module yet */
mm->binfmt = NULL;
+ mm_init_owner(mm, NULL);
mmput(mm);
fail_nomem:
@@ -1670,6 +1691,74 @@ static inline void rcu_copy_process(struct task_struct *p)
#endif /* #ifdef CONFIG_TASKS_RCU */
}
+static int pidfd_release(struct inode *inode, struct file *file)
+{
+ struct pid *pid = file->private_data;
+
+ file->private_data = NULL;
+ put_pid(pid);
+ return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+ struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
+ struct pid *pid = f->private_data;
+
+ seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
+ seq_putc(m, '\n');
+}
+#endif
+
+/*
+ * Poll support for process exit notification.
+ */
+static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+ struct task_struct *task;
+ struct pid *pid = file->private_data;
+ int poll_flags = 0;
+
+ poll_wait(file, &pid->wait_pidfd, pts);
+
+ rcu_read_lock();
+ task = pid_task(pid, PIDTYPE_PID);
+ /*
+ * Inform pollers only when the whole thread group exits.
+ * If the thread group leader exits before all other threads in the
+ * group, then poll(2) should block, similar to the wait(2) family.
+ */
+ if (!task || (task->exit_state && thread_group_empty(task)))
+ poll_flags = POLLIN | POLLRDNORM;
+ rcu_read_unlock();
+
+ return poll_flags;
+}
+
+const struct file_operations pidfd_fops = {
+ .release = pidfd_release,
+ .poll = pidfd_poll,
+#ifdef CONFIG_PROC_FS
+ .show_fdinfo = pidfd_show_fdinfo,
+#endif
+};
+
+static void __delayed_free_task(struct rcu_head *rhp)
+{
+ struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+ free_task(tsk);
+}
+
+static __always_inline void delayed_free_task(struct task_struct *tsk)
+{
+ if (IS_ENABLED(CONFIG_MEMCG))
+ call_rcu(&tsk->rcu, __delayed_free_task);
+ else
+ free_task(tsk);
+}
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1679,18 +1768,16 @@ static inline void rcu_copy_process(struct task_struct *p)
* flags). The actual kick-off is left to the caller.
*/
static __latent_entropy struct task_struct *copy_process(
- unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *child_tidptr,
struct pid *pid,
int trace,
- unsigned long tls,
- int node)
+ int node,
+ struct kernel_clone_args *args)
{
- int retval;
+ int pidfd = -1, retval;
struct task_struct *p;
struct multiprocess_signals delayed;
+ struct file *pidfile = NULL;
+ u64 clone_flags = args->flags;
/*
* Don't allow sharing the root directory with processes in a different
@@ -1738,6 +1825,16 @@ static __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_PIDFD) {
+ /*
+ * - CLONE_DETACHED is blocked so that we can potentially
+ * reuse it later for CLONE_PIDFD.
+ * - CLONE_THREAD is blocked until someone really needs it.
+ */
+ if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -1767,11 +1864,11 @@ static __latent_entropy struct task_struct *copy_process(
* p->set_child_tid which is (ab)used as a kthread's data pointer for
* kernel threads (PF_KTHREAD).
*/
- p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
*/
- p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
ftrace_graph_init_task(p);
@@ -1876,9 +1973,6 @@ static __latent_entropy struct task_struct *copy_process(
p->pagefault_disabled = 0;
#ifdef CONFIG_LOCKDEP
- p->lockdep_depth = 0; /* no locks held yet */
- p->curr_chain_key = 0;
- p->lockdep_recursion = 0;
lockdep_init_task(p);
#endif
@@ -1930,7 +2024,8 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_io(clone_flags, p);
if (retval)
goto bad_fork_cleanup_namespaces;
- retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+ retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
+ args->tls);
if (retval)
goto bad_fork_cleanup_io;
@@ -1944,6 +2039,32 @@ static __latent_entropy struct task_struct *copy_process(
}
}
+ /*
+ * This has to happen after we've potentially unshared the file
+ * descriptor table (so that the pidfd doesn't leak into the child
+ * if the fd table isn't shared).
+ */
+ if (clone_flags & CLONE_PIDFD) {
+ retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+ if (retval < 0)
+ goto bad_fork_free_pid;
+
+ pidfd = retval;
+
+ pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
+ O_RDWR | O_CLOEXEC);
+ if (IS_ERR(pidfile)) {
+ put_unused_fd(pidfd);
+ retval = PTR_ERR(pidfile);
+ goto bad_fork_free_pid;
+ }
+ get_pid(pid); /* held by pidfile now */
+
+ retval = put_user(pidfd, args->pidfd);
+ if (retval)
+ goto bad_fork_put_pidfd;
+ }
+
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
@@ -1970,7 +2091,7 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef TIF_SYSCALL_EMU
clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
- clear_all_latency_tracing(p);
+ clear_tsk_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
@@ -1982,7 +2103,7 @@ static __latent_entropy struct task_struct *copy_process(
if (clone_flags & CLONE_PARENT)
p->exit_signal = current->group_leader->exit_signal;
else
- p->exit_signal = (clone_flags & CSIGNAL);
+ p->exit_signal = args->exit_signal;
p->group_leader = p;
p->tgid = p->pid;
}
@@ -2004,7 +2125,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
retval = cgroup_can_fork(p);
if (retval)
- goto bad_fork_free_pid;
+ goto bad_fork_cgroup_threadgroup_change_end;
/*
* From this point on we must avoid any synchronous user-space
@@ -2015,7 +2136,7 @@ static __latent_entropy struct task_struct *copy_process(
*/
p->start_time = ktime_get_ns();
- p->real_start_time = ktime_get_boot_ns();
+ p->real_start_time = ktime_get_boottime_ns();
/*
* Make it visible to the rest of the system, but dont wake it up yet.
@@ -2056,6 +2177,9 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
+ /* past the last point of failure */
+ if (pidfile)
+ fd_install(pidfd, pidfile);
init_task_pid_links(p);
if (likely(p->pid)) {
@@ -2119,8 +2243,14 @@ bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p);
-bad_fork_free_pid:
+bad_fork_cgroup_threadgroup_change_end:
cgroup_threadgroup_change_end(current);
+bad_fork_put_pidfd:
+ if (clone_flags & CLONE_PIDFD) {
+ fput(pidfile);
+ put_unused_fd(pidfd);
+ }
+bad_fork_free_pid:
if (pid != &init_struct_pid)
free_pid(pid);
bad_fork_cleanup_thread:
@@ -2131,8 +2261,10 @@ bad_fork_cleanup_io:
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_mm:
- if (p->mm)
+ if (p->mm) {
+ mm_clear_owner(p->mm, p);
mmput(p->mm);
+ }
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
@@ -2163,7 +2295,7 @@ bad_fork_cleanup_count:
bad_fork_free:
p->state = TASK_DEAD;
put_task_stack(p);
- free_task(p);
+ delayed_free_task(p);
fork_out:
spin_lock_irq(&current->sighand->siglock);
hlist_del_init(&delayed.node);
@@ -2184,8 +2316,11 @@ static inline void init_idle_pids(struct task_struct *idle)
struct task_struct *fork_idle(int cpu)
{
struct task_struct *task;
- task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
- cpu_to_node(cpu));
+ struct kernel_clone_args args = {
+ .flags = CLONE_VM,
+ };
+
+ task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
if (!IS_ERR(task)) {
init_idle_pids(task);
init_idle(task, cpu);
@@ -2205,13 +2340,9 @@ struct mm_struct *copy_init_mm(void)
* It copies the process, and if successful kick-starts
* it and waits for it to finish using the VM if required.
*/
-long _do_fork(unsigned long clone_flags,
- unsigned long stack_start,
- unsigned long stack_size,
- int __user *parent_tidptr,
- int __user *child_tidptr,
- unsigned long tls)
+long _do_fork(struct kernel_clone_args *args)
{
+ u64 clone_flags = args->flags;
struct completion vfork;
struct pid *pid;
struct task_struct *p;
@@ -2227,7 +2358,7 @@ long _do_fork(unsigned long clone_flags,
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
- else if ((clone_flags & CSIGNAL) != SIGCHLD)
+ else if (args->exit_signal != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
@@ -2236,8 +2367,7 @@ long _do_fork(unsigned long clone_flags,
trace = 0;
}
- p = copy_process(clone_flags, stack_start, stack_size,
- child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+ p = copy_process(NULL, trace, NUMA_NO_NODE, args);
add_latent_entropy();
if (IS_ERR(p))
@@ -2253,7 +2383,7 @@ long _do_fork(unsigned long clone_flags,
nr = pid_vnr(pid);
if (clone_flags & CLONE_PARENT_SETTID)
- put_user(nr, parent_tidptr);
+ put_user(nr, args->parent_tid);
if (clone_flags & CLONE_VFORK) {
p->vfork_done = &vfork;
@@ -2285,8 +2415,16 @@ long do_fork(unsigned long clone_flags,
int __user *parent_tidptr,
int __user *child_tidptr)
{
- return _do_fork(clone_flags, stack_start, stack_size,
- parent_tidptr, child_tidptr, 0);
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = stack_start,
+ .stack_size = stack_size,
+ };
+
+ return _do_fork(&args);
}
#endif
@@ -2295,15 +2433,25 @@ long do_fork(unsigned long clone_flags,
*/
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
- return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
- (unsigned long)arg, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+ .exit_signal = (flags & CSIGNAL),
+ .stack = (unsigned long)fn,
+ .stack_size = (unsigned long)arg,
+ };
+
+ return _do_fork(&args);
}
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
- return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .exit_signal = SIGCHLD,
+ };
+
+ return _do_fork(&args);
#else
/* can not support in nommu mode */
return -EINVAL;
@@ -2314,8 +2462,12 @@ SYSCALL_DEFINE0(fork)
#ifdef __ARCH_WANT_SYS_VFORK
SYSCALL_DEFINE0(vfork)
{
- return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
- 0, NULL, NULL, 0);
+ struct kernel_clone_args args = {
+ .flags = CLONE_VFORK | CLONE_VM,
+ .exit_signal = SIGCHLD,
+ };
+
+ return _do_fork(&args);
}
#endif
@@ -2343,7 +2495,112 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
unsigned long, tls)
#endif
{
- return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+ struct kernel_clone_args args = {
+ .flags = (clone_flags & ~CSIGNAL),
+ .pidfd = parent_tidptr,
+ .child_tid = child_tidptr,
+ .parent_tid = parent_tidptr,
+ .exit_signal = (clone_flags & CSIGNAL),
+ .stack = newsp,
+ .tls = tls,
+ };
+
+ /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+ if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+ return -EINVAL;
+
+ return _do_fork(&args);
+}
+#endif
+
+#ifdef __ARCH_WANT_SYS_CLONE3
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+ struct clone_args __user *uargs,
+ size_t size)
+{
+ struct clone_args args;
+
+ if (unlikely(size > PAGE_SIZE))
+ return -E2BIG;
+
+ if (unlikely(size < sizeof(struct clone_args)))
+ return -EINVAL;
+
+ if (unlikely(!access_ok(uargs, size)))
+ return -EFAULT;
+
+ if (size > sizeof(struct clone_args)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uargs + sizeof(struct clone_args);
+ end = (void __user *)uargs + size;
+
+ for (; addr < end; addr++) {
+ if (get_user(val, addr))
+ return -EFAULT;
+ if (val)
+ return -E2BIG;
+ }
+
+ size = sizeof(struct clone_args);
+ }
+
+ if (copy_from_user(&args, uargs, size))
+ return -EFAULT;
+
+ *kargs = (struct kernel_clone_args){
+ .flags = args.flags,
+ .pidfd = u64_to_user_ptr(args.pidfd),
+ .child_tid = u64_to_user_ptr(args.child_tid),
+ .parent_tid = u64_to_user_ptr(args.parent_tid),
+ .exit_signal = args.exit_signal,
+ .stack = args.stack,
+ .stack_size = args.stack_size,
+ .tls = args.tls,
+ };
+
+ return 0;
+}
+
+static bool clone3_args_valid(const struct kernel_clone_args *kargs)
+{
+ /*
+ * All lower bits of the flag word are taken.
+ * Verify that no other unknown flags are passed along.
+ */
+ if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+ return false;
+
+ /*
+ * - make the CLONE_DETACHED bit reuseable for clone3
+ * - make the CSIGNAL bits reuseable for clone3
+ */
+ if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+ return false;
+
+ if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
+ kargs->exit_signal)
+ return false;
+
+ return true;
+}
+
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+ int err;
+
+ struct kernel_clone_args kargs;
+
+ err = copy_clone_args_from_user(&kargs, uargs, size);
+ if (err)
+ return err;
+
+ if (!clone3_args_valid(&kargs))
+ return -EINVAL;
+
+ return _do_fork(&kargs);
}
#endif
diff --git a/kernel/freezer.c b/kernel/freezer.c
index b162b74611e4..c0738424bb43 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/freezer.c - Function to freeze a process
*
diff --git a/kernel/futex.c b/kernel/futex.c
index 6262f1534ac9..6d50728ef2e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Fast Userspace Mutexes (which I call "Futexes!").
* (C) Rusty Russell, IBM 2002
@@ -29,20 +30,6 @@
*
* "The futexes are also cursed."
* "But they come in a choice of three flavours!"
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/compat.h>
#include <linux/slab.h>
@@ -484,6 +471,37 @@ enum futex_access {
};
/**
+ * futex_setup_timer - set up the sleeping hrtimer.
+ * @time: ptr to the given timeout value
+ * @timeout: the hrtimer_sleeper structure to be set up
+ * @flags: futex flags
+ * @range_ns: optional range in ns
+ *
+ * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
+ * value given
+ */
+static inline struct hrtimer_sleeper *
+futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
+ int flags, u64 range_ns)
+{
+ if (!time)
+ return NULL;
+
+ hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ?
+ CLOCK_REALTIME : CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(timeout, current);
+
+ /*
+ * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
+ * effectively the same as calling hrtimer_set_expires().
+ */
+ hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);
+
+ return timeout;
+}
+
+/**
* get_futex_key() - Get parameters which are the keys for a futex
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
@@ -543,7 +561,7 @@ again:
if (unlikely(should_fail_futex(fshared)))
return -EFAULT;
- err = get_user_pages_fast(address, 1, 1, &page);
+ err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
* and get read-only access.
@@ -2692,7 +2710,7 @@ out:
static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset)
{
- struct hrtimer_sleeper timeout, *to = NULL;
+ struct hrtimer_sleeper timeout, *to;
struct restart_block *restart;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
@@ -2702,17 +2720,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
return -EINVAL;
q.bitset = bitset;
- if (abs_time) {
- to = &timeout;
-
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires_range_ns(&to->timer, *abs_time,
- current->timer_slack_ns);
- }
-
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
retry:
/*
* Prepare to wait on uaddr. On success, holds hb lock and increments
@@ -2792,7 +2801,7 @@ static long futex_wait_restart(struct restart_block *restart)
static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
- struct hrtimer_sleeper timeout, *to = NULL;
+ struct hrtimer_sleeper timeout, *to;
struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
@@ -2805,13 +2814,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
if (refill_pi_state_cache())
return -ENOMEM;
- if (time) {
- to = &timeout;
- hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires(&to->timer, *time);
- }
+ to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0);
retry:
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
@@ -3208,7 +3211,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 val, ktime_t *abs_time, u32 bitset,
u32 __user *uaddr2)
{
- struct hrtimer_sleeper timeout, *to = NULL;
+ struct hrtimer_sleeper timeout, *to;
struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
@@ -3225,15 +3228,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (!bitset)
return -EINVAL;
- if (abs_time) {
- to = &timeout;
- hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
- CLOCK_REALTIME : CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS);
- hrtimer_init_sleeper(to, current);
- hrtimer_set_expires_range_ns(&to->timer, *abs_time,
- current->timer_slack_ns);
- }
+ to = futex_setup_timer(abs_time, &timeout, flags,
+ current->timer_slack_ns);
/*
* The waiter is allocated on our stack, manipulated by the requeue
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 1e3823fa799b..3941a9c48f83 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "GCOV-based kernel profiling"
config GCOV_KERNEL
@@ -53,6 +54,7 @@ config GCOV_PROFILE_ALL
choice
prompt "Specify GCOV format"
depends on GCOV_KERNEL
+ depends on CC_IS_GCC
---help---
The gcov format is usually determined by the GCC version, and the
default is chosen according to your GCC version. However, there are
@@ -62,7 +64,7 @@ choice
config GCOV_FORMAT_3_4
bool "GCC 3.4 format"
- depends on CC_IS_GCC && GCC_VERSION < 40700
+ depends on GCC_VERSION < 40700
---help---
Select this option to use the format defined by GCC 3.4.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index ff06d64df397..d66a74b0f100 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -2,5 +2,6 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
-obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o
-obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o
+obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
+obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
+obj-$(CONFIG_CC_IS_CLANG) += clang.o
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 9c7c8d5c18f2..0ffe9f194080 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -22,88 +22,8 @@
#include <linux/sched.h>
#include "gcov.h"
-static int gcov_events_enabled;
-static DEFINE_MUTEX(gcov_lock);
-
-/*
- * __gcov_init is called by gcc-generated constructor code for each object
- * file compiled with -fprofile-arcs.
- */
-void __gcov_init(struct gcov_info *info)
-{
- static unsigned int gcov_version;
-
- mutex_lock(&gcov_lock);
- if (gcov_version == 0) {
- gcov_version = gcov_info_version(info);
- /*
- * Printing gcc's version magic may prove useful for debugging
- * incompatibility reports.
- */
- pr_info("version magic: 0x%x\n", gcov_version);
- }
- /*
- * Add new profiling data structure to list and inform event
- * listener.
- */
- gcov_info_link(info);
- if (gcov_events_enabled)
- gcov_event(GCOV_ADD, info);
- mutex_unlock(&gcov_lock);
-}
-EXPORT_SYMBOL(__gcov_init);
-
-/*
- * These functions may be referenced by gcc-generated profiling code but serve
- * no function for kernel profiling.
- */
-void __gcov_flush(void)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_flush);
-
-void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_add);
-
-void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_single);
-
-void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_delta);
-
-void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_ior);
-
-void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_time_profile);
-
-void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_merge_icall_topn);
-
-void __gcov_exit(void)
-{
- /* Unused. */
-}
-EXPORT_SYMBOL(__gcov_exit);
+int gcov_events_enabled;
+DEFINE_MUTEX(gcov_lock);
/**
* gcov_enable_events - enable event reporting through gcov_event()
@@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event,
/* Remove entries located in module from linked list. */
while ((info = gcov_info_next(info))) {
- if (within_module((unsigned long)info, mod)) {
+ if (gcov_info_within_module(info, mod)) {
gcov_info_unlink(prev, info);
if (gcov_events_enabled)
gcov_event(GCOV_REMOVE, info);
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
new file mode 100644
index 000000000000..c94b820a1b62
--- /dev/null
+++ b/kernel/gcov/clang.c
@@ -0,0 +1,581 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Google, Inc.
+ * modified from kernel/gcov/gcc_4_7.c
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * LLVM uses profiling data that's deliberately similar to GCC, but has a
+ * very different way of exporting that data. LLVM calls llvm_gcov_init() once
+ * per module, and provides a couple of callbacks that we can use to ask for
+ * more data.
+ *
+ * We care about the "writeout" callback, which in turn calls back into
+ * compiler-rt/this module to dump all the gathered coverage data to disk:
+ *
+ * llvm_gcda_start_file()
+ * llvm_gcda_emit_function()
+ * llvm_gcda_emit_arcs()
+ * llvm_gcda_emit_function()
+ * llvm_gcda_emit_arcs()
+ * [... repeats for each function ...]
+ * llvm_gcda_summary_info()
+ * llvm_gcda_end_file()
+ *
+ * This design is much more stateless and unstructured than gcc's, and is
+ * intended to run at process exit. This forces us to keep some local state
+ * about which module we're dealing with at the moment. On the other hand, it
+ * also means we don't depend as much on how LLVM represents profiling data
+ * internally.
+ *
+ * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more
+ * details on how this works, particularly GCOVProfiler::emitProfileArcs(),
+ * GCOVProfiler::insertCounterWriteout(), and
+ * GCOVProfiler::insertFlush().
+ */
+
+#define pr_fmt(fmt) "gcov: " fmt
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/printk.h>
+#include <linux/ratelimit.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include "gcov.h"
+
+typedef void (*llvm_gcov_callback)(void);
+
+struct gcov_info {
+ struct list_head head;
+
+ const char *filename;
+ unsigned int version;
+ u32 checksum;
+
+ struct list_head functions;
+};
+
+struct gcov_fn_info {
+ struct list_head head;
+
+ u32 ident;
+ u32 checksum;
+ u8 use_extra_checksum;
+ u32 cfg_checksum;
+
+ u32 num_counters;
+ u64 *counters;
+ const char *function_name;
+};
+
+static struct gcov_info *current_info;
+
+static LIST_HEAD(clang_gcov_list);
+
+void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
+{
+ struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ INIT_LIST_HEAD(&info->functions);
+
+ mutex_lock(&gcov_lock);
+
+ list_add_tail(&info->head, &clang_gcov_list);
+ current_info = info;
+ writeout();
+ current_info = NULL;
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(llvm_gcov_init);
+
+void llvm_gcda_start_file(const char *orig_filename, const char version[4],
+ u32 checksum)
+{
+ current_info->filename = orig_filename;
+ memcpy(&current_info->version, version, sizeof(current_info->version));
+ current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+
+void llvm_gcda_emit_function(u32 ident, const char *function_name,
+ u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
+{
+ struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ info->ident = ident;
+ info->checksum = func_checksum;
+ info->use_extra_checksum = use_extra_checksum;
+ info->cfg_checksum = cfg_checksum;
+ if (function_name)
+ info->function_name = kstrdup(function_name, GFP_KERNEL);
+
+ list_add_tail(&info->head, &current_info->functions);
+}
+EXPORT_SYMBOL(llvm_gcda_emit_function);
+
+void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
+{
+ struct gcov_fn_info *info = list_last_entry(&current_info->functions,
+ struct gcov_fn_info, head);
+
+ info->num_counters = num_counters;
+ info->counters = counters;
+}
+EXPORT_SYMBOL(llvm_gcda_emit_arcs);
+
+void llvm_gcda_summary_info(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_summary_info);
+
+void llvm_gcda_end_file(void)
+{
+}
+EXPORT_SYMBOL(llvm_gcda_end_file);
+
+/**
+ * gcov_info_filename - return info filename
+ * @info: profiling data set
+ */
+const char *gcov_info_filename(struct gcov_info *info)
+{
+ return info->filename;
+}
+
+/**
+ * gcov_info_version - return info version
+ * @info: profiling data set
+ */
+unsigned int gcov_info_version(struct gcov_info *info)
+{
+ return info->version;
+}
+
+/**
+ * gcov_info_next - return next profiling data set
+ * @info: profiling data set
+ *
+ * Returns next gcov_info following @info or first gcov_info in the chain if
+ * @info is %NULL.
+ */
+struct gcov_info *gcov_info_next(struct gcov_info *info)
+{
+ if (!info)
+ return list_first_entry_or_null(&clang_gcov_list,
+ struct gcov_info, head);
+ if (list_is_last(&info->head, &clang_gcov_list))
+ return NULL;
+ return list_next_entry(info, head);
+}
+
+/**
+ * gcov_info_link - link/add profiling data set to the list
+ * @info: profiling data set
+ */
+void gcov_info_link(struct gcov_info *info)
+{
+ list_add_tail(&info->head, &clang_gcov_list);
+}
+
+/**
+ * gcov_info_unlink - unlink/remove profiling data set from the list
+ * @prev: previous profiling data set
+ * @info: profiling data set
+ */
+void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
+{
+ /* Generic code unlinks while iterating. */
+ __list_del_entry(&info->head);
+}
+
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info->filename, mod);
+}
+
+/* Symbolic links to be created for each profiling data file. */
+const struct gcov_link gcov_link[] = {
+ { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
+ { 0, NULL},
+};
+
+/**
+ * gcov_info_reset - reset profiling data to zero
+ * @info: profiling data set
+ */
+void gcov_info_reset(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn;
+
+ list_for_each_entry(fn, &info->functions, head)
+ memset(fn->counters, 0,
+ sizeof(fn->counters[0]) * fn->num_counters);
+}
+
+/**
+ * gcov_info_is_compatible - check if profiling data can be added
+ * @info1: first profiling data set
+ * @info2: second profiling data set
+ *
+ * Returns non-zero if profiling data can be added, zero otherwise.
+ */
+int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
+{
+ struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null(
+ &info1->functions, struct gcov_fn_info, head);
+ struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null(
+ &info2->functions, struct gcov_fn_info, head);
+
+ if (info1->checksum != info2->checksum)
+ return false;
+ if (!fn_ptr1)
+ return fn_ptr1 == fn_ptr2;
+ while (!list_is_last(&fn_ptr1->head, &info1->functions) &&
+ !list_is_last(&fn_ptr2->head, &info2->functions)) {
+ if (fn_ptr1->checksum != fn_ptr2->checksum)
+ return false;
+ if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum)
+ return false;
+ if (fn_ptr1->use_extra_checksum &&
+ fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum)
+ return false;
+ fn_ptr1 = list_next_entry(fn_ptr1, head);
+ fn_ptr2 = list_next_entry(fn_ptr2, head);
+ }
+ return list_is_last(&fn_ptr1->head, &info1->functions) &&
+ list_is_last(&fn_ptr2->head, &info2->functions);
+}
+
+/**
+ * gcov_info_add - add up profiling data
+ * @dest: profiling data set to which data is added
+ * @source: profiling data set which is added
+ *
+ * Adds profiling counts of @source to @dest.
+ */
+void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
+{
+ struct gcov_fn_info *dfn_ptr;
+ struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions,
+ struct gcov_fn_info, head);
+
+ list_for_each_entry(dfn_ptr, &dst->functions, head) {
+ u32 i;
+
+ for (i = 0; i < sfn_ptr->num_counters; i++)
+ dfn_ptr->counters[i] += sfn_ptr->counters[i];
+ }
+}
+
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+ size_t cv_size; /* counter values size */
+ struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+ GFP_KERNEL);
+ if (!fn_dup)
+ return NULL;
+ INIT_LIST_HEAD(&fn_dup->head);
+
+ fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL);
+ if (!fn_dup->function_name)
+ goto err_name;
+
+ cv_size = fn->num_counters * sizeof(fn->counters[0]);
+ fn_dup->counters = vmalloc(cv_size);
+ if (!fn_dup->counters)
+ goto err_counters;
+ memcpy(fn_dup->counters, fn->counters, cv_size);
+
+ return fn_dup;
+
+err_counters:
+ kfree(fn_dup->function_name);
+err_name:
+ kfree(fn_dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+ struct gcov_info *dup;
+ struct gcov_fn_info *fn;
+
+ dup = kmemdup(info, sizeof(*dup), GFP_KERNEL);
+ if (!dup)
+ return NULL;
+ INIT_LIST_HEAD(&dup->head);
+ INIT_LIST_HEAD(&dup->functions);
+ dup->filename = kstrdup(info->filename, GFP_KERNEL);
+ if (!dup->filename)
+ goto err;
+
+ list_for_each_entry(fn, &info->functions, head) {
+ struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn);
+
+ if (!fn_dup)
+ goto err;
+ list_add_tail(&fn_dup->head, &dup->functions);
+ }
+
+ return dup;
+
+err:
+ gcov_info_free(dup);
+ return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn, *tmp;
+
+ list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+ kfree(fn->function_name);
+ vfree(fn->counters);
+ list_del(&fn->head);
+ kfree(fn);
+ }
+ kfree(info->filename);
+ kfree(info);
+}
+
+#define ITER_STRIDE PAGE_SIZE
+
+/**
+ * struct gcov_iterator - specifies current file position in logical records
+ * @info: associated profiling data
+ * @buffer: buffer containing file data
+ * @size: size of buffer
+ * @pos: current position in file
+ */
+struct gcov_iterator {
+ struct gcov_info *info;
+ void *buffer;
+ size_t size;
+ loff_t pos;
+};
+
+/**
+ * store_gcov_u32 - store 32 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't
+ * store anything.
+ */
+static size_t store_gcov_u32(void *buffer, size_t off, u32 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+ *data = v;
+ }
+
+ return sizeof(*data);
+}
+
+/**
+ * store_gcov_u64 - store 64 bit number in gcov format to buffer
+ * @buffer: target buffer or NULL
+ * @off: offset into the buffer
+ * @v: value to be stored
+ *
+ * Number format defined by gcc: numbers are recorded in the 32 bit
+ * unsigned binary form of the endianness of the machine generating the
+ * file. 64 bit numbers are stored as two 32 bit numbers, the low part
+ * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store
+ * anything.
+ */
+static size_t store_gcov_u64(void *buffer, size_t off, u64 v)
+{
+ u32 *data;
+
+ if (buffer) {
+ data = buffer + off;
+
+ data[0] = (v & 0xffffffffUL);
+ data[1] = (v >> 32);
+ }
+
+ return sizeof(*data) * 2;
+}
+
+/**
+ * convert_to_gcda - convert profiling data set to gcda file format
+ * @buffer: the buffer to store file data or %NULL if no data should be stored
+ * @info: profiling data set to be converted
+ *
+ * Returns the number of bytes that were/would have been stored into the buffer.
+ */
+static size_t convert_to_gcda(char *buffer, struct gcov_info *info)
+{
+ struct gcov_fn_info *fi_ptr;
+ size_t pos = 0;
+
+ /* File header. */
+ pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC);
+ pos += store_gcov_u32(buffer, pos, info->version);
+ pos += store_gcov_u32(buffer, pos, info->checksum);
+
+ list_for_each_entry(fi_ptr, &info->functions, head) {
+ u32 i;
+ u32 len = 2;
+
+ if (fi_ptr->use_extra_checksum)
+ len++;
+
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION);
+ pos += store_gcov_u32(buffer, pos, len);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->ident);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->checksum);
+ if (fi_ptr->use_extra_checksum)
+ pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum);
+
+ pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE);
+ pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2);
+ for (i = 0; i < fi_ptr->num_counters; i++)
+ pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]);
+ }
+
+ return pos;
+}
+
+/**
+ * gcov_iter_new - allocate and initialize profiling data iterator
+ * @info: profiling data set to be iterated
+ *
+ * Return file iterator on success, %NULL otherwise.
+ */
+struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
+{
+ struct gcov_iterator *iter;
+
+ iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL);
+ if (!iter)
+ goto err_free;
+
+ iter->info = info;
+ /* Dry-run to get the actual buffer size. */
+ iter->size = convert_to_gcda(NULL, info);
+ iter->buffer = vmalloc(iter->size);
+ if (!iter->buffer)
+ goto err_free;
+
+ convert_to_gcda(iter->buffer, info);
+
+ return iter;
+
+err_free:
+ kfree(iter);
+ return NULL;
+}
+
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+void gcov_iter_free(struct gcov_iterator *iter)
+{
+ vfree(iter->buffer);
+ kfree(iter);
+}
+
+/**
+ * gcov_iter_get_info - return profiling data set for given file iterator
+ * @iter: file iterator
+ */
+struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
+{
+ return iter->info;
+}
+
+/**
+ * gcov_iter_start - reset file iterator to starting position
+ * @iter: file iterator
+ */
+void gcov_iter_start(struct gcov_iterator *iter)
+{
+ iter->pos = 0;
+}
+
+/**
+ * gcov_iter_next - advance file iterator to next logical record
+ * @iter: file iterator
+ *
+ * Return zero if new position is valid, non-zero if iterator has reached end.
+ */
+int gcov_iter_next(struct gcov_iterator *iter)
+{
+ if (iter->pos < iter->size)
+ iter->pos += ITER_STRIDE;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * gcov_iter_write - write data for current pos to seq_file
+ * @iter: file iterator
+ * @seq: seq_file handle
+ *
+ * Return zero on success, non-zero otherwise.
+ */
+int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
+{
+ size_t len;
+
+ if (iter->pos >= iter->size)
+ return -EINVAL;
+
+ len = ITER_STRIDE;
+ if (iter->pos + len > iter->size)
+ len = iter->size - iter->pos;
+
+ seq_write(seq, iter->buffer + iter->pos, len);
+
+ return 0;
+}
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index 2dddecbdbe6e..801ee4b0b969 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
gcov_info_head = info->next;
}
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info, mod);
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index ca5e5c0ef853..ec37563674d6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
gcov_info_head = info->next;
}
+/**
+ * gcov_info_within_module - check if a profiling data set belongs to a module
+ * @info: profiling data set
+ * @mod: module
+ *
+ * Returns true if profiling data belongs module, false otherwise.
+ */
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
+{
+ return within_module((unsigned long)info, mod);
+}
+
/* Symbolic links to be created for each profiling data file. */
const struct gcov_link gcov_link[] = {
{ OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c
new file mode 100644
index 000000000000..3cf736b9f880
--- /dev/null
+++ b/kernel/gcov/gcc_base.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/export.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include "gcov.h"
+
+/*
+ * __gcov_init is called by gcc-generated constructor code for each object
+ * file compiled with -fprofile-arcs.
+ */
+void __gcov_init(struct gcov_info *info)
+{
+ static unsigned int gcov_version;
+
+ mutex_lock(&gcov_lock);
+ if (gcov_version == 0) {
+ gcov_version = gcov_info_version(info);
+ /*
+ * Printing gcc's version magic may prove useful for debugging
+ * incompatibility reports.
+ */
+ pr_info("version magic: 0x%x\n", gcov_version);
+ }
+ /*
+ * Add new profiling data structure to list and inform event
+ * listener.
+ */
+ gcov_info_link(info);
+ if (gcov_events_enabled)
+ gcov_event(GCOV_ADD, info);
+ mutex_unlock(&gcov_lock);
+}
+EXPORT_SYMBOL(__gcov_init);
+
+/*
+ * These functions may be referenced by gcc-generated profiling code but serve
+ * no function for kernel profiling.
+ */
+void __gcov_flush(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_flush);
+
+void __gcov_merge_add(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_add);
+
+void __gcov_merge_single(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_single);
+
+void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_delta);
+
+void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_ior);
+
+void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_time_profile);
+
+void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_merge_icall_topn);
+
+void __gcov_exit(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index de118ad4a024..6ab2c1808c9d 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -15,6 +15,7 @@
#ifndef GCOV_H
#define GCOV_H GCOV_H
+#include <linux/module.h>
#include <linux/types.h>
/*
@@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info);
struct gcov_info *gcov_info_next(struct gcov_info *info);
void gcov_info_link(struct gcov_info *info);
void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info);
+bool gcov_info_within_module(struct gcov_info *info, struct module *mod);
/* Base interface. */
enum gcov_action {
@@ -83,4 +85,7 @@ struct gcov_link {
};
extern const struct gcov_link gcov_link[];
+extern int gcov_events_enabled;
+extern struct mutex gcov_lock;
+
#endif /* GCOV_H */
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
new file mode 100755
index 000000000000..9a34e1d9bd7f
--- /dev/null
+++ b/kernel/gen_kheaders.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This script generates an archive consisting of kernel headers
+# for CONFIG_IKHEADERS.
+set -e
+spath="$(dirname "$(readlink -f "$0")")"
+kroot="$spath/.."
+outdir="$(pwd)"
+tarfile=$1
+cpio_dir=$outdir/$tarfile.tmp
+
+# Script filename relative to the kernel source root
+# We add it to the archive because it is small and any changes
+# to this script will also cause a rebuild of the archive.
+sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")"
+
+src_file_list="
+include/
+arch/$SRCARCH/include/
+$sfile
+"
+
+obj_file_list="
+include/
+arch/$SRCARCH/include/
+"
+
+# Support incremental builds by skipping archive generation
+# if timestamps of files being archived are not changed.
+
+# This block is useful for debugging the incremental builds.
+# Uncomment it for debugging.
+# if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter;
+# else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi
+# find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter
+# find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter
+
+# include/generated/compile.h is ignored because it is touched even when none
+# of the source files changed. This causes pointless regeneration, so let us
+# ignore them for md5 calculation.
+pushd $kroot > /dev/null
+src_files_md5="$(find $src_file_list -type f |
+ grep -v "include/generated/compile.h" |
+ grep -v "include/generated/autoconf.h" |
+ grep -v "include/config/auto.conf" |
+ grep -v "include/config/auto.conf.cmd" |
+ grep -v "include/config/tristate.conf" |
+ xargs ls -lR | md5sum | cut -d ' ' -f1)"
+popd > /dev/null
+obj_files_md5="$(find $obj_file_list -type f |
+ grep -v "include/generated/compile.h" |
+ grep -v "include/generated/autoconf.h" |
+ grep -v "include/config/auto.conf" |
+ grep -v "include/config/auto.conf.cmd" |
+ grep -v "include/config/tristate.conf" |
+ xargs ls -lR | md5sum | cut -d ' ' -f1)"
+
+if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi
+if [ -f kernel/kheaders.md5 ] &&
+ [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] &&
+ [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then
+ exit
+fi
+
+if [ "${quiet}" != "silent_" ]; then
+ echo " GEN $tarfile"
+fi
+
+rm -rf $cpio_dir
+mkdir $cpio_dir
+
+pushd $kroot > /dev/null
+for f in $src_file_list;
+ do find "$f" ! -name "*.cmd" ! -name ".*";
+done | cpio --quiet -pd $cpio_dir
+popd > /dev/null
+
+# The second CPIO can complain if files already exist which can
+# happen with out of tree builds. Just silence CPIO for now.
+for f in $obj_file_list;
+ do find "$f" ! -name "*.cmd" ! -name ".*";
+done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1
+
+# Remove comments except SDPX lines
+find $cpio_dir -type f -print0 |
+ xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
+
+tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null
+
+echo "$src_files_md5" > kernel/kheaders.md5
+echo "$obj_files_md5" >> kernel/kheaders.md5
+echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
+
+rm -rf $cpio_dir
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index f108a95882c6..14a625c16cb3 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Detect Hung Task
*
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5f3e2baefca9..f92d9a687372 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "IRQ subsystem"
# Options selectable by the architecture code
@@ -91,6 +92,9 @@ config GENERIC_MSI_IRQ_DOMAIN
select IRQ_DOMAIN_HIERARCHY
select GENERIC_MSI_IRQ
+config IRQ_MSI_IOMMU
+ bool
+
config HANDLE_DOMAIN_IRQ
bool
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index ff6e352e3a6c..b4f53717d143 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -2,6 +2,9 @@
obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
obj-$(CONFIG_IRQ_TIMINGS) += timings.o
+ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y)
+ CFLAGS_timings.o += -DDEBUG
+endif
obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f18cd5aa33e8..4352b08ae48d 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,8 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
return nodes;
}
-static int __irq_build_affinity_masks(const struct irq_affinity *affd,
- unsigned int startvec,
+static int __irq_build_affinity_masks(unsigned int startvec,
unsigned int numvecs,
unsigned int firstvec,
cpumask_var_t *node_to_cpumask,
@@ -171,8 +170,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
* 1) spread present CPU on these vectors
* 2) spread other possible CPUs on these vectors
*/
-static int irq_build_affinity_masks(const struct irq_affinity *affd,
- unsigned int startvec, unsigned int numvecs,
+static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
unsigned int firstvec,
struct irq_affinity_desc *masks)
{
@@ -197,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
build_node_to_cpumask(node_to_cpumask);
/* Spread on present CPUs starting from affd->pre_vectors */
- nr_present = __irq_build_affinity_masks(affd, curvec, numvecs,
+ nr_present = __irq_build_affinity_masks(curvec, numvecs,
firstvec, node_to_cpumask,
cpu_present_mask, nmsk, masks);
@@ -212,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
else
curvec = firstvec + nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- nr_others = __irq_build_affinity_masks(affd, curvec, numvecs,
+ nr_others = __irq_build_affinity_masks(curvec, numvecs,
firstvec, node_to_cpumask,
npresmsk, nmsk, masks);
put_online_cpus();
@@ -295,7 +293,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
unsigned int this_vecs = affd->set_size[i];
int ret;
- ret = irq_build_affinity_masks(affd, curvec, this_vecs,
+ ret = irq_build_affinity_masks(curvec, this_vecs,
curvec, masks);
if (ret) {
kfree(masks);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 16cbf6beb276..ae60cae24e9a 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -90,7 +90,7 @@ unsigned long probe_irq_on(void)
/* It triggered already - consider it spurious. */
if (!(desc->istate & IRQS_WAITING)) {
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
} else
if (i < 32)
mask |= 1 << i;
@@ -127,7 +127,7 @@ unsigned int probe_irq_mask(unsigned long val)
mask |= 1 << i;
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -169,7 +169,7 @@ int probe_irq_off(unsigned long val)
nr_of_irqs++;
}
desc->istate &= ~IRQS_AUTODETECT;
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 51128bea3846..b76703b2c0af 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -314,6 +314,12 @@ void irq_shutdown(struct irq_desc *desc)
}
irq_state_clr_started(desc);
}
+}
+
+
+void irq_shutdown_and_deactivate(struct irq_desc *desc)
+{
+ irq_shutdown(desc);
/*
* This must be called even if the interrupt was never started up,
* because the activation can happen before the interrupt is
@@ -748,6 +754,8 @@ void handle_fasteoi_nmi(struct irq_desc *desc)
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
+ __kstat_incr_irqs_this_cpu(desc);
+
trace_irq_handler_entry(irq, action);
/*
* NMIs cannot be shared, there is only one action.
@@ -962,6 +970,8 @@ void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc)
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
+ __kstat_incr_irqs_this_cpu(desc);
+
trace_irq_handler_entry(irq, action);
res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
trace_irq_handler_exit(irq, action, res);
@@ -1459,6 +1469,33 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent);
+
+/**
+ * irq_chip_request_resources_parent - Request resources on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+int irq_chip_request_resources_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+
+ if (data->chip->irq_request_resources)
+ return data->chip->irq_request_resources(data);
+
+ return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent);
+
+/**
+ * irq_chip_release_resources_parent - Release resources on the parent interrupt
+ * @data: Pointer to interrupt specific data
+ */
+void irq_chip_release_resources_parent(struct irq_data *data)
+{
+ data = data->parent_data;
+ if (data->chip->irq_release_resources)
+ data->chip->irq_release_resources(data);
+}
+EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
#endif
/**
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 5b1072e394b2..6c7ca2e983a5 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -116,7 +116,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
*/
if (irqd_affinity_is_managed(d)) {
irqd_set_managed_shutdown(d);
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
return false;
}
affinity = cpu_online_mask;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 516c00a5e867..c1eccd4f6520 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -152,7 +152,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
raw_spin_lock_irq(&desc->lock);
data = irq_desc_get_irq_data(desc);
- seq_printf(m, "handler: %pf\n", desc->handle_irq);
+ seq_printf(m, "handler: %ps\n", desc->handle_irq);
seq_printf(m, "device: %s\n", desc->dev_name);
seq_printf(m, "status: 0x%08x\n", desc->status_use_accessors);
irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6df5ddfdb0f8..a4ace611f47f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -149,7 +149,7 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
res = action->handler(irq, action->dev_id);
trace_irq_handler_exit(irq, action, res);
- if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
+ if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pS enabled interrupts\n",
irq, action->handler))
local_irq_disable();
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 70c3053bc1f6..3924fbe829d4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -82,6 +82,7 @@ extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
extern void irq_shutdown(struct irq_desc *desc);
+extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
@@ -96,6 +97,10 @@ static inline void irq_mark_irq(unsigned int irq) { }
extern void irq_mark_irq(unsigned int irq);
#endif
+extern int __irq_get_irqchip_state(struct irq_data *data,
+ enum irqchip_irq_state which,
+ bool *state);
+
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
@@ -354,6 +359,16 @@ static inline int irq_timing_decode(u64 value, u64 *timestamp)
return value & U16_MAX;
}
+static __always_inline void irq_timings_push(u64 ts, int irq)
+{
+ struct irq_timings *timings = this_cpu_ptr(&irq_timings);
+
+ timings->values[timings->count & IRQ_TIMINGS_MASK] =
+ irq_timing_encode(ts, irq);
+
+ timings->count++;
+}
+
/*
* The function record_irq_time is only called in one place in the
* interrupts handler. We want this function always inline so the code
@@ -367,15 +382,8 @@ static __always_inline void record_irq_time(struct irq_desc *desc)
if (!static_branch_likely(&irq_timing_enabled))
return;
- if (desc->istate & IRQS_TIMINGS) {
- struct irq_timings *timings = this_cpu_ptr(&irq_timings);
-
- timings->values[timings->count & IRQ_TIMINGS_MASK] =
- irq_timing_encode(local_clock(),
- irq_desc_get_irq(desc));
-
- timings->count++;
- }
+ if (desc->istate & IRQS_TIMINGS)
+ irq_timings_push(local_clock(), irq_desc_get_irq(desc));
}
#else
static inline void irq_remove_timings(struct irq_desc *desc) {}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9f8a709337cf..9484e88dabc2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -275,11 +275,12 @@ static struct attribute *irq_attrs[] = {
&actions_attr.attr,
NULL
};
+ATTRIBUTE_GROUPS(irq);
static struct kobj_type irq_kobj_type = {
.release = irq_kobj_release,
.sysfs_ops = &kobj_sysfs_ops,
- .default_attrs = irq_attrs,
+ .default_groups = irq_groups,
};
static void irq_sysfs_add(int irq, struct irq_desc *desc)
@@ -679,6 +680,8 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
* @hwirq: The HW irq number to convert to a logical one
* @regs: Register file coming from the low-level handling code
*
+ * This function must be called from an NMI context.
+ *
* Returns: 0 on success, or -EINVAL if conversion has failed
*/
int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
@@ -688,7 +691,10 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
unsigned int irq;
int ret = 0;
- nmi_enter();
+ /*
+ * NMI context needs to be setup earlier in order to deal with tracing.
+ */
+ WARN_ON(!in_nmi());
irq = irq_find_mapping(domain, hwirq);
@@ -701,7 +707,6 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
else
ret = -EINVAL;
- nmi_exit();
set_irq_regs(old_regs);
return ret;
}
@@ -945,6 +950,11 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
+static bool irq_is_nmi(struct irq_desc *desc)
+{
+ return desc->istate & IRQS_NMI;
+}
+
/**
* kstat_irqs - Get the statistics for an interrupt
* @irq: The interrupt number
@@ -962,7 +972,8 @@ unsigned int kstat_irqs(unsigned int irq)
if (!desc || !desc->kstat_irqs)
return 0;
if (!irq_settings_is_per_cpu_devid(desc) &&
- !irq_settings_is_per_cpu(desc))
+ !irq_settings_is_per_cpu(desc) &&
+ !irq_is_nmi(desc))
return desc->tot_count;
for_each_possible_cpu(cpu)
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 9ed29e4a7dbf..3078d0e48bba 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
* @ops: domain callbacks
* @host_data: Controller private data pointer
*
- * Allocates and initialize and irq_domain structure.
+ * Allocates and initializes an irq_domain structure.
* Returns pointer to IRQ domain, or NULL on failure.
*/
struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
@@ -139,7 +139,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size),
GFP_KERNEL, of_node_to_nid(of_node));
- if (WARN_ON(!domain))
+ if (!domain)
return NULL;
if (fwnode && is_fwnode_irqchip(fwnode)) {
@@ -1297,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
/**
* __irq_domain_alloc_irqs - Allocate IRQs from domain
* @domain: domain to allocate from
- * @irq_base: allocate specified IRQ nubmer if irq_base >= 0
+ * @irq_base: allocate specified IRQ number if irq_base >= 0
* @nr_irqs: number of IRQs to allocate
* @node: NUMA node id for memory allocation
* @arg: domain specific argument
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 53a081392115..e8f7f179bf77 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
+#include <linux/irqdomain.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
@@ -34,8 +35,9 @@ static int __init setup_forced_irqthreads(char *arg)
early_param("threadirqs", setup_forced_irqthreads);
#endif
-static void __synchronize_hardirq(struct irq_desc *desc)
+static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
{
+ struct irq_data *irqd = irq_desc_get_irq_data(desc);
bool inprogress;
do {
@@ -51,6 +53,20 @@ static void __synchronize_hardirq(struct irq_desc *desc)
/* Ok, that indicated we're done: double-check carefully. */
raw_spin_lock_irqsave(&desc->lock, flags);
inprogress = irqd_irq_inprogress(&desc->irq_data);
+
+ /*
+ * If requested and supported, check at the chip whether it
+ * is in flight at the hardware level, i.e. already pending
+ * in a CPU and waiting for service and acknowledge.
+ */
+ if (!inprogress && sync_chip) {
+ /*
+ * Ignore the return code. inprogress is only updated
+ * when the chip supports it.
+ */
+ __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
+ &inprogress);
+ }
raw_spin_unlock_irqrestore(&desc->lock, flags);
/* Oops, that failed? */
@@ -73,13 +89,18 @@ static void __synchronize_hardirq(struct irq_desc *desc)
* Returns: false if a threaded handler is active.
*
* This function may be called - with care - from IRQ context.
+ *
+ * It does not check whether there is an interrupt in flight at the
+ * hardware level, but not serviced yet, as this might deadlock when
+ * called with interrupts disabled and the target CPU of the interrupt
+ * is the current CPU.
*/
bool synchronize_hardirq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (desc) {
- __synchronize_hardirq(desc);
+ __synchronize_hardirq(desc, false);
return !atomic_read(&desc->threads_active);
}
@@ -95,14 +116,19 @@ EXPORT_SYMBOL(synchronize_hardirq);
* to complete before returning. If you use this function while
* holding a resource the IRQ handler may need you will deadlock.
*
- * This function may be called - with care - from IRQ context.
+ * Can only be called from preemptible code as it might sleep when
+ * an interrupt thread is associated to @irq.
+ *
+ * It optionally makes sure (when the irq chip supports that method)
+ * that the interrupt is not pending in any CPU and waiting for
+ * service.
*/
void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
if (desc) {
- __synchronize_hardirq(desc);
+ __synchronize_hardirq(desc, true);
/*
* We made sure that no hardirq handler is
* running. Now verify that no threaded handlers are
@@ -781,7 +807,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
ret = 0;
break;
default:
- pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
+ pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n",
flags, irq_desc_get_irq(desc), chip->irq_set_type);
}
if (unmask)
@@ -1699,6 +1725,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
irq_settings_clr_disable_unlazy(desc);
+ /* Only shutdown. Deactivate after synchronize_hardirq() */
irq_shutdown(desc);
}
@@ -1727,8 +1754,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
unregister_handler_proc(irq, action);
- /* Make sure it's not being used on another CPU: */
- synchronize_hardirq(irq);
+ /*
+ * Make sure it's not being used on another CPU and if the chip
+ * supports it also make sure that there is no (not yet serviced)
+ * interrupt in flight at the hardware level.
+ */
+ __synchronize_hardirq(desc, true);
#ifdef CONFIG_DEBUG_SHIRQ
/*
@@ -1768,6 +1799,14 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
* require it to deallocate resources over the slow bus.
*/
chip_bus_lock(desc);
+ /*
+ * There is no interrupt on the fly anymore. Deactivate it
+ * completely.
+ */
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ irq_domain_deactivate_irq(&desc->irq_data);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
irq_release_resources(desc);
chip_bus_sync_unlock(desc);
irq_remove_timings(desc);
@@ -1855,7 +1894,7 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
}
irq_settings_clr_disable_unlazy(desc);
- irq_shutdown(desc);
+ irq_shutdown_and_deactivate(desc);
irq_release_resources(desc);
@@ -2578,6 +2617,28 @@ out:
irq_put_desc_unlock(desc, flags);
}
+int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
+ bool *state)
+{
+ struct irq_chip *chip;
+ int err = -EINVAL;
+
+ do {
+ chip = irq_data_get_irq_chip(data);
+ if (chip->irq_get_irqchip_state)
+ break;
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ data = data->parent_data;
+#else
+ data = NULL;
+#endif
+ } while (data);
+
+ if (data)
+ err = chip->irq_get_irqchip_state(data, which, state);
+ return err;
+}
+
/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
* @irq: Interrupt line that is forwarded to a VM
@@ -2596,7 +2657,6 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
{
struct irq_desc *desc;
struct irq_data *data;
- struct irq_chip *chip;
unsigned long flags;
int err = -EINVAL;
@@ -2606,19 +2666,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
data = irq_desc_get_irq_data(desc);
- do {
- chip = irq_data_get_irq_chip(data);
- if (chip->irq_get_irqchip_state)
- break;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- data = data->parent_data;
-#else
- data = NULL;
-#endif
- } while (data);
-
- if (data)
- err = chip->irq_get_irqchip_state(data, which, state);
+ err = __irq_get_irqchip_state(data, which, state);
irq_put_desc_busunlock(desc, flags);
return err;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 6d2fa6914b30..2ed97a7c9b2a 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -212,9 +212,9 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
*/
raw_spin_lock_irqsave(&desc->lock, flags);
for_each_action_of_desc(desc, action) {
- printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
+ printk(KERN_ERR "[<%p>] %ps", action->handler, action->handler);
if (action->thread_fn)
- printk(KERN_CONT " threaded [<%p>] %pf",
+ printk(KERN_CONT " threaded [<%p>] %ps",
action->thread_fn, action->thread_fn);
printk(KERN_CONT "\n");
}
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index 90c735da15d0..e960d7ce7bcc 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -1,10 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org>
+#define pr_fmt(fmt) "irq_timings: " fmt
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/static_key.h>
+#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/idr.h>
#include <linux/irq.h>
@@ -261,12 +263,29 @@ void irq_timings_disable(void)
#define EMA_ALPHA_VAL 64
#define EMA_ALPHA_SHIFT 7
-#define PREDICTION_PERIOD_MIN 2
+#define PREDICTION_PERIOD_MIN 3
#define PREDICTION_PERIOD_MAX 5
#define PREDICTION_FACTOR 4
#define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */
#define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */
+/*
+ * Number of elements in the circular buffer: If it happens it was
+ * flushed before, then the number of elements could be smaller than
+ * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is
+ * used as we wrapped. The index begins from zero when we did not
+ * wrap. That could be done in a nicer way with the proper circular
+ * array structure type but with the cost of extra computation in the
+ * interrupt handler hot path. We choose efficiency.
+ */
+#define for_each_irqts(i, irqts) \
+ for (i = irqts->count < IRQ_TIMINGS_SIZE ? \
+ 0 : irqts->count & IRQ_TIMINGS_MASK, \
+ irqts->count = min(IRQ_TIMINGS_SIZE, \
+ irqts->count); \
+ irqts->count > 0; irqts->count--, \
+ i = (i + 1) & IRQ_TIMINGS_MASK)
+
struct irqt_stat {
u64 last_ts;
u64 ema_time[PREDICTION_BUFFER_SIZE];
@@ -297,7 +316,16 @@ static u64 irq_timings_ema_new(u64 value, u64 ema_old)
static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
{
- int i;
+ int period;
+
+ /*
+ * Move the beginning pointer to the end minus the max period x 3.
+ * We are at the point we can begin searching the pattern
+ */
+ buffer = &buffer[len - (period_max * 3)];
+
+ /* Adjust the length to the maximum allowed period x 3 */
+ len = period_max * 3;
/*
* The buffer contains the suite of intervals, in a ilog2
@@ -306,21 +334,45 @@ static int irq_timings_next_event_index(int *buffer, size_t len, int period_max)
* period beginning at the end of the buffer. We do that for
* each suffix.
*/
- for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) {
+ for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) {
- int *begin = &buffer[len - (i * 3)];
- int *ptr = begin;
+ /*
+ * The first comparison always succeed because the
+ * suffix is deduced from the first n-period bytes of
+ * the buffer and we compare the initial suffix with
+ * itself, so we can skip the first iteration.
+ */
+ int idx = period;
+ size_t size = period;
/*
* We look if the suite with period 'i' repeat
* itself. If it is truncated at the end, as it
* repeats we can use the period to find out the next
- * element.
+ * element with the modulo.
*/
- while (!memcmp(ptr, begin, i * sizeof(*ptr))) {
- ptr += i;
- if (ptr >= &buffer[len])
- return begin[((i * 3) % i)];
+ while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
+
+ /*
+ * Move the index in a period basis
+ */
+ idx += size;
+
+ /*
+ * If this condition is reached, all previous
+ * memcmp were successful, so the period is
+ * found.
+ */
+ if (idx == len)
+ return buffer[len % period];
+
+ /*
+ * If the remaining elements to compare are
+ * smaller than the period, readjust the size
+ * of the comparison for the last iteration.
+ */
+ if (len - idx < period)
+ size = len - idx;
}
}
@@ -380,11 +432,43 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now)
return irqs->last_ts + irqs->ema_time[index];
}
+static __always_inline int irq_timings_interval_index(u64 interval)
+{
+ /*
+ * The PREDICTION_FACTOR increase the interval size for the
+ * array of exponential average.
+ */
+ u64 interval_us = (interval >> 10) / PREDICTION_FACTOR;
+
+ return likely(interval_us) ? ilog2(interval_us) : 0;
+}
+
+static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs,
+ u64 interval)
+{
+ int index;
+
+ /*
+ * Get the index in the ema table for this interrupt.
+ */
+ index = irq_timings_interval_index(interval);
+
+ /*
+ * Store the index as an element of the pattern in another
+ * circular array.
+ */
+ irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
+
+ irqs->ema_time[index] = irq_timings_ema_new(interval,
+ irqs->ema_time[index]);
+
+ irqs->count++;
+}
+
static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
{
u64 old_ts = irqs->last_ts;
u64 interval;
- int index;
/*
* The timestamps are absolute time values, we need to compute
@@ -415,24 +499,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
return;
}
- /*
- * Get the index in the ema table for this interrupt. The
- * PREDICTION_FACTOR increase the interval size for the array
- * of exponential average.
- */
- index = likely(interval) ?
- ilog2((interval >> 10) / PREDICTION_FACTOR) : 0;
-
- /*
- * Store the index as an element of the pattern in another
- * circular array.
- */
- irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index;
-
- irqs->ema_time[index] = irq_timings_ema_new(interval,
- irqs->ema_time[index]);
-
- irqs->count++;
+ __irq_timings_store(irq, irqs, interval);
}
/**
@@ -493,11 +560,7 @@ u64 irq_timings_next_event(u64 now)
* model while decrementing the counter because we consume the
* data from our circular buffer.
*/
-
- i = (irqts->count & IRQ_TIMINGS_MASK) - 1;
- irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count);
-
- for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) {
+ for_each_irqts(i, irqts) {
irq = irq_timing_decode(irqts->values[i], &ts);
s = idr_find(&irqt_stats, irq);
if (s)
@@ -564,3 +627,325 @@ int irq_timings_alloc(int irq)
return 0;
}
+
+#ifdef CONFIG_TEST_IRQ_TIMINGS
+struct timings_intervals {
+ u64 *intervals;
+ size_t count;
+};
+
+/*
+ * Intervals are given in nanosecond base
+ */
+static u64 intervals0[] __initdata = {
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000, 500000,
+ 10000, 50000, 200000,
+};
+
+static u64 intervals1[] __initdata = {
+ 223947000, 1240000, 1384000, 1386000, 1386000,
+ 217416000, 1236000, 1384000, 1386000, 1387000,
+ 214719000, 1241000, 1386000, 1387000, 1384000,
+ 213696000, 1234000, 1384000, 1386000, 1388000,
+ 219904000, 1240000, 1385000, 1389000, 1385000,
+ 212240000, 1240000, 1386000, 1386000, 1386000,
+ 214415000, 1236000, 1384000, 1386000, 1387000,
+ 214276000, 1234000,
+};
+
+static u64 intervals2[] __initdata = {
+ 4000, 3000, 5000, 100000,
+ 3000, 3000, 5000, 117000,
+ 4000, 4000, 5000, 112000,
+ 4000, 3000, 4000, 110000,
+ 3000, 5000, 3000, 117000,
+ 4000, 4000, 5000, 112000,
+ 4000, 3000, 4000, 110000,
+ 3000, 4000, 5000, 112000,
+ 4000,
+};
+
+static u64 intervals3[] __initdata = {
+ 1385000, 212240000, 1240000,
+ 1386000, 214415000, 1236000,
+ 1384000, 214276000, 1234000,
+ 1386000, 214415000, 1236000,
+ 1385000, 212240000, 1240000,
+ 1386000, 214415000, 1236000,
+ 1384000, 214276000, 1234000,
+ 1386000, 214415000, 1236000,
+ 1385000, 212240000, 1240000,
+};
+
+static u64 intervals4[] __initdata = {
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000, 50000, 10000, 50000,
+ 10000,
+};
+
+static struct timings_intervals tis[] __initdata = {
+ { intervals0, ARRAY_SIZE(intervals0) },
+ { intervals1, ARRAY_SIZE(intervals1) },
+ { intervals2, ARRAY_SIZE(intervals2) },
+ { intervals3, ARRAY_SIZE(intervals3) },
+ { intervals4, ARRAY_SIZE(intervals4) },
+};
+
+static int __init irq_timings_test_next_index(struct timings_intervals *ti)
+{
+ int _buffer[IRQ_TIMINGS_SIZE];
+ int buffer[IRQ_TIMINGS_SIZE];
+ int index, start, i, count, period_max;
+
+ count = ti->count - 1;
+
+ period_max = count > (3 * PREDICTION_PERIOD_MAX) ?
+ PREDICTION_PERIOD_MAX : count / 3;
+
+ /*
+ * Inject all values except the last one which will be used
+ * to compare with the next index result.
+ */
+ pr_debug("index suite: ");
+
+ for (i = 0; i < count; i++) {
+ index = irq_timings_interval_index(ti->intervals[i]);
+ _buffer[i & IRQ_TIMINGS_MASK] = index;
+ pr_cont("%d ", index);
+ }
+
+ start = count < IRQ_TIMINGS_SIZE ? 0 :
+ count & IRQ_TIMINGS_MASK;
+
+ count = min_t(int, count, IRQ_TIMINGS_SIZE);
+
+ for (i = 0; i < count; i++) {
+ int index = (start + i) & IRQ_TIMINGS_MASK;
+ buffer[i] = _buffer[index];
+ }
+
+ index = irq_timings_next_event_index(buffer, count, period_max);
+ i = irq_timings_interval_index(ti->intervals[ti->count - 1]);
+
+ if (index != i) {
+ pr_err("Expected (%d) and computed (%d) next indexes differ\n",
+ i, index);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __init irq_timings_next_index_selftest(void)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(tis); i++) {
+
+ pr_info("---> Injecting intervals number #%d (count=%zd)\n",
+ i, tis[i].count);
+
+ ret = irq_timings_test_next_index(&tis[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_test_irqs(struct timings_intervals *ti)
+{
+ struct irqt_stat __percpu *s;
+ struct irqt_stat *irqs;
+ int i, index, ret, irq = 0xACE5;
+
+ ret = irq_timings_alloc(irq);
+ if (ret) {
+ pr_err("Failed to allocate irq timings\n");
+ return ret;
+ }
+
+ s = idr_find(&irqt_stats, irq);
+ if (!s) {
+ ret = -EIDRM;
+ goto out;
+ }
+
+ irqs = this_cpu_ptr(s);
+
+ for (i = 0; i < ti->count; i++) {
+
+ index = irq_timings_interval_index(ti->intervals[i]);
+ pr_debug("%d: interval=%llu ema_index=%d\n",
+ i, ti->intervals[i], index);
+
+ __irq_timings_store(irq, irqs, ti->intervals[i]);
+ if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) {
+ pr_err("Failed to store in the circular buffer\n");
+ goto out;
+ }
+ }
+
+ if (irqs->count != ti->count) {
+ pr_err("Count differs\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ irq_timings_free(irq);
+
+ return ret;
+}
+
+static int __init irq_timings_irqs_selftest(void)
+{
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(tis); i++) {
+ pr_info("---> Injecting intervals number #%d (count=%zd)\n",
+ i, tis[i].count);
+ ret = irq_timings_test_irqs(&tis[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_test_irqts(struct irq_timings *irqts,
+ unsigned count)
+{
+ int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0;
+ int i, irq, oirq = 0xBEEF;
+ u64 ots = 0xDEAD, ts;
+
+ /*
+ * Fill the circular buffer by using the dedicated function.
+ */
+ for (i = 0; i < count; i++) {
+ pr_debug("%d: index=%d, ts=%llX irq=%X\n",
+ i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i);
+
+ irq_timings_push(ots + i, oirq + i);
+ }
+
+ /*
+ * Compute the first elements values after the index wrapped
+ * up or not.
+ */
+ ots += start;
+ oirq += start;
+
+ /*
+ * Test the circular buffer count is correct.
+ */
+ pr_debug("---> Checking timings array count (%d) is right\n", count);
+ if (WARN_ON(irqts->count != count))
+ return -EINVAL;
+
+ /*
+ * Test the macro allowing to browse all the irqts.
+ */
+ pr_debug("---> Checking the for_each_irqts() macro\n");
+ for_each_irqts(i, irqts) {
+
+ irq = irq_timing_decode(irqts->values[i], &ts);
+
+ pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n",
+ i, ts, ots, irq, oirq);
+
+ if (WARN_ON(ts != ots || irq != oirq))
+ return -EINVAL;
+
+ ots++; oirq++;
+ }
+
+ /*
+ * The circular buffer should have be flushed when browsed
+ * with for_each_irqts
+ */
+ pr_debug("---> Checking timings array is empty after browsing it\n");
+ if (WARN_ON(irqts->count))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int __init irq_timings_irqts_selftest(void)
+{
+ struct irq_timings *irqts = this_cpu_ptr(&irq_timings);
+ int i, ret;
+
+ /*
+ * Test the circular buffer with different number of
+ * elements. The purpose is to test at the limits (empty, half
+ * full, full, wrapped with the cursor at the boundaries,
+ * wrapped several times, etc ...
+ */
+ int count[] = { 0,
+ IRQ_TIMINGS_SIZE >> 1,
+ IRQ_TIMINGS_SIZE,
+ IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1),
+ 2 * IRQ_TIMINGS_SIZE,
+ (2 * IRQ_TIMINGS_SIZE) + 3,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(count); i++) {
+
+ pr_info("---> Checking the timings with %d/%d values\n",
+ count[i], IRQ_TIMINGS_SIZE);
+
+ ret = irq_timings_test_irqts(irqts, count[i]);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static int __init irq_timings_selftest(void)
+{
+ int ret;
+
+ pr_info("------------------- selftest start -----------------\n");
+
+ /*
+ * At this point, we don't except any subsystem to use the irq
+ * timings but us, so it should not be enabled.
+ */
+ if (static_branch_unlikely(&irq_timing_enabled)) {
+ pr_warn("irq timings already initialized, skipping selftest\n");
+ return 0;
+ }
+
+ ret = irq_timings_irqts_selftest();
+ if (ret)
+ goto out;
+
+ ret = irq_timings_irqs_selftest();
+ if (ret)
+ goto out;
+
+ ret = irq_timings_next_index_selftest();
+out:
+ pr_info("---------- selftest end with %s -----------\n",
+ ret ? "failure" : "success");
+
+ return ret;
+}
+early_initcall(irq_timings_selftest);
+#endif
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 73288914ed5e..d42acaf81886 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
*
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index de6efdecc70d..df3008419a1d 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* jump label support
*
@@ -36,12 +37,26 @@ static int jump_label_cmp(const void *a, const void *b)
const struct jump_entry *jea = a;
const struct jump_entry *jeb = b;
+ /*
+ * Entrires are sorted by key.
+ */
if (jump_entry_key(jea) < jump_entry_key(jeb))
return -1;
if (jump_entry_key(jea) > jump_entry_key(jeb))
return 1;
+ /*
+ * In the batching mode, entries should also be sorted by the code
+ * inside the already sorted list of entries, enabling a bsearch in
+ * the vector.
+ */
+ if (jump_entry_code(jea) < jump_entry_code(jeb))
+ return -1;
+
+ if (jump_entry_code(jea) > jump_entry_code(jeb))
+ return 1;
+
return 0;
}
@@ -383,25 +398,55 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry)
return enabled ^ branch;
}
+static bool jump_label_can_update(struct jump_entry *entry, bool init)
+{
+ /*
+ * Cannot update code that was in an init text area.
+ */
+ if (!init && jump_entry_is_init(entry))
+ return false;
+
+ if (!kernel_text_address(jump_entry_code(entry))) {
+ WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry));
+ return false;
+ }
+
+ return true;
+}
+
+#ifndef HAVE_JUMP_LABEL_BATCH
static void __jump_label_update(struct static_key *key,
struct jump_entry *entry,
struct jump_entry *stop,
bool init)
{
for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
- /*
- * An entry->code of 0 indicates an entry which has been
- * disabled because it was in an init text area.
- */
- if (init || !jump_entry_is_init(entry)) {
- if (kernel_text_address(jump_entry_code(entry)))
- arch_jump_label_transform(entry, jump_label_type(entry));
- else
- WARN_ONCE(1, "can't patch jump_label at %pS",
- (void *)jump_entry_code(entry));
+ if (jump_label_can_update(entry, init))
+ arch_jump_label_transform(entry, jump_label_type(entry));
+ }
+}
+#else
+static void __jump_label_update(struct static_key *key,
+ struct jump_entry *entry,
+ struct jump_entry *stop,
+ bool init)
+{
+ for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
+
+ if (!jump_label_can_update(entry, init))
+ continue;
+
+ if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) {
+ /*
+ * Queue is full: Apply the current queue and try again.
+ */
+ arch_jump_label_transform_apply();
+ BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry)));
}
}
+ arch_jump_label_transform_apply();
}
+#endif
void __init jump_label_init(void)
{
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 14934afa9e68..95a260f9214b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kallsyms.c: in-kernel printing of symbolic oopses and stack traces.
*
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 68559808fdfa..1b018f1a6e0d 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec.c - kexec_load system call
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index fd5c95ff9251..d5870723b8ad 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1,9 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec.c - kexec system call core code.
* Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index f7fb8f6a688f..b8cc032d5620 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kexec: kexec_file_load system call
*
* Copyright (C) 2014 Red Hat Inc.
* Authors:
* Vivek Goyal <vgoyal@redhat.com>
- *
- * This source code is licensed under the GNU General Public License,
- * Version 2. See the file COPYING for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -198,9 +196,6 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
return ret;
image->kernel_buf_len = size;
- /* IMA needs to pass the measurement list to the next kernel. */
- ima_add_kexec_buffer(image);
-
/* Call arch image probe handlers */
ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
image->kernel_buf_len);
@@ -241,8 +236,14 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
ret = -EINVAL;
goto out;
}
+
+ ima_kexec_cmdline(image->cmdline_buf,
+ image->cmdline_buf_len - 1);
}
+ /* IMA needs to pass the measurement list to the next kernel. */
+ ima_add_kexec_buffer(image);
+
/* Call arch image load handlers */
ldata = arch_kexec_kernel_image_load(image);
@@ -500,13 +501,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg)
return locate_mem_hole_bottom_up(start, end, kbuf);
}
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-static int kexec_walk_memblock(struct kexec_buf *kbuf,
- int (*func)(struct resource *, void *))
-{
- return 0;
-}
-#else
+#ifdef CONFIG_ARCH_KEEP_MEMBLOCK
static int kexec_walk_memblock(struct kexec_buf *kbuf,
int (*func)(struct resource *, void *))
{
@@ -550,6 +545,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
return ret;
}
+#else
+static int kexec_walk_memblock(struct kexec_buf *kbuf,
+ int (*func)(struct resource *, void *))
+{
+ return 0;
+}
#endif
/**
@@ -589,7 +590,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf)
if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN)
return 0;
- if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK))
+ if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
ret = kexec_walk_resources(kbuf, locate_mem_hole_callback);
else
ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback);
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
new file mode 100644
index 000000000000..8f69772af77b
--- /dev/null
+++ b/kernel/kheaders.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Provide kernel headers useful to build tracing programs
+ * such as for running eBPF tracing tools.
+ *
+ * (Borrowed code from kernel/configs.c)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/init.h>
+
+/*
+ * Define kernel_headers_data and kernel_headers_data_end, within which the
+ * compressed kernel headers are stored. The file is first compressed with xz.
+ */
+
+asm (
+" .pushsection .rodata, \"a\" \n"
+" .global kernel_headers_data \n"
+"kernel_headers_data: \n"
+" .incbin \"kernel/kheaders_data.tar.xz\" \n"
+" .global kernel_headers_data_end \n"
+"kernel_headers_data_end: \n"
+" .popsection \n"
+);
+
+extern char kernel_headers_data;
+extern char kernel_headers_data_end;
+
+static ssize_t
+ikheaders_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t len)
+{
+ memcpy(buf, &kernel_headers_data + off, len);
+ return len;
+}
+
+static struct bin_attribute kheaders_attr __ro_after_init = {
+ .attr = {
+ .name = "kheaders.tar.xz",
+ .mode = 0444,
+ },
+ .read = &ikheaders_read,
+};
+
+static int __init ikheaders_init(void)
+{
+ kheaders_attr.size = (&kernel_headers_data_end -
+ &kernel_headers_data);
+ return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
+}
+
+static void __exit ikheaders_cleanup(void)
+{
+ sysfs_remove_bin_file(kernel_kobj, &kheaders_attr);
+}
+
+module_init(ikheaders_init);
+module_exit(ikheaders_cleanup);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Joel Fernandes");
+MODULE_DESCRIPTION("Echo the kernel header artifacts used to build the kernel");
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b1ea30a5540e..445337c107e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Kernel Probes (KProbes)
* kernel/kprobes.c
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
* Copyright (C) IBM Corporation, 2002, 2004
*
* 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 46ba853656f6..35859da8bd4f 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -1,11 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/ksysfs.c - sysfs attributes in /sys/kernel, which
* are not related to any other subsystem
*
* Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org>
- *
- * This file is release under the GPLv2
- *
*/
#include <linux/kobject.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5942eeafb9ac..621467c33fef 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
*
@@ -11,6 +12,7 @@
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
+#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 99a5b5f46dc5..e3acead004e6 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* latencytop.c: Latency display infrastructure
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
/*
@@ -67,13 +63,10 @@ static struct latency_record latency_record[MAXLR];
int latencytop_enabled;
-void clear_all_latency_tracing(struct task_struct *p)
+void clear_tsk_latency_tracing(struct task_struct *p)
{
unsigned long flags;
- if (!latencytop_enabled)
- return;
-
raw_spin_lock_irqsave(&latency_lock, flags);
memset(&p->latency_record, 0, sizeof(p->latency_record));
p->latency_record_count = 0;
@@ -96,9 +89,6 @@ account_global_scheduler_latency(struct task_struct *tsk,
int firstnonnull = MAXLR + 1;
int i;
- if (!latencytop_enabled)
- return;
-
/* skip kernel threads for now */
if (!tsk->mm)
return;
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
index ec4565122e65..54102deb50ba 100644
--- a/kernel/livepatch/Kconfig
+++ b/kernel/livepatch/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config HAVE_LIVEPATCH
bool
help
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index b36ceda6488e..cf9b5bcdb952 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_LIVEPATCH) += livepatch.o
livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index f12c0eabd843..c4ce08f43bd6 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* core.c - Kernel Live Patching Core
*
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2014 SUSE
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -30,6 +18,7 @@
#include <linux/elf.h>
#include <linux/moduleloader.h>
#include <linux/completion.h>
+#include <linux/memory.h>
#include <asm/cacheflush.h>
#include "core.h"
#include "patch.h"
@@ -419,6 +408,7 @@ static struct attribute *klp_patch_attrs[] = {
&force_kobj_attr.attr,
NULL
};
+ATTRIBUTE_GROUPS(klp_patch);
static void klp_free_object_dynamic(struct klp_object *obj)
{
@@ -549,7 +539,7 @@ static void klp_kobj_release_patch(struct kobject *kobj)
static struct kobj_type klp_ktype_patch = {
.release = klp_kobj_release_patch,
.sysfs_ops = &kobj_sysfs_ops,
- .default_attrs = klp_patch_attrs,
+ .default_groups = klp_patch_groups,
};
static void klp_kobj_release_object(struct kobject *kobj)
@@ -729,16 +719,21 @@ static int klp_init_object_loaded(struct klp_patch *patch,
struct klp_func *func;
int ret;
+ mutex_lock(&text_mutex);
+
module_disable_ro(patch->mod);
ret = klp_write_object_relocations(patch->mod, obj);
if (ret) {
module_enable_ro(patch->mod, true);
+ mutex_unlock(&text_mutex);
return ret;
}
arch_klp_init_object_loaded(patch, obj);
module_enable_ro(patch->mod, true);
+ mutex_unlock(&text_mutex);
+
klp_for_each_func(obj, func) {
ret = klp_find_object_symbol(obj->name, func->old_name,
func->old_sympos,
@@ -1207,14 +1202,6 @@ void klp_module_going(struct module *mod)
static int __init klp_init(void)
{
- int ret;
-
- ret = klp_check_compiler_support();
- if (ret) {
- pr_info("Your compiler is too old; turning off.\n");
- return -EINVAL;
- }
-
klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj);
if (!klp_root_kobj)
return -ENOMEM;
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 99cb3ad05eb4..bd43537702bd 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* patch.c - livepatch patching functions
*
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2014 SUSE
* Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index 83958c814439..e5c9fb295ba9 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* shadow.c - Shadow Variables
*
* Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
* Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
* Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/**
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 3c764e73b032..cdf318d86dd6 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* transition.c - Kernel Live Patching transition functions
*
* Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 6fe2f333aecb..45452facff3b 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
-obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index feb1acc54611..8c7e7d25f09c 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -31,12 +31,13 @@ enum lock_events {
DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
/*
- * Increment the PV qspinlock statistical counters
+ * Increment the statistical counters. use raw_cpu_inc() because of lower
+ * overhead and we don't care if we loose the occasional update.
*/
static inline void __lockevent_inc(enum lock_events event, bool cond)
{
if (cond)
- __this_cpu_inc(lockevents[event]);
+ raw_cpu_inc(lockevents[event]);
}
#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
@@ -44,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond)
static inline void __lockevent_add(enum lock_events event, int inc)
{
- __this_cpu_add(lockevents[event], inc);
+ raw_cpu_add(lockevents[event], inc);
}
#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index ad7668cfc9da..239039d0ce21 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -56,12 +56,16 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
-LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */
-LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */
+LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */
+LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */
+LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */
+LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */
+LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */
+LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */
LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
-LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */
+LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
-LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */
+LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d06190fa5082..341f52117f88 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/lockdep.c
*
@@ -150,17 +151,28 @@ unsigned long nr_lock_classes;
static
#endif
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
- if (!hlock->class_idx) {
+ unsigned int class_idx = hlock->class_idx;
+
+ /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */
+ barrier();
+
+ if (!test_bit(class_idx, lock_classes_in_use)) {
/*
* Someone passed in garbage, we give up.
*/
DEBUG_LOCKS_WARN_ON(1);
return NULL;
}
- return lock_classes + hlock->class_idx - 1;
+
+ /*
+ * At this point, if the passed hlock->class_idx is still garbage,
+ * we just have to live with it
+ */
+ return lock_classes + class_idx;
}
#ifdef CONFIG_LOCK_STAT
@@ -358,6 +370,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx)
return k0 | (u64)k1 << 32;
}
+void lockdep_init_task(struct task_struct *task)
+{
+ task->lockdep_depth = 0; /* no locks held yet */
+ task->curr_chain_key = INITIAL_CHAIN_KEY;
+ task->lockdep_recursion = 0;
+}
+
void lockdep_off(void)
{
current->lockdep_recursion++;
@@ -418,13 +437,6 @@ static int verbose(struct lock_class *class)
return 0;
}
-/*
- * Stack-trace: tightly packed array of stack backtrace
- * addresses. Protected by the graph_lock.
- */
-unsigned long nr_stack_trace_entries;
-static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
-
static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
@@ -434,6 +446,15 @@ static void print_lockdep_off(const char *bug_msg)
#endif
}
+unsigned long nr_stack_trace_entries;
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+/*
+ * Stack-trace: tightly packed array of stack backtrace
+ * addresses. Protected by the graph_lock.
+ */
+static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+
static int save_trace(struct lock_trace *trace)
{
unsigned long *entries = stack_trace + nr_stack_trace_entries;
@@ -456,6 +477,7 @@ static int save_trace(struct lock_trace *trace)
return 1;
}
+#endif
unsigned int nr_hardirq_chains;
unsigned int nr_softirq_chains;
@@ -469,6 +491,7 @@ unsigned int max_lockdep_depth;
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
/*
* Locking printouts:
*/
@@ -486,6 +509,7 @@ static const char *usage_str[] =
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
};
+#endif
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
{
@@ -499,15 +523,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit)
static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
{
+ /*
+ * The usage character defaults to '.' (i.e., irqs disabled and not in
+ * irq context), which is the safest usage category.
+ */
char c = '.';
- if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
+ /*
+ * The order of the following usage checks matters, which will
+ * result in the outcome character as follows:
+ *
+ * - '+': irq is enabled and not in irq context
+ * - '-': in irq context and irq is disabled
+ * - '?': in irq context and irq is enabled
+ */
+ if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) {
c = '+';
- if (class->usage_mask & lock_flag(bit)) {
- c = '-';
- if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
+ if (class->usage_mask & lock_flag(bit))
c = '?';
- }
+ } else if (class->usage_mask & lock_flag(bit))
+ c = '-';
return c;
}
@@ -571,19 +606,22 @@ static void print_lock(struct held_lock *hlock)
/*
* We can be called locklessly through debug_show_all_locks() so be
* extra careful, the hlock might have been released and cleared.
+ *
+ * If this indeed happens, lets pretend it does not hurt to continue
+ * to print the lock unless the hlock class_idx does not point to a
+ * registered class. The rationale here is: since we don't attempt
+ * to distinguish whether we are in this situation, if it just
+ * happened we can't count on class_idx to tell either.
*/
- unsigned int class_idx = hlock->class_idx;
-
- /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */
- barrier();
+ struct lock_class *lock = hlock_class(hlock);
- if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) {
+ if (!lock) {
printk(KERN_CONT "<RELEASED>\n");
return;
}
printk(KERN_CONT "%p", hlock->instance);
- print_lock_name(lock_classes + class_idx - 1);
+ print_lock_name(lock);
printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
}
@@ -731,7 +769,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
* Huh! same key, different name? Did someone trample
* on some memory? We're most confused.
*/
- WARN_ON_ONCE(class->name != lock->name);
+ WARN_ON_ONCE(class->name != lock->name &&
+ lock->key != &__lockdep_no_validate__);
return class;
}
}
@@ -837,11 +876,11 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
static bool check_lock_chain_key(struct lock_chain *chain)
{
#ifdef CONFIG_PROVE_LOCKING
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
int i;
for (i = chain->base; i < chain->base + chain->depth; i++)
- chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
/*
* The 'unsigned long long' casts avoid that a compiler warning
* is reported when building tools/lib/lockdep.
@@ -1116,6 +1155,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
return NULL;
}
nr_lock_classes++;
+ __set_bit(class - lock_classes, lock_classes_in_use);
debug_atomic_inc(nr_unused_locks);
class->key = key;
class->name = lock->name;
@@ -1227,13 +1267,17 @@ static int add_lock_to_list(struct lock_class *this,
#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
/*
- * The circular_queue and helpers is used to implement the
- * breadth-first search(BFS)algorithem, by which we can build
- * the shortest path from the next lock to be acquired to the
- * previous held lock if there is a circular between them.
+ * The circular_queue and helpers are used to implement graph
+ * breadth-first search (BFS) algorithm, by which we can determine
+ * whether there is a path from a lock to another. In deadlock checks,
+ * a path from the next lock to be acquired to a previous held lock
+ * indicates that adding the <prev> -> <next> lock dependency will
+ * produce a circle in the graph. Breadth-first search instead of
+ * depth-first search is used in order to find the shortest (circular)
+ * path.
*/
struct circular_queue {
- unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
+ struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE];
unsigned int front, rear;
};
@@ -1259,7 +1303,7 @@ static inline int __cq_full(struct circular_queue *cq)
return ((cq->rear + 1) & CQ_MASK) == cq->front;
}
-static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
+static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem)
{
if (__cq_full(cq))
return -1;
@@ -1269,14 +1313,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
return 0;
}
-static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
+/*
+ * Dequeue an element from the circular_queue, return a lock_list if
+ * the queue is not empty, or NULL if otherwise.
+ */
+static inline struct lock_list * __cq_dequeue(struct circular_queue *cq)
{
+ struct lock_list * lock;
+
if (__cq_empty(cq))
- return -1;
+ return NULL;
- *elem = cq->element[cq->front];
+ lock = cq->element[cq->front];
cq->front = (cq->front + 1) & CQ_MASK;
- return 0;
+
+ return lock;
}
static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
@@ -1321,13 +1372,32 @@ static inline int get_lock_depth(struct lock_list *child)
return depth;
}
+/*
+ * Return the forward or backward dependency list.
+ *
+ * @lock: the lock_list to get its class's dependency list
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ */
+static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
+{
+ void *lock_class = lock->class;
+
+ return lock_class + offset;
+}
+
+/*
+ * Forward- or backward-dependency search, used for both circular dependency
+ * checking and hardirq-unsafe/softirq-unsafe checking.
+ */
static int __bfs(struct lock_list *source_entry,
void *data,
int (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry,
- int forward)
+ int offset)
{
struct lock_list *entry;
+ struct lock_list *lock;
struct list_head *head;
struct circular_queue *cq = &lock_cq;
int ret = 1;
@@ -1338,31 +1408,21 @@ static int __bfs(struct lock_list *source_entry,
goto exit;
}
- if (forward)
- head = &source_entry->class->locks_after;
- else
- head = &source_entry->class->locks_before;
-
+ head = get_dep_list(source_entry, offset);
if (list_empty(head))
goto exit;
__cq_init(cq);
- __cq_enqueue(cq, (unsigned long)source_entry);
+ __cq_enqueue(cq, source_entry);
- while (!__cq_empty(cq)) {
- struct lock_list *lock;
-
- __cq_dequeue(cq, (unsigned long *)&lock);
+ while ((lock = __cq_dequeue(cq))) {
if (!lock->class) {
ret = -2;
goto exit;
}
- if (forward)
- head = &lock->class->locks_after;
- else
- head = &lock->class->locks_before;
+ head = get_dep_list(lock, offset);
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -1376,7 +1436,7 @@ static int __bfs(struct lock_list *source_entry,
goto exit;
}
- if (__cq_enqueue(cq, (unsigned long)entry)) {
+ if (__cq_enqueue(cq, entry)) {
ret = -1;
goto exit;
}
@@ -1395,7 +1455,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry,
int (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 1);
+ return __bfs(src_entry, data, match, target_entry,
+ offsetof(struct lock_class, locks_after));
}
@@ -1404,16 +1465,11 @@ static inline int __bfs_backwards(struct lock_list *src_entry,
int (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- return __bfs(src_entry, data, match, target_entry, 0);
+ return __bfs(src_entry, data, match, target_entry,
+ offsetof(struct lock_class, locks_before));
}
-/*
- * Recursive, forwards-direction lock-dependency checking, used for
- * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
- * checking.
- */
-
static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
{
unsigned long *entries = stack_trace + trace->offset;
@@ -1425,16 +1481,15 @@ static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
* Print a dependency chain entry (this is only done when a deadlock
* has been detected):
*/
-static noinline int
+static noinline void
print_circular_bug_entry(struct lock_list *target, int depth)
{
if (debug_locks_silent)
- return 0;
+ return;
printk("\n-> #%u", depth);
print_lock_name(target->class);
printk(KERN_CONT ":\n");
print_lock_trace(&target->trace, 6);
- return 0;
}
static void
@@ -1491,7 +1546,7 @@ print_circular_lock_scenario(struct held_lock *src,
* When a circular dependency is detected, print the
* header first:
*/
-static noinline int
+static noinline void
print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct held_lock *check_src,
struct held_lock *check_tgt)
@@ -1499,7 +1554,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
struct task_struct *curr = current;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("======================================================\n");
@@ -1517,8 +1572,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
pr_warn("\nthe existing dependency chain (in reverse order) is:\n");
print_circular_bug_entry(entry, depth);
-
- return 0;
}
static inline int class_equal(struct lock_list *entry, void *data)
@@ -1526,10 +1579,10 @@ static inline int class_equal(struct lock_list *entry, void *data)
return entry->class == data;
}
-static noinline int print_circular_bug(struct lock_list *this,
- struct lock_list *target,
- struct held_lock *check_src,
- struct held_lock *check_tgt)
+static noinline void print_circular_bug(struct lock_list *this,
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1537,10 +1590,10 @@ static noinline int print_circular_bug(struct lock_list *this,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
if (!save_trace(&this->trace))
- return 0;
+ return;
depth = get_lock_depth(target);
@@ -1562,21 +1615,17 @@ static noinline int print_circular_bug(struct lock_list *this,
printk("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
-static noinline int print_bfs_bug(int ret)
+static noinline void print_bfs_bug(int ret)
{
if (!debug_locks_off_graph_unlock())
- return 0;
+ return;
/*
* Breadth-first-search failed, graph got corrupted?
*/
WARN(1, "lockdep bfs error:%d\n", ret);
-
- return 0;
}
static int noop_count(struct lock_list *entry, void *data)
@@ -1639,36 +1688,95 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
}
/*
- * Prove that the dependency graph starting at <entry> can not
- * lead to <target>. Print an error and return 0 if it does.
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. Print an error and return 0 if it does.
*/
static noinline int
-check_noncircular(struct lock_list *root, struct lock_class *target,
- struct lock_list **target_entry)
+check_path(struct lock_class *target, struct lock_list *src_entry,
+ struct lock_list **target_entry)
{
- int result;
+ int ret;
+
+ ret = __bfs_forwards(src_entry, (void *)target, class_equal,
+ target_entry);
+
+ if (unlikely(ret < 0))
+ print_bfs_bug(ret);
+
+ return ret;
+}
+
+/*
+ * Prove that the dependency graph starting at <src> can not
+ * lead to <target>. If it can, there is a circle when adding
+ * <target> -> <src> dependency.
+ *
+ * Print an error and return 0 if it does.
+ */
+static noinline int
+check_noncircular(struct held_lock *src, struct held_lock *target,
+ struct lock_trace *trace)
+{
+ int ret;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list src_entry = {
+ .class = hlock_class(src),
+ .parent = NULL,
+ };
debug_atomic_inc(nr_cyclic_checks);
- result = __bfs_forwards(root, target, class_equal, target_entry);
+ ret = check_path(hlock_class(target), &src_entry, &target_entry);
- return result;
+ if (unlikely(!ret)) {
+ if (!trace->nr_entries) {
+ /*
+ * If save_trace fails here, the printing might
+ * trigger a WARN but because of the !nr_entries it
+ * should not do bad things.
+ */
+ save_trace(trace);
+ }
+
+ print_circular_bug(&src_entry, target_entry, src, target);
+ }
+
+ return ret;
}
+#ifdef CONFIG_LOCKDEP_SMALL
+/*
+ * Check that the dependency graph starting at <src> can lead to
+ * <target> or not. If it can, <src> -> <target> dependency is already
+ * in the graph.
+ *
+ * Print an error and return 2 if it does or 1 if it does not.
+ */
static noinline int
-check_redundant(struct lock_list *root, struct lock_class *target,
- struct lock_list **target_entry)
+check_redundant(struct held_lock *src, struct held_lock *target)
{
- int result;
+ int ret;
+ struct lock_list *uninitialized_var(target_entry);
+ struct lock_list src_entry = {
+ .class = hlock_class(src),
+ .parent = NULL,
+ };
debug_atomic_inc(nr_redundant_checks);
- result = __bfs_forwards(root, target, class_equal, target_entry);
+ ret = check_path(hlock_class(target), &src_entry, &target_entry);
- return result;
+ if (!ret) {
+ debug_atomic_inc(nr_redundant);
+ ret = 2;
+ } else if (ret < 0)
+ ret = 0;
+
+ return ret;
}
+#endif
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+#ifdef CONFIG_TRACE_IRQFLAGS
static inline int usage_accumulate(struct lock_list *entry, void *mask)
{
@@ -1765,7 +1873,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
*/
static void __used
print_shortest_lock_dependencies(struct lock_list *leaf,
- struct lock_list *root)
+ struct lock_list *root)
{
struct lock_list *entry = leaf;
int depth;
@@ -1787,8 +1895,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
entry = get_lock_parent(entry);
depth--;
} while (entry && (depth >= 0));
-
- return;
}
static void
@@ -1847,7 +1953,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry,
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_bad_irq_dependency(struct task_struct *curr,
struct lock_list *prev_root,
struct lock_list *next_root,
@@ -1860,7 +1966,7 @@ print_bad_irq_dependency(struct task_struct *curr,
const char *irqclass)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("=====================================================\n");
@@ -1906,19 +2012,17 @@ print_bad_irq_dependency(struct task_struct *curr,
pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass);
if (!save_trace(&prev_root->trace))
- return 0;
+ return;
print_shortest_lock_dependencies(backwards_entry, prev_root);
pr_warn("\nthe dependencies between the lock to be acquired");
pr_warn(" and %s-irq-unsafe lock:\n", irqclass);
if (!save_trace(&next_root->trace))
- return 0;
+ return;
print_shortest_lock_dependencies(forwards_entry, next_root);
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static const char *state_names[] = {
@@ -2065,8 +2169,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
this.class = hlock_class(prev);
ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
usage_mask &= LOCKF_USED_IN_IRQ_ALL;
if (!usage_mask)
@@ -2082,8 +2188,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
that.class = hlock_class(next);
ret = find_usage_forwards(&that, forward_mask, &target_entry1);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (ret == 1)
return ret;
@@ -2095,8 +2203,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
backward_mask = original_mask(target_entry1->class->usage_mask);
ret = find_usage_backwards(&this, backward_mask, &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (DEBUG_LOCKS_WARN_ON(ret == 1))
return 1;
@@ -2110,11 +2220,13 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
if (DEBUG_LOCKS_WARN_ON(ret == -1))
return 1;
- return print_bad_irq_dependency(curr, &this, &that,
- target_entry, target_entry1,
- prev, next,
- backward_bit, forward_bit,
- state_name(backward_bit));
+ print_bad_irq_dependency(curr, &this, &that,
+ target_entry, target_entry1,
+ prev, next,
+ backward_bit, forward_bit,
+ state_name(backward_bit));
+
+ return 0;
}
static void inc_chains(void)
@@ -2142,11 +2254,10 @@ static inline void inc_chains(void)
nr_process_chains++;
}
-#endif
+#endif /* CONFIG_TRACE_IRQFLAGS */
static void
-print_deadlock_scenario(struct held_lock *nxt,
- struct held_lock *prv)
+print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv)
{
struct lock_class *next = hlock_class(nxt);
struct lock_class *prev = hlock_class(prv);
@@ -2164,12 +2275,12 @@ print_deadlock_scenario(struct held_lock *nxt,
printk(" May be due to missing lock nesting notation\n\n");
}
-static int
+static void
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("============================================\n");
@@ -2188,8 +2299,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
/*
@@ -2201,8 +2310,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
* Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
*/
static int
-check_deadlock(struct task_struct *curr, struct held_lock *next,
- struct lockdep_map *next_instance, int read)
+check_deadlock(struct task_struct *curr, struct held_lock *next)
{
struct held_lock *prev;
struct held_lock *nest = NULL;
@@ -2221,7 +2329,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
* Allow read-after-read recursion of the same
* lock class (i.e. read_lock(lock)+read_lock(lock)):
*/
- if ((read == 2) && prev->read)
+ if ((next->read == 2) && prev->read)
return 2;
/*
@@ -2231,14 +2339,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
if (nest)
return 2;
- return print_deadlock_bug(curr, prev, next);
+ print_deadlock_bug(curr, prev, next);
+ return 0;
}
return 1;
}
/*
* There was a chain-cache miss, and we are about to add a new dependency
- * to a previous lock. We recursively validate the following rules:
+ * to a previous lock. We validate the following rules:
*
* - would the adding of the <prev> -> <next> dependency create a
* circular dependency in the graph? [== circular deadlock]
@@ -2262,9 +2371,7 @@ static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next, int distance, struct lock_trace *trace)
{
- struct lock_list *uninitialized_var(target_entry);
struct lock_list *entry;
- struct lock_list this;
int ret;
if (!hlock_class(prev)->key || !hlock_class(next)->key) {
@@ -2288,28 +2395,16 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
/*
* Prove that the new <prev> -> <next> dependency would not
* create a circular dependency in the graph. (We do this by
- * forward-recursing into the graph starting at <next>, and
- * checking whether we can reach <prev>.)
+ * a breadth-first search into the graph starting at <next>,
+ * and check whether we can reach <prev>.)
*
- * We are using global variables to control the recursion, to
- * keep the stackframe size of the recursive functions low:
+ * The search is limited by the size of the circular queue (i.e.,
+ * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes
+ * in the graph whose neighbours are to be checked.
*/
- this.class = hlock_class(next);
- this.parent = NULL;
- ret = check_noncircular(&this, hlock_class(prev), &target_entry);
- if (unlikely(!ret)) {
- if (!trace->nr_entries) {
- /*
- * If save_trace fails here, the printing might
- * trigger a WARN but because of the !nr_entries it
- * should not do bad things.
- */
- save_trace(trace);
- }
- return print_circular_bug(&this, target_entry, next, prev);
- }
- else if (unlikely(ret < 0))
- return print_bfs_bug(ret);
+ ret = check_noncircular(next, prev, trace);
+ if (unlikely(ret <= 0))
+ return 0;
if (!check_irq_usage(curr, prev, next))
return 0;
@@ -2340,19 +2435,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
+#ifdef CONFIG_LOCKDEP_SMALL
/*
* Is the <prev> -> <next> link redundant?
*/
- this.class = hlock_class(prev);
- this.parent = NULL;
- ret = check_redundant(&this, hlock_class(next), &target_entry);
- if (!ret) {
- debug_atomic_inc(nr_redundant);
- return 2;
- }
- if (ret < 0)
- return print_bfs_bug(ret);
-
+ ret = check_redundant(prev, next);
+ if (ret != 1)
+ return ret;
+#endif
if (!trace->nr_entries && !save_trace(trace))
return 0;
@@ -2504,12 +2594,13 @@ static void
print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next)
{
struct held_lock *hlock;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
int depth = curr->lockdep_depth;
- int i;
+ int i = get_first_held_lock(curr, hlock_next);
- printk("depth: %u\n", depth + 1);
- for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) {
+ printk("depth: %u (irq_context %u)\n", depth - i + 1,
+ hlock_next->irq_context);
+ for (; i < depth; i++) {
hlock = curr->held_locks + i;
chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
@@ -2523,13 +2614,13 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne
static void print_chain_keys_chain(struct lock_chain *chain)
{
int i;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
int class_id;
printk("depth: %u\n", chain->depth);
for (i = 0; i < chain->depth; i++) {
class_id = chain_hlocks[chain->base + i];
- chain_key = print_chain_key_iteration(class_id + 1, chain_key);
+ chain_key = print_chain_key_iteration(class_id, chain_key);
print_lock_name(lock_classes + class_id);
printk("\n");
@@ -2580,7 +2671,7 @@ static int check_no_collision(struct task_struct *curr,
}
for (j = 0; j < chain->depth - 1; j++, i++) {
- id = curr->held_locks[i].class_idx - 1;
+ id = curr->held_locks[i].class_idx;
if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
print_collision(curr, hlock, chain);
@@ -2663,7 +2754,7 @@ static inline int add_chain_cache(struct task_struct *curr,
if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = nr_chain_hlocks;
for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class_idx - 1;
+ int lock_id = curr->held_locks[i].class_idx;
chain_hlocks[chain->base + j] = lock_id;
}
chain_hlocks[chain->base + j] = class - lock_classes;
@@ -2753,8 +2844,9 @@ cache_hit:
return 1;
}
-static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
- struct held_lock *hlock, int chain_head, u64 chain_key)
+static int validate_chain(struct task_struct *curr,
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
/*
* Trylock needs to maintain the stack of held locks, but it
@@ -2775,12 +2867,18 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* - is softirq-safe, if this lock is hardirq-unsafe
*
* And check whether the new lock's dependency graph
- * could lead back to the previous lock.
+ * could lead back to the previous lock:
*
- * any of these scenarios could lead to a deadlock. If
- * All validations
+ * - within the current held-lock stack
+ * - across our accumulated lock dependency records
+ *
+ * any of these scenarios could lead to a deadlock.
*/
- int ret = check_deadlock(curr, hlock, lock, hlock->read);
+ /*
+ * The simple case: does the current hold the same lock
+ * already?
+ */
+ int ret = check_deadlock(curr, hlock);
if (!ret)
return 0;
@@ -2811,16 +2909,12 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
}
#else
static inline int validate_chain(struct task_struct *curr,
- struct lockdep_map *lock, struct held_lock *hlock,
- int chain_head, u64 chain_key)
+ struct held_lock *hlock,
+ int chain_head, u64 chain_key)
{
return 1;
}
-
-static void print_lock_trace(struct lock_trace *trace, unsigned int spaces)
-{
-}
-#endif
+#endif /* CONFIG_PROVE_LOCKING */
/*
* We are building curr_chain_key incrementally, so double-check
@@ -2831,7 +2925,7 @@ static void check_chain_key(struct task_struct *curr)
#ifdef CONFIG_DEBUG_LOCKDEP
struct held_lock *hlock, *prev_hlock = NULL;
unsigned int i;
- u64 chain_key = 0;
+ u64 chain_key = INITIAL_CHAIN_KEY;
for (i = 0; i < curr->lockdep_depth; i++) {
hlock = curr->held_locks + i;
@@ -2847,15 +2941,17 @@ static void check_chain_key(struct task_struct *curr)
(unsigned long long)hlock->prev_chain_key);
return;
}
+
/*
- * Whoops ran out of static storage again?
+ * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is
+ * it registered lock class index?
*/
- if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use)))
return;
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
chain_key = iterate_chain_key(chain_key, hlock->class_idx);
prev_hlock = hlock;
}
@@ -2873,14 +2969,11 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit);
-#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
-
-
-static void
-print_usage_bug_scenario(struct held_lock *lock)
+static void print_usage_bug_scenario(struct held_lock *lock)
{
struct lock_class *class = hlock_class(lock);
@@ -2897,12 +2990,12 @@ print_usage_bug_scenario(struct held_lock *lock)
printk("\n *** DEADLOCK ***\n\n");
}
-static int
+static void
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
{
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("================================\n");
@@ -2932,8 +3025,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
/*
@@ -2943,8 +3034,10 @@ static inline int
valid_state(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
{
- if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
- return print_usage_bug(curr, this, bad_bit, new_bit);
+ if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) {
+ print_usage_bug(curr, this, bad_bit, new_bit);
+ return 0;
+ }
return 1;
}
@@ -2952,7 +3045,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
/*
* print irq inversion bug:
*/
-static int
+static void
print_irq_inversion_bug(struct task_struct *curr,
struct lock_list *root, struct lock_list *other,
struct held_lock *this, int forwards,
@@ -2963,7 +3056,7 @@ print_irq_inversion_bug(struct task_struct *curr,
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("========================================================\n");
@@ -3004,13 +3097,11 @@ print_irq_inversion_bug(struct task_struct *curr,
pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
if (!save_trace(&root->trace))
- return 0;
+ return;
print_shortest_lock_dependencies(other, root);
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
/*
@@ -3028,13 +3119,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
root.parent = NULL;
root.class = hlock_class(this);
ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 1, irqclass);
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, irqclass);
+ return 0;
}
/*
@@ -3052,13 +3146,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
root.parent = NULL;
root.class = hlock_class(this);
ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0)
- return print_bfs_bug(ret);
+ if (ret < 0) {
+ print_bfs_bug(ret);
+ return 0;
+ }
if (ret == 1)
return ret;
- return print_irq_inversion_bug(curr, &root, target_entry,
- this, 0, irqclass);
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, irqclass);
+ return 0;
}
void print_irqtrace_events(struct task_struct *curr)
@@ -3141,7 +3238,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
* Validate that the lock dependencies don't have conflicting usage
* states.
*/
- if ((!read || !dir || STRICT_READ_CHECKS) &&
+ if ((!read || STRICT_READ_CHECKS) &&
!usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
return 0;
@@ -3366,8 +3463,12 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(redundant_softirqs_off);
}
-static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
+static int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
{
+ if (!check)
+ goto lock_used;
+
/*
* If non-trylock use in a hardirq or softirq context, then
* mark the lock as used in these contexts:
@@ -3411,6 +3512,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
}
}
+lock_used:
+ /* mark it as used: */
+ if (!mark_lock(curr, hlock, LOCK_USED))
+ return 0;
+
return 1;
}
@@ -3442,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr,
return 0;
}
-#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
-static inline
-int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit)
-{
- WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
- return 1;
-}
-
-static inline int mark_irqflags(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 1;
-}
-
-static inline unsigned int task_irq_context(struct task_struct *task)
-{
- return 0;
-}
-
-static inline int separate_irq_context(struct task_struct *curr,
- struct held_lock *hlock)
-{
- return 0;
-}
-
-#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
-
/*
* Mark a lock with a usage bit, and validate the state transition:
*/
@@ -3479,6 +3556,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
{
unsigned int new_mask = 1 << new_bit, ret = 1;
+ if (new_bit >= LOCK_USAGE_STATES) {
+ DEBUG_LOCKS_WARN_ON(1);
+ return 0;
+ }
+
/*
* If already set then do not dirty the cacheline,
* nor do any checks:
@@ -3502,25 +3584,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return 0;
switch (new_bit) {
-#define LOCKDEP_STATE(__STATE) \
- case LOCK_USED_IN_##__STATE: \
- case LOCK_USED_IN_##__STATE##_READ: \
- case LOCK_ENABLED_##__STATE: \
- case LOCK_ENABLED_##__STATE##_READ:
-#include "lockdep_states.h"
-#undef LOCKDEP_STATE
- ret = mark_lock_irq(curr, this, new_bit);
- if (!ret)
- return 0;
- break;
case LOCK_USED:
debug_atomic_dec(nr_unused_locks);
break;
default:
- if (!debug_locks_off_graph_unlock())
+ ret = mark_lock_irq(curr, this, new_bit);
+ if (!ret)
return 0;
- WARN_ON(1);
- return 0;
}
graph_unlock();
@@ -3538,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
+#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
+static inline int
+mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
+{
+ return 1;
+}
+
+static inline unsigned int task_irq_context(struct task_struct *task)
+{
+ return 0;
+}
+
+static inline int separate_irq_context(struct task_struct *curr,
+ struct held_lock *hlock)
+{
+ return 0;
+}
+
+#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
+
/*
* Initialize a lock instance's lock-class mapping info:
*/
@@ -3601,15 +3692,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
struct lock_class_key __lockdep_no_validate__;
EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
-static int
+static void
print_lock_nested_lock_not_held(struct task_struct *curr,
struct held_lock *hlock,
unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("==================================\n");
@@ -3631,8 +3722,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static int __lock_is_held(const struct lockdep_map *lock, int read);
@@ -3697,24 +3786,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
return 0;
- class_idx = class - lock_classes + 1;
+ class_idx = class - lock_classes;
if (depth) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
- if (hlock->references) {
- /*
- * Check: unsigned int references:12, overflow.
- */
- if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1))
- return 0;
+ if (!references)
+ references++;
+ if (!hlock->references)
hlock->references++;
- } else {
- hlock->references = 2;
- }
- return 1;
+ hlock->references += references;
+
+ /* Overflow */
+ if (DEBUG_LOCKS_WARN_ON(hlock->references < references))
+ return 0;
+
+ return 2;
}
}
@@ -3741,11 +3830,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
hlock->pin_count = pin_count;
- if (check && !mark_irqflags(curr, hlock))
- return 0;
-
- /* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED))
+ /* Initialize the lock usage bit */
+ if (!mark_usage(curr, hlock, check))
return 0;
/*
@@ -3759,9 +3845,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* the hash, not class->key.
*/
/*
- * Whoops, we did it again.. ran straight out of our static allocation.
+ * Whoops, we did it again.. class_idx is invalid.
*/
- if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
+ if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use)))
return 0;
chain_key = curr->curr_chain_key;
@@ -3769,27 +3855,29 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
/*
* How can we have a chain hash when we ain't got no keys?!
*/
- if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+ if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY))
return 0;
chain_head = 1;
}
hlock->prev_chain_key = chain_key;
if (separate_irq_context(curr, hlock)) {
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
chain_head = 1;
}
chain_key = iterate_chain_key(chain_key, class_idx);
- if (nest_lock && !__lock_is_held(nest_lock, -1))
- return print_lock_nested_lock_not_held(curr, hlock, ip);
+ if (nest_lock && !__lock_is_held(nest_lock, -1)) {
+ print_lock_nested_lock_not_held(curr, hlock, ip);
+ return 0;
+ }
if (!debug_locks_silent) {
WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key);
WARN_ON_ONCE(!hlock_class(hlock)->key);
}
- if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+ if (!validate_chain(curr, hlock, chain_head, chain_key))
return 0;
curr->curr_chain_key = chain_key;
@@ -3818,14 +3906,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
return 1;
}
-static int
-print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_unlock_imbalance_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("=====================================\n");
@@ -3843,8 +3931,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static int match_held_lock(const struct held_lock *hlock,
@@ -3876,7 +3962,7 @@ static int match_held_lock(const struct held_lock *hlock,
if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
return 0;
- if (hlock->class_idx == class - lock_classes + 1)
+ if (hlock->class_idx == class - lock_classes)
return 1;
}
@@ -3920,22 +4006,33 @@ out:
}
static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
- int idx)
+ int idx, unsigned int *merged)
{
struct held_lock *hlock;
+ int first_idx = idx;
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
- if (!__lock_acquire(hlock->instance,
+ switch (__lock_acquire(hlock->instance,
hlock_class(hlock)->subclass,
hlock->trylock,
hlock->read, hlock->check,
hlock->hardirqs_off,
hlock->nest_lock, hlock->acquire_ip,
- hlock->references, hlock->pin_count))
+ hlock->references, hlock->pin_count)) {
+ case 0:
return 1;
+ case 1:
+ break;
+ case 2:
+ *merged += (idx == first_idx);
+ break;
+ default:
+ WARN_ON(1);
+ return 0;
+ }
}
return 0;
}
@@ -3946,9 +4043,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 0;
struct held_lock *hlock;
struct lock_class *class;
- unsigned int depth;
int i;
if (unlikely(!debug_locks))
@@ -3963,24 +4060,26 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
return 0;
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
lockdep_init_map(lock, name, key, 0);
class = register_lock_class(lock, subclass, 0);
- hlock->class_idx = class - lock_classes + 1;
+ hlock->class_idx = class - lock_classes;
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
- if (reacquire_held_locks(curr, depth, i))
+ if (reacquire_held_locks(curr, depth, i, &merged))
return 0;
/*
* I took it apart and put it back together again, except now I have
* these 'spare' parts.. where shall I put them.
*/
- if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged))
return 0;
return 1;
}
@@ -3988,8 +4087,8 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 0;
struct held_lock *hlock;
- unsigned int depth;
int i;
if (unlikely(!debug_locks))
@@ -4004,8 +4103,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
return 0;
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
curr->lockdep_depth = i;
curr->curr_chain_key = hlock->prev_chain_key;
@@ -4014,7 +4115,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
hlock->read = 1;
hlock->acquire_ip = ip;
- if (reacquire_held_locks(curr, depth, i))
+ if (reacquire_held_locks(curr, depth, i, &merged))
+ return 0;
+
+ /* Merging can't happen with unchanged classes.. */
+ if (DEBUG_LOCKS_WARN_ON(merged))
return 0;
/*
@@ -4023,6 +4128,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
*/
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
return 0;
+
return 1;
}
@@ -4034,11 +4140,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip)
* @nested is an hysterical artifact, needs a tree wide cleanup.
*/
static int
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+__lock_release(struct lockdep_map *lock, unsigned long ip)
{
struct task_struct *curr = current;
+ unsigned int depth, merged = 1;
struct held_lock *hlock;
- unsigned int depth;
int i;
if (unlikely(!debug_locks))
@@ -4049,16 +4155,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
* So we're all set to release this lock.. wait what lock? We don't
* own any locks, you've been drinking again?
*/
- if (DEBUG_LOCKS_WARN_ON(depth <= 0))
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (depth <= 0) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
/*
* Check whether the lock exists in the current stack
* of held locks:
*/
hlock = find_held_lock(curr, lock, depth, &i);
- if (!hlock)
- return print_unlock_imbalance_bug(curr, lock, ip);
+ if (!hlock) {
+ print_unlock_imbalance_bug(curr, lock, ip);
+ return 0;
+ }
if (hlock->instance == lock)
lock_release_holdtime(hlock);
@@ -4093,14 +4203,15 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
if (i == depth-1)
return 1;
- if (reacquire_held_locks(curr, depth, i + 1))
+ if (reacquire_held_locks(curr, depth, i + 1, &merged))
return 0;
/*
* We had N bottles of beer on the wall, we drank one, but now
* there's not N-1 bottles of beer left on the wall...
+ * Pouring two of the bottles together is acceptable.
*/
- DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1);
+ DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged);
/*
* Since reacquire_held_locks() would have called check_chain_key()
@@ -4318,7 +4429,7 @@ void lock_release(struct lockdep_map *lock, int nested,
check_flags(flags);
current->lockdep_recursion = 1;
trace_lock_release(lock, ip);
- if (__lock_release(lock, nested, ip))
+ if (__lock_release(lock, ip))
check_chain_key(current);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
@@ -4401,14 +4512,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
EXPORT_SYMBOL_GPL(lock_unpin_lock);
#ifdef CONFIG_LOCK_STAT
-static int
-print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
+static void print_lock_contention_bug(struct task_struct *curr,
+ struct lockdep_map *lock,
+ unsigned long ip)
{
if (!debug_locks_off())
- return 0;
+ return;
if (debug_locks_silent)
- return 0;
+ return;
pr_warn("\n");
pr_warn("=================================\n");
@@ -4426,8 +4537,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
pr_warn("\nstack backtrace:\n");
dump_stack();
-
- return 0;
}
static void
@@ -4572,9 +4681,7 @@ void lockdep_reset(void)
int i;
raw_local_irq_save(flags);
- current->curr_chain_key = 0;
- current->lockdep_depth = 0;
- current->lockdep_recursion = 0;
+ lockdep_init_task(current);
memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
nr_hardirq_chains = 0;
nr_softirq_chains = 0;
@@ -4614,9 +4721,9 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
return;
recalc:
- chain_key = 0;
+ chain_key = INITIAL_CHAIN_KEY;
for (i = chain->base; i < chain->base + chain->depth; i++)
- chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1);
+ chain_key = iterate_chain_key(chain_key, chain_hlocks[i]);
if (chain->depth && chain->chain_key == chain_key)
return;
/* Overwrite the chain key for concurrent RCU readers. */
@@ -4690,6 +4797,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
WRITE_ONCE(class->key, NULL);
WRITE_ONCE(class->name, NULL);
nr_lock_classes--;
+ __clear_bit(class - lock_classes, lock_classes_in_use);
} else {
WARN_ONCE(true, "%s() failed for class %s\n", __func__,
class->name);
@@ -5035,6 +5143,7 @@ void __init lockdep_init(void)
printk(" memory used by lock dependency info: %zu kB\n",
(sizeof(lock_classes) +
+ sizeof(lock_classes_in_use) +
sizeof(classhash_table) +
sizeof(list_entries) +
sizeof(list_entries_in_use) +
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index 150ec3f0c5b5..cc83568d5012 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -131,7 +131,6 @@ extern unsigned int nr_hardirq_chains;
extern unsigned int nr_softirq_chains;
extern unsigned int nr_process_chains;
extern unsigned int max_lockdep_depth;
-extern unsigned int max_recursion_depth;
extern unsigned int max_bfs_queue_depth;
@@ -160,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class)
* and we want to avoid too much cache bouncing.
*/
struct lockdep_stats {
- int chain_lookup_hits;
- int chain_lookup_misses;
- int hardirqs_on_events;
- int hardirqs_off_events;
- int redundant_hardirqs_on;
- int redundant_hardirqs_off;
- int softirqs_on_events;
- int softirqs_off_events;
- int redundant_softirqs_on;
- int redundant_softirqs_off;
- int nr_unused_locks;
- int nr_redundant_checks;
- int nr_redundant;
- int nr_cyclic_checks;
- int nr_cyclic_check_recursions;
- int nr_find_usage_forwards_checks;
- int nr_find_usage_forwards_recursions;
- int nr_find_usage_backwards_checks;
- int nr_find_usage_backwards_recursions;
+ unsigned long chain_lookup_hits;
+ unsigned int chain_lookup_misses;
+ unsigned long hardirqs_on_events;
+ unsigned long hardirqs_off_events;
+ unsigned long redundant_hardirqs_on;
+ unsigned long redundant_hardirqs_off;
+ unsigned long softirqs_on_events;
+ unsigned long softirqs_off_events;
+ unsigned long redundant_softirqs_on;
+ unsigned long redundant_softirqs_off;
+ int nr_unused_locks;
+ unsigned int nr_redundant_checks;
+ unsigned int nr_redundant;
+ unsigned int nr_cyclic_checks;
+ unsigned int nr_find_usage_forwards_checks;
+ unsigned int nr_find_usage_backwards_checks;
/*
* Per lock class locking operation stat counts
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 80a463d31a8d..c513031cd7e3 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -975,7 +975,7 @@ static int __init lock_torture_init(void)
goto unwind;
}
if (stutter > 0) {
- firsterr = torture_stutter_init(stutter);
+ firsterr = torture_stutter_init(stutter, stutter);
if (firsterr)
goto unwind;
}
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index db578783dd36..0c601ae072b3 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* kernel/locking/mutex.c
*
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index f17dad99eec8..364d38a0c444 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/atomic.h>
#include <linux/rwsem.h>
#include <linux/percpu.h>
@@ -17,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
return -ENOMEM;
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ rcu_sync_init(&sem->rss);
__init_rwsem(&sem->rw_sem, name, rwsem_key);
rcuwait_init(&sem->writer);
sem->readers_block = 0;
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index c7471c3fb798..fe9ca92faa2a 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -1,16 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Queued read/write locks
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
*
* Authors: Waiman Long <waiman.long@hp.com>
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index e14b32c69639..2473f10c6956 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -1,16 +1