Diffstat (limited to 'kernel'): 262 files changed, 16324 insertions, 7685 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index ce77f0265660..c2f1fd95a821 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -96,8 +96,9 @@ config PREEMPTION config PREEMPT_DYNAMIC bool "Preemption behaviour defined on boot" depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT + select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY select PREEMPT_BUILD - default y + default y if HAVE_PREEMPT_DYNAMIC_CALL help This option allows to define the preemption model on the kernel command line parameter and thus override the default preemption diff --git a/kernel/Makefile b/kernel/Makefile index 186c49582f45..847a82bfe0e3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -67,6 +67,7 @@ obj-y += up.o endif obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -107,12 +108,14 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ +obj-$(CONFIG_RETHOOK) += trace/ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o obj-$(CONFIG_BPF) += bpf/ obj-$(CONFIG_KCSAN) += kcsan/ obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o -obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o +obj-$(CONFIG_HAVE_STATIC_CALL) += static_call.o +obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call_inline.o obj-$(CONFIG_CFI_CLANG) += cfi.o obj-$(CONFIG_PERF_EVENTS) += events/ diff --git a/kernel/async.c b/kernel/async.c index b8d7a663497f..b2c4ba5686ee 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data, atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); - /* mark that this task has queued an async job, used by module init */ - current->flags |= PF_USED_ASYNC; - /* schedule for execution */ queue_work_node(node, system_unbound_wq, &entry->work); diff --git a/kernel/audit.c b/kernel/audit.c index 121d37e700a6..7690c29d4ee4 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb) /** * kauditd_rehold_skb - Handle a audit record send failure in the hold queue * @skb: audit record + * @error: error code (unused) * * Description: * This should only be used by the kauditd_thread when it fails to flush the * hold queue. */ -static void kauditd_rehold_skb(struct sk_buff *skb) +static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error) { - /* put the record back in the queue at the same place */ - skb_queue_head(&audit_hold_queue, skb); + /* put the record back in the queue */ + skb_queue_tail(&audit_hold_queue, skb); } /** * kauditd_hold_skb - Queue an audit record, waiting for auditd * @skb: audit record + * @error: error code * * Description: * Queue the audit record, waiting for an instance of auditd. When this @@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb) * and queue it, if we have room. If we want to hold on to the record, but we * don't have room, record a record lost message. 
*/ -static void kauditd_hold_skb(struct sk_buff *skb) +static void kauditd_hold_skb(struct sk_buff *skb, int error) { /* at this point it is uncertain if we will ever send this to auditd so * try to send the message via printk before we go any further */ kauditd_printk_skb(skb); /* can we just silently drop the message? */ - if (!audit_default) { - kfree_skb(skb); - return; + if (!audit_default) + goto drop; + + /* the hold queue is only for when the daemon goes away completely, + * not -EAGAIN failures; if we are in a -EAGAIN state requeue the + * record on the retry queue unless it's full, in which case drop it + */ + if (error == -EAGAIN) { + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + audit_log_lost("kauditd retry queue overflow"); + goto drop; } - /* if we have room, queue the message */ + /* if we have room in the hold queue, queue the message */ if (!audit_backlog_limit || skb_queue_len(&audit_hold_queue) < audit_backlog_limit) { skb_queue_tail(&audit_hold_queue, skb); @@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb) /* we have no other options - drop the message */ audit_log_lost("kauditd hold queue overflow"); +drop: kfree_skb(skb); } /** * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd * @skb: audit record + * @error: error code (unused) * * Description: * Not as serious as kauditd_hold_skb() as we still have a connected auditd, * but for some reason we are having problems sending it audit records so * queue the given record and attempt to resend. */ -static void kauditd_retry_skb(struct sk_buff *skb) +static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error) { - /* NOTE: because records should only live in the retry queue for a - * short period of time, before either being sent or moved to the hold - * queue, we don't currently enforce a limit on this queue */ - skb_queue_tail(&audit_retry_queue, skb); + if (!audit_backlog_limit || + skb_queue_len(&audit_retry_queue) < audit_backlog_limit) { + skb_queue_tail(&audit_retry_queue, skb); + return; + } + + /* we have to drop the record, send it via printk as a last effort */ + kauditd_printk_skb(skb); + audit_log_lost("kauditd retry queue overflow"); + kfree_skb(skb); } /** @@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac) /* flush the retry queue to the hold queue, but don't touch the main * queue since we need to process that normally for multicast */ while ((skb = skb_dequeue(&audit_retry_queue))) - kauditd_hold_skb(skb); + kauditd_hold_skb(skb, -ECONNREFUSED); } /** @@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, struct sk_buff_head *queue, unsigned int retry_limit, void (*skb_hook)(struct sk_buff *skb), - void (*err_hook)(struct sk_buff *skb)) + void (*err_hook)(struct sk_buff *skb, int error)) { int rc = 0; - struct sk_buff *skb; - static unsigned int failed = 0; + struct sk_buff *skb = NULL; + struct sk_buff *skb_tail; + unsigned int failed = 0; /* NOTE: kauditd_thread takes care of all our locking, we just use * the netlink info passed to us (e.g. sk and portid) */ - while ((skb = skb_dequeue(queue))) { + skb_tail = skb_peek_tail(queue); + while ((skb != skb_tail) && (skb = skb_dequeue(queue))) { /* call the skb_hook for each skb we touch */ if (skb_hook) (*skb_hook)(skb); @@ -731,36 +755,34 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, /* can we send to anyone via unicast? 
*/ if (!sk) { if (err_hook) - (*err_hook)(skb); + (*err_hook)(skb, -ECONNREFUSED); continue; } +retry: /* grab an extra skb reference in case of error */ skb_get(skb); rc = netlink_unicast(sk, skb, portid, 0); if (rc < 0) { - /* fatal failure for our queue flush attempt? */ + /* send failed - try a few times unless fatal error */ if (++failed >= retry_limit || rc == -ECONNREFUSED || rc == -EPERM) { - /* yes - error processing for the queue */ sk = NULL; if (err_hook) - (*err_hook)(skb); - if (!skb_hook) - goto out; - /* keep processing with the skb_hook */ + (*err_hook)(skb, rc); + if (rc == -EAGAIN) + rc = 0; + /* continue to drain the queue */ continue; } else - /* no - requeue to preserve ordering */ - skb_queue_head(queue, skb); + goto retry; } else { - /* it worked - drop the extra reference and continue */ + /* skb sent - drop the extra reference and continue */ consume_skb(skb); failed = 0; } } -out: return (rc >= 0 ? 0 : rc); } @@ -1446,7 +1468,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err) return err; } - sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); + sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL); if (!sig_data) { if (audit_sig_sid) security_release_secctx(ctx, len); @@ -1459,7 +1481,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) security_release_secctx(ctx, len); } audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, - sig_data, sizeof(*sig_data) + len); + sig_data, struct_size(sig_data, ctx, len)); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -1542,6 +1564,20 @@ static void audit_receive(struct sk_buff *skb) nlh = nlmsg_next(nlh, &len); } audit_ctl_unlock(); + + /* can't block with the ctrl lock, so penalize the sender now */ + if (audit_backlog_limit && + (skb_queue_len(&audit_queue) > audit_backlog_limit)) { + DECLARE_WAITQUEUE(wait, current); + + /* wake kauditd to try and flush the queue */ + wake_up_interruptible(&kauditd_wait); + + add_wait_queue_exclusive(&audit_backlog_wait, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(audit_backlog_wait_time); + remove_wait_queue(&audit_backlog_wait, &wait); + } } /* Log information about who is connecting to the audit multicast socket */ @@ -1609,7 +1645,8 @@ static int __net_init audit_net_init(struct net *net) audit_panic("cannot initialize netlink socket in namespace"); return -ENOMEM; } - aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + /* limit the timeout in case auditd is blocked/stopped */ + aunet->sk->sk_sndtimeo = HZ / 10; return 0; } @@ -1825,7 +1862,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, * task_tgid_vnr() since auditd_pid is set in audit_receive_msg() * using a PID anchored in the caller's namespace * 2. 
generator holding the audit_cmd_mutex - we don't want to block - * while holding the mutex */ + * while holding the mutex, although we do penalize the sender + * later in audit_receive() when it is safe to block + */ if (!(auditd_test_task(current) || audit_ctl_owner_current())) { long stime = audit_backlog_wait_time; @@ -2132,7 +2171,7 @@ int audit_log_task_context(struct audit_buffer *ab) int error; u32 sid; - security_task_getsecid_subj(current, &sid); + security_current_getsecid_subj(&sid); if (!sid) return 0; @@ -2353,7 +2392,7 @@ int audit_signal_info(int sig, struct task_struct *t) audit_sig_uid = auid; else audit_sig_uid = uid; - security_task_getsecid_subj(current, &audit_sig_sid); + security_current_getsecid_subj(&audit_sig_sid); } return audit_signal_info_syscall(t); diff --git a/kernel/audit.h b/kernel/audit.h index c4498090a5bd..58b66543b4d5 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -201,6 +201,10 @@ struct audit_context { struct { char *name; } module; + struct { + struct audit_ntp_data ntp_data; + struct timespec64 tk_injoffset; + } time; }; int fds[2]; struct audit_proctitle proctitle; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 72324afcffef..e7315d487163 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s) { struct audit_tree *tree; - tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL); + tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL); if (tree) { refcount_set(&tree->count, 1); tree->goner = 0; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d75acb014ccd..42d99896e7a6 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -637,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) void *bufp; int i; - data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); + data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL); if (unlikely(!data)) return NULL; memset(data, 0, sizeof(*data)); @@ -1092,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q) break; skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1, data, - sizeof(*data) + data->buflen); + struct_size(data, buf, data->buflen)); if (skb) skb_queue_tail(q, skb); kfree(data); @@ -1368,8 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype) case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: if (f->lsm_rule) { - security_task_getsecid_subj(current, - &sid); + security_current_getsecid_subj(&sid); result = security_audit_rule_match(sid, f->type, f->op, f->lsm_rule); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b517947bfa48..ea2ee1181921 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -185,7 +185,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask) case AUDITSC_EXECVE: return mask & AUDIT_PERM_EXEC; case AUDITSC_OPENAT2: - return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags); + return mask & ACC_MODE((u32)ctx->openat2.flags); default: return 0; } @@ -666,7 +666,16 @@ static int audit_filter_rules(struct task_struct *tsk, logged upon error */ if (f->lsm_rule) { if (need_sid) { - security_task_getsecid_subj(tsk, &sid); + /* @tsk should always be equal to + * @current with the exception of + * fork()/copy_process() in which case + * the new @tsk creds are still a dup + * of @current's creds so we can still + * use security_current_getsecid_subj() + * here even though it always refs + * @current's creds + */ + security_current_getsecid_subj(&sid); 
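/* A minimal sketch (hypothetical struct, not from this diff) of the
 * struct_size() idiom the audit hunks above convert to: it computes the
 * size of a struct plus its trailing flexible array without the
 * integer-overflow risk of open-coded "sizeof(*p) + len". On overflow it
 * saturates at SIZE_MAX, so the allocation fails instead of returning an
 * undersized object.
 */
#include <linux/overflow.h>
#include <linux/slab.h>

struct demo_rec {
        u32 buflen;
        char buf[];             /* flexible array member */
};

static struct demo_rec *demo_rec_alloc(u32 buflen)
{
        struct demo_rec *rec;

        /* equivalent to kzalloc(sizeof(*rec) + buflen, ...) minus the
         * risk of the addition wrapping around
         */
        rec = kzalloc(struct_size(rec, buf, buflen), GFP_KERNEL);
        if (rec)
                rec->buflen = buflen;
        return rec;
}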
need_sid = 0; } result = security_audit_rule_match(sid, f->type, @@ -1331,6 +1340,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) from_kuid(&init_user_ns, name->fcap.rootid)); } +static void audit_log_time(struct audit_context *context, struct audit_buffer **ab) +{ + const struct audit_ntp_data *ntp = &context->time.ntp_data; + const struct timespec64 *tk = &context->time.tk_injoffset; + static const char * const ntp_name[] = { + "offset", + "freq", + "status", + "tai", + "tick", + "adjust", + }; + int type; + + if (context->type == AUDIT_TIME_ADJNTPVAL) { + for (type = 0; type < AUDIT_NTP_NVALS; type++) { + if (ntp->vals[type].newval != ntp->vals[type].oldval) { + if (!*ab) { + *ab = audit_log_start(context, + GFP_KERNEL, + AUDIT_TIME_ADJNTPVAL); + if (!*ab) + return; + } + audit_log_format(*ab, "op=%s old=%lli new=%lli", + ntp_name[type], + ntp->vals[type].oldval, + ntp->vals[type].newval); + audit_log_end(*ab); + *ab = NULL; + } + } + } + if (tk->tv_sec != 0 || tk->tv_nsec != 0) { + if (!*ab) { + *ab = audit_log_start(context, GFP_KERNEL, + AUDIT_TIME_INJOFFSET); + if (!*ab) + return; + } + audit_log_format(*ab, "sec=%lli nsec=%li", + (long long)tk->tv_sec, tk->tv_nsec); + audit_log_end(*ab); + *ab = NULL; + } +} + static void show_special(struct audit_context *context, int *call_panic) { struct audit_buffer *ab; @@ -1445,6 +1501,11 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_format(ab, "(null)"); break; + case AUDIT_TIME_ADJNTPVAL: + case AUDIT_TIME_INJOFFSET: + /* this call deviates from the rest, eating the buffer */ + audit_log_time(context, &ab); + break; } audit_log_end(ab); } @@ -2840,31 +2901,26 @@ void __audit_fanotify(unsigned int response) void __audit_tk_injoffset(struct timespec64 offset) { - audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET, - "sec=%lli nsec=%li", - (long long)offset.tv_sec, offset.tv_nsec); -} - -static void audit_log_ntp_val(const struct audit_ntp_data *ad, - const char *op, enum audit_ntp_type type) -{ - const struct audit_ntp_val *val = &ad->vals[type]; - - if (val->newval == val->oldval) - return; + struct audit_context *context = audit_context(); - audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL, - "op=%s old=%lli new=%lli", op, val->oldval, val->newval); + /* only set type if not already set by NTP */ + if (!context->type) + context->type = AUDIT_TIME_INJOFFSET; + memcpy(&context->time.tk_injoffset, &offset, sizeof(offset)); } void __audit_ntp_log(const struct audit_ntp_data *ad) { - audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET); - audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ); - audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS); - audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI); - audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK); - audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST); + struct audit_context *context = audit_context(); + int type; + + for (type = 0; type < AUDIT_NTP_NVALS; type++) + if (ad->vals[type].newval != ad->vals[type].oldval) { + /* unconditionally set type, overwriting TK */ + context->type = AUDIT_TIME_ADJNTPVAL; + memcpy(&context->time.ntp_data, ad, sizeof(*ad)); + break; + } } void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index d24d518ddd63..d56ee177d5f8 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -30,6 +30,7 @@ config BPF_SYSCALL select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET + select PAGE_POOL if NET default n 
help Enable the bpf() system call that allows to manipulate BPF programs @@ -58,6 +59,10 @@ config BPF_JIT_ALWAYS_ON Enables BPF JIT and removes BPF interpreter to avoid speculative execution of BPF instructions by the interpreter. + When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable + is permanently set to 1 and setting any other value than that will + return failure. + config BPF_JIT_DEFAULT_ON def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON depends on HAVE_EBPF_JIT && BPF_JIT diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index cf6ca339f3cd..c1a9be6a4b9f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif obj-$(CONFIG_BPF_PRELOAD) += preload/ + +obj-$(CONFIG_BPF_SYSCALL) += relo_core.o +$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE + $(call if_changed_rule,cc_o_c) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c7a5be3bf8be..7f145aefbff8 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -837,13 +837,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key) static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { - struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog = bpf_prog_get(fd); if (IS_ERR(prog)) return prog; - if (!bpf_prog_array_compatible(array, prog)) { + if (!bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return ERR_PTR(-EINVAL); } @@ -1071,7 +1070,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); - spin_lock_init(&aux->owner.lock); map = array_map_alloc(attr); if (IS_ERR(map)) { diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index 277a05e9c984..b141a1346f72 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -82,6 +82,11 @@ static int bloom_map_delete_elem(struct bpf_map *map, void *value) return -EOPNOTSUPP; } +static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EOPNOTSUPP; +} + static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) { u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits; @@ -192,6 +197,7 @@ const struct bpf_map_ops bloom_filter_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = bloom_map_alloc, .map_free = bloom_map_free, + .map_get_next_key = bloom_map_get_next_key, .map_push_elem = bloom_map_push_elem, .map_peek_elem = bloom_map_peek_elem, .map_pop_elem = bloom_map_pop_elem, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 96ceed0e0fb5..96be8d518885 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -17,6 +17,7 @@ #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> #include <linux/fdtable.h> +#include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(inode_cache); @@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, if (!bsb) return NULL; - inode_storage = rcu_dereference(bsb->storage); + inode_storage = + rcu_dereference_check(bsb->storage, bpf_rcu_lock_held()); if (!inode_storage) return NULL; @@ -134,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, sdata = bpf_local_storage_update(f->f_inode, (struct bpf_local_storage_map *)map, - value, map_flags); + value, map_flags, 
GFP_ATOMIC); fput(f); return PTR_ERR_OR_ZERO(sdata); } @@ -167,11 +169,13 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key) return err; } -BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, - void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, + void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; @@ -193,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { sdata = bpf_local_storage_update( inode, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } @@ -204,6 +208,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, BPF_CALL_2(bpf_inode_storage_delete, struct bpf_map *, map, struct inode *, inode) { + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!inode) return -EINVAL; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index b2ee45064e06..110029ede71e 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -5,6 +5,7 @@ #include <linux/anon_inodes.h> #include <linux/filter.h> #include <linux/bpf.h> +#include <linux/rcupdate_trace.h> struct bpf_iter_target_info { struct list_head list; @@ -684,11 +685,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { int ret; - rcu_read_lock(); - migrate_disable(); - ret = bpf_prog_run(prog, ctx); - migrate_enable(); - rcu_read_unlock(); + if (prog->aux->sleepable) { + rcu_read_lock_trace(); + migrate_disable(); + might_fault(); + ret = bpf_prog_run(prog, ctx); + migrate_enable(); + rcu_read_unlock_trace(); + } else { + rcu_read_lock(); + migrate_disable(); + ret = bpf_prog_run(prog, ctx); + migrate_enable(); + rcu_read_unlock(); + } /* bpf program can only return 0 or 1: * 0 : okay @@ -714,3 +724,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = { .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; + +/* maximum number of loops */ +#define MAX_LOOPS BIT(23) + +BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, + u64, flags) +{ + bpf_callback_t callback = (bpf_callback_t)callback_fn; + u64 ret; + u32 i; + + if (flags) + return -EINVAL; + if (nr_loops > MAX_LOOPS) + return -E2BIG; + + for (i = 0; i < nr_loops; i++) { + ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0); + /* return value: 0 - continue, 1 - stop and return */ + if (ret) + return i + 1; + } + + return i; +} + +const struct bpf_func_proto bpf_loop_proto = { + .func = bpf_loop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, + .arg2_type = ARG_PTR_TO_FUNC, + .arg3_type = ARG_PTR_TO_STACK_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index b305270b7a4b..01aa2b51ec4d 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -11,6 +11,9 @@ #include <net/sock.h> #include <uapi/linux/sock_diag.h> #include <uapi/linux/btf.h> +#include <linux/rcupdate.h> +#include <linux/rcupdate_trace.h> +#include <linux/rcupdate_wait.h> #define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE) @@ -60,7 +63,7 @@ static bool 
selem_linked_to_map(const struct bpf_local_storage_elem *selem) struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, - void *value, bool charge_mem) + void *value, bool charge_mem, gfp_t gfp_flags) { struct bpf_local_storage_elem *selem; @@ -68,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; selem = bpf_map_kzalloc(&smap->map, smap->elem_size, - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (selem) { if (value) memcpy(SDATA(selem)->data, value, smap->map.value_size); @@ -81,6 +84,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, return NULL; } +void bpf_local_storage_free_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage *local_storage; + + local_storage = container_of(rcu, struct bpf_local_storage, rcu); + kfree_rcu(local_storage, rcu); +} + +static void bpf_selem_free_rcu(struct rcu_head *rcu) +{ + struct bpf_local_storage_elem *selem; + + selem = container_of(rcu, struct bpf_local_storage_elem, rcu); + kfree_rcu(selem, rcu); +} + /* local_storage->lock must be held and selem->local_storage == local_storage. * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. @@ -93,7 +112,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, bool free_local_storage; void *owner; - smap = rcu_dereference(SDATA(selem)->smap); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); owner = local_storage->owner; /* All uncharging on the owner must be done first. @@ -117,13 +136,13 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, * will be done by the caller. * * Although the unlock will be done under - * rcu_read_lock(), it is more intutivie to - * read if kfree_rcu(local_storage, rcu) is done + * rcu_read_lock(), it is more intuitive to + * read if the freeing of the storage is done + * after the raw_spin_unlock_bh(&local_storage->lock). * * Hence, a "bool free_local_storage" is returned - * to the caller which then calls the kfree_rcu() - * after unlock. + * to the caller, which then frees the storage after + * all the RCU grace periods have expired.
*/ } hlist_del_init_rcu(&selem->snode); @@ -131,8 +150,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, SDATA(selem)) RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL); - kfree_rcu(selem, rcu); - + call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu); return free_local_storage; } @@ -146,7 +164,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) /* selem has already been unlinked from sk */ return; - local_storage = rcu_dereference(selem->local_storage); + local_storage = rcu_dereference_check(selem->local_storage, + bpf_rcu_lock_held()); raw_spin_lock_irqsave(&local_storage->lock, flags); if (likely(selem_linked_to_storage(selem))) free_local_storage = bpf_selem_unlink_storage_nolock( @@ -154,7 +173,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem) raw_spin_unlock_irqrestore(&local_storage->lock, flags); if (free_local_storage) - kfree_rcu(local_storage, rcu); + call_rcu_tasks_trace(&local_storage->rcu, + bpf_local_storage_free_rcu); } void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, @@ -174,7 +194,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem) /* selem has already be unlinked from smap */ return; - smap = rcu_dereference(SDATA(selem)->smap); + smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held()); b = select_bucket(smap, selem); raw_spin_lock_irqsave(&b->lock, flags); if (likely(selem_linked_to_map(selem))) @@ -213,12 +233,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem; /* Fast path (cache hit) */ - sdata = rcu_dereference(local_storage->cache[smap->cache_idx]); + sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx], + bpf_rcu_lock_held()); if (sdata && rcu_access_pointer(sdata->smap) == smap) return sdata; /* Slow path (cache miss) */ - hlist_for_each_entry_rcu(selem, &local_storage->list, snode) + hlist_for_each_entry_rcu(selem, &local_storage->list, snode, + rcu_read_lock_trace_held()) if (rcu_access_pointer(SDATA(selem)->smap) == smap) break; @@ -260,7 +282,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata, int bpf_local_storage_alloc(void *owner, struct bpf_local_storage_map *smap, - struct bpf_local_storage_elem *first_selem) + struct bpf_local_storage_elem *first_selem, + gfp_t gfp_flags) { struct bpf_local_storage *prev_storage, *storage; struct bpf_local_storage **owner_storage_ptr; @@ -271,7 +294,7 @@ int bpf_local_storage_alloc(void *owner, return err; storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), - GFP_ATOMIC | __GFP_NOWARN); + gfp_flags | __GFP_NOWARN); if (!storage) { err = -ENOMEM; goto uncharge; @@ -306,7 +329,8 @@ int bpf_local_storage_alloc(void *owner, * bucket->list, first_selem can be freed immediately * (instead of kfree_rcu) because * bpf_local_storage_map_free() does a - * synchronize_rcu() before walking the bucket->list. + * synchronize_rcu_mult (waiting for both sleepable and + * normal programs) before walking the bucket->list. * Hence, no one is accessing selem from the * bucket->list under rcu_read_lock(). 
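/* A sketch (hypothetical type) of the freeing discipline introduced
 * above: once sleepable BPF programs can read the storage under
 * rcu_read_lock_trace(), an element may only be freed after an
 * RCU-tasks-trace grace period *and* a regular RCU grace period have
 * both elapsed, hence call_rcu_tasks_trace() chained into kfree_rcu(),
 * mirroring bpf_selem_free_rcu() above.
 */
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/slab.h>

struct demo_elem {
        int payload;
        struct rcu_head rcu;
};

static void demo_elem_free_rcu(struct rcu_head *rcu)
{
        struct demo_elem *e = container_of(rcu, struct demo_elem, rcu);

        /* second grace period: regular rcu_read_lock() readers */
        kfree_rcu(e, rcu);
}

static void demo_elem_unlink(struct demo_elem *e)
{
        /* first grace period: rcu_read_lock_trace() readers */
        call_rcu_tasks_trace(&e->rcu, demo_elem_free_rcu);
}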
*/ @@ -327,10 +351,10 @@ uncharge: */ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, - void *value, u64 map_flags) + void *value, u64 map_flags, gfp_t gfp_flags) { struct bpf_local_storage_data *old_sdata = NULL; - struct bpf_local_storage_elem *selem; + struct bpf_local_storage_elem *selem = NULL; struct bpf_local_storage *local_storage; unsigned long flags; int err; @@ -342,18 +366,22 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, !map_value_has_spin_lock(&smap->map))) return ERR_PTR(-EINVAL); - local_storage = rcu_dereference(*owner_storage(smap, owner)); + if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) + return ERR_PTR(-EINVAL); + + local_storage = rcu_dereference_check(*owner_storage(smap, owner), + bpf_rcu_lock_held()); if (!local_storage || hlist_empty(&local_storage->list)) { /* Very first elem for the owner */ err = check_flags(NULL, map_flags); if (err) return ERR_PTR(err); - selem = bpf_selem_alloc(smap, owner, value, true); + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); if (!selem) return ERR_PTR(-ENOMEM); - err = bpf_local_storage_alloc(owner, smap, selem); + err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags); if (err) { kfree(selem); mem_uncharge(smap, owner, smap->elem_size); @@ -380,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, } } + if (gfp_flags == GFP_KERNEL) { + selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags); + if (!selem) + return ERR_PTR(-ENOMEM); + } + raw_spin_lock_irqsave(&local_storage->lock, flags); /* Recheck local_storage->list under local_storage->lock */ @@ -405,19 +439,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, goto unlock; } - /* local_storage->lock is held. Hence, we are sure - * we can unlink and uncharge the old_sdata successfully - * later. Hence, instead of charging the new selem now - * and then uncharge the old selem later (which may cause - * a potential but unnecessary charge failure), avoid taking - * a charge at all here (the "!old_sdata" check) and the - * old_sdata will not be uncharged later during - * bpf_selem_unlink_storage_nolock(). - */ - selem = bpf_selem_alloc(smap, owner, value, !old_sdata); - if (!selem) { - err = -ENOMEM; - goto unlock_err; + if (gfp_flags != GFP_KERNEL) { + /* local_storage->lock is held. Hence, we are sure + * we can unlink and uncharge the old_sdata successfully + * later. Hence, instead of charging the new selem now + * and then uncharge the old selem later (which may cause + * a potential but unnecessary charge failure), avoid taking + * a charge at all here (the "!old_sdata" check) and the + * old_sdata will not be uncharged later during + * bpf_selem_unlink_storage_nolock(). 
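/* A sketch (hypothetical code, reusing struct demo_elem from the
 * previous sketch) of the allocation rule bpf_local_storage_update()
 * follows after this change: a GFP_KERNEL allocation may sleep, so it
 * must happen before the raw spinlock is taken, while GFP_ATOMIC
 * callers keep allocating under the lock as before. The GFP_KERNEL
 * path is only wired up for BPF_NOEXIST updates, as the check above
 * enforces.
 */
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_lock);

static int demo_update(int payload, gfp_t gfp)
{
        struct demo_elem *e = NULL;
        unsigned long flags;

        if (gfp == GFP_KERNEL)          /* may sleep: allocate up front */
                e = kzalloc(sizeof(*e), gfp | __GFP_NOWARN);

        raw_spin_lock_irqsave(&demo_lock, flags);
        if (gfp != GFP_KERNEL)          /* atomic: safe under the lock */
                e = kzalloc(sizeof(*e), gfp | __GFP_NOWARN);
        if (!e) {
                raw_spin_unlock_irqrestore(&demo_lock, flags);
                return -ENOMEM;
        }
        e->payload = payload;
        /* ... link e here; if a recheck under the lock fails, the
         * pre-allocated element must be uncharged and freed, as the
         * unlock_err path above does ...
         */
        raw_spin_unlock_irqrestore(&demo_lock, flags);
        return 0;
}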
+ */ + selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags); + if (!selem) { + err = -ENOMEM; + goto unlock_err; + } } /* First, link the new selem to the map */ @@ -439,6 +475,10 @@ unlock: unlock_err: raw_spin_unlock_irqrestore(&local_storage->lock, flags); + if (selem) { + mem_uncharge(smap, owner, smap->elem_size); + kfree(selem); + } return ERR_PTR(err); } diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 06062370c3b8..064eccba641d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -99,6 +99,24 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = { .allowed = bpf_ima_inode_hash_allowed, }; +BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size) +{ + return ima_file_hash(file, dst, size); +} + +BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file) + +static const struct bpf_func_proto bpf_ima_file_hash_proto = { + .func = bpf_ima_file_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .allowed = bpf_ima_inode_hash_allowed, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -121,6 +139,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_bprm_opts_set_proto; case BPF_FUNC_ima_inode_hash: return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; + case BPF_FUNC_ima_file_hash: + return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } @@ -167,6 +187,7 @@ BTF_ID(func, bpf_lsm_inode_setxattr) BTF_ID(func, bpf_lsm_inode_symlink) BTF_ID(func, bpf_lsm_inode_unlink) BTF_ID(func, bpf_lsm_kernel_module_request) +BTF_ID(func, bpf_lsm_kernel_read_file) BTF_ID(func, bpf_lsm_kernfs_init_security) #ifdef CONFIG_KEYS @@ -207,7 +228,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_task_getsecid_subj) +BTF_ID(func, bpf_lsm_current_getsecid_subj) BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 8ecfe4752769..21069dbe9138 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -165,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log) break; } - if (btf_member_bitfield_size(t, member)) { + if (__btf_member_bitfield_size(t, member)) { pr_warn("bit field member %s in struct %s is not supported\n", mname, st_ops->name); break; @@ -296,7 +296,7 @@ static int check_zero_holes(const struct btf_type *t, void *data) const struct btf_type *mtype; for_each_member(i, t, member) { - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (moff > prev_mend && memchr_inv(data + prev_mend, 0, moff - prev_mend)) return -EINVAL; @@ -387,7 +387,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, struct bpf_prog *prog; u32 moff; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index ebfa8bc90892..6638a0ecc3d2 100644 --- 
a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -17,6 +17,7 @@ #include <uapi/linux/btf.h> #include <linux/btf_ids.h> #include <linux/fdtable.h> +#include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(task_cache); @@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, struct bpf_local_storage *task_storage; struct bpf_local_storage_map *smap; - task_storage = rcu_dereference(task->bpf_storage); + task_storage = + rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held()); if (!task_storage) return NULL; @@ -172,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, bpf_task_storage_lock(); sdata = bpf_local_storage_update( - task, (struct bpf_local_storage_map *)map, value, map_flags); + task, (struct bpf_local_storage_map *)map, value, map_flags, + GFP_ATOMIC); bpf_task_storage_unlock(); err = PTR_ERR_OR_ZERO(sdata); @@ -224,11 +227,13 @@ out: return err; } -BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, - task, void *, value, u64, flags) +/* *gfp_flags* is a hidden argument provided by the verifier */ +BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; @@ -247,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update( task, (struct bpf_local_storage_map *)map, value, - BPF_NOEXIST); + BPF_NOEXIST, gfp_flags); unlock: bpf_task_storage_unlock(); @@ -260,6 +265,7 @@ BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, { int ret; + WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!task) return -EINVAL; @@ -323,7 +329,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = { .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &btf_task_struct_ids[0], + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; @@ -334,5 +340,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID, - .arg2_btf_id = &btf_task_struct_ids[0], + .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index dbc3ad07e21b..0918a39279f6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018 Facebook */ #include <uapi/linux/btf.h> @@ -25,6 +25,7 @@ #include <linux/kobject.h> #include <linux/sysfs.h> #include <net/sock.h> +#include "../tools/lib/bpf/relo_core.h" /* BTF (BPF Type Format) is the meta data format which describes * the data types of BPF program/map. 
Hence, it basically focus @@ -197,6 +198,21 @@ DEFINE_IDR(btf_idr); DEFINE_SPINLOCK(btf_idr_lock); +enum btf_kfunc_hook { + BTF_KFUNC_HOOK_XDP, + BTF_KFUNC_HOOK_TC, + BTF_KFUNC_HOOK_STRUCT_OPS, + BTF_KFUNC_HOOK_MAX, +}; + +enum { + BTF_KFUNC_SET_MAX_CNT = 32, +}; + +struct btf_kfunc_set_tab { + struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX]; +}; + struct btf { void *data; struct btf_type **types; @@ -211,6 +227,7 @@ struct btf { refcount_t refcnt; u32 id; struct rcu_head rcu; + struct btf_kfunc_set_tab *kfunc_set_tab; /* split BTF support */ struct btf *base_btf; @@ -282,6 +299,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = "DATASEC", [BTF_KIND_FLOAT] = "FLOAT", [BTF_KIND_DECL_TAG] = "DECL_TAG", + [BTF_KIND_TYPE_TAG] = "TYPE_TAG", }; const char *btf_type_str(const struct btf_type *t) @@ -401,6 +419,9 @@ static struct btf_type btf_void; static int btf_resolve(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id); +static int btf_func_check(struct btf_verifier_env *env, + const struct btf_type *t); + static bool btf_type_is_modifier(const struct btf_type *t) { /* Some of them is not strictly a C modifier @@ -418,6 +439,7 @@ static bool btf_type_is_modifier(const struct btf_type *t) case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: return true; } @@ -503,6 +525,50 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) return -ENOENT; } +static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p) +{ + struct btf *btf; + s32 ret; + int id; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -EINVAL; + + ret = btf_find_by_name_kind(btf, name, kind); + /* ret is never zero, since btf_find_by_name_kind returns + * positive btf_id or negative error. 
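/* A sketch of where a BTF_KIND_TYPE_TAG comes from. With clang >= 14
 * and CONFIG_DEBUG_INFO_BTF, the attribute below is recorded as a
 * TYPE_TAG on the pointee type; later hunks in this diff translate the
 * "user" and "percpu" tags into the verifier's MEM_USER and MEM_PERCPU
 * flags so that loads through such pointers are not treated as plain
 * kernel memory. Hypothetical declaration for illustration:
 */
#define __demo_user __attribute__((btf_type_tag("user")))

struct demo_ctx {
        void __demo_user *ubuf; /* BTF: PTR -> TYPE_TAG("user") -> void */
};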
+ */ + if (ret > 0) { + btf_get(btf); + *btf_p = btf; + return ret; + } + + /* If name is not found in vmlinux's BTF then search in module's BTFs */ + spin_lock_bh(&btf_idr_lock); + idr_for_each_entry(&btf_idr, btf, id) { + if (!btf_is_module(btf)) + continue; + /* linear search could be slow hence unlock/lock + * the IDR to avoiding holding it for too long + */ + btf_get(btf); + spin_unlock_bh(&btf_idr_lock); + ret = btf_find_by_name_kind(btf, name, kind); + if (ret > 0) { + *btf_p = btf; + return ret; + } + spin_lock_bh(&btf_idr_lock); + btf_put(btf); + } + spin_unlock_bh(&btf_idr_lock); + return ret; +} + const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, u32 id, u32 *res_id) { @@ -576,6 +642,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t) btf_type_is_struct(t) || btf_type_is_array(t) || btf_type_is_var(t) || + btf_type_is_func(t) || btf_type_is_decl_tag(t) || btf_type_is_datasec(t); } @@ -834,7 +901,7 @@ static const char *btf_show_name(struct btf_show *show) const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)]; const char *name = NULL, *prefix = "", *parens = ""; const struct btf_member *m = show->state.member; - const struct btf_type *t = show->state.type; + const struct btf_type *t; const struct btf_array *array; u32 id = show->state.type_id; const char *member = NULL; @@ -1528,8 +1595,30 @@ static void btf_free_id(struct btf *btf) spin_unlock_irqrestore(&btf_idr_lock, flags); } +static void btf_free_kfunc_set_tab(struct btf *btf) +{ + struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab; + int hook, type; + + if (!tab) + return; + /* For module BTF, we directly assign the sets being registered, so + * there is nothing to free except kfunc_set_tab. + */ + if (btf_is_module(btf)) + goto free_tab; + for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) { + for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++) + kfree(tab->sets[hook][type]); + } +free_tab: + kfree(tab); + btf->kfunc_set_tab = NULL; +} + static void btf_free(struct btf *btf) { + btf_free_kfunc_set_tab(btf); kvfree(btf->types); kvfree(btf->resolved_sizes); kvfree(btf->resolved_ids); @@ -1737,6 +1826,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type, case BTF_KIND_VOLATILE: case BTF_KIND_CONST: case BTF_KIND_RESTRICT: + case BTF_KIND_TYPE_TAG: id = type->type; type = btf_type_by_id(btf, type->type); break; @@ -2345,6 +2435,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, const struct btf_type *t, u32 meta_left) { + const char *value; + if (btf_type_vlen(t)) { btf_verifier_log_type(env, t, "vlen != 0"); return -EINVAL; @@ -2360,7 +2452,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, return -EINVAL; } - /* typedef type must have a valid name, and other ref types, + /* typedef/type_tag type must have a valid name, and other ref types, * volatile, const, restrict, should have a null name. 
*/ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { @@ -2369,6 +2461,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env, btf_verifier_log_type(env, t, "Invalid name"); return -EINVAL; } + } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) { + value = btf_name_by_offset(env->btf, t->name_off); + if (!value || !value[0]) { + btf_verifier_log_type(env, t, "Invalid name"); + return -EINVAL; + } } else { if (t->name_off) { btf_verifier_log_type(env, t, "Invalid name"); @@ -2493,7 +2591,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env, * * We now need to continue from the last-resolved-ptr to * ensure the last-resolved-ptr will not referring back to - * the currenct ptr (t). + * the current ptr (t). */ if (btf_type_is_modifier(next_type)) { const struct btf_type *resolved_type; @@ -2958,7 +3056,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env, return -EINVAL; } - offset = btf_member_bit_offset(t, member); + offset = __btf_member_bit_offset(t, member); if (is_union && offset) { btf_verifier_log_member(env, t, member, "Invalid member bits_offset"); @@ -3083,7 +3181,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t if (off != -ENOENT) /* only one such field is allowed */ return -E2BIG; - off = btf_member_bit_offset(t, member); + off = __btf_member_bit_offset(t, member); if (off % 8) /* valid C code cannot generate such BTF */ return -EINVAL; @@ -3173,8 +3271,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t, btf_show_start_member(show, member); - member_offset = btf_member_bit_offset(t, member); - bitfield_size = btf_member_bitfield_size(t, member); + member_offset = __btf_member_bit_offset(t, member); + bitfield_size = __btf_member_bitfield_size(t, member); bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); bits8_offset = BITS_PER_BYTE_MASKED(member_offset); if (bitfield_size) { @@ -3521,9 +3619,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env, return 0; } +static int btf_func_resolve(struct btf_verifier_env *env, + const struct resolve_vertex *v) +{ + const struct btf_type *t = v->t; + u32 next_type_id = t->type; + int err; + + err = btf_func_check(env, t); + if (err) + return err; + + env_stack_pop_resolved(env, next_type_id, 0); + return 0; +} + static struct btf_kind_operations func_ops = { .check_meta = btf_func_check_meta, - .resolve = btf_df_resolve, + .resolve = btf_func_resolve, .check_member = btf_df_check_member, .check_kflag_member = btf_df_check_kflag_member, .log_details = btf_ref_type_log, @@ -4059,6 +4172,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { [BTF_KIND_DATASEC] = &datasec_ops, [BTF_KIND_FLOAT] = &float_ops, [BTF_KIND_DECL_TAG] = &decl_tag_ops, + [BTF_KIND_TYPE_TAG] = &modifier_ops, }; static s32 btf_check_meta(struct btf_verifier_env *env, @@ -4143,7 +4257,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return !btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); - if (btf_type_is_decl_tag(t)) + if (btf_type_is_decl_tag(t) || btf_type_is_func(t)) return btf_resolved_type_id(btf, type_id) && !btf_resolved_type_size(btf, type_id); @@ -4233,12 +4347,6 @@ static int btf_check_all_types(struct btf_verifier_env *env) if (err) return err; } - - if (btf_type_is_func(t)) { - err = btf_func_check(env, t); - if (err) - return err; - } } return 0; @@ -4374,8 +4482,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env) btf = env->btf; btf_data_size = btf->data_size; - 
if (btf_data_size < - offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { + if (btf_data_size < offsetofend(struct btf_header, hdr_len)) { btf_verifier_log(env, "hdr_len not found"); return -EINVAL; } @@ -4460,8 +4567,7 @@ static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size, log->len_total = log_size; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || - !log->level || !log->ubuf) { + if (!bpf_verifier_log_attr_valid(log)) { err = -EINVAL; goto errout; } @@ -4814,7 +4920,7 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) return prog->aux->attach_btf; } -static bool is_string_ptr(struct btf *btf, const struct btf_type *t) +static bool is_int_ptr(struct btf *btf, const struct btf_type *t) { /* t comes in already as a pointer */ t = btf_type_by_id(btf, t->type); @@ -4823,8 +4929,7 @@ static bool is_string_ptr(struct btf *btf, const struct btf_type *t) if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST) t = btf_type_by_id(btf, t->type); - /* char, signed char, unsigned char */ - return btf_type_is_int(t) && t->size == 1; + return btf_type_is_int(t); } bool btf_ctx_access(int off, int size, enum bpf_access_type type, @@ -4837,6 +4942,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; + const char *tag_value; u32 nr_args, arg; int i, ret; @@ -4929,10 +5035,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + u32 type, flag; - if (ctx_arg_info->offset == off && - (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || - ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + type = base_type(ctx_arg_info->reg_type); + flag = type_flag(ctx_arg_info->reg_type); + if (ctx_arg_info->offset == off && type == PTR_TO_BUF && + (flag & PTR_MAYBE_NULL)) { info->reg_type = ctx_arg_info->reg_type; return true; } @@ -4945,7 +5053,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, */ return true; - if (is_string_ptr(btf, t)) + if (is_int_ptr(btf, t)) return true; /* this is a pointer to another type */ @@ -4987,6 +5095,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); + + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + if (strcmp(tag_value, "user") == 0) + info->reg_type |= MEM_USER; + if (strcmp(tag_value, "percpu") == 0) + info->reg_type |= MEM_PERCPU; + } + /* skip modifiers */ while (btf_type_is_modifier(t)) { info->btf_id = t->type; @@ -5013,12 +5130,12 @@ enum bpf_struct_walk_result { static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { u32 i, moff, mtrue_end, msize = 0, total_nelems = 0; const struct btf_type *mtype, *elem_type = NULL; const struct btf_member *member; - const char *tname, *mname; + const char *tname, *mname, *tag_value; u32 vlen, elem_id, mid; again: @@ -5048,7 +5165,7 @@ again: if (array_elem->nelems != 0) goto error; - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (off < moff) goto error; @@ -5071,14 
+5188,14 @@ error: for_each_member(i, t, member) { /* offset of the field in bytes */ - moff = btf_member_bit_offset(t, member) / 8; + moff = __btf_member_bit_offset(t, member) / 8; if (off + size <= moff) /* won't find anything, field is already too far */ break; - if (btf_member_bitfield_size(t, member)) { - u32 end_bit = btf_member_bit_offset(t, member) + - btf_member_bitfield_size(t, member); + if (__btf_member_bitfield_size(t, member)) { + u32 end_bit = __btf_member_bit_offset(t, member) + + __btf_member_bitfield_size(t, member); /* off <= moff instead of off == moff because clang * does not generate a BTF member for anonymous @@ -5202,7 +5319,8 @@ error: } if (btf_type_is_ptr(mtype)) { - const struct btf_type *stype; + const struct btf_type *stype, *t; + enum bpf_type_flag tmp_flag = 0; u32 id; if (msize != size || off != moff) { @@ -5211,9 +5329,23 @@ error: mname, moff, tname, off, size); return -EACCES; } + + /* check type tag */ + t = btf_type_by_id(btf, mtype->type); + if (btf_type_is_type_tag(t)) { + tag_value = __btf_name_by_offset(btf, t->name_off); + /* check __user tag */ + if (strcmp(tag_value, "user") == 0) + tmp_flag = MEM_USER; + /* check __percpu tag */ + if (strcmp(tag_value, "percpu") == 0) + tmp_flag = MEM_PERCPU; + } + stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; + *flag = tmp_flag; return WALK_PTR; } } @@ -5240,13 +5372,14 @@ error: int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype __maybe_unused, - u32 *next_btf_id) + u32 *next_btf_id, enum bpf_type_flag *flag) { + enum bpf_type_flag tmp_flag = 0; int err; u32 id; do { - err = btf_struct_walk(log, btf, t, off, size, &id); + err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); switch (err) { case WALK_PTR: @@ -5254,6 +5387,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, * we're done. */ *next_btf_id = id; + *flag = tmp_flag; return PTR_TO_BTF_ID; case WALK_SCALAR: return SCALAR_VALUE; @@ -5298,6 +5432,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *need_btf, u32 need_type_id) { const struct btf_type *type; + enum bpf_type_flag flag; int err; /* Are we already done? */ @@ -5308,7 +5443,7 @@ again: type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, btf, type, off, 1, &id); + err = btf_struct_walk(log, btf, type, off, 1, &id, &flag); if (err != WALK_STRUCT) return false; @@ -5372,7 +5507,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, } args = (const struct btf_param *)(func + 1); nargs = btf_type_vlen(func); - if (nargs >= MAX_BPF_FUNC_ARGS) { + if (nargs > MAX_BPF_FUNC_ARGS) { bpf_log(log, "The function %s has %d arguments. 
Too many.\n", tname, nargs); @@ -5563,16 +5698,85 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #endif }; +/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */ +static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int rec) +{ + const struct btf_type *member_type; + const struct btf_member *member; + u32 i; + + if (!btf_type_is_struct(t)) + return false; + + for_each_member(i, t, member) { + const struct btf_array *array; + + member_type = btf_type_skip_modifiers(btf, member->type, NULL); + if (btf_type_is_struct(member_type)) { + if (rec >= 3) { + bpf_log(log, "max struct nesting depth exceeded\n"); + return false; + } + if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1)) + return false; + continue; + } + if (btf_type_is_array(member_type)) { + array = btf_type_array(member_type); + if (!array->nelems) + return false; + member_type = btf_type_skip_modifiers(btf, array->type, NULL); + if (!btf_type_is_scalar(member_type)) + return false; + continue; + } + if (!btf_type_is_scalar(member_type)) + return false; + } + return true; +} + +static bool is_kfunc_arg_mem_size(const struct btf *btf, + const struct btf_param *arg, + const struct bpf_reg_state *reg) +{ + int len, sfx_len = sizeof("__sz") - 1; + const struct btf_type *t; + const char *param_name; + + t = btf_type_skip_modifiers(btf, arg->type, NULL); + if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE) + return false; + + /* In the future, this can be ported to use BTF tagging */ + param_name = btf_name_by_offset(btf, arg->name_off); + if (str_is_empty(param_name)) + return false; + len = strlen(param_name); + if (len < sfx_len) + return false; + param_name += len - sfx_len; + if (strncmp(param_name, "__sz", sfx_len)) + return false; + + return true; +} + static int btf_check_func_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs, bool ptr_to_mem_ok) { struct bpf_verifier_log *log = &env->log; + u32 i, nargs, ref_id, ref_obj_id = 0; + bool is_kfunc = btf_is_kernel(btf); const char *func_name, *ref_tname; const struct btf_type *t, *ref_t; const struct btf_param *args; - u32 i, nargs, ref_id; + int ref_regno = 0, ret; + bool rel = false; t = btf_type_by_id(btf, func_id); if (!t || !btf_type_is_func(t)) { @@ -5598,6 +5802,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, return -EINVAL; } + /* Only kfunc can be release func */ + if (is_kfunc) + rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RELEASE, func_id); /* check that BTF function arguments match actual types that the * verifier sees. */ @@ -5621,7 +5829,24 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id); ref_tname = btf_name_by_offset(btf, ref_t->name_off); - if (btf_is_kernel(btf)) { + + ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel); + if (ret < 0) + return ret; + + if (btf_get_prog_ctx_type(log, btf, t, + env->prog->type, i)) { + /* If function expects ctx type in BTF check that caller + * is passing PTR_TO_CTX. 
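/* A sketch of the "__sz" naming convention that
 * is_kfunc_arg_mem_size() above checks for: a kfunc taking a
 * (pointer, length) pair names the length parameter with a "__sz"
 * suffix, letting the verifier pair it with the preceding pointer and
 * bounds-check the access. Hypothetical kfunc for illustration; the
 * BTF_ID set registration that exposes it to programs is omitted.
 */
#include <linux/string.h>

noinline int bpf_demo_memzero(void *mem, int mem__sz)
{
        memset(mem, 0, mem__sz);
        return 0;
}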
+ */ + if (reg->type != PTR_TO_CTX) { + bpf_log(log, + "arg#%d expected pointer to ctx, but got %s\n", + i, btf_type_str(t)); + return -EINVAL; + } + } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || + (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) { const struct btf_type *reg_ref_t; const struct btf *reg_btf; const char *reg_ref_tname; @@ -5637,14 +5862,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (reg->type == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; - } else if (reg2btf_ids[reg->type]) { - reg_btf = btf_vmlinux; - reg_ref_id = *reg2btf_ids[reg->type]; + /* Ensure only one argument is referenced + * PTR_TO_BTF_ID, check_func_arg_reg_off relies + * on only one referenced register being allowed + * for kfuncs. + */ + if (reg->ref_obj_id) { + if (ref_obj_id) { + bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", + regno, reg->ref_obj_id, ref_obj_id); + return -EFAULT; + } + ref_regno = regno; + ref_obj_id = reg->ref_obj_id; + } } else { - bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n", - func_name, i, - btf_type_str(ref_t), ref_tname, regno); - return -EINVAL; + reg_btf = btf_vmlinux; + reg_ref_id = *reg2btf_ids[base_type(reg->type)]; } reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id, @@ -5660,23 +5894,40 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, reg_ref_tname); return -EINVAL; } - } else if (btf_get_prog_ctx_type(log, btf, t, - env->prog->type, i)) { - /* If function expects ctx type in BTF check that caller - * is passing PTR_TO_CTX. - */ - if (reg->type != PTR_TO_CTX) { - bpf_log(log, - "arg#%d expected pointer to ctx, but got %s\n", - i, btf_type_str(t)); - return -EINVAL; - } - if (check_ctx_reg(env, reg, regno)) - return -EINVAL; } else if (ptr_to_mem_ok) { const struct btf_type *resolve_ret; u32 type_size; + if (is_kfunc) { + bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], ®s[regno + 1]); + + /* Permit pointer to mem, but only when argument + * type is pointer to scalar, or struct composed + * (recursively) of scalars. + * When arg_mem_size is true, the pointer can be + * void *. + */ + if (!btf_type_is_scalar(ref_t) && + !__btf_type_is_scalar_struct(log, btf, ref_t, 0) && + (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) { + bpf_log(log, + "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n", + i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : ""); + return -EINVAL; + } + + /* Check for mem, len pair */ + if (arg_mem_size) { + if (check_kfunc_mem_size_reg(env, ®s[regno + 1], regno + 1)) { + bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n", + i, i + 1); + return -EINVAL; + } + i++; + continue; + } + } + resolve_ret = btf_resolve_size(btf, ref_t, &type_size); if (IS_ERR(resolve_ret)) { bpf_log(log, @@ -5689,11 +5940,26 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, if (check_mem_reg(env, reg, regno, type_size)) return -EINVAL; } else { + bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i, + is_kfunc ? "kernel " : "", func_name, func_id); return -EINVAL; } } - return 0; + /* Either both are set, or neither */ + WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno)); + /* We already made sure ref_obj_id is set only for one argument. We do + * allow (!rel && ref_obj_id), so that passing such referenced + * PTR_TO_BTF_ID to other kfuncs works. 
Note that rel is only true when + * is_kfunc is true. + */ + if (rel && !ref_obj_id) { + bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n", + func_name); + return -EINVAL; + } + /* returns argument register number > 0 in case of reference release kfunc */ + return rel ? ref_regno : 0; } /* Compare BTF of a function with given bpf_reg_state. @@ -5738,7 +6004,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, const struct btf *btf, u32 func_id, struct bpf_reg_state *regs) { - return btf_check_func_arg_match(env, btf, func_id, regs, false); + return btf_check_func_arg_match(env, btf, func_id, regs, true); } /* Convert BTF of a function into bpf_reg_state if possible @@ -5846,7 +6112,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, return -EINVAL; } - reg->type = PTR_TO_MEM_OR_NULL; + reg->type = PTR_TO_MEM | PTR_MAYBE_NULL; reg->id = ++env->id_gen; continue; @@ -5939,7 +6205,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf); - /* If we encontered an error, return it. */ + /* If we encountered an error, return it. */ if (ssnprintf.show.state.status) return ssnprintf.show.state.status; @@ -6135,12 +6401,17 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } +enum { + BTF_MODULE_F_LIVE = (1 << 0), +}; + #ifdef CONFIG_DEBUG_INFO_BTF_MODULES struct btf_module { struct list_head list; struct module *module; struct btf *btf; struct bin_attribute *sysfs_attr; + int flags; }; static LIST_HEAD(btf_modules); @@ -6157,6 +6428,8 @@ btf_module_read(struct file *file, struct kobject *kobj, return len; } +static void purge_cand_cache(struct btf *btf); + static int btf_module_notify(struct notifier_block *nb, unsigned long op, void *module) { @@ -6166,7 +6439,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, int err = 0; if (mod->btf_data_size == 0 || - (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE && + op != MODULE_STATE_GOING)) goto out; switch (op) { @@ -6181,7 +6455,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, pr_warn("failed to validate module [%s] BTF: %ld\n", mod->name, PTR_ERR(btf)); kfree(btf_mod); - err = PTR_ERR(btf); + if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH)) + err = PTR_ERR(btf); goto out; } err = btf_alloc_id(btf); @@ -6191,6 +6466,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, goto out; } + purge_cand_cache(NULL); mutex_lock(&btf_module_mutex); btf_mod->module = module; btf_mod->btf = btf; @@ -6224,6 +6500,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, } break; + case MODULE_STATE_LIVE: + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + btf_mod->flags |= BTF_MODULE_F_LIVE; + break; + } + mutex_unlock(&btf_module_mutex); + break; case MODULE_STATE_GOING: mutex_lock(&btf_module_mutex); list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { @@ -6233,6 +6520,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op, list_del(&btf_mod->list); if (btf_mod->sysfs_attr) sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); + purge_cand_cache(btf_mod->btf); btf_put(btf_mod->btf); kfree(btf_mod->sysfs_attr); kfree(btf_mod); @@ -6269,7 
+6557,12 @@ struct module *btf_try_get_module(const struct btf *btf) if (btf_mod->btf != btf) continue; - if (try_module_get(btf_mod->module)) + /* We must only consider module whose __init routine has + * finished, hence we must check for BTF_MODULE_F_LIVE flag, + * which is set from the notifier callback for + * MODULE_STATE_LIVE. + */ + if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module)) res = btf_mod->module; break; @@ -6280,9 +6573,43 @@ struct module *btf_try_get_module(const struct btf *btf) return res; } +/* Returns struct btf corresponding to the struct module. + * This function can return NULL or ERR_PTR. + */ +static struct btf *btf_get_module_btf(const struct module *module) +{ +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; +#endif + struct btf *btf = NULL; + + if (!module) { + btf = bpf_get_btf_vmlinux(); + if (!IS_ERR_OR_NULL(btf)) + btf_get(btf); + return btf; + } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + btf_get(btf_mod->btf); + btf = btf_mod->btf; + break; + } + mutex_unlock(&btf_module_mutex); +#endif + + return btf; +} + BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags) { - struct btf *btf; + struct btf *btf = NULL; + int btf_obj_fd = 0; long ret; if (flags) @@ -6291,44 +6618,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int if (name_sz <= 1 || name[name_sz - 1]) return -EINVAL; - btf = bpf_get_btf_vmlinux(); - if (IS_ERR(btf)) - return PTR_ERR(btf); - - ret = btf_find_by_name_kind(btf, name, kind); - /* ret is never zero, since btf_find_by_name_kind returns - * positive btf_id or negative error. 
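Both the removed and the replacement code pack two values into this helper's 64-bit result: the BTF type id in the low 32 bits and, when the type lives in a module, a new FD for that module's BTF object in the high 32 bits. A sketch of how a caller could unpack it (variable names are illustrative):

char name[] = "task_struct";
long ret = bpf_btf_find_by_name_kind(name, sizeof(name),
				     BTF_KIND_STRUCT, 0);
if (ret > 0) {
	u32 btf_id = (u32)ret;	/* BTF type id */
	int btf_fd = ret >> 32;	/* 0 for vmlinux BTF, else module BTF FD */
	/* ... */
}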
- */ - if (ret < 0) { - struct btf *mod_btf; - int id; - - /* If name is not found in vmlinux's BTF then search in module's BTFs */ - spin_lock_bh(&btf_idr_lock); - idr_for_each_entry(&btf_idr, mod_btf, id) { - if (!btf_is_module(mod_btf)) - continue; - /* linear search could be slow hence unlock/lock - * the IDR to avoiding holding it for too long - */ - btf_get(mod_btf); - spin_unlock_bh(&btf_idr_lock); - ret = btf_find_by_name_kind(mod_btf, name, kind); - if (ret > 0) { - int btf_obj_fd; - - btf_obj_fd = __btf_new_fd(mod_btf); - if (btf_obj_fd < 0) { - btf_put(mod_btf); - return btf_obj_fd; - } - return ret | (((u64)btf_obj_fd) << 32); - } - spin_lock_bh(&btf_idr_lock); - btf_put(mod_btf); + ret = bpf_find_btf_id(name, kind, &btf); + if (ret > 0 && btf_is_module(btf)) { + btf_obj_fd = __btf_new_fd(btf); + if (btf_obj_fd < 0) { + btf_put(btf); + return btf_obj_fd; } - spin_unlock_bh(&btf_idr_lock); + return ret | (((u64)btf_obj_fd) << 32); } + if (ret > 0) + btf_put(btf); return ret; } @@ -6336,65 +6636,691 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = { .func = bpf_btf_find_by_name_kind, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_ANYTHING, }; -BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct) +BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE) +#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type) +BTF_TRACING_TYPE_xxx +#undef BTF_TRACING_TYPE -/* BTF ID set registration API for modules */ +/* Kernel Function (kfunc) BTF ID set registration API */ -struct kfunc_btf_id_list { - struct list_head list; - struct mutex mutex; +static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + enum btf_kfunc_type type, + struct btf_id_set *add_set, bool vmlinux_set) +{ + struct btf_kfunc_set_tab *tab; + struct btf_id_set *set; + u32 set_cnt; + int ret; + + if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) { + ret = -EINVAL; + goto end; + } + + if (!add_set->cnt) + return 0; + + tab = btf->kfunc_set_tab; + if (!tab) { + tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN); + if (!tab) + return -ENOMEM; + btf->kfunc_set_tab = tab; + } + + set = tab->sets[hook][type]; + /* Warn when register_btf_kfunc_id_set is called twice for the same hook + * for module sets. + */ + if (WARN_ON_ONCE(set && !vmlinux_set)) { + ret = -EINVAL; + goto end; + } + + /* We don't need to allocate, concatenate, and sort module sets, because + * only one is allowed per hook. Hence, we can directly assign the + * pointer and return. + */ + if (!vmlinux_set) { + tab->sets[hook][type] = add_set; + return 0; + } + + /* In case of vmlinux sets, there may be more than one set being + * registered per hook. To create a unified set, we allocate a new set + * and concatenate all individual sets being registered. While each set + * is individually sorted, they may become unsorted when concatenated, + * hence re-sorting the final set again is required to make binary + * searching the set using btf_id_set_contains function work. + */ + set_cnt = set ? 
set->cnt : 0; + + if (set_cnt > U32_MAX - add_set->cnt) { + ret = -EOVERFLOW; + goto end; + } + + if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) { + ret = -E2BIG; + goto end; + } + + /* Grow set */ + set = krealloc(tab->sets[hook][type], + offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]), + GFP_KERNEL | __GFP_NOWARN); + if (!set) { + ret = -ENOMEM; + goto end; + } + + /* For newly allocated set, initialize set->cnt to 0 */ + if (!tab->sets[hook][type]) + set->cnt = 0; + tab->sets[hook][type] = set; + + /* Concatenate the two sets */ + memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0])); + set->cnt += add_set->cnt; + + sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL); + + return 0; +end: + btf_free_kfunc_set_tab(btf); + return ret; +} + +static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) +{ + bool vmlinux_set = !btf_is_module(btf); + int type, ret = 0; + + for (type = 0; type < ARRAY_SIZE(kset->sets); type++) { + if (!kset->sets[type]) + continue; + + ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set); + if (ret) + break; + } + return ret; +} + +static bool __btf_kfunc_id_set_contains(const struct btf *btf, + enum btf_kfunc_hook hook, + enum btf_kfunc_type type, + u32 kfunc_btf_id) +{ + struct btf_id_set *set; + + if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) + return false; + if (!btf->kfunc_set_tab) + return false; + set = btf->kfunc_set_tab->sets[hook][type]; + if (!set) + return false; + return btf_id_set_contains(set, kfunc_btf_id); +} + +static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_XDP: + return BTF_KFUNC_HOOK_XDP; + case BPF_PROG_TYPE_SCHED_CLS: + return BTF_KFUNC_HOOK_TC; + case BPF_PROG_TYPE_STRUCT_OPS: + return BTF_KFUNC_HOOK_STRUCT_OPS; + default: + return BTF_KFUNC_HOOK_MAX; + } +} + +/* Caution: + * Reference to the module (obtained using btf_try_get_module) corresponding to + * the struct btf *MUST* be held when calling this function from verifier + * context. This is usually true as we stash references in prog's kfunc_btf_tab; + * keeping the reference for the duration of the call provides the necessary + * protection for looking up a well-formed btf->kfunc_set_tab. 
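A sketch of the producer side of this API, i.e. the register_btf_kfunc_id_set() entry point defined just below; the kfunc and set names are hypothetical, and the pattern mirrors how in-tree callers register their sets from init code:

BTF_SET_START(demo_kfunc_check_ids)
BTF_ID(func, bpf_kfunc_sum_bytes)	/* the sketch kfunc from earlier */
BTF_SET_END(demo_kfunc_check_ids)

static const struct btf_kfunc_id_set demo_kfunc_set = {
	.owner		= THIS_MODULE,
	.check_set	= &demo_kfunc_check_ids,
};

static int __init demo_init(void)
{
	/* must run from an initcall/module init function */
	return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
					 &demo_kfunc_set);
}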
+ */ +bool btf_kfunc_id_set_contains(const struct btf *btf, + enum bpf_prog_type prog_type, + enum btf_kfunc_type type, u32 kfunc_btf_id) +{ + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id); +} + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) +{ + enum btf_kfunc_hook hook; + struct btf *btf; + int ret; + + btf = btf_get_module_btf(kset->owner); + if (!btf) { + if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { + pr_err("missing vmlinux BTF, cannot register kfuncs\n"); + return -ENOENT; + } + if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) { + pr_err("missing module BTF, cannot register kfuncs\n"); + return -ENOENT; + } + return 0; + } + if (IS_ERR(btf)) + return PTR_ERR(btf); + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + ret = btf_populate_kfunc_set(btf, hook, kset); + btf_put(btf); + return ret; +} +EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); + +#define MAX_TYPES_ARE_COMPAT_DEPTH 2 + +static +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, + int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id); + targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (btf_kind(local_type) != btf_kind(targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + btf_type_skip_modifiers(local_btf, local_p->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + btf_type_skip_modifiers(local_btf, local_type->type, &local_id); + btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + return 0; + } +} + +/* Check local and target types for compatibility. 
This check is used for + * type-based CO-RE relocations and follow slightly different rules than + * field-based relocations. This function assumes that root types were already + * checked for name match. Beyond that initial root-level name check, names + * are completely ignored. Compatibility rules are as follows: + * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but + * kind should match for local and target types (i.e., STRUCT is not + * compatible with UNION); + * - for ENUMs, the size is ignored; + * - for INT, size and signedness are ignored; + * - for ARRAY, dimensionality is ignored, element types are checked for + * compatibility recursively; + * - CONST/VOLATILE/RESTRICT modifiers are ignored; + * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible; + * - FUNC_PROTOs are compatible if they have compatible signature: same + * number of input args and compatible return and argument types. + * These rules are not set in stone and probably will be adjusted as we get + * more experience with using BPF CO-RE relocations. + */ +int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_are_compat(local_btf, local_id, + targ_btf, targ_id, + MAX_TYPES_ARE_COMPAT_DEPTH); +} + +static bool bpf_core_is_flavor_sep(const char *s) +{ + /* check X___Y name pattern, where X and Y are not underscores */ + return s[0] != '_' && /* X */ + s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */ + s[4] != '_'; /* Y */ +} + +size_t bpf_core_essential_name_len(const char *name) +{ + size_t n = strlen(name); + int i; + + for (i = n - 5; i >= 0; i--) { + if (bpf_core_is_flavor_sep(name + i)) + return i + 1; + } + return n; +} + +struct bpf_cand_cache { + const char *name; + u32 name_len; + u16 kind; + u16 cnt; + struct { + const struct btf *btf; + u32 id; + } cands[]; }; -#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +static void bpf_free_cands(struct bpf_cand_cache *cands) +{ + if (!cands->cnt) + /* empty candidate array was allocated on stack */ + return; + kfree(cands); +} -void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) +static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands) { - mutex_lock(&l->mutex); - list_add(&s->list, &l->list); - mutex_unlock(&l->mutex); + kfree(cands->name); + kfree(cands); } -EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set); -void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l, - struct kfunc_btf_id_set *s) +#define VMLINUX_CAND_CACHE_SIZE 31 +static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE]; + +#define MODULE_CAND_CACHE_SIZE 31 +static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE]; + +static DEFINE_MUTEX(cand_cache_mutex); + +static void __print_cand_cache(struct bpf_verifier_log *log, + struct bpf_cand_cache **cache, + int cache_size) { - mutex_lock(&l->mutex); - list_del_init(&s->list); - mutex_unlock(&l->mutex); + struct bpf_cand_cache *cc; + int i, j; + + for (i = 0; i < cache_size; i++) { + cc = cache[i]; + if (!cc) + continue; + bpf_log(log, "[%d]%s(", i, cc->name); + for (j = 0; j < cc->cnt; j++) { + bpf_log(log, "%d", cc->cands[j].id); + if (j < cc->cnt - 1) + bpf_log(log, " "); + } + bpf_log(log, "), "); + } } -EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set); -bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id, - struct module *owner) +static void print_cand_cache(struct bpf_verifier_log *log) { - struct kfunc_btf_id_set 
*s; + mutex_lock(&cand_cache_mutex); + bpf_log(log, "vmlinux_cand_cache:"); + __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + bpf_log(log, "\nmodule_cand_cache:"); + __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE); + bpf_log(log, "\n"); + mutex_unlock(&cand_cache_mutex); +} - if (!owner) - return false; - mutex_lock(&klist->mutex); - list_for_each_entry(s, &klist->list, list) { - if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) { - mutex_unlock(&klist->mutex); - return true; +static u32 hash_cands(struct bpf_cand_cache *cands) +{ + return jhash(cands->name, cands->name_len, 0); +} + +static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size]; + + if (cc && cc->name_len == cands->name_len && + !strncmp(cc->name, cands->name, cands->name_len)) + return cc; + return NULL; +} + +static size_t sizeof_cands(int cnt) +{ + return offsetof(struct bpf_cand_cache, cands[cnt]); +} + +static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands, + struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands; + + if (*cc) { + bpf_free_cands_from_cache(*cc); + *cc = NULL; + } + new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL); + if (!new_cands) { + bpf_free_cands(cands); + return ERR_PTR(-ENOMEM); + } + /* strdup the name, since it will stay in cache. + * the cands->name points to strings in prog's BTF and the prog can be unloaded. + */ + new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL); + bpf_free_cands(cands); + if (!new_cands->name) { + kfree(new_cands); + return ERR_PTR(-ENOMEM); + } + *cc = new_cands; + return new_cands; +} + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache, + int cache_size) +{ + struct bpf_cand_cache *cc; + int i, j; + + for (i = 0; i < cache_size; i++) { + cc = cache[i]; + if (!cc) + continue; + if (!btf) { + /* when new module is loaded purge all of module_cand_cache, + * since new module might have candidates with the name + * that matches cached cands. 
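The cache is keyed on the essential part of the type name, with CO-RE "flavor" suffixes stripped, which is also why a newly loaded module can shadow cached names. A tiny illustration of bpf_core_essential_name_len() (expected values in the comments):

static void flavor_name_example(void)
{
	/* "___" plus a suffix marks a local flavor, ignored for matching */
	size_t a = bpf_core_essential_name_len("task_struct");		/* 11 */
	size_t b = bpf_core_essential_name_len("task_struct___v2");	/* 11 */

	/* a == b: a local task_struct___v2 matches candidates
	 * named task_struct in vmlinux or module BTF
	 */
}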
+ */ + bpf_free_cands_from_cache(cc); + cache[i] = NULL; + continue; } + /* when module is unloaded purge cache entries + * that match module's btf + */ + for (j = 0; j < cc->cnt; j++) + if (cc->cands[j].btf == btf) { + bpf_free_cands_from_cache(cc); + cache[i] = NULL; + break; + } } - mutex_unlock(&klist->mutex); - return false; + } +static void purge_cand_cache(struct btf *btf) +{ + mutex_lock(&cand_cache_mutex); + __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE); + mutex_unlock(&cand_cache_mutex); +} #endif -#define DEFINE_KFUNC_BTF_ID_LIST(name) \ - struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \ - __MUTEX_INITIALIZER(name.mutex) }; \ - EXPORT_SYMBOL_GPL(name) +static struct bpf_cand_cache * +bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf, + int targ_start_id) +{ + struct bpf_cand_cache *new_cands; + const struct btf_type *t; + const char *targ_name; + size_t targ_essent_len; + int n, i; + + n = btf_nr_types(targ_btf); + for (i = targ_start_id; i < n; i++) { + t = btf_type_by_id(targ_btf, i); + if (btf_kind(t) != cands->kind) + continue; + + targ_name = btf_name_by_offset(targ_btf, t->name_off); + if (!targ_name) + continue; -DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list); -DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list); + /* the resched point is before strncmp to make sure that search + * for non-existing name will have a chance to schedule(). + */ + cond_resched(); + + if (strncmp(cands->name, targ_name, cands->name_len) != 0) + continue; + + targ_essent_len = bpf_core_essential_name_len(targ_name); + if (targ_essent_len != cands->name_len) + continue; + + /* most of the time there is only one candidate for a given kind+name pair */ + new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL); + if (!new_cands) { + bpf_free_cands(cands); + return ERR_PTR(-ENOMEM); + } + + memcpy(new_cands, cands, sizeof_cands(cands->cnt)); + bpf_free_cands(cands); + cands = new_cands; + cands->cands[cands->cnt].btf = targ_btf; + cands->cands[cands->cnt].id = i; + cands->cnt++; + } + return cands; +} + +static struct bpf_cand_cache * +bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id) +{ + struct bpf_cand_cache *cands, *cc, local_cand = {}; + const struct btf *local_btf = ctx->btf; + const struct btf_type *local_type; + const struct btf *main_btf; + size_t local_essent_len; + struct btf *mod_btf; + const char *name; + int id; + + main_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(main_btf)) + return ERR_CAST(main_btf); + if (!main_btf) + return ERR_PTR(-EINVAL); + + local_type = btf_type_by_id(local_btf, local_type_id); + if (!local_type) + return ERR_PTR(-EINVAL); + + name = btf_name_by_offset(local_btf, local_type->name_off); + if (str_is_empty(name)) + return ERR_PTR(-EINVAL); + local_essent_len = bpf_core_essential_name_len(name); + + cands = &local_cand; + cands->name = name; + cands->kind = btf_kind(local_type); + cands->name_len = local_essent_len; + + cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + /* cands is a pointer to stack here */ + if (cc) { + if (cc->cnt) + return cc; + goto check_modules; + } + + /* Attempt to find target candidates in vmlinux BTF first */ + cands = bpf_core_add_cands(cands, main_btf, 1); + if (IS_ERR(cands)) + return ERR_CAST(cands); + + /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */ + + /* populate cache even when cands->cnt == 0 */ + cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE); + if (IS_ERR(cc)) + return 
ERR_CAST(cc); + + /* if vmlinux BTF has any candidate, don't go for module BTFs */ + if (cc->cnt) + return cc; + +check_modules: + /* cands is a pointer to stack here and cands->cnt == 0 */ + cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); + if (cc) + /* if cache has it return it even if cc->cnt == 0 */ + return cc; + + /* If candidate is not found in vmlinux's BTF then search in module's BTFs */ + spin_lock_bh(&btf_idr_lock); + idr_for_each_entry(&btf_idr, mod_btf, id) { + if (!btf_is_module(mod_btf)) + continue; + /* linear search could be slow hence unlock/lock + * the IDR to avoid holding it for too long + */ + btf_get(mod_btf); + spin_unlock_bh(&btf_idr_lock); + cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf)); + if (IS_ERR(cands)) { + btf_put(mod_btf); + return ERR_CAST(cands); + } + spin_lock_bh(&btf_idr_lock); + btf_put(mod_btf); + } + spin_unlock_bh(&btf_idr_lock); + /* cands is a pointer to kmalloced memory here if cands->cnt > 0 + * or pointer to stack if cands->cnt == 0. + * Copy it into the cache even when cands->cnt == 0 and + * return the result. + */ + return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE); +} + +int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, + int relo_idx, void *insn) +{ + bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL; + struct bpf_core_cand_list cands = {}; + struct bpf_core_relo_res targ_res; + struct bpf_core_spec *specs; + int err; + + /* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5" + * into arrays of btf_ids of struct fields and array indices. + */ + specs = kcalloc(3, sizeof(*specs), GFP_KERNEL); + if (!specs) + return -ENOMEM; + + if (need_cands) { + struct bpf_cand_cache *cc; + int i; + + mutex_lock(&cand_cache_mutex); + cc = bpf_core_find_cands(ctx, relo->type_id); + if (IS_ERR(cc)) { + bpf_log(ctx->log, "target candidate search failed for %d\n", + relo->type_id); + err = PTR_ERR(cc); + goto out; + } + if (cc->cnt) { + cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL); + if (!cands.cands) { + err = -ENOMEM; + goto out; + } + } + for (i = 0; i < cc->cnt; i++) { + bpf_log(ctx->log, + "CO-RE relocating %s %s: found target candidate [%d]\n", + btf_kind_str[cc->kind], cc->name, cc->cands[i].id); + cands.cands[i].btf = cc->cands[i].btf; + cands.cands[i].id = cc->cands[i].id; + } + cands.len = cc->cnt; + /* cand_cache_mutex needs to span the cache lookup and + * copy of btf pointer into bpf_core_cand_list, + * since module can be unloaded while bpf_core_calc_relo_insn + * is working with module's btf.
+ */ + } + + err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs, + &targ_res); + if (err) + goto out; + + err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx, + &targ_res); + +out: + kfree(specs); + if (need_cands) { + kfree(cands.cands); + mutex_unlock(&cand_cache_mutex); + if (ctx->log->level & BPF_LOG_LEVEL2) + print_cand_cache(ctx->log); + } + return err; +} diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 2ca643af9a54..128028efda64 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1031,7 +1031,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * If no socket is passed, or the socket is not of type INET or INET6, * this function does nothing and returns 0. @@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr * NET_XMIT_CN (2) - continue with packet output and notify TCP * to call cwr - * -EPERM - drop packet + * -err - drop packet * * For ingress packets, this function will return -EPERM if any * attached program was found and if it returned != 1 during execution. @@ -1079,8 +1079,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb); } else { ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb, - __bpf_prog_run_save_cb); - ret = (ret == 1 ? 0 : -EPERM); + __bpf_prog_run_save_cb, 0); + if (ret && !IS_ERR_VALUE((long)ret)) + ret = -EFAULT; } bpf_restore_data_end(skb, saved_data_end); __skb_pull(skb, offset); @@ -1093,7 +1094,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); /** * __cgroup_bpf_run_filter_sk() - Run a program on a sock * @sk: sock structure to manipulate - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * socket is passed is expected to be of type INET or INET6. * @@ -1107,10 +1108,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run); - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, + bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); @@ -1119,7 +1119,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * provided by user sockaddr * @sk: sock struct that will use sockaddr * @uaddr: sockaddr struct provided by user - * @type: The type of program to be exectuted + * @type: The type of program to be executed * @t_ctx: Pointer to attach type specific context * @flags: Pointer to u32 which contains higher bits of BPF program * return value (OR'ed together). @@ -1142,7 +1142,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, }; struct sockaddr_storage unspec; struct cgroup *cgrp; - int ret; /* Check socket family since not all sockets represent network * endpoint (e.g. AF_UNIX). @@ -1156,10 +1155,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run, flags); - - return ret == 1 ? 
0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, 0, flags); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); @@ -1169,7 +1166,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains * sk with connection information (IP addresses, etc.) May not contain * cgroup info if it is a req sock. - * @type: The type of program to be exectuted + * @type: The type of program to be executed * * socket passed is expected to be of type INET or INET6. * @@ -1184,11 +1181,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, enum cgroup_bpf_attach_type atype) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - int ret; - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, - bpf_prog_run); - return ret == 1 ? 0 : -EPERM; + return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops, + bpf_prog_run, 0); } EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); @@ -1201,17 +1196,47 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, .major = major, .minor = minor, }; - int allow; + int ret; rcu_read_lock(); cgrp = task_dfl_cgroup(current); - allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, - bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, 0); rcu_read_unlock(); - return !allow; + return ret; +} + +BPF_CALL_0(bpf_get_retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + return ctx->retval; +} + +static const struct bpf_func_proto bpf_get_retval_proto = { + .func = bpf_get_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_1(bpf_set_retval, int, retval) +{ + struct bpf_cg_run_ctx *ctx = + container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); + + ctx->retval = retval; + return 0; } +static const struct bpf_func_proto bpf_set_retval_proto = { + .func = bpf_set_retval, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1224,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_cgroup_id_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; + case BPF_FUNC_get_retval: + return &bpf_get_retval_proto; + case BPF_FUNC_set_retval: + return &bpf_set_retval_proto; default: return bpf_base_func_proto(func_id); } @@ -1337,7 +1366,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, rcu_read_lock(); cgrp = task_dfl_cgroup(current); - ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run); + ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, + bpf_prog_run, 0); rcu_read_unlock(); kfree(ctx.cur_val); @@ -1350,24 +1380,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.new_val); } - return ret == 1 ? 
0 : -EPERM; + return ret; } #ifdef CONFIG_NET -static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, - enum cgroup_bpf_attach_type attach_type) -{ - struct bpf_prog_array *prog_array; - bool empty; - - rcu_read_lock(); - prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); - empty = bpf_prog_array_is_empty(prog_array); - rcu_read_unlock(); - - return empty; -} - static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, struct bpf_sockopt_buf *buf) { @@ -1426,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, }; int ret, max_optlen; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT)) - return 0; - /* Allocate a bit more than the initial user buffer for * BPF program. The canonical use case is overriding * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1452,13 +1460,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT], - &ctx, bpf_prog_run); + &ctx, bpf_prog_run, 0); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret) goto out; - } if (ctx.optlen == -1) { /* optlen set to -1, bypass kernel */ @@ -1518,19 +1524,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, + .current_task = current, }; int ret; - /* Opportunistic check to see whether we have any BPF program - * attached to the hook so we don't waste time allocating - * memory and locking the socket. - */ - if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT)) - return retval; - ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1562,27 +1560,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, lock_sock(sk); ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); + &ctx, bpf_prog_run, retval); release_sock(sk); - if (!ret) { - ret = -EPERM; + if (ret < 0) goto out; - } if (ctx.optlen > max_optlen || ctx.optlen < 0) { ret = -EFAULT; goto out; } - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. 
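With the return value now carried in bpf_cg_run_ctx, a cgroup program can report a specific errno through the new bpf_set_retval() helper instead of the fixed -EPERM. A minimal sketch of a setsockopt filter (libbpf-style SEC name; the policy itself is made up):

SEC("cgroup/setsockopt")
int deny_ip_tos(struct bpf_sockopt *ctx)
{
	if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
		bpf_set_retval(-EACCES);	/* any -errno, not just -EPERM */
		return 0;			/* reject; retval reaches the caller */
	}
	return 1;				/* allow */
}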
- */ - if (ctx.retval != 0 && ctx.retval != retval) { - ret = -EFAULT; - goto out; - } - if (ctx.optlen != 0) { if (copy_to_user(optval, ctx.optval, ctx.optlen) || put_user(ctx.optlen, optlen)) { @@ -1591,8 +1579,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, } } - ret = ctx.retval; - out: sockopt_free_buf(&ctx, &buf); return ret; @@ -1607,10 +1593,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, .sk = sk, .level = level, .optname = optname, - .retval = retval, .optlen = *optlen, .optval = optval, .optval_end = optval + *optlen, + .current_task = current, }; int ret; @@ -1623,25 +1609,19 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, */ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT], - &ctx, bpf_prog_run); - if (!ret) - return -EPERM; + &ctx, bpf_prog_run, retval); + if (ret < 0) + return ret; if (ctx.optlen > *optlen) return -EFAULT; - /* BPF programs only allowed to set retval to 0, not some - * arbitrary value. - */ - if (ctx.retval != 0 && ctx.retval != retval) - return -EFAULT; - /* BPF programs can shrink the buffer, export the modifications. */ if (ctx.optlen != 0) *optlen = ctx.optlen; - return ctx.retval; + return ret; } #endif @@ -1789,7 +1769,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -1809,6 +1789,8 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sysctl_get_new_value_proto; case BPF_FUNC_sysctl_set_new_value: return &bpf_sysctl_set_new_value_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; default: return cgroup_base_func_proto(func_id, prog); } @@ -2055,10 +2037,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); break; case offsetof(struct bpf_sockopt, retval): - if (type == BPF_WRITE) - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); - else - *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); + BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); + + if (type == BPF_WRITE) { + int treg = BPF_REG_9; + + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + if (si->src_reg == treg || si->dst_reg == treg) + --treg; + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + treg, treg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + treg, si->src_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, + offsetof(struct bpf_sockopt_kern, tmp_reg)); + } else { + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sockopt_kern, current_task)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), + si->dst_reg, si->dst_reg, + offsetof(struct task_struct, bpf_ctx)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), + si->dst_reg, si->dst_reg, + offsetof(struct bpf_cg_run_ctx, retval)); + } break; case 
offsetof(struct bpf_sockopt, optval): *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2405e39d800f..13e9dbeeedf3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -33,6 +33,7 @@ #include <linux/extable.h> #include <linux/log2.h> #include <linux/bpf_verifier.h> +#include <linux/nodemask.h> #include <asm/barrier.h> #include <asm/unaligned.h> @@ -105,6 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); + fp->blinding_requested = bpf_jit_blinding_enabled(fp); INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); mutex_init(&fp->aux->used_maps_mutex); @@ -537,13 +539,10 @@ long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { - const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); - unsigned long addr = (unsigned long)hdr; - WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; - prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE; + prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void @@ -808,6 +807,176 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, return slot; } +/* + * BPF program pack allocator. + * + * Most BPF programs are pretty small. Allocating a whole page for each + * program is sometimes a waste. Many small BPF programs also add pressure + * to the instruction TLB. To solve this issue, we introduce a BPF program pack + * allocator. The prog_pack allocator uses a HPAGE_PMD_SIZE page (2MB on x86) + * to host BPF programs. + */ +#define BPF_PROG_CHUNK_SHIFT 6 +#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) +#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) + +struct bpf_prog_pack { + struct list_head list; + void *ptr; + unsigned long bitmap[]; +}; + +#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) + +static size_t bpf_prog_pack_size = -1; +static size_t bpf_prog_pack_mask = -1; + +static int bpf_prog_chunk_count(void) +{ + WARN_ON_ONCE(bpf_prog_pack_size == -1); + return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE; +} + +static DEFINE_MUTEX(pack_mutex); +static LIST_HEAD(pack_list); + +/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with + * CONFIG_MMU=n. Use PAGE_SIZE in these cases. + */ +#ifdef PMD_SIZE +#define BPF_HPAGE_SIZE PMD_SIZE +#define BPF_HPAGE_MASK PMD_MASK +#else +#define BPF_HPAGE_SIZE PAGE_SIZE +#define BPF_HPAGE_MASK PAGE_MASK +#endif + +static size_t select_bpf_prog_pack_size(void) +{ + size_t size; + void *ptr; + + size = BPF_HPAGE_SIZE * num_online_nodes(); + ptr = module_alloc(size); + + /* Test whether we can get huge pages. If not, just use PAGE_SIZE * packs.
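Some quick arithmetic behind these constants, under the comment's x86 assumption of 2MB huge pages:

static void pack_math_example(void)
{
	unsigned long chunk = 1UL << 6;		/* BPF_PROG_CHUNK_SIZE: 64 bytes */
	unsigned long pack = 2UL << 20;		/* HPAGE_PMD_SIZE on x86: 2MB */

	/* 2MB / 64B = 32768 chunks per pack, so the allocation bitmap
	 * costs 4KB while many small programs share one huge page
	 */
	unsigned long chunks = pack / chunk;	/* 32768 */
	unsigned long bitmap = chunks / 8;	/* 4096 bytes */
}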
+ */ + if (!ptr || !is_vm_area_hugepages(ptr)) { + size = PAGE_SIZE; + bpf_prog_pack_mask = PAGE_MASK; + } else { + bpf_prog_pack_mask = BPF_HPAGE_MASK; + } + + vfree(ptr); + return size; +} + +static struct bpf_prog_pack *alloc_new_pack(void) +{ + struct bpf_prog_pack *pack; + + pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())), + GFP_KERNEL); + if (!pack) + return NULL; + pack->ptr = module_alloc(bpf_prog_pack_size); + if (!pack->ptr) { + kfree(pack); + return NULL; + } + bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE); + list_add_tail(&pack->list, &pack_list); + + set_vm_flush_reset_perms(pack->ptr); + set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE); + return pack; +} + +static void *bpf_prog_pack_alloc(u32 size) +{ + unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); + struct bpf_prog_pack *pack; + unsigned long pos; + void *ptr = NULL; + + mutex_lock(&pack_mutex); + if (bpf_prog_pack_size == -1) + bpf_prog_pack_size = select_bpf_prog_pack_size(); + + if (size > bpf_prog_pack_size) { + size = round_up(size, PAGE_SIZE); + ptr = module_alloc(size); + if (ptr) { + set_vm_flush_reset_perms(ptr); + set_memory_ro((unsigned long)ptr, size / PAGE_SIZE); + set_memory_x((unsigned long)ptr, size / PAGE_SIZE); + } + goto out; + } + list_for_each_entry(pack, &pack_list, list) { + pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + nbits, 0); + if (pos < bpf_prog_chunk_count()) + goto found_free_area; + } + + pack = alloc_new_pack(); + if (!pack) + goto out; + + pos = 0; + +found_free_area: + bitmap_set(pack->bitmap, pos, nbits); + ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); + +out: + mutex_unlock(&pack_mutex); + return ptr; +} + +static void bpf_prog_pack_free(struct bpf_binary_header *hdr) +{ + struct bpf_prog_pack *pack = NULL, *tmp; + unsigned int nbits; + unsigned long pos; + void *pack_ptr; + + mutex_lock(&pack_mutex); + if (hdr->size > bpf_prog_pack_size) { + module_memfree(hdr); + goto out; + } + + pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask); + + list_for_each_entry(tmp, &pack_list, list) { + if (tmp->ptr == pack_ptr) { + pack = tmp; + break; + } + } + + if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) + goto out; + + nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size); + pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT; + + bitmap_clear(pack->bitmap, pos, nbits); + if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0, + bpf_prog_chunk_count(), 0) == 0) { + list_del(&pack->list); + module_memfree(pack->ptr); + kfree(pack); + } +out: + mutex_unlock(&pack_mutex); +} + static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, @@ -833,12 +1002,11 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 size) { - if (atomic_long_add_return(pages, &bpf_jit_current) > - (bpf_jit_limit >> PAGE_SHIFT)) { + if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) { if (!bpf_capable()) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } @@ -846,9 +1014,9 @@ int bpf_jit_charge_modmem(u32 pages) return 0; } -void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 size) { - atomic_long_sub(pages, &bpf_jit_current); + atomic_long_sub(size, 
&bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) @@ -867,7 +1035,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; - u32 size, hole, start, pages; + u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); @@ -877,20 +1045,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); - pages = size / PAGE_SIZE; - if (bpf_jit_charge_modmem(pages)) + if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); - hdr->pages = pages; + hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = (get_random_int() % hole) & ~(alignment - 1); @@ -903,10 +1070,117 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - u32 pages = hdr->pages; + u32 size = hdr->size; bpf_jit_free_exec(hdr); - bpf_jit_uncharge_modmem(pages); + bpf_jit_uncharge_modmem(size); +} + +/* Allocate jit binary from bpf_prog_pack allocator. + * Since the allocated memory is RO+X, the JIT engine cannot write directly + * to the memory. To solve this problem, a RW buffer is also allocated + * at the same time. The JIT engine should calculate offsets based on the + * RO memory address, but write the JITed program to the RW buffer. Once the + * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies + * the JITed program to the RO memory. + */ +struct bpf_binary_header * +bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + struct bpf_binary_header **rw_header, + u8 **rw_image, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *ro_header; + u32 size, hole, start; + + WARN_ON_ONCE(!is_power_of_2(alignment) || + alignment > BPF_IMAGE_ALIGNMENT); + + /* add 16 bytes for a random section of illegal instructions */ + size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); + + if (bpf_jit_charge_modmem(size)) + return NULL; + ro_header = bpf_prog_pack_alloc(size); + if (!ro_header) { + bpf_jit_uncharge_modmem(size); + return NULL; + } + + *rw_header = kvmalloc(size, GFP_KERNEL); + if (!*rw_header) { + bpf_arch_text_copy(&ro_header->size, &size, sizeof(size)); + bpf_prog_pack_free(ro_header); + bpf_jit_uncharge_modmem(size); + return NULL; + } + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(*rw_header, size); + (*rw_header)->size = size; + + hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), + BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); + start = (get_random_int() % hole) & ~(alignment - 1); + + *image_ptr = &ro_header->image[start]; + *rw_image = &(*rw_header)->image[start]; + + return ro_header; +}
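The calling sequence this design implies for an arch JIT, with error handling trimmed; emit_prog() and the surrounding variables are placeholders, so this is a sketch of the contract rather than a drop-in implementation:

ro_header = bpf_jit_binary_pack_alloc(proglen, &image, align,
				      &rw_header, &rw_image,
				      jit_fill_hole);
if (!ro_header)
	return orig_prog;

/* write instructions to rw_image, but compute all addresses
 * (branch targets, prog->bpf_func) against the RO image
 */
emit_prog(rw_image, image);

if (bpf_jit_binary_pack_finalize(prog, ro_header, rw_header))
	return orig_prog;	/* finalize already freed both buffers */
prog->bpf_func = (void *)image;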
+ +/* Copy JITed text from rw_header to its final location, the ro_header. */ +int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, + struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + void *ptr; + + ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); + + kvfree(rw_header); + + if (IS_ERR(ptr)) { + bpf_prog_pack_free(ro_header); + return PTR_ERR(ptr); + } + prog->aux->use_bpf_prog_pack = true; + return 0; +} + +/* bpf_jit_binary_pack_free is called in two different scenarios: + * 1) when the program is freed after a successful JIT; + * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). + * For case 2), we need to free both the RO memory and the RW buffer. + * + * bpf_jit_binary_pack_free requires proper ro_header->size. However, + * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size + * must be set with either bpf_jit_binary_pack_finalize (normal path) or + * bpf_arch_text_copy (when jit fails). + */ +void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, + struct bpf_binary_header *rw_header) +{ + u32 size = ro_header->size; + + bpf_prog_pack_free(ro_header); + kvfree(rw_header); + bpf_jit_uncharge_modmem(size); +} + +static inline struct bpf_binary_header * +bpf_jit_binary_hdr(const struct bpf_prog *fp) +{ + unsigned long real_start = (unsigned long)fp->bpf_func; + unsigned long addr; + + if (fp->aux->use_bpf_prog_pack) + addr = real_start & BPF_PROG_CHUNK_MASK; + else + addr = real_start & PAGE_MASK; + + return (void *)addr; } /* This symbol is only overridden by archs that have different @@ -918,7 +1192,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp) if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); - bpf_jit_binary_free(hdr); + if (fp->aux->use_bpf_prog_pack) + bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */); + else + bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } @@ -1146,7 +1423,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog) || prog->blinded) + if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); @@ -1574,7 +1851,8 @@ select_insn: if (unlikely(index >= array->map.max_entries)) goto out; - if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) + + if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; @@ -1828,28 +2106,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, } #endif -bool bpf_prog_array_compatible(struct bpf_array *array, - const struct bpf_prog *fp) +bool bpf_prog_map_compatible(struct bpf_map *map, + const struct bpf_prog *fp) { bool ret; if (fp->kprobe_override) return false; - spin_lock(&array->aux->owner.lock); - - if (!array->aux->owner.type) { + spin_lock(&map->owner.lock); + if (!map->owner.type) { /* There's no owner yet where we could check for * compatibility.
*/ - array->aux->owner.type = fp->type; - array->aux->owner.jited = fp->jited; + map->owner.type = fp->type; + map->owner.jited = fp->jited; + map->owner.xdp_has_frags = fp->aux->xdp_has_frags; ret = true; } else { - ret = array->aux->owner.type == fp->type && - array->aux->owner.jited == fp->jited; + ret = map->owner.type == fp->type && + map->owner.jited == fp->jited && + map->owner.xdp_has_frags == fp->aux->xdp_has_frags; } - spin_unlock(&array->aux->owner.lock); + spin_unlock(&map->owner.lock); + return ret; } @@ -1861,13 +2141,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; - struct bpf_array *array; - if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) + if (!map_type_contains_progs(map)) continue; - array = container_of(map, struct bpf_array, map); - if (!bpf_prog_array_compatible(array, fp)) { + if (!bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } @@ -1891,7 +2169,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp) /** * bpf_prog_select_runtime - select exec runtime for BPF program - * @fp: bpf_prog populated with internal BPF program + * @fp: bpf_prog populated with BPF program * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. @@ -1967,18 +2245,10 @@ static struct bpf_prog_dummy { }, }; -/* to avoid allocating empty bpf_prog_array for cgroups that - * don't have bpf program attached use one global 'empty_prog_array' - * It will not be modified the caller of bpf_prog_array_alloc() - * (since caller requested prog_cnt == 0) - * that pointer should be 'freed' by bpf_prog_array_free() - */ -static struct { - struct bpf_prog_array hdr; - struct bpf_prog *null_prog; -} empty_prog_array = { +struct bpf_empty_prog_array bpf_empty_prog_array = { .null_prog = NULL, }; +EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { @@ -1988,12 +2258,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) (prog_cnt + 1), flags); - return &empty_prog_array.hdr; + return &bpf_empty_prog_array.hdr; } void bpf_prog_array_free(struct bpf_prog_array *progs) { - if (!progs || progs == &empty_prog_array.hdr) + if (!progs || progs == &bpf_empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } @@ -2300,7 +2570,6 @@ static void bpf_prog_free_deferred(struct work_struct *work) } } -/* Free internal BPF program */ void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; @@ -2453,6 +2722,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, return -ENOTSUPP; } +void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + return ERR_PTR(-ENOTSUPP); +} + DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 585b2b77ccc4..650e5d21f90d 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -195,7 +195,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu, } return; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(skb->dev, rcpu->prog, act); @@ -254,7 +254,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu, } break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act); fallthrough; case XDP_DROP: 
xdp_return_frame(xdpf); @@ -397,7 +397,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) +static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, + struct bpf_map *map, int fd) { struct bpf_prog *prog; @@ -405,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->expected_attach_type != BPF_XDP_CPUMAP) { + if (prog->expected_attach_type != BPF_XDP_CPUMAP || + !bpf_prog_map_compatible(map, prog)) { bpf_prog_put(prog); return -EINVAL; } @@ -457,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; - if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) + if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd)) goto free_ptr_ring; /* Setup kthread */ @@ -746,15 +748,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) list_add(&bq->flush_node, flush_list); } -int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf, struct net_device *dev_rx) { - struct xdp_frame *xdpf; - - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - /* Info needed when constructing SKB on remote CPU */ xdpf->dev_rx = dev_rx; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f02d04540c0c..038f6d7a83e4 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -348,7 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, frames[nframes++] = xdpf; break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dev, xdp_prog, act); @@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf, bq->q[bq->count++] = xdpf; } -static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_prog *xdp_prog) { - struct xdp_frame *xdpf; int err; if (!dev->netdev_ops->ndo_xdp_xmit) return -EOPNOTSUPP; - err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data); + err = xdp_ok_fwd_dev(dev, xdpf->len); if (unlikely(err)) return err; - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - bq_enqueue(dev, xdpf, dev_rx, xdp_prog); return 0; } @@ -507,7 +502,7 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev __skb_push(skb, skb->mac_len); break; default: - bpf_warn_invalid_xdp_action(act); + bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act); fallthrough; case XDP_ABORTED: trace_xdp_exception(dst->dev, dst->xdp_prog, act); @@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev return act; } -int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, +int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf, struct net_device *dev_rx) { - return __xdp_enqueue(dev, xdp, dev_rx, NULL); + return __xdp_enqueue(dev, xdpf, dev_rx, NULL); } -int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf, struct net_device *dev_rx) { struct net_device *dev = dst->dev; - return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog); + return 
__xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog); } -static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp) +static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf) { if (!obj || !obj->dev->netdev_ops->ndo_xdp_xmit) return false; - if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data)) + if (xdp_ok_fwd_dev(obj->dev, xdpf->len)) return false; return true; @@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes) return n; } -int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, +int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx, struct bpf_map *map, bool exclude_ingress) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *dst, *last_dst = NULL; int excluded_devices[1+MAX_NEST_DEV]; struct hlist_head *head; - struct xdp_frame *xdpf; int num_excluded = 0; unsigned int i; int err; @@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, excluded_devices[num_excluded++] = dev_rx->ifindex; } - xdpf = xdp_convert_buff_to_frame(xdp); - if (unlikely(!xdpf)) - return -EOVERFLOW; - if (map->map_type == BPF_MAP_TYPE_DEVMAP) { for (i = 0; i < map->max_entries; i++) { dst = rcu_dereference_check(dtab->netdev_map[i], rcu_read_lock_bh_held()); - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex)) @@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx, head = dev_map_index_hash(dtab, i); hlist_for_each_entry_rcu(dst, head, index_hlist, lockdep_is_held(&dtab->index_lock)) { - if (!is_valid_dst(dst, xdp)) + if (!is_valid_dst(dst, xdpf)) continue; if (is_ifindex_excluded(excluded_devices, num_excluded, @@ -868,7 +858,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, BPF_PROG_TYPE_XDP, false); if (IS_ERR(prog)) goto err_put_dev; - if (prog->expected_attach_type != BPF_XDP_DEVMAP) + if (prog->expected_attach_type != BPF_XDP_DEVMAP || + !bpf_prog_map_compatible(&dtab->map, prog)) goto err_put_prog; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d29af9988f37..65877967f414 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1636,7 +1636,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, value_size = size * num_possible_cpus(); total = 0; /* while experimenting with hash tables with sizes ranging from 10 to - * 1000, it was observed that a bucket can have upto 5 entries. + * 1000, it was observed that a bucket can have up to 5 entries. */ bucket_size = 5; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1ffd469c217f..315053ef6a75 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2,6 +2,8 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/bpf-cgroup.h> #include <linux/rcupdate.h> #include <linux/random.h> #include <linux/smp.h> @@ -15,6 +17,7 @@ #include <linux/pid_namespace.h> #include <linux/proc_ns.h> #include <linux/security.h> +#include <linux/btf_ids.h> #include "../../lib/kstrtox.h" @@ -222,13 +225,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) if (unlikely(!task)) goto err_clear; - strncpy(buf, task->comm, size); - - /* Verifier guarantees that size > 0. For task->comm exceeding - * size, guarantee that buf is %NUL-terminated. Unconditionally - * done here to save the size test. 
- */ - buf[size - 1] = 0; + /* Verifier guarantees that size > 0 */ + strscpy(buf, task->comm, size); return 0; err_clear: memset(buf, 0, size); @@ -530,7 +528,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .func = bpf_strtol, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, @@ -558,13 +556,27 @@ const struct bpf_func_proto bpf_strtoul_proto = { .func = bpf_strtoul, .gpl_only = false, .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_MEM, + .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; #endif +BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2) +{ + return strncmp(s1, s2, s1_sz); +} + +const struct bpf_func_proto bpf_strncmp_proto = { + .func = bpf_strncmp, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_PTR_TO_CONST_STR, +}; + BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino, struct bpf_pidns_info *, nsdata, u32, size) { @@ -630,7 +642,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -656,6 +668,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size, + const void __user *, user_ptr, struct task_struct *, tsk, u64, flags) +{ + int ret; + + /* flags is not used yet */ + if (unlikely(flags)) + return -EINVAL; + + if (unlikely(!size)) + return 0; + + ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0); + if (ret == size) + return 0; + + memset(dst, 0, size); + /* Return -EFAULT for partial read */ + return ret < 0 ? 
ret : -EFAULT; +} + +const struct bpf_func_proto bpf_copy_from_user_task_proto = { + .func = bpf_copy_from_user_task, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_UNINIT_MEM, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_BTF_ID, + .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg5_type = ARG_ANYTHING +}; + BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) { if (cpu >= nr_cpu_ids) @@ -667,7 +712,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu) const struct bpf_func_proto bpf_per_cpu_ptr_proto = { .func = bpf_per_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, .arg2_type = ARG_ANYTHING, }; @@ -680,7 +725,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr) const struct bpf_func_proto bpf_this_cpu_ptr_proto = { .func = bpf_this_cpu_ptr, .gpl_only = false, - .ret_type = RET_PTR_TO_MEM_OR_BTF_ID, + .ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY, .arg1_type = ARG_PTR_TO_PERCPU_BTF_ID, }; @@ -1011,7 +1056,7 @@ const struct bpf_func_proto bpf_snprintf_proto = { .arg1_type = ARG_PTR_TO_MEM_OR_NULL, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_PTR_TO_CONST_STR, - .arg4_type = ARG_PTR_TO_MEM_OR_NULL, + .arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY, .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; @@ -1043,7 +1088,7 @@ struct bpf_hrtimer { struct bpf_timer_kern { struct bpf_hrtimer *timer; /* bpf_spin_lock is used here instead of spinlock_t to make - * sure that it always fits into space resereved by struct bpf_timer + * sure that it always fits into space reserved by struct bpf_timer * regardless of LOCKDEP and spinlock debug flags. */ struct bpf_spin_lock lock; @@ -1060,6 +1105,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer) void *key; u32 idx; + BTF_TYPE_EMIT(struct bpf_timer); callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held()); if (!callback_fn) goto out; @@ -1364,8 +1410,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; - case BPF_FUNC_ktime_get_coarse_ns: - return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: @@ -1378,6 +1422,10 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_query_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; + case BPF_FUNC_loop: + return &bpf_loop_proto; + case BPF_FUNC_strncmp: + return &bpf_strncmp_proto; default: break; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 80da1db47c68..4f841e16779e 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -648,12 +648,22 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) int opt; opt = fs_parse(fc, bpf_fs_parameters, param, &result); - if (opt < 0) + if (opt < 0) { /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ - return opt == -ENOPARAM ? 
0 : opt; + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; + + return 0; + } + + if (opt < 0) + return opt; + } switch (opt) { case OPT_MODE: @@ -700,11 +710,10 @@ static DEFINE_MUTEX(bpf_preload_lock); static int populate_bpffs(struct dentry *parent) { struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; - struct bpf_link *links[BPF_PRELOAD_LINKS] = {}; int err = 0, i; /* grab the mutex to make sure the kernel interactions with bpf_preload - * UMD are serialized + * are serialized */ mutex_lock(&bpf_preload_lock); @@ -712,40 +721,22 @@ static int populate_bpffs(struct dentry *parent) if (!bpf_preload_mod_get()) goto out; - if (!bpf_preload_ops->info.tgid) { - /* preload() will start UMD that will load BPF iterator programs */ - err = bpf_preload_ops->preload(objs); - if (err) + err = bpf_preload_ops->preload(objs); + if (err) + goto out_put; + for (i = 0; i < BPF_PRELOAD_LINKS; i++) { + bpf_link_inc(objs[i].link); + err = bpf_iter_link_pin_kernel(parent, + objs[i].link_name, objs[i].link); + if (err) { + bpf_link_put(objs[i].link); goto out_put; - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - links[i] = bpf_link_by_id(objs[i].link_id); - if (IS_ERR(links[i])) { - err = PTR_ERR(links[i]); - goto out_put; - } - } - for (i = 0; i < BPF_PRELOAD_LINKS; i++) { - err = bpf_iter_link_pin_kernel(parent, - objs[i].link_name, links[i]); - if (err) - goto out_put; - /* do not unlink successfully pinned links even - * if later link fails to pin - */ - links[i] = NULL; } - /* finish() will tell UMD process to exit */ - err = bpf_preload_ops->finish(); - if (err) - goto out_put; } out_put: bpf_preload_mod_put(); out: mutex_unlock(&bpf_preload_lock); - for (i = 0; i < BPF_PRELOAD_LINKS && err; i++) - if (!IS_ERR_OR_NULL(links[i])) - bpf_link_put(links[i]); return err; } diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 035e9e3a7132..497916060ac7 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -1,4 +1,4 @@ -//SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0 #include <linux/bpf-cgroup.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> @@ -163,8 +163,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return 0; } - new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) + - map->value_size, + new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size), __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, map->numa_node); if (!new) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 423549d2c52e..5763cc7ac4f1 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -412,7 +412,7 @@ static int trie_update_elem(struct bpf_map *map, rcu_assign_pointer(im_node->child[1], node); } - /* Finally, assign the intermediate node to the determined spot */ + /* Finally, assign the intermediate node to the determined slot */ rcu_assign_pointer(*slot, im_node); out: diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 6a9542af4212..b0fa190b0979 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = { .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map_elem, key), - PTR_TO_RDONLY_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY }, { offsetof(struct bpf_iter__bpf_map_elem, value), - PTR_TO_RDWR_BUF_OR_NULL }, + PTR_TO_BUF | PTR_MAYBE_NULL }, }, }; diff --git a/kernel/bpf/mmap_unlock_work.h 
b/kernel/bpf/mmap_unlock_work.h new file mode 100644 index 000000000000..5d18d7d85bef --- /dev/null +++ b/kernel/bpf/mmap_unlock_work.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2021 Facebook + */ + +#ifndef __MMAP_UNLOCK_WORK_H__ +#define __MMAP_UNLOCK_WORK_H__ +#include <linux/irq_work.h> + +/* irq_work to run mmap_read_unlock() in irq_work */ +struct mmap_unlock_irq_work { + struct irq_work irq_work; + struct mm_struct *mm; +}; + +DECLARE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); + +/* + * We cannot do mmap_read_unlock() when the irq is disabled, because of + * risk to deadlock with rq_lock. To look up vma when the irqs are + * disabled, we need to run mmap_read_unlock() in irq_work. We use a + * percpu variable to do the irq_work. If the irq_work is already used + * by another lookup, we fall over. + */ +static inline bool bpf_mmap_unlock_get_irq_work(struct mmap_unlock_irq_work **work_ptr) +{ + struct mmap_unlock_irq_work *work = NULL; + bool irq_work_busy = false; + + if (irqs_disabled()) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + work = this_cpu_ptr(&mmap_unlock_work); + if (irq_work_is_busy(&work->irq_work)) { + /* cannot queue more up_read, fallback */ + irq_work_busy = true; + } + } else { + /* + * PREEMPT_RT does not allow to trylock mmap sem in + * interrupt disabled context. Force the fallback code. + */ + irq_work_busy = true; + } + } + + *work_ptr = work; + return irq_work_busy; +} + +static inline void bpf_mmap_unlock_mm(struct mmap_unlock_irq_work *work, struct mm_struct *mm) +{ + if (!work) { + mmap_read_unlock(mm); + } else { + work->mm = mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); + irq_work_queue(&work->irq_work); + } +} + +#endif /* __MMAP_UNLOCK_WORK_H__ */ diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 542f275bf252..868cc2c43899 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/bpf.h> +#include <linux/bpf-netns.h> #include <linux/filter.h> #include <net/net_namespace.h> diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index 26bced262473..c9d45c9d6918 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -18,10 +18,9 @@ menuconfig BPF_PRELOAD if BPF_PRELOAD config BPF_PRELOAD_UMD - tristate "bpf_preload kernel module with user mode driver" - depends on CC_CAN_LINK - depends on m || CC_CAN_LINK_STATIC + tristate "bpf_preload kernel module" default m help - This builds bpf_preload kernel module with embedded user mode driver. + This builds bpf_preload kernel module with embedded BPF programs for + introspection in bpffs. endif diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile index 1400ac58178e..20f89cc0a0a6 100644 --- a/kernel/bpf/preload/Makefile +++ b/kernel/bpf/preload/Makefile @@ -1,42 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ -LIBBPF_OUT = $(abspath $(obj))/libbpf -LIBBPF_A = $(LIBBPF_OUT)/libbpf.a -LIBBPF_DESTDIR = $(LIBBPF_OUT) -LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include - -# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test -# in tools/scripts/Makefile.include always succeeds when building the kernel -# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg". 
-$(LIBBPF_A): | $(LIBBPF_OUT) - $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \ - DESTDIR=$(LIBBPF_DESTDIR) prefix= \ - $(LIBBPF_OUT)/libbpf.a install_headers - -libbpf_hdrs: $(LIBBPF_A) - -.PHONY: libbpf_hdrs - -$(LIBBPF_OUT): - $(call msg,MKDIR,$@) - $(Q)mkdir -p $@ - -userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ - -I $(LIBBPF_INCLUDE) -Wno-unused-result - -userprogs := bpf_preload_umd - -clean-files := libbpf/ - -$(obj)/iterators/iterators.o: | libbpf_hdrs - -bpf_preload_umd-objs := iterators/iterators.o -bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz - -$(obj)/bpf_preload_umd: $(LIBBPF_A) - -$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd +LIBBPF_INCLUDE = $(srctree)/tools/lib obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o -bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o +CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE) +bpf_preload-objs += bpf_preload_kern.o diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h index 2f9932276f2e..f065c91213a0 100644 --- a/kernel/bpf/preload/bpf_preload.h +++ b/kernel/bpf/preload/bpf_preload.h @@ -2,13 +2,13 @@ #ifndef _BPF_PRELOAD_H #define _BPF_PRELOAD_H -#include <linux/usermode_driver.h> -#include "iterators/bpf_preload_common.h" +struct bpf_preload_info { + char link_name[16]; + struct bpf_link *link; +}; struct bpf_preload_ops { - struct umd_info info; int (*preload)(struct bpf_preload_info *); - int (*finish)(void); struct module *owner; }; extern struct bpf_preload_ops *bpf_preload_ops; diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 53736e52c1df..5106b5372f0c 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -2,101 +2,87 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/init.h> #include <linux/module.h> -#include <linux/pid.h> -#include <linux/fs.h> -#include <linux/sched/signal.h> #include "bpf_preload.h" +#include "iterators/iterators.lskel.h" -extern char bpf_preload_umd_start; -extern char bpf_preload_umd_end; +static struct bpf_link *maps_link, *progs_link; +static struct iterators_bpf *skel; -static int preload(struct bpf_preload_info *obj); -static int finish(void); +static void free_links_and_skel(void) +{ + if (!IS_ERR_OR_NULL(maps_link)) + bpf_link_put(maps_link); + if (!IS_ERR_OR_NULL(progs_link)) + bpf_link_put(progs_link); + iterators_bpf__destroy(skel); +} + +static int preload(struct bpf_preload_info *obj) +{ + strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name)); + obj[0].link = maps_link; + strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name)); + obj[1].link = progs_link; + return 0; +} -static struct bpf_preload_ops umd_ops = { - .info.driver_name = "bpf_preload", +static struct bpf_preload_ops ops = { .preload = preload, - .finish = finish, .owner = THIS_MODULE, }; -static int preload(struct bpf_preload_info *obj) +static int load_skel(void) { - int magic = BPF_PRELOAD_START; - loff_t pos = 0; - int i, err; - ssize_t n; + int err; - err = fork_usermode_driver(&umd_ops.info); + skel = iterators_bpf__open(); + if (!skel) + return -ENOMEM; + err = iterators_bpf__load(skel); if (err) - return err; - - /* send the start magic to let UMD proceed with loading BPF progs */ - n = kernel_write(umd_ops.info.pipe_to_umh, - &magic, sizeof(magic), &pos); - if (n != sizeof(magic)) - return -EPIPE; - - /* receive bpf_link IDs and names from UMD */ - pos = 0; - for (i = 0; i < BPF_PRELOAD_LINKS; i++) 
{ - n = kernel_read(umd_ops.info.pipe_from_umh, - &obj[i], sizeof(*obj), &pos); - if (n != sizeof(*obj)) - return -EPIPE; + goto out; + err = iterators_bpf__attach(skel); + if (err) + goto out; + maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd); + if (IS_ERR(maps_link)) { + err = PTR_ERR(maps_link); + goto out; } - return 0; -} - -static int finish(void) -{ - int magic = BPF_PRELOAD_END; - struct pid *tgid; - loff_t pos = 0; - ssize_t n; - - /* send the last magic to UMD. It will do a normal exit. */ - n = kernel_write(umd_ops.info.pipe_to_umh, - &magic, sizeof(magic), &pos); - if (n != sizeof(magic)) - return -EPIPE; - - tgid = umd_ops.info.tgid; - if (tgid) { - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(&umd_ops.info); + progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd); + if (IS_ERR(progs_link)) { + err = PTR_ERR(progs_link); + goto out; } + /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out + * makes skel_closenz() a no-op later in iterators_bpf__destroy(). + */ + close_fd(skel->links.dump_bpf_map_fd); + skel->links.dump_bpf_map_fd = 0; + close_fd(skel->links.dump_bpf_prog_fd); + skel->links.dump_bpf_prog_fd = 0; return 0; +out: + free_links_and_skel(); + return err; } -static int __init load_umd(void) +static int __init load(void) { int err; - err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start, - &bpf_preload_umd_end - &bpf_preload_umd_start); + err = load_skel(); if (err) return err; - bpf_preload_ops = &umd_ops; + bpf_preload_ops = &ops; return err; } -static void __exit fini_umd(void) +static void __exit fini(void) { - struct pid *tgid; - bpf_preload_ops = NULL; - - /* kill UMD in case it's still there due to earlier error */ - tgid = umd_ops.info.tgid; - if (tgid) { - kill_pid(tgid, SIGKILL, 1); - - wait_event(tgid->wait_pidfd, thread_group_exited(tgid)); - umd_cleanup_helper(&umd_ops.info); - } - umd_unload_blob(&umd_ops.info); + free_links_and_skel(); } -late_initcall(load_umd); -module_exit(fini_umd); +late_initcall(load); +module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S deleted file mode 100644 index f1f40223b5c3..000000000000 --- a/kernel/bpf/preload/bpf_preload_umd_blob.S +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - .section .init.rodata, "a" - .global bpf_preload_umd_start -bpf_preload_umd_start: - .incbin "kernel/bpf/preload/bpf_preload_umd" - .global bpf_preload_umd_end -bpf_preload_umd_end: diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile index b8bd60511227..bfe24f8c5a20 100644 --- a/kernel/bpf/preload/iterators/Makefile +++ b/kernel/bpf/preload/iterators/Makefile @@ -35,15 +35,15 @@ endif .PHONY: all clean -all: iterators.skel.h +all: iterators.lskel.h clean: $(call msg,CLEAN) $(Q)rm -rf $(OUTPUT) iterators -iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) +iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL) $(call msg,GEN-SKEL,$@) - $(Q)$(BPFTOOL) gen skeleton $< > $@ + $(Q)$(BPFTOOL) gen skeleton -L $< > $@ $(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT) diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h deleted file mode 100644 index 8464d1a48c05..000000000000 --- a/kernel/bpf/preload/iterators/bpf_preload_common.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BPF_PRELOAD_COMMON_H -#define 
_BPF_PRELOAD_COMMON_H - -#define BPF_PRELOAD_START 0x5555 -#define BPF_PRELOAD_END 0xAAAA - -struct bpf_preload_info { - char link_name[16]; - int link_id; -}; - -#endif diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c deleted file mode 100644 index 5d872a705470..000000000000 --- a/kernel/bpf/preload/iterators/iterators.c +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (c) 2020 Facebook */ -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/resource.h> -#include <bpf/libbpf.h> -#include <bpf/bpf.h> -#include <sys/mount.h> -#include "iterators.skel.h" -#include "bpf_preload_common.h" - -int to_kernel = -1; -int from_kernel = 0; - -static int send_link_to_kernel(struct bpf_link *link, const char *link_name) -{ - struct bpf_preload_info obj = {}; - struct bpf_link_info info = {}; - __u32 info_len = sizeof(info); - int err; - - err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len); - if (err) - return err; - obj.link_id = info.id; - if (strlen(link_name) >= sizeof(obj.link_name)) - return -E2BIG; - strcpy(obj.link_name, link_name); - if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj)) - return -EPIPE; - return 0; -} - -int main(int argc, char **argv) -{ - struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY }; - struct iterators_bpf *skel; - int err, magic; - int debug_fd; - - debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC); - if (debug_fd < 0) - return 1; - to_kernel = dup(1); - close(1); - dup(debug_fd); - /* now stdin and stderr point to /dev/console */ - - read(from_kernel, &magic, sizeof(magic)); - if (magic != BPF_PRELOAD_START) { - printf("bad start magic %d\n", magic); - return 1; - } - setrlimit(RLIMIT_MEMLOCK, &rlim); - /* libbpf opens BPF object and loads it into the kernel */ - skel = iterators_bpf__open_and_load(); - if (!skel) { - /* iterators.skel.h is little endian. - * libbpf doesn't support automatic little->big conversion - * of BPF bytecode yet. - * The program load will fail in such case. - */ - printf("Failed load could be due to wrong endianness\n"); - return 1; - } - err = iterators_bpf__attach(skel); - if (err) - goto cleanup; - - /* send two bpf_link IDs with names to the kernel */ - err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug"); - if (err) - goto cleanup; - err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug"); - if (err) - goto cleanup; - - /* The kernel will proceed with pinnging the links in bpffs. - * UMD will wait on read from pipe. - */ - read(from_kernel, &magic, sizeof(magic)); - if (magic != BPF_PRELOAD_END) { - printf("bad final magic %d\n", magic); - err = -EINVAL; - } -cleanup: - iterators_bpf__destroy(skel); - - return err != 0; -} diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h new file mode 100644 index 000000000000..70f236a82fe1 --- /dev/null +++ b/kernel/bpf/preload/iterators/iterators.lskel.h @@ -0,0 +1,425 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* THIS FILE IS AUTOGENERATED! 
*/ +#ifndef __ITERATORS_BPF_SKEL_H__ +#define __ITERATORS_BPF_SKEL_H__ + +#include <bpf/skel_internal.h> + +struct iterators_bpf { + struct bpf_loader_ctx ctx; + struct { + struct bpf_map_desc rodata; + } maps; + struct { + struct bpf_prog_desc dump_bpf_map; + struct bpf_prog_desc dump_bpf_prog; + } progs; + struct { + int dump_bpf_map_fd; + int dump_bpf_prog_fd; + } links; + struct iterators_bpf__rodata { + } *rodata; +}; + +static inline int +iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel) +{ + int prog_fd = skel->progs.dump_bpf_map.prog_fd; + int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + + if (fd > 0) + skel->links.dump_bpf_map_fd = fd; + return fd; +} + +static inline int +iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel) +{ + int prog_fd = skel->progs.dump_bpf_prog.prog_fd; + int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER); + + if (fd > 0) + skel->links.dump_bpf_prog_fd = fd; + return fd; +} + +static inline int +iterators_bpf__attach(struct iterators_bpf *skel) +{ + int ret = 0; + + ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel); + ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel); + return ret < 0 ? ret : 0; +} + +static inline void +iterators_bpf__detach(struct iterators_bpf *skel) +{ + skel_closenz(skel->links.dump_bpf_map_fd); + skel_closenz(skel->links.dump_bpf_prog_fd); +} +static void +iterators_bpf__destroy(struct iterators_bpf *skel) +{ + if (!skel) + return; + iterators_bpf__detach(skel); + skel_closenz(skel->progs.dump_bpf_map.prog_fd); + skel_closenz(skel->progs.dump_bpf_prog.prog_fd); + skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096); + skel_closenz(skel->maps.rodata.map_fd); + skel_free(skel); +} +static inline struct iterators_bpf * +iterators_bpf__open(void) +{ + struct iterators_bpf *skel; + + skel = skel_alloc(sizeof(*skel)); + if (!skel) + goto cleanup; + skel->ctx.sz = (void *)&skel->links - (void *)skel; + skel->rodata = skel_prep_map_data((void *)"\ +\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\ +\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\ +\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98); + if (!skel->rodata) + goto cleanup; + skel->maps.rodata.initial_value = (__u64) (long) skel->rodata; + return skel; +cleanup: + iterators_bpf__destroy(skel); + return NULL; +} + +static inline int +iterators_bpf__load(struct iterators_bpf *skel) +{ + struct bpf_load_and_run_opts opts = {}; + int err; + + opts.ctx = (struct bpf_loader_ctx *)skel; + opts.data_sz = 6056; + opts.data = (void *)"\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ 
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\ +\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\ +\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\ +\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\ +\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\ +\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\ +\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\ +\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\ +\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\ +\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\ +\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\ +\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\ +\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\ +\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\ +\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\ +\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\ +\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\ +\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\ +\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\ 
+\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\ +\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\ +\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\ +\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\ +\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\ +\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\ +\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\ +\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\ +\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\ +\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\ +\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\ +\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\ +\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\ +\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\ +\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\ +\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\ +\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\ +\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\ +\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\ +\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\ +\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\ +\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\ +\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\ +\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\ +\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\ +\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\ +\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\ +\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\ +\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\ +\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\ +\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\ +\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\ +\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ +\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ +\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ +\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ +\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ +\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ +\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ 
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ +\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ +\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ +\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ +\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ +\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ +\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ +\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ +\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ +\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ +\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ +\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ +\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ +\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ +\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ +\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ +\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ +\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ +\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ +\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ +\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ +\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ +\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ +\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ +\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ +\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ +\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ +\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ +\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ +\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ +\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ +\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ +\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ +\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\ +\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\ +\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\ +\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\ +\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\ +\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\ +\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\ 
+\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\ +\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\ +\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\ +\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\ +\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\ +\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\ +\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\ +\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\ +\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\ +\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\ +\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\ +\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\ +\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\ +\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\ +\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\ +\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\ +\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\ +\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\ +\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\ +\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\ +\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\ +\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\ +\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\ +\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\ +\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\ +\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\ +\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\ +\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\ +\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\ +\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\ +\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\ +\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\ +\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\ +\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\ +\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\ +\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\ +\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\ +\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\ +\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\ 
+\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\ +\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\ +\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\ +\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\ +\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\ +\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\ +\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\ +\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\ +\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\ +\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\ +\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\ +\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\ +\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\ +\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\ +\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\ +\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\ +\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\ +\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\ +\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\ +\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\ +\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\ +\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\ +\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\ +\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\ +\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\ +\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\ +\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\ +\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\ +\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\ +\0\0\0\0"; + opts.insns_sz = 2216; + opts.insns = (void *)"\ +\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\ +\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\ +\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\ +\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\ +\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\ +\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\ +\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\ +\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\ 
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\ +\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\ +\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\ +\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\ +\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\ +\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\ +\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\ +\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\ +\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\ +\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\ +\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\ +\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\ +\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\ +\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\ +\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\ +\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\ +\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\ +\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\ +\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\ +\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\ +\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\ +\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\ +\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\ +\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\ +\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\ +\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\ +\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\ +\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\ +\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\ +\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\ +\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\ +\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\ +\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\ +\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\ +\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\ +\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\ +\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\ +\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\ 
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\ +\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\ +\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\ +\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\ +\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\ +\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\ +\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\ +\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\ +\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\ +\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\ +\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\ +\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\ +\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\ +\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\ +\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\ +\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\ +\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\ +\0\0\0\0\x95\0\0\0\0\0\0\0"; + err = bpf_load_and_run(&opts); + if (err < 0) + return err; + skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value, + 4096, PROT_READ, skel->maps.rodata.map_fd); + if (!skel->rodata) + return -ENOMEM; + return 0; +} + +static inline struct iterators_bpf * +iterators_bpf__open_and_load(void) +{ + struct iterators_bpf *skel; + + skel = iterators_bpf__open(); + if (!skel) + return NULL; + if (iterators_bpf__load(skel)) { + iterators_bpf__destroy(skel); + return NULL; + } + return skel; +} + +#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h deleted file mode 100644 index cf9a6a94b3a4..000000000000 --- a/kernel/bpf/preload/iterators/iterators.skel.h +++ /dev/null @@ -1,412 +0,0 @@ -/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ - -/* THIS FILE IS AUTOGENERATED! 
*/ -#ifndef __ITERATORS_BPF_SKEL_H__ -#define __ITERATORS_BPF_SKEL_H__ - -#include <stdlib.h> -#include <bpf/libbpf.h> - -struct iterators_bpf { - struct bpf_object_skeleton *skeleton; - struct bpf_object *obj; - struct { - struct bpf_map *rodata; - } maps; - struct { - struct bpf_program *dump_bpf_map; - struct bpf_program *dump_bpf_prog; - } progs; - struct { - struct bpf_link *dump_bpf_map; - struct bpf_link *dump_bpf_prog; - } links; - struct iterators_bpf__rodata { - char dump_bpf_map____fmt[35]; - char dump_bpf_map____fmt_1[14]; - char dump_bpf_prog____fmt[32]; - char dump_bpf_prog____fmt_2[17]; - } *rodata; -}; - -static void -iterators_bpf__destroy(struct iterators_bpf *obj) -{ - if (!obj) - return; - if (obj->skeleton) - bpf_object__destroy_skeleton(obj->skeleton); - free(obj); -} - -static inline int -iterators_bpf__create_skeleton(struct iterators_bpf *obj); - -static inline struct iterators_bpf * -iterators_bpf__open_opts(const struct bpf_object_open_opts *opts) -{ - struct iterators_bpf *obj; - - obj = (struct iterators_bpf *)calloc(1, sizeof(*obj)); - if (!obj) - return NULL; - if (iterators_bpf__create_skeleton(obj)) - goto err; - if (bpf_object__open_skeleton(obj->skeleton, opts)) - goto err; - - return obj; -err: - iterators_bpf__destroy(obj); - return NULL; -} - -static inline struct iterators_bpf * -iterators_bpf__open(void) -{ - return iterators_bpf__open_opts(NULL); -} - -static inline int -iterators_bpf__load(struct iterators_bpf *obj) -{ - return bpf_object__load_skeleton(obj->skeleton); -} - -static inline struct iterators_bpf * -iterators_bpf__open_and_load(void) -{ - struct iterators_bpf *obj; - - obj = iterators_bpf__open(); - if (!obj) - return NULL; - if (iterators_bpf__load(obj)) { - iterators_bpf__destroy(obj); - return NULL; - } - return obj; -} - -static inline int -iterators_bpf__attach(struct iterators_bpf *obj) -{ - return bpf_object__attach_skeleton(obj->skeleton); -} - -static inline void -iterators_bpf__detach(struct iterators_bpf *obj) -{ - return bpf_object__detach_skeleton(obj->skeleton); -} - -static inline int -iterators_bpf__create_skeleton(struct iterators_bpf *obj) -{ - struct bpf_object_skeleton *s; - - s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s)); - if (!s) - return -1; - obj->skeleton = s; - - s->sz = sizeof(*s); - s->name = "iterators_bpf"; - s->obj = &obj->obj; - - /* maps */ - s->map_cnt = 1; - s->map_skel_sz = sizeof(*s->maps); - s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz); - if (!s->maps) - goto err; - - s->maps[0].name = "iterator.rodata"; - s->maps[0].map = &obj->maps.rodata; - s->maps[0].mmaped = (void **)&obj->rodata; - - /* programs */ - s->prog_cnt = 2; - s->prog_skel_sz = sizeof(*s->progs); - s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz); - if (!s->progs) - goto err; - - s->progs[0].name = "dump_bpf_map"; - s->progs[0].prog = &obj->progs.dump_bpf_map; - s->progs[0].link = &obj->links.dump_bpf_map; - - s->progs[1].name = "dump_bpf_prog"; - s->progs[1].prog = &obj->progs.dump_bpf_prog; - s->progs[1].link = &obj->links.dump_bpf_prog; - - s->data_sz = 7176; - s->data = (void *)"\ -\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\ -\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\ -\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\ 
-\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\ -\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\ -\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\ -\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\ -\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\ -\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\ -\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\ -\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\ -\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\ -\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\ -\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\ -\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\ -\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\ -\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\ -\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\ -\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\ -\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\ -\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\ -\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\ -\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\ -\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\ -\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\ -\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\ -\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\ -\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\ -\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\ -\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\ -\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\ -\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\ -\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\ -\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\ -\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\ -\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\ -\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\ -\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\ -\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\ -\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\ -\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\ -\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\ -\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\ -\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\ 
-\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\ -\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\ -\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\ -\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\ -\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\ -\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\ -\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\ -\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\ -\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\ -\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\ -\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\ -\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\ -\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\ -\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\ -\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\ -\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\ -\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\ -\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\ -\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\ -\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\ -\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\ -\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\ -\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\ -\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\ -\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\ -\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\ -\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\ -\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\ -\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\ -\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\ -\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\ -\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\ -\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\ -\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\ -\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\ -\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\ -\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\ -\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\ -\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\ -\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\ -\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\ 
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\ -\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\ -\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\ -\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\ -\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\ -\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\ -\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\ -\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\ -\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\ -\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\ -\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\ -\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\ -\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\ -\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\ -\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\ -\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\ -\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\ -\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\ -\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\ -\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\ -\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\ -\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\ -\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\ -\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\ -\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\ -\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\ -\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\ -\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\ -\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\ -\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\ -\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\ -\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\ -\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\ -\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\ -\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\ -\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\ -\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\ -\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\ -\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\ -\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\ -\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\ -\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\ 
-\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\ -\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\ -\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\ -\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\ -\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\ -\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\ -\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\ -\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\ -\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\ -\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\ -\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\ -\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\ -\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\ -\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\ -\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\ -\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\ -\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\ -\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\ -\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\ -\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\ -\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\ -\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\ -\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\ -\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\ -\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\ -\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\ -\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\ -\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\ -\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\ -\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\ -\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\ -\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\ -\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\ -\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\ -\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\ -\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\ -\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\ -\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\ -\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\ -\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\ 
-\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\ -\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\ -\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\ -\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\ -\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\ -\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\ -\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\ -\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\ -\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\ -\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\ -\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\ -\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\ -\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\ -\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\ -\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\ -\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\ -\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\ -\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\ -\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\ -\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\ -\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\ -\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\ -\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\ -\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\ 
-\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\ -\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\ -\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\ -\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\ -\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\ -\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\ -\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\ -\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\ -\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\ -\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\ -\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\ -\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\ -\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\ -\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\ -\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\ -\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\ -\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\ -\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\ -\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\ -\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\ -\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; - - return 0; -err: - bpf_object__destroy_skeleton(s); - return -1; -} - -#endif /* __ITERATORS_BPF_SKEL_H__ */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 93a55391791a..8251243022a2 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -143,7 +143,7 @@ static void reuseport_array_free(struct bpf_map *map) /* * Once reaching here, all sk->sk_user_data is not - * referenceing this "array". "array" can be freed now. + * referencing this "array". "array" can be freed now. 
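For readers who have not used libbpf skeletons: the header removed above is generated by bpftool's skeleton generator and wraps the embedded BPF object in the standard open/load/attach/destroy lifecycle. A minimal sketch of how a caller consumes this API, error handling trimmed, which roughly mirrors what the kernel/bpf/preload init path did with this heavyweight skeleton before the conversion:

	struct iterators_bpf *skel;

	skel = iterators_bpf__open_and_load();	/* open the embedded ELF, then load it */
	if (!skel)
		return -ENOMEM;
	if (iterators_bpf__attach(skel)) {	/* creates links.dump_bpf_map/_prog */
		iterators_bpf__destroy(skel);	/* tears down links/object, frees skeleton */
		return -EINVAL;
	}
	/* the two bpf_iter programs are now attached and usable */
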
*/ bpf_map_area_free(array); } @@ -152,16 +152,12 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - u64 array_size; if (!bpf_capable()) return ERR_PTR(-EPERM); - array_size = sizeof(*array); - array_size += (u64)attr->max_entries * sizeof(struct sock *); - /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size, numa_node); + array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node); if (!array) return ERR_PTR(-ENOMEM); diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 9e0c10c6892a..710ba9de12ce 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -104,7 +104,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) } rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, - VM_ALLOC | VM_USERMAP, PAGE_KERNEL); + VM_MAP | VM_USERMAP, PAGE_KERNEL); if (rb) { kmemleak_not_leak(pages); rb->pages = pages; @@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6e75bbee39f0..34725bfa1e97 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,10 +7,10 @@ #include <linux/kernel.h> #include <linux/stacktrace.h> #include <linux/perf_event.h> -#include <linux/irq_work.h> #include <linux/btf_ids.h> #include <linux/buildid.h> #include "percpu_freelist.h" +#include "mmap_unlock_work.h" #define STACK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ @@ -31,25 +31,6 @@ struct bpf_stack_map { struct stack_map_bucket *buckets[]; }; -/* irq_work to run up_read() for build_id lookup in nmi context */ -struct stack_map_irq_work { - struct irq_work irq_work; - struct mm_struct *mm; -}; - -static void do_up_read(struct irq_work *entry) -{ - struct stack_map_irq_work *work; - - if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) - return; - - work = container_of(entry, struct stack_map_irq_work, irq_work); - mmap_read_unlock_non_owner(work->mm); -} - -static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); - static inline bool stack_map_use_build_id(struct bpf_map *map) { return (map->map_flags & BPF_F_STACK_BUILD_ID); @@ -149,35 +130,14 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { int i; - struct vm_area_struct *vma; - bool irq_work_busy = false; - struct stack_map_irq_work *work = NULL; - - if (irqs_disabled()) { - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { - work = this_cpu_ptr(&up_read_work); - if (irq_work_is_busy(&work->irq_work)) { - /* cannot queue more up_read, fallback */ - irq_work_busy = true; - } - } else { - /* - * PREEMPT_RT does not allow to trylock mmap sem in - * interrupt disabled context. Force the fallback code. - */ - irq_work_busy = true; - } - } - - /* - * We cannot do up_read() when the irq is disabled, because of - * risk to deadlock with rq_lock. To do build_id lookup when the - * irqs are disabled, we need to run up_read() in irq_work. We use - * a percpu variable to do the irq_work. If the irq_work is - * already used by another lookup, we fall back to report ips. - * - * Same fallback is used for kernel stack (!user) on a stackmap - * with build_id. 
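The reuseport_array hunk above replaces the open-coded `sizeof(*array) + max_entries * sizeof(struct sock *)` with struct_size(). A conceptual model of what that buys, not the kernel's exact macro from <linux/overflow.h> (which is built on check_mul_overflow()/check_add_overflow()):

#include <stddef.h>
#include <stdint.h>

/* Conceptual model of struct_size(): size of a struct header plus a
 * flexible array, saturating to SIZE_MAX when the multiplication or
 * addition would wrap, so a huge max_entries becomes an allocation the
 * allocator refuses instead of a silently short buffer.
 */
static size_t flex_array_size(size_t hdr, size_t elem, size_t n)
{
	if (n && elem > (SIZE_MAX - hdr) / n)
		return SIZE_MAX;		/* would overflow: saturate */
	return hdr + n * elem;
}

/* The hunk above is then effectively:
 *	bpf_map_area_alloc(flex_array_size(sizeof(*array),
 *			   sizeof(struct sock *), attr->max_entries),
 *			   numa_node);
 */
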
+ struct mmap_unlock_irq_work *work = NULL; + bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); + struct vm_area_struct *vma, *prev_vma = NULL; + const char *prev_build_id; + + /* If the irq_work is in use, fall back to report ips. Same + * fallback is used for kernel stack (!user) on a stackmap with + * build_id. */ if (!user || !current || !current->mm || irq_work_busy || !mmap_read_trylock(current->mm)) { @@ -191,6 +151,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } for (i = 0; i < trace_nr; i++) { + if (range_in_vma(prev_vma, ips[i], ips[i])) { + vma = prev_vma; + memcpy(id_offs[i].build_id, prev_build_id, + BUILD_ID_SIZE_MAX); + goto build_id_valid; + } vma = find_vma(current->mm, ips[i]); if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ @@ -199,27 +165,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } +build_id_valid: id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] - vma->vm_start; id_offs[i].status = BPF_STACK_BUILD_ID_VALID; + prev_vma = vma; + prev_build_id = id_offs[i].build_id; } - - if (!work) { - mmap_read_unlock(current->mm); - } else { - work->mm = current->mm; - - /* The lock will be released once we're out of interrupt - * context. Tell lockdep that we've released it now so - * it doesn't complain that we forgot to release it. - */ - rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); - irq_work_queue(&work->irq_work); - } + bpf_mmap_unlock_mm(work, current->mm); } static struct perf_callchain_entry * -get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) +get_callchain_entry_for_task(struct task_struct *task, u32 max_depth) { #ifdef CONFIG_STACKTRACE struct perf_callchain_entry *entry; @@ -230,9 +187,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) if (!entry) return NULL; - entry->nr = init_nr + - stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr), - sysctl_perf_event_max_stack - init_nr, 0); + entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip, + max_depth, 0); /* stack_trace_save_tsk() works on unsigned long array, while * perf_callchain_entry uses u64 array. 
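The prev_vma/prev_build_id pair added in this stackmap hunk is a last-hit cache: consecutive stack addresses usually fall in the same mapping, so remembering the previous VMA and its parsed build ID skips both the find_vma() walk and the build_id_parse() for those entries. The same pattern in isolation, with lookup() and parse_id() as hypothetical stand-ins for the kernel helpers:

#include <string.h>

#define ID_SZ 20	/* stand-in for BUILD_ID_SIZE_MAX */

struct range { unsigned long start, end; };

/* hypothetical stand-ins for find_vma() and build_id_parse() */
extern struct range *lookup(unsigned long addr);
extern int parse_id(const struct range *r, char *id);

static void annotate(const unsigned long *addrs, int n, char out[][ID_SZ])
{
	const struct range *prev = NULL;
	const char *prev_id = NULL;
	int i;

	for (i = 0; i < n; i++) {
		struct range *r;

		if (prev && addrs[i] >= prev->start && addrs[i] < prev->end) {
			memcpy(out[i], prev_id, ID_SZ);	/* last-hit cache */
			continue;
		}
		r = lookup(addrs[i]);			/* slow path */
		if (!r || parse_id(r, out[i]))
			continue;			/* per-entry fallback */
		prev = r;
		prev_id = out[i];
	}
}
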
For 32-bit systems, it is @@ -244,7 +200,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) int i; /* copy data from the end to avoid using extra buffer */ - for (i = entry->nr - 1; i >= (int)init_nr; i--) + for (i = entry->nr - 1; i >= 0; i--) to[i] = (u64)(from[i]); } @@ -261,27 +217,19 @@ static long __bpf_get_stackid(struct bpf_map *map, { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct stack_map_bucket *bucket, *new_bucket, *old_bucket; - u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; u64 *ips; bool hash_matches; - /* get_perf_callchain() guarantees that trace->nr >= init_nr - * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth - */ - trace_nr = trace->nr - init_nr; - - if (trace_nr <= skip) + if (trace->nr <= skip) /* skipping more than usable stack trace */ return -EFAULT; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_len = trace_nr * sizeof(u64); - ips = trace->ip + skip + init_nr; + ips = trace->ip + skip; hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); id = hash & (smap->n_buckets - 1); bucket = READ_ONCE(smap->buckets[id]); @@ -338,8 +286,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u64, flags) { u32 max_depth = map->value_size / stack_map_data_size(map); - /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ - u32 init_nr = sysctl_perf_event_max_stack - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; struct perf_callchain_entry *trace; bool kernel = !user; @@ -348,8 +295,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) return -EINVAL; - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); + max_depth += skip; + if (max_depth > sysctl_perf_event_max_stack) + max_depth = sysctl_perf_event_max_stack; + + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, + false, false); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -440,7 +391,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { - u32 init_nr, trace_nr, copy_len, elem_size, num_elem; + u32 trace_nr, copy_len, elem_size, num_elem, max_depth; bool user_build_id = flags & BPF_F_USER_BUILD_ID; u32 skip = flags & BPF_F_SKIP_FIELD_MASK; bool user = flags & BPF_F_USER_STACK; @@ -465,30 +416,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, goto err_fault; num_elem = size / elem_size; - if (sysctl_perf_event_max_stack < num_elem) - init_nr = 0; - else - init_nr = sysctl_perf_event_max_stack - num_elem; + max_depth = num_elem + skip; + if (sysctl_perf_event_max_stack < max_depth) + max_depth = sysctl_perf_event_max_stack; if (trace_in) trace = trace_in; else if (kernel && task) - trace = get_callchain_entry_for_task(task, init_nr); + trace = get_callchain_entry_for_task(task, max_depth); else - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, + trace = get_perf_callchain(regs, 0, kernel, user, max_depth, false, false); if (unlikely(!trace)) goto err_fault; - trace_nr 
= trace->nr - init_nr; - if (trace_nr < skip) + if (trace->nr < skip) goto err_fault; - trace_nr -= skip; + trace_nr = trace->nr - skip; trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; copy_len = trace_nr * elem_size; - ips = trace->ip + skip + init_nr; + + ips = trace->ip + skip; if (user && user_build_id) stack_map_get_build_id_offset(buf, ips, trace_nr, user); else @@ -525,13 +474,14 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, u32, size, u64, flags) { struct pt_regs *regs; - long res; + long res = -EINVAL; if (!try_get_task_stack(task)) return -EFAULT; regs = task_pt_regs(task); - res = __bpf_get_stack(regs, task, NULL, buf, size, flags); + if (regs) + res = __bpf_get_stack(regs, task, NULL, buf, size, flags); put_task_stack(task); return res; @@ -542,7 +492,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &btf_task_struct_ids[0], + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], .arg2_type = ARG_PTR_TO_UNINIT_MEM, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, @@ -719,16 +669,3 @@ const struct bpf_map_ops stack_trace_map_ops = { .map_btf_name = "bpf_stack_map", .map_btf_id = &stack_trace_map_btf_id, }; - -static int __init stack_map_init(void) -{ - int cpu; - struct stack_map_irq_work *work; - - for_each_possible_cpu(cpu) { - work = per_cpu_ptr(&up_read_work, cpu); - init_irq_work(&work->irq_work, do_up_read); - } - return 0; -} -subsys_initcall(stack_map_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 50f96ea4452a..cdaa1152436a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2,6 +2,7 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com */ #include <linux/bpf.h> +#include <linux/bpf-cgroup.h> #include <linux/bpf_trace.h> #include <linux/bpf_lirc.h> #include <linux/bpf_verifier.h> @@ -31,6 +32,7 @@ #include <linux/bpf-netns.h> #include <linux/rcupdate_trace.h> #include <linux/memcontrol.h> +#include <linux/trace_events.h> #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -132,6 +134,21 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } +static void bpf_map_write_active_inc(struct bpf_map *map) +{ + atomic64_inc(&map->writecnt); +} + +static void bpf_map_write_active_dec(struct bpf_map *map) +{ + atomic64_dec(&map->writecnt); +} + +bool bpf_map_write_active(const struct bpf_map *map) +{ + return atomic64_read(&map->writecnt) != 0; +} + static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || @@ -540,16 +557,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { - const struct bpf_map *map = filp->private_data; - const struct bpf_array *array; + struct bpf_map *map = filp->private_data; u32 type = 0, jited = 0; - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { - array = container_of(map, struct bpf_array, map); - spin_lock(&array->aux->owner.lock); - type = array->aux->owner.type; - jited = array->aux->owner.jited; - spin_unlock(&array->aux->owner.lock); + if (map_type_contains_progs(map)) { + spin_lock(&map->owner.lock); + type = map->owner.type; + jited = map->owner.jited; + spin_unlock(&map->owner.lock); } seq_printf(m, @@ -601,11 +616,8 @@ static void bpf_map_mmap_open(struct vm_area_struct *vma) { struct bpf_map 
*map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) { - mutex_lock(&map->freeze_mutex); - map->writecnt++; - mutex_unlock(&map->freeze_mutex); - } + if (vma->vm_flags & VM_MAYWRITE) + bpf_map_write_active_inc(map); } /* called for all unmapped memory region (including initial) */ @@ -613,11 +625,8 @@ static void bpf_map_mmap_close(struct vm_area_struct *vma) { struct bpf_map *map = vma->vm_file->private_data; - if (vma->vm_flags & VM_MAYWRITE) { - mutex_lock(&map->freeze_mutex); - map->writecnt--; - mutex_unlock(&map->freeze_mutex); - } + if (vma->vm_flags & VM_MAYWRITE) + bpf_map_write_active_dec(map); } static const struct vm_operations_struct bpf_map_default_vmops = { @@ -668,7 +677,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) goto out; if (vma->vm_flags & VM_MAYWRITE) - map->writecnt++; + bpf_map_write_active_inc(map); out: mutex_unlock(&map->freeze_mutex); return err; @@ -864,6 +873,7 @@ static int map_create(union bpf_attr *attr) atomic64_set(&map->refcnt, 1); atomic64_set(&map->usercnt, 1); mutex_init(&map->freeze_mutex); + spin_lock_init(&map->owner.lock); map->spin_lock_off = -EINVAL; map->timer_off = -EINVAL; @@ -976,6 +986,7 @@ struct bpf_map *bpf_map_get(u32 ufd) return map; } +EXPORT_SYMBOL(bpf_map_get); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -1139,6 +1150,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; @@ -1174,6 +1186,7 @@ free_value: free_key: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1196,6 +1209,7 @@ static int map_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; @@ -1226,6 +1240,7 @@ static int map_delete_elem(union bpf_attr *attr) out: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1338,14 +1353,16 @@ int generic_map_delete_batch(struct bpf_map *map, err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); bpf_enable_instrumentation(); - maybe_wait_bpf_programs(map); if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) err = -EFAULT; kvfree(key); + + maybe_wait_bpf_programs(map); return err; } @@ -1398,6 +1415,7 @@ int generic_map_update_batch(struct bpf_map *map, if (err) break; + cond_resched(); } if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp))) @@ -1495,6 +1513,7 @@ int generic_map_lookup_batch(struct bpf_map *map, swap(prev_key, key); retry = MAP_LOOKUP_RETRIES; cp++; + cond_resched(); } if (err == -EFAULT) @@ -1533,6 +1552,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); + bpf_map_write_active_inc(map); if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; @@ -1597,6 +1617,7 @@ free_value: free_key: kvfree(key); err_put: + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -1624,8 +1645,7 @@ static int map_freeze(const union bpf_attr *attr) } mutex_lock(&map->freeze_mutex); - - if (map->writecnt) { + if (bpf_map_write_active(map)) { err = -EBUSY; goto err_put; } @@ -2184,7 +2204,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) } /* last field in 'union 
bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD fd_array +#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) { @@ -2202,7 +2222,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) BPF_F_ANY_ALIGNMENT | BPF_F_TEST_STATE_FREQ | BPF_F_SLEEPABLE | - BPF_F_TEST_RND_HI32)) + BPF_F_TEST_RND_HI32 | + BPF_F_XDP_HAS_FRAGS)) return -EINVAL; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && @@ -2288,6 +2309,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr) prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; + prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS; err = security_bpf_prog_alloc(prog->aux); if (err) @@ -2473,6 +2495,7 @@ void bpf_link_put(struct bpf_link *link) bpf_link_free(link); } } +EXPORT_SYMBOL(bpf_link_put); static int bpf_link_release(struct inode *inode, struct file *filp) { @@ -2544,7 +2567,7 @@ static int bpf_link_alloc_id(struct bpf_link *link) * pre-allocated resources are to be freed with bpf_cleanup() call. All the * transient state is passed around in struct bpf_link_primer. * This is preferred way to create and initialize bpf_link, especially when - * there are complicated and expensive operations inbetween creating bpf_link + * there are complicated and expensive operations in between creating bpf_link * itself and attaching it to BPF hook. By using bpf_link_prime() and * bpf_link_settle() kernel code using bpf_link doesn't have to perform * expensive (and potentially failing) roll back operations in a rare case @@ -2615,6 +2638,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) return link; } +EXPORT_SYMBOL(bpf_link_get_from_fd); struct bpf_tracing_link { struct bpf_link link; @@ -2999,6 +3023,11 @@ out_put_file: fput(perf_file); return err; } +#else +static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_PERF_EVENTS */ #define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd @@ -3303,12 +3332,17 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_FLOW_DISSECTOR: case BPF_SK_LOOKUP: return netns_bpf_prog_query(attr, uattr); + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + case BPF_SK_MSG_VERDICT: + case BPF_SK_SKB_VERDICT: + return sock_map_bpf_prog_query(attr, uattr); default: return -EINVAL; } } -#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu +#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size static int bpf_prog_test_run(const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -4171,6 +4205,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr, union bpf_attr __user *uattr, int cmd) { + bool has_read = cmd == BPF_MAP_LOOKUP_BATCH || + cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH; + bool has_write = cmd != BPF_MAP_LOOKUP_BATCH; struct bpf_map *map; int err, ufd; struct fd f; @@ -4183,16 +4220,13 @@ static int bpf_map_do_batch(const union bpf_attr *attr, map = __bpf_map_get(f); if (IS_ERR(map)) return PTR_ERR(map); - - if ((cmd == BPF_MAP_LOOKUP_BATCH || - cmd == BPF_MAP_LOOKUP_AND_DELETE_BATCH) && - !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { + if (has_write) + bpf_map_write_active_inc(map); + if (has_read && !(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { err = -EPERM; goto err_put; } - - if (cmd != BPF_MAP_LOOKUP_BATCH && - !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { + if (has_write && 
!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { err = -EPERM; goto err_put; } @@ -4205,8 +4239,9 @@ static int bpf_map_do_batch(const union bpf_attr *attr, BPF_DO_BATCH(map->ops->map_update_batch); else BPF_DO_BATCH(map->ops->map_delete_batch); - err_put: + if (has_write) + bpf_map_write_active_dec(map); fdput(f); return err; } @@ -4226,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr, return -EINVAL; } -#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len +#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies static int link_create(union bpf_attr *attr, bpfptr_t uattr) { enum bpf_prog_type ptype; @@ -4250,7 +4285,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = tracing_bpf_link_attach(attr, uattr, prog); goto out; case BPF_PROG_TYPE_PERF_EVENT: - case BPF_PROG_TYPE_KPROBE: case BPF_PROG_TYPE_TRACEPOINT: if (attr->link_create.attach_type != BPF_PERF_EVENT) { ret = -EINVAL; @@ -4258,6 +4292,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) } ptype = prog->type; break; + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type != BPF_PERF_EVENT && + attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) { + ret = -EINVAL; + goto out; + } + ptype = prog->type; + break; default: ptype = attach_type_to_prog_type(attr->link_create.attach_type); if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) { @@ -4289,13 +4331,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr) ret = bpf_xdp_link_attach(attr, prog); break; #endif -#ifdef CONFIG_PERF_EVENTS case BPF_PROG_TYPE_PERF_EVENT: case BPF_PROG_TYPE_TRACEPOINT: - case BPF_PROG_TYPE_KPROBE: ret = bpf_perf_link_attach(attr, prog); break; -#endif + case BPF_PROG_TYPE_KPROBE: + if (attr->link_create.attach_type == BPF_PERF_EVENT) + ret = bpf_perf_link_attach(attr, prog); + else + ret = bpf_kprobe_multi_link_attach(attr, prog); + break; default: ret = -EINVAL; } @@ -4734,30 +4779,59 @@ static bool syscall_prog_is_valid_access(int off, int size, return true; } -BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size) +BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size) { + struct bpf_prog * __maybe_unused prog; + switch (cmd) { case BPF_MAP_CREATE: case BPF_MAP_UPDATE_ELEM: case BPF_MAP_FREEZE: case BPF_PROG_LOAD: case BPF_BTF_LOAD: + case BPF_LINK_CREATE: + case BPF_RAW_TRACEPOINT_OPEN: break; - /* case BPF_PROG_TEST_RUN: - * is not part of this list to prevent recursive test_run - */ +#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */ + case BPF_PROG_TEST_RUN: + if (attr->test.data_in || attr->test.data_out || + attr->test.ctx_out || attr->test.duration || + attr->test.repeat || attr->test.flags) + return -EINVAL; + + prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (attr->test.ctx_size_in < prog->aux->max_ctx_offset || + attr->test.ctx_size_in > U16_MAX) { + bpf_prog_put(prog); + return -EINVAL; + } + + if (!__bpf_prog_enter_sleepable(prog)) { + /* recursion detected */ + bpf_prog_put(prog); + return -EBUSY; + } + attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in); + __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */); + bpf_prog_put(prog); + return 0; +#endif default: return -EINVAL; } return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size); } +EXPORT_SYMBOL(bpf_sys_bpf); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, 
.gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE, }; @@ -4804,7 +4878,7 @@ const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_MEM, - .arg2_type = ARG_CONST_SIZE, + .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_LONG, }; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index b48750bfba5a..d94696198ef8 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -8,6 +8,7 @@ #include <linux/fdtable.h> #include <linux/filter.h> #include <linux/btf_ids.h> +#include "mmap_unlock_work.h" struct bpf_iter_seq_task_common { struct pid_namespace *ns; @@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = { .show = task_vma_seq_show, }; -BTF_ID_LIST(btf_task_file_ids) -BTF_ID(struct, file) -BTF_ID(struct, vm_area_struct) - static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, .init_seq_private = init_seq_pidns, @@ -586,23 +583,88 @@ static struct bpf_iter_reg task_vma_reg_info = { .seq_info = &task_vma_seq_info, }; +BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, + bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags) +{ + struct mmap_unlock_irq_work *work = NULL; + struct vm_area_struct *vma; + bool irq_work_busy = false; + struct mm_struct *mm; + int ret = -ENOENT; + + if (flags) + return -EINVAL; + + if (!task) + return -ENOENT; + + mm = task->mm; + if (!mm) + return -ENOENT; + + irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); + + if (irq_work_busy || !mmap_read_trylock(mm)) + return -EBUSY; + + vma = find_vma(mm, start); + + if (vma && vma->vm_start <= start && vma->vm_end > start) { + callback_fn((u64)(long)task, (u64)(long)vma, + (u64)(long)callback_ctx, 0, 0); + ret = 0; + } + bpf_mmap_unlock_mm(work, mm); + return ret; +} + +const struct bpf_func_proto bpf_find_vma_proto = { + .func = bpf_find_vma, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_FUNC, + .arg4_type = ARG_PTR_TO_STACK_OR_NULL, + .arg5_type = ARG_ANYTHING, +}; + +DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); + +static void do_mmap_read_unlock(struct irq_work *entry) +{ + struct mmap_unlock_irq_work *work; + + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) + return; + + work = container_of(entry, struct mmap_unlock_irq_work, irq_work); + mmap_read_unlock_non_owner(work->mm); +} + static int __init task_iter_init(void) { - int ret; + struct mmap_unlock_irq_work *work; + int ret, cpu; + + for_each_possible_cpu(cpu) { + work = per_cpu_ptr(&mmap_unlock_work, cpu); + init_irq_work(&work->irq_work, do_mmap_read_unlock); + } - task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; + task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; ret = bpf_iter_reg_target(&task_reg_info); if (ret) return ret; - task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0]; - task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0]; + task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; + task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE]; ret = bpf_iter_reg_target(&task_file_reg_info); if (ret) return ret; - task_vma_reg_info.ctx_arg_info[0].btf_id = 
btf_task_struct_ids[0]; - task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA]; return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index e98de5e73ba5..ada97751ae1b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -27,6 +27,14 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; /* serializes access to trampoline_table */ static DEFINE_MUTEX(trampoline_mutex); +bool bpf_prog_has_trampoline(const struct bpf_prog *prog) +{ + enum bpf_attach_type eatype = prog->expected_attach_type; + + return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN; +} + void *bpf_jit_alloc_exec_page(void) { void *image; @@ -37,7 +45,7 @@ void *bpf_jit_alloc_exec_page(void) set_vm_flush_reset_perms(image); /* Keep image as writeable. The alternative is to keep flipping ro/rw - * everytime new program is attached or detached. + * every time new program is attached or detached. */ set_memory_x((long)image, 1); return image; @@ -109,18 +117,6 @@ static void bpf_trampoline_module_put(struct bpf_trampoline *tr) tr->mod = NULL; } -static int is_ftrace_location(void *ip) -{ - long addr; - - addr = ftrace_location((long)ip); - if (!addr) - return 0; - if (WARN_ON_ONCE(addr != (long)ip)) - return -EFAULT; - return 1; -} - static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) { void *ip = tr->func.addr; @@ -152,12 +148,12 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad static int register_fentry(struct bpf_trampoline *tr, void *new_addr) { void *ip = tr->func.addr; + unsigned long faddr; int ret; - ret = is_ftrace_location(ip); - if (ret < 0) - return ret; - tr->func.ftrace_managed = ret; + faddr = ftrace_location((unsigned long)ip); + if (faddr) + tr->func.ftrace_managed = true; if (bpf_trampoline_module_get(tr)) return -ENOENT; @@ -205,7 +201,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work) im = container_of(work, struct bpf_tramp_image, work); bpf_image_ksym_del(&im->ksym); bpf_jit_free_exec(im->image); - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); percpu_ref_exit(&im->pcref); kfree_rcu(im, rcu); } @@ -302,7 +298,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) if (!im) goto out; - err = bpf_jit_charge_modmem(1); + err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) goto out_free_im; @@ -324,7 +320,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) out_free_image: bpf_jit_free_exec(im->image); out_uncharge: - bpf_jit_uncharge_modmem(1); + bpf_jit_uncharge_modmem(PAGE_SIZE); out_free_im: kfree(im); out: @@ -542,11 +538,12 @@ static __always_inline u64 notrace bpf_prog_start_time(void) static void notrace inc_misses_counter(struct bpf_prog *prog) { struct bpf_prog_stats *stats; + unsigned int flags; stats = this_cpu_ptr(prog->stats); - u64_stats_update_begin(&stats->syncp); + flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->misses); - u64_stats_update_end(&stats->syncp); + u64_stats_update_end_irqrestore(&stats->syncp, flags); } /* The logic is similar to bpf_prog_run(), but with an explicit diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 890b3ec375a3..d175b70067b3 100644 --- 
a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4,6 +4,7 @@ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io */ #include <uapi/linux/btf.h> +#include <linux/bpf-cgroup.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/slab.h> @@ -293,13 +294,15 @@ void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, "verifier log line truncated - local buffer too short\n"); - n = min(log->len_total - log->len_used - 1, n); - log->kbuf[n] = '\0'; - if (log->level == BPF_LOG_KERNEL) { - pr_err("BPF:%s\n", log->kbuf); + bool newline = n > 0 && log->kbuf[n - 1] == '\n'; + + pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); return; } + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) log->len_used += n; else @@ -439,18 +442,6 @@ static bool reg_type_not_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON; } -static bool reg_type_may_be_null(enum bpf_reg_type type) -{ - return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL || - type == PTR_TO_RDONLY_BUF_OR_NULL || - type == PTR_TO_RDWR_BUF_OR_NULL; -} - static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && @@ -459,12 +450,15 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) { - return type == PTR_TO_SOCKET || - type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_TCP_SOCK || - type == PTR_TO_TCP_SOCK_OR_NULL || - type == PTR_TO_MEM || - type == PTR_TO_MEM_OR_NULL; + return base_type(type) == PTR_TO_SOCKET || + base_type(type) == PTR_TO_TCP_SOCK || + base_type(type) == PTR_TO_MEM || + base_type(type) == PTR_TO_BTF_ID; +} + +static bool type_is_rdonly_mem(u32 type) +{ + return type & MEM_RDONLY; } static bool arg_type_may_be_refcounted(enum bpf_arg_type type) @@ -472,14 +466,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type) return type == ARG_PTR_TO_SOCK_COMMON; } -static bool arg_type_may_be_null(enum bpf_arg_type type) +static bool type_may_be_null(u32 type) { - return type == ARG_PTR_TO_MAP_VALUE_OR_NULL || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_CTX_OR_NULL || - type == ARG_PTR_TO_SOCKET_OR_NULL || - type == ARG_PTR_TO_ALLOC_MEM_OR_NULL || - type == ARG_PTR_TO_STACK_OR_NULL; + return type & PTR_MAYBE_NULL; } /* Determine whether the function releases some resources allocated by another @@ -539,39 +528,58 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } -/* string representation of 'enum bpf_reg_type' */ -static const char * const reg_type_str[] = { - [NOT_INIT] = "?", - [SCALAR_VALUE] = "inv", - [PTR_TO_CTX] = "ctx", - [CONST_PTR_TO_MAP] = "map_ptr", - [PTR_TO_MAP_VALUE] = "map_value", - [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", - [PTR_TO_STACK] = "fp", - [PTR_TO_PACKET] = "pkt", - [PTR_TO_PACKET_META] = "pkt_meta", - [PTR_TO_PACKET_END] = "pkt_end", - [PTR_TO_FLOW_KEYS] = "flow_keys", - [PTR_TO_SOCKET] = "sock", - [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", - [PTR_TO_SOCK_COMMON] = "sock_common", - [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", - [PTR_TO_TCP_SOCK] = "tcp_sock", - [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", - [PTR_TO_TP_BUFFER] = "tp_buffer", - 
[PTR_TO_XDP_SOCK] = "xdp_sock", - [PTR_TO_BTF_ID] = "ptr_", - [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", - [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_", - [PTR_TO_MEM] = "mem", - [PTR_TO_MEM_OR_NULL] = "mem_or_null", - [PTR_TO_RDONLY_BUF] = "rdonly_buf", - [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", - [PTR_TO_RDWR_BUF] = "rdwr_buf", - [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", - [PTR_TO_FUNC] = "func", - [PTR_TO_MAP_KEY] = "map_key", -}; +/* string representation of 'enum bpf_reg_type' + * + * Note that reg_type_str() can not appear more than once in a single verbose() + * statement. + */ +static const char *reg_type_str(struct bpf_verifier_env *env, + enum bpf_reg_type type) +{ + char postfix[16] = {0}, prefix[32] = {0}; + static const char * const str[] = { + [NOT_INIT] = "?", + [SCALAR_VALUE] = "scalar", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_STACK] = "fp", + [PTR_TO_PACKET] = "pkt", + [PTR_TO_PACKET_META] = "pkt_meta", + [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", + [PTR_TO_SOCKET] = "sock", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TP_BUFFER] = "tp_buffer", + [PTR_TO_XDP_SOCK] = "xdp_sock", + [PTR_TO_BTF_ID] = "ptr_", + [PTR_TO_MEM] = "mem", + [PTR_TO_BUF] = "buf", + [PTR_TO_FUNC] = "func", + [PTR_TO_MAP_KEY] = "map_key", + }; + + if (type & PTR_MAYBE_NULL) { + if (base_type(type) == PTR_TO_BTF_ID) + strncpy(postfix, "or_null_", 16); + else + strncpy(postfix, "_or_null", 16); + } + + if (type & MEM_RDONLY) + strncpy(prefix, "rdonly_", 32); + if (type & MEM_ALLOC) + strncpy(prefix, "alloc_", 32); + if (type & MEM_USER) + strncpy(prefix, "user_", 32); + if (type & MEM_PERCPU) + strncpy(prefix, "percpu_", 32); + + snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", + prefix, str[base_type(type)], postfix); + return env->type_str_buf; +} static char slot_type_char[] = { [STACK_INVALID] = '?', @@ -606,6 +614,44 @@ static const char *kernel_type_name(const struct btf* btf, u32 id) return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } +static void mark_reg_scratched(struct bpf_verifier_env *env, u32 regno) +{ + env->scratched_regs |= 1U << regno; +} + +static void mark_stack_slot_scratched(struct bpf_verifier_env *env, u32 spi) +{ + env->scratched_stack_slots |= 1ULL << spi; +} + +static bool reg_scratched(const struct bpf_verifier_env *env, u32 regno) +{ + return (env->scratched_regs >> regno) & 1; +} + +static bool stack_slot_scratched(const struct bpf_verifier_env *env, u64 regno) +{ + return (env->scratched_stack_slots >> regno) & 1; +} + +static bool verifier_state_scratched(const struct bpf_verifier_env *env) +{ + return env->scratched_regs || env->scratched_stack_slots; +} + +static void mark_verifier_state_clean(struct bpf_verifier_env *env) +{ + env->scratched_regs = 0U; + env->scratched_stack_slots = 0ULL; +} + +/* Used for printing the entire verifier state. */ +static void mark_verifier_state_scratched(struct bpf_verifier_env *env) +{ + env->scratched_regs = ~0U; + env->scratched_stack_slots = ~0ULL; +} + /* The reg state of a pointer or a bounded scalar was saved when * it was spilled to the stack. 
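The rewritten reg_type_str() above is the visible side of a wider rework in this diff: instead of one enum value per combination (PTR_TO_MEM_OR_NULL, PTR_TO_RDONLY_BUF_OR_NULL, and so on), a register type is now a base type plus orthogonal flag bits, and the printed name is composed from them. A small userspace model of the encoding; the flag values and bit layout here are illustrative, not the kernel's actual definitions:

#include <stdio.h>

enum { PTR_TO_MEM = 1, PTR_TO_BUF = 2 };	/* base types (illustrative) */
#define PTR_MAYBE_NULL	(1u << 8)		/* orthogonal modifier flags */
#define MEM_RDONLY	(1u << 9)
#define BASE_TYPE_MASK	0xffu
#define base_type(t)	((t) & BASE_TYPE_MASK)

static const char *type_str(unsigned int t, char *buf, size_t len)
{
	static const char * const base[] = {
		[PTR_TO_MEM] = "mem",
		[PTR_TO_BUF] = "buf",
	};

	/* compose the name from prefix flags, base type and null postfix */
	snprintf(buf, len, "%s%s%s",
		 (t & MEM_RDONLY) ? "rdonly_" : "",
		 base[base_type(t)],
		 (t & PTR_MAYBE_NULL) ? "_or_null" : "");
	return buf;
}

int main(void)
{
	char buf[32];

	/* one flag combination that previously needed its own enumerator */
	printf("%s\n", type_str(PTR_TO_MEM | MEM_RDONLY | PTR_MAYBE_NULL,
				buf, sizeof(buf)));	/* rdonly_mem_or_null */
	return 0;
}
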
*/ @@ -621,7 +667,8 @@ static void scrub_spilled_slot(u8 *stype) } static void print_verifier_state(struct bpf_verifier_env *env, - const struct bpf_func_state *state) + const struct bpf_func_state *state, + bool print_all) { const struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -634,78 +681,83 @@ static void print_verifier_state(struct bpf_verifier_env *env, t = reg->type; if (t == NOT_INIT) continue; + if (!print_all && !reg_scratched(env, i)) + continue; verbose(env, " R%d", i); print_liveness(env, reg->live); - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "="); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ + verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); verbose(env, "%lld", reg->var_off.value + reg->off); } else { - if (t == PTR_TO_BTF_ID || - t == PTR_TO_BTF_ID_OR_NULL || - t == PTR_TO_PERCPU_BTF_ID) + const char *sep = ""; + + verbose(env, "%s", reg_type_str(env, t)); + if (base_type(t) == PTR_TO_BTF_ID) verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); - verbose(env, "(id=%d", reg->id); - if (reg_type_may_be_refcounted_or_null(t)) - verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); + verbose(env, "("); +/* + * _a stands for append, was shortened to avoid multiline statements below. + * This macro is used to output a comma separated list of attributes. + */ +#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; }) + + if (reg->id) + verbose_a("id=%d", reg->id); + if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id) + verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (t != SCALAR_VALUE) - verbose(env, ",off=%d", reg->off); + verbose_a("off=%d", reg->off); if (type_is_pkt_pointer(t)) - verbose(env, ",r=%d", reg->range); - else if (t == CONST_PTR_TO_MAP || - t == PTR_TO_MAP_KEY || - t == PTR_TO_MAP_VALUE || - t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(env, ",ks=%d,vs=%d", - reg->map_ptr->key_size, - reg->map_ptr->value_size); + verbose_a("r=%d", reg->range); + else if (base_type(t) == CONST_PTR_TO_MAP || + base_type(t) == PTR_TO_MAP_KEY || + base_type(t) == PTR_TO_MAP_VALUE) + verbose_a("ks=%d,vs=%d", + reg->map_ptr->key_size, + reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { /* Typically an immediate SCALAR_VALUE, but * could be a pointer whose offset is too big * for reg->off */ - verbose(env, ",imm=%llx", reg->var_off.value); + verbose_a("imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(env, ",smin_value=%lld", - (long long)reg->smin_value); + verbose_a("smin=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(env, ",smax_value=%lld", - (long long)reg->smax_value); + verbose_a("smax=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(env, ",umin_value=%llu", - (unsigned long long)reg->umin_value); + verbose_a("umin=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(env, ",umax_value=%llu", - (unsigned long long)reg->umax_value); + verbose_a("umax=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, ",var_off=%s", tn_buf); + verbose_a("var_off=%s", tn_buf); } if (reg->s32_min_value != reg->smin_value && reg->s32_min_value != S32_MIN) - verbose(env, 
",s32_min_value=%d", - (int)(reg->s32_min_value)); + verbose_a("s32_min=%d", (int)(reg->s32_min_value)); if (reg->s32_max_value != reg->smax_value && reg->s32_max_value != S32_MAX) - verbose(env, ",s32_max_value=%d", - (int)(reg->s32_max_value)); + verbose_a("s32_max=%d", (int)(reg->s32_max_value)); if (reg->u32_min_value != reg->umin_value && reg->u32_min_value != U32_MIN) - verbose(env, ",u32_min_value=%d", - (int)(reg->u32_min_value)); + verbose_a("u32_min=%d", (int)(reg->u32_min_value)); if (reg->u32_max_value != reg->umax_value && reg->u32_max_value != U32_MAX) - verbose(env, ",u32_max_value=%d", - (int)(reg->u32_max_value)); + verbose_a("u32_max=%d", (int)(reg->u32_max_value)); } +#undef verbose_a + verbose(env, ")"); } } @@ -723,12 +775,14 @@ static void print_verifier_state(struct bpf_verifier_env *env, types_buf[BPF_REG_SIZE] = 0; if (!valid) continue; + if (!print_all && !stack_slot_scratched(env, i)) + continue; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, state->stack[i].spilled_ptr.live); if (is_spilled_reg(&state->stack[i])) { reg = &state->stack[i].spilled_ptr; t = reg->type; - verbose(env, "=%s", reg_type_str[t]); + verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t)); if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) @@ -748,6 +802,26 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (state->in_async_callback_fn) verbose(env, " async_cb"); verbose(env, "\n"); + mark_verifier_state_clean(env); +} + +static inline u32 vlog_alignment(u32 pos) +{ + return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), + BPF_LOG_MIN_ALIGNMENT) - pos - 1; +} + +static void print_insn_state(struct bpf_verifier_env *env, + const struct bpf_func_state *state) +{ + if (env->prev_log_len && env->prev_log_len == env->log.len_used) { + /* remove new line character */ + bpf_vlog_reset(&env->log, env->prev_log_len - 1); + verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_len), ' '); + } else { + verbose(env, "%d:", env->insn_idx); + } + print_verifier_state(env, state, false); } /* copy array src of length n * size bytes to dst. dst is reallocated if it's too @@ -1141,8 +1215,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) { - switch (reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: { + if (base_type(reg->type) == PTR_TO_MAP_VALUE) { const struct bpf_map *map = reg->map_ptr; if (map->inner_map_meta) { @@ -1151,7 +1224,8 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. 
*/ - reg->map_uid = reg->id; + if (map_value_has_timer(map->inner_map_meta)) + reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || @@ -1160,32 +1234,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) } else { reg->type = PTR_TO_MAP_VALUE; } - break; - } - case PTR_TO_SOCKET_OR_NULL: - reg->type = PTR_TO_SOCKET; - break; - case PTR_TO_SOCK_COMMON_OR_NULL: - reg->type = PTR_TO_SOCK_COMMON; - break; - case PTR_TO_TCP_SOCK_OR_NULL: - reg->type = PTR_TO_TCP_SOCK; - break; - case PTR_TO_BTF_ID_OR_NULL: - reg->type = PTR_TO_BTF_ID; - break; - case PTR_TO_MEM_OR_NULL: - reg->type = PTR_TO_MEM; - break; - case PTR_TO_RDONLY_BUF_OR_NULL: - reg->type = PTR_TO_RDONLY_BUF; - break; - case PTR_TO_RDWR_BUF_OR_NULL: - reg->type = PTR_TO_RDWR_BUF; - break; - default: - WARN_ONCE(1, "unknown nullable register type"); + return; } + + reg->type &= ~PTR_MAYBE_NULL; } static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) @@ -1365,22 +1417,28 @@ static void __reg_bound_offset(struct bpf_reg_state *reg) reg->var_off = tnum_or(tnum_clear_subreg(var64_off), var32_off); } +static bool __reg32_bound_s64(s32 a) +{ + return a >= 0 && a <= S32_MAX; +} + static void __reg_assign_32_into_64(struct bpf_reg_state *reg) { reg->umin_value = reg->u32_min_value; reg->umax_value = reg->u32_max_value; - /* Attempt to pull 32-bit signed bounds into 64-bit bounds - * but must be positive otherwise set to worse case bounds - * and refine later from tnum. + + /* Attempt to pull 32-bit signed bounds into 64-bit bounds but must + * be positive otherwise set to worse case bounds and refine later + * from tnum. */ - if (reg->s32_min_value >= 0 && reg->s32_max_value >= 0) - reg->smax_value = reg->s32_max_value; - else - reg->smax_value = U32_MAX; - if (reg->s32_min_value >= 0) + if (__reg32_bound_s64(reg->s32_min_value) && + __reg32_bound_s64(reg->s32_max_value)) { reg->smin_value = reg->s32_min_value; - else + reg->smax_value = reg->s32_max_value; + } else { reg->smin_value = 0; + reg->smax_value = U32_MAX; + } } static void __reg_combine_32_into_64(struct bpf_reg_state *reg) @@ -1496,14 +1554,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, enum bpf_reg_type reg_type, - struct btf *btf, u32 btf_id) + struct btf *btf, u32 btf_id, + enum bpf_type_flag flag) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); return; } mark_reg_known_zero(env, regs, regno); - regs[regno].type = PTR_TO_BTF_ID; + regs[regno].type = PTR_TO_BTF_ID | flag; regs[regno].btf = btf; regs[regno].btf_id = btf_id; } @@ -1537,6 +1596,7 @@ static void init_func_state(struct bpf_verifier_env *env, state->frameno = frameno; state->subprogno = subprogno; init_reg_state(env, state); + mark_verifier_state_scratched(env); } /* Similar to push_stack(), but for async callbacks */ @@ -1692,7 +1752,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset) } static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, - s16 offset, struct module **btf_modp) + s16 offset) { struct bpf_kfunc_btf kf_btf = { .offset = offset }; struct bpf_kfunc_btf_tab *tab; @@ -1746,8 +1806,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); } - if (btf_modp) - *btf_modp = b->module; return b->btf; } @@ -1764,8 
+1822,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) } static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, - u32 func_id, s16 offset, - struct module **btf_modp) + u32 func_id, s16 offset) { if (offset) { if (offset < 0) { @@ -1776,7 +1833,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env, return ERR_PTR(-EINVAL); } - return __find_kfunc_desc_btf(env, offset, btf_modp); + return __find_kfunc_desc_btf(env, offset); } return btf_vmlinux ?: ERR_PTR(-ENOENT); } @@ -1790,6 +1847,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) struct bpf_kfunc_desc *desc; const char *func_name; struct btf *desc_btf; + unsigned long call_imm; unsigned long addr; int err; @@ -1839,7 +1897,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) prog_aux->kfunc_btf_tab = btf_tab; } - desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL); + desc_btf = find_kfunc_desc_btf(env, func_id, offset); if (IS_ERR(desc_btf)) { verbose(env, "failed to find BTF for kernel function\n"); return PTR_ERR(desc_btf); @@ -1874,9 +1932,17 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset) return -EINVAL; } + call_imm = BPF_CALL_IMM(addr); + /* Check whether or not the relative offset overflows desc->imm */ + if ((unsigned long)(s32)call_imm != call_imm) { + verbose(env, "address of kernel function %s is out of range\n", + func_name); + return -EINVAL; + } + desc = &tab->descs[tab->nr_descs++]; desc->func_id = func_id; - desc->imm = BPF_CALL_IMM(addr); + desc->imm = call_imm; desc->offset = offset; err = btf_distill_func_proto(&env->log, desc_btf, func_proto, func_name, @@ -2040,7 +2106,7 @@ static int mark_reg_read(struct bpf_verifier_env *env, break; if (parent->live & REG_LIVE_DONE) { verbose(env, "verifier BUG type %s var_off %lld off %d\n", - reg_type_str[parent->type], + reg_type_str(env, parent->type), parent->var_off.value, parent->off); return -EFAULT; } @@ -2224,6 +2290,8 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, return -EINVAL; } + mark_reg_scratched(env, regno); + reg = ®s[regno]; rw64 = is_reg64(env, insn, regno, reg, t); if (t == SRC_OP) { @@ -2298,7 +2366,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL) return NULL; - desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL); + desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off); if (IS_ERR(desc_btf)) return "<error>"; @@ -2328,7 +2396,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, if (insn->code == 0) return 0; - if (env->log.level & BPF_LOG_LEVEL) { + if (env->log.level & BPF_LOG_LEVEL2) { verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); verbose(env, "%d: ", idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); @@ -2378,8 +2446,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, */ if (insn->src_reg != BPF_REG_FP) return 0; - if (BPF_SIZE(insn->code) != BPF_DW) - return 0; /* dreg = *(u64 *)[fp - off] was a fill from the stack. 
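
add_kfunc_call() in the hunk above now rejects kfunc addresses whose call delta does not fit the instruction's signed 32-bit imm field, using a cast round-trip test. A standalone sketch of that test (it assumes a 64-bit unsigned long and gcc's modulo semantics for the narrowing cast, as the kernel code does):

#include <stdio.h>

static int fits_in_s32(unsigned long imm)
{
	return (unsigned long)(int)imm == imm;
}

int main(void)
{
	printf("%d\n", fits_in_s32(0x7fffffffUL));         /* 1: max positive s32 */
	printf("%d\n", fits_in_s32(0xffffffff80000000UL)); /* 1: sign-extends back */
	printf("%d\n", fits_in_s32(0x100000000UL));        /* 0: needs > 32 bits */
	return 0;
}
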
* that [fp - off] slot contains scalar that needs to be @@ -2402,8 +2468,6 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, /* scalars can only be spilled into stack */ if (insn->dst_reg != BPF_REG_FP) return 0; - if (BPF_SIZE(insn->code) != BPF_DW) - return 0; spi = (-insn->off - 1) / BPF_REG_SIZE; if (spi >= 64) { verbose(env, "BUG spi %d\n", spi); @@ -2586,7 +2650,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, DECLARE_BITMAP(mask, 64); u32 history = st->jmp_history_cnt; - if (env->log.level & BPF_LOG_LEVEL) + if (env->log.level & BPF_LOG_LEVEL2) verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); for (i = last_idx;;) { if (skip_first) { @@ -2673,11 +2737,11 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, new_marks = true; reg->precise = true; } - if (env->log.level & BPF_LOG_LEVEL) { - print_verifier_state(env, func); - verbose(env, "parent %s regs=%x stack=%llx marks\n", + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "parent %s regs=%x stack=%llx marks:", new_marks ? "didn't have" : "already had", reg_mask, stack_mask); + print_verifier_state(env, func, true); } if (!reg_mask && !stack_mask) @@ -2703,9 +2767,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) static bool is_spillable_regtype(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_MAP_VALUE: - case PTR_TO_MAP_VALUE_OR_NULL: case PTR_TO_STACK: case PTR_TO_CTX: case PTR_TO_PACKET: @@ -2714,21 +2777,12 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: - case PTR_TO_RDONLY_BUF: - case PTR_TO_RDONLY_BUF_OR_NULL: - case PTR_TO_RDWR_BUF: - case PTR_TO_RDWR_BUF_OR_NULL: - case PTR_TO_PERCPU_BTF_ID: + case PTR_TO_BUF: case PTR_TO_MEM: - case PTR_TO_MEM_OR_NULL: case PTR_TO_FUNC: case PTR_TO_MAP_KEY: return true; @@ -2833,6 +2887,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, env->insn_aux_data[insn_idx].sanitize_stack_spill = true; } + mark_stack_slot_scratched(env, spi); if (reg && !(off % BPF_REG_SIZE) && register_is_bounded(reg) && !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { @@ -2954,6 +3009,7 @@ static int check_stack_write_var_off(struct bpf_verifier_env *env, slot = -i - 1; spi = slot / BPF_REG_SIZE; stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + mark_stack_slot_scratched(env, spi); if (!env->allow_ptr_leaks && *stype != NOT_INIT @@ -3370,11 +3426,8 @@ static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno, /* We may have adjusted the register pointing to memory region, so we * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. - */ - if (env->log.level & BPF_LOG_LEVEL) - print_verifier_state(env, state); - - /* The minimum value is only important with signed + * + * The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. 
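
The scratched-state helpers added earlier in this diff, and wired into the stack-write paths in the hunk above via mark_stack_slot_scratched(), are plain bitmask bookkeeping: one bit per register in a u32 and one bit per stack slot in a u64, so print_verifier_state() can show only what an instruction actually touched. A self-contained model:

#include <stdint.h>
#include <stdio.h>

struct scratch {
	uint32_t regs;   /* one bit per register r0..r10 */
	uint64_t slots;  /* one bit per stack slot */
};

static void mark_reg_scratched(struct scratch *s, unsigned int regno)
{
	s->regs |= 1U << regno;
}

static void mark_stack_slot_scratched(struct scratch *s, unsigned int spi)
{
	s->slots |= 1ULL << spi;
}

static int reg_scratched(const struct scratch *s, unsigned int regno)
{
	return (s->regs >> regno) & 1;
}

int main(void)
{
	struct scratch s = { 0, 0 };

	mark_reg_scratched(&s, 1);
	mark_stack_slot_scratched(&s, 3);
	printf("r0=%d r1=%d slots=%#llx\n", reg_scratched(&s, 0),
	       reg_scratched(&s, 1), (unsigned long long)s.slots);
	/* -> r0=0 r1=1 slots=0x8 */
	return 0;
}
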
If we are using signed variables for our * index'es we need to make sure that whatever we use @@ -3459,11 +3512,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, #define MAX_PACKET_OFF 0xffff -static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) -{ - return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type; -} - static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, const struct bpf_call_arg_meta *meta, enum bpf_access_type t) @@ -3569,7 +3617,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { + if (base_type(*reg_type) == PTR_TO_BTF_ID) { *btf = info.btf; *btf_id = info.btf_id; } else { @@ -3637,7 +3685,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, } verbose(env, "R%d invalid %s access off=%d size=%d\n", - regno, reg_type_str[reg->type], off, size); + regno, reg_type_str(env, reg->type), off, size); return -EACCES; } @@ -3932,16 +3980,23 @@ static int get_callee_stack_depth(struct bpf_verifier_env *env, } #endif -int check_ctx_reg(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, int regno) +static int __check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + bool fixed_off_ok) { - /* Access to ctx or passing it to a helper is only allowed in - * its original, unmodified form. + /* Access to this pointer-typed register or passing it to a helper + * is only allowed in its original, unmodified form. */ - if (reg->off) { - verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", - regno, reg->off); + if (reg->off < 0) { + verbose(env, "negative offset %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); + return -EACCES; + } + + if (!fixed_off_ok && reg->off) { + verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n", + reg_type_str(env, reg->type), regno, reg->off); return -EACCES; } @@ -3949,13 +4004,20 @@ int check_ctx_reg(struct bpf_verifier_env *env, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); + verbose(env, "variable %s access var_off=%s disallowed\n", + reg_type_str(env, reg->type), tn_buf); return -EACCES; } return 0; } +int check_ptr_off_reg(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno) +{ + return __check_ptr_off_reg(env, reg, regno, false); +} + static int __check_buffer_access(struct bpf_verifier_env *env, const char *buf_info, const struct bpf_reg_state *reg, @@ -4000,9 +4062,9 @@ static int check_buffer_access(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, int off, int size, bool zero_size_allowed, - const char *buf_info, u32 *max_access) { + const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr"; int err; err = __check_buffer_access(env, buf_info, reg, regno, off, size); @@ -4055,7 +4117,22 @@ static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) static bool bpf_map_is_rdonly(const struct bpf_map *map) { - return (map->map_flags & BPF_F_RDONLY_PROG) && map->frozen; + /* A map is considered read-only if the following condition are true: + * + * 1) BPF program side cannot change any of the map content. The + * BPF_F_RDONLY_PROG flag is throughout the lifetime of a map + * and was set at map creation time. 
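
The bpf_map_is_rdonly() rework whose expanded comment begins above requires three conditions before map values may be treated as constants. A simplified model of the predicate (the flag value and field names are illustrative; the kernel reads map->frozen with READ_ONCE() and counts pending writers via bpf_map_write_active()):

#include <stdbool.h>
#include <stdio.h>

#define BPF_F_RDONLY_PROG (1U << 7)   /* demo value */

struct map {
	unsigned int flags;
	bool frozen;              /* set by BPF_MAP_FREEZE */
	long write_active;        /* pending syscall-side updates/deletes */
};

static bool map_is_rdonly(const struct map *m)
{
	return (m->flags & BPF_F_RDONLY_PROG) && m->frozen && !m->write_active;
}

int main(void)
{
	struct map m = { BPF_F_RDONLY_PROG, true, 1 };

	printf("%d\n", map_is_rdonly(&m));  /* 0: a writer is still in flight */
	m.write_active = 0;
	printf("%d\n", map_is_rdonly(&m));  /* 1: reads may be constant-folded */
	return 0;
}
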
+ * 2) The map value(s) have been initialized from user space by a + * loader and then "frozen", such that no new map update/delete + * operations from syscall side are possible for the rest of + * the map's lifetime from that point onwards. + * 3) Any parallel/pending map update/delete operations from syscall + * side have been completed. Only after that point, it's safe to + * assume that map value(s) are immutable. + */ + return (map->map_flags & BPF_F_RDONLY_PROG) && + READ_ONCE(map->frozen) && + !bpf_map_write_active(map); } static int bpf_map_direct_read(struct bpf_map *map, int off, int size, u64 *val) @@ -4097,6 +4174,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, struct bpf_reg_state *reg = regs + regno; const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); const char *tname = btf_name_by_offset(reg->btf, t->name_off); + enum bpf_type_flag flag = 0; u32 btf_id; int ret; @@ -4116,9 +4194,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, return -EACCES; } + if (reg->type & MEM_USER) { + verbose(env, + "R%d is ptr_%s access user memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + + if (reg->type & MEM_PERCPU) { + verbose(env, + "R%d is ptr_%s access percpu memory: off=%d\n", + regno, tname, off); + return -EACCES; + } + if (env->ops->btf_struct_access) { ret = env->ops->btf_struct_access(&env->log, reg->btf, t, - off, size, atype, &btf_id); + off, size, atype, &btf_id, &flag); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); @@ -4126,14 +4218,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } ret = btf_struct_access(&env->log, reg->btf, t, off, size, - atype, &btf_id); + atype, &btf_id, &flag); } if (ret < 0) return ret; if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); return 0; } @@ -4146,6 +4238,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; + enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; u32 btf_id; @@ -4183,12 +4276,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); if (ret < 0) return ret; if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag); return 0; } @@ -4349,15 +4442,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } } - } else if (reg->type == PTR_TO_MEM) { + } else if (base_type(reg->type) == PTR_TO_MEM) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + + if (type_may_be_null(reg->type)) { + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + + if (t == BPF_WRITE && rdonly_mem) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { verbose(env, "R%d leaks addr into mem\n", value_regno); return -EACCES; } + err = check_mem_region_access(env, regno, off, size, reg->mem_size, false); - if (!err && t == BPF_READ && 
value_regno >= 0) + if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem)) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -4370,11 +4478,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - err = check_ctx_reg(env, reg, regno); + err = check_ptr_off_reg(env, reg, regno); if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, &btf_id); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, + &btf_id); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -4387,7 +4496,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else { mark_reg_known_zero(env, regs, value_regno); - if (reg_type_may_be_null(reg_type)) + if (type_may_be_null(reg_type)) regs[value_regno].id = ++env->id_gen; /* A load of ctx field could have different * actual load size with the one encoded in the @@ -4395,8 +4504,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * a sub-register. */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; - if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) { + if (base_type(reg_type) == PTR_TO_BTF_ID) { regs[value_regno].btf = btf; regs[value_regno].btf_id = btf_id; } @@ -4449,7 +4557,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); + regno, reg_type_str(env, reg->type)); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -4459,32 +4567,36 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_tp_buffer_access(env, reg, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_BTF_ID) { + } else if (base_type(reg->type) == PTR_TO_BTF_ID && + !type_may_be_null(reg->type)) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); - } else if (reg->type == PTR_TO_RDONLY_BUF) { - if (t == BPF_WRITE) { - verbose(env, "R%d cannot write into %s\n", - regno, reg_type_str[reg->type]); - return -EACCES; + } else if (base_type(reg->type) == PTR_TO_BUF) { + bool rdonly_mem = type_is_rdonly_mem(reg->type); + u32 *max_access; + + if (rdonly_mem) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str(env, reg->type)); + return -EACCES; + } + max_access = &env->prog->aux->max_rdonly_access; + } else { + max_access = &env->prog->aux->max_rdwr_access; } + err = check_buffer_access(env, reg, regno, off, size, false, - "rdonly", - &env->prog->aux->max_rdonly_access); - if (!err && value_regno >= 0) - mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, false, - "rdwr", - &env->prog->aux->max_rdwr_access); - if (!err && t == BPF_READ && value_regno >= 0) + max_access); + + if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ)) mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return 
-EACCES; } @@ -4535,9 +4647,16 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (insn->imm == BPF_CMPXCHG) { /* Check comparison of R0 with memory location */ - err = check_reg_arg(env, BPF_REG_0, SRC_OP); + const u32 aux_reg = BPF_REG_0; + + err = check_reg_arg(env, aux_reg, SRC_OP); if (err) return err; + + if (is_pointer_value(env, aux_reg)) { + verbose(env, "R%d leaks addr into mem\n", aux_reg); + return -EACCES; + } } if (is_pointer_value(env, insn->src_reg)) { @@ -4551,7 +4670,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } @@ -4572,13 +4691,19 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i load_reg = -1; } - /* check whether we can read the memory */ + /* Check whether we can read the memory, with second call for fetch + * case to simulate the register fill. + */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, true); + BPF_SIZE(insn->code), BPF_READ, -1, true); + if (!err && load_reg >= 0) + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, load_reg, + true); if (err) return err; - /* check whether we can write into the same memory */ + /* Check whether we can write into the same memory. */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true); if (err) @@ -4686,7 +4811,7 @@ static int check_stack_range_initialized( } if (is_spilled_reg(&state->stack[spi]) && - state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID) + base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID) goto mark; if (is_spilled_reg(&state->stack[spi]) && @@ -4728,8 +4853,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + u32 *max_access; - switch (reg->type) { + switch (base_type(reg->type)) { case PTR_TO_PACKET: case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, @@ -4748,18 +4874,18 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); - case PTR_TO_RDONLY_BUF: - if (meta && meta->raw_mode) - return -EACCES; - return check_buffer_access(env, reg, regno, reg->off, - access_size, zero_size_allowed, - "rdonly", - &env->prog->aux->max_rdonly_access); - case PTR_TO_RDWR_BUF: + case PTR_TO_BUF: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) + return -EACCES; + + max_access = &env->prog->aux->max_rdonly_access; + } else { + max_access = &env->prog->aux->max_rdwr_access; + } return check_buffer_access(env, reg, regno, reg->off, access_size, zero_size_allowed, - "rdwr", - &env->prog->aux->max_rdwr_access); + max_access); case PTR_TO_STACK: return check_stack_range_initialized( env, @@ -4771,20 +4897,76 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, register_is_null(reg)) return 0; - verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[reg->type], - reg_type_str[PTR_TO_STACK]); + verbose(env, "R%d type=%s ", regno, + reg_type_str(env, reg->type)); + verbose(env, 
"expected=%s\n", reg_type_str(env, PTR_TO_STACK)); return -EACCES; } } +static int check_mem_size_reg(struct bpf_verifier_env *env, + struct bpf_reg_state *reg, u32 regno, + bool zero_size_allowed, + struct bpf_call_arg_meta *meta) +{ + int err; + + /* This is used to refine r0 return value bounds for helpers + * that enforce this value as an upper bound on return values. + * See do_refine_retval_range() for helpers that can refine + * the return value. C type of helper is u32 so we pull register + * bound from umax_value however, if negative verifier errors + * out. Only upper bounds can be learned because retval is an + * int type and negative retvals are allowed. + */ + if (meta) + meta->msize_max_value = reg->umax_value; + + /* The register is SCALAR_VALUE; the access check + * happens using its boundaries. + */ + if (!tnum_is_const(reg->var_off)) + /* For unprivileged variable accesses, disable raw + * mode so that the program is required to + * initialize all the memory that the helper could + * just partially fill up. + */ + meta = NULL; + + if (reg->smin_value < 0) { + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", + regno); + return -EACCES; + } + + if (reg->umin_value == 0) { + err = check_helper_mem_access(env, regno - 1, 0, + zero_size_allowed, + meta); + if (err) + return err; + } + + if (reg->umax_value >= BPF_MAX_VAR_SIZ) { + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + regno); + return -EACCES; + } + err = check_helper_mem_access(env, regno - 1, + reg->umax_value, + zero_size_allowed, meta); + if (!err) + err = mark_chain_precision(env, regno); + return err; +} + int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { if (register_is_null(reg)) return 0; - if (reg_type_may_be_null(reg->type)) { + if (type_may_be_null(reg->type)) { /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. @@ -4801,6 +4983,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, return check_helper_mem_access(env, regno, mem_size, true, NULL); } +int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno) +{ + struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; + bool may_be_null = type_may_be_null(mem_reg->type); + struct bpf_reg_state saved_reg; + int err; + + WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + + if (may_be_null) { + saved_reg = *mem_reg; + mark_ptr_not_null_reg(mem_reg); + } + + err = check_mem_size_reg(env, reg, regno, true, NULL); + + if (may_be_null) + *mem_reg = saved_reg; + return err; +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. 
@@ -4932,9 +5136,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { - return type == ARG_PTR_TO_MEM || - type == ARG_PTR_TO_MEM_OR_NULL || - type == ARG_PTR_TO_UNINIT_MEM; + return base_type(type) == ARG_PTR_TO_MEM || + base_type(type) == ARG_PTR_TO_UNINIT_MEM; } static bool arg_type_is_mem_size(enum bpf_arg_type type) @@ -5039,8 +5242,8 @@ static const struct bpf_reg_types mem_types = { PTR_TO_MAP_KEY, PTR_TO_MAP_VALUE, PTR_TO_MEM, - PTR_TO_RDONLY_BUF, - PTR_TO_RDWR_BUF, + PTR_TO_MEM | MEM_ALLOC, + PTR_TO_BUF, }, }; @@ -5057,11 +5260,11 @@ static const struct bpf_reg_types int_ptr_types = { static const struct bpf_reg_types fullsock_types = { .types = { PTR_TO_SOCKET } }; static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; -static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } }; +static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | MEM_ALLOC } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } }; +static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5071,31 +5274,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = { [ARG_PTR_TO_MAP_KEY] = &map_key_value_types, [ARG_PTR_TO_MAP_VALUE] = &map_key_value_types, [ARG_PTR_TO_UNINIT_MAP_VALUE] = &map_key_value_types, - [ARG_PTR_TO_MAP_VALUE_OR_NULL] = &map_key_value_types, [ARG_CONST_SIZE] = &scalar_types, [ARG_CONST_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_ALLOC_SIZE_OR_ZERO] = &scalar_types, [ARG_CONST_MAP_PTR] = &const_map_ptr_types, [ARG_PTR_TO_CTX] = &context_types, - [ARG_PTR_TO_CTX_OR_NULL] = &context_types, [ARG_PTR_TO_SOCK_COMMON] = &sock_types, #ifdef CONFIG_NET [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types, #endif [ARG_PTR_TO_SOCKET] = &fullsock_types, - [ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types, [ARG_PTR_TO_BTF_ID] = &btf_ptr_types, [ARG_PTR_TO_SPIN_LOCK] = &spin_lock_types, [ARG_PTR_TO_MEM] = &mem_types, - [ARG_PTR_TO_MEM_OR_NULL] = &mem_types, [ARG_PTR_TO_UNINIT_MEM] = &mem_types, [ARG_PTR_TO_ALLOC_MEM] = &alloc_mem_types, - [ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types, [ARG_PTR_TO_INT] = &int_ptr_types, [ARG_PTR_TO_LONG] = &int_ptr_types, [ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types, [ARG_PTR_TO_FUNC] = &func_ptr_types, - [ARG_PTR_TO_STACK_OR_NULL] = &stack_ptr_types, + [ARG_PTR_TO_STACK] = &stack_ptr_types, [ARG_PTR_TO_CONST_STR] = &const_str_ptr_types, [ARG_PTR_TO_TIMER] = &timer_types, }; @@ -5109,12 +5307,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, const struct bpf_reg_types *compatible; int i, j; - compatible = compatible_reg_types[arg_type]; + compatible = compatible_reg_types[base_type(arg_type)]; if (!compatible) { verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type); 
return -EFAULT; } + /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY + * + * Same for MAYBE_NULL: + * + * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL, + * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL + * + * Therefore we fold these flags depending on the arg_type before comparison. + */ + if (arg_type & MEM_RDONLY) + type &= ~MEM_RDONLY; + if (arg_type & PTR_MAYBE_NULL) + type &= ~PTR_MAYBE_NULL; + for (i = 0; i < ARRAY_SIZE(compatible->types); i++) { expected = compatible->types[i]; if (expected == NOT_INIT) @@ -5124,14 +5337,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, goto found; } - verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]); + verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type)); for (j = 0; j + 1 < i; j++) - verbose(env, "%s, ", reg_type_str[compatible->types[j]]); - verbose(env, "%s\n", reg_type_str[compatible->types[j]]); + verbose(env, "%s, ", reg_type_str(env, compatible->types[j])); + verbose(env, "%s\n", reg_type_str(env, compatible->types[j])); return -EACCES; found: - if (type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID) { if (!arg_btf_id) { if (!compatible->btf_id) { verbose(env, "verifier internal error: missing arg compatible BTF ID\n"); @@ -5147,15 +5360,63 @@ found: kernel_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } + } + + return 0; +} - if (!tnum_is_const(reg->var_off) || reg->var_off.value) { - verbose(env, "R%d is a pointer to in-kernel struct with non-zero offset\n", +int check_func_arg_reg_off(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int regno, + enum bpf_arg_type arg_type, + bool is_release_func) +{ + bool fixed_off_ok = false, release_reg; + enum bpf_reg_type type = reg->type; + + switch ((u32)type) { + case SCALAR_VALUE: + /* Pointer types where reg offset is explicitly allowed: */ + case PTR_TO_PACKET: + case PTR_TO_PACKET_META: + case PTR_TO_MAP_KEY: + case PTR_TO_MAP_VALUE: + case PTR_TO_MEM: + case PTR_TO_MEM | MEM_RDONLY: + case PTR_TO_MEM | MEM_ALLOC: + case PTR_TO_BUF: + case PTR_TO_BUF | MEM_RDONLY: + case PTR_TO_STACK: + /* Some of the argument types nevertheless require a + * zero register offset. + */ + if (arg_type != ARG_PTR_TO_ALLOC_MEM) + return 0; + break; + /* All the rest must be rejected, except PTR_TO_BTF_ID which allows + * fixed offset. + */ + case PTR_TO_BTF_ID: + /* When referenced PTR_TO_BTF_ID is passed to release function, + * it's fixed offset must be 0. We rely on the property that + * only one referenced register can be passed to BPF helpers and + * kfuncs. In the other cases, fixed offset can be non-zero. + */ + release_reg = is_release_func && reg->ref_obj_id; + if (release_reg && reg->off) { + verbose(env, "R%d must have zero offset when passed to release func\n", regno); - return -EACCES; + return -EINVAL; } + /* For release_reg == true, fixed_off_ok must be false, but we + * already checked and rejected reg->off != 0 above, so set to + * true to allow fixed offset for all other cases. 
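
check_reg_type() in the hunk above folds modifier flags off the register type before comparing it against the compatible-type list: an arg type that tolerates MEM_RDONLY (or PTR_MAYBE_NULL) matches both the plain and the flagged register, while an arg type without the flag rejects the flagged register. A compact model with demo flag values:

#include <stdio.h>

#define MEM_RDONLY     0x100
#define PTR_MAYBE_NULL 0x200
#define PTR_TO_MEM     1

static int compatible(int arg_type, int reg_type, const int *ok, int n)
{
	int i;

	if (arg_type & MEM_RDONLY)
		reg_type &= ~MEM_RDONLY;      /* arg tolerates rdonly: fold it */
	if (arg_type & PTR_MAYBE_NULL)
		reg_type &= ~PTR_MAYBE_NULL;  /* arg tolerates NULL: fold it */
	for (i = 0; i < n; i++)
		if (ok[i] == reg_type)
			return 1;
	return 0;
}

int main(void)
{
	int ok[] = { PTR_TO_MEM };

	/* rdonly reg vs arg that accepts rdonly -> match */
	printf("%d\n", compatible(PTR_TO_MEM | MEM_RDONLY,
				  PTR_TO_MEM | MEM_RDONLY, ok, 1)); /* 1 */
	/* rdonly reg vs arg that demands writable memory -> no match */
	printf("%d\n", compatible(PTR_TO_MEM,
				  PTR_TO_MEM | MEM_RDONLY, ok, 1)); /* 0 */
	return 0;
}
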
+ */ + fixed_off_ok = true; + break; + default: + break; } - - return 0; + return __check_ptr_off_reg(env, reg, regno, fixed_off_ok); } static int check_func_arg(struct bpf_verifier_env *env, u32 arg, @@ -5190,15 +5451,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, return -EACCES; } - if (arg_type == ARG_PTR_TO_MAP_VALUE || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || - arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { + if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { err = resolve_map_arg_type(env, meta, &arg_type); if (err) return err; } - if (register_is_null(reg) && arg_type_may_be_null(arg_type)) + if (register_is_null(reg) && type_may_be_null(arg_type)) /* A NULL register has a SCALAR_VALUE type, so skip * type checking. */ @@ -5208,13 +5468,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, if (err) return err; - if (type == PTR_TO_CTX) { - err = check_ctx_reg(env, reg, regno); - if (err < 0) - return err; - } + err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id)); + if (err) + return err; skip_type_check: + /* check_func_arg_reg_off relies on only one referenced register being + * allowed for BPF helpers. + */ if (reg->ref_obj_id) { if (meta->ref_obj_id) { verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", @@ -5267,10 +5528,11 @@ skip_type_check: err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, false, NULL); - } else if (arg_type == ARG_PTR_TO_MAP_VALUE || - (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && - !register_is_null(reg)) || - arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { + } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE || + base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) { + if (type_may_be_null(arg_type) && register_is_null(reg)) + return 0; + /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity */ @@ -5314,51 +5576,7 @@ skip_type_check: } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* This is used to refine r0 return value bounds for helpers - * that enforce this value as an upper bound on return values. - * See do_refine_retval_range() for helpers that can refine - * the return value. C type of helper is u32 so we pull register - * bound from umax_value however, if negative verifier errors - * out. Only upper bounds can be learned because retval is an - * int type and negative retvals are allowed. - */ - meta->msize_max_value = reg->umax_value; - - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. - */ - if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. 
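
The block being deleted above is the size-register validation that moved into check_mem_size_reg() earlier in this diff: reject a possibly negative size, reject an unbounded umax, and verify the worst case against the memory region. A simplified standalone version (zero-size handling and the mark_chain_precision() call are omitted, and the limit is a stand-in):

#include <stdio.h>

#define MAX_VAR_SIZ (1 << 29)   /* stand-in for BPF_MAX_VAR_SIZ */

/* Returns 0 if every size up to umax is a safe access into a region of
 * mem_size bytes, <0 otherwise.
 */
static int check_size_reg(long long smin, unsigned long long umax,
			  unsigned long long mem_size)
{
	if (smin < 0)
		return -1;            /* size could be negative */
	if (umax >= MAX_VAR_SIZ)
		return -1;            /* unbounded memory access */
	if (umax > mem_size)
		return -1;            /* worst case runs past the buffer */
	return 0;
}

int main(void)
{
	printf("%d\n", check_size_reg(0, 64, 128));   /*  0: ok */
	printf("%d\n", check_size_reg(0, 256, 128));  /* -1: too big */
	printf("%d\n", check_size_reg(-8, 64, 128));  /* -1: may be negative */
	return 0;
}
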
- */ - meta = NULL; - - if (reg->smin_value < 0) { - verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", - regno); - return -EACCES; - } - - if (reg->umin_value == 0) { - err = check_helper_mem_access(env, regno - 1, 0, - zero_size_allowed, - meta); - if (err) - return err; - } - - if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", - regno); - return -EACCES; - } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); - if (!err) - err = mark_chain_precision(env, regno); + err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { verbose(env, "R%d is not a known constant'\n", @@ -5934,6 +6152,7 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn } if (insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback) { struct bpf_verifier_state *async_cb; @@ -5992,9 +6211,9 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "caller:\n"); - print_verifier_state(env, caller); + print_verifier_state(env, caller, true); verbose(env, "callee:\n"); - print_verifier_state(env, callee); + print_verifier_state(env, callee, true); } return 0; } @@ -6085,6 +6304,27 @@ static int set_map_elem_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_loop_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, + * u64 flags); + * callback_fn(u32 index, void *callback_ctx); + */ + callee->regs[BPF_REG_1].type = SCALAR_VALUE; + callee->regs[BPF_REG_2] = caller->regs[BPF_REG_3]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_3]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + + callee->in_callback_fn = true; + return 0; +} + static int set_timer_callback_state(struct bpf_verifier_env *env, struct bpf_func_state *caller, struct bpf_func_state *callee, @@ -6114,6 +6354,33 @@ static int set_timer_callback_state(struct bpf_verifier_env *env, return 0; } +static int set_find_vma_callback_state(struct bpf_verifier_env *env, + struct bpf_func_state *caller, + struct bpf_func_state *callee, + int insn_idx) +{ + /* bpf_find_vma(struct task_struct *task, u64 addr, + * void *callback_fn, void *callback_ctx, u64 flags) + * (callback_fn)(struct task_struct *task, + * struct vm_area_struct *vma, void *callback_ctx); + */ + callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1]; + + callee->regs[BPF_REG_2].type = PTR_TO_BTF_ID; + __mark_reg_known_zero(&callee->regs[BPF_REG_2]); + callee->regs[BPF_REG_2].btf = btf_vmlinux; + callee->regs[BPF_REG_2].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA], + + /* pointer to stack or null */ + callee->regs[BPF_REG_3] = caller->regs[BPF_REG_4]; + + /* unused */ + __mark_reg_not_init(env, &callee->regs[BPF_REG_4]); + __mark_reg_not_init(env, &callee->regs[BPF_REG_5]); + callee->in_callback_fn = true; + return 0; +} + static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) { struct bpf_verifier_state *state = env->cur_state; @@ -6161,9 +6428,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) *insn_idx = callee->callsite + 
1; if (env->log.level & BPF_LOG_LEVEL) { verbose(env, "returning from callee:\n"); - print_verifier_state(env, callee); + print_verifier_state(env, callee, true); verbose(env, "to caller at %d:\n", *insn_idx); - print_verifier_state(env, caller); + print_verifier_state(env, caller, true); } /* clear everything in the callee */ free_func_state(callee); @@ -6329,13 +6596,11 @@ static int check_bpf_snprintf_call(struct bpf_verifier_env *env, static int check_get_func_ip(struct bpf_verifier_env *env) { - enum bpf_attach_type eatype = env->prog->expected_attach_type; enum bpf_prog_type type = resolve_prog_type(env->prog); int func_id = BPF_FUNC_get_func_ip; if (type == BPF_PROG_TYPE_TRACING) { - if (eatype != BPF_TRACE_FENTRY && eatype != BPF_TRACE_FEXIT && - eatype != BPF_MODIFY_RETURN) { + if (!bpf_prog_has_trampoline(env->prog)) { verbose(env, "func %s#%d supported only for fentry/fexit/fmod_ret programs\n", func_id_name(func_id), func_id); return -ENOTSUPP; @@ -6354,6 +6619,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn int *insn_idx_p) { const struct bpf_func_proto *fn = NULL; + enum bpf_return_type ret_type; + enum bpf_type_flag ret_flag; struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; int insn_idx = *insn_idx_p; @@ -6431,13 +6698,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn return err; } - if (func_id == BPF_FUNC_tail_call) { - err = check_reference_leak(env); - if (err) { - verbose(env, "tail_call would lead to reference leak\n"); - return err; - } - } else if (is_release_function(func_id)) { + if (is_release_function(func_id)) { err = release_reference(env, meta.ref_obj_id); if (err) { verbose(env, "func %s#%d reference has not been acquired before\n", @@ -6448,35 +6709,47 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs = cur_regs(env); - /* check that flags argument in get_local_storage(map, flags) is 0, - * this is required because get_local_storage() can't return an error. - */ - if (func_id == BPF_FUNC_get_local_storage && - !register_is_null(®s[BPF_REG_2])) { - verbose(env, "get_local_storage() doesn't support non-zero flags\n"); - return -EINVAL; - } - - if (func_id == BPF_FUNC_for_each_map_elem) { + switch (func_id) { + case BPF_FUNC_tail_call: + err = check_reference_leak(env); + if (err) { + verbose(env, "tail_call would lead to reference leak\n"); + return err; + } + break; + case BPF_FUNC_get_local_storage: + /* check that flags argument in get_local_storage(map, flags) is 0, + * this is required because get_local_storage() can't return an error. 
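
set_loop_callback_state() in the hunk further up seeds the callee frame for bpf_loop()'s callback_fn(u32 index, void *callback_ctx): r1 becomes a scalar, r2 inherits the caller's r3 (the callback_ctx argument), and the unused r3-r5 are poisoned so any read of them fails verification. A toy model of that seeding (register state reduced to a bare type enum):

#include <stdio.h>

enum reg_type { NOT_INIT, SCALAR_VALUE, PTR_TO_STACK };

struct state { enum reg_type regs[6]; };   /* r0..r5, simplified */

static void set_loop_callback_state(const struct state *caller,
				    struct state *callee)
{
	callee->regs[1] = SCALAR_VALUE;     /* r1 = loop index */
	callee->regs[2] = caller->regs[3];  /* r2 = caller's callback_ctx */
	callee->regs[3] = NOT_INIT;         /* unused args must not be read */
	callee->regs[4] = NOT_INIT;
	callee->regs[5] = NOT_INIT;
}

int main(void)
{
	struct state caller = { .regs = { [3] = PTR_TO_STACK } }, callee = { 0 };

	set_loop_callback_state(&caller, &callee);
	printf("r1=%d r2=%d r3=%d\n", callee.regs[1], callee.regs[2],
	       callee.regs[3]);   /* -> r1=1 r2=2 r3=0 */
	return 0;
}
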
+ */ + if (!register_is_null(®s[BPF_REG_2])) { + verbose(env, "get_local_storage() doesn't support non-zero flags\n"); + return -EINVAL; + } + break; + case BPF_FUNC_for_each_map_elem: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_map_elem_callback_state); - if (err < 0) - return -EINVAL; - } - - if (func_id == BPF_FUNC_timer_set_callback) { + break; + case BPF_FUNC_timer_set_callback: err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, set_timer_callback_state); - if (err < 0) - return -EINVAL; - } - - if (func_id == BPF_FUNC_snprintf) { + break; + case BPF_FUNC_find_vma: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_find_vma_callback_state); + break; + case BPF_FUNC_snprintf: err = check_bpf_snprintf_call(env, regs); - if (err < 0) - return err; + break; + case BPF_FUNC_loop: + err = __check_func_call(env, insn, insn_idx_p, meta.subprogno, + set_loop_callback_state); + break; } + if (err) + return err; + /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(env, regs, caller_saved[i]); @@ -6487,13 +6760,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* update return register (already marked as written above) */ - if (fn->ret_type == RET_INTEGER) { + ret_type = fn->ret_type; + ret_flag = type_flag(fn->ret_type); + if (ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ mark_reg_unknown(env, regs, BPF_REG_0); - } else if (fn->ret_type == RET_VOID) { + } else if (ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; - } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || - fn->ret_type == RET_PTR_TO_MAP_VALUE) { + } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) { /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); /* remember map_ptr, so that check_map_access() @@ -6507,28 +6781,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].map_uid = meta.map_uid; - if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; - if (map_value_has_spin_lock(meta.map_ptr)) - regs[BPF_REG_0].id = ++env->id_gen; - } else { - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; + if (!type_may_be_null(ret_type) && + map_value_has_spin_lock(meta.map_ptr)) { + regs[BPF_REG_0].id = ++env->id_gen; } - } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { + } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag; + } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag; + } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; - } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag; + } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) { mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL; + 
regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = meta.mem_size; - } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) { const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); @@ -6546,29 +6817,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn tname, PTR_ERR(ret)); return -EINVAL; } - regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_MEM : PTR_TO_MEM_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag; regs[BPF_REG_0].mem_size = tsize; } else { - regs[BPF_REG_0].type = - fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? - PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + /* MEM_RDONLY may be carried from ret_flag, but it + * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise + * it will confuse the check of PTR_TO_BTF_ID in + * check_mem_access(). + */ + ret_flag &= ~MEM_RDONLY; + + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || - fn->ret_type == RET_PTR_TO_BTF_ID) { + } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? - PTR_TO_BTF_ID : - PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { - verbose(env, "invalid return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "invalid return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), + func_id); return -EINVAL; } /* current BPF helper definitions are only coming from @@ -6577,12 +6849,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].btf = btf_vmlinux; regs[BPF_REG_0].btf_id = ret_btf_id; } else { - verbose(env, "unknown return type %d of func %s#%d\n", - fn->ret_type, func_id_name(func_id), func_id); + verbose(env, "unknown return type %u of func %s#%d\n", + base_type(ret_type), func_id_name(func_id), func_id); return -EINVAL; } - if (reg_type_may_be_null(regs[BPF_REG_0].type)) + if (type_may_be_null(regs[BPF_REG_0].type)) regs[BPF_REG_0].id = ++env->id_gen; if (is_ptr_cast_function(func_id)) { @@ -6663,22 +6935,23 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno, } } -static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) +static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx_p) { const struct btf_type *t, *func, *func_proto, *ptr_type; struct bpf_reg_state *regs = cur_regs(env); const char *func_name, *ptr_type_name; u32 i, nargs, func_id, ptr_type_id; - struct module *btf_mod = NULL; + int err, insn_idx = *insn_idx_p; const struct btf_param *args; struct btf *desc_btf; - int err; + bool acq; /* skip for now, but return error when we find this in fixup_kfunc_call */ if (!insn->imm) return 0; - desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod); + desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off); if (IS_ERR(desc_btf)) return PTR_ERR(desc_btf); @@ -6687,23 +6960,43 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) func_name = btf_name_by_offset(desc_btf, func->name_off); func_proto = btf_type_by_id(desc_btf, func->type); - if 
(!env->ops->check_kfunc_call || - !env->ops->check_kfunc_call(func_id, btf_mod)) { + if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_CHECK, func_id)) { verbose(env, "calling kernel function %s is not allowed\n", func_name); return -EACCES; } + acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_ACQUIRE, func_id); + /* Check the arguments */ err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs); - if (err) + if (err < 0) return err; + /* In case of release function, we get register number of refcounted + * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now + */ + if (err) { + err = release_reference(env, regs[err].ref_obj_id); + if (err) { + verbose(env, "kfunc %s#%d reference has not been acquired before\n", + func_name, func_id); + return err; + } + } for (i = 0; i < CALLER_SAVED_REGS; i++) mark_reg_not_init(env, regs, caller_saved[i]); /* Check return type */ t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL); + + if (acq && !btf_type_is_ptr(t)) { + verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n"); + return -EINVAL; + } + if (btf_type_is_scalar(t)) { mark_reg_unknown(env, regs, BPF_REG_0); mark_btf_func_reg_size(env, BPF_REG_0, t->size); @@ -6722,7 +7015,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn) regs[BPF_REG_0].btf = desc_btf; regs[BPF_REG_0].type = PTR_TO_BTF_ID; regs[BPF_REG_0].btf_id = ptr_type_id; + if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), + BTF_KFUNC_TYPE_RET_NULL, func_id)) { + regs[BPF_REG_0].type |= PTR_MAYBE_NULL; + /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */ + regs[BPF_REG_0].id = ++env->id_gen; + } mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *)); + if (acq) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + regs[BPF_REG_0].id = id; + regs[BPF_REG_0].ref_obj_id = id; + } } /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */ nargs = btf_type_vlen(func_proto); @@ -6791,25 +7098,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { verbose(env, "math between %s pointer and %lld is not allowed\n", - reg_type_str[type], val); + reg_type_str(env, type), val); return false; } if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { verbose(env, "%s pointer offset %d is not allowed\n", - reg_type_str[type], reg->off); + reg_type_str(env, type), reg->off); return false; } if (smin == S64_MIN) { verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", - reg_type_str[type]); + reg_type_str(env, type)); return false; } if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { verbose(env, "value %lld makes %s pointer be out of bounds\n", - smin, reg_type_str[type]); + smin, reg_type_str(env, type)); return false; } @@ -7186,11 +7493,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - switch (ptr_reg->type) { - case PTR_TO_MAP_VALUE_OR_NULL: + if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; + } + + switch (base_type(ptr_reg->type)) { case CONST_PTR_TO_MAP: /* smin_val represents the known value */ if (known && smin_val == 0 && opcode == BPF_ADD) @@ -7198,14 +7507,11 @@ static int 
adjust_ptr_min_max_vals(struct bpf_verifier_env *env, fallthrough; case PTR_TO_PACKET_END: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: verbose(env, "R%d pointer arithmetic on %s prohibited\n", - dst, reg_type_str[ptr_reg->type]); + dst, reg_type_str(env, ptr_reg->type)); return -EACCES; default: break; @@ -8178,12 +8484,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, state); + print_verifier_state(env, state, true); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, state); + print_verifier_state(env, state, true); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -8292,6 +8598,10 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->dst_reg); } zext_32_to_64(dst_reg); + + __update_reg_bounds(dst_reg); + __reg_deduce_bounds(dst_reg); + __reg_bound_offset(dst_reg); } } else { /* case: R = imm @@ -8406,7 +8716,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, new_range = dst_reg->off; if (range_right_open) - new_range--; + new_range++; /* Examples for register markings: * @@ -8924,17 +9234,17 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, struct bpf_reg_state *reg, u32 id, bool is_null) { - if (reg_type_may_be_null(reg->type) && reg->id == id && + if (type_may_be_null(reg->type) && reg->id == id && !WARN_ON_ONCE(!reg->id)) { - /* Old offset (both fixed and variable parts) should - * have been known-zero, because we don't allow pointer - * arithmetic on pointers that might be NULL. - */ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || !tnum_equals_const(reg->var_off, 0) || reg->off)) { - __mark_reg_known_zero(reg); - reg->off = 0; + /* Old offset (both fixed and variable parts) should + * have been known-zero, because we don't allow pointer + * arithmetic on pointers that might be NULL. If we + * see this happening, don't convert the register. + */ + return; } if (is_null) { reg->type = SCALAR_VALUE; @@ -9302,7 +9612,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && - reg_type_may_be_null(dst_reg->type)) { + type_may_be_null(dst_reg->type)) { /* Mark all identical registers in each branch as either * safe or unknown depending R == 0 or R != 0 conditional. */ @@ -9318,7 +9628,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level & BPF_LOG_LEVEL) - print_verifier_state(env, this_branch->frame[this_branch->curframe]); + print_insn_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -9353,16 +9663,19 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } - if (insn->src_reg == BPF_PSEUDO_BTF_ID) { - mark_reg_known_zero(env, regs, insn->dst_reg); + /* All special src_reg cases are listed below. From this point onwards + * we either succeed and assign a corresponding dst_reg->type after + * zeroing the offset, or fail and reject the program. 
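
check_ld_imm() in the hunk above now zeroes the destination register once up front and then dispatches on the ld_imm64 pseudo src_reg. Roughly, the type selection looks like the sketch below; the enum values are demo stand-ins for the BPF_PSEUDO_* constants, and the PSEUDO_FUNC case is inferred from context outside this hunk:

#include <stdio.h>

enum { PSEUDO_NONE, PSEUDO_BTF_ID, PSEUDO_MAP_FD, PSEUDO_MAP_VALUE, PSEUDO_FUNC };
enum { SCALAR_VALUE, PTR_TO_BTF_ID, CONST_PTR_TO_MAP, PTR_TO_MAP_VALUE, PTR_TO_FUNC };

static int ld_imm_dst_type(int src_reg)
{
	switch (src_reg) {
	case PSEUDO_BTF_ID:    return PTR_TO_BTF_ID;     /* kernel variable */
	case PSEUDO_FUNC:      return PTR_TO_FUNC;       /* subprog address */
	case PSEUDO_MAP_FD:    return CONST_PTR_TO_MAP;  /* map pointer */
	case PSEUDO_MAP_VALUE: return PTR_TO_MAP_VALUE;  /* map value pointer */
	default:               return SCALAR_VALUE;      /* plain 64-bit imm */
	}
}

int main(void)
{
	printf("%d\n", ld_imm_dst_type(PSEUDO_MAP_VALUE));  /* -> 3 */
	return 0;
}
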
+ */ + mark_reg_known_zero(env, regs, insn->dst_reg); + if (insn->src_reg == BPF_PSEUDO_BTF_ID) { dst_reg->type = aux->btf_var.reg_type; - switch (dst_reg->type) { + switch (base_type(dst_reg->type)) { case PTR_TO_MEM: dst_reg->mem_size = aux->btf_var.mem_size; break; case PTR_TO_BTF_ID: - case PTR_TO_PERCPU_BTF_ID: dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; @@ -9393,7 +9706,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) } map = env->used_maps[aux->map_index]; - mark_reg_known_zero(env, regs, insn->dst_reg); dst_reg->map_ptr = map; if (insn->src_reg == BPF_PSEUDO_MAP_VALUE || @@ -9497,7 +9809,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } - err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg); + err = check_ptr_off_reg(env, &regs[ctx_reg], ctx_reg); if (err < 0) return err; @@ -9556,7 +9868,7 @@ static int check_return_code(struct bpf_verifier_env *env) /* enforce return zero from async callbacks like timer */ if (reg->type != SCALAR_VALUE) { verbose(env, "In async callback the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } @@ -9570,7 +9882,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (is_subprog) { if (reg->type != SCALAR_VALUE) { verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } return 0; } @@ -9634,7 +9946,7 @@ static int check_return_code(struct bpf_verifier_env *env) if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", - reg_type_str[reg->type]); + reg_type_str(env, reg->type)); return -EINVAL; } @@ -10088,8 +10400,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env) aux->func_info[i].insn_off = env->subprog_info[i].start; } -#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ - sizeof(((struct bpf_line_info *)(0))->line_col)) +#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col) #define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE static int check_btf_line(struct bpf_verifier_env *env, @@ -10217,6 +10528,78 @@ err_free: return err; } +#define MIN_CORE_RELO_SIZE sizeof(struct bpf_core_relo) +#define MAX_CORE_RELO_SIZE MAX_FUNCINFO_REC_SIZE + +static int check_core_relo(struct bpf_verifier_env *env, + const union bpf_attr *attr, + bpfptr_t uattr) +{ + u32 i, nr_core_relo, ncopy, expected_size, rec_size; + struct bpf_core_relo core_relo = {}; + struct bpf_prog *prog = env->prog; + const struct btf *btf = prog->aux->btf; + struct bpf_core_ctx ctx = { + .log = &env->log, + .btf = btf, + }; + bpfptr_t u_core_relo; + int err; + + nr_core_relo = attr->core_relo_cnt; + if (!nr_core_relo) + return 0; + if (nr_core_relo > INT_MAX / sizeof(struct bpf_core_relo)) + return -EINVAL; + + rec_size = attr->core_relo_rec_size; + if (rec_size < MIN_CORE_RELO_SIZE || + rec_size > MAX_CORE_RELO_SIZE || + rec_size % sizeof(u32)) + return -EINVAL; + + u_core_relo = make_bpfptr(attr->core_relos, uattr.is_kernel); + expected_size = sizeof(struct bpf_core_relo); + ncopy = min_t(u32, expected_size, rec_size); + + /* Unlike func_info and line_info, copy and apply each CO-RE + * relocation record one at a time. 
+ */ + for (i = 0; i < nr_core_relo; i++) { + /* future proofing when sizeof(bpf_core_relo) changes */ + err = bpf_check_uarg_tail_zero(u_core_relo, expected_size, rec_size); + if (err) { + if (err == -E2BIG) { + verbose(env, "nonzero tailing record in core_relo"); + if (copy_to_bpfptr_offset(uattr, + offsetof(union bpf_attr, core_relo_rec_size), + &expected_size, sizeof(expected_size))) + err = -EFAULT; + } + break; + } + + if (copy_from_bpfptr(&core_relo, u_core_relo, ncopy)) { + err = -EFAULT; + break; + } + + if (core_relo.insn_off % 8 || core_relo.insn_off / 8 >= prog->len) { + verbose(env, "Invalid core_relo[%u].insn_off:%u prog->len:%u\n", + i, core_relo.insn_off, prog->len); + err = -EINVAL; + break; + } + + err = bpf_core_apply(&ctx, &core_relo, i, + &prog->insnsi[core_relo.insn_off / 8]); + if (err) + break; + bpfptr_add(&u_core_relo, rec_size); + } + return err; +} + static int check_btf_info(struct bpf_verifier_env *env, const union bpf_attr *attr, bpfptr_t uattr) @@ -10247,6 +10630,10 @@ static int check_btf_info(struct bpf_verifier_env *env, if (err) return err; + err = check_core_relo(env, attr, uattr); + if (err) + return err; + return 0; } @@ -10415,7 +10802,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return true; if (rcur->type == NOT_INIT) return false; - switch (rold->type) { + switch (base_type(rold->type)) { case SCALAR_VALUE: if (env->explore_alu_limits) return false; @@ -10437,6 +10824,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, } case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: + /* a PTR_TO_MAP_VALUE could be safe to use as a + * PTR_TO_MAP_VALUE_OR_NULL into the same map. + * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- + * checked, doing so could have affected others with the same + * id, and we can't check for that because we lost the id when + * we converted to a PTR_TO_MAP_VALUE. + */ + if (type_may_be_null(rold->type)) { + if (!type_may_be_null(rcur->type)) + return false; + if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) + return false; + /* Check our ids match any regs they're supposed to */ + return check_ids(rold->id, rcur->id, idmap); + } + /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. * 'id' is not compared, since it's only used for maps with @@ -10448,20 +10851,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); - case PTR_TO_MAP_VALUE_OR_NULL: - /* a PTR_TO_MAP_VALUE could be safe to use as a - * PTR_TO_MAP_VALUE_OR_NULL into the same map. - * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- - * checked, doing so could have affected others with the same - * id, and we can't check for that because we lost the id when - * we converted to a PTR_TO_MAP_VALUE. 
- */ - if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) - return false; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) - return false; - /* Check our ids match any regs they're supposed to */ - return check_ids(rold->id, rcur->id, idmap); case PTR_TO_PACKET_META: case PTR_TO_PACKET: if (rcur->type != rold->type) @@ -10490,11 +10879,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold, case PTR_TO_PACKET_END: case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -11020,17 +11406,13 @@ next: /* Return true if it's OK to have the same insn return a different type. */ static bool reg_type_mismatch_ok(enum bpf_reg_type type) { - switch (type) { + switch (base_type(type)) { case PTR_TO_CTX: case PTR_TO_SOCKET: - case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: - case PTR_TO_SOCK_COMMON_OR_NULL: case PTR_TO_TCP_SOCK: - case PTR_TO_TCP_SOCK_OR_NULL: case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: - case PTR_TO_BTF_ID_OR_NULL: return false; default: return true; @@ -11110,16 +11492,12 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (env->log.level & BPF_LOG_LEVEL2 || - (env->log.level & BPF_LOG_LEVEL && do_print_state)) { - if (env->log.level & BPF_LOG_LEVEL2) - verbose(env, "%d:", env->insn_idx); - else - verbose(env, "\nfrom %d to %d%s:", - env->prev_insn_idx, env->insn_idx, - env->cur_state->speculative ? - " (speculative execution)" : ""); - print_verifier_state(env, state->frame[state->curframe]); + if (env->log.level & BPF_LOG_LEVEL2 && do_print_state) { + verbose(env, "\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? 
+ " (speculative execution)" : ""); + print_verifier_state(env, state->frame[state->curframe], true); do_print_state = false; } @@ -11130,9 +11508,15 @@ static int do_check(struct bpf_verifier_env *env) .private_data = env, }; + if (verifier_state_scratched(env)) + print_insn_state(env, state->frame[state->curframe]); + verbose_linfo(env, env->insn_idx, "; "); + env->prev_log_len = env->log.len_used; verbose(env, "%d: ", env->insn_idx); print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + env->prev_insn_print_len = env->log.len_used - env->prev_log_len; + env->prev_log_len = env->log.len_used; } if (bpf_prog_is_dev_bound(env->prog->aux)) { @@ -11254,7 +11638,7 @@ static int do_check(struct bpf_verifier_env *env) if (is_ctx_reg(env, insn->dst_reg)) { verbose(env, "BPF_ST stores into R%d %s is not allowed\n", insn->dst_reg, - reg_type_str[reg_state(env, insn->dst_reg)->type]); + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); return -EACCES; } @@ -11291,7 +11675,7 @@ static int do_check(struct bpf_verifier_env *env) if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) - err = check_kfunc_call(env, insn); + err = check_kfunc_call(env, insn, &env->insn_idx); else err = check_helper_call(env, insn, &env->insn_idx); if (err) @@ -11341,6 +11725,7 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; process_bpf_exit: + mark_verifier_state_scratched(env); update_branch_counts(env, env->cur_state); err = pop_stack(env, &prev_insn_idx, &env->insn_idx, pop_log); @@ -11489,7 +11874,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, type = t->type; t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { - aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU; aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { @@ -11506,7 +11891,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, err = -EINVAL; goto err_put; } - aux->btf_var.reg_type = PTR_TO_MEM; + aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; @@ -11631,6 +12016,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if (map_value_has_timer(map)) { + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_timer yet\n"); + return -EINVAL; + } + } + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); @@ -11659,6 +12051,9 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } break; case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_INODE_STORAGE: + case BPF_MAP_TYPE_SK_STORAGE: + case BPF_MAP_TYPE_TASK_STORAGE: break; default: verbose(env, @@ -12628,6 +13023,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->name[0] = 'F'; func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; func[i]->jit_requested = 1; + func[i]->blinding_requested = prog->blinding_requested; func[i]->aux->kfunc_tab = prog->aux->kfunc_tab; func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab; func[i]->aux->linfo = prog->aux->linfo; @@ -12723,6 +13119,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) prog->jited = 1; prog->bpf_func = func[0]->bpf_func; + prog->jited_len = func[0]->jited_len; prog->aux->func = func; prog->aux->func_cnt = 
env->subprog_cnt; bpf_prog_jit_attempt_done(prog); @@ -12750,6 +13147,7 @@ out_free: out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; + prog->blinding_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (!bpf_pseudo_call(insn)) continue; @@ -12842,7 +13240,7 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, static int do_misc_fixups(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; - bool expect_blinding = bpf_jit_blinding_enabled(prog); + enum bpf_attach_type eatype = prog->expected_attach_type; enum bpf_prog_type prog_type = resolve_prog_type(prog); struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; @@ -13006,7 +13404,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->bpf_capable && !expect_blinding && + if (env->bpf_capable && !prog->blinding_requested && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -13094,6 +13492,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto patch_call_imm; } + if (insn->imm == BPF_FUNC_task_storage_get || + insn->imm == BPF_FUNC_sk_storage_get || + insn->imm == BPF_FUNC_inode_storage_get) { + if (env->prog->aux->sleepable) + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL); + else + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC); + insn_buf[1] = *insn; + cnt = 2; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto patch_call_imm; + } + /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * and other inlining handlers are currently limited to 64 bit * only. @@ -13212,11 +13630,79 @@ patch_map_ops_generic: continue; } + /* Implement bpf_get_func_arg inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_arg) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_JMP32_REG(BPF_JGE, BPF_REG_2, BPF_REG_0, 6); + insn_buf[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 3); + insn_buf[3] = BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1); + insn_buf[4] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0); + insn_buf[5] = BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[6] = BPF_MOV64_IMM(BPF_REG_0, 0); + insn_buf[7] = BPF_JMP_A(1); + insn_buf[8] = BPF_MOV64_IMM(BPF_REG_0, -EINVAL); + cnt = 9; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + + /* Implement bpf_get_func_ret inline. 
*/ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_ret) { + if (eatype == BPF_TRACE_FEXIT || + eatype == BPF_MODIFY_RETURN) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + insn_buf[1] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3); + insn_buf[2] = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1); + insn_buf[3] = BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0); + insn_buf[4] = BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0); + insn_buf[5] = BPF_MOV64_IMM(BPF_REG_0, 0); + cnt = 6; + } else { + insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, -EOPNOTSUPP); + cnt = 1; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + + /* Implement get_func_arg_cnt inline. */ + if (prog_type == BPF_PROG_TYPE_TRACING && + insn->imm == BPF_FUNC_get_func_arg_cnt) { + /* Load nr_args from ctx - 8 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); + if (!new_prog) + return -ENOMEM; + + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + /* Implement bpf_get_func_ip inline. */ if (prog_type == BPF_PROG_TYPE_TRACING && insn->imm == BPF_FUNC_get_func_ip) { - /* Load IP address from ctx - 8 */ - insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8); + /* Load IP address from ctx - 16 */ + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -16); new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, 1); if (!new_prog) @@ -13330,7 +13816,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); - else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + else if (base_type(regs[i].type) == PTR_TO_MEM) { const u32 mem_size = regs[i].mem_size; mark_reg_known_zero(env, regs, i); @@ -13918,13 +14404,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr) log->ubuf = (char __user *) (unsigned long) attr->log_buf; log->len_total = attr->log_size; - ret = -EINVAL; /* log attributes have to be sane */ - if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || - !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) + if (!bpf_verifier_log_attr_valid(log)) { + ret = -EINVAL; goto err_unlock; + } } + mark_verifier_state_clean(env); + if (IS_ERR(btf_vmlinux)) { /* Either gcc or pahole or kernel are broken. 
*/ verbose(env, "in-kernel BTF is malformed\n"); diff --git a/kernel/capability.c b/kernel/capability.c index 46a361dde042..765194f5d678 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -360,6 +360,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } +EXPORT_SYMBOL(has_capability_noaudit); static bool ns_capable_common(struct user_namespace *ns, int cap, diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index bfbeabc17a9d..6e36e854b512 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -65,6 +65,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) return container_of(kfc, struct cgroup_fs_context, kfc); } +struct cgroup_pidlist; + +struct cgroup_file_ctx { + struct cgroup_namespace *ns; + + struct { + void *trigger; + } psi; + + struct { + bool started; + struct css_task_iter iter; + } procs; + + struct { + struct cgroup_pidlist *pidlist; + } procs1; +}; + /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 81c9e0685948..afc6c0e9c966 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -394,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * next pid to display, if any */ struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -403,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) mutex_lock(&cgrp->pidlist_mutex); /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. + * !NULL @ctx->procs1.pidlist indicates that this isn't the first + * start() after open. If the matching pidlist is around, we can use + * that. Look for it. Note that @ctx->procs1.pidlist can't be used + * directly. It could already have been destroyed. */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); + if (ctx->procs1.pidlist) + ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. 
*/ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); + if (!ctx->procs1.pidlist) { + ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } - l = of->priv; + l = ctx->procs1.pidlist; if (pid) { int end = l->length; @@ -449,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, @@ -460,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* @@ -504,10 +506,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, goto out_unlock; /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. + * Even if we're attaching all tasks in the thread group, we only need + * to check permissions on one of them. Check permissions using the + * credentials from file open to protect against inherited fd attacks. */ - cred = current_cred(); + cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && @@ -543,9 +546,19 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; + struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. + */ + ctx = of->priv; + if ((ctx->ns->user_ns != &init_user_ns) || + !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -951,6 +964,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); + /* + * Release agent gets called with all capabilities, + * require capabilities to set release agent. 
+ */ + if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) + return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 919194de39c8..adb820e98f24 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -30,6 +30,7 @@ #include "cgroup-internal.h" +#include <linux/bpf-cgroup.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/init_task.h> @@ -1301,7 +1302,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct cgroup *root_cgrp = kf_root->kn->priv; + struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv; return root_cgrp->root; } @@ -2024,7 +2025,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) ret = PTR_ERR(root->kf_root); goto exit_root_id; } - root_cgrp->kn = root->kf_root->kn; + root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); @@ -2650,11 +2651,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - if (!list_empty(&src_cset->mg_preload_node)) return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); @@ -3630,6 +3631,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { + struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; @@ -3641,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); + /* Allow only one trigger per file descriptor */ + if (ctx->psi.trigger) { + cgroup_put(cgrp); + return -EBUSY; + } + psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : &cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { @@ -3648,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&of->priv, new); - + smp_store_release(&ctx->psi.trigger, new); cgroup_put(cgrp); return nbytes; @@ -3679,12 +3686,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { - return psi_trigger_poll(&of->priv, of->file, pt); + struct cgroup_file_ctx *ctx = of->priv; + + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { - psi_trigger_replace(&of->priv, NULL); + struct cgroup_file_ctx *ctx = of->priv; + + psi_trigger_destroy(ctx->psi.trigger); } bool cgroup_psi_enabled(void) @@ -3811,24 +3822,43 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx; + int ret; - if (cft->open) - return cft->open(of); - return 0; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); + of->priv = ctx; + + if (!cft->open) + return 0; + + ret = cft->open(of); + if (ret) { + put_cgroup_ns(ctx->ns); + kfree(ctx); + } + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); + put_cgroup_ns(ctx->ns); + kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; @@ -3845,7 +3875,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) @@ -4751,21 +4781,21 @@ void css_task_iter_end(struct css_task_iter *it) static void cgroup_procs_release(struct kernfs_open_file *of) { - if (of->priv) { - css_task_iter_end(of->priv); - kfree(of->priv); - } + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->procs.started) + css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; - return css_task_iter_next(it); + return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, @@ -4773,21 +4803,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. 
*/ - if (!it) { + if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); - - it = kzalloc(sizeof(*it), GFP_KERNEL); - if (!it) - return ERR_PTR(-ENOMEM); - of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); + ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); @@ -4838,9 +4865,9 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb) + struct super_block *sb, + struct cgroup_namespace *ns) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; int ret; @@ -4869,11 +4896,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb, bool threadgroup) + struct super_block *sb, bool threadgroup, + struct cgroup_namespace *ns) { int ret = 0; - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; @@ -4890,8 +4918,10 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp, static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; + const struct cred *saved_cred; ssize_t ret; bool locked; @@ -4909,9 +4939,16 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - /* process and thread migrations follow same delegation rule */ + /* + * Process and thread migrations follow same delegation rule. Check + * permissions using the credentials from file open to protect against + * inherited fd attacks. + */ + saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, threadgroup); + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); + revert_creds(saved_cred); if (ret) goto out_finish; @@ -5711,7 +5748,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; - css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); + css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); @@ -6129,8 +6166,23 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) if (ret) goto err; + /* + * Spawning a task directly into a cgroup works by passing a file + * descriptor to the target cgroup directory. This can even be an O_PATH + * file descriptor. But it can never be a cgroup.procs file descriptor. + * This was done on purpose so spawning into a cgroup could be + * conceptualized as an atomic + * + * fd = openat(dfd_cgroup, "cgroup.procs", ...); + * write(fd, <child-pid>, ...); + * + * sequence, i.e. it's a shorthand for the caller opening and writing + * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us + * to always use the caller's credentials. 
+ */ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, - !(kargs->flags & CLONE_THREAD)); + !(kargs->flags & CLONE_THREAD), + current->nsproxy->cgroup_ns); if (ret) goto err; @@ -6191,6 +6243,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) /** * cgroup_can_fork - called on a new task before the process is exposed * @child: the child process + * @kargs: the arguments passed to create the child process * * This prepares a new css_set for the child process which the child will * be attached to in cgroup_post_fork(). @@ -6253,6 +6306,7 @@ void cgroup_cancel_fork(struct task_struct *child, /** * cgroup_post_fork - finalize cgroup setup for the child process * @child: the child process + * @kargs: the arguments passed to create the child process * * Attach the child process to its css_set calling the subsystem fork() * callbacks. diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d0e163a02099..9390bfd9f1cd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* * There could be abnormal cpuset configurations for cpu or memory - * node binding, add this key to provide a quick low-cost judgement + * node binding, add this key to provide a quick low-cost judgment * of the situation. */ DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key); @@ -591,6 +591,35 @@ static inline void free_cpuset(struct cpuset *cs) } /* + * validate_change_legacy() - Validate conditions specific to legacy (v1) + * behavior. + */ +static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial) +{ + struct cgroup_subsys_state *css; + struct cpuset *c, *par; + int ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* Each of our child cpusets must be a subset of us */ + ret = -EBUSY; + cpuset_for_each_child(c, css, cur) + if (!is_cpuset_subset(c, trial)) + goto out; + + /* On legacy hierarchy, we must be a subset of our parent cpuset. */ + ret = -EACCES; + par = parent_cs(cur); + if (par && !is_cpuset_subset(trial, par)) + goto out; + + ret = 0; +out: + return ret; +} + +/* * validate_change() - Used to validate that any proposed cpuset change * follows the structural rules for cpusets. * @@ -614,28 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup_subsys_state *css; struct cpuset *c, *par; - int ret; + int ret = 0; rcu_read_lock(); - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; + if (!is_in_v2_mode()) + ret = validate_change_legacy(cur, trial); + if (ret) + goto out; /* Remaining checks don't apply to root cpuset */ - ret = 0; if (cur == &top_cpuset) goto out; par = parent_cs(cur); - /* On legacy hierarchy, we must be a subset of our parent cpuset. 
*/ - ret = -EACCES; - if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) - goto out; - /* * If either I or some sibling (!= me) is exclusive, we can't * overlap @@ -811,7 +833,7 @@ static int generate_sched_domains(cpumask_var_t **domains, update_domain_attr_tree(dattr, &top_cpuset); } cpumask_and(doms[0], top_cpuset.effective_cpus, - housekeeping_cpumask(HK_FLAG_DOMAIN)); + housekeeping_cpumask(HK_TYPE_DOMAIN)); goto done; } @@ -841,7 +863,7 @@ static int generate_sched_domains(cpumask_var_t **domains, if (!cpumask_empty(cp->cpus_allowed) && !(is_sched_load_balance(cp) && cpumask_intersects(cp->cpus_allowed, - housekeeping_cpumask(HK_FLAG_DOMAIN)))) + housekeeping_cpumask(HK_TYPE_DOMAIN)))) continue; if (root_load_balance && @@ -930,7 +952,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); - cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN)); + cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN)); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -1159,7 +1181,7 @@ enum subparts_cmd { * effective_cpus. The function will return 0 if all the CPUs listed in * cpus_allowed can be granted or an error code will be returned. * - * For partcmd_disable, the cpuset is being transofrmed from a partition + * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. Any CPUs in cpus_allowed that are in * parent's subparts_cpus will be taken away from that cpumask and put back * into parent's effective_cpus. 0 should always be returned. @@ -1183,9 +1205,7 @@ enum subparts_cmd { * * Because of the implicit cpu exclusive nature of a partition root, * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). The validate_change() - * function will also prevent any changes to the cpu list if it is not - * a superset of children's cpu lists. + * permitted when checked by validate_change(). */ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, struct cpumask *newmask, @@ -1530,10 +1550,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, struct cpuset *sibling; struct cgroup_subsys_state *pos_css; + percpu_rwsem_assert_held(&cpuset_rwsem); + /* * Check all its siblings and call update_cpumasks_hier() * if their use_parent_ecpus flag is set in order for them * to use the right effective_cpus value. + * + * The update_cpumasks_hier() function may sleep. So we have to + * release the RCU read lock before calling it. */ rcu_read_lock(); cpuset_for_each_child(sibling, pos_css, parent) { @@ -1541,8 +1566,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; if (!sibling->use_parent_ecpus) continue; + if (!css_tryget_online(&sibling->css)) + continue; + rcu_read_unlock(); update_cpumasks_hier(sibling, tmp); + rcu_read_lock(); + css_put(&sibling->css); } rcu_read_unlock(); } @@ -1615,8 +1645,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Make sure that subparts_cpus is a subset of cpus_allowed. 
*/ if (cs->nr_subparts_cpus) { - cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus, - cs->cpus_allowed); + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); } spin_unlock_irq(&callback_lock); @@ -1998,7 +2027,7 @@ out: } /* - * update_prstate - update partititon_root_state + * update_prstate - update partition_root_state * cs: the cpuset to update * new_prs: new partition root state * @@ -2260,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) cgroup_taskset_first(tset, &css); cs = css_cs(css); + cpus_read_lock(); percpu_down_write(&cpuset_rwsem); guarantee_online_mems(cs, &cpuset_attach_nodemask_to); @@ -2313,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset) wake_up(&cpuset_attach_wq); percpu_up_write(&cpuset_rwsem); + cpus_read_unlock(); } /* The various types of files and directories in a cpuset file system */ @@ -2848,7 +2879,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is * set. This flag handling is implemented in cgroup core for - * histrical reasons - the flag may be specified during mount. + * historical reasons - the flag may be specified during mount. * * Currently, if any sibling cpusets have exclusive cpus or mem, we * refuse to clone the configuration - thereby refusing the task to @@ -3045,7 +3076,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migratecd to an ancestor. + * as the tasks will be migrated to an ancestor. */ if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) update_tasks_cpumask(cs); @@ -3493,8 +3524,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) return cs; } -/** - * cpuset_node_allowed - Can we allocate on a memory node? +/* + * __cpuset_node_allowed - Can we allocate on a memory node? * @node: is this an allowed node? * @gfp_mask: memory allocation flags * @@ -3536,7 +3567,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ - int allowed; /* is allocation in zone z allowed? */ + bool allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) @@ -3665,8 +3696,8 @@ void cpuset_print_current_mems_allowed(void) int cpuset_memory_pressure_enabled __read_mostly; -/** - * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. +/* + * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. * * Keep a running average of the rate of synchronous (direct) * page reclaim efforts initiated by tasks in each cpuset. @@ -3681,7 +3712,7 @@ int cpuset_memory_pressure_enabled __read_mostly; * "memory_pressure". Value displayed is an integer * representing the recent rate of entry into the synchronous * (direct) page reclaim by any task attached to the cpuset. 
- **/ + */ void __cpuset_memory_pressure_bump(void) { diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 3984dd6b8ddb..617861a54793 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -1,4 +1,4 @@ -//SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> #include <linux/sched.h> #include <linux/sched/task.h> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1486768f2318..24b5c2ab5598 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. */ - if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) + if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; raw_spin_lock_irqsave(cpu_lock, flags); @@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; + struct cgroup *parent; if (pos == root) return NULL; @@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * We're gonna walk down to the first leaf and visit/remove it. We * can pick whatever unvisited node as the starting point. */ - if (!pos) + if (!pos) { pos = root; - else + /* return NULL if this subtree is not on-list */ + if (!cgroup_rstat_cpu(pos, cpu)->updated_next) + return NULL; + } else { pos = cgroup_parent(pos); + } /* walk down to the first leaf */ while (true) { @@ -115,33 +120,25 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ - if (rstatc->updated_next) { - struct cgroup *parent = cgroup_parent(pos); - - if (parent) { - struct cgroup_rstat_cpu *prstatc; - struct cgroup **nextp; - - prstatc = cgroup_rstat_cpu(parent, cpu); - nextp = &prstatc->updated_children; - while (true) { - struct cgroup_rstat_cpu *nrstatc; - - nrstatc = cgroup_rstat_cpu(*nextp, cpu); - if (*nextp == pos) - break; - WARN_ON_ONCE(*nextp == parent); - nextp = &nrstatc->updated_next; - } - *nextp = rstatc->updated_next; - } + parent = cgroup_parent(pos); + if (parent) { + struct cgroup_rstat_cpu *prstatc; + struct cgroup **nextp; - rstatc->updated_next = NULL; - return pos; + prstatc = cgroup_rstat_cpu(parent, cpu); + nextp = &prstatc->updated_children; + while (*nextp != pos) { + struct cgroup_rstat_cpu *nrstatc; + + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; + } + *nextp = rstatc->updated_next; } - /* only happens for @root */ - return NULL; + rstatc->updated_next = NULL; + return pos; } /* see cgroup_rstat_flush() */ @@ -156,8 +153,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; + unsigned long flags; - raw_spin_lock(cpu_lock); + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. 
+ */ + raw_spin_lock_irqsave(cpu_lock, flags); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { struct cgroup_subsys_state *css; @@ -169,7 +175,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock(cpu_lock); + raw_spin_unlock_irqrestore(cpu_lock, flags); /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || @@ -318,7 +324,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_base_stat cur, delta; + struct cgroup_base_stat delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ @@ -328,11 +334,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); - cur.cputime = rstatc->bstat.cputime; + delta = rstatc->bstat; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); /* propagate percpu delta to global */ - delta = cur; cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index eb0029c9a6a6..e400fbbc8aba 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -1,5 +1,5 @@ # KEEP ALPHABETICALLY SORTED -# CONFIG_AIO is not set +# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_LEGACY_PTYS is not set diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config new file mode 100644 index 000000000000..e8db8d938661 --- /dev/null +++ b/kernel/configs/debug.config @@ -0,0 +1,106 @@ +# The config is based on running daily CI for enterprise Linux distros to +# seek regressions on linux-next builds on different bare-metal and virtual +# platforms. It can be used for example, +# +# $ make ARCH=arm64 defconfig debug.config +# +# Keep alphabetically sorted inside each section. 
+# +# printk and dmesg options +# +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DYNAMIC_DEBUG=y +CONFIG_PRINTK_CALLER=y +CONFIG_PRINTK_TIME=y +CONFIG_SYMBOLIC_ERRNAME=y +# +# Compile-time checks and compiler options +# +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y +CONFIG_DEBUG_SECTION_MISMATCH=y +CONFIG_FRAME_WARN=2048 +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +# +# Generic Kernel Debugging Instruments +# +# CONFIG_UBSAN_ALIGNMENT is not set +# CONFIG_UBSAN_DIV_ZERO is not set +# CONFIG_UBSAN_TRAP is not set +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_FS_ALLOW_ALL=y +CONFIG_DEBUG_IRQFLAGS=y +CONFIG_UBSAN=y +CONFIG_UBSAN_BOOL=y +CONFIG_UBSAN_BOUNDS=y +CONFIG_UBSAN_ENUM=y +CONFIG_UBSAN_SHIFT=y +CONFIG_UBSAN_UNREACHABLE=y +# +# Memory Debugging +# +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_WX is not set +# CONFIG_KFENCE is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_SLUB_STATS is not set +CONFIG_PAGE_EXTENSION=y +CONFIG_PAGE_OWNER=y +CONFIG_DEBUG_KMEMLEAK=y +CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y +CONFIG_DEBUG_OBJECTS=y +CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 +CONFIG_DEBUG_OBJECTS_FREE=y +CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y +CONFIG_DEBUG_OBJECTS_RCU_HEAD=y +CONFIG_DEBUG_OBJECTS_TIMERS=y +CONFIG_DEBUG_OBJECTS_WORK=y +CONFIG_DEBUG_PER_CPU_MAPS=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_VIRTUAL=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_PGFLAGS=y +CONFIG_DEBUG_VM_RB=y +CONFIG_DEBUG_VM_VMACACHE=y +CONFIG_GENERIC_PTDUMP=y +CONFIG_KASAN=y +CONFIG_KASAN_GENERIC=y +CONFIG_KASAN_INLINE=y +CONFIG_KASAN_VMALLOC=y +CONFIG_PTDUMP_DEBUGFS=y +CONFIG_SCHED_STACK_END_CHECK=y +CONFIG_SLUB_DEBUG_ON=y +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_DEBUG_ATOMIC_SLEEP=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_PANIC_TIMEOUT=0 +CONFIG_SOFTLOCKUP_DETECTOR=y +# +# Lock Debugging (spinlocks, mutexes, etc...) 
+# +# CONFIG_PROVE_RAW_LOCK_NESTING is not set +CONFIG_PROVE_LOCKING=y +# +# Debug kernel data structures +# +CONFIG_BUG_ON_DATA_CORRUPTION=y +# +# RCU Debugging +# +CONFIG_PROVE_RCU=y +CONFIG_PROVE_RCU_LIST=y +# +# Tracers +# +CONFIG_BRANCH_PROFILE_NONE=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y diff --git a/kernel/cpu.c b/kernel/cpu.c index 192e43a87407..d0a9aa0b42e8 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -31,8 +31,10 @@ #include <linux/smpboot.h> #include <linux/relay.h> #include <linux/slab.h> +#include <linux/scs.h> #include <linux/percpu-rwsem.h> #include <linux/cpuset.h> +#include <linux/random.h> #include <trace/events/power.h> #define CREATE_TRACE_POINTS @@ -69,7 +71,6 @@ struct cpuhp_cpu_state { bool rollback; bool single; bool bringup; - int cpu; struct hlist_node *node; struct hlist_node *last; enum cpuhp_state cb_state; @@ -473,7 +474,7 @@ static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } #endif static inline enum cpuhp_state -cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) +cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; bool bringup = st->state < target; @@ -484,14 +485,15 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) st->target = target; st->single = false; st->bringup = bringup; - if (cpu_dying(st->cpu) != !bringup) - set_cpu_dying(st->cpu, !bringup); + if (cpu_dying(cpu) != !bringup) + set_cpu_dying(cpu, !bringup); return prev_state; } static inline void -cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) +cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state prev_state) { bool bringup = !st->bringup; @@ -518,8 +520,8 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) } st->bringup = bringup; - if (cpu_dying(st->cpu) != !bringup) - set_cpu_dying(st->cpu, !bringup); + if (cpu_dying(cpu) != !bringup) + set_cpu_dying(cpu, !bringup); } /* Regular hotplug invocation of the AP hotplug thread */ @@ -539,15 +541,16 @@ static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st) wait_for_ap_thread(st, st->bringup); } -static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target) +static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st, + enum cpuhp_state target) { enum cpuhp_state prev_state; int ret; - prev_state = cpuhp_set_state(st, target); + prev_state = cpuhp_set_state(cpu, st, target); __cpuhp_kick_ap(st); if ((ret = st->result)) { - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } @@ -579,7 +582,7 @@ static int bringup_wait_for_ap(unsigned int cpu) if (st->target <= CPUHP_AP_ONLINE_IDLE) return 0; - return cpuhp_kick_ap(st, st->target); + return cpuhp_kick_ap(cpu, st, st->target); } static int bringup_cpu(unsigned int cpu) @@ -588,6 +591,12 @@ static int bringup_cpu(unsigned int cpu) int ret; /* + * Reset stale stack state from the last time this CPU was online. + */ + scs_task_reset(idle); + kasan_unpoison_task_stack(idle); + + /* * Some architectures have to walk the irq descriptors to * setup the vector space for the cpu which comes online. * Prevent irq alloc/free across the bringup. 
@@ -696,7 +705,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret, cpu, cpuhp_get_step(st->state)->name, st->state); - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); if (can_rollback_cpu(st)) WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, prev_state)); @@ -713,7 +722,6 @@ static void cpuhp_create(unsigned int cpu) init_completion(&st->done_up); init_completion(&st->done_down); - st->cpu = cpu; } static int cpuhp_should_run(unsigned int cpu) @@ -867,7 +875,7 @@ static int cpuhp_kick_ap_work(unsigned int cpu) cpuhp_lock_release(true); trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work); - ret = cpuhp_kick_ap(st, st->target); + ret = cpuhp_kick_ap(cpu, st, st->target); trace_cpuhp_exit(cpu, st->state, prev_state, ret); return ret; @@ -1099,7 +1107,7 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, ret, cpu, cpuhp_get_step(st->state)->name, st->state); - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); if (st->state < prev_state) WARN_ON(cpuhp_invoke_callback_range(true, cpu, st, @@ -1126,7 +1134,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, cpuhp_tasks_frozen = tasks_frozen; - prev_state = cpuhp_set_state(st, target); + prev_state = cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread. @@ -1157,7 +1165,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, ret = cpuhp_down_callbacks(cpu, st, target); if (ret && st->state < prev_state) { if (st->state == CPUHP_TEARDOWN_CPU) { - cpuhp_reset_state(st, prev_state); + cpuhp_reset_state(cpu, st, prev_state); __cpuhp_kick_ap(st); } else { WARN(1, "DEAD callback error for CPU%d", cpu); @@ -1344,7 +1352,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) cpuhp_tasks_frozen = tasks_frozen; - cpuhp_set_state(st, target); + cpuhp_set_state(cpu, st, target); /* * If the current CPU state is in the range of the AP hotplug thread, * then we need to kick the thread once more. 
@@ -1481,8 +1489,8 @@ int freeze_secondary_cpus(int primary) cpu_maps_update_begin(); if (primary == -1) { primary = cpumask_first(cpu_online_mask); - if (!housekeeping_cpu(primary, HK_FLAG_TIMER)) - primary = housekeeping_any_cpu(HK_FLAG_TIMER); + if (!housekeeping_cpu(primary, HK_TYPE_TIMER)) + primary = housekeeping_any_cpu(HK_TYPE_TIMER); } else { if (!cpu_online(primary)) primary = cpumask_first(cpu_online_mask); @@ -1652,6 +1660,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = perf_event_init_cpu, .teardown.single = perf_event_exit_cpu, }, + [CPUHP_RANDOM_PREPARE] = { + .name = "random:prepare", + .startup.single = random_prepare_cpu, + .teardown.single = NULL, + }, [CPUHP_WORKQUEUE_PREP] = { .name = "workqueue:prepare", .startup.single = workqueue_prepare_cpu, @@ -1775,6 +1788,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { .startup.single = workqueue_online_cpu, .teardown.single = workqueue_offline_cpu, }, + [CPUHP_AP_RANDOM_ONLINE] = { + .name = "random:online", + .startup.single = random_online_cpu, + .teardown.single = NULL, + }, [CPUHP_AP_RCUTREE_ONLINE] = { .name = "RCU/tree:online", .startup.single = rcutree_online_cpu, diff --git a/kernel/crash_core.c b/kernel/crash_core.c index eb53f5ec62c9..256cf6db573c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -6,6 +6,7 @@ #include <linux/buildid.h> #include <linux/crash_core.h> +#include <linux/init.h> #include <linux/utsname.h> #include <linux/vmalloc.h> @@ -295,6 +296,16 @@ int __init parse_crashkernel_low(char *cmdline, "crashkernel=", suffix_tbl[SUFFIX_LOW]); } +/* + * Add a dummy early_param handler to mark crashkernel= as a known command line + * parameter and suppress incorrect warnings in init/main.c. + */ +static int __init parse_crashkernel_dummy(char *arg) +{ + return 0; +} +early_param("crashkernel", parse_crashkernel_dummy); + Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, void *data, size_t data_len) { diff --git a/kernel/cred.c b/kernel/cred.c index 473d17c431f3..e10c15f51c1f 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -665,21 +665,16 @@ EXPORT_SYMBOL(cred_fscmp); int set_cred_ucounts(struct cred *new) { - struct task_struct *task = current; - const struct cred *old = task->real_cred; struct ucounts *new_ucounts, *old_ucounts = new->ucounts; - if (new->user == old->user && new->user_ns == old->user_ns) - return 0; - /* * This optimization is needed because alloc_ucounts() uses locks * for table lookups. 
*/ - if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) + if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid)) return 0; - if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid))) + if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid))) return -EAGAIN; new->ucounts = new_ucounts; @@ -875,7 +870,7 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, /* * report use of invalid credentials */ -void __invalid_creds(const struct cred *cred, const char *file, unsigned line) +void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line) { printk(KERN_ERR "CRED: Invalid credentials\n"); printk(KERN_ERR "CRED: At %s:%u\n", file, line); diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index df2bface866e..85cb51c4a17e 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -291,7 +291,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size) */ int kdb_putarea_size(unsigned long addr, void *res, size_t size) { - int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size); + int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size); if (ret) { if (!KDB_STATE(SUPPRESS)) { kdb_func_printf("Bad address 0x%lx\n", addr); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 51530d5b15a8..c5e8cea9e05f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -100,19 +100,10 @@ void __delayacct_blkio_start(void) */ void __delayacct_blkio_end(struct task_struct *p) { - struct task_delay_info *delays = p->delays; - u64 *total; - u32 *count; - - if (p->delays->flags & DELAYACCT_PF_SWAPIN) { - total = &delays->swapin_delay; - count = &delays->swapin_count; - } else { - total = &delays->blkio_delay; - count = &delays->blkio_count; - } - - delayacct_end(&delays->lock, &delays->blkio_start, total, count); + delayacct_end(&p->delays->lock, + &p->delays->blkio_start, + &p->delays->blkio_delay, + &p->delays->blkio_count); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -164,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; + tmp = d->compact_delay_total + tsk->delays->compact_delay; + d->compact_delay_total = (tmp < d->compact_delay_total) ? 
0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; + d->compact_count += tsk->delays->compact_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -179,8 +173,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) unsigned long flags; raw_spin_lock_irqsave(&tsk->delays->lock, flags); - ret = nsec_to_clock_t(tsk->delays->blkio_delay + - tsk->delays->swapin_delay); + ret = nsec_to_clock_t(tsk->delays->blkio_delay); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } @@ -210,3 +203,29 @@ void __delayacct_thrashing_end(void) &current->delays->thrashing_delay, &current->delays->thrashing_count); } + +void __delayacct_swapin_start(void) +{ + current->delays->swapin_start = local_clock(); +} + +void __delayacct_swapin_end(void) +{ + delayacct_end(&current->delays->lock, + &current->delays->swapin_start, + &current->delays->swapin_delay, + &current->delays->swapin_count); +} + +void __delayacct_compact_start(void) +{ + current->delays->compact_start = local_clock(); +} + +void __delayacct_compact_end(void) +{ + delayacct_end(&current->delays->lock, + &current->delays->compact_start, + &current->delays->compact_delay, + &current->delays->compact_count); +} diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 1b02179758cb..56866aaa2ae1 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -110,15 +110,10 @@ config DMA_GLOBAL_POOL select DMA_DECLARE_COHERENT bool -config DMA_REMAP - bool - depends on MMU - select DMA_NONCOHERENT_MMAP - config DMA_DIRECT_REMAP bool - select DMA_REMAP select DMA_COHERENT_POOL + select DMA_NONCOHERENT_MMAP config DMA_CMA bool "DMA Contiguous Memory Allocator" diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index 0dd65ec1d234..21926e46ef4f 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -8,5 +8,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o -obj-$(CONFIG_DMA_REMAP) += remap.o +obj-$(CONFIG_MMU) += remap.o obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 3d63d91cba5c..6ea80ae42622 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = { static int __init rmem_cma_setup(struct reserved_mem *rmem) { - phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; @@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; - if ((rmem->base & mask) || (rmem->size & mask)) { + if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) { pr_err("Reserved memory: incorrect alignment of CMA region\n"); return -EINVAL; } diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 7a14ca29c377..f8ff598596b8 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -927,7 +927,7 @@ static __init int dma_debug_cmdline(char *str) global_disable = true; } - return 0; + return 1; } static __init int dma_debug_entries_cmdline(char *str) @@ -936,7 +936,7 @@ static __init int dma_debug_entries_cmdline(char *str) return -EINVAL; if (!get_option(&str, &nr_prealloc_entries)) nr_prealloc_entries =
PREALLOC_DMA_DEBUG_ENTRIES; - return 0; + return 1; } __setup("dma_debug=", dma_debug_cmdline); diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 4c6c5e0635e3..9743c6ccce1a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -75,15 +75,45 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit); } +static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size) +{ + if (!force_dma_unencrypted(dev)) + return 0; + return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size)); +} + +static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size) +{ + int ret; + + if (!force_dma_unencrypted(dev)) + return 0; + ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size)); + if (ret) + pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n"); + return ret; +} + static void __dma_direct_free_pages(struct device *dev, struct page *page, size_t size) { - if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && - swiotlb_free(dev, page, size)) + if (swiotlb_free(dev, page, size)) return; dma_free_contiguous(dev, page, size); } +static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size) +{ + struct page *page = swiotlb_alloc(dev, size); + + if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { + swiotlb_free(dev, page, size); + return NULL; + } + + return page; +} + static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, gfp_t gfp) { @@ -93,18 +123,11 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, WARN_ON_ONCE(!PAGE_ALIGNED(size)); + if (is_swiotlb_for_alloc(dev)) + return dma_direct_alloc_swiotlb(dev, size); + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_limit); - if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) && - is_swiotlb_for_alloc(dev)) { - page = swiotlb_alloc(dev, size); - if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { - __dma_direct_free_pages(dev, page, size); - return NULL; - } - return page; - } - page = dma_alloc_contiguous(dev, size, gfp); if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { dma_free_contiguous(dev, page, size); @@ -133,6 +156,15 @@ again: return page; } +/* + * Check if a potentially blocking operation needs to dip into the atomic + * pools for the given device/gfp.
+ */ +static bool dma_direct_use_pool(struct device *dev, gfp_t gfp) +{ + return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev); +} + static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { @@ -140,6 +172,9 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, u64 phys_mask; void *ret; + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL))) + return NULL; + gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, &phys_mask); page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok); @@ -149,54 +184,81 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size, return ret; } +static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + struct page *page; + + page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); + if (!page) + return NULL; + + /* remove any dirty cache lines on the kernel alias */ + if (!PageHighMem(page)) + arch_dma_prep_coherent(page, size); + + /* return the page pointer as the opaque cookie */ + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); + return page; +} + void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { + bool remap = false, set_uncached = false; struct page *page; void *ret; - int err; size = PAGE_ALIGN(size); if (attrs & DMA_ATTR_NO_WARN) gfp |= __GFP_NOWARN; if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) && - !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) { - page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO); - if (!page) - return NULL; - /* remove any dirty cache lines on the kernel alias */ - if (!PageHighMem(page)) - arch_dma_prep_coherent(page, size); - *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); - /* return the page pointer as the opaque cookie */ - return page; - } + !force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) + return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp); - if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && - !dev_is_dma_coherent(dev) && - !is_swiotlb_for_alloc(dev)) - return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); + if (!dev_is_dma_coherent(dev)) { + /* + * Fallback to the arch handler if it exists. This should + * eventually go away. + */ + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && + !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && + !IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && + !is_swiotlb_for_alloc(dev)) + return arch_dma_alloc(dev, size, dma_handle, gfp, + attrs); - if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) && - !dev_is_dma_coherent(dev)) - return dma_alloc_from_global_coherent(dev, size, dma_handle); + /* + * If there is a global pool, always allocate from it for + * non-coherent devices. + */ + if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL)) + return dma_alloc_from_global_coherent(dev, size, + dma_handle); + + /* + * Otherwise remap if the architecture is asking for it. But + * given that remapping memory is a blocking operation we'll + * instead have to dip into the atomic pools. + */ + remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP); + if (remap) { + if (dma_direct_use_pool(dev, gfp)) + return dma_direct_alloc_from_pool(dev, size, + dma_handle, gfp); + } else { + if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED)) + return NULL; + set_uncached = true; + } + } /* - * Remapping or decrypting memory may block. 
If either is required and - we can't block, allocate the memory from the atomic pools. - * If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must - * set up another device coherent pool by shared-dma-pool and use - * dma_alloc_from_dev_coherent instead. + * Decrypting memory may block, so allocate the memory from the atomic + * pools if we can't block. */ - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - !gfpflags_allow_blocking(gfp) && - (force_dma_unencrypted(dev) || - (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev))) && - !is_swiotlb_for_alloc(dev)) + if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); /* we always manually zero the memory once we are done */ @@ -204,68 +266,51 @@ void *dma_direct_alloc(struct device *dev, size_t size, if (!page) return NULL; - if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && - !dev_is_dma_coherent(dev)) || - (IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) { + /* + * dma_alloc_contiguous can return highmem pages depending on a + * combination of the cma= arguments and per-arch setup. These need to be + * remapped to return a kernel virtual address. + */ + if (PageHighMem(page)) { + remap = true; + set_uncached = false; + } + + if (remap) { + pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); + + if (force_dma_unencrypted(dev)) + prot = pgprot_decrypted(prot); + /* remove any dirty cache lines on the kernel alias */ arch_dma_prep_coherent(page, size); /* create a coherent mapping */ - ret = dma_common_contiguous_remap(page, size, - dma_pgprot(dev, PAGE_KERNEL, attrs), + ret = dma_common_contiguous_remap(page, size, prot, __builtin_return_address(0)); if (!ret) goto out_free_pages; - if (force_dma_unencrypted(dev)) { - err = set_memory_decrypted((unsigned long)ret, - 1 << get_order(size)); - if (err) - goto out_free_pages; - } - memset(ret, 0, size); - goto done; - } - - if (PageHighMem(page)) { - /* - * Depending on the cma= arguments and per-arch setup - * dma_alloc_contiguous could return highmem pages. - * Without remapping there is no way to return them here, - * so log an error and fail.
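
The restructured dma_direct_alloc() above collapses the old tangle of IS_ENABLED() checks into two flags, remap and set_uncached, picked up front for non-coherent devices and overridden once the page is known, because a highmem page can only be handed back through a remap. A minimal userspace model of how the two flags interact (all names hypothetical, the kernel predicates reduced to plain booleans; the arch fallback, global pool and encryption paths are deliberately left out):

    #include <stdbool.h>
    #include <stdio.h>

    struct model {
        bool coherent;         /* dev_is_dma_coherent() */
        bool can_remap;        /* CONFIG_DMA_DIRECT_REMAP */
        bool can_set_uncached; /* CONFIG_ARCH_HAS_DMA_SET_UNCACHED */
        bool blocking;         /* gfpflags_allow_blocking(gfp) */
        bool highmem_page;     /* PageHighMem() on the allocated page */
    };

    static const char *pick_path(struct model m)
    {
        bool remap = false, set_uncached = false;

        if (!m.coherent) {
            remap = m.can_remap;
            if (remap && !m.blocking)
                return "atomic pool";   /* remapping would block */
            if (!remap) {
                if (!m.can_set_uncached)
                    return "fail (NULL)";
                set_uncached = true;
            }
        }
        if (m.highmem_page) {           /* override after allocation */
            remap = true;
            set_uncached = false;
        }
        if (remap)
            return "remap";
        return set_uncached ? "set_uncached" : "page_address";
    }

    int main(void)
    {
        struct model m = { .coherent = false, .can_remap = true,
                           .blocking = true, .highmem_page = false };
        printf("%s\n", pick_path(m));   /* -> "remap" */
        return 0;
    }
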
- */ - dev_info(dev, "Rejecting highmem page from CMA.\n"); - goto out_free_pages; - } - - ret = page_address(page); - if (force_dma_unencrypted(dev)) { - err = set_memory_decrypted((unsigned long)ret, - 1 << get_order(size)); - if (err) + } else { + ret = page_address(page); + if (dma_set_decrypted(dev, ret, size)) goto out_free_pages; } memset(ret, 0, size); - if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) && - !dev_is_dma_coherent(dev)) { + if (set_uncached) { arch_dma_prep_coherent(page, size); ret = arch_dma_set_uncached(ret, size); if (IS_ERR(ret)) goto out_encrypt_pages; } -done: + *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return ret; out_encrypt_pages: - if (force_dma_unencrypted(dev)) { - err = set_memory_encrypted((unsigned long)page_address(page), - 1 << get_order(size)); - /* If memory cannot be re-encrypted, it must be leaked */ - if (err) - return NULL; - } + if (dma_set_encrypted(dev, page_address(page), size)) + return NULL; out_free_pages: __dma_direct_free_pages(dev, page, size); return NULL; @@ -304,13 +349,14 @@ void dma_direct_free(struct device *dev, size_t size, dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size))) return; - if (force_dma_unencrypted(dev)) - set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); - - if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) + if (is_vmalloc_addr(cpu_addr)) { vunmap(cpu_addr); - else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) - arch_dma_clear_uncached(cpu_addr, size); + } else { + if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED)) + arch_dma_clear_uncached(cpu_addr, size); + if (dma_set_encrypted(dev, cpu_addr, 1 << page_order)) + return; + } __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size); } @@ -321,9 +367,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, struct page *page; void *ret; - if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) && - force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) && - !is_swiotlb_for_alloc(dev)) + if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp)) return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp); page = __dma_direct_alloc_pages(dev, size, gfp); @@ -341,11 +385,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size, } ret = page_address(page); - if (force_dma_unencrypted(dev)) { - if (set_memory_decrypted((unsigned long)ret, - 1 << get_order(size))) - goto out_free_pages; - } + if (dma_set_decrypted(dev, ret, size)) + goto out_free_pages; memset(ret, 0, size); *dma_handle = phys_to_dma_direct(dev, page_to_phys(page)); return page; @@ -366,9 +407,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, dma_free_from_pool(dev, vaddr, size)) return; - if (force_dma_unencrypted(dev)) - set_memory_encrypted((unsigned long)vaddr, 1 << page_order); - + if (dma_set_encrypted(dev, vaddr, 1 << page_order)) + return; __dma_direct_free_pages(dev, page, size); } @@ -499,6 +539,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, int ret = -ENXIO; vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs); + if (force_dma_unencrypted(dev)) + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret)) return ret; diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 4632b0f4f72e..8a6cd53dbe8c 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -114,6 +114,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, 
dma_direct_sync_single_for_cpu(dev, addr, size, dir); if (unlikely(is_swiotlb_buffer(dev, phys))) - swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); + swiotlb_tbl_unmap_single(dev, phys, size, dir, + attrs | DMA_ATTR_SKIP_CPU_SYNC); } #endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 9b9af1bd6be3..0520a8f4fb1d 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -11,6 +11,7 @@ #include <linux/dma-mapping.h> #include <linux/kernel.h> #include <linux/kthread.h> +#include <linux/map_benchmark.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/pci.h> @@ -18,30 +19,6 @@ #include <linux/slab.h> #include <linux/timekeeping.h> -#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) -#define DMA_MAP_MAX_THREADS 1024 -#define DMA_MAP_MAX_SECONDS 300 -#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC) - -#define DMA_MAP_BIDIRECTIONAL 0 -#define DMA_MAP_TO_DEVICE 1 -#define DMA_MAP_FROM_DEVICE 2 - -struct map_benchmark { - __u64 avg_map_100ns; /* average map latency in 100ns */ - __u64 map_stddev; /* standard deviation of map latency */ - __u64 avg_unmap_100ns; /* as above */ - __u64 unmap_stddev; - __u32 threads; /* how many threads will do map/unmap in parallel */ - __u32 seconds; /* how long the test will last */ - __s32 node; /* which numa node this benchmark will run on */ - __u32 dma_bits; /* DMA addressing capability */ - __u32 dma_dir; /* DMA data direction */ - __u32 dma_trans_ns; /* time for DMA transmission in ns */ - __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */ - __u8 expansion[76]; /* For future use */ -}; - struct map_benchmark_data { struct map_benchmark bparam; struct device *dev; diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9478eccd1c8e..db7244291b74 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -407,8 +407,6 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs); */ pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs) { - if (force_dma_unencrypted(dev)) - prot = pgprot_decrypted(prot); if (dev_is_dma_coherent(dev)) return prot; #ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE @@ -745,7 +743,6 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask) { /* @@ -761,7 +758,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) return 0; } EXPORT_SYMBOL(dma_set_coherent_mask); -#endif size_t dma_max_mapping_size(struct device *dev) { diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5f84e6cdb78e..4d40dcce7604 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void) GFP_KERNEL); if (!atomic_pool_kernel) ret = -ENOMEM; - if (IS_ENABLED(CONFIG_ZONE_DMA)) { + if (has_managed_dma()) { atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA); if (!atomic_pool_dma) @@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) if (prev == NULL) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) return atomic_pool_dma32; - if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + if (atomic_pool_dma && (gfp & GFP_DMA)) return atomic_pool_dma; return atomic_pool_kernel; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 8e840fbbed7c..73a41cec9e38 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -21,39 +21,33 @@ #define pr_fmt(fmt) 
"software IO TLB: " fmt #include <linux/cache.h> +#include <linux/cc_platform.h> +#include <linux/ctype.h> +#include <linux/debugfs.h> #include <linux/dma-direct.h> #include <linux/dma-map-ops.h> -#include <linux/mm.h> #include <linux/export.h> +#include <linux/gfp.h> +#include <linux/highmem.h> +#include <linux/io.h> +#include <linux/iommu-helper.h> +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/mm.h> +#include <linux/pfn.h> +#include <linux/scatterlist.h> +#include <linux/set_memory.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/swiotlb.h> -#include <linux/pfn.h> #include <linux/types.h> -#include <linux/ctype.h> -#include <linux/highmem.h> -#include <linux/gfp.h> -#include <linux/scatterlist.h> -#include <linux/cc_platform.h> -#include <linux/set_memory.h> -#ifdef CONFIG_DEBUG_FS -#include <linux/debugfs.h> -#endif #ifdef CONFIG_DMA_RESTRICTED_POOL -#include <linux/io.h> #include <linux/of.h> #include <linux/of_fdt.h> #include <linux/of_reserved_mem.h> #include <linux/slab.h> #endif -#include <asm/io.h> -#include <asm/dma.h> - -#include <linux/init.h> -#include <linux/memblock.h> -#include <linux/iommu-helper.h> - #define CREATE_TRACE_POINTS #include <trace/events/swiotlb.h> @@ -72,6 +66,8 @@ enum swiotlb_force swiotlb_force; struct io_tlb_mem io_tlb_default_mem; +phys_addr_t swiotlb_unencrypted_base; + /* * Max segment that we can provide which (if pages are contiguous) will * not be bounced (unless SWIOTLB_FORCE is set). @@ -156,6 +152,34 @@ static inline unsigned long nr_slots(u64 val) } /* + * Remap swiotlb memory in the unencrypted physical address space + * when swiotlb_unencrypted_base is set (e.g. for Hyper-V AMD SEV-SNP + * Isolation VMs). + */ +#ifdef CONFIG_HAS_IOMEM +static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) +{ + void *vaddr = NULL; + + if (swiotlb_unencrypted_base) { + phys_addr_t paddr = mem->start + swiotlb_unencrypted_base; + + vaddr = memremap(paddr, bytes, MEMREMAP_WB); + if (!vaddr) + pr_err("Failed to map the unencrypted memory %pa size %lx.\n", + &paddr, bytes); + } + + return vaddr; +} +#else +static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes) +{ + return NULL; +} +#endif + +/* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to * call SWIOTLB when the operations are possible. It needs to be called @@ -172,7 +196,10 @@ void __init swiotlb_update_mem_attributes(void) vaddr = phys_to_virt(mem->start); bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT); set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT); - memset(vaddr, 0, bytes); + + mem->vaddr = swiotlb_mem_remap(mem, bytes); + if (!mem->vaddr) + mem->vaddr = vaddr; } static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, @@ -196,7 +223,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start, mem->slots[i].orig_addr = INVALID_PHYS_ADDR; mem->slots[i].alloc_size = 0; } + + /* + * If swiotlb_unencrypted_base is set, the bounce buffer memory will + * be remapped and cleared in swiotlb_update_mem_attributes.
+ */ + if (swiotlb_unencrypted_base) + return; + memset(vaddr, 0, bytes); + mem->vaddr = vaddr; + return; } int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) @@ -371,7 +408,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size phys_addr_t orig_addr = mem->slots[index].orig_addr; size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); - unsigned char *vaddr = phys_to_virt(tlb_addr); + unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; unsigned int tlb_offset, orig_addr_offset; if (orig_addr == INVALID_PHYS_ADDR) @@ -581,9 +618,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, for (i = 0; i < nr_slots(alloc_size + offset); i++) mem->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(mem->start, index) + offset; - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) - swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); + /* + * When dir == DMA_FROM_DEVICE we could omit the copy from the orig + * to the tlb buffer, if we knew for sure the device will + * overwrite the entire current content. But we don't. Thus + * unconditional bounce may prevent leaking swiotlb content (i.e. + * kernel memory) to user-space. + */ + swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE); return tlb_addr; } @@ -712,47 +754,29 @@ bool is_swiotlb_active(struct device *dev) } EXPORT_SYMBOL_GPL(is_swiotlb_active); -#ifdef CONFIG_DEBUG_FS -static struct dentry *debugfs_dir; - -static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem) +static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem, + const char *dirname) { + mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs); + if (!mem->nslabs) + return; + debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs); debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used); } -static int __init swiotlb_create_default_debugfs(void) +static int __init __maybe_unused swiotlb_create_default_debugfs(void) { - struct io_tlb_mem *mem = &io_tlb_default_mem; - - debugfs_dir = debugfs_create_dir("swiotlb", NULL); - if (mem->nslabs) { - mem->debugfs = debugfs_dir; - swiotlb_create_debugfs_files(mem); - } + swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb"); return 0; } +#ifdef CONFIG_DEBUG_FS late_initcall(swiotlb_create_default_debugfs); - #endif #ifdef CONFIG_DMA_RESTRICTED_POOL -#ifdef CONFIG_DEBUG_FS -static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) -{ - struct io_tlb_mem *mem = rmem->priv; - - mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir); - swiotlb_create_debugfs_files(mem); -} -#else -static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem) -{ -} -#endif - struct page *swiotlb_alloc(struct device *dev, size_t size) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; @@ -799,8 +823,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, if (!mem) return -ENOMEM; - mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs), - GFP_KERNEL); + mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL); if (!mem->slots) { kfree(mem); return -ENOMEM; } @@ -814,7 +837,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem, rmem->priv = mem; - rmem_swiotlb_debugfs_init(rmem); + swiotlb_create_debugfs_files(mem, rmem->name); } dev->dma_io_tlb_mem = mem; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index d5a61d565ad5..93c3b86e781c 100644
--- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,7 +2,9 @@ #include <linux/context_tracking.h> #include <linux/entry-common.h> +#include <linux/resume_user_mode.h> #include <linux/highmem.h> +#include <linux/jump_label.h> #include <linux/livepatch.h> #include <linux/audit.h> #include <linux/tick.h> @@ -58,7 +60,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Handle ptrace */ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { - ret = arch_syscall_enter_tracehook(regs); + ret = ptrace_report_syscall_entry(regs); if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } @@ -138,15 +140,7 @@ void noinstr exit_to_user_mode(void) } /* Workaround to allow gradual conversion of architecture code */ -void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } - -static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) -{ - if (ti_work & _TIF_NOTIFY_SIGNAL) - tracehook_notify_signal(); - - arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); -} +void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) @@ -169,10 +163,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, klp_update_patch_state(current); if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) - handle_signal_work(regs, ti_work); + arch_do_signal_or_restart(regs); if (ti_work & _TIF_NOTIFY_RESUME) - tracehook_notify_resume(regs); + resume_user_mode_work(regs); /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work); @@ -187,7 +181,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, /* Check if any of the above work has queued a deferred wakeup */ tick_nohz_user_enter_prepare(); - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); } /* Return the latest work state for arch_exit_to_user_mode() */ @@ -196,7 +190,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, static void exit_to_user_mode_prepare(struct pt_regs *regs) { - unsigned long ti_work = READ_ONCE(current_thread_info()->flags); + unsigned long ti_work = read_thread_flags(); lockdep_assert_irqs_disabled(); @@ -252,7 +246,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work) step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) - arch_syscall_exit_tracehook(regs, step); + ptrace_report_syscall_exit(regs, step); } /* @@ -380,7 +374,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) return ret; } -void irqentry_exit_cond_resched(void) +void raw_irqentry_exit_cond_resched(void) { if (!preempt_count()) { /* Sanity check RCU and thread stack */ @@ -392,7 +386,17 @@ void irqentry_exit_cond_resched(void) } } #ifdef CONFIG_PREEMPT_DYNAMIC -DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); +#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched); +#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched); +void dynamic_irqentry_exit_cond_resched(void) +{ + if (!static_branch_unlikely(&sk_dynamic_irqentry_exit_cond_resched)) + return; + raw_irqentry_exit_cond_resched(); +} +#endif #endif noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) @@ -420,13 +424,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) } 
instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) { -#ifdef CONFIG_PREEMPT_DYNAMIC - static_call(irqentry_exit_cond_resched)(); -#else + if (IS_ENABLED(CONFIG_PREEMPTION)) irqentry_exit_cond_resched(); -#endif - } + /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index 49972ee99aff..9d09f489b60e 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -8,8 +8,11 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) do { int ret; - if (ti_work & _TIF_NOTIFY_SIGNAL) - tracehook_notify_signal(); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) { + clear_notify_signal(); + if (task_work_pending(current)) + task_work_run(); + } if (ti_work & _TIF_SIGPENDING) { kvm_handle_signal_exit(vcpu); @@ -20,13 +23,13 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) schedule(); if (ti_work & _TIF_NOTIFY_RESUME) - tracehook_notify_resume(NULL); + resume_user_mode_work(NULL); ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work); if (ret) return ret; - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); return 0; } @@ -43,7 +46,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu) * disabled in the inner loop before going into guest mode. No need * to disable interrupts here. */ - ti_work = READ_ONCE(current_thread_info()->flags); + ti_work = read_thread_flags(); if (!(ti_work & XFER_TO_GUEST_MODE_WORK)) return 0; diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index 4508201847d2..0b6379adff6b 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -48,7 +48,7 @@ bool syscall_user_dispatch(struct pt_regs *regs) * the selector is loaded by userspace. 
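
The selector tested below is a single byte in user memory, reread on every syscall entry; userspace installs it with prctl(2). A minimal sketch of the registration side (assuming a v5.11+ kernel and uapi headers exposing the PR_SET_SYSCALL_USER_DISPATCH constants; error handling trimmed):

    #include <stdio.h>
    #include <sys/prctl.h>
    #include <linux/prctl.h>   /* PR_SET_SYSCALL_USER_DISPATCH et al. */

    static volatile char selector = SYSCALL_DISPATCH_FILTER_ALLOW;

    int main(void)
    {
        /*
         * Register the selector. The always-allowed region is left empty
         * here (offset 0, length 0); a real dispatcher would pass the
         * range holding its SIGSYS handler and trampoline.
         */
        if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
                  0, 0, &selector))
            perror("prctl");

        /*
         * Flipping the byte to SYSCALL_DISPATCH_FILTER_BLOCK makes later
         * syscalls from outside the allowed region raise SIGSYS. The
         * kernel rereads the byte with __get_user() on each entry; that
         * load is the one in the hunk below, whose failure case the patch
         * switches from force_fatal_sig() to force_exit_sig().
         */
        return 0;
    }
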
*/ if (unlikely(__get_user(state, sd->selector))) { - force_fatal_sig(SIGSEGV); + force_exit_sig(SIGSEGV); return true; } @@ -56,7 +56,7 @@ bool syscall_user_dispatch(struct pt_regs *regs) return false; if (state != SYSCALL_DISPATCH_FILTER_BLOCK) { - force_fatal_sig(SIGSYS); + force_exit_sig(SIGSYS); return true; } } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 58cbe357fb2b..1273be84392c 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, } if (regs) { - mm_segment_t fs; - if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); - fs = force_uaccess_begin(); perf_callchain_user(&ctx, regs); - force_uaccess_end(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index 523106a506ee..23bb19716ad3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -574,8 +574,7 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, enum event_type_t event_type); static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); + enum event_type_t event_type); static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); @@ -674,6 +673,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state) WRITE_ONCE(event->state, state); } +/* + * UP store-release, load-acquire + */ + +#define __store_release(ptr, val) \ +do { \ + barrier(); \ + WRITE_ONCE(*(ptr), (val)); \ +} while (0) + +#define __load_acquire(ptr) \ +({ \ + __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ + barrier(); \ + ___p; \ +}) + #ifdef CONFIG_CGROUP_PERF static inline bool @@ -719,35 +735,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) return t->time; } -static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { - struct perf_cgroup_info *info; - u64 now; - - now = perf_clock(); + struct perf_cgroup_info *t; - info = this_cpu_ptr(cgrp->info); + t = per_cpu_ptr(event->cgrp->info, event->cpu); + if (!__load_acquire(&t->active)) + return t->time; + now += READ_ONCE(t->timeoffset); + return now; +} - info->time += now - info->timestamp; +static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) +{ + if (adv) + info->time += now - info->timestamp; info->timestamp = now; + /* + * see update_context_time() + */ + WRITE_ONCE(info->timeoffset, info->time - info->timestamp); } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) { struct perf_cgroup *cgrp = cpuctx->cgrp; struct cgroup_subsys_state *css; + struct perf_cgroup_info *info; if (cgrp) { + u64 now = perf_clock(); + for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); - __update_cgrp_time(cgrp); + info = this_cpu_ptr(cgrp->info); + + __update_cgrp_time(info, now, true); + if (final) + __store_release(&info->active, 0); } } } static inline void update_cgrp_time_from_event(struct perf_event *event) { - struct perf_cgroup *cgrp; + struct perf_cgroup_info *info; /* * ensure we access cgroup data only when needed and @@ -756,19 +788,19 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if 
(!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current, event->ctx); + info = this_cpu_ptr(event->cgrp->info); /* * Do not update time when cgroup is not active */ - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - __update_cgrp_time(event->cgrp); + if (info->active) + __update_cgrp_time(info, perf_clock(), true); } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { - struct perf_cgroup *cgrp; + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_cgroup *cgrp = cpuctx->cgrp; struct perf_cgroup_info *info; struct cgroup_subsys_state *css; @@ -777,32 +809,28 @@ perf_cgroup_set_timestamp(struct task_struct *task, * ensure we do not access cgroup data * unless we have the cgroup pinned (css_get) */ - if (!task || !ctx->nr_cgroups) + if (!cgrp) return; - cgrp = perf_cgroup_from_task(task, ctx); + WARN_ON_ONCE(!ctx->nr_cgroups); for (css = &cgrp->css; css; css = css->parent) { cgrp = container_of(css, struct perf_cgroup, css); info = this_cpu_ptr(cgrp->info); - info->timestamp = ctx->timestamp; + __update_cgrp_time(info, ctx->timestamp, false); + __store_release(&info->active, 1); } } static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list); -#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ -#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ - /* * reschedule events based on the cgroup constraint of task. - * - * mode SWOUT : schedule out everything - * mode SWIN : schedule in based on cgroup for next */ -static void perf_cgroup_switch(struct task_struct *task, int mode) +static void perf_cgroup_switch(struct task_struct *task) { - struct perf_cpu_context *cpuctx; + struct perf_cgroup *cgrp; + struct perf_cpu_context *cpuctx, *tmp; struct list_head *list; unsigned long flags; @@ -812,35 +840,31 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) */ local_irq_save(flags); + cgrp = perf_cgroup_from_task(task, NULL); + list = this_cpu_ptr(&cgrp_cpuctx_list); - list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) { + list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) { WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); + if (READ_ONCE(cpuctx->cgrp) == cgrp) + continue; perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - if (mode & PERF_CGROUP_SWOUT) { - cpu_ctx_sched_out(cpuctx, EVENT_ALL); - /* - * must not be done before ctxswout due - * to event_filter_match() in event_sched_out() - */ - cpuctx->cgrp = NULL; - } + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to update_cgrp_time_from_cpuctx() in + * ctx_sched_out() + */ + cpuctx->cgrp = cgrp; + /* + * set cgrp before ctxsw in to allow + * perf_cgroup_set_timestamp() in ctx_sched_in() + * to not have to pass task around + */ + cpu_ctx_sched_in(cpuctx, EVENT_ALL); - if (mode & PERF_CGROUP_SWIN) { - WARN_ON_ONCE(cpuctx->cgrp); - /* - * set cgrp before ctxsw in to allow - * event_filter_match() to not have to pass - * task around - * we pass the cpuctx->ctx to perf_cgroup_from_task() - * because cgorup events are only per-cpu - */ - cpuctx->cgrp = perf_cgroup_from_task(task, - &cpuctx->ctx); - cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); - } perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } @@ -848,58 +872,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) local_irq_restore(flags); } -static inline void 
perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - rcu_read_lock(); - /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock - */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(next, NULL); - - /* - * only schedule out current cgroup events if we know - * that we are switching to a different cgroup. Otherwise, - * do no touch the cgroup events. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWOUT); - - rcu_read_unlock(); -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cgroup *cgrp1; - struct perf_cgroup *cgrp2 = NULL; - - rcu_read_lock(); - /* - * we come here when we know perf_cgroup_events > 0 - * we do not need to pass the ctx here because we know - * we are holding the rcu lock - */ - cgrp1 = perf_cgroup_from_task(task, NULL); - cgrp2 = perf_cgroup_from_task(prev, NULL); - - /* - * only need to schedule in cgroup events if we are changing - * cgroup during ctxsw. Cgroup events were not scheduled - * out of ctxsw out if that was not the case. - */ - if (cgrp1 != cgrp2) - perf_cgroup_switch(task, PERF_CGROUP_SWIN); - - rcu_read_unlock(); -} - static int perf_cgroup_ensure_storage(struct perf_event *event, struct cgroup_subsys_state *css) { @@ -982,14 +954,6 @@ out: } static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) -{ - struct perf_cgroup_info *t; - t = per_cpu_ptr(event->cgrp->info, event->cpu); - event->shadow_ctx_time = now - t->timestamp; -} - -static inline void perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx; @@ -1003,22 +967,10 @@ perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ct */ cpuctx = container_of(ctx, struct perf_cpu_context, ctx); - /* - * Since setting cpuctx->cgrp is conditional on the current @cgrp - * matching the event's cgroup, we must do this for every new event, - * because if the first would mismatch, the second would not try again - * and we would leave cpuctx->cgrp unset. 
- */ - if (ctx->is_active && !cpuctx->cgrp) { - struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx); - - if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) - cpuctx->cgrp = cgrp; - } - if (ctx->nr_cgroups++) return; + cpuctx->cgrp = perf_cgroup_from_task(current, ctx); list_add(&cpuctx->cgrp_cpuctx_entry, per_cpu_ptr(&cgrp_cpuctx_list, event->cpu)); } @@ -1040,9 +992,7 @@ perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *c if (--ctx->nr_cgroups) return; - if (ctx->is_active && cpuctx->cgrp) - cpuctx->cgrp = NULL; - + cpuctx->cgrp = NULL; list_del(&cpuctx->cgrp_cpuctx_entry); } @@ -1066,17 +1016,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) { } -static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) -{ -} - -static inline void perf_cgroup_sched_out(struct task_struct *task, - struct task_struct *next) -{ -} - -static inline void perf_cgroup_sched_in(struct task_struct *prev, - struct task_struct *task) +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, + bool final) { } @@ -1088,22 +1029,16 @@ static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, } static inline void -perf_cgroup_set_timestamp(struct task_struct *task, - struct perf_event_context *ctx) -{ -} - -static inline void -perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) { } -static inline void -perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +static inline u64 perf_cgroup_event_time(struct perf_event *event) { + return 0; } -static inline u64 perf_cgroup_event_time(struct perf_event *event) +static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) { return 0; } @@ -1117,6 +1052,10 @@ static inline void perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx) { } + +static void perf_cgroup_switch(struct task_struct *task) +{ +} #endif /* @@ -1525,22 +1464,59 @@ static void perf_unpin_context(struct perf_event_context *ctx) /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_event_context *ctx) +static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); - ctx->time += now - ctx->timestamp; + if (adv) + ctx->time += now - ctx->timestamp; ctx->timestamp = now; + + /* + * The above: time' = time + (now - timestamp), can be re-arranged + * into: time' = now + (time - timestamp), which gives a single + * offset value to compute future time without taking locks. + * + * See perf_event_time_now(), which can be used from NMI context where + * it's (obviously) not possible to acquire ctx->lock in order to read + * both the above values in a consistent manner. + */ + WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); +} + +static void update_context_time(struct perf_event_context *ctx) +{ + __update_context_time(ctx, true); } static u64 perf_event_time(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; + if (unlikely(!ctx)) + return 0; + if (is_cgroup_event(event)) return perf_cgroup_event_time(event); - return ctx ?
ctx->time : 0; + return ctx->time; +} + +static u64 perf_event_time_now(struct perf_event *event, u64 now) +{ + struct perf_event_context *ctx = event->ctx; + + if (unlikely(!ctx)) + return 0; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time_now(event, now); + + if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) + return ctx->time; + + now += READ_ONCE(ctx->timeoffset); + return now; } static enum event_type_t get_event_type(struct perf_event *event) @@ -1808,6 +1784,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1999,6 +1977,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; ctx->nr_events--; + if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) + ctx->nr_user--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2346,7 +2326,7 @@ __perf_remove_from_context(struct perf_event *event, if (ctx->is_active & EVENT_TIME) { update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, false); } event_sched_out(event, cpuctx, ctx); @@ -2357,6 +2337,9 @@ __perf_remove_from_context(struct perf_event *event, list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { + if (ctx == &cpuctx->ctx) + update_cgrp_time_from_cpuctx(cpuctx, true); + ctx->is_active = 0; ctx->rotate_necessary = 0; if (ctx->task) { @@ -2388,7 +2371,11 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla * event_function_call() user. */ raw_spin_lock_irq(&ctx->lock); - if (!ctx->is_active) { + /* + * Cgroup events are per-cpu events, and must IPI because of + * cgrp_cpuctx_list. + */ + if (!ctx->is_active && !is_cgroup_event(event)) { __perf_remove_from_context(event, __get_cpu_context(ctx), ctx, (void *)flags); raw_spin_unlock_irq(&ctx->lock); @@ -2478,40 +2465,6 @@ void perf_event_disable_inatomic(struct perf_event *event) irq_work_queue(&event->pending); } -static void perf_set_shadow_time(struct perf_event *event, - struct perf_event_context *ctx) -{ - /* - * use the correct time source for the time snapshot - * - * We could get by without this by leveraging the - * fact that to get to this function, the caller - * has most likely already called update_context_time() - * and update_cgrp_time_xx() and thus both timestamp - * are identical (or very close). Given that tstamp is, - * already adjusted for cgroup, we could say that: - * tstamp - ctx->timestamp - * is equivalent to - * tstamp - cgrp->timestamp. - * - * Then, in perf_output_read(), the calculation would - * work with no changes because: - * - event is guaranteed scheduled in - * - no scheduled out in between - * - thus the timestamp would be the same - * - * But this is a bit hairy. - * - * So instead, we have an explicit cgroup call to remain - * within the time source all along. We believe it - * is cleaner and simpler to understand. 
- */ - if (is_cgroup_event(event)) - perf_cgroup_set_shadow_time(event, event->tstamp); - else - event->shadow_ctx_time = event->tstamp - ctx->timestamp; -} - #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -2552,8 +2505,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - perf_set_shadow_time(event, ctx); - perf_log_itrace_start(event); if (event->pmu->add(event, PERF_EF_START)) { @@ -2671,8 +2622,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task); + enum event_type_t event_type); static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, @@ -2688,15 +2638,14 @@ static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, } static void perf_event_sched_in(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, - struct task_struct *task) + struct perf_event_context *ctx) { - cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task); + cpu_ctx_sched_in(cpuctx, EVENT_PINNED); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task); - cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_PINNED); + cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); + ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); } /* @@ -2746,7 +2695,7 @@ static void ctx_resched(struct perf_cpu_context *cpuctx, else if (ctx_event_type & EVENT_PINNED) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx); perf_pmu_enable(cpuctx->ctx.pmu); } @@ -2857,11 +2806,14 @@ perf_install_in_context(struct perf_event_context *ctx, * perf_event_attr::disabled events will not run and can be initialized * without IPI. Except when this is the first event for the context, in * that case we need the magic of the IPI to set ctx->is_active. + * Similarly, cgroup events for the context also need the IPI to + * manipulate the cgrp_cpuctx_list. * * The IOC_ENABLE that is sure to follow the creation of a disabled * event will issue the IPI and reprogram the hardware. */ - if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) { + if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && + ctx->nr_events && !is_cgroup_event(event)) { raw_spin_lock_irq(&ctx->lock); if (ctx->task == TASK_TOMBSTONE) { raw_spin_unlock_irq(&ctx->lock); @@ -2966,7 +2918,7 @@ static void __perf_event_enable(struct perf_event *event, return; if (!event_filter_match(event)) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); return; } @@ -2975,7 +2927,7 @@ static void __perf_event_enable(struct perf_event *event, * then don't put it on unless the group is on. */ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); return; } @@ -3193,6 +3145,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp, return err; } +/* + * Copy event-type-independent attributes that may be modified.
+ */ +static void perf_event_modify_copy_attr(struct perf_event_attr *to, + const struct perf_event_attr *from) +{ + to->sig_data = from->sig_data; +} + static int perf_event_modify_attr(struct perf_event *event, struct perf_event_attr *attr) { @@ -3215,10 +3176,17 @@ static int perf_event_modify_attr(struct perf_event *event, WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->child_mutex); + /* + * Event-type-independent attributes must be copied before event-type + * modification, which will validate that final attributes match the + * source attributes after all relevant attributes have been copied. + */ + perf_event_modify_copy_attr(&event->attr, attr); err = func(event, attr); if (err) goto out; list_for_each_entry(child, &event->child_list, child_list) { + perf_event_modify_copy_attr(&child->attr, attr); err = func(child, attr); if (err) goto out; @@ -3247,16 +3215,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, return; } - ctx->is_active &= ~event_type; - if (!(ctx->is_active & EVENT_ALL)) - ctx->is_active = 0; - - if (ctx->task) { - WARN_ON_ONCE(cpuctx->task_ctx != ctx); - if (!ctx->is_active) - cpuctx->task_ctx = NULL; - } - /* * Always update time if it was set; not only when it changes. * Otherwise we can 'forget' to update time for any but the last @@ -3270,7 +3228,22 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (is_active & EVENT_TIME) { /* update (and stop) ctx time */ update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); + update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; } is_active ^= ctx->is_active; /* changed bits */ @@ -3602,7 +3575,7 @@ void __perf_event_task_sched_out(struct task_struct *task, * cgroup event are system-wide mode only */ if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_out(task, next); + perf_cgroup_switch(next); } /* @@ -3707,13 +3680,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, return 0; } +/* + * Because the userpage is strictly per-event (there is no concept of context, + * so there cannot be a context indirection), every userpage must be updated + * when context time starts :-( + * + * IOW, we must not miss EVENT_TIME edges. 
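
The reordered hunks above are what make the lockless time read work: the writer updates ctx->time/ctx->timestamp and publishes ctx->timeoffset before the EVENT_TIME bit of ->is_active becomes visible, with barrier() acting as a UP-only store-release/load-acquire pair (the readers are NMIs on the same CPU, so no SMP ordering is needed). A userspace analogue of the same pattern using C11 atomics for the SMP-flavoured version (all names hypothetical):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t timeoffset;   /* time - timestamp, may wrap */
    static _Atomic int active;

    /* Writer: publish the offset, then release-store the active flag. */
    static void ctx_time_start(uint64_t time, uint64_t now)
    {
        timeoffset = time - now;
        atomic_store_explicit(&active, 1, memory_order_release);
    }

    /* Reader (think NMI): acquire-load the flag, then use the offset. */
    static uint64_t event_time_now(uint64_t frozen_time, uint64_t now)
    {
        if (!atomic_load_explicit(&active, memory_order_acquire))
            return frozen_time;   /* clock is stopped */
        return now + timeoffset;  /* time' = now + (time - timestamp) */
    }

    int main(void)
    {
        ctx_time_start(100, 1000);
        printf("%llu\n",
               (unsigned long long)event_time_now(100, 1250)); /* 350 */
        return 0;
    }

Unsigned wraparound makes the single-offset trick safe even though timeoffset is "negative" here: 1250 + (100 - 1000) is exactly 100 + (1250 - 1000).
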
+ */ static inline bool event_update_userpage(struct perf_event *event) { if (likely(!atomic_read(&event->mmap_count))) return false; perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_update_userpage(event); return true; @@ -3793,17 +3772,26 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) + enum event_type_t event_type) { int is_active = ctx->is_active; - u64 now; lockdep_assert_held(&ctx->lock); if (likely(!ctx->nr_events)) return; + if (is_active ^ EVENT_TIME) { + /* start ctx time */ + __update_context_time(ctx, false); + perf_cgroup_set_timestamp(cpuctx); + /* + * CPU-release for the below ->is_active store, + * see __load_acquire() in perf_event_time_now() + */ + barrier(); + } + ctx->is_active |= (event_type | EVENT_TIME); if (ctx->task) { if (!is_active) @@ -3814,13 +3802,6 @@ ctx_sched_in(struct perf_event_context *ctx, is_active ^= ctx->is_active; /* changed bits */ - if (is_active & EVENT_TIME) { - /* start ctx time */ - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); - } - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -3834,12 +3815,11 @@ ctx_sched_in(struct perf_event_context *ctx, } static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, - enum event_type_t event_type, - struct task_struct *task) + enum event_type_t event_type) { struct perf_event_context *ctx = &cpuctx->ctx; - ctx_sched_in(ctx, cpuctx, event_type, task); + ctx_sched_in(ctx, cpuctx, event_type); } static void perf_event_context_sched_in(struct perf_event_context *ctx, @@ -3881,7 +3861,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, */ if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - perf_event_sched_in(cpuctx, ctx, task); + perf_event_sched_in(cpuctx, ctx); if (cpuctx->sched_cb_usage && pmu->sched_task) pmu->sched_task(cpuctx->task_ctx, true); @@ -3909,16 +3889,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; - /* - * If cgroup events exist on this CPU, then we need to check if we have - * to switch in PMU state; cgroup event are system-wide mode only. - * - * Since cgroup events are CPU events, we must schedule these in before - * we schedule in the task events. 
- */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); - for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -4192,7 +4162,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) if (cpu_event) rotate_ctx(&cpuctx->ctx, cpu_event); - perf_event_sched_in(cpuctx, task_ctx, current); + perf_event_sched_in(cpuctx, task_ctx); perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, cpuctx->task_ctx); @@ -4264,7 +4234,7 @@ static void perf_event_enable_on_exec(int ctxn) clone_ctx = unclone_ctx(ctx); ctx_resched(cpuctx, ctx, event_type); } else { - ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + ctx_sched_in(ctx, cpuctx, EVENT_TIME); } perf_ctx_unlock(cpuctx, ctx); @@ -4414,6 +4384,18 @@ static inline u64 perf_event_count(struct perf_event *event) return local64_read(&event->count) + atomic64_read(&event->child_count); } +static void calc_timer_values(struct perf_event *event, + u64 *now, + u64 *enabled, + u64 *running) +{ + u64 ctx_time; + + *now = perf_clock(); + ctx_time = perf_event_time_now(event, *now); + __perf_update_times(event, ctx_time, enabled, running); +} + /* * NMI-safe method to read a local event, that is an event that * is: @@ -4473,10 +4455,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value, *value = local64_read(&event->count); if (enabled || running) { - u64 now = event->shadow_ctx_time + perf_clock(); - u64 __enabled, __running; + u64 __enabled, __running, __now; - __perf_update_times(event, now, &__enabled, &__running); + calc_timer_values(event, &__now, &__enabled, &__running); if (enabled) *enabled = __enabled; if (running) @@ -5798,18 +5779,6 @@ static int perf_event_index(struct perf_event *event) return event->pmu->event_idx(event); } -static void calc_timer_values(struct perf_event *event, - u64 *now, - u64 *enabled, - u64 *running) -{ - u64 ctx_time; - - *now = perf_clock(); - ctx_time = event->shadow_ctx_time + *now; - __perf_update_times(event, ctx_time, enabled, running); -} - static void perf_event_init_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; @@ -5934,6 +5903,8 @@ static void ring_buffer_attach(struct perf_event *event, struct perf_buffer *old_rb = NULL; unsigned long flags; + WARN_ON_ONCE(event->parent); + if (event->rb) { /* * Should be impossible, we set this when removing @@ -5991,6 +5962,9 @@ static void ring_buffer_wakeup(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6004,6 +5978,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event) { struct perf_buffer *rb; + if (event->parent) + event = event->parent; + rcu_read_lock(); rb = rcu_dereference(event->rb); if (rb) { @@ -6349,7 +6326,6 @@ accounting: ring_buffer_attach(event, rb); perf_event_update_time(event); - perf_set_shadow_time(event, event->ctx); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { @@ -6521,26 +6497,43 @@ static void perf_pending_event(struct irq_work *entry) perf_swevent_put_recursion_context(rctx); } -/* - * We assume there is only KVM supporting the callbacks. - * Later on, we might change it to a list if there is - * another virtualization implementation supporting the callbacks.
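
The hunk that follows retires this single exported pointer: perf_guest_cbs becomes RCU-managed and the three hot-path queries turn into static calls that default to returning 0, patched to the hypervisor's callbacks on registration and back to __static_call_return0 on unregistration (with synchronize_rcu() letting in-flight NMI users drain). A userspace sketch of just the registration lifecycle, with plain function pointers standing in for the patched call sites (all names hypothetical; the real point of static calls, avoiding the indirect branch, is not captured here):

    #include <stdio.h>

    struct guest_cbs { unsigned int (*state)(void); };

    static unsigned int ret0(void) { return 0; }

    /* Stand-in for DEFINE_STATIC_CALL_RET0(): starts out returning 0. */
    static unsigned int (*guest_state)(void) = ret0;

    static void register_cbs(struct guest_cbs *cbs) { guest_state = cbs->state; }
    static void unregister_cbs(void)               { guest_state = ret0; }

    static unsigned int kvm_state(void) { return 3; }

    int main(void)
    {
        struct guest_cbs kvm = { .state = kvm_state };

        printf("%u\n", guest_state()); /* 0: no hypervisor registered */
        register_cbs(&kvm);
        printf("%u\n", guest_state()); /* 3 */
        unregister_cbs();
        printf("%u\n", guest_state()); /* 0 again */
        return 0;
    }
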
- */ -struct perf_guest_info_callbacks *perf_guest_cbs; +#ifdef CONFIG_GUEST_PERF_EVENTS +struct perf_guest_info_callbacks __rcu *perf_guest_cbs; -int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); +DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); +DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); + +void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = cbs; - return 0; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs))) + return; + + rcu_assign_pointer(perf_guest_cbs, cbs); + static_call_update(__perf_guest_state, cbs->state); + static_call_update(__perf_guest_get_ip, cbs->get_ip); + + /* Implementing ->handle_intel_pt_intr is optional. */ + if (cbs->handle_intel_pt_intr) + static_call_update(__perf_guest_handle_intel_pt_intr, + cbs->handle_intel_pt_intr); } EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); -int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) { - perf_guest_cbs = NULL; - return 0; + if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs)) + return; + + rcu_assign_pointer(perf_guest_cbs, NULL); + static_call_update(__perf_guest_state, (void *)&__static_call_return0); + static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); + static_call_update(__perf_guest_handle_intel_pt_intr, + (void *)&__static_call_return0); + synchronize_rcu(); } EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); +#endif static void perf_output_sample_regs(struct perf_output_handle *handle, @@ -6648,7 +6641,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, unsigned long sp; unsigned int rem; u64 dyn_size; - mm_segment_t fs; /* * We dump: @@ -6666,9 +6658,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, /* Data. */ sp = perf_user_stack_pointer(regs); - fs = force_uaccess_begin(); rem = __output_copy_user(handle, (void *) sp, dump_size); - force_uaccess_end(fs); dyn_size = dump_size - rem; perf_output_skip(handle, rem); @@ -6696,7 +6686,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event, if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id())) goto out; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) goto out; @@ -6762,7 +6752,7 @@ static void perf_aux_sample_output(struct perf_event *event, if (WARN_ON_ONCE(!sampler || !data->aux_size)) return; - rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler); + rb = ring_buffer_get(sampler); if (!rb) return; @@ -9759,6 +9749,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, continue; if (event->attr.config != entry->type) continue; + /* Cannot deliver synchronous signal to other task. 
*/ + if (event->attr.sigtrap) + continue; if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); } @@ -10430,8 +10423,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ if (state == IF_STATE_END) { ret = -EINVAL; - if (kernel && event->attr.exclude_kernel) - goto fail; /* * ACTION "filter" must have a non-zero length region @@ -10473,8 +10464,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, } /* ready to consume more filters */ + kfree(filename); + filename = NULL; state = IF_STATE_ACTION; filter = NULL; + kernel = 0; } } @@ -11536,6 +11530,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->state = PERF_EVENT_STATE_INACTIVE; + if (parent_event) + event->event_caps = parent_event->event_caps; + if (event->attr.sigtrap) atomic_set(&event->event_limit, 1); @@ -13463,7 +13460,7 @@ static int __perf_cgroup_move(void *info) { struct task_struct *task = info; rcu_read_lock(); - perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + perf_cgroup_switch(task); rcu_read_unlock(); return 0; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6357c3580d07..6418083901d4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,11 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; - struct page_vma_mapped_walk pvmw = { - .page = compound_head(old_page), - .vma = vma, - .address = addr, - }; + DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); int err; struct mmu_notifier_range range; @@ -173,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() and munlock_vma_page() below */ + /* For try_to_free_swap() below */ lock_page(old_page); mmu_notifier_invalidate_range_start(&range); @@ -201,13 +197,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(old_page, false); + page_remove_rmap(old_page, vma, false); if (!page_mapped(old_page)) try_to_free_swap(old_page); page_vma_mapped_walk_done(&pvmw); - - if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page)) - munlock_vma_page(old_page); put_page(old_page); err = 0; diff --git a/kernel/exit.c b/kernel/exit.c index f702a6a63686..f072959fcab7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -49,7 +49,8 @@ #include <linux/audit.h> /* for audit_free() */ #include <linux/resource.h> #include <linux/task_io_accounting_ops.h> -#include <linux/tracehook.h> +#include <linux/blkdev.h> +#include <linux/task_work.h> #include <linux/fs_struct.h> #include <linux/init_task.h> #include <linux/perf_event.h> @@ -64,6 +65,7 @@ #include <linux/compat.h> #include <linux/io_uring.h> #include <linux/kprobes.h> +#include <linux/rethook.h> #include <linux/uaccess.h> #include <asm/unistd.h> @@ -116,7 +118,7 @@ static void __exit_signal(struct task_struct *tsk) * then notify it: */ if (sig->notify_count > 0 && !--sig->notify_count) - wake_up_process(sig->group_exit_task); + wake_up_process(sig->group_exec_task); if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); @@ -169,6 +171,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); kprobe_flush_task(tsk); + rethook_flush_task(tsk); perf_event_delayed_put(tsk); 
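The perf_guest_cbs rework above replaces a bare function-pointer struct with an RCU-managed pointer plus static calls, so the hot path becomes a patched direct call instead of a pointer load and indirect branch. A minimal sketch of that registration pattern, using the same kernel APIs but illustrative names (my_ops, my_state and friends are not kernel symbols):

#include <linux/rcupdate.h>
#include <linux/static_call.h>

struct my_ops {
	unsigned int (*state)(void);
};

static struct my_ops __rcu *my_ops_ptr;

/* Behaves as "return 0" until a provider registers. */
DEFINE_STATIC_CALL_RET0(my_state, *my_ops_ptr->state);

void my_register(struct my_ops *ops)
{
	if (WARN_ON_ONCE(rcu_access_pointer(my_ops_ptr)))
		return;				/* single provider only */
	rcu_assign_pointer(my_ops_ptr, ops);
	static_call_update(my_state, ops->state);
}

void my_unregister(void)
{
	rcu_assign_pointer(my_ops_ptr, NULL);
	static_call_update(my_state, (void *)&__static_call_return0);
	synchronize_rcu();			/* wait out in-flight callers */
}

/* Hot path: no pointer chase, just a patched call. */
unsigned int my_query(void)
{
	return static_call(my_state)();
}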
trace_sched_process_free(tsk); put_task_struct(tsk); @@ -697,7 +700,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) - wake_up_process(tsk->signal->group_exit_task); + wake_up_process(tsk->signal->group_exec_task); write_unlock_irq(&tasklist_lock); list_for_each_entry_safe(p, n, &dead, ptrace_entry) { @@ -735,37 +738,8 @@ void __noreturn do_exit(long code) struct task_struct *tsk = current; int group_dead; - /* - * We can get here from a kernel oops, sometimes with preemption off. - * Start by checking for critical errors. - * Then fix up important state like USER_DS and preemption. - * Then do everything else. - */ - - WARN_ON(blk_needs_flush_plug(tsk)); - - if (unlikely(in_interrupt())) - panic("Aiee, killing interrupt handler!"); - if (unlikely(!tsk->pid)) - panic("Attempted to kill the idle task!"); - - /* - * If do_exit is called because this processes oopsed, it's possible - * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before - * continuing. Amongst other possible reasons, this is to prevent - * mm_release()->clear_child_tid() from writing to a user-controlled - * kernel address. - */ - force_uaccess_begin(); + WARN_ON(tsk->plug); - if (unlikely(in_atomic())) { - pr_info("note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); - preempt_count_set(PREEMPT_ENABLED); - } - - profile_task_exit(tsk); kcov_task_exit(tsk); coredump_task_exit(tsk); @@ -773,17 +747,6 @@ void __noreturn do_exit(long code) validate_creds_for_do_exit(tsk); - /* - * We're taking recursive faults here in do_exit. Safest is to just - * leave this task alone and wait for reboot. - */ - if (unlikely(tsk->flags & PF_EXITING)) { - pr_alert("Fixing recursive fault but reboot is needed!\n"); - futex_exit_recursive(tsk); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - } - io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ @@ -871,6 +834,7 @@ void __noreturn do_exit(long code) put_page(tsk->task_frag.page); validate_creds_for_do_exit(tsk); + exit_task_stack_account(tsk); check_stack_usage(); preempt_disable(); @@ -882,16 +846,46 @@ void __noreturn do_exit(long code) lockdep_free_task(tsk); do_task_dead(); } -EXPORT_SYMBOL_GPL(do_exit); -void complete_and_exit(struct completion *comp, long code) +void __noreturn make_task_dead(int signr) { - if (comp) - complete(comp); + /* + * Take the task off the cpu after something catastrophic has + * happened. + * + * We can get here from a kernel oops, sometimes with preemption off. + * Start by checking for critical errors. + * Then fix up important state like USER_DS and preemption. + * Then do everything else. + */ + struct task_struct *tsk = current; + + if (unlikely(in_interrupt())) + panic("Aiee, killing interrupt handler!"); + if (unlikely(!tsk->pid)) + panic("Attempted to kill the idle task!"); + + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } - do_exit(code); + /* + * We're taking recursive faults here in make_task_dead. Safest is to just + * leave this task alone and wait for reboot. 
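The do_exit() hunks above strip out the oops-only sanity checks; they move into make_task_dead(), which becomes the single entry point for taking the current task down after a catastrophic error. Call sites change shape roughly like this (hypothetical arch oops path, shown only to illustrate the split, not actual tree code):

/* Before: every oops path funneled into do_exit() and relied on it
 * to repair preempt_count, catch in_interrupt(), and so on. */
void oops_end_old(long signr)
{
	do_exit(signr);
}

/* After: fatal, possibly-inconsistent contexts call make_task_dead(),
 * which panics in interrupt/idle context, resets preempt_count and
 * parks a recursively faulting task; ordinary exits keep using
 * do_exit()/do_group_exit() without paying for those checks. */
void oops_end_new(int signr)
{
	make_task_dead(signr);
}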
+ */ + if (unlikely(tsk->flags & PF_EXITING)) { + pr_alert("Fixing recursive fault but reboot is needed!\n"); + futex_exit_recursive(tsk); + tsk->exit_state = EXIT_DEAD; + refcount_inc(&tsk->rcu_users); + do_task_dead(); + } + + do_exit(signr); } -EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) { @@ -902,22 +896,24 @@ SYSCALL_DEFINE1(exit, int, error_code) * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). */ -void +void __noreturn do_group_exit(int exit_code) { struct signal_struct *sig = current->signal; - BUG_ON(exit_code & 0x80); /* core dumps don't get here */ - - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); - if (signal_group_exit(sig)) + if (sig->flags & SIGNAL_GROUP_EXIT) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; + else if (sig->group_exec_task) + exit_code = 0; else { sig->group_exit_code = exit_code; sig->flags = SIGNAL_GROUP_EXIT; @@ -1012,7 +1008,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return 0; if (unlikely(wo->wo_flags & WNOWAIT)) { - status = p->exit_code; + status = (p->signal->flags & SIGNAL_GROUP_EXIT) + ? p->signal->group_exit_code : p->exit_code; get_task_struct(p); read_unlock(&tasklist_lock); sched_annotate_sleep(); diff --git a/kernel/extable.c b/kernel/extable.c index b6f330f0fe74..bda5e9761541 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -3,6 +3,7 @@ Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. */ +#include <linux/elf.h> #include <linux/ftrace.h> #include <linux/memory.h> #include <linux/extable.h> @@ -132,12 +133,33 @@ out: } /* - * On some architectures (PPC64, IA64) function pointers + * On some architectures (PPC64, IA64, PARISC) function pointers * are actually only tokens to some data that then holds the * real function address. As a result, to find if a function * pointer is part of the kernel text, we need to do some * special dereferencing first. 
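For background on the extable helpers added just below: on descriptor ABIs (PPC64 ELFv1, ia64, parisc) a C function pointer addresses a small record in the .opd section rather than the code itself, so text-address checks must peel the descriptor first. An illustrative layout (the real func_desc_t is arch-defined; the fields shown are only the common shape):

/* Illustrative only; each arch defines the real func_desc_t. */
typedef struct {
	unsigned long addr;	/* actual entry point in .text */
	unsigned long toc;	/* TOC/GP value loaded at call time */
} func_desc_t;

/* Why dereference_kernel_function_descriptor() bounds-checks against
 * __start_opd/__end_opd: only pointers into .opd are descriptors;
 * anything else is already a text address and must not be peeled. */
static bool my_ptr_is_text(void *ptr)
{
	ptr = dereference_kernel_function_descriptor(ptr);
	return core_kernel_text((unsigned long)ptr);
}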
*/ +#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS +void *dereference_function_descriptor(void *ptr) +{ + func_desc_t *desc = ptr; + void *p; + + if (!get_kernel_nofault(p, (void *)&desc->addr)) + ptr = p; + return ptr; +} +EXPORT_SYMBOL_GPL(dereference_function_descriptor); + +void *dereference_kernel_function_descriptor(void *ptr) +{ + if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) + return ptr; + + return dereference_function_descriptor(ptr); +} +#endif + int func_ptr_is_kernel_text(void *ptr) { unsigned long addr; diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..9796897560ab 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/mm_inline.h> #include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> @@ -96,6 +97,7 @@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> +#include <linux/sched/mm.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -184,7 +186,7 @@ static inline void free_task_struct(struct task_struct *tsk) */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) -#ifdef CONFIG_VMAP_STACK +# ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. @@ -192,6 +194,41 @@ static inline void free_task_struct(struct task_struct *tsk) #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +struct vm_stack { + struct rcu_head rcu; + struct vm_struct *stack_vm_area; +}; + +static bool try_release_thread_stack_to_cache(struct vm_struct *vm) +{ + unsigned int i; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL) + continue; + return true; + } + return false; +} + +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); + + if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) + return; + + vfree(vm_stack); +} + +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct vm_stack *vm_stack = tsk->stack; + + vm_stack->stack_vm_area = tsk->stack_vm_area; + call_rcu(&vm_stack->rcu, thread_stack_free_rcu); +} + static int free_vm_stack_cache(unsigned int cpu) { struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); @@ -209,11 +246,35 @@ static int free_vm_stack_cache(unsigned int cpu) return 0; } -#endif -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) +static int memcg_charge_kernel_stack(struct vm_struct *vm) { -#ifdef CONFIG_VMAP_STACK + int i; + int ret; + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); + if (ret) + goto err; + } + return 0; +err: + /* + * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is + * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will + * ignore this page. 
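memcg_charge_kernel_stack() above is an all-or-nothing loop: it charges every stack page and, on the first failure, uncharges the whole range, relying on the fact spelled out in the comment that a never-charged page has a NULL memcg pointer, so the extra uncharge is a no-op. The pattern in isolation (sketch with an illustrative helper name):

/* Charge nr pages or none, as memcg_charge_kernel_stack() does. */
static int charge_all_or_none(struct page **pages, int nr)
{
	int i, ret;

	for (i = 0; i < nr; i++) {
		ret = memcg_kmem_charge_page(pages[i], GFP_KERNEL, 0);
		if (ret)
			goto rollback;
	}
	return 0;

rollback:
	/* Safe to walk the full range: uncharging a page that was
	 * never charged is ignored. */
	for (i = 0; i < nr; i++)
		memcg_kmem_uncharge_page(pages[i], 0);
	return ret;
}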
+ */ + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); + return ret; +} + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + struct vm_struct *vm; void *stack; int i; @@ -225,15 +286,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; - /* Mark stack accessible for KASAN. */ + /* Reset stack metadata. */ kasan_unpoison_range(s->addr, THREAD_SIZE); + stack = kasan_reset_tag(s->addr); + /* Clear stale pointers from reused stack. */ - memset(s->addr, 0, THREAD_SIZE); + memset(stack, 0, THREAD_SIZE); + + if (memcg_charge_kernel_stack(s)) { + vfree(s->addr); + return -ENOMEM; + } tsk->stack_vm_area = s; - tsk->stack = s->addr; - return s->addr; + tsk->stack = stack; + return 0; } /* @@ -246,71 +314,96 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) THREADINFO_GFP & ~__GFP_ACCOUNT, PAGE_KERNEL, 0, node, __builtin_return_address(0)); + if (!stack) + return -ENOMEM; + vm = find_vm_area(stack); + if (memcg_charge_kernel_stack(vm)) { + vfree(stack); + return -ENOMEM; + } /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ - if (stack) { - tsk->stack_vm_area = find_vm_area(stack); - tsk->stack = stack; - } - return stack; -#else + tsk->stack_vm_area = vm; + stack = kasan_reset_tag(stack); + tsk->stack = stack; + return 0; +} + +static void free_thread_stack(struct task_struct *tsk) +{ + if (!try_release_thread_stack_to_cache(tsk->stack_vm_area)) + thread_stack_delayed_free(tsk); + + tsk->stack = NULL; + tsk->stack_vm_area = NULL; +} + +# else /* !CONFIG_VMAP_STACK */ + +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER); +} + +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct rcu_head *rh = tsk->stack; + + call_rcu(rh, thread_stack_free_rcu); +} + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); if (likely(page)) { tsk->stack = kasan_reset_tag(page_address(page)); - return tsk->stack; + return 0; } - return NULL; -#endif + return -ENOMEM; } -static inline void free_thread_stack(struct task_struct *tsk) +static void free_thread_stack(struct task_struct *tsk) { -#ifdef CONFIG_VMAP_STACK - struct vm_struct *vm = task_stack_vm_area(tsk); - - if (vm) { - int i; + thread_stack_delayed_free(tsk); + tsk->stack = NULL; +} - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) - memcg_kmem_uncharge_page(vm->pages[i], 0); +# endif /* CONFIG_VMAP_STACK */ +# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ - for (i = 0; i < NR_CACHED_STACKS; i++) { - if (this_cpu_cmpxchg(cached_stacks[i], - NULL, tsk->stack_vm_area) != NULL) - continue; +static struct kmem_cache *thread_stack_cache; - return; - } +static void thread_stack_free_rcu(struct rcu_head *rh) +{ + kmem_cache_free(thread_stack_cache, rh); +} - vfree_atomic(tsk->stack); - return; - } -#endif +static void thread_stack_delayed_free(struct task_struct *tsk) +{ + struct rcu_head *rh = tsk->stack; - __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); + call_rcu(rh, thread_stack_free_rcu); } -# else -static struct kmem_cache *thread_stack_cache; -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned 
long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); stack = kasan_reset_tag(stack); tsk->stack = stack; - return stack; + return stack ? 0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, tsk->stack); + thread_stack_delayed_free(tsk); + tsk->stack = NULL; } void thread_stack_cache_init(void) @@ -320,8 +413,26 @@ void thread_stack_cache_init(void) THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } -# endif -#endif + +# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ +#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ + +static int alloc_thread_stack_node(struct task_struct *tsk, int node) +{ + unsigned long *stack; + + stack = arch_alloc_thread_stack_node(tsk, node); + tsk->stack = stack; + return stack ? 0 : -ENOMEM; +} + +static void free_thread_stack(struct task_struct *tsk) +{ + arch_free_thread_stack(tsk); + tsk->stack = NULL; +} + +#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; @@ -365,61 +476,47 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_anon_vma_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { + free_anon_vma_name(vma); kmem_cache_free(vm_area_cachep, vma); } static void account_kernel_stack(struct task_struct *tsk, int account) { - void *stack = task_stack_page(tsk); - struct vm_struct *vm = task_stack_vm_area(tsk); - - if (vm) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { + void *stack = task_stack_page(tsk); + /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } } -static int memcg_charge_kernel_stack(struct task_struct *tsk) +void exit_task_stack_account(struct task_struct *tsk) { -#ifdef CONFIG_VMAP_STACK - struct vm_struct *vm = task_stack_vm_area(tsk); - int ret; - - BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + account_kernel_stack(tsk, -1); - if (vm) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + struct vm_struct *vm; int i; - BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); - - for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { - /* - * If memcg_kmem_charge_page() fails, page's - * memory cgroup pointer is NULL, and - * memcg_kmem_uncharge_page() in free_thread_stack() - * will ignore this page. 
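All of the free_thread_stack() variants above now defer the actual release through thread_stack_delayed_free(), which is the classic "embed an rcu_head, free after a grace period" idiom: a dying task's stack must not be reused or unmapped while a concurrent walker may still reference it. The idiom in miniature (generic sketch, not fork.c code; the slab variant above goes further and reuses the stack storage itself as the rcu_head):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct thing {
	struct rcu_head rcu;	/* storage for the deferred callback */
	int payload;
};

static void thing_free_rcu(struct rcu_head *rh)
{
	kfree(container_of(rh, struct thing, rcu));
}

static void thing_release(struct thing *t)
{
	/* Readers that found 't' under rcu_read_lock() may still be
	 * running; push the kfree() past the next grace period. */
	call_rcu(&t->rcu, thing_free_rcu);
}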
- */ - ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, - 0); - if (ret) - return ret; - } + vm = task_stack_vm_area(tsk); + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) + memcg_kmem_uncharge_page(vm->pages[i], 0); } -#endif - return 0; } static void release_task_stack(struct task_struct *tsk) @@ -427,12 +524,7 @@ static void release_task_stack(struct task_struct *tsk) if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ - account_kernel_stack(tsk, -1); free_thread_stack(tsk); - tsk->stack = NULL; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = NULL; -#endif } #ifdef CONFIG_THREAD_INFO_IN_TASK @@ -754,9 +846,7 @@ void __put_task_struct(struct task_struct *tsk) delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); sched_core_free(tsk); - - if (!profile_handoff_task(tsk)) - free_task(tsk); + free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct); @@ -873,8 +963,6 @@ void set_task_stack_end_magic(struct task_struct *tsk) static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; - unsigned long *stack; - struct vm_struct *stack_vm_area __maybe_unused; int err; if (node == NUMA_NO_NODE) @@ -883,32 +971,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!tsk) return NULL; - stack = alloc_thread_stack_node(tsk, node); - if (!stack) + err = arch_dup_task_struct(tsk, orig); + if (err) goto free_tsk; - if (memcg_charge_kernel_stack(tsk)) - goto free_stack; - - stack_vm_area = task_stack_vm_area(tsk); - - err = arch_dup_task_struct(tsk, orig); + err = alloc_thread_stack_node(tsk, node); + if (err) + goto free_tsk; - /* - * arch_dup_task_struct() clobbers the stack-related fields. Make - * sure they're properly initialized before using any stack-related - * functions again. 
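The comment deleted above documents the old hazard: arch_dup_task_struct() typically performs a wholesale structure copy from orig, so stack fields assigned beforehand were clobbered and had to be patched back up afterwards. Reordering the calls, as the hunk above does, makes that hazard structurally impossible. Schematically (not the real helper, just the shape of the fix):

static int dup_example(struct task_struct *tsk,
		       struct task_struct *orig, int node)
{
	int err;

	/* Copy first: arch_dup_task_struct() may do *tsk = *orig,
	 * wiping any fields assigned before it runs. */
	err = arch_dup_task_struct(tsk, orig);
	if (err)
		return err;

	/* Only now populate tsk->stack and friends; nothing later
	 * can clobber them, so no re-initialization is needed. */
	return alloc_thread_stack_node(tsk, node);
}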
- */ - tsk->stack = stack; -#ifdef CONFIG_VMAP_STACK - tsk->stack_vm_area = stack_vm_area; -#endif #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif - - if (err) - goto free_stack; + account_kernel_stack(tsk, 1); err = scs_prepare(tsk, node); if (err) @@ -950,9 +1024,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - tsk->pf_io_worker = NULL; - - account_kernel_stack(tsk, 1); + tsk->worker_private = NULL; kcov_task_init(tsk); kmap_local_fork(tsk); @@ -966,12 +1038,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->use_memdelay = 0; #endif +#ifdef CONFIG_IOMMU_SVA + tsk->pasid_activated = 0; +#endif + #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif return tsk; free_stack: + exit_task_stack_account(tsk); free_thread_stack(tsk); free_tsk: free_task_struct(tsk); @@ -1018,13 +1095,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } -static void mm_init_pasid(struct mm_struct *mm) -{ -#ifdef CONFIG_IOMMU_SUPPORT - mm->pasid = INIT_PASID; -#endif -} - static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES @@ -1053,7 +1123,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); - mm_init_pasid(mm); + mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); @@ -1120,6 +1190,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + mm_pasid_drop(mm); mmdrop(mm); } @@ -1556,32 +1627,6 @@ out: return error; } -static int copy_io(unsigned long clone_flags, struct task_struct *tsk) -{ -#ifdef CONFIG_BLOCK - struct io_context *ioc = current->io_context; - struct io_context *new_ioc; - - if (!ioc) - return 0; - /* - * Share io context with parent, if CLONE_IO is set - */ - if (clone_flags & CLONE_IO) { - ioc_task_link(ioc); - tsk->io_context = ioc; - } else if (ioprio_valid(ioc->ioprio)) { - new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE); - if (unlikely(!new_ioc)) - return -ENOMEM; - - new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc); - } -#endif - return 0; -} - static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; @@ -2032,12 +2077,6 @@ static __latent_entropy struct task_struct *copy_process( siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } - /* - * This _must_ happen before we call free_task(), i.e. before we jump - * to any of the bad_fork_* labels. This is to avoid freeing - * p->set_child_tid which is (ab)used as a kthread's data pointer for - * kernel threads (PF_KTHREAD). - */ p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? 
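The question kept in the comment above ("Clear TID on mm_release()?") refers to CLONE_CHILD_CLEARTID: when the child's mm is released, the kernel stores 0 to the registered address and issues a FUTEX_WAKE on it, which is the mechanism behind pthread_join(). A small userspace demonstration (Linux-specific; a runnable sketch rather than hardened code, note that parent_tid and child_tid deliberately point at the same variable):

#define _GNU_SOURCE
#include <linux/futex.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

static pid_t child_tid;			/* written by the kernel */
static char child_stack[64 * 1024];

static int child_fn(void *arg)
{
	return 0;	/* on exit: kernel zeroes child_tid, futex-wakes it */
}

int main(void)
{
	pid_t tid = clone(child_fn, child_stack + sizeof(child_stack),
			  CLONE_VM | CLONE_PARENT_SETTID |
			  CLONE_CHILD_CLEARTID | SIGCHLD,
			  NULL, &child_tid, NULL, &child_tid);
	if (tid < 0)
		return 1;

	/* Sleep until the kernel clears child_tid at thread death. */
	while (child_tid == tid)
		syscall(SYS_futex, &child_tid, FUTEX_WAIT, tid, NULL, NULL, 0);

	printf("child %d exited\n", tid);
	waitpid(tid, NULL, 0);
	return 0;
}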
@@ -2052,18 +2091,18 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif + retval = copy_creds(p, clone_flags); + if (retval < 0) + goto bad_fork_free; + retval = -EAGAIN; if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) - goto bad_fork_free; + goto bad_fork_cleanup_count; } current->flags &= ~PF_NPROC_EXCEEDED; - retval = copy_creds(p, clone_flags); - if (retval < 0) - goto bad_fork_free; - /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there @@ -2118,12 +2157,16 @@ static __latent_entropy struct task_struct *copy_process( p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); + if (p->flags & PF_KTHREAD) { + if (!set_kthread_struct(p)) + goto bad_fork_cleanup_delayacct; + } #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_threadgroup_lock; + goto bad_fork_cleanup_delayacct; } #endif #ifdef CONFIG_CPUSETS @@ -2282,6 +2325,9 @@ static __latent_entropy struct task_struct *copy_process( #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; #endif +#ifdef CONFIG_RETHOOK + p->rethooks.first = NULL; +#endif /* * Ensure that the cgroup subsystem policies allow the new process to be @@ -2294,6 +2340,17 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_put_pidfd; /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. 
In particular, we do * not want user-space to be able to predict the process start-time by @@ -2350,10 +2407,6 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cancel_cgroup; } - /* past the last point of failure */ - if (pidfile) - fd_install(pidfd, pidfile); - init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -2402,8 +2455,11 @@ static __latent_entropy struct task_struct *copy_process( syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); + if (pidfile) + fd_install(pidfd, pidfile); + proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); @@ -2460,14 +2516,15 @@ bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); -bad_fork_cleanup_threadgroup_lock: #endif +bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); + exit_task_stack_account(p); put_task_stack(p); delayed_free_task(p); fork_out: diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 25d8a88b32e5..b22ef1efe751 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -41,11 +41,6 @@ #include "futex.h" #include "../locking/rtmutex_common.h" -#ifndef CONFIG_HAVE_FUTEX_CMPXCHG -int __read_mostly futex_cmpxchg_enabled; -#endif - - /* * The base of the bucket array and its size are always used together * (after initialization only in futex_hash()), so ensure that they @@ -307,7 +302,7 @@ again: * found it, but truncated or holepunched or subjected to * invalidate_complete_page2 before we got the page lock (also * cases which we are happy to fail). And we hold a reference, - * so refcount care in invalidate_complete_page's remove_mapping + * so refcount care in invalidate_inode_page's remove_mapping * prevents drop_caches from setting mapping to NULL beneath us. * * The case we do have to guard against is when memory pressure made @@ -776,9 +771,6 @@ static void exit_robust_list(struct task_struct *curr) unsigned long futex_offset; int rc; - if (!futex_cmpxchg_enabled) - return; - /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -874,9 +866,6 @@ static void compat_exit_robust_list(struct task_struct *curr) compat_long_t futex_offset; int rc; - if (!futex_cmpxchg_enabled) - return; - /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -950,8 +939,6 @@ static void exit_pi_state_list(struct task_struct *curr) struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; - if (!futex_cmpxchg_enabled) - return; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -1044,7 +1031,7 @@ static void futex_cleanup(struct task_struct *tsk) * actually finished the futex cleanup. The worst case for this is that the * waiter runs through the wait loop until the state becomes visible. * - * This is called from the recursive fault handling path in do_exit(). + * This is called from the recursive fault handling path in make_task_dead(). * * This is best effort. Either the futex exit code has run already or * not. 
If the OWNER_DIED bit has been set on the futex then the waiter can @@ -1125,26 +1112,6 @@ void futex_exit_release(struct task_struct *tsk) futex_cleanup_end(tsk, FUTEX_STATE_DEAD); } -static void __init futex_detect_cmpxchg(void) -{ -#ifndef CONFIG_HAVE_FUTEX_CMPXCHG - u32 curval; - - /* - * This will fail and we want it. Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non-functional ones will return - * -ENOSYS. - */ - if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT) - futex_cmpxchg_enabled = 1; -#endif -} - static int __init futex_init(void) { unsigned int futex_shift; @@ -1163,8 +1130,6 @@ static int __init futex_init(void) futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; - futex_detect_cmpxchg(); - for (i = 0; i < futex_hashsize; i++) { atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 040ae4277cb0..c264cbeab71c 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -27,12 +27,6 @@ #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 -#ifdef CONFIG_HAVE_FUTEX_CMPXCHG -#define futex_cmpxchg_enabled 1 -#else -extern int __read_mostly futex_cmpxchg_enabled; -#endif - #ifdef CONFIG_FAIL_FUTEX extern bool should_fail_futex(bool fshared); #else diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c index 6f91a07a6a83..086a22d1adb7 100644 --- a/kernel/futex/syscalls.c +++ b/kernel/futex/syscalls.c @@ -29,8 +29,6 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, size_t, len) { - if (!futex_cmpxchg_enabled) - return -ENOSYS; /* * The kernel knows only one size for now: */ @@ -56,9 +54,6 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, unsigned long ret; struct task_struct *p; - if (!futex_cmpxchg_enabled) - return -ENOSYS; - rcu_read_lock(); ret = -ESRCH; @@ -104,17 +99,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, } switch (cmd) { - case FUTEX_LOCK_PI: - case FUTEX_LOCK_PI2: - case FUTEX_UNLOCK_PI: - case FUTEX_TRYLOCK_PI: - case FUTEX_WAIT_REQUEUE_PI: - case FUTEX_CMP_REQUEUE_PI: - if (!futex_cmpxchg_enabled) - return -ENOSYS; - } - - switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; fallthrough; @@ -323,9 +307,6 @@ COMPAT_SYSCALL_DEFINE2(set_robust_list, struct compat_robust_list_head __user *, head, compat_size_t, len) { - if (!futex_cmpxchg_enabled) - return -ENOSYS; - if (unlikely(len != sizeof(*head))) return -EINVAL; @@ -342,9 +323,6 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, unsigned long ret; struct task_struct *p; - if (!futex_cmpxchg_enabled) - return -ENOSYS; - rcu_read_lock(); ret = -ESRCH; diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 053447183ac5..04f4ebdc3cf5 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - depends on !CC_IS_CLANG || CLANG_VERSION >= 110000 depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR select CONSTRUCTORS default n diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9888e2bc8c76..52501e5f7655 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -63,7 +63,9 @@ 
static struct task_struct *watchdog_task; * Should we dump all CPUs backtraces in a hung task event? * Defaults to 0, can be changed via sysctl. */ -unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 #endif /* CONFIG_SMP */ /* @@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked, MAX_SCHEDULE_TIMEOUT; } +#ifdef CONFIG_SYSCTL /* * Process updating of timeout sysctl */ -int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, return ret; } +/* + * This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs + * and hung_task_check_interval_secs + */ +static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ); +static struct ctl_table hung_task_sysctls[] = { +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_check_interval_secs", + .data = &sysctl_hung_task_check_interval_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = (void *)&hung_task_timeout_max, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_NEG_ONE, + }, + {} +}; + +static void __init hung_task_sysctl_init(void) +{ + register_sysctl_init("kernel", hung_task_sysctls); +} +#else +#define hung_task_sysctl_init() do { } while (0) +#endif /* CONFIG_SYSCTL */ + + static atomic_t reset_hung_task = ATOMIC_INIT(0); void reset_hung_task_detector(void) @@ -310,6 +384,7 @@ static int __init hung_task_init(void) pm_notifier(hungtask_pm_notify, 0); watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + hung_task_sysctl_init(); return 0; } diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f7ff8919dc9b..fdf170404650 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -269,8 +269,9 @@ static int __irq_build_affinity_masks(unsigned int startvec, */ if (numvecs <= nodes) { for_each_node_mask(n, nodemsk) { - cpumask_or(&masks[curvec].mask, &masks[curvec].mask, - node_to_cpumask[n]); + /* Ensure that only CPUs which are in both masks are set */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); if (++curvec == 
last_affv) curvec = firstvec; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f895265d7548..54af0deb239b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -38,7 +38,7 @@ struct irqaction chained_action = { * @irq: irq number * @chip: pointer to irq chip description structure */ -int irq_set_chip(unsigned int irq, struct irq_chip *chip) +int irq_set_chip(unsigned int irq, const struct irq_chip *chip) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); @@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) if (!desc) return -EINVAL; - if (!chip) - chip = &no_irq_chip; - - desc->irq_data.chip = chip; + desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip); irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in @@ -575,8 +572,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq); */ void handle_untracked_irq(struct irq_desc *desc) { - unsigned int flags = 0; - raw_spin_lock(&desc->lock); if (!irq_may_run(desc)) @@ -593,7 +588,7 @@ void handle_untracked_irq(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock(&desc->lock); - __handle_irq_event_percpu(desc, &flags); + __handle_irq_event_percpu(desc); raw_spin_lock(&desc->lock); irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); @@ -1075,7 +1070,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle, EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data); void -irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, +irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip); @@ -1560,6 +1555,14 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return 0; } +static struct device *irq_get_parent_device(struct irq_data *data) +{ + if (data->domain) + return data->domain->dev; + + return NULL; +} + /** * irq_chip_pm_get - Enable power for an IRQ chip * @data: Pointer to interrupt specific data @@ -1569,12 +1572,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) */ int irq_chip_pm_get(struct irq_data *data) { + struct device *dev = irq_get_parent_device(data); int retval; - if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) { - retval = pm_runtime_get_sync(data->chip->parent_device); + if (IS_ENABLED(CONFIG_PM) && dev) { + retval = pm_runtime_get_sync(dev); if (retval < 0) { - pm_runtime_put_noidle(data->chip->parent_device); + pm_runtime_put_noidle(dev); return retval; } } @@ -1592,10 +1596,11 @@ int irq_chip_pm_get(struct irq_data *data) */ int irq_chip_pm_put(struct irq_data *data) { + struct device *dev = irq_get_parent_device(data); int retval = 0; - if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) - retval = pm_runtime_put(data->chip->parent_device); + if (IS_ENABLED(CONFIG_PM) && dev) + retval = pm_runtime_put(dev); return (retval < 0) ? 
retval : 0; } diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 39a41c56ad4f..1ed2b1739363 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu) { const struct cpumask *hk_mask; - if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) + if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) return false; - hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask)) return false; diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index e4cff358b437..2b43f5f5033d 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -69,8 +69,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind) seq_printf(m, "chip: None\n"); return; } - seq_printf(m, "%*schip: %s\n", ind, "", chip->name); - seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags); + seq_printf(m, "%*schip: ", ind, ""); + if (chip->irq_print_chip) + chip->irq_print_chip(data, m); + else + seq_printf(m, "%s", chip->name); + seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags); irq_debug_show_bits(m, ind, chip->flags, irqchip_flags, ARRAY_SIZE(irqchip_flags)); } diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index 6f29bf4c8515..f0862eb6b506 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -451,7 +451,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) } -struct irq_domain_ops irq_generic_chip_ops = { +const struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27182003b879..9489f93b3db3 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) wake_up_process(action->thread); } -irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags) +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int irq = desc->irq_data.irq; @@ -174,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags } __irq_wake_thread(desc, action); - - fallthrough; /* to add to randomness */ - case IRQ_HANDLED: - *flags |= action->flags; break; default: @@ -193,11 +189,10 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval; - unsigned int flags = 0; - retval = __handle_irq_event_percpu(desc, &flags); + retval = __handle_irq_event_percpu(desc); - add_interrupt_randomness(desc->irq_data.irq, flags); + add_interrupt_randomness(desc->irq_data.irq); if (!irq_settings_no_debug(desc)) note_interrupt(desc, retval); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 54363527feea..99cbdf55a8bd 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -103,7 +103,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data, extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); +irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event_percpu(struct irq_desc *desc); irqreturn_t handle_irq_event(struct irq_desc 
*desc); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2267e6527db3..939d21cd55c3 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -640,7 +640,7 @@ int handle_irq_desc(struct irq_desc *desc) return -EINVAL; data = irq_desc_get_irq_data(desc); - if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data))) + if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); @@ -662,6 +662,29 @@ int generic_handle_irq(unsigned int irq) } EXPORT_SYMBOL_GPL(generic_handle_irq); +/** + * generic_handle_irq_safe - Invoke the handler for a particular irq from any + * context. + * @irq: The irq number to handle + * + * Returns: 0 on success, a negative value on error. + * + * This function can be called from any context (IRQ or process context). It + * will report an error if not invoked from IRQ context and the irq has been + * marked to enforce IRQ-context only. + */ +int generic_handle_irq_safe(unsigned int irq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_to_desc(irq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_irq_safe); + #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging @@ -676,7 +699,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq); */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { - WARN_ON_ONCE(!in_irq()); + WARN_ON_ONCE(!in_hardirq()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index bf38c546aa25..d5ce96510549 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1319,7 +1319,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); * @chip_data: The associated chip data */ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, + const struct irq_chip *chip, void *chip_data) { struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); @@ -1328,7 +1329,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return -ENOENT; irq_data->hwirq = hwirq; - irq_data->chip = chip ? chip : &no_irq_chip; + irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip); irq_data->chip_data = chip_data; return 0; @@ -1347,7 +1348,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { @@ -1853,7 +1854,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); * @handler_name: The interrupt handler name */ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, - irq_hw_number_t hwirq, struct irq_chip *chip, + irq_hw_number_t hwirq, const struct irq_chip *chip, void *chip_data, irq_flow_handler_t handler, void *handler_data, const char *handler_name) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 7405e384e5ed..c03f71d5ec10 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -247,13 +247,13 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, * online. 
*/ if (irqd_affinity_is_managed(data) && - housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) { + housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) { const struct cpumask *hk_mask, *prog_mask; static DEFINE_RAW_SPINLOCK(tmp_mask_lock); static struct cpumask tmp_mask; - hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ); + hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ); raw_spin_lock(&tmp_mask_lock); cpumask_and(&tmp_mask, mask, hk_mask); @@ -486,7 +486,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask) } EXPORT_SYMBOL_GPL(irq_force_affinity); -int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m, + bool setaffinity) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); @@ -495,12 +496,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) return -EINVAL; desc->affinity_hint = m; irq_put_desc_unlock(desc, flags); - /* set the initial affinity to prevent every interrupt being on CPU0 */ - if (m) + if (m && setaffinity) __irq_set_affinity(irq, m, false); return 0; } -EXPORT_SYMBOL_GPL(irq_set_affinity_hint); +EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint); static void irq_affinity_notify(struct work_struct *work) { diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 7f350ae59c5f..2bdfce5edafd 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -14,12 +14,15 @@ #include <linux/irqdomain.h> #include <linux/msi.h> #include <linux/slab.h> +#include <linux/sysfs.h> #include <linux/pci.h> #include "internals.h" +static inline int msi_sysfs_create_group(struct device *dev); + /** - * alloc_msi_entry - Allocate an initialized msi_desc + * msi_alloc_desc - Allocate an initialized msi_desc * @dev: Pointer to the device for which this is allocated * @nvec: The number of vectors used in this entry * @affinity: Optional pointer to an affinity mask array size of @nvec @@ -29,34 +32,134 @@ * * Return: pointer to allocated &msi_desc on success or %NULL on failure */ -struct msi_desc *alloc_msi_entry(struct device *dev, int nvec, - const struct irq_affinity_desc *affinity) +static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec, + const struct irq_affinity_desc *affinity) { - struct msi_desc *desc; + struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL); - desc = kzalloc(sizeof(*desc), GFP_KERNEL); if (!desc) return NULL; - INIT_LIST_HEAD(&desc->list); desc->dev = dev; desc->nvec_used = nvec; if (affinity) { - desc->affinity = kmemdup(affinity, - nvec * sizeof(*desc->affinity), GFP_KERNEL); + desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL); if (!desc->affinity) { kfree(desc); return NULL; } } - return desc; } -void free_msi_entry(struct msi_desc *entry) +static void msi_free_desc(struct msi_desc *desc) { - kfree(entry->affinity); - kfree(entry); + kfree(desc->affinity); + kfree(desc); +} + +static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index) +{ + int ret; + + desc->msi_index = index; + ret = xa_insert(&md->__store, index, desc, GFP_KERNEL); + if (ret) + msi_free_desc(desc); + return ret; +} + +/** + * msi_add_msi_desc - Allocate and initialize a MSI descriptor + * @dev: Pointer to the device for which the descriptor is allocated + * @init_desc: Pointer to an MSI descriptor to initialize the new descriptor + * + * Return: 0 on success or an appropriate failure code. 
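msi_insert_desc() above is the first user of the new per-device xarray (dev->msi.data->__store) that replaces the old linked list of descriptors, giving indexed lookup and natural sparse indices. For readers unfamiliar with the API, the calls it relies on look like this in a self-contained sketch (my_table and the stored items are illustrative):

#include <linux/xarray.h>

static DEFINE_XARRAY(my_table);

static int store_item(unsigned long index, void *item)
{
	/* Unlike xa_store(), xa_insert() refuses to overwrite and
	 * returns -EBUSY if the slot is already occupied. */
	return xa_insert(&my_table, index, item, GFP_KERNEL);
}

static void *lookup_item(unsigned long index)
{
	return xa_load(&my_table, index);	/* NULL if empty */
}

static void erase_items(unsigned long first, unsigned long last)
{
	unsigned long idx;
	void *item;

	xa_for_each_range(&my_table, idx, item, first, last)
		xa_erase(&my_table, idx);
}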
+ */ +int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc) +{ + struct msi_desc *desc; + + lockdep_assert_held(&dev->msi.data->mutex); + + desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity); + if (!desc) + return -ENOMEM; + + /* Copy type specific data to the new descriptor. */ + desc->pci = init_desc->pci; + return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index); +} + +/** + * msi_add_simple_msi_descs - Allocate and initialize MSI descriptors + * @dev: Pointer to the device for which the descriptors are allocated + * @index: Index for the first MSI descriptor + * @ndesc: Number of descriptors to allocate + * + * Return: 0 on success or an appropriate failure code. + */ +static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc) +{ + unsigned int idx, last = index + ndesc - 1; + struct msi_desc *desc; + int ret; + + lockdep_assert_held(&dev->msi.data->mutex); + + for (idx = index; idx <= last; idx++) { + desc = msi_alloc_desc(dev, 1, NULL); + if (!desc) + goto fail_mem; + ret = msi_insert_desc(dev->msi.data, desc, idx); + if (ret) + goto fail; + } + return 0; + +fail_mem: + ret = -ENOMEM; +fail: + msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last); + return ret; +} + +static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter) +{ + switch (filter) { + case MSI_DESC_ALL: + return true; + case MSI_DESC_NOTASSOCIATED: + return !desc->irq; + case MSI_DESC_ASSOCIATED: + return !!desc->irq; + } + WARN_ON_ONCE(1); + return false; +} + +/** + * msi_free_msi_descs_range - Free MSI descriptors of a device + * @dev: Device to free the descriptors + * @filter: Descriptor state filter + * @first_index: Index to start freeing from + * @last_index: Last index to be freed + */ +void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter, + unsigned int first_index, unsigned int last_index) +{ + struct xarray *xa = &dev->msi.data->__store; + struct msi_desc *desc; + unsigned long idx; + + lockdep_assert_held(&dev->msi.data->mutex); + + xa_for_each_range(xa, idx, desc, first_index, last_index) { + if (msi_desc_match(desc, filter)) { + xa_erase(xa, idx); + msi_free_desc(desc); + } + } } void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) @@ -72,139 +175,290 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) } EXPORT_SYMBOL_GPL(get_cached_msi_msg); -static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, - char *buf) +static void msi_device_data_release(struct device *dev, void *res) { - struct msi_desc *entry; - bool is_msix = false; - unsigned long irq; - int retval; + struct msi_device_data *md = res; - retval = kstrtoul(attr->attr.name, 10, &irq); - if (retval) - return retval; + WARN_ON_ONCE(!xa_empty(&md->__store)); + xa_destroy(&md->__store); + dev->msi.data = NULL; +} - entry = irq_get_msi_desc(irq); - if (!entry) - return -ENODEV; +/** + * msi_setup_device_data - Setup MSI device data + * @dev: Device for which MSI device data should be set up + * + * Return: 0 on success, appropriate error code otherwise + * + * This can be called more than once for @dev. If the MSI device data is + * already allocated the call succeeds. The allocated memory is + * automatically released when the device is destroyed. 
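As the kerneldoc above says, msi_setup_device_data() leans on devres: the msi_device_data is allocated with a release callback and attached to the device, so teardown runs automatically when the device goes away and no caller ever pairs it with an explicit free. The idiom on its own (illustrative my_* names, not kernel symbols):

#include <linux/device.h>

struct my_data {
	int value;
};

static void my_data_release(struct device *dev, void *res)
{
	/* Invoked automatically when 'dev' is torn down; 'res' is the
	 * devres allocation below. Undo side effects here; the memory
	 * itself is freed by the devres core. */
}

static int my_setup(struct device *dev)
{
	struct my_data *d = devres_alloc(my_data_release, sizeof(*d),
					 GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->value = 42;
	devres_add(dev, d);	/* ownership now follows the device */
	return 0;
}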
+ */ +int msi_setup_device_data(struct device *dev) +{ + struct msi_device_data *md; + int ret; - if (dev_is_pci(dev)) - is_msix = entry->msi_attrib.is_msix; + if (dev->msi.data) + return 0; - return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); + md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL); + if (!md) + return -ENOMEM; + + ret = msi_sysfs_create_group(dev); + if (ret) { + devres_free(md); + return ret; + } + + xa_init(&md->__store); + mutex_init(&md->mutex); + dev->msi.data = md; + devres_add(dev, md); + return 0; +} + +/** + * msi_lock_descs - Lock the MSI descriptor storage of a device + * @dev: Device to operate on + */ +void msi_lock_descs(struct device *dev) +{ + mutex_lock(&dev->msi.data->mutex); +} +EXPORT_SYMBOL_GPL(msi_lock_descs); + +/** + * msi_unlock_descs - Unlock the MSI descriptor storage of a device + * @dev: Device to operate on + */ +void msi_unlock_descs(struct device *dev) +{ + /* Invalidate the index which was cached by the iterator */ + dev->msi.data->__iter_idx = MSI_MAX_INDEX; + mutex_unlock(&dev->msi.data->mutex); +} +EXPORT_SYMBOL_GPL(msi_unlock_descs); + +static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter) +{ + struct msi_desc *desc; + + xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) { + if (msi_desc_match(desc, filter)) + return desc; + } + md->__iter_idx = MSI_MAX_INDEX; + return NULL; } /** - * msi_populate_sysfs - Populate msi_irqs sysfs entries for devices - * @dev: The device(PCI, platform etc) who will get sysfs entries + * msi_first_desc - Get the first MSI descriptor of a device + * @dev: Device to operate on + * @filter: Descriptor state filter + * + * Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs() + * must be invoked before the call. * - * Return attribute_group ** so that specific bus MSI can save it to - * somewhere during initilizing msi irqs. If devices has no MSI irq, - * return NULL; if it fails to populate sysfs, return ERR_PTR + * Return: Pointer to the first MSI descriptor matching the search + * criteria, NULL if none found.
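msi_first_desc(), whose body follows, pairs with msi_next_desc() further down to form a filtered iterator whose cursor (__iter_idx) lives in msi_device_data; that is why both assert the mutex and why msi_unlock_descs() resets the cursor. Callers are expected to hold the lock across the whole walk, roughly like this (sketch; the same series also adds a msi_for_each_desc() convenience wrapper around exactly this loop, if memory serves):

static void dump_active_vectors(struct device *dev)
{
	struct msi_desc *desc;

	msi_lock_descs(dev);
	for (desc = msi_first_desc(dev, MSI_DESC_ASSOCIATED);
	     desc;
	     desc = msi_next_desc(dev, MSI_DESC_ASSOCIATED))
		dev_info(dev, "index %u -> irq %u\n",
			 desc->msi_index, desc->irq);
	msi_unlock_descs(dev);
}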
 */
-const struct attribute_group **msi_populate_sysfs(struct device *dev)
-{
-	const struct attribute_group **msi_irq_groups;
-	struct attribute **msi_attrs, *msi_attr;
-	struct device_attribute *msi_dev_attr;
-	struct attribute_group *msi_irq_group;
-	struct msi_desc *entry;
-	int ret = -ENOMEM;
-	int num_msi = 0;
-	int count = 0;
-	int i;
+struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
+{
+	struct msi_device_data *md = dev->msi.data;
-	/* Determine how many msi entries we have */
-	for_each_msi_entry(entry, dev)
-		num_msi += entry->nvec_used;
-	if (!num_msi)
+	if (WARN_ON_ONCE(!md))
 		return NULL;
-	/* Dynamically create the MSI attributes for the device */
-	msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL);
-	if (!msi_attrs)
-		return ERR_PTR(-ENOMEM);
-
-	for_each_msi_entry(entry, dev) {
-		for (i = 0; i < entry->nvec_used; i++) {
-			msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
-			if (!msi_dev_attr)
-				goto error_attrs;
-			msi_attrs[count] = &msi_dev_attr->attr;
-
-			sysfs_attr_init(&msi_dev_attr->attr);
-			msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
-							    entry->irq + i);
-			if (!msi_dev_attr->attr.name)
-				goto error_attrs;
-			msi_dev_attr->attr.mode = 0444;
-			msi_dev_attr->show = msi_mode_show;
-			++count;
+	lockdep_assert_held(&md->mutex);
+
+	md->__iter_idx = 0;
+	return msi_find_desc(md, filter);
+}
+EXPORT_SYMBOL_GPL(msi_first_desc);
+
+/**
+ * msi_next_desc - Get the next MSI descriptor of a device
+ * @dev: Device to operate on
+ * @filter: Descriptor state filter
+ *
+ * The first invocation of msi_next_desc() has to be preceded by a
+ * successful invocation of msi_first_desc(). Consecutive invocations are
+ * only valid if the previous one was successful. All these operations have
+ * to be done within the same MSI mutex held region.
+ *
+ * Return: Pointer to the next MSI descriptor matching the search
+ *	   criteria, NULL if none found.
+ */
+struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
+{
+	struct msi_device_data *md = dev->msi.data;
+
+	if (WARN_ON_ONCE(!md))
+		return NULL;
+
+	lockdep_assert_held(&md->mutex);
+
+	if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX)
+		return NULL;
+
+	md->__iter_idx++;
+	return msi_find_desc(md, filter);
+}
+EXPORT_SYMBOL_GPL(msi_next_desc);
+
+/**
+ * msi_get_virq - Return Linux interrupt number of a MSI interrupt
+ * @dev: Device to operate on
+ * @index: MSI interrupt index to look for (0-based)
+ *
+ * Return: The Linux interrupt number on success (> 0), 0 if not found
+ */
+unsigned int msi_get_virq(struct device *dev, unsigned int index)
+{
+	struct msi_desc *desc;
+	unsigned int ret = 0;
+	bool pcimsi;
+
+	if (!dev->msi.data)
+		return 0;
+
+	pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
+
+	msi_lock_descs(dev);
+	desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
+	if (desc && desc->irq) {
+		/*
+		 * PCI-MSI has only one descriptor for multiple interrupts.
+		 * PCI-MSIX and platform MSI use a descriptor per
+		 * interrupt.
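+		 *
+		 * E.g. for multi-MSI with nvec_used == 4, the lone descriptor
+		 * sits at index 0 and msi_get_virq(dev, 2) returns
+		 * desc->irq + 2, whereas for MSI-X index 2 selects the
+		 * descriptor stored at xarray index 2 directly.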
+ */ + if (pcimsi) { + if (index < desc->nvec_used) + ret = desc->irq + index; + } else { + ret = desc->irq; } } + msi_unlock_descs(dev); + return ret; +} +EXPORT_SYMBOL_GPL(msi_get_virq); - msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL); - if (!msi_irq_group) - goto error_attrs; - msi_irq_group->name = "msi_irqs"; - msi_irq_group->attrs = msi_attrs; +#ifdef CONFIG_SYSFS +static struct attribute *msi_dev_attrs[] = { + NULL +}; - msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL); - if (!msi_irq_groups) - goto error_irq_group; - msi_irq_groups[0] = msi_irq_group; +static const struct attribute_group msi_irqs_group = { + .name = "msi_irqs", + .attrs = msi_dev_attrs, +}; - ret = sysfs_create_groups(&dev->kobj, msi_irq_groups); - if (ret) - goto error_irq_groups; - - return msi_irq_groups; - -error_irq_groups: - kfree(msi_irq_groups); -error_irq_group: - kfree(msi_irq_group); -error_attrs: - count = 0; - msi_attr = msi_attrs[count]; - while (msi_attr) { - msi_dev_attr = container_of(msi_attr, struct device_attribute, attr); - kfree(msi_attr->name); - kfree(msi_dev_attr); - ++count; - msi_attr = msi_attrs[count]; +static inline int msi_sysfs_create_group(struct device *dev) +{ + return devm_device_add_group(dev, &msi_irqs_group); +} + +static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + /* MSI vs. MSIX is per device not per interrupt */ + bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false; + + return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi"); +} + +static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) +{ + struct device_attribute *attrs = desc->sysfs_attrs; + int i; + + if (!attrs) + return; + + desc->sysfs_attrs = NULL; + for (i = 0; i < desc->nvec_used; i++) { + if (attrs[i].show) + sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name); + kfree(attrs[i].attr.name); } - kfree(msi_attrs); - return ERR_PTR(ret); + kfree(attrs); } +static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) +{ + struct device_attribute *attrs; + int ret, i; + + attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL); + if (!attrs) + return -ENOMEM; + + desc->sysfs_attrs = attrs; + for (i = 0; i < desc->nvec_used; i++) { + sysfs_attr_init(&attrs[i].attr); + attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i); + if (!attrs[i].attr.name) { + ret = -ENOMEM; + goto fail; + } + + attrs[i].attr.mode = 0444; + attrs[i].show = msi_mode_show; + + ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name); + if (ret) { + attrs[i].show = NULL; + goto fail; + } + } + return 0; + +fail: + msi_sysfs_remove_desc(dev, desc); + return ret; +} + +#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS /** - * msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices - * @dev: The device(PCI, platform etc) who will remove sysfs entries - * @msi_irq_groups: attribute_group for device msi_irqs entries + * msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device + * @dev: The device (PCI, platform etc) which will get sysfs entries */ -void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups) -{ - struct device_attribute *dev_attr; - struct attribute **msi_attrs; - int count = 0; - - if (msi_irq_groups) { - sysfs_remove_groups(&dev->kobj, msi_irq_groups); - msi_attrs = msi_irq_groups[0]->attrs; - while (msi_attrs[count]) { - dev_attr = container_of(msi_attrs[count], - struct device_attribute, attr); - 
kfree(dev_attr->attr.name);
-			kfree(dev_attr);
-			++count;
-		}
-		kfree(msi_attrs);
-		kfree(msi_irq_groups[0]);
-		kfree(msi_irq_groups);
+int msi_device_populate_sysfs(struct device *dev)
+{
+	struct msi_desc *desc;
+	int ret;
+
+	msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
+		if (desc->sysfs_attrs)
+			continue;
+		ret = msi_sysfs_populate_desc(dev, desc);
+		if (ret)
+			return ret;
 	}
+	return 0;
 }
+
+/**
+ * msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device
+ * @dev:	The device (PCI, platform etc) for which to remove
+ *		sysfs entries
+ */
+void msi_device_destroy_sysfs(struct device *dev)
+{
+	struct msi_desc *desc;
+
+	msi_for_each_desc(desc, dev, MSI_DESC_ALL)
+		msi_sysfs_remove_desc(dev, desc);
+}
+#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
+#else /* CONFIG_SYSFS */
+static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
+static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
+static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
+#endif /* !CONFIG_SYSFS */
+
 #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
 static inline void irq_chip_write_msi_msg(struct irq_data *data,
 					  struct msi_msg *msg)
@@ -456,43 +710,38 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
 }
 
 int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
-			     int virq, int nvec, msi_alloc_info_t *arg)
+			     int virq_base, int nvec, msi_alloc_info_t *arg)
 {
 	struct msi_domain_info *info = domain->host_data;
 	struct msi_domain_ops *ops = info->ops;
 	struct msi_desc *desc;
-	int ret = 0;
+	int ret, virq;
 
-	for_each_msi_entry(desc, dev) {
-		/* Don't even try the multi-MSI brain damage. */
-		if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
-			ret = -EINVAL;
-			break;
-		}
+	msi_lock_descs(dev);
+	ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
+	if (ret)
+		goto unlock;
 
-		if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
-			continue;
+	for (virq = virq_base; virq < virq_base + nvec; virq++) {
+		desc = xa_load(&dev->msi.data->__store, virq);
+		desc->irq = virq;
 
 		ops->set_desc(arg, desc);
-		/* Assumes the domain mutex is held! */
-		ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
-						      arg);
+		ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
 		if (ret)
-			break;
+			goto fail;
 
-		irq_set_msi_desc_off(desc->irq, 0, desc);
-	}
-
-	if (ret) {
-		/* Mop up the damage */
-		for_each_msi_entry(desc, dev) {
-			if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
-				continue;
-
-			irq_domain_free_irqs_common(domain, desc->irq, 1);
-		}
+		irq_set_msi_desc(virq, desc);
 	}
+	msi_unlock_descs(dev);
+	return 0;
+fail:
+	for (--virq; virq >= virq_base; virq--)
+		irq_domain_free_irqs_common(domain, virq, 1);
+	msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
+unlock:
+	msi_unlock_descs(dev);
 	return ret;
 }
 
@@ -531,8 +780,59 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
 	 * Checking the first MSI descriptor is sufficient. MSIX supports
	 * masking and MSI does so when the can_mask attribute is set.
*/ - desc = first_msi_entry(dev); - return desc->msi_attrib.is_msix || desc->msi_attrib.can_mask; + desc = msi_first_desc(dev, MSI_DESC_ALL); + return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask; +} + +static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc, + int allocated) +{ + switch(domain->bus_token) { + case DOMAIN_BUS_PCI_MSI: + case DOMAIN_BUS_VMD_MSI: + if (IS_ENABLED(CONFIG_PCI_MSI)) + break; + fallthrough; + default: + return -ENOSPC; + } + + /* Let a failed PCI multi MSI allocation retry */ + if (desc->nvec_used > 1) + return 1; + + /* If there was a successful allocation let the caller know */ + return allocated ? allocated : -ENOSPC; +} + +#define VIRQ_CAN_RESERVE 0x01 +#define VIRQ_ACTIVATE 0x02 +#define VIRQ_NOMASK_QUIRK 0x04 + +static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags) +{ + struct irq_data *irqd = irq_domain_get_irq_data(domain, virq); + int ret; + + if (!(vflags & VIRQ_CAN_RESERVE)) { + irqd_clr_can_reserve(irqd); + if (vflags & VIRQ_NOMASK_QUIRK) + irqd_set_msi_nomask_quirk(irqd); + } + + if (!(vflags & VIRQ_ACTIVATE)) + return 0; + + ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE); + if (ret) + return ret; + /* + * If the interrupt uses reservation mode, clear the activated bit + * so request_irq() will assign the final vector. + */ + if (vflags & VIRQ_CAN_RESERVE) + irqd_clr_activated(irqd); + return 0; } int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, @@ -540,83 +840,103 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - struct irq_data *irq_data; - struct msi_desc *desc; msi_alloc_info_t arg = { }; + unsigned int vflags = 0; + struct msi_desc *desc; + int allocated = 0; int i, ret, virq; - bool can_reserve; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) return ret; - for_each_msi_entry(desc, dev) { + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) + vflags |= VIRQ_ACTIVATE; + + /* + * Interrupt can use a reserved vector and will not occupy + * a real device vector until the interrupt is requested. + */ + if (msi_check_reservation_mode(domain, info, dev)) { + vflags |= VIRQ_CAN_RESERVE; + /* + * MSI affinity setting requires a special quirk (X86) when + * reservation mode is active. 
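+		 *
+		 * In that case each vector ends up with both VIRQ_CAN_RESERVE
+		 * and VIRQ_NOMASK_QUIRK set; msi_init_virq() then translates
+		 * these bits into irqd_clr_can_reserve() plus
+		 * irqd_set_msi_nomask_quirk() on the interrupt's irq_data.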
+ */ + if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) + vflags |= VIRQ_NOMASK_QUIRK; + } + + msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) { ops->set_desc(&arg, desc); virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false, desc->affinity); - if (virq < 0) { - ret = -ENOSPC; - if (ops->handle_error) - ret = ops->handle_error(domain, desc, ret); - if (ops->msi_finish) - ops->msi_finish(&arg, ret); - return ret; - } + if (virq < 0) + return msi_handle_pci_fail(domain, desc, allocated); for (i = 0; i < desc->nvec_used; i++) { irq_set_msi_desc_off(virq, i, desc); irq_debugfs_copy_devname(virq + i, dev); + ret = msi_init_virq(domain, virq + i, vflags); + if (ret) + return ret; + } + if (info->flags & MSI_FLAG_DEV_SYSFS) { + ret = msi_sysfs_populate_desc(dev, desc); + if (ret) + return ret; } + allocated++; } + return 0; +} - if (ops->msi_finish) - ops->msi_finish(&arg, 0); +static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info, + struct device *dev, + unsigned int num_descs) +{ + if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS)) + return 0; - can_reserve = msi_check_reservation_mode(domain, info, dev); + return msi_add_simple_msi_descs(dev, 0, num_descs); +} - /* - * This flag is set by the PCI layer as we need to activate - * the MSI entries before the PCI layer enables MSI in the - * card. Otherwise the card latches a random msi message. - */ - if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) - goto skip_activate; - - for_each_msi_vector(desc, i, dev) { - if (desc->irq == i) { - virq = desc->irq; - dev_dbg(dev, "irq [%d-%d] for MSI\n", - virq, virq + desc->nvec_used - 1); - } +/** + * msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @nvec: The number of interrupts to allocate + * + * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() + * pair. Use this for MSI irqdomains which implement their own vector + * allocation/free. + * + * Return: %0 on success or an error code. + */ +int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + int ret; - irq_data = irq_domain_get_irq_data(domain, i); - if (!can_reserve) { - irqd_clr_can_reserve(irq_data); - if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) - irqd_set_msi_nomask_quirk(irq_data); - } - ret = irq_domain_activate_irq(irq_data, can_reserve); - if (ret) - goto cleanup; - } + lockdep_assert_held(&dev->msi.data->mutex); -skip_activate: - /* - * If these interrupts use reservation mode, clear the activated bit - * so request_irq() will assign the final vector. - */ - if (can_reserve) { - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - irqd_clr_activated(irq_data); - } - } - return 0; + ret = msi_domain_add_simple_msi_descs(info, dev, nvec); + if (ret) + return ret; -cleanup: - msi_domain_free_irqs(domain, dev); + ret = ops->domain_alloc_irqs(domain, dev, nvec); + if (ret) + msi_domain_free_irqs_descs_locked(domain, dev); return ret; } @@ -629,52 +949,78 @@ cleanup: * * Return: %0 on success or an error code. 
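+ *
+ * In contrast to msi_domain_alloc_irqs_descs_locked() above, this variant
+ * takes and drops the MSI descriptor mutex itself, so a caller needs no
+ * more than (illustrative sketch):
+ *
+ *	ret = msi_domain_alloc_irqs(domain, dev, nvec);
+ *	if (ret)
+ *		return ret;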
*/ -int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, - int nvec) +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec) { - struct msi_domain_info *info = domain->host_data; - struct msi_domain_ops *ops = info->ops; + int ret; - return ops->domain_alloc_irqs(domain, dev, nvec); + msi_lock_descs(dev); + ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec); + msi_unlock_descs(dev); + return ret; } void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { - struct irq_data *irq_data; + struct msi_domain_info *info = domain->host_data; + struct irq_data *irqd; struct msi_desc *desc; int i; - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - if (irqd_is_activated(irq_data)) - irq_domain_deactivate_irq(irq_data); - } - - for_each_msi_entry(desc, dev) { - /* - * We might have failed to allocate an MSI early - * enough that there is no IRQ associated to this - * entry. If that's the case, don't do anything. - */ - if (desc->irq) { - irq_domain_free_irqs(desc->irq, desc->nvec_used); - desc->irq = 0; + /* Only handle MSI entries which have an interrupt associated */ + msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) { + /* Make sure all interrupts are deactivated */ + for (i = 0; i < desc->nvec_used; i++) { + irqd = irq_domain_get_irq_data(domain, desc->irq + i); + if (irqd && irqd_is_activated(irqd)) + irq_domain_deactivate_irq(irqd); } + + irq_domain_free_irqs(desc->irq, desc->nvec_used); + if (info->flags & MSI_FLAG_DEV_SYSFS) + msi_sysfs_remove_desc(dev, desc); + desc->irq = 0; } } +static void msi_domain_free_msi_descs(struct msi_domain_info *info, + struct device *dev) +{ + if (info->flags & MSI_FLAG_FREE_MSI_DESCS) + msi_free_msi_descs(dev); +} + /** - * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev + * msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev * @domain: The domain to managing the interrupts * @dev: Pointer to device struct of the device for which the interrupts * are free + * + * Must be invoked from within a msi_lock_descs() / msi_unlock_descs() + * pair. Use this for MSI irqdomains which implement their own vector + * allocation. 
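+ *
+ * A typical locked sequence pairs this with the locked allocation variant
+ * (sketch only):
+ *
+ *	msi_lock_descs(dev);
+ *	ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
+ *	...
+ *	msi_domain_free_irqs_descs_locked(domain, dev);
+ *	msi_unlock_descs(dev);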
 */
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
 {
 	struct msi_domain_info *info = domain->host_data;
 	struct msi_domain_ops *ops = info->ops;
 
-	return ops->domain_free_irqs(domain, dev);
+	lockdep_assert_held(&dev->msi.data->mutex);
+
+	ops->domain_free_irqs(domain, dev);
+	msi_domain_free_msi_descs(info, dev);
+}
+
+/**
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * @domain: The domain managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ *	 are freed
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+	msi_lock_descs(dev);
+	msi_domain_free_irqs_descs_locked(domain, dev);
+	msi_unlock_descs(dev);
 }
 
 /**
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index ee595ec09778..623b8136e9af 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -137,7 +137,7 @@ static inline int irq_select_affinity_usr(unsigned int irq)
 static ssize_t write_irq_affinity(int type, struct file *file,
 		const char __user *buffer, size_t count, loff_t *pos)
 {
-	unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
+	unsigned int irq = (int)(long)pde_data(file_inode(file));
 	cpumask_var_t new_value;
 	int err;
 
@@ -190,12 +190,12 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
 
 static int irq_affinity_proc_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
+	return single_open(file, irq_affinity_proc_show, pde_data(inode));
 }
 
 static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
+	return single_open(file, irq_affinity_list_proc_show, pde_data(inode));
 }
 
 static const struct proc_ops irq_affinity_proc_ops = {
@@ -265,7 +265,7 @@ out:
 
 static int default_affinity_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, default_affinity_show, PDE_DATA(inode));
+	return single_open(file, default_affinity_show, pde_data(inode));
 }
 
 static const struct proc_ops default_affinity_proc_ops = {
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index f7df715ec28e..7afa40fe5cc4 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -137,7 +137,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
 	if (!irq_work_claim(work))
 		return false;
 
-	kasan_record_aux_stack(work);
+	kasan_record_aux_stack_noalloc(work);
 
 	preempt_disable();
 	if (cpu != smp_processor_id()) {
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 3011bc33a5ba..79f2eb617a62 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -212,6 +212,10 @@ unsigned long kallsyms_lookup_name(const char *name)
 	unsigned long i;
 	unsigned int off;
 
+	/* Skip the search for empty string.
*/ + if (!*name) + return 0; + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); @@ -243,6 +247,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; + cond_resched(); } return 0; } diff --git a/kernel/kcov.c b/kernel/kcov.c index 36ca640c4f8e..475524bd900a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t) static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) { int res = 0; - void *area; struct kcov *kcov = vma->vm_file->private_data; unsigned long size, off; struct page *page; unsigned long flags; - area = vmalloc_user(vma->vm_end - vma->vm_start); - if (!area) - return -ENOMEM; - spin_lock_irqsave(&kcov->lock, flags); size = kcov->size * sizeof(unsigned long); - if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 || + if (kcov->area == NULL || vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != size) { res = -EINVAL; goto exit; } - if (!kcov->area) { - kcov->area = area; - vma->vm_flags |= VM_DONTEXPAND; - spin_unlock_irqrestore(&kcov->lock, flags); - for (off = 0; off < size; off += PAGE_SIZE) { - page = vmalloc_to_page(kcov->area + off); - if (vm_insert_page(vma, vma->vm_start + off, page)) - WARN_ONCE(1, "vm_insert_page() failed"); - } - return 0; + spin_unlock_irqrestore(&kcov->lock, flags); + vma->vm_flags |= VM_DONTEXPAND; + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); } + return 0; exit: spin_unlock_irqrestore(&kcov->lock, flags); - vfree(area); return res; } @@ -564,31 +555,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, unsigned long arg) { struct task_struct *t; - unsigned long size, unused; + unsigned long flags, unused; int mode, i; struct kcov_remote_arg *remote_arg; struct kcov_remote *remote; - unsigned long flags; switch (cmd) { - case KCOV_INIT_TRACE: - /* - * Enable kcov in trace mode and setup buffer size. - * Must happen before anything else. - */ - if (kcov->mode != KCOV_MODE_DISABLED) - return -EBUSY; - /* - * Size must be at least 2 to hold current position and one PC. - * Later we allocate size * sizeof(unsigned long) memory, - * that must not overflow. - */ - size = arg; - if (size < 2 || size > INT_MAX / sizeof(unsigned long)) - return -EINVAL; - kcov->size = size; - kcov->mode = KCOV_MODE_INIT; - return 0; case KCOV_ENABLE: /* * Enable coverage for the current task. @@ -692,9 +664,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) struct kcov_remote_arg *remote_arg = NULL; unsigned int remote_num_handles; unsigned long remote_arg_size; - unsigned long flags; + unsigned long size, flags; + void *area; - if (cmd == KCOV_REMOTE_ENABLE) { + kcov = filep->private_data; + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + * + * First check the size argument - it must be at least 2 + * to hold the current position and one PC. 
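+		 *
+		 * The expected userspace sequence (per Documentation/
+		 * dev-tools/kcov.rst; sketch only, COVER_SIZE is the
+		 * documentation's example buffer size) is roughly:
+		 *
+		 *	fd = open("/sys/kernel/debug/kcov", O_RDWR);
+		 *	ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
+		 *	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+		 *		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+		 *	ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC);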
+ */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + area = vmalloc_user(size * sizeof(unsigned long)); + if (area == NULL) + return -ENOMEM; + spin_lock_irqsave(&kcov->lock, flags); + if (kcov->mode != KCOV_MODE_DISABLED) { + spin_unlock_irqrestore(&kcov->lock, flags); + vfree(area); + return -EBUSY; + } + kcov->area = area; + kcov->size = size; + kcov->mode = KCOV_MODE_INIT; + spin_unlock_irqrestore(&kcov->lock, flags); + return 0; + case KCOV_REMOTE_ENABLE: if (get_user(remote_num_handles, (unsigned __user *)(arg + offsetof(struct kcov_remote_arg, num_handles)))) return -EFAULT; @@ -710,16 +710,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) return -EINVAL; } arg = (unsigned long)remote_arg; + fallthrough; + default: + /* + * All other commands can be normally executed under a spin lock, so we + * obtain and release it here in order to simplify kcov_ioctl_locked(). + */ + spin_lock_irqsave(&kcov->lock, flags); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock_irqrestore(&kcov->lock, flags); + kfree(remote_arg); + return res; } - - kcov = filep->private_data; - spin_lock_irqsave(&kcov->lock, flags); - res = kcov_ioctl_locked(kcov, cmd, arg); - spin_unlock_irqrestore(&kcov->lock, flags); - - kfree(remote_arg); - - return res; } static const struct file_operations kcov_fops = { diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile index c2bb07f5bcc7..4f35d1bced6a 100644 --- a/kernel/kcsan/Makefile +++ b/kernel/kcsan/Makefile @@ -8,9 +8,12 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \ + $(call cc-option,-mno-outline-atomics) \ -fno-stack-protector -DDISABLE_BRANCH_PROFILING obj-y := core.o debugfs.o report.o + +KCSAN_INSTRUMENT_BARRIERS_selftest.o := y obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 4b84c8e7884b..fe12dfe254ec 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -40,15 +40,17 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644); module_param_named(skip_watch, kcsan_skip_watch, long, 0644); module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444); +#ifdef CONFIG_KCSAN_WEAK_MEMORY +static bool kcsan_weak_memory = true; +module_param_named(weak_memory, kcsan_weak_memory, bool, 0644); +#else +#define kcsan_weak_memory false +#endif + bool kcsan_enabled; /* Per-CPU kcsan_ctx for interrupts */ static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = { - .disable_count = 0, - .atomic_next = 0, - .atomic_nest_count = 0, - .in_flat_atomic = false, - .access_mask = 0, .scoped_accesses = {LIST_POISON1, NULL}, }; @@ -209,15 +211,17 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip); static noinline void kcsan_check_scoped_accesses(void) { struct kcsan_ctx *ctx = get_ctx(); - struct list_head *prev_save = ctx->scoped_accesses.prev; struct kcsan_scoped_access *scoped_access; - ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */ + if (ctx->disable_scoped) + return; + + ctx->disable_scoped++; list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) { check_access(scoped_access->ptr, scoped_access->size, scoped_access->type, scoped_access->ip); } - ctx->scoped_accesses.prev = prev_save; + ctx->disable_scoped--; } /* Rules for generic atomic accesses. Called from fast-path. 
*/ @@ -325,6 +329,21 @@ static void delay_access(int type) udelay(delay); } +/* + * Reads the instrumented memory for value change detection; value change + * detection is currently done for accesses up to a size of 8 bytes. + */ +static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size) +{ + switch (size) { + case 1: return READ_ONCE(*(const u8 *)ptr); + case 2: return READ_ONCE(*(const u16 *)ptr); + case 4: return READ_ONCE(*(const u32 *)ptr); + case 8: return READ_ONCE(*(const u64 *)ptr); + default: return 0; /* Ignore; we do not diff the values. */ + } +} + void kcsan_save_irqtrace(struct task_struct *task) { #ifdef CONFIG_TRACE_IRQFLAGS @@ -339,6 +358,76 @@ void kcsan_restore_irqtrace(struct task_struct *task) #endif } +static __always_inline int get_kcsan_stack_depth(void) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + return current->kcsan_stack_depth; +#else + BUILD_BUG(); + return 0; +#endif +} + +static __always_inline void add_kcsan_stack_depth(int val) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + current->kcsan_stack_depth += val; +#else + BUILD_BUG(); +#endif +} + +static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx) +{ +#ifdef CONFIG_KCSAN_WEAK_MEMORY + return ctx->disable_scoped ? NULL : &ctx->reorder_access; +#else + return NULL; +#endif +} + +static __always_inline bool +find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, + int type, unsigned long ip) +{ + struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx); + + if (!reorder_access) + return false; + + /* + * Note: If accesses are repeated while reorder_access is identical, + * never matches the new access, because !(type & KCSAN_ACCESS_SCOPED). + */ + return reorder_access->ptr == ptr && reorder_access->size == size && + reorder_access->type == type && reorder_access->ip == ip; +} + +static inline void +set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, + int type, unsigned long ip) +{ + struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx); + + if (!reorder_access || !kcsan_weak_memory) + return; + + /* + * To avoid nested interrupts or scheduler (which share kcsan_ctx) + * reading an inconsistent reorder_access, ensure that the below has + * exclusive access to reorder_access by disallowing concurrent use. + */ + ctx->disable_scoped++; + barrier(); + reorder_access->ptr = ptr; + reorder_access->size = size; + reorder_access->type = type | KCSAN_ACCESS_SCOPED; + reorder_access->ip = ip; + reorder_access->stack_depth = get_kcsan_stack_depth(); + barrier(); + ctx->disable_scoped--; +} + /* * Pull everything together: check_access() below contains the performance * critical operations; the fast-path (including check_access) functions should @@ -377,8 +466,10 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr, * The access_mask check relies on value-change comparison. To avoid * reporting a race where e.g. the writer set up the watchpoint, but the * reader has access_mask!=0, we have to ignore the found watchpoint. + * + * reorder_access is never created from an access with access_mask set. 
*/ - if (ctx->access_mask) + if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip)) return; /* @@ -428,11 +519,13 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0; atomic_long_t *watchpoint; u64 old, new, diff; - unsigned long access_mask; enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE; + bool interrupt_watcher = kcsan_interrupt_watcher; unsigned long ua_flags = user_access_save(); struct kcsan_ctx *ctx = get_ctx(); + unsigned long access_mask = ctx->access_mask; unsigned long irq_flags = 0; + bool is_reorder_access; /* * Always reset kcsan_skip counter in slow-path to avoid underflow; see @@ -456,12 +549,32 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned } /* + * The local CPU cannot observe reordering of its own accesses, and + * therefore we need to take care of 2 cases to avoid false positives: + * + * 1. Races of the reordered access with interrupts. To avoid, if + * the current access is reorder_access, disable interrupts. + * 2. Avoid races of scoped accesses from nested interrupts (below). + */ + is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip); + if (is_reorder_access) + interrupt_watcher = false; + /* + * Avoid races of scoped accesses from nested interrupts (or scheduler). + * Assume setting up a watchpoint for a non-scoped (normal) access that + * also conflicts with a current scoped access. In a nested interrupt, + * which shares the context, it would check a conflicting scoped access. + * To avoid, disable scoped access checking. + */ + ctx->disable_scoped++; + + /* * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's * runtime is entered for every memory access, and potentially useful * information is lost if dirtied by KCSAN. */ kcsan_save_irqtrace(current); - if (!kcsan_interrupt_watcher) + if (!interrupt_watcher) local_irq_save(irq_flags); watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write); @@ -482,23 +595,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned * Read the current value, to later check and infer a race if the data * was modified via a non-instrumented access, e.g. from a device. */ - old = 0; - switch (size) { - case 1: - old = READ_ONCE(*(const u8 *)ptr); - break; - case 2: - old = READ_ONCE(*(const u16 *)ptr); - break; - case 4: - old = READ_ONCE(*(const u32 *)ptr); - break; - case 8: - old = READ_ONCE(*(const u64 *)ptr); - break; - default: - break; /* ignore; we do not diff the values */ - } + old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size); /* * Delay this thread, to increase probability of observing a racy @@ -510,23 +607,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned * Re-read value, and check if it is as expected; if not, we infer a * racy access. */ - access_mask = ctx->access_mask; - new = 0; - switch (size) { - case 1: - new = READ_ONCE(*(const u8 *)ptr); - break; - case 2: - new = READ_ONCE(*(const u16 *)ptr); - break; - case 4: - new = READ_ONCE(*(const u32 *)ptr); - break; - case 8: - new = READ_ONCE(*(const u64 *)ptr); - break; - default: - break; /* ignore; we do not diff the values */ + if (!is_reorder_access) { + new = read_instrumented_memory(ptr, size); + } else { + /* + * Reordered accesses cannot be used for value change detection, + * because the memory location may no longer be accessible and + * could result in a fault. 
+ */ + new = 0; + access_mask = 0; } diff = old ^ new; @@ -596,10 +686,20 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned */ remove_watchpoint(watchpoint); atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]); + out_unlock: - if (!kcsan_interrupt_watcher) + if (!interrupt_watcher) local_irq_restore(irq_flags); kcsan_restore_irqtrace(current); + ctx->disable_scoped--; + + /* + * Reordered accesses cannot be used for value change detection, + * therefore never consider for reordering if access_mask is set. + * ASSERT_EXCLUSIVE are not real accesses, ignore them as well. + */ + if (!access_mask && !is_assert) + set_reorder_access(ctx, ptr, size, type, ip); out: user_access_restore(ua_flags); } @@ -607,7 +707,6 @@ out: static __always_inline void check_access(const volatile void *ptr, size_t size, int type, unsigned long ip) { - const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0; atomic_long_t *watchpoint; long encoded_watchpoint; @@ -618,12 +717,14 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip) if (unlikely(size == 0)) return; +again: /* * Avoid user_access_save in fast-path: find_watchpoint is safe without * user_access_save, as the address that ptr points to is only used to * check if a watchpoint exists; ptr is never dereferenced. */ - watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write, + watchpoint = find_watchpoint((unsigned long)ptr, size, + !(type & KCSAN_ACCESS_WRITE), &encoded_watchpoint); /* * It is safe to check kcsan_is_enabled() after find_watchpoint in the @@ -637,9 +738,42 @@ check_access(const volatile void *ptr, size_t size, int type, unsigned long ip) else { struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */ - if (unlikely(should_watch(ctx, ptr, size, type))) + if (unlikely(should_watch(ctx, ptr, size, type))) { kcsan_setup_watchpoint(ptr, size, type, ip); - else if (unlikely(ctx->scoped_accesses.prev)) + return; + } + + if (!(type & KCSAN_ACCESS_SCOPED)) { + struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx); + + if (reorder_access) { + /* + * reorder_access check: simulates reordering of + * the access after subsequent operations. + */ + ptr = reorder_access->ptr; + type = reorder_access->type; + ip = reorder_access->ip; + /* + * Upon a nested interrupt, this context's + * reorder_access can be modified (shared ctx). + * We know that upon return, reorder_access is + * always invalidated by setting size to 0 via + * __tsan_func_exit(). Therefore we must read + * and check size after the other fields. + */ + barrier(); + size = READ_ONCE(reorder_access->size); + if (size) + goto again; + } + } + + /* + * Always checked last, right before returning from runtime; + * if reorder_access is valid, checked after it was checked. 
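+	 * (That is, once any pending reorder_access check has been
+	 * simulated via the "again" path above.)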
+	 */
+	if (unlikely(ctx->scoped_accesses.prev))
 			kcsan_check_scoped_accesses();
 	}
 }
@@ -814,6 +948,22 @@ void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
 }
 EXPORT_SYMBOL(__kcsan_check_access);
 
+#define DEFINE_MEMORY_BARRIER(name, order_before_cond)				\
+	void __kcsan_##name(void)						\
+	{									\
+		struct kcsan_scoped_access *sa = get_reorder_access(get_ctx());	\
+		if (!sa)							\
+			return;							\
+		if (order_before_cond)						\
+			sa->size = 0;						\
+	}									\
+	EXPORT_SYMBOL(__kcsan_##name)
+
+DEFINE_MEMORY_BARRIER(mb, true);
+DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND));
+DEFINE_MEMORY_BARRIER(release, true);
+
 /*
  * KCSAN uses the same instrumentation that is emitted by supported compilers
  * for ThreadSanitizer (TSAN).
@@ -926,19 +1076,56 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8);
 DEFINE_TSAN_VOLATILE_READ_WRITE(16);
 
 /*
- * The below are not required by KCSAN, but can still be emitted by the
- * compiler.
+ * Function entry and exit are used to determine the validity of reorder_access.
+ * Reordering of the access ends at the end of the function scope where the
+ * access happened. This is done for two reasons:
+ *
+ * 1. Artificially limits the scope where missing barriers are detected.
+ *    This minimizes false positives due to uninstrumented functions that
+ *    contain the required barriers but were missed.
+ *
+ * 2. Simplifies generating the stack trace of the access.
 */
 void __tsan_func_entry(void *call_pc);
-void __tsan_func_entry(void *call_pc)
+noinline void __tsan_func_entry(void *call_pc)
 {
+	if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+		return;
+
+	add_kcsan_stack_depth(1);
 }
 EXPORT_SYMBOL(__tsan_func_entry);
+
 void __tsan_func_exit(void);
-void __tsan_func_exit(void)
+noinline void __tsan_func_exit(void)
 {
+	struct kcsan_scoped_access *reorder_access;
+
+	if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
+		return;
+
+	reorder_access = get_reorder_access(get_ctx());
+	if (!reorder_access)
+		goto out;
+
+	if (get_kcsan_stack_depth() <= reorder_access->stack_depth) {
+		/*
+		 * Access check to catch cases where a write without a barrier
+		 * (supposed release) was the last access in a function: because
+		 * instrumentation is inserted before the real access, a data
+		 * race due to the write giving up a critical section would only
+		 * be caught if we do the conflicting access after.
+		 */
+		check_access(reorder_access->ptr, reorder_access->size,
+			     reorder_access->type, reorder_access->ip);
+		reorder_access->size = 0;
+		reorder_access->stack_depth = INT_MIN;
+	}
+out:
+	add_kcsan_stack_depth(-1);
 }
 EXPORT_SYMBOL(__tsan_func_exit);
+
 void __tsan_init(void);
 void __tsan_init(void)
 {
@@ -961,10 +1148,19 @@ EXPORT_SYMBOL(__tsan_init);
 * functions, whose job is to also execute the operation itself.
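+ *
+ * For example, an __atomic_fetch_add(p, v, __ATOMIC_RELEASE) in instrumented
+ * code becomes a call to __tsan_atomic32_fetch_add(p, v, __ATOMIC_RELEASE),
+ * which below both performs the RMW and models its release ordering via
+ * kcsan_atomic_builtin_memorder() -> __kcsan_release().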
*/ +static __always_inline void kcsan_atomic_builtin_memorder(int memorder) +{ + if (memorder == __ATOMIC_RELEASE || + memorder == __ATOMIC_SEQ_CST || + memorder == __ATOMIC_ACQ_REL) + __kcsan_release(); +} + #define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \ { \ + kcsan_atomic_builtin_memorder(memorder); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \ } \ @@ -974,6 +1170,7 @@ EXPORT_SYMBOL(__tsan_init); void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \ { \ + kcsan_atomic_builtin_memorder(memorder); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \ @@ -986,6 +1183,7 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \ { \ + kcsan_atomic_builtin_memorder(memorder); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ @@ -1018,6 +1216,7 @@ EXPORT_SYMBOL(__tsan_init); int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \ u##bits val, int mo, int fail_mo) \ { \ + kcsan_atomic_builtin_memorder(mo); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ @@ -1033,6 +1232,7 @@ EXPORT_SYMBOL(__tsan_init); u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \ int mo, int fail_mo) \ { \ + kcsan_atomic_builtin_memorder(mo); \ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \ check_access(ptr, bits / BITS_PER_BYTE, \ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \ @@ -1064,10 +1264,47 @@ DEFINE_TSAN_ATOMIC_OPS(64); void __tsan_atomic_thread_fence(int memorder); void __tsan_atomic_thread_fence(int memorder) { + kcsan_atomic_builtin_memorder(memorder); __atomic_thread_fence(memorder); } EXPORT_SYMBOL(__tsan_atomic_thread_fence); +/* + * In instrumented files, we emit instrumentation for barriers by mapping the + * kernel barriers to an __atomic_signal_fence(), which is interpreted specially + * and otherwise has no relation to a real __atomic_signal_fence(). No known + * kernel code uses __atomic_signal_fence(). + * + * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which + * are turned into calls to __tsan_atomic_signal_fence(), such instrumentation + * can be disabled via the __no_kcsan function attribute (vs. an explicit call + * which could not). When __no_kcsan is requested, __atomic_signal_fence() + * generates no code. + * + * Note: The result of using __atomic_signal_fence() with KCSAN enabled is + * potentially limiting the compiler's ability to reorder operations; however, + * if barriers were instrumented with explicit calls (without LTO), the compiler + * couldn't optimize much anyway. The result of a hypothetical architecture + * using __atomic_signal_fence() in normal code would be KCSAN false negatives. 
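+ *
+ * Conceptually, an instrumented kernel barrier is therefore mapped as:
+ *
+ *	smp_mb() -> __atomic_signal_fence(__KCSAN_BARRIER_TO_SIGNAL_FENCE_mb)
+ *		 -> __tsan_atomic_signal_fence() -> __kcsan_mb()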
+ */ void __tsan_atomic_signal_fence(int memorder); -void __tsan_atomic_signal_fence(int memorder) { } +noinline void __tsan_atomic_signal_fence(int memorder) +{ + switch (memorder) { + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb: + __kcsan_mb(); + break; + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb: + __kcsan_wmb(); + break; + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb: + __kcsan_rmb(); + break; + case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release: + __kcsan_release(); + break; + default: + break; + } +} EXPORT_SYMBOL(__tsan_atomic_signal_fence); diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 660729238588..a36fca063a73 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -16,9 +16,12 @@ #define pr_fmt(fmt) "kcsan_test: " fmt #include <kunit/test.h> +#include <linux/atomic.h> +#include <linux/bitops.h> #include <linux/jiffies.h> #include <linux/kcsan-checks.h> #include <linux/kernel.h> +#include <linux/mutex.h> #include <linux/sched.h> #include <linux/seqlock.h> #include <linux/spinlock.h> @@ -151,7 +154,7 @@ struct expect_report { /* Check observed report matches information in @r. */ __no_kcsan -static bool report_matches(const struct expect_report *r) +static bool __report_matches(const struct expect_report *r) { const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT; bool ret = false; @@ -213,9 +216,9 @@ static bool report_matches(const struct expect_report *r) const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC); const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED); const char *const access_type_aux = - (is_atomic && is_scoped) ? " (marked, scoped)" + (is_atomic && is_scoped) ? " (marked, reordered)" : (is_atomic ? " (marked)" - : (is_scoped ? " (scoped)" : "")); + : (is_scoped ? " (reordered)" : "")); if (i == 1) { /* Access 2 */ @@ -253,6 +256,40 @@ out: return ret; } +static __always_inline const struct expect_report * +__report_set_scoped(struct expect_report *r, int accesses) +{ + BUILD_BUG_ON(accesses > 3); + + if (accesses & 1) + r->access[0].type |= KCSAN_ACCESS_SCOPED; + else + r->access[0].type &= ~KCSAN_ACCESS_SCOPED; + + if (accesses & 2) + r->access[1].type |= KCSAN_ACCESS_SCOPED; + else + r->access[1].type &= ~KCSAN_ACCESS_SCOPED; + + return r; +} + +__no_kcsan +static bool report_matches_any_reordered(struct expect_report *r) +{ + return __report_matches(__report_set_scoped(r, 0)) || + __report_matches(__report_set_scoped(r, 1)) || + __report_matches(__report_set_scoped(r, 2)) || + __report_matches(__report_set_scoped(r, 3)); +} + +#ifdef |