diff options
31 files changed, 1363 insertions, 799 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 62feb8f31381..bb884c14b2f6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1572,12 +1572,28 @@ The above will cause the "foo" tracing instance to trigger a snapshot at the end of boot up. - ftrace_dump_on_oops[=orig_cpu] + ftrace_dump_on_oops[=2(orig_cpu) | =<instance>][,<instance> | + ,<instance>=2(orig_cpu)] [FTRACE] will dump the trace buffers on oops. - If no parameter is passed, ftrace will dump - buffers of all CPUs, but if you pass orig_cpu, it will - dump only the buffer of the CPU that triggered the - oops. + If no parameter is passed, ftrace will dump global + buffers of all CPUs, if you pass 2 or orig_cpu, it + will dump only the buffer of the CPU that triggered + the oops, or the specific instance will be dumped if + its name is passed. Multiple instance dump is also + supported, and instances are separated by commas. Each + instance supports only dump on CPU that triggered the + oops by passing 2 or orig_cpu to it. + + ftrace_dump_on_oops=foo=orig_cpu + + The above will dump only the buffer of "foo" instance + on CPU that triggered the oops. + + ftrace_dump_on_oops,foo,bar=orig_cpu + + The above will dump global buffer on all CPUs, the + buffer of "foo" instance on all CPUs and the buffer + of "bar" instance on CPU that triggered the oops. ftrace_filter=[function-list] [FTRACE] Limit the functions traced by the function diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index a9b71190399d..7fd43947832f 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -296,12 +296,30 @@ kernel panic). This will output the contents of the ftrace buffers to the console. This is very useful for capturing traces that lead to crashes and outputting them to a serial console. 
-= =================================================== -0 Disabled (default). -1 Dump buffers of all CPUs. -2 Dump the buffer of the CPU that triggered the oops. -= =================================================== - +======================= =========================================== +0 Disabled (default). +1 Dump buffers of all CPUs. +2(orig_cpu) Dump the buffer of the CPU that triggered the + oops. +<instance> Dump the specific instance buffer on all CPUs. +<instance>=2(orig_cpu) Dump the specific instance buffer on the CPU + that triggered the oops. +======================= =========================================== + +Multiple instance dump is also supported, and instances are separated +by commas. If global buffer also needs to be dumped, please specify +the dump mode (1/2/orig_cpu) first for global buffer. + +So for example to dump "foo" and "bar" instance buffer on all CPUs, +user can:: + + echo "foo,bar" > /proc/sys/kernel/ftrace_dump_on_oops + +To dump global buffer and "foo" instance buffer on all +CPUs along with the "bar" instance buffer on CPU that triggered the +oops, user can:: + + echo "1,foo,bar=2" > /proc/sys/kernel/ftrace_dump_on_oops ftrace_enabled, stack_tracer_enabled ==================================== diff --git a/Documentation/trace/user_events.rst b/Documentation/trace/user_events.rst index d8f12442aaa6..1d5a7626e6a6 100644 --- a/Documentation/trace/user_events.rst +++ b/Documentation/trace/user_events.rst @@ -92,6 +92,24 @@ The following flags are currently supported. process closes or unregisters the event. Requires CAP_PERFMON otherwise -EPERM is returned. ++ USER_EVENT_REG_MULTI_FORMAT: The event can contain multiple formats. This + allows programs to prevent themselves from being blocked when their event + format changes and they wish to use the same name. When this flag is used the + tracepoint name will be in the new format of "name.unique_id" vs the older + format of "name". 
A tracepoint will be created for each unique pair of name + and format. This means if several processes use the same name and format, + they will use the same tracepoint. If yet another process uses the same name, + but a different format than the other processes, it will use a different + tracepoint with a new unique id. Recording programs need to scan tracefs for + the various different formats of the event name they are interested in + recording. The system name of the tracepoint will also use "user_events_multi" + instead of "user_events". This prevents single-format event names conflicting + with any multi-format event names within tracefs. The unique_id is output as + a hex string. Recording programs should ensure the tracepoint name starts with + the event name they registered and has a suffix that starts with . and only + has hex characters. For example to find all versions of the event "test" you + can use the regex "^test\.[0-9a-fA-F]+$". + Upon successful registration the following is set. + write_index: The index to use for this file descriptor that represents this @@ -106,6 +124,9 @@ or perf record -e user_events:[name] when attaching/recording. **NOTE:** The event subsystem name by default is "user_events". Callers should not assume it will always be "user_events". Operators reserve the right in the future to change the subsystem name per-process to accommodate event isolation. +In addition if the USER_EVENT_REG_MULTI_FORMAT flag is used the tracepoint name +will have a unique id appended to it and the system name will be +"user_events_multi" as described above. Command Format ^^^^^^^^^^^^^^ @@ -156,7 +177,11 @@ to request deletes than the one used for registration due to this. to the event. If programs do not want auto-delete, they must use the USER_EVENT_REG_PERSIST flag when registering the event. Once that flag is used the event exists until DIAG_IOCSDEL is invoked. 
Both register and delete of an -event that persists requires CAP_PERFMON, otherwise -EPERM is returned. +event that persists requires CAP_PERFMON, otherwise -EPERM is returned. When +there are multiple formats of the same event name, all events with the same +name will be attempted to be deleted. If only a specific version is wanted to +be deleted then the /sys/kernel/tracing/dynamic_events file should be used for +that specific format of the event. Unregistering ------------- diff --git a/drivers/cxl/core/trace.h b/drivers/cxl/core/trace.h index bdf117a33744..e5f13260fc52 100644 --- a/drivers/cxl/core/trace.h +++ b/drivers/cxl/core/trace.h @@ -646,18 +646,18 @@ u64 cxl_trace_hpa(struct cxl_region *cxlr, struct cxl_memdev *memdev, u64 dpa); TRACE_EVENT(cxl_poison, - TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *region, + TP_PROTO(struct cxl_memdev *cxlmd, struct cxl_region *cxlr, const struct cxl_poison_record *record, u8 flags, __le64 overflow_ts, enum cxl_poison_trace_type trace_type), - TP_ARGS(cxlmd, region, record, flags, overflow_ts, trace_type), + TP_ARGS(cxlmd, cxlr, record, flags, overflow_ts, trace_type), TP_STRUCT__entry( __string(memdev, dev_name(&cxlmd->dev)) __string(host, dev_name(cxlmd->dev.parent)) __field(u64, serial) __field(u8, trace_type) - __string(region, region) + __string(region, cxlr ? 
dev_name(&cxlr->dev) : "") __field(u64, overflow_ts) __field(u64, hpa) __field(u64, dpa) @@ -677,10 +677,10 @@ TRACE_EVENT(cxl_poison, __entry->source = cxl_poison_record_source(record); __entry->trace_type = trace_type; __entry->flags = flags; - if (region) { - __assign_str(region, dev_name(&region->dev)); - memcpy(__entry->uuid, &region->params.uuid, 16); - __entry->hpa = cxl_trace_hpa(region, cxlmd, + if (cxlr) { + __assign_str(region, dev_name(&cxlr->dev)); + memcpy(__entry->uuid, &cxlr->params.uuid, 16); + __entry->hpa = cxl_trace_hpa(cxlr, cxlmd, __entry->dpa); } else { __assign_str(region, ""); diff --git a/drivers/gpu/drm/i915/display/intel_display_trace.h b/drivers/gpu/drm/i915/display/intel_display_trace.h index 99bdb833591c..7862e7cefe02 100644 --- a/drivers/gpu/drm/i915/display/intel_display_trace.h +++ b/drivers/gpu/drm/i915/display/intel_display_trace.h @@ -411,7 +411,7 @@ TRACE_EVENT(intel_fbc_activate, struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev), plane->pipe); __assign_str(dev, __dev_name_kms(plane)); - __assign_str(name, plane->base.name) + __assign_str(name, plane->base.name); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -438,7 +438,7 @@ TRACE_EVENT(intel_fbc_deactivate, struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev), plane->pipe); __assign_str(dev, __dev_name_kms(plane)); - __assign_str(name, plane->base.name) + __assign_str(name, plane->base.name); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); __entry->scanline = intel_get_crtc_scanline(crtc); @@ -465,7 +465,7 @@ TRACE_EVENT(intel_fbc_nuke, struct intel_crtc *crtc = intel_crtc_for_pipe(to_i915(plane->base.dev), plane->pipe); __assign_str(dev, __dev_name_kms(plane)); - __assign_str(name, plane->base.name) + __assign_str(name, plane->base.name); __entry->pipe = crtc->pipe; __entry->frame = intel_crtc_get_vblank_counter(crtc); 
__entry->scanline = intel_get_crtc_scanline(crtc); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h index 8510b88d4982..f3cd5a376eca 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_trace.h @@ -24,7 +24,7 @@ TRACE_EVENT(hclge_pf_mbx_get, __field(u8, code) __field(u8, subcode) __string(pciname, pci_name(hdev->pdev)) - __string(devname, &hdev->vport[0].nic.kinfo.netdev->name) + __string(devname, hdev->vport[0].nic.kinfo.netdev->name) __array(u32, mbx_data, PF_GET_MBX_LEN) ), @@ -33,7 +33,7 @@ TRACE_EVENT(hclge_pf_mbx_get, __entry->code = req->msg.code; __entry->subcode = req->msg.subcode; __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, &hdev->vport[0].nic.kinfo.netdev->name); + __assign_str(devname, hdev->vport[0].nic.kinfo.netdev->name); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_vf_to_pf_cmd)); ), @@ -56,7 +56,7 @@ TRACE_EVENT(hclge_pf_mbx_send, __field(u8, vfid) __field(u16, code) __string(pciname, pci_name(hdev->pdev)) - __string(devname, &hdev->vport[0].nic.kinfo.netdev->name) + __string(devname, hdev->vport[0].nic.kinfo.netdev->name) __array(u32, mbx_data, PF_SEND_MBX_LEN) ), @@ -64,7 +64,7 @@ TRACE_EVENT(hclge_pf_mbx_send, __entry->vfid = req->dest_vfid; __entry->code = le16_to_cpu(req->msg.code); __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, &hdev->vport[0].nic.kinfo.netdev->name); + __assign_str(devname, hdev->vport[0].nic.kinfo.netdev->name); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_pf_to_vf_cmd)); ), diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h index 5d4895bb57a1..b259e95dd53c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_trace.h @@ -23,7 +23,7 @@ 
TRACE_EVENT(hclge_vf_mbx_get, __field(u8, vfid) __field(u16, code) __string(pciname, pci_name(hdev->pdev)) - __string(devname, &hdev->nic.kinfo.netdev->name) + __string(devname, hdev->nic.kinfo.netdev->name) __array(u32, mbx_data, VF_GET_MBX_LEN) ), @@ -31,7 +31,7 @@ TRACE_EVENT(hclge_vf_mbx_get, __entry->vfid = req->dest_vfid; __entry->code = le16_to_cpu(req->msg.code); __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, &hdev->nic.kinfo.netdev->name); + __assign_str(devname, hdev->nic.kinfo.netdev->name); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_pf_to_vf_cmd)); ), @@ -55,7 +55,7 @@ TRACE_EVENT(hclge_vf_mbx_send, __field(u8, code) __field(u8, subcode) __string(pciname, pci_name(hdev->pdev)) - __string(devname, &hdev->nic.kinfo.netdev->name) + __string(devname, hdev->nic.kinfo.netdev->name) __array(u32, mbx_data, VF_SEND_MBX_LEN) ), @@ -64,7 +64,7 @@ TRACE_EVENT(hclge_vf_mbx_send, __entry->code = req->msg.code; __entry->subcode = req->msg.subcode; __assign_str(pciname, pci_name(hdev->pdev)); - __assign_str(devname, &hdev->nic.kinfo.netdev->name); + __assign_str(devname, hdev->nic.kinfo.netdev->name); memcpy(__entry->mbx_data, req, sizeof(struct hclge_mbx_vf_to_pf_cmd)); ), diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index e545e92c4408..1cd2076210b1 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -104,7 +104,7 @@ TRACE_EVENT(nfsd_compound, TP_fast_assign( __entry->xid = be32_to_cpu(rqst->rq_xid); __entry->opcnt = opcnt; - __assign_str_len(tag, tag, taglen); + __assign_str(tag, tag); ), TP_printk("xid=0x%08x opcnt=%u tag=%s", __entry->xid, __entry->opcnt, __get_str(tag) @@ -485,7 +485,7 @@ TRACE_EVENT(nfsd_dirent, TP_fast_assign( __entry->fh_hash = fhp ? 
knfsd_fh_hash(&fhp->fh_handle) : 0; __entry->ino = ino; - __assign_str_len(name, name, namlen) + __assign_str(name, name); ), TP_printk("fh_hash=0x%08x ino=%llu name=%s", __entry->fh_hash, __entry->ino, __get_str(name) @@ -896,7 +896,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __array(unsigned char, addr, sizeof(struct sockaddr_in6)) __field(unsigned long, flavor) __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) - __string_len(name, name, clp->cl_name.len) + __string_len(name, clp->cl_name.data, clp->cl_name.len) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; @@ -906,7 +906,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __entry->flavor = clp->cl_cred.cr_flavor; memcpy(__entry->verifier, (void *)&clp->cl_verifier, NFS4_VERIFIER_SIZE); - __assign_str_len(name, clp->cl_name.data, clp->cl_name.len); + __assign_str(name, clp->cl_name.data); ), TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", __entry->addr, __get_str(name), @@ -1976,7 +1976,7 @@ TRACE_EVENT(nfsd_ctl_time, TP_fast_assign( __entry->netns_ino = net->ns.inum; __entry->time = time; - __assign_str_len(name, name, namelen); + __assign_str(name, name); ), TP_printk("file=%s time=%d\n", __get_str(name), __entry->time diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 110e8a272189..dc067eeb6387 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -35,6 +35,17 @@ static DEFINE_MUTEX(eventfs_mutex); /* Choose something "unique" ;-) */ #define EVENTFS_FILE_INODE_INO 0x12c4e37 +struct eventfs_root_inode { + struct eventfs_inode ei; + struct dentry *events_dir; +}; + +static struct eventfs_root_inode *get_root_inode(struct eventfs_inode *ei) +{ + WARN_ON_ONCE(!ei->is_events); + return container_of(ei, struct eventfs_root_inode, ei); +} + /* Just try to make something consistent and unique */ static int eventfs_dir_ino(struct eventfs_inode *ei) { @@ -73,12 +84,18 @@ enum { static void release_ei(struct kref *ref) { struct eventfs_inode *ei = 
container_of(ref, struct eventfs_inode, kref); + struct eventfs_root_inode *rei; WARN_ON_ONCE(!ei->is_freed); kfree(ei->entry_attrs); kfree_const(ei->name); - kfree_rcu(ei, rcu); + if (ei->is_events) { + rei = get_root_inode(ei); + kfree_rcu(rei, ei.rcu); + } else { + kfree_rcu(ei, rcu); + } } static inline void put_ei(struct eventfs_inode *ei) @@ -408,19 +425,43 @@ static struct dentry *lookup_dir_entry(struct dentry *dentry, return NULL; } +static inline struct eventfs_inode *init_ei(struct eventfs_inode *ei, const char *name) +{ + ei->name = kstrdup_const(name, GFP_KERNEL); + if (!ei->name) + return NULL; + kref_init(&ei->kref); + return ei; +} + static inline struct eventfs_inode *alloc_ei(const char *name) { struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL); + struct eventfs_inode *result; if (!ei) return NULL; - ei->name = kstrdup_const(name, GFP_KERNEL); - if (!ei->name) { + result = init_ei(ei, name); + if (!result) kfree(ei); + + return result; +} + +static inline struct eventfs_inode *alloc_root_ei(const char *name) +{ + struct eventfs_root_inode *rei = kzalloc(sizeof(*rei), GFP_KERNEL); + struct eventfs_inode *ei; + + if (!rei) return NULL; - } - kref_init(&ei->kref); + + rei->ei.is_events = 1; + ei = init_ei(&rei->ei, name); + if (!ei) + kfree(rei); + return ei; } @@ -483,7 +524,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *result = NULL; ti = get_tracefs(dir); - if (!(ti->flags & TRACEFS_EVENT_INODE)) + if (WARN_ON_ONCE(!(ti->flags & TRACEFS_EVENT_INODE))) return ERR_PTR(-EIO); mutex_lock(&eventfs_mutex); @@ -495,7 +536,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, list_for_each_entry(ei_child, &ei->children, list) { if (strcmp(ei_child->name, name) != 0) continue; - if (ei_child->is_freed) + /* A child is freed and removed from the list at the same time */ + if (WARN_ON_ONCE(ei_child->is_freed)) goto out; result = lookup_dir_entry(dentry, ei, ei_child); goto out; @@ -709,6 +751,7 @@ 
struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry int size, void *data) { struct dentry *dentry = tracefs_start_creating(name, parent); + struct eventfs_root_inode *rei; struct eventfs_inode *ei; struct tracefs_inode *ti; struct inode *inode; @@ -721,7 +764,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry if (IS_ERR(dentry)) return ERR_CAST(dentry); - ei = alloc_ei(name); + ei = alloc_root_ei(name); if (!ei) goto fail; @@ -730,10 +773,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry goto fail; // Note: we have a ref to the dentry from tracefs_start_creating() - ei->events_dir = dentry; + rei = get_root_inode(ei); + rei->events_dir = dentry; + ei->entries = entries; ei->nr_entries = size; - ei->is_events = 1; ei->data = data; /* Save the ownership of this directory */ @@ -844,13 +888,15 @@ void eventfs_remove_dir(struct eventfs_inode *ei) */ void eventfs_remove_events_dir(struct eventfs_inode *ei) { + struct eventfs_root_inode *rei; struct dentry *dentry; - dentry = ei->events_dir; + rei = get_root_inode(ei); + dentry = rei->events_dir; if (!dentry) return; - ei->events_dir = NULL; + rei->events_dir = NULL; eventfs_remove_dir(ei); /* diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index beb3dcd0e434..15c26f9aaad4 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -36,7 +36,6 @@ struct eventfs_attr { * @children: link list into the child eventfs_inode * @entries: the array of entries representing the files in the directory * @name: the name of the directory to create - * @events_dir: the dentry of the events directory * @entry_attrs: Saved mode and ownership of the @d_children * @data: The private data to pass to the callbacks * @attr: Saved mode and ownership of eventfs_inode itself @@ -54,7 +53,6 @@ struct eventfs_inode { struct list_head children; const struct eventfs_entry *entries; const char *name; - struct dentry *events_dir; struct 
eventfs_attr *entry_attrs; void *data; struct eventfs_attr attr; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e8921871ef9a..54d53f345d14 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1151,7 +1151,9 @@ static inline void unpause_graph_tracing(void) { } #ifdef CONFIG_TRACING enum ftrace_dump_mode; -extern enum ftrace_dump_mode ftrace_dump_on_oops; +#define MAX_TRACER_SIZE 100 +extern char ftrace_dump_on_oops[]; +extern int ftrace_dump_on_oops_enabled(void); extern int tracepoint_printk; extern void disable_trace_on_warning(void); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d718fbec72dd..be2e8c0a187e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -215,6 +215,7 @@ enum ftrace_dump_mode { DUMP_NONE, DUMP_ALL, DUMP_ORIG, + DUMP_PARAM, }; #ifdef CONFIG_TRACING diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index fc6d0af56bb1..6f9bdfb09d1d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -17,6 +17,9 @@ struct dentry; struct bpf_prog; union bpf_attr; +/* Used for event string fields when they are NULL */ +#define EVENT_NULL_STR "(null)" + const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 88c0ba623ee6..689b6d71590e 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -199,7 +199,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) if (!(cond)) \ return; \ \ - if (WARN_ON_ONCE(RCUIDLE_COND(rcuidle))) \ + if (WARN_ONCE(RCUIDLE_COND(rcuidle), \ + "Bad RCU usage for tracepoint")) \ return; \ \ /* keep srcu and sched-rcu usage consistent */ \ @@ -259,7 +260,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) TP_ARGS(args), \ TP_CONDITION(cond), 0); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ - 
WARN_ON_ONCE(!rcu_is_watching()); \ + WARN_ONCE(!rcu_is_watching(), \ + "RCU not watching for tracepoint"); \ } \ } \ __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 4a2abf9daa46..ac05ed06a071 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1331,18 +1331,18 @@ TRACE_EVENT(xs_stream_read_data, __field(ssize_t, err) __field(size_t, total) __string(addr, xprt ? xprt->address_strings[RPC_DISPLAY_ADDR] : - "(null)") + EVENT_NULL_STR) __string(port, xprt ? xprt->address_strings[RPC_DISPLAY_PORT] : - "(null)") + EVENT_NULL_STR) ), TP_fast_assign( __entry->err = err; __entry->total = total; __assign_str(addr, xprt ? - xprt->address_strings[RPC_DISPLAY_ADDR] : "(null)"); + xprt->address_strings[RPC_DISPLAY_ADDR] : EVENT_NULL_STR); __assign_str(port, xprt ? - xprt->address_strings[RPC_DISPLAY_PORT] : "(null)"); + xprt->address_strings[RPC_DISPLAY_PORT] : EVENT_NULL_STR); ), TP_printk("peer=[%s]:%s err=%zd total=%zu", __get_str(addr), @@ -1787,7 +1787,7 @@ TRACE_EVENT(svc_process, __string(service, name) __string(procedure, svc_proc_name(rqst)) __string(addr, rqst->rq_xprt ? - rqst->rq_xprt->xpt_remotebuf : "(null)") + rqst->rq_xprt->xpt_remotebuf : EVENT_NULL_STR) ), TP_fast_assign( @@ -1797,7 +1797,7 @@ TRACE_EVENT(svc_process, __assign_str(service, name); __assign_str(procedure, svc_proc_name(rqst)); __assign_str(addr, rqst->rq_xprt ? 
- rqst->rq_xprt->xpt_remotebuf : "(null)"); + rqst->rq_xprt->xpt_remotebuf : EVENT_NULL_STR); ), TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%s", diff --git a/include/trace/stages/stage2_data_offsets.h b/include/trace/stages/stage2_data_offsets.h index 469b6a64293d..8b0cff06d346 100644 --- a/include/trace/stages/stage2_data_offsets.h +++ b/include/trace/stages/stage2_data_offsets.h @@ -24,7 +24,7 @@ #define __array(type, item, len) #undef __dynamic_array -#define __dynamic_array(type, item, len) u32 item; +#define __dynamic_array(type, item, len) u32 item; const void *item##_ptr_; #undef __string #define __string(item, src) __dynamic_array(char, item, -1) @@ -45,7 +45,7 @@ #define __sockaddr(field, len) __dynamic_array(u8, field, len) #undef __rel_dynamic_array -#define __rel_dynamic_array(type, item, len) u32 item; +#define __rel_dynamic_array(type, item, len) u32 item; const void *item##_ptr_; #undef __rel_string #define __rel_string(item, src) __rel_dynamic_array(char, item, -1) diff --git a/include/trace/stages/stage5_get_offsets.h b/include/trace/stages/stage5_get_offsets.h index e30a13be46ba..c6a62dfb18ef 100644 --- a/include/trace/stages/stage5_get_offsets.h +++ b/include/trace/stages/stage5_get_offsets.h @@ -9,6 +9,16 @@ #undef __entry #define __entry entry +#ifndef __STAGE5_STRING_SRC_H +#define __STAGE5_STRING_SRC_H +static inline const char *__string_src(const char *str) +{ + if (!str) + return EVENT_NULL_STR; + return str; +} +#endif /* __STAGE5_STRING_SRC_H */ + /* * Fields should never declare an array: i.e. __field(int, arr[5]) * If they do, it will cause issues in parsing and possibly corrupt the @@ -47,10 +57,12 @@ #undef __string #define __string(item, src) __dynamic_array(char, item, \ - strlen((src) ? 
(const char *)(src) : "(null)") + 1) + strlen(__string_src(src)) + 1) \ + __data_offsets->item##_ptr_ = src; #undef __string_len -#define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1) +#define __string_len(item, src, len) __dynamic_array(char, item, (len) + 1)\ + __data_offsets->item##_ptr_ = src; #undef __vstring #define __vstring(item, fmt, ap) __dynamic_array(char, item, \ @@ -67,11 +79,14 @@ __data_size += __item_length; #undef __rel_string -#define __rel_string(item, src) __rel_dynamic_array(char, item, \ - strlen((src) ? (const char *)(src) : "(null)") + 1) +#define __rel_string(item, src) __rel_dynamic_array(char, item, \ + strlen(__string_src(src)) + 1) \ + __data_offsets->item##_ptr_ = src; #undef __rel_string_len -#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, (len) + 1) +#define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, (len) + 1)\ + __data_offsets->item##_ptr_ = src; + /* * __bitmask_size_in_bytes_raw is the number of bytes needed to hold * num_possible_cpus(). diff --git a/include/trace/stages/stage6_event_callback.h b/include/trace/stages/stage6_event_callback.h index 919b1a4da980..83da83a0c14f 100644 --- a/include/trace/stages/stage6_event_callback.h +++ b/include/trace/stages/stage6_event_callback.h @@ -32,14 +32,16 @@ #undef __assign_str #define __assign_str(dst, src) \ - strcpy(__get_str(dst), (src) ? (const char *)(src) : "(null)"); - -#undef __assign_str_len -#define __assign_str_len(dst, src, len) \ do { \ - memcpy(__get_str(dst), (src), (len)); \ - __get_str(dst)[len] = '\0'; \ - } while(0) + char *__str__ = __get_str(dst); \ + int __len__ = __get_dynamic_array_len(dst) - 1; \ + WARN_ON_ONCE(__builtin_constant_p(src) ? \ + strcmp((src), __data_offsets.dst##_ptr_) : \ + (src) != __data_offsets.dst##_ptr_); \ + memcpy(__str__, __data_offsets.dst##_ptr_ ? 
: \ + EVENT_NULL_STR, __len__); \ + __str__[__len__] = '\0'; \ + } while (0) #undef __assign_vstr #define __assign_vstr(dst, fmt, va) \ @@ -91,14 +93,13 @@ #define __rel_string_len(item, src, len) __rel_dynamic_array(char, item, -1) #undef __assign_rel_str -#define __assign_rel_str(dst, src) \ - strcpy(__get_rel_str(dst), (src) ? (const char *)(src) : "(null)"); - -#undef __assign_rel_str_len -#define __assign_rel_str_len(dst, src, len) \ +#define __assign_rel_str(dst) \ do { \ - memcpy(__get_rel_str(dst), (src), (len)); \ - __get_rel_str(dst)[len] = '\0'; \ + char *__str__ = __get_rel_str(dst); \ + int __len__ = __get_rel_dynamic_array_len(dst) - 1; \ + memcpy(__str__, __data_offsets.dst##_ptr_ ? : \ + EVENT_NULL_STR, __len__); \ + __str__[__len__] = '\0'; \ } while (0) #undef __rel_bitmask diff --git a/include/uapi/linux/user_events.h b/include/uapi/linux/user_events.h index f74f3aedd49c..a03de03dccbc 100644 --- a/include/uapi/linux/user_events.h +++ b/include/uapi/linux/user_events.h @@ -12,6 +12,7 @@ #include <linux/ioctl.h> #define USER_EVENTS_SYSTEM "user_events" +#define USER_EVENTS_MULTI_SYSTEM "user_events_multi" #define USER_EVENTS_PREFIX "u:" /* Create dynamic location entry within a 32-bit value */ @@ -22,8 +23,11 @@ enum user_reg_flag { /* Event will not delete upon last reference closing */ USER_EVENT_REG_PERSIST = 1U << 0, + /* Event will be allowed to have multiple formats */ + USER_EVENT_REG_MULTI_FORMAT = 1U << 1, + /* This value or above is currently non-ABI */ - USER_EVENT_REG_MAX = 1U << 1, + USER_EVENT_REG_MAX = 1U << 2, }; /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 157f7ce2942d..81cc974913bb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1710,9 +1710,9 @@ static struct ctl_table kern_table[] = { { .procname = "ftrace_dump_on_oops", .data = &ftrace_dump_on_oops, - .maxlen = sizeof(int), + .maxlen = MAX_TRACER_SIZE, .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dostring, }, { .procname = 
"traceoff_on_warning", diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83ba342aef31..da1710499698 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1160,7 +1160,7 @@ __ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) * Search a given @hash to see if a given instruction pointer (@ip) * exists in it. * - * Returns the entry that holds the @ip if found. NULL otherwise. + * Returns: the entry that holds the @ip if found. NULL otherwise. */ struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) @@ -1282,7 +1282,7 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) /** * ftrace_free_filter - remove all filters for an ftrace_ops - * @ops - the ops to remove the filters from + * @ops: the ops to remove the filters from */ void ftrace_free_filter(struct ftrace_ops *ops) { @@ -1587,7 +1587,7 @@ static struct dyn_ftrace *lookup_rec(unsigned long start, unsigned long end) * @end: end of range to search (inclusive). @end points to the last byte * to check. * - * Returns rec->ip if the related ftrace location is a least partly within + * Returns: rec->ip if the related ftrace location is a least partly within * the given address range. That is, the first address of the instruction * that is either a NOP or call to the function tracer. It checks the ftrace * internal tables to determine if the address belongs or not. @@ -1607,9 +1607,10 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end) * ftrace_location - return the ftrace location * @ip: the instruction pointer to check * - * If @ip matches the ftrace location, return @ip. - * If @ip matches sym+0, return sym's ftrace location. - * Otherwise, return 0. + * Returns: + * * If @ip matches the ftrace location, return @ip. + * * If @ip matches sym+0, return sym's ftrace location. + * * Otherwise, return 0. 
*/ unsigned long ftrace_location(unsigned long ip) { @@ -1639,7 +1640,7 @@ out: * @start: start of range to search * @end: end of range to search (inclusive). @end points to the last byte to check. * - * Returns 1 if @start and @end contains a ftrace location. + * Returns: 1 if @start and @end contains a ftrace location. * That is, the instruction that is either a NOP or call to * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. @@ -2574,7 +2575,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS * is not set, then it wants to convert to the normal callback. * - * Returns the address of the trampoline to set to + * Returns: the address of the trampoline to set to */ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) { @@ -2615,7 +2616,7 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec) * a function that saves all the regs. Basically the '_EN' version * represents the current state of the function. * - * Returns the address of the trampoline that is currently being called + * Returns: the address of the trampoline that is currently being called */ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) { @@ -2719,7 +2720,7 @@ struct ftrace_rec_iter { /** * ftrace_rec_iter_start - start up iterating over traced functions * - * Returns an iterator handle that is used to iterate over all + * Returns: an iterator handle that is used to iterate over all * the records that represent address locations where functions * are traced. * @@ -2751,7 +2752,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void) * ftrace_rec_iter_next - get the next record to process. * @iter: The handle to the iterator. * - * Returns the next iterator after the given iterator @iter. + * Returns: the next iterator after the given iterator @iter. 
*/ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) { @@ -2776,7 +2777,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) * ftrace_rec_iter_record - get the record at the iterator location * @iter: The current iterator location * - * Returns the record that the current @iter is at. + * Returns: the record that the current @iter is at. */ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter) { @@ -4010,6 +4011,8 @@ ftrace_avail_addrs_open(struct inode *inode, struct file *file) * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. * tracing_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). + * + * Returns: 0 on success or a negative errno value on failure */ int ftrace_regex_open(struct ftrace_ops *ops, int flag, @@ -4626,7 +4629,7 @@ struct ftrace_func_mapper { /** * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper * - * Returns a ftrace_func_mapper descriptor that can be used to map ips to data. + * Returns: a ftrace_func_mapper descriptor that can be used to map ips to data. */ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) { @@ -4646,7 +4649,7 @@ struct ftrace_func_mapper *allocate_ftrace_func_mapper(void) * @mapper: The mapper that has the ip maps * @ip: the instruction pointer to find the data for * - * Returns the data mapped to @ip if found otherwise NULL. The return + * Returns: the data mapped to @ip if found otherwise NULL. The return * is actually the address of the mapper data pointer. The address is * returned for use cases where the data is no bigger than a long, and * the user can use the data pointer as its data instead of having to @@ -4672,7 +4675,7 @@ void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, * @ip: The instruction pointer address to map @data to * @data: The data to map to @ip * - * Returns 0 on success otherwise an error. + * Returns: 0 on success otherwise an error. 
*/ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, unsigned long ip, void *data) @@ -4701,7 +4704,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, * @mapper: The mapper that has the ip maps * @ip: The instruction pointer address to remove the data from * - * Returns the data if it is found, otherwise NULL. + * Returns: the data if it is found, otherwise NULL. * Note, if the data pointer is used as the data itself, (see * ftrace_func_mapper_find_ip(), then the return value may be meaningless, * if the data pointer was set to zero. @@ -5625,10 +5628,10 @@ EXPORT_SYMBOL_GPL(modify_ftrace_direct); /** * ftrace_set_filter_ip - set a function to filter on in ftrace by address - * @ops - the ops to set the filter with - * @ip - the address to add to or remove from the filter. - * @remove - non zero to remove the ip from the filter - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @ip: the address to add to or remove from the filter. + * @remove: non zero to remove the ip from the filter + * @reset: non zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled * If @ip is NULL, it fails to update filter. @@ -5647,11 +5650,11 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); /** * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses - * @ops - the ops to set the filter with - * @ips - the array of addresses to add to or remove from the filter. - * @cnt - the number of addresses in @ips - * @remove - non zero to remove ips from the filter - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @ips: the array of addresses to add to or remove from the filter. + * @cnt: the number of addresses in @ips + * @remove: non zero to remove ips from the filter + * @reset: non zero to reset all filters before applying this filter. 
* * Filters denote which functions should be enabled when tracing is enabled * If @ips array or any ip specified within is NULL , it fails to update filter. @@ -5670,7 +5673,7 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter_ips); /** * ftrace_ops_set_global_filter - setup ops to use global filters - * @ops - the ops which will use the global filters + * @ops: the ops which will use the global filters * * ftrace users who need global function trace filtering should call this. * It can set the global filter only if ops were not initialized before. @@ -5694,10 +5697,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, /** * ftrace_set_filter - set a function to filter on in ftrace - * @ops - the ops to set the filter with - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the filter with + * @buf: the string that holds the function filter text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. @@ -5716,10 +5719,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); /** * ftrace_set_notrace - set a function to not trace in ftrace - * @ops - the ops to set the notrace filter with - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @ops: the ops to set the notrace filter with + * @buf: the string that holds the function notrace text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Notrace Filters denote which functions should not be enabled when tracing * is enabled. 
If @buf is NULL and reset is set, all functions will be enabled @@ -5738,9 +5741,9 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** * ftrace_set_global_filter - set a function to filter on with global tracers - * @buf - the string that holds the function filter text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @buf: the string that holds the function filter text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. @@ -5753,9 +5756,9 @@ EXPORT_SYMBOL_GPL(ftrace_set_global_filter); /** * ftrace_set_global_notrace - set a function to not trace with global tracers - * @buf - the string that holds the function notrace text. - * @len - the length of the string. - * @reset - non zero to reset all filters before applying this filter. + * @buf: the string that holds the function notrace text. + * @len: the length of the string. + * @reset: non-zero to reset all filters before applying this filter. * * Notrace Filters denote which functions should not be enabled when tracing * is enabled. If @buf is NULL and reset is set, all functions will be enabled @@ -7443,7 +7446,7 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func); * have its own recursion protection, then it should call the * ftrace_ops_assist_func() instead. * - * Returns the function that the trampoline should call for @ops. + * Returns: the function that the trampoline should call for @ops. */ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { @@ -7897,7 +7900,7 @@ void ftrace_kill(void) /** * ftrace_is_dead - Test if ftrace is dead or not. * - * Returns 1 if ftrace is "dead", zero otherwise. + * Returns: 1 if ftrace is "dead", zero otherwise. 
*/ int ftrace_is_dead(void) { @@ -8142,8 +8145,7 @@ static int kallsyms_callback(void *data, const char *name, unsigned long addr) * @addrs array, which needs to be big enough to store at least @cnt * addresses. * - * This function returns 0 if all provided symbols are found, - * -ESRCH otherwise. + * Returns: 0 if all provided symbols are found, -ESRCH otherwise. */ int ftrace_lookup_symbols(const char **sorted_syms, size_t cnt, unsigned long *addrs) { diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 09df645ab9c7..25476ead681b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -384,6 +384,7 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; + atomic_t seq; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -753,6 +754,9 @@ static void rb_wake_up_waiters(struct irq_work *work) { struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + /* For waiters waiting for the first wake up */ + (void)atomic_fetch_inc_release(&rbwork->seq); + wake_up_all(&rbwork->waiters); if (rbwork->full_waiters_pending || rbwork->wakeup_full) { /* Only cpu_buffer sets the above flags */ @@ -881,20 +885,21 @@ rb_wait_cond(struct rb_irq_work *rbwork, struct trace_buffer *buffer, return false; } +struct rb_wait_data { + struct rb_irq_work *irq_work; + int seq; +}; + /* * The default wait condition for ring_buffer_wait() is to just to exit the * wait loop the first time it is woken up. 
*/ static bool rb_wait_once(void *data) { - long *once = data; - - /* wait_event() actually calls this twice before scheduling*/ - if (*once > 1) - return true; + struct rb_wait_data *rdata = data; + struct rb_irq_work *rbwork = rdata->irq_work; - (*once)++; - return false; + return atomic_read_acquire(&rbwork->seq) != rdata->seq; } /** @@ -915,14 +920,9 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, struct ring_buffer_per_cpu *cpu_buffer; struct wait_queue_head *waitq; struct rb_irq_work *rbwork; - long once = 0; + struct rb_wait_data rdata; int ret = 0; - if (!cond) { - cond = rb_wait_once; - data = &once; - } - /* * Depending on what the caller is waiting for, either any * data in any cpu buffer, or a specific buffer, put the @@ -944,6 +944,14 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full, else waitq = &rbwork->waiters; + /* Set up to exit loop as soon as it is woken */ + if (!cond) { + cond = rb_wait_once; + rdata.irq_work = rbwork; + rdata.seq = atomic_read_acquire(&rbwork->seq); + data = &rdata; + } + ret = wait_event_interruptible((*waitq), rb_wait_cond(rbwork, buffer, cpu, full, cond, data)); @@ -1515,7 +1523,8 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), + mflags | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) goto free_pages; @@ -1600,7 +1609,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu) cpu_buffer->reader_page = bpage; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order); + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_ZERO, + cpu_buffer->buffer->subbuf_order); if (!page) goto fail_free_reader; bpage->page = page_address(page); @@ -4380,7 +4390,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) cpu_buffer = 
iter->cpu_buffer; reader = cpu_buffer->reader_page; head_page = cpu_buffer->head_page; - commit_page = cpu_buffer->commit_page; + commit_page = READ_ONCE(cpu_buffer->commit_page); commit_ts = commit_page->page->time_stamp; /* @@ -5568,7 +5578,8 @@ ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu) if (bpage->data) goto out; - page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, + page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO, cpu_buffer->buffer->subbuf_order); if (!page) { kfree(bpage); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ab4c1a1fbda8..233d1af39fff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -13,7 +13,7 @@ * Copyright (C) 2004 Nadia Yvette Chambers */ #include <linux/ring_buffer.h> -#include <generated/utsrelease.h> +#include <linux/utsname.h> #include <linux/stacktrace.h> #include <linux/writeback.h> #include <linux/kallsyms.h> @@ -39,7 +39,6 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/panic_notifier.h> -#include <linux/kmemleak.h> #include <linux/poll.h> #include <linux/nmi.h> #include <linux/fs.h> @@ -105,7 +104,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) * tracing is active, only save the comm when a trace event * occurred. */ -static DEFINE_PER_CPU(bool, trace_taskinfo_save); +DEFINE_PER_CPU(bool, trace_taskinfo_save); /* * Kill all tracing for good (never come back). @@ -131,9 +130,12 @@ cpumask_var_t __read_mostly tracing_buffer_mask; * /proc/sys/kernel/ftrace_dump_on_oops * Set 1 if you want to dump buffers of all CPUs * Set 2 if you want to dump the buffer of the CPU that triggered oops + * Set instance name if you want to dump the specific trace instance + * Multiple instance dump is also supported, and instances are seperated + * by commas. 
*/ - -enum ftrace_dump_mode ftrace_dump_on_oops; +/* Set to string format zero to disable by default */ +char ftrace_dump_on_oops[MAX_TRACER_SIZE] = "0"; /* When set, tracing will stop when a WARN*() is hit */ int __disable_trace_on_warning; @@ -179,7 +181,6 @@ static void ftrace_trace_userstack(struct trace_array *tr, struct trace_buffer *buffer, unsigned int trace_ctx); -#define MAX_TRACER_SIZE 100 static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; @@ -202,19 +203,33 @@ static int __init set_cmdline_ftrace(char *str) } __setup("ftrace=", set_cmdline_ftrace); +int ftrace_dump_on_oops_enabled(void) +{ + if (!strcmp("0", ftrace_dump_on_oops)) + return 0; + else + return 1; +} + static int __init set_ftrace_dump_on_oops(char *str) { - if (*str++ != '=' || !*str || !strcmp("1", str)) { - ftrace_dump_on_oops = DUMP_ALL; + if (!*str) { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); return 1; } - if (!strcmp("orig_cpu", str) || !strcmp("2", str)) { - ftrace_dump_on_oops = DUMP_ORIG; - return 1; - } + if (*str == ',') { + strscpy(ftrace_dump_on_oops, "1", MAX_TRACER_SIZE); + strscpy(ftrace_dump_on_oops + 1, str, MAX_TRACER_SIZE - 1); + return 1; + } + + if (*str++ == '=') { + strscpy(ftrace_dump_on_oops, str, MAX_TRACER_SIZE); + return 1; + } - return 0; + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); @@ -1301,6 +1316,50 @@ static void free_snapshot(struct trace_array *tr) tr->allocated_snapshot = false; } +static int tracing_arm_snapshot_locked(struct trace_array *tr) +{ + int ret; + + lockdep_assert_held(&trace_types_lock); + + spin_lock(&tr->snapshot_trigger_lock); + if (tr->snapshot == UINT_MAX) { + spin_unlock(&tr->snapshot_trigger_lock); + return -EBUSY; + } + + tr->snapshot++; + spin_unlock(&tr->snapshot_trigger_lock); + + ret = tracing_alloc_snapshot_instance(tr); + if (ret) { + spin_lock(&tr->snapshot_trigger_lock); + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); + } + + return 
ret; +} + +int tracing_arm_snapshot(struct trace_array *tr) +{ + int ret; + + mutex_lock(&trace_types_lock); + ret = tracing_arm_snapshot_locked(tr); + mutex_unlock(&trace_types_lock); + + return ret; +} + +void tracing_disarm_snapshot(struct trace_array *tr) +{ + spin_lock(&tr->snapshot_trigger_lock); + if (!WARN_ON(!tr->snapshot)) + tr->snapshot--; + spin_unlock(&tr->snapshot_trigger_lock); +} + /** * tracing_alloc_snapshot - allocate snapshot buffer. * @@ -1374,10 +1433,6 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, mutex_lock(&trace_types_lock); - ret = tracing_alloc_snapshot_instance(tr); - if (ret) - goto fail_unlock; - if (tr->current_trace->use_max_tr) { ret = -EBUSY; goto fail_unlock; @@ -1396,6 +1451,10 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, goto fail_unlock; } + ret = tracing_arm_snapshot_locked(tr); + if (ret) + goto fail_unlock; + local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = cond_snapshot; @@ -1440,6 +1499,8 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) arch_spin_unlock(&tr->max_lock); local_irq_enable(); + tracing_disarm_snapshot(tr); + return ret; } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); @@ -1482,6 +1543,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) } EXPORT_SYMBOL_GPL(tracing_snapshot_cond_disable); #define free_snapshot(tr) do { } while (0) +#define tracing_arm_snapshot_locked(tr) ({ -EBUSY; }) #endif /* CONFIG_TRACER_SNAPSHOT */ void tracer_tracing_off(struct trace_array *tr) @@ -2320,98 +2382,6 @@ void tracing_reset_all_online_cpus(void) mutex_unlock(&trace_types_lock); } -/* - * The tgid_map array maps from pid to tgid; i.e. the value stored at index i - * is the tgid last observed corresponding to pid=i. - */ -static int *tgid_map; - -/* The maximum valid index into tgid_map. 
*/ -static size_t tgid_map_max; - -#define SAVED_CMDLINES_DEFAULT 128 -#define NO_CMDLINE_MAP UINT_MAX -/* - * Preemption must be disabled before acquiring trace_cmdline_lock. - * The various trace_arrays' max_lock must be acquired in a context - * where interrupt is disabled. - */ -static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; -struct saved_cmdlines_buffer { - unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; - unsigned *map_cmdline_to_pid; - unsigned cmdline_num; - int cmdline_idx; - char saved_cmdlines[]; -}; -static struct saved_cmdlines_buffer *savedcmd; - -static inline char *get_saved_cmdlines(int idx) -{ - return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; -} - -static inline void set_cmdline(int idx, const char *cmdline) -{ - strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); -} - -static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) -{ - int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); - - kfree(s->map_cmdline_to_pid); - kmemleak_free(s); - free_pages((unsigned long)s, order); -} - -static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) -{ - struct saved_cmdlines_buffer *s; - struct page *page; - int orig_size, size; - int order; - - /* Figure out how much is needed to hold the given number of cmdlines */ - orig_size = sizeof(*s) + val * TASK_COMM_LEN; - order = get_order(orig_size); - size = 1 << (order + PAGE_SHIFT); - page = alloc_pages(GFP_KERNEL, order); - if (!page) - return NULL; - - s = page_address(page); - kmemleak_alloc(s, size, 1, GFP_KERNEL); - memset(s, 0, sizeof(*s)); - - /* Round up to actual allocation */ - val = (size - sizeof(*s)) / TASK_COMM_LEN; - s->cmdline_num = val; - - s->map_cmdline_to_pid = kmalloc_array(val, - sizeof(*s->map_cmdline_to_pid), - GFP_KERNEL); - if (!s->map_cmdline_to_pid) { - free_saved_cmdlines_buffer(s); - return NULL; - } - - s->cmdline_idx = 0; - memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, - 
sizeof(s->map_pid_to_cmdline)); - memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, - val * sizeof(*s->map_cmdline_to_pid)); - - return s; -} - -static int trace_create_savedcmd(void) -{ - savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); - - return savedcmd ? 0 : -ENOMEM; -} - int is_tracing_stopped(void) { return global_trace.stop_count; @@ -2504,201 +2474,6 @@ void tracing_stop(void) return tracing_stop_tr(&global_trace); } -static int trace_save_cmdline(struct task_struct *tsk) -{ - unsigned tpid, idx; - - /* treat recording of idle task as a success */ - if (!tsk->pid) - return 1; - - tpid = tsk->pid & (PID_MAX_DEFAULT - 1); - - /* - * It's not the end of the world if we don't get - * the lock, but we also don't want to spin - * nor do we want to disable interrupts, - * so if we miss here, then better luck next time. - * - * This is called within the scheduler and wake up, so interrupts - * had better been disabled and run queue lock been held. - */ - lockdep_assert_preemption_disabled(); - if (!arch_spin_trylock(&trace_cmdline_lock)) - return 0; - - idx = savedcmd->map_pid_to_cmdline[tpid]; - if (idx == NO_CMDLINE_MAP) { - idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; - - savedcmd->map_pid_to_cmdline[tpid] = idx; - savedcmd->cmdline_idx = idx; - } - - savedcmd->map_cmdline_to_pid[idx] = tsk->pid; - set_cmdline(idx, tsk->comm); - - arch_spin_unlock(&trace_cmdline_lock); - - return 1; -} - -static void __trace_find_cmdline(int pid, char comm[]) -{ - unsigned map; - int tpid; - - if (!pid) { - strcpy(comm, "<idle>"); - return; - } - - if (WARN_ON_ONCE(pid < 0)) { - strcpy(comm, "<XXX>"); - return; - } - - tpid = pid & (PID_MAX_DEFAULT - 1); - map = savedcmd->map_pid_to_cmdline[tpid]; - if (map != NO_CMDLINE_MAP) { - tpid = savedcmd->map_cmdline_to_pid[map]; - if (tpid == pid) { - strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); - return; - } - } - strcpy(comm, "<...>"); -} - -void trace_find_cmdline(int pid, char comm[]) -{ - 
preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - - __trace_find_cmdline(pid, comm); - - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); -} - -static int *trace_find_tgid_ptr(int pid) -{ - /* - * Pairs with the smp_store_release in set_tracer_flag() to ensure that - * if we observe a non-NULL tgid_map then we also observe the correct - * tgid_map_max. - */ - int *map = smp_load_acquire(&tgid_map); - - if (unlikely(!map || pid > tgid_map_max)) - return NULL; - - return &map[pid]; -} - -int trace_find_tgid(int pid) -{ - int *ptr = trace_find_tgid_ptr(pid); - - return ptr ? *ptr : 0; -} - -static int trace_save_tgid(struct task_struct *tsk) -{ - int *ptr; - - /* treat recording of idle task as a success */ - if (!tsk->pid) - return 1; - - ptr = trace_find_tgid_ptr(tsk->pid); - if (!ptr) - return 0; - - *ptr = tsk->tgid; - return 1; -} - -static bool tracing_record_taskinfo_skip(int flags) -{ - if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) - return true; - if (!__this_cpu_read(trace_taskinfo_save)) - return true; - return false; -} - -/** - * tracing_record_taskinfo - record the task info of a task - * - * @task: task to record - * @flags: TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid - */ -void tracing_record_taskinfo(struct task_struct *task, int flags) -{ - bool done; - - if (tracing_record_taskinfo_skip(flags)) - return; - - /* - * Record as much task information as possible. If some fail, continue - * to try to record the others. - */ - done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); - - /* If recording any information failed, retry again soon. 
*/ - if (!done) - return; - - __this_cpu_write(trace_taskinfo_save, false); -} - -/** - * tracing_record_taskinfo_sched_switch - record task info for sched_switch - * - * @prev: previous task during sched_switch - * @next: next task during sched_switch - * @flags: TRACE_RECORD_CMDLINE for recording comm - * TRACE_RECORD_TGID for recording tgid - */ -void tracing_record_taskinfo_sched_switch(struct task_struct *prev, - struct task_struct *next, int flags) -{ - bool done; - - if (tracing_record_taskinfo_skip(flags)) - return; - - /* - * Record as much task information as possible. If some fail, continue - * to try to record the others. - */ - done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); - done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); - done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); - - /* If recording any information failed, retry again soon. */ - if (!done) - return; - - __this_cpu_write(trace_taskinfo_save, false); -} - -/* Helpers to record a specific task information */ -void tracing_record_cmdline(struct task_struct *task) -{ - tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); -} - -void tracing_record_tgid(struct task_struct *task) -{ - tracing_record_taskinfo(task, TRACE_RECORD_TGID); -} - /* * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq * overflowed, and TRACE_TYPE_HANDLED otherwise. 
This helper function @@ -4389,7 +4164,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", - name, UTS_RELEASE); + name, init_utsname()->release); seq_puts(m, "# -----------------------------------" "---------------------------------\n"); seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" @@ -5457,8 +5232,6 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { - int *map; - if ((mask == TRACE_ITER_RECORD_TGID) || (mask == TRACE_ITER_RECORD_CMD)) lockdep_assert_held(&event_mutex); @@ -5481,20 +5254,8 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_RECORD_TGID) { - if (!tgid_map) { - tgid_map_max = pid_max; - map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), - GFP_KERNEL); - /* - * Pairs with smp_load_acquire() in - * trace_find_tgid_ptr() to ensure that if it observes - * the tgid_map we just allocated then it also observes - * the corresponding tgid_map_max value. 
- */ - smp_store_release(&tgid_map, map); - } - if (!tgid_map) { + if (trace_alloc_tgid_map() < 0) { tr->trace_flags &= ~TRACE_ITER_RECORD_TGID; return -ENOMEM; } @@ -5938,207 +5699,6 @@ static const struct file_operations tracing_readme_fops = { .llseek = generic_file_llseek, }; -static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) -{ - int pid = ++(*pos); - - return trace_find_tgid_ptr(pid); -} - -static void *saved_tgids_start(struct seq_file *m, loff_t *pos) -{ - int pid = *pos; - - return trace_find_tgid_ptr(pid); -} - -static void saved_tgids_stop(struct seq_file *m, void *v) -{ -} - -static int saved_tgids_show(struct seq_file *m, void *v) -{ - int *entry = (int *)v; - int pid = entry - tgid_map; - int tgid = *entry; - - if (tgid == 0) - return SEQ_SKIP; - - seq_printf(m, "%d %d\n", pid, tgid); - return 0; -} - -static const struct seq_operations tracing_saved_tgids_seq_ops = { - .start = saved_tgids_start, - .stop = saved_tgids_stop, - .next = saved_tgids_next, - .show = saved_tgids_show, -}; - -static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = tracing_check_open_get_tr(NULL); - if (ret) - return ret; - - return seq_open(filp, &tracing_saved_tgids_seq_ops); -} - - -static const struct file_operations tracing_saved_tgids_fops = { - .open = tracing_saved_tgids_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) -{ - unsigned int *ptr = v; - - if (*pos || m->count) - ptr++; - - (*pos)++; - - for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; - ptr++) { - if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) - continue; - - return ptr; - } - - return NULL; -} - -static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) -{ - void *v; - loff_t l = 0; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - - v = &savedcmd->map_cmdline_to_pid[0]; - while (l <= *pos) { - v = 
saved_cmdlines_next(m, v, &l); - if (!v) - return NULL; - } - - return v; -} - -static void saved_cmdlines_stop(struct seq_file *m, void *v) -{ - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); -} - -static int saved_cmdlines_show(struct seq_file *m, void *v) -{ - char buf[TASK_COMM_LEN]; - unsigned int *pid = v; - - __trace_find_cmdline(*pid, buf); - seq_printf(m, "%d %s\n", *pid, buf); - return 0; -} - -static const struct seq_operations tracing_saved_cmdlines_seq_ops = { - .start = saved_cmdlines_start, - .next = saved_cmdlines_next, - .stop = saved_cmdlines_stop, - .show = saved_cmdlines_show, -}; - -static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) -{ - int ret; - - ret = tracing_check_open_get_tr(NULL); - if (ret) - return ret; - - return seq_open(filp, &tracing_saved_cmdlines_seq_ops); -} - -static const struct file_operations tracing_saved_cmdlines_fops = { - .open = tracing_saved_cmdlines_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static ssize_t -tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static int tracing_resize_saved_cmdlines(unsigned int val) -{ - struct saved_cmdlines_buffer *s, *savedcmd_temp; - - s = allocate_cmdlines_buffer(val); - if (!s) - return -ENOMEM; - - preempt_disable(); - arch_spin_lock(&trace_cmdline_lock); - savedcmd_temp = savedcmd; - savedcmd = s; - arch_spin_unlock(&trace_cmdline_lock); - preempt_enable(); - free_saved_cmdlines_buffer(savedcmd_temp); - - return 0; -} - -static ssize_t -tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long val; - int 
ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - /* must have at least 1 entry or less than PID_MAX_DEFAULT */ - if (!val || val > PID_MAX_DEFAULT) - return -EINVAL; - - ret = tracing_resize_saved_cmdlines((unsigned int)val); - if (ret < 0) - return ret; - - *ppos += cnt; - - return cnt; -} - -static const struct file_operations tracing_saved_cmdlines_size_fops = { - .open = tracing_open_generic, - .read = tracing_saved_cmdlines_size_read, - .write = tracing_saved_cmdlines_size_write, -}; - #ifdef CONFIG_TRACE_EVAL_MAP_FILE static union trace_eval_map_item * update_eval_map(union trace_eval_map_item *ptr) @@ -6615,11 +6175,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) */ synchronize_rcu(); free_snapshot(tr); + tracing_disarm_snapshot(tr); } - if (t->use_max_tr && !tr->allocated_snapshot) { - ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) + if (!had_max_tr && t->use_max_tr) { + ret = tracing_arm_snapshot_locked(tr); + if (ret) goto out; } #else @@ -6628,8 +6189,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (t->init) { ret = tracer_init(t, tr); - if (ret) + if (ret) { +#ifdef CONFIG_TRACER_MAX_TRACE + if (t->use_max_tr) + tracing_disarm_snapshot(tr); +#endif goto out; + } } tr->current_trace = t; @@ -7731,10 +7297,11 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, if (tr->allocated_snapshot) ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->array_buffer, iter->cpu_file); - else - ret = tracing_alloc_snapshot_instance(tr); - if (ret < 0) + + ret = tracing_arm_snapshot_locked(tr); + if (ret) break; + /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { local_irq_disable(); @@ -7744,6 +7311,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, smp_call_function_single(iter->cpu_file, tracing_swap_cpu_buffer, (void *)tr, 1); } + tracing_disarm_snapshot(tr); break; default: if 
(tr->allocated_snapshot) { @@ -8875,8 +8443,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; - if (glob[0] == '!') - return unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (glob[0] == '!') { + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); + if (!ret) + tracing_disarm_snapshot(tr); + + return ret; + } if (!param) goto out_reg; @@ -8895,12 +8468,13 @@ ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, return ret; out_reg: - ret = tracing_alloc_snapshot_instance(tr); + ret = tracing_arm_snapshot(tr); if (ret < 0) goto out; ret = register_ftrace_function_probe(glob, tr, ops, count); - + if (ret < 0) + tracing_disarm_snapshot(tr); out: return ret < 0 ? ret : 0; } @@ -9707,7 +9281,9 @@ trace_array_create_systems(const char *name, const char *systems) raw_spin_lock_init(&tr->start_lock); tr->max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&tr->snapshot_trigger_lock); +#endif tr->current_trace = &nop_trace; INIT_LIST_HEAD(&tr->systems); @@ -10272,14 +9848,14 @@ static struct notifier_block trace_die_notifier = { static int trace_die_panic_handler(struct notifier_block *self, unsigned long ev, void *unused) { - if (!ftrace_dump_on_oops) + if (!ftrace_dump_on_oops_enabled()) return NOTIFY_DONE; /* The die notifier requires DIE_OOPS to trigger */ if (self == &trace_die_notifier && ev != DIE_OOPS) return NOTIFY_DONE; - ftrace_dump(ftrace_dump_on_oops); + ftrace_dump(DUMP_PARAM); return NOTIFY_DONE; } @@ -10320,12 +9896,12 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -void trace_init_global_iter(struct trace_iterator *iter) +static void trace_init_iter(struct trace_iterator *iter, struct trace_array *tr) { - iter->tr = &global_trace; + iter->tr = tr; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; - iter->array_buffer 
= &global_trace.array_buffer; + iter->array_buffer = &tr->array_buffer; if (iter->trace && iter->trace->open) iter->trace->open(iter); @@ -10345,22 +9921,19 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->fmt_size = STATIC_FMT_BUF_SIZE; } -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +void trace_init_global_iter(struct trace_iterator *iter) +{ + trace_init_iter(iter, &global_trace); +} + +static void ftrace_dump_one(struct trace_array *tr, enum ftrace_dump_mode dump_mode) { /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; - static atomic_t dump_running; - struct trace_array *tr = &global_trace; unsigned int old_userobj; unsigned long flags; int cnt = 0, cpu; - /* Only allow one dump user at a time. */ - if (atomic_inc_return(&dump_running) != 1) { - atomic_dec(&dump_running); - return; - } - /* * Always turn off tracing when we dump. * We don't need to show trace output of what happens @@ -10369,12 +9942,12 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) * If the user does a sysrq-z, then they can re-enable * tracing with echo 1 > tracing_on. 
*/ - tracing_off(); + tracer_tracing_off(tr); local_irq_save(flags); /* Simulate the iterator */ - trace_init_global_iter(&iter); + trace_init_iter(&iter, tr); for_each_tracing_cpu(cpu) { atomic_inc(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); @@ -10385,21 +9958,15 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) /* don't look at user memory in panic mode */ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - switch (oops_dump_mode) { - case DUMP_ALL: - iter.cpu_file = RING_BUFFER_ALL_CPUS; - break; - case DUMP_ORIG: + if (dump_mode == DUMP_ORIG) iter.cpu_file = raw_smp_processor_id(); - break; - case DUMP_NONE: - goto out_enable; - default: - printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + else iter.cpu_file = RING_BUFFER_ALL_CPUS; - } - printk(KERN_TRACE "Dumping ftrace buffer:\n"); + if (tr == &global_trace) + printk(KERN_TRACE "Dumping ftrace buffer:\n"); + else + printk(KERN_TRACE "Dumping ftrace instance %s buffer:\n", tr->name); /* Did function tracer already get disabled? 
*/ if (ftrace_is_dead()) { @@ -10441,15 +10008,84 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) else printk(KERN_TRACE "---------------------------------\n"); - out_enable: tr->trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } - atomic_dec(&dump_running); local_irq_restore(flags); } + +static void ftrace_dump_by_param(void) +{ + bool first_param = true; + char dump_param[MAX_TRACER_SIZE]; + char *buf, *token, *inst_name; + struct trace_array *tr; + + strscpy(dump_param, ftrace_dump_on_oops, MAX_TRACER_SIZE); + buf = dump_param; + + while ((token = strsep(&buf, ",")) != NULL) { + if (first_param) { + first_param = false; + if (!strcmp("0", token)) + continue; + else if (!strcmp("1", token)) { + ftrace_dump_one(&global_trace, DUMP_ALL); + continue; + } + else if (!strcmp("2", token) || + !strcmp("orig_cpu", token)) { + ftrace_dump_one(&global_trace, DUMP_ORIG); + continue; + } + } + + inst_name = strsep(&token, "="); + tr = trace_array_find(inst_name); + if (!tr) { + printk(KERN_TRACE "Instance %s not found\n", inst_name); + continue; + } + + if (token && (!strcmp("2", token) || + !strcmp("orig_cpu", token))) + ftrace_dump_one(tr, DUMP_ORIG); + else + ftrace_dump_one(tr, DUMP_ALL); + } +} + +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +{ + static atomic_t dump_running; + + /* Only allow one dump user at a time. 
*/ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + + switch (oops_dump_mode) { + case DUMP_ALL: + ftrace_dump_one(&global_trace, DUMP_ALL); + break; + case DUMP_ORIG: + ftrace_dump_one(&global_trace, DUMP_ORIG); + break; + case DUMP_PARAM: + ftrace_dump_by_param(); + break; + case DUMP_NONE: + break; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + ftrace_dump_one(&global_trace, DUMP_ALL); + } + + atomic_dec(&dump_running); +} EXPORT_SYMBOL_GPL(ftrace_dump); #define WRITE_BUFSIZE 4096 @@ -10677,7 +10313,9 @@ __init static int tracer_alloc_buffers(void) global_trace.current_trace = &nop_trace; global_trace.max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; - +#ifdef CONFIG_TRACER_MAX_TRACE + spin_lock_init(&global_trace.snapshot_trigger_lock); +#endif ftrace_init_global_array_ops(&global_trace); init_trace_flags_index(&global_trace); @@ -10714,7 +10352,7 @@ __init static int tracer_alloc_buffers(void) out_free_pipe_cpumask: free_cpumask_var(global_trace.pipe_cpumask); out_free_savedcmd: - free_saved_cmdlines_buffer(savedcmd); + trace_free_saved_cmdlines_buffer(); out_free_temp_buffer: ring_buffer_free(temp_buffer); out_rm_hp_state: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 00f873910c5d..64450615ca0c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,8 +334,8 @@ struct trace_array { */ struct array_buffer max_buffer; bool allocated_snapshot; -#endif -#ifdef CONFIG_TRACER_MAX_TRACE + spinlock_t snapshot_trigger_lock; + unsigned int snapshot; unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; @@ -1375,6 +1375,16 @@ static inline void trace_buffer_unlock_commit(struct trace_array *tr, trace_buffer_unlock_commit_regs(tr, buffer, event, trace_ctx, NULL); } +DECLARE_PER_CPU(bool, trace_taskinfo_save); +int trace_save_cmdline(struct task_struct *tsk); +int trace_create_savedcmd(void); +int trace_alloc_tgid_map(void); +void 
trace_free_saved_cmdlines_buffer(void); + +extern const struct file_operations tracing_saved_cmdlines_fops; +extern const struct file_operations tracing_saved_tgids_fops; +extern const struct file_operations tracing_saved_cmdlines_size_fops; + DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DECLARE_PER_CPU(int, trace_buffered_event_cnt); void trace_buffered_event_disable(void); @@ -1973,12 +1983,16 @@ static inline void trace_event_eval_update(struct trace_eval_map **map, int len) #ifdef CONFIG_TRACER_SNAPSHOT void tracing_snapshot_instance(struct trace_array *tr); int tracing_alloc_snapshot_instance(struct trace_array *tr); +int tracing_arm_snapshot(struct trace_array *tr); +void tracing_disarm_snapshot(struct trace_array *tr); #else static inline void tracing_snapshot_instance(struct trace_array *tr) { } static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) { return 0; } +static inline int tracing_arm_snapshot(struct trace_array *tr) { return 0; } +static inline void tracing_disarm_snapshot(struct trace_array *tr) { } #endif #ifdef CONFIG_PREEMPT_TRACER diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 54d5fa35c90a..811b08439406 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -92,7 +92,6 @@ static void trace_do_benchmark(void) bm_total += delta; bm_totalsq += delta * delta; - if (bm_cnt > 1) { /* * Apply Welford's method to calculate standard deviation: @@ -105,7 +104,7 @@ static void trace_do_benchmark(void) stddev = 0; delta = bm_total; - do_div(delta, bm_cnt); + delta = div64_u64(delta, bm_cnt); avg = delta; if (stddev > 0) { @@ -127,7 +126,7 @@ static void trace_do_benchmark(void) seed = stddev; if (!last_seed) break; - do_div(seed, last_seed); + seed = div64_u64(seed, last_seed); seed += last_seed; do_div(seed, 2); } while (i++ < 10 && last_seed != seed); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 
b33c3861fbbb..4bec043c8690 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -597,20 +597,12 @@ out: return ret; } -/** - * unregister_trigger - Generic event_command @unreg implementation - * @glob: The raw string used to register the trigger - * @test: Trigger-specific data used to find the trigger to remove - * @file: The trace_event_file associated with the event - * - * Common implementation for event trigger unregistration. - * - * Usually used directly as the @unreg method in event command - * implementations. +/* + * True if the trigger was found and unregistered, else false. */ -static void unregister_trigger(char *glob, - struct event_trigger_data *test, - struct trace_event_file *file) +static bool try_unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data = NULL, *iter; @@ -626,8 +618,32 @@ static void unregister_trigger(char *glob, } } - if (data && data->ops->free) - data->ops->free(data); + if (data) { + if (data->ops->free) + data->ops->free(data); + + return true; + } + + return false; +} + +/** + * unregister_trigger - Generic event_command @unreg implementation + * @glob: The raw string used to register the trigger + * @test: Trigger-specific data used to find the trigger to remove + * @file: The trace_event_file associated with the event + * + * Common implementation for event trigger unregistration. + * + * Usually used directly as the @unreg method in event command + * implementations. 
+ */ +static void unregister_trigger(char *glob, + struct event_trigger_data *test, + struct trace_event_file *file) +{ + try_unregister_trigger(glob, test, file); } /* @@ -1470,12 +1486,23 @@ register_snapshot_trigger(char *glob, struct event_trigger_data *data, struct trace_event_file *file) { - int ret = tracing_alloc_snapshot_instance(file->tr); + int ret = tracing_arm_snapshot(file->tr); if (ret < 0) return ret; - return register_trigger(glob, data, file); + ret = register_trigger(glob, data, file); + if (ret < 0) + tracing_disarm_snapshot(file->tr); + return ret; +} + +static void unregister_snapshot_trigger(char *glob, + struct event_trigger_data *data, + struct trace_event_file *file) +{ + if (try_unregister_trigger(glob, data, file)) + tracing_disarm_snapshot(file->tr); } static int @@ -1510,7 +1537,7 @@ static struct event_command trigger_snapshot_cmd = { .trigger_type = ETT_SNAPSHOT, .parse = event_trigger_parse, .reg = register_snapshot_trigger, - .unreg = unregister_trigger, + .unreg = unregister_snapshot_trigger, .get_trigger_ops = snapshot_get_trigger_ops, .set_filter = set_trigger_filter, }; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index e76f5e1efdf2..70d428c394b6 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -34,7 +34,8 @@ /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 -#define EVENT_NAME(user_event) ((user_event)->tracepoint.name) +#define EVENT_NAME(user_event) ((user_event)->reg_name) +#define EVENT_TP_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 /* @@ -54,10 +55,13 @@ * allows isolation for events by various means. 
*/ struct user_event_group { - char *system_name; - struct hlist_node node; - struct mutex reg_mutex; + char *system_name; + char *system_multi_name; + struct hlist_node node; + struct mutex reg_mutex; DECLARE_HASHTABLE(register_table, 8); + /* ID that moves forward within the group for multi-event names */ + u64 multi_id; }; /* Group for init_user_ns mapping, top-most group */ @@ -78,6 +82,7 @@ static unsigned int current_user_events; */ struct user_event { struct user_event_group *group; + char *reg_name; struct tracepoint tracepoint; struct trace_event_call call; struct trace_event_class class; @@ -127,6 +132,8 @@ struct user_event_enabler { #define ENABLE_BIT(e) ((int)((e)->values & ENABLE_VAL_BIT_MASK)) +#define EVENT_MULTI_FORMAT(f) ((f) & USER_EVENT_REG_MULTI_FORMAT) + /* Used for asynchronous faulting in of pages */ struct user_event_enabler_fault { struct work_struct work; @@ -202,6 +209,8 @@ static struct user_event_mm *user_event_mm_get(struct user_event_mm *mm); static struct user_event_mm *user_event_mm_get_all(struct user_event *user); static void user_event_mm_put(struct user_event_mm *mm); static int destroy_user_event(struct user_event *user); +static bool user_fields_match(struct user_event *user, int argc, + const char **argv); static u32 user_event_key(char *name) { @@ -328,6 +337,7 @@ out: static void user_event_group_destroy(struct user_event_group *group) { kfree(group->system_name); + kfree(group->system_multi_name); kfree(group); } @@ -346,6 +356,11 @@ static char *user_event_group_system_name(void) return system_name; } +static char *user_event_group_system_multi_name(void) +{ + return kstrdup(USER_EVENTS_MULTI_SYSTEM, GFP_KERNEL); +} + static struct user_event_group *current_user_event_group(void) { return init_group; @@ -365,6 +380,11 @@ static struct user_event_group *user_event_group_create(void) if (!group->system_name) goto error; + group->system_multi_name = user_event_group_system_multi_name(); + + if (!group->system_multi_name) + 
goto error; + mutex_init(&group->reg_mutex); hash_init(group->register_table); @@ -1480,6 +1500,11 @@ static int destroy_user_event(struct user_event *user) hash_del(&user->node); user_event_destroy_validators(user); + + /* If we have different names, both must be freed */ + if (EVENT_NAME(user) != EVENT_TP_NAME(user)) + kfree(EVENT_TP_NAME(user)); + kfree(user->call.print_fmt); kfree(EVENT_NAME(user)); kfree(user); @@ -1493,17 +1518,36 @@ static int destroy_user_event(struct user_event *user) } static struct user_event *find_user_event(struct user_event_group *group, - char *name, u32 *outkey) + char *name, int argc, const char **argv, + u32 flags, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(group->register_table, user, node, key) - if (!strcmp(EVENT_NAME(user), name)) + hash_for_each_possible(group->register_table, user, node, key) { + /* + * Single-format events shouldn't return multi-format + * events. Callers expect the underlying tracepoint to match + * the name exactly in these cases. Only check like-formats. 
+ */ + if (EVENT_MULTI_FORMAT(flags) != EVENT_MULTI_FORMAT(user->reg_flags)) + continue; + + if (strcmp(EVENT_NAME(user), name)) + continue; + + if (user_fields_match(user, argc, argv)) return user_event_get(user); + /* Scan others if this is a multi-format event */ + if (EVENT_MULTI_FORMAT(flags)) + continue; + + return ERR_PTR(-EADDRINUSE); + } + return NULL; } @@ -1860,6 +1904,9 @@ static bool user_fields_match(struct user_event *user, int argc, struct list_head *head = &user->fields; int i = 0; + if (argc == 0) + return list_empty(head); + list_for_each_entry_reverse(field, head, link) { if (!user_field_match(field, argc, argv, &i)) return false; @@ -1877,13 +1924,15 @@ static bool user_event_match(const char *system, const char *event, struct user_event *user = container_of(ev, struct user_event, devent); bool match; - match = strcmp(EVENT_NAME(user), event) == 0 && - (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0); + match = strcmp(EVENT_NAME(user), event) == 0; + + if (match && system) { + match = strcmp(system, user->group->system_name) == 0 || + strcmp(system, user->group->system_multi_name) == 0; + } - if (match && argc > 0) + if (match) match = user_fields_match(user, argc, argv); - else if (match && argc == 0) - match = list_empty(&user->fields); return match; } @@ -1913,6 +1962,33 @@ static int user_event_trace_register(struct user_event *user) return ret; } +static int user_event_set_tp_name(struct user_event *user) +{ + lockdep_assert_held(&user->group->reg_mutex); + + if (EVENT_MULTI_FORMAT(user->reg_flags)) { + char *multi_name; + + multi_name = kasprintf(GFP_KERNEL_ACCOUNT, "%s.%llx", + user->reg_name, user->group->multi_id); + + if (!multi_name) + return -ENOMEM; + + user->call.name = multi_name; + user->tracepoint.name = multi_name; + + /* Inc to ensure unique multi-event name next time */ + user->group->multi_id++; + } else { + /* Non Multi-format uses register name */ + user->call.name = user->reg_name; + user->tracepoint.name = 
user->reg_name; + } + + return 0; +} + /* * Parses the event name, arguments and flags then registers if successful. * The name buffer lifetime is owned by this method for success cases only. @@ -1922,11 +1998,11 @@ static int user_event_parse(struct user_event_group *group, char *name, char *args, char *flags, struct user_event **newuser, int reg_flags) { - int ret; - u32 key; struct user_event *user; + char **argv = NULL; int argc = 0; - char **argv; + int ret; + u32 key; /* Currently don't support any text based flags */ if (flags != NULL) @@ -1935,41 +2011,34 @@ static int user_event_parse(struct user_event_group *group, char *name, if (!user_event_capable(reg_flags)) return -EPERM; + if (args) { + argv = argv_split(GFP_KERNEL, args, &argc); + + if (!argv) + return -ENOMEM; + } + /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); - user = find_user_event(group, name, &key); + user = find_user_event(group, name, argc, (const char **)argv, + reg_flags, &key); mutex_unlock(&event_mutex); - if (user) { - if (args) { - argv = argv_split(GFP_KERNEL, args, &argc); - if (!argv) { - ret = -ENOMEM; - goto error; - } + if (argv) + argv_free(argv); - ret = user_fields_match(user, argc, (const char **)argv); - argv_free(argv); - - } else - ret = list_empty(&user->fields); - - if (ret) { - *newuser = user; - /* - * Name is allocated by caller, free it since it already exists. - * Caller only worries about failure cases for freeing. - */ - kfree(name); - } else { - ret = -EADDRINUSE; - goto error; - } + if (IS_ERR(user)) + return PTR_ERR(user); + + if (user) { + *newuser = user; + /* + * Name is allocated by caller, free it since it already exists. + * Caller only worries about failure cases for freeing. 
+ */ + kfree(name); return 0; -error: - user_event_put(user, false); - return ret; } user = kzalloc(sizeof(*user), GFP_KERNEL_ACCOUNT); @@ -1982,7 +2051,13 @@ error: INIT_LIST_HEAD(&user->validators); user->group = group; - user->tracepoint.name = name; + user->reg_name = name; + user->reg_flags = reg_flags; + + ret = user_event_set_tp_name(user); + + if (ret) + goto put_user; ret = user_event_parse_fields(user, args); @@ -1996,11 +2071,14 @@ error: user->call.data = user; user->call.class = &user->class; - user->call.name = name; user->call.flags = TRACE_EVENT_FL_TRACEPOINT; user->call.tp = &user->tracepoint; user->call.event.funcs = &user_event_funcs; - user->class.system = group->system_name; + + if (EVENT_MULTI_FORMAT(user->reg_flags)) + user->class.system = group->system_multi_name; + else + user->class.system = group->system_name; user->class.fields_array = user_event_fields_array; user->class.get_fields = user_event_get_fields; @@ -2022,8 +2100,6 @@ error: if (ret) goto put_user_lock; - user->reg_flags = reg_flags; - if (user->reg_flags & USER_EVENT_REG_PERSIST) { /* Ensure we track self ref and caller ref (2) */ refcount_set(&user->refcnt, 2); @@ -2047,30 +2123,43 @@ put_user: user_event_destroy_fields(user); user_event_destroy_validators(user); kfree(user->call.print_fmt); + + /* Caller frees reg_name on error, but not multi-name */ + if (EVENT_NAME(user) != EVENT_TP_NAME(user)) + kfree(EVENT_TP_NAME(user)); + kfree(user); return ret; } /* - * Deletes a previously created event if it is no longer being used. + * Deletes previously created events if they are no longer being used. 
*/ static int delete_user_event(struct user_event_group *group, char *name) { - u32 key; - struct user_event *user = find_user_event(group, name, &key); + struct user_event *user; + struct hlist_node *tmp; + u32 key = user_event_key(name); + int ret = -ENOENT; - if (!user) - return -ENOENT; + /* Attempt to delete all event(s) with the name passed in */ + hash_for_each_possible_safe(group->register_table, user, tmp, node, key) { + if (strcmp(EVENT_NAME(user), name)) + continue; - user_event_put(user, true); + if (!user_event_last_ref(user)) + return -EBUSY; - if (!user_event_last_ref(user)) - return -EBUSY; + if (!user_event_capable(user->reg_flags)) + return -EPERM; - if (!user_event_capable(user->reg_flags)) - return -EPERM; + ret = destroy_user_event(user); - return destroy_user_event(user); + if (ret) + goto out; + } +out: + return ret; } /* @@ -2628,7 +2717,7 @@ static int user_seq_show(struct seq_file *m, void *p) hash_for_each(group->register_table, i, user, node) { status = user->status; - seq_printf(m, "%s", EVENT_NAME(user)); + seq_printf(m, "%s", EVENT_TP_NAME(user)); if (status != 0) seq_puts(m, " #"); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index c9ffdcfe622e..8a407adb0e1c 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/uaccess.h> +#include <linux/kmemleak.h> #include <linux/ftrace.h> #include <trace/events/sched.h> @@ -148,3 +149,517 @@ void tracing_stop_tgid_record(void) { tracing_stop_sched_switch(RECORD_TGID); } + +/* + * The tgid_map array maps from pid to tgid; i.e. the value stored at index i + * is the tgid last observed corresponding to pid=i. + */ +static int *tgid_map; + +/* The maximum valid index into tgid_map. 
*/ +static size_t tgid_map_max; + +#define SAVED_CMDLINES_DEFAULT 128 +#define NO_CMDLINE_MAP UINT_MAX +/* + * Preemption must be disabled before acquiring trace_cmdline_lock. + * The various trace_arrays' max_lock must be acquired in a context + * where interrupt is disabled. + */ +static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; +struct saved_cmdlines_buffer { + unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; + unsigned *map_cmdline_to_pid; + unsigned cmdline_num; + int cmdline_idx; + char saved_cmdlines[]; +}; +static struct saved_cmdlines_buffer *savedcmd; + +/* Holds the size of a cmdline and pid element */ +#define SAVED_CMDLINE_MAP_ELEMENT_SIZE(s) \ + (TASK_COMM_LEN + sizeof((s)->map_cmdline_to_pid[0])) + +static inline char *get_saved_cmdlines(int idx) +{ + return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; +} + +static inline void set_cmdline(int idx, const char *cmdline) +{ + strncpy(get_saved_cmdlines(idx), cmdline, TASK_COMM_LEN); +} + +static void free_saved_cmdlines_buffer(struct saved_cmdlines_buffer *s) +{ + int order = get_order(sizeof(*s) + s->cmdline_num * TASK_COMM_LEN); + + kmemleak_free(s); + free_pages((unsigned long)s, order); +} + +static struct saved_cmdlines_buffer *allocate_cmdlines_buffer(unsigned int val) +{ + struct saved_cmdlines_buffer *s; + struct page *page; + int orig_size, size; + int order; + + /* Figure out how much is needed to hold the given number of cmdlines */ + orig_size = sizeof(*s) + val * SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); + order = get_order(orig_size); + size = 1 << (order + PAGE_SHIFT); + page = alloc_pages(GFP_KERNEL, order); + if (!page) + return NULL; + + s = page_address(page); + kmemleak_alloc(s, size, 1, GFP_KERNEL); + memset(s, 0, sizeof(*s)); + + /* Round up to actual allocation */ + val = (size - sizeof(*s)) / SAVED_CMDLINE_MAP_ELEMENT_SIZE(s); + s->cmdline_num = val; + + /* Place map_cmdline_to_pid array right after saved_cmdlines */ + s->map_cmdline_to_pid = (unsigned 
*)&s->saved_cmdlines[val * TASK_COMM_LEN]; + + s->cmdline_idx = 0; + memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP, + sizeof(s->map_pid_to_cmdline)); + memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP, + val * sizeof(*s->map_cmdline_to_pid)); + + return s; +} + +int trace_create_savedcmd(void) +{ + savedcmd = allocate_cmdlines_buffer(SAVED_CMDLINES_DEFAULT); + + return savedcmd ? 0 : -ENOMEM; +} + +int trace_save_cmdline(struct task_struct *tsk) +{ + unsigned tpid, idx; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + tpid = tsk->pid & (PID_MAX_DEFAULT - 1); + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + * + * This is called within the scheduler and wake up, so interrupts + * had better been disabled and run queue lock been held. + */ + lockdep_assert_preemption_disabled(); + if (!arch_spin_trylock(&trace_cmdline_lock)) + return 0; + + idx = savedcmd->map_pid_to_cmdline[tpid]; + if (idx == NO_CMDLINE_MAP) { + idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num; + + savedcmd->map_pid_to_cmdline[tpid] = idx; + savedcmd->cmdline_idx = idx; + } + + savedcmd->map_cmdline_to_pid[idx] = tsk->pid; + set_cmdline(idx, tsk->comm); + + arch_spin_unlock(&trace_cmdline_lock); + + return 1; +} + +static void __trace_find_cmdline(int pid, char comm[]) +{ + unsigned map; + int tpid; + + if (!pid) { + strcpy(comm, "<idle>"); + return; + } + + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + + tpid = pid & (PID_MAX_DEFAULT - 1); + map = savedcmd->map_pid_to_cmdline[tpid]; + if (map != NO_CMDLINE_MAP) { + tpid = savedcmd->map_cmdline_to_pid[map]; + if (tpid == pid) { + strscpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); + return; + } + } + strcpy(comm, "<...>"); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); 
+ + __trace_find_cmdline(pid, comm); + + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int *trace_find_tgid_ptr(int pid) +{ + /* + * Pairs with the smp_store_release in set_tracer_flag() to ensure that + * if we observe a non-NULL tgid_map then we also observe the correct + * tgid_map_max. + */ + int *map = smp_load_acquire(&tgid_map); + + if (unlikely(!map || pid > tgid_map_max)) + return NULL; + + return &map[pid]; +} + +int trace_find_tgid(int pid) +{ + int *ptr = trace_find_tgid_ptr(pid); + + return ptr ? *ptr : 0; +} + +static int trace_save_tgid(struct task_struct *tsk) +{ + int *ptr; + + /* treat recording of idle task as a success */ + if (!tsk->pid) + return 1; + + ptr = trace_find_tgid_ptr(tsk->pid); + if (!ptr) + return 0; + + *ptr = tsk->tgid; + return 1; +} + +static bool tracing_record_taskinfo_skip(int flags) +{ + if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) + return true; + if (!__this_cpu_read(trace_taskinfo_save)) + return true; + return false; +} + +/** + * tracing_record_taskinfo - record the task info of a task + * + * @task: task to record + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo(struct task_struct *task, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task); + + /* If recording any information failed, retry again soon. 
*/ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/** + * tracing_record_taskinfo_sched_switch - record task info for sched_switch + * + * @prev: previous task during sched_switch + * @next: next task during sched_switch + * @flags: TRACE_RECORD_CMDLINE for recording comm + * TRACE_RECORD_TGID for recording tgid + */ +void tracing_record_taskinfo_sched_switch(struct task_struct *prev, + struct task_struct *next, int flags) +{ + bool done; + + if (tracing_record_taskinfo_skip(flags)) + return; + + /* + * Record as much task information as possible. If some fail, continue + * to try to record the others. + */ + done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev); + done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev); + done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next); + + /* If recording any information failed, retry again soon. */ + if (!done) + return; + + __this_cpu_write(trace_taskinfo_save, false); +} + +/* Helpers to record a specific task information */ +void tracing_record_cmdline(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE); +} + +void tracing_record_tgid(struct task_struct *task) +{ + tracing_record_taskinfo(task, TRACE_RECORD_TGID); +} + +int trace_alloc_tgid_map(void) +{ + int *map; + + if (tgid_map) + return 0; + + tgid_map_max = pid_max; + map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map), + GFP_KERNEL); + if (!map) + return -ENOMEM; + + /* + * Pairs with smp_load_acquire() in + * trace_find_tgid_ptr() to ensure that if it observes + * the tgid_map we just allocated then it also observes + * the corresponding tgid_map_max value. 
+ */ + smp_store_release(&tgid_map, map); + return 0; +} + +static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos) +{ + int pid = ++(*pos); + + return trace_find_tgid_ptr(pid); +} + +static void *saved_tgids_start(struct seq_file *m, loff_t *pos) +{ + int pid = *pos; + + return trace_find_tgid_ptr(pid); +} + +static void saved_tgids_stop(struct seq_file *m, void *v) +{ +} + +static int saved_tgids_show(struct seq_file *m, void *v) +{ + int *entry = (int *)v; + int pid = entry - tgid_map; + int tgid = *entry; + + if (tgid == 0) + return SEQ_SKIP; + + seq_printf(m, "%d %d\n", pid, tgid); + return 0; +} + +static const struct seq_operations tracing_saved_tgids_seq_ops = { + .start = saved_tgids_start, + .stop = saved_tgids_stop, + .next = saved_tgids_next, + .show = saved_tgids_show, +}; + +static int tracing_saved_tgids_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_tgids_seq_ops); +} + + +const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_saved_tgids_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos) +{ + unsigned int *ptr = v; + + if (*pos || m->count) + ptr++; + + (*pos)++; + + for (; ptr < &savedcmd->map_cmdline_to_pid[savedcmd->cmdline_num]; + ptr++) { + if (*ptr == -1 || *ptr == NO_CMDLINE_MAP) + continue; + + return ptr; + } + + return NULL; +} + +static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos) +{ + void *v; + loff_t l = 0; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + + v = &savedcmd->map_cmdline_to_pid[0]; + while (l <= *pos) { + v = saved_cmdlines_next(m, v, &l); + if (!v) + return NULL; + } + + return v; +} + +static void saved_cmdlines_stop(struct seq_file *m, void *v) +{ + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +static int 
saved_cmdlines_show(struct seq_file *m, void *v) +{ + char buf[TASK_COMM_LEN]; + unsigned int *pid = v; + + __trace_find_cmdline(*pid, buf); + seq_printf(m, "%d %s\n", *pid, buf); + return 0; +} + +static const struct seq_operations tracing_saved_cmdlines_seq_ops = { + .start = saved_cmdlines_start, + .next = saved_cmdlines_next, + .stop = saved_cmdlines_stop, + .show = saved_cmdlines_show, +}; + +static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp) +{ + int ret; + + ret = tracing_check_open_get_tr(NULL); + if (ret) + return ret; + + return seq_open(filp, &tracing_saved_cmdlines_seq_ops); +} + +const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_saved_cmdlines_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static ssize_t +tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +void trace_free_saved_cmdlines_buffer(void) +{ + free_saved_cmdlines_buffer(savedcmd); +} + +static int tracing_resize_saved_cmdlines(unsigned int val) +{ + struct saved_cmdlines_buffer *s, *savedcmd_temp; + + s = allocate_cmdlines_buffer(val); + if (!s) + return -ENOMEM; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + savedcmd_temp = savedcmd; + savedcmd = s; + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + free_saved_cmdlines_buffer(savedcmd_temp); + + return 0; +} + +static ssize_t +tracing_saved_cmdlines_size_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + /* must have at least 1 entry or less than 
PID_MAX_DEFAULT */ + if (!val || val > PID_MAX_DEFAULT) + return -EINVAL; + + ret = tracing_resize_saved_cmdlines((unsigned int)val); + if (ret < 0) + return ret; + + *ppos += cnt; + + return cnt; +} + +const struct file_operations tracing_saved_cmdlines_size_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_size_read, + .write = tracing_saved_cmdlines_size_write, +}; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 529590499b1f..e9c5058a8efd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -768,7 +768,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); - if (ftrace_dump_on_oops) { + if (ftrace_dump_on_oops_enabled()) { ftrace_dump(DUMP_ALL); /* ftrace_dump() disables tracing */ tracing_on(); diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 23f923ccd529..500981eca74d 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -163,8 +163,7 @@ * __string(). * * __string_len: This is a helper to a __dynamic_array, but it understands - * that the array has characters in it, and with the combined - * use of __assign_str_len(), it will allocate 'len' + 1 bytes + * that the array has characters in it, it will allocate 'len' + 1 bytes * in the ring buffer and add a '\0' to the string. This is * useful if the string being saved has no terminating '\0' byte. * It requires that the length of the string is known as it acts @@ -174,9 +173,11 @@ * * __string_len(foo, bar, len) * - * To assign this string, use the helper macro __assign_str_len(). + * To assign this string, use the helper macro __assign_str(). + * The length is saved via the __string_len() and is retrieved in + * __assign_str(). 
* - * __assign_str_len(foo, bar, len); + * __assign_str(foo, bar); * * Then len + 1 is allocated to the ring buffer, and a nul terminating * byte is added. This is similar to: @@ -302,6 +303,7 @@ TRACE_EVENT(foo_bar, __bitmask( cpus, num_possible_cpus() ) __cpumask( cpum ) __vstring( vstr, fmt, va ) + __string_len( lstr, foo, bar / 2 < strlen(foo) ? bar / 2 : strlen(foo) ) ), TP_fast_assign( @@ -310,12 +312,13 @@ TRACE_EVENT(foo_bar, memcpy(__get_dynamic_array(list), lst, __length_of(lst) * sizeof(int)); __assign_str(str, string); + __assign_str(lstr, foo); __assign_vstr(vstr, fmt, va); __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); __assign_cpumask(cpum, cpumask_bits(mask)); ), - TP_printk("foo %s %d %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar, + TP_printk("foo %s %d %s %s %s %s %s (%s) (%s) %s", __entry->foo, __entry->bar, /* * Notice here the use of some helper functions. This includes: @@ -359,7 +362,8 @@ TRACE_EVENT(foo_bar, __print_array(__get_dynamic_array(list), __get_dynamic_array_len(list) / sizeof(int), sizeof(int)), - __get_str(str), __get_bitmask(cpus), __get_cpumask(cpum), + __get_str(str), __get_str(lstr), + __get_bitmask(cpus), __get_cpumask(cpum), __get_str(vstr)) ); @@ -570,7 +574,7 @@ TRACE_EVENT(foo_rel_loc, ), TP_fast_assign( - __assign_rel_str(foo, foo); + __assign_rel_str(foo); __entry->bar = bar; __assign_rel_bitmask(bitmask, mask, BITS_PER_BYTE * sizeof(unsigned long)); diff --git a/tools/testing/selftests/user_events/abi_test.c b/tools/testing/selftests/user_events/abi_test.c index cef1ff1af223..7288a05136ba 100644 --- a/tools/testing/selftests/user_events/abi_test.c +++ b/tools/testing/selftests/user_events/abi_test.c @@ -16,6 +16,8 @@ #include <sys/ioctl.h> #include <sys/stat.h> #include <unistd.h> +#include <glob.h> +#include <string.h> #include <asm/unistd.h> #include "../kselftest_harness.h" @@ -23,6 +25,62 @@ const char *data_file = "/sys/kernel/tracing/user_events_data"; const char *enable_file = 
"/sys/kernel/tracing/events/user_events/__abi_event/enable"; +const char *multi_dir_glob = "/sys/kernel/tracing/events/user_events_multi/__abi_event.*"; + +static int wait_for_delete(char *dir) +{ + struct stat buf; + int i; + + for (i = 0; i < 10000; ++i) { + if (stat(dir, &buf) == -1 && errno == ENOENT) + return 0; + + usleep(1000); + } + + return -1; +} + +static int find_multi_event_dir(char *unique_field, char *out_dir, int dir_len) +{ + char path[256]; + glob_t buf; + int i, ret; + + ret = glob(multi_dir_glob, GLOB_ONLYDIR, NULL, &buf); + + if (ret) + return -1; + + ret = -1; + + for (i = 0; i < buf.gl_pathc; ++i) { + FILE *fp; + + snprintf(path, sizeof(path), "%s/format", buf.gl_pathv[i]); + fp = fopen(path, "r"); + + if (!fp) + continue; + + while (fgets(path, sizeof(path), fp) != NULL) { + if (strstr(path, unique_field)) { + fclose(fp); + /* strscpy is not available, use snprintf */ + snprintf(out_dir, dir_len, "%s", buf.gl_pathv[i]); + ret = 0; + goto out; + } + } + + fclose(fp); + } +out: + globfree(&buf); + + return ret; +} static bool event_exists(void) { @@ -74,6 +132,39 @@ static int event_delete(void) return ret; } +static int reg_enable_multi(void *enable, int size, int bit, int flags, + char *args) +{ + struct user_reg reg = {0}; + char full_args[512] = {0}; + int fd = open(data_file, O_RDWR); + int len; + int ret; + + if (fd < 0) + return -1; + + len = snprintf(full_args, sizeof(full_args), "__abi_event %s", args); + + if (len > sizeof(full_args)) { + ret = -E2BIG; + goto out; + } + + reg.size = sizeof(reg); + reg.name_args = (__u64)full_args; + reg.flags = USER_EVENT_REG_MULTI_FORMAT | flags; + reg.enable_bit = bit; + reg.enable_addr = (__u64)enable; + reg.enable_size = size; + + ret = ioctl(fd, DIAG_IOCSREG, ®); +out: + close(fd); + + return ret; +} + static int reg_enable_flags(void *enable, int size, int bit, int flags) { struct user_reg reg = {0}; @@ -207,6 +298,49 @@ TEST_F(user, bit_sizes) { ASSERT_NE(0, reg_enable(&self->check, 128, 0)); 
}
 
+/* One event name, two formats: expects a separate dir per format, removed once fully disabled. */
+TEST_F(user, multi_format) {
+	char first_dir[256];
+	char second_dir[256];
+	struct stat buf;
+
+	/* Multiple formats for the same name should work */
+	ASSERT_EQ(0, reg_enable_multi(&self->check, sizeof(int), 0,
+				      0, "u32 multi_first"));
+	ASSERT_EQ(0, reg_enable_multi(&self->check, sizeof(int), 1,
+				      0, "u64 multi_second"));
+
+	/* Same name with same format should also work */
+	ASSERT_EQ(0, reg_enable_multi(&self->check, sizeof(int), 2,
+				      0, "u64 multi_second"));
+
+	ASSERT_EQ(0, find_multi_event_dir("multi_first",
+					  first_dir, sizeof(first_dir)));
+
+	ASSERT_EQ(0, find_multi_event_dir("multi_second",
+					  second_dir, sizeof(second_dir)));
+
+	/* Should not be found in the same dir */
+	ASSERT_NE(0, strcmp(first_dir, second_dir));
+
+	/* First dir should still exist */
+	ASSERT_EQ(0, stat(first_dir, &buf));
+
+	/* Disabling first register should remove first dir */
+	ASSERT_EQ(0, reg_disable(&self->check, 0));
+	ASSERT_EQ(0, wait_for_delete(first_dir));
+
+	/* Second dir should still exist */
+	ASSERT_EQ(0, stat(second_dir, &buf));
+
+	/* Disabling second register should remove second dir */
+	ASSERT_EQ(0, reg_disable(&self->check, 1));
+	/* Bits 1 and 2 registered the same format, so the dir must outlive the first disable */
+	ASSERT_EQ(0, stat(second_dir, &buf));
+	ASSERT_EQ(0, reg_disable(&self->check, 2));
+	ASSERT_EQ(0, wait_for_delete(second_dir));
+}
+
 TEST_F(user, forks) {
 	int i;
|