From 6d54057d76e25c91165cda0e6e007f1811faa2be Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:33:26 -0500 Subject: tracing/filter: Have no filter return a match The n_preds field of a file can change at anytime, and even can become zero, just as the filter is about to be processed by an event. In the case that is zero on entering the filter, return 1, telling the caller the event matchs and should be trace. Also use a variable and assign it with ACCESS_ONCE() such that the count stays consistent within the function. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 36d40104b17..7275f0310ed 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -383,9 +383,14 @@ int filter_match_preds(struct event_filter *filter, void *rec) int match, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int n_preds = ACCESS_ONCE(filter->n_preds); int i; - for (i = 0; i < filter->n_preds; i++) { + /* no filter is considered a match */ + if (!n_preds) + return 1; + + for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec, val1, val2); -- cgit v1.2.3-70-g09d2 From 58d9a597c4275d830a819625e7d437cd6fb23fa5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:37:09 -0500 Subject: tracing/filter: Move OR and AND logic out of fn() method The ops OR and AND act different from the other ops, as they are the only ones to take other ops as their arguements. These ops als change the logic of the filter_match_preds. By removing the OR and AND fn's we can also remove the val1 and val2 that is passed to all other fn's and are unused. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +-- kernel/trace/trace_events_filter.c | 51 ++++++++++++++------------------------ 2 files changed, 20 insertions(+), 34 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9021f8c0c0c..1597bc0749c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,8 +677,7 @@ struct event_subsystem { struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, - int val1, int val2); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7275f0310ed..5d719b340a2 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -124,8 +124,7 @@ struct filter_parse_state { }; #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_##type(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ type *addr = (type *)(event + pred->offset); \ type val = (type)pred->val; \ @@ -152,8 +151,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \ } #define DEFINE_EQUALITY_PRED(size) \ -static int filter_pred_##size(struct filter_pred *pred, void *event, \ - int val1, int val2) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ { \ u##size *addr = (u##size *)(event + pred->offset); \ u##size val = (u##size)pred->val; \ @@ -178,23 +176,8 @@ DEFINE_EQUALITY_PRED(32); DEFINE_EQUALITY_PRED(16); DEFINE_EQUALITY_PRED(8); -static int filter_pred_and(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 && val2; -} - -static int filter_pred_or(struct filter_pred *pred __attribute((unused)), - void *event __attribute((unused)), - int val1, int val2) -{ - return val1 || val2; -} - /* Filter predicate for fixed sized arrays of characters */ -static int filter_pred_string(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_string(struct filter_pred *pred, void *event) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -207,8 +190,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event, } /* Filter predicate for char * pointers */ -static int filter_pred_pchar(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_pchar(struct filter_pred *pred, void *event) { char **addr = (char **)(event + pred->offset); int cmp, match; @@ -231,8 +213,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event, * and add it to the address of the entry, and at last we have * the address of the string. */ -static int filter_pred_strloc(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_strloc(struct filter_pred *pred, void *event) { u32 str_item = *(u32 *)(event + pred->offset); int str_loc = str_item & 0xffff; @@ -247,8 +228,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event, return match; } -static int filter_pred_none(struct filter_pred *pred, void *event, - int val1, int val2) +static int filter_pred_none(struct filter_pred *pred, void *event) { return 0; } @@ -380,7 +360,7 @@ static void filter_build_regex(struct filter_pred *pred) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match, top = 0, val1 = 0, val2 = 0; + int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); @@ -393,7 +373,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) for (i = 0; i < n_preds; i++) { pred = filter->preds[i]; if (!pred->pop_n) { - match = pred->fn(pred, rec, val1, val2); + match = pred->fn(pred, rec); stack[top++] = match; continue; } @@ -403,7 +383,16 @@ int filter_match_preds(struct event_filter *filter, void *rec) } val1 = stack[--top]; val2 = stack[--top]; - match = pred->fn(pred, rec, val1, val2); + switch (pred->op) { + case OP_AND: + match = val1 && val2; + break; + case OP_OR: + match = val1 || val2; + break; + default: + WARN_ONCE(1, "filter op is not AND or OR"); + } stack[top++] = match; } @@ -775,15 +764,13 @@ static int filter_add_pred(struct filter_parse_state *ps, unsigned long long val; int ret; - pred->fn = filter_pred_none; + fn = pred->fn = filter_pred_none; if (pred->op == OP_AND) { pred->pop_n = 2; - fn = filter_pred_and; goto add_pred_fn; } else if (pred->op == OP_OR) { pred->pop_n = 2; - fn = filter_pred_or; goto add_pred_fn; } -- cgit v1.2.3-70-g09d2 From c9c53ca03d6f97fdd9832d5ed3f15b30ee5cdb86 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:42:43 -0500 Subject: tracing/filter: Dynamically allocate preds For every filter that is made, we create predicates to hold every operation within the filter. We have a max of 32 predicates that we can hold. Currently, we allocate all 32 even if we only need to use one. Part of the reason we do this is that the filter can be used at any moment by any event. Fortunately, the filter is only used with preemption disabled. By reseting the count of preds used "n_preds" to zero, then performing a synchronize_sched(), we can safely free and reallocate a new array of preds. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 3 +- kernel/trace/trace_events_filter.c | 143 ++++++++++++++++++++++++++++--------- 2 files changed, 110 insertions(+), 36 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1597bc0749c..441fc1bc85d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -661,7 +661,8 @@ struct ftrace_event_field { }; struct event_filter { - int n_preds; + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ struct filter_pred **preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 5d719b340a2..aac6a6183e6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,6 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; + struct filter_pred **preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -370,8 +371,13 @@ int filter_match_preds(struct event_filter *filter, void *rec) if (!n_preds) return 1; + /* + * n_preds and filter->preds is protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; + pred = preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -548,46 +554,55 @@ static int filter_set_pred(struct filter_pred *dest, return 0; } +static void __free_preds(struct event_filter *filter) +{ + int i; + + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; + filter->n_preds = 0; +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; + if (filter->preds) { + for (i = 0; i < filter->n_preds; i++) + filter->preds[i]->fn = filter_pred_none; + } filter->n_preds = 0; - - for (i = 0; i < MAX_FILTER_PRED; i++) - filter->preds[i]->fn = filter_pred_none; } -static void __free_preds(struct event_filter *filter) +static void __free_filter(struct event_filter *filter) { - int i; - if (!filter) return; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); + __free_preds(filter); kfree(filter->filter_string); kfree(filter); } void destroy_preds(struct ftrace_event_call *call) { - __free_preds(call->filter); + __free_filter(call->filter); call->filter = NULL; call->flags &= ~TRACE_EVENT_FL_FILTERED; } -static struct event_filter *__alloc_preds(void) +static struct event_filter *__alloc_filter(void) { struct event_filter *filter; - struct filter_pred *pred; - int i; filter = kzalloc(sizeof(*filter), GFP_KERNEL); if (!filter) @@ -595,32 +610,63 @@ static struct event_filter *__alloc_preds(void) filter->n_preds = 0; - filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ + struct filter_pred *pred; + int i; + + if (filter->preds) { + if (filter->a_preds < n_preds) { + /* We need to reallocate */ + filter->n_preds = 0; + /* + * It is possible that the filter is currently + * being used. We need to zero out the number + * of preds, wait on preemption and then free + * the preds. + */ + synchronize_sched(); + __free_preds(filter); + } + } + + if (!filter->preds) { + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->a_preds = n_preds; + } if (!filter->preds) - goto oom; + return -ENOMEM; + + if (WARN_ON(filter->a_preds < n_preds)) + return -EINVAL; - for (i = 0; i < MAX_FILTER_PRED; i++) { - pred = kzalloc(sizeof(*pred), GFP_KERNEL); + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + if (!pred) + pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; filter->preds[i] = pred; } - return filter; - -oom: + return 0; + oom: __free_preds(filter); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } -static int init_preds(struct ftrace_event_call *call) +static int init_filter(struct ftrace_event_call *call) { if (call->filter) return 0; call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_preds(); + call->filter = __alloc_filter(); if (IS_ERR(call->filter)) return PTR_ERR(call->filter); @@ -636,7 +682,7 @@ static int init_subsystem_preds(struct event_subsystem *system) if (strcmp(call->class->system, system->name) != 0) continue; - err = init_preds(call); + err = init_filter(call); if (err) return err; } @@ -665,7 +711,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, { int idx, err; - if (filter->n_preds == MAX_FILTER_PRED) { + if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1179,6 +1225,20 @@ static int check_preds(struct filter_parse_state *ps) return 0; } +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1191,10 +1251,23 @@ static int replace_preds(struct ftrace_event_call *call, int err; int n_preds = 0; + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + err = check_preds(ps); if (err) return err; + if (!dry_run) { + err = __alloc_preds(filter, n_preds); + if (err) + return err; + } + + n_preds = 0; list_for_each_entry(elt, &ps->postfix, list) { if (elt->op == OP_NONE) { if (!operand1) @@ -1208,7 +1281,7 @@ static int replace_preds(struct ftrace_event_call *call, continue; } - if (n_preds++ == MAX_FILTER_PRED) { + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -1283,7 +1356,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) mutex_lock(&event_mutex); - err = init_preds(call); + err = init_filter(call); if (err) goto out_unlock; @@ -1376,7 +1449,7 @@ void ftrace_profile_free_filter(struct perf_event *event) struct event_filter *filter = event->filter; event->filter = NULL; - __free_preds(filter); + __free_filter(filter); } int ftrace_profile_set_filter(struct perf_event *event, int event_id, @@ -1402,7 +1475,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, if (event->filter) goto out_unlock; - filter = __alloc_preds(); + filter = __alloc_filter(); if (IS_ERR(filter)) { err = PTR_ERR(filter); goto out_unlock; @@ -1411,7 +1484,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - goto free_preds; + goto free_filter; parse_init(ps, filter_ops, filter_str); err = filter_parse(ps); @@ -1427,9 +1500,9 @@ free_ps: postfix_clear(ps); kfree(ps); -free_preds: +free_filter: if (err) - __free_preds(filter); + __free_filter(filter); out_unlock: mutex_unlock(&event_mutex); -- cgit v1.2.3-70-g09d2 From 0fc3ca9a10a61a77f18710fb708b41fd99c79a56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:46:46 -0500 Subject: tracing/filter: Call synchronize_sched() just once for system filters By separating out the reseting of the filter->n_preds to zero from the reallocation of preds for the filter, we can reset groups of filters first, call synchronize_sched() just once, and then reallocate each of the filters in the system group. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 80 ++++++++++++++++++++++++++++++-------- 1 file changed, 63 insertions(+), 17 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index aac6a6183e6..8f00a11ce77 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -570,17 +570,28 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } +static void reset_preds(struct event_filter *filter) +{ + struct filter_pred *pred; + int n_preds = filter->n_preds; + int i; + + filter->n_preds = 0; + if (!filter->preds) + return; + + for (i = 0; i < n_preds; i++) { + pred = filter->preds[i]; + pred->fn = filter_pred_none; + } +} + static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; - int i; call->flags &= ~TRACE_EVENT_FL_FILTERED; - if (filter->preds) { - for (i = 0; i < filter->n_preds; i++) - filter->preds[i]->fn = filter_pred_none; - } - filter->n_preds = 0; + reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -620,15 +631,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) { if (filter->a_preds < n_preds) { - /* We need to reallocate */ - filter->n_preds = 0; /* - * It is possible that the filter is currently - * being used. We need to zero out the number - * of preds, wait on preemption and then free - * the preds. + * We need to reallocate. + * We should have already have zeroed out + * the pred count and called synchronized_sched() + * to make sure no one is using the preds. */ - synchronize_sched(); + if (WARN_ON_ONCE(filter->n_preds)) { + /* We need to reset it now */ + filter->n_preds = 0; + synchronize_sched(); + } __free_preds(filter); } } @@ -1328,6 +1341,30 @@ static int replace_system_preds(struct event_subsystem *system, /* try to see if the filter can be applied */ err = replace_preds(call, filter, ps, filter_string, true); if (err) + goto fail; + } + + /* set all filter pred counts to zero */ + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) + continue; + + reset_preds(filter); + } + + /* + * Since some of the preds may be used under preemption + * we need to wait for them to finish before we may + * reallocate them. + */ + synchronize_sched(); + + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter = call->filter; + + if (strcmp(call->class->system, system->name) != 0) continue; /* really apply the filter */ @@ -1342,11 +1379,13 @@ static int replace_system_preds(struct event_subsystem *system, fail = false; } - if (fail) { - parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return -EINVAL; - } + if (fail) + goto fail; + return 0; + fail: + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) @@ -1381,6 +1420,13 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) goto out; } + /* + * Make sure all the pred counts are zero so that + * no task is using it when we reallocate the preds array. + */ + reset_preds(call->filter); + synchronize_sched(); + err = replace_preds(call, call->filter, ps, filter_string, false); if (err) append_filter_err(ps, call->filter); -- cgit v1.2.3-70-g09d2 From 74e9e58c350a24139e268dd6857bbaa55c5aafcf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:49:48 -0500 Subject: tracing/filter: Allocate the preds in an array Currently we allocate an array of pointers to filter_preds, and then allocate a separate filter_pred for each item in the array. This adds slight overhead in the filters as it needs to derefernce twice to get to the op condition. Allocating the preds themselves in a single array removes a dereference as well as helps on the cache footprint. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 +- kernel/trace/trace_events_filter.c | 31 +++++++++---------------------- 2 files changed, 10 insertions(+), 23 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 441fc1bc85d..254d04a84ec 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -663,7 +663,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ - struct filter_pred **preds; + struct filter_pred *preds; char *filter_string; }; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8f00a11ce77..b6c910642a1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -362,7 +362,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) { int match = -1, top = 0, val1 = 0, val2 = 0; int stack[MAX_FILTER_PRED]; - struct filter_pred **preds; + struct filter_pred *preds; struct filter_pred *pred; int n_preds = ACCESS_ONCE(filter->n_preds); int i; @@ -377,7 +377,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) preds = rcu_dereference_sched(filter->preds); for (i = 0; i < n_preds; i++) { - pred = preds[i]; + pred = &preds[i]; if (!pred->pop_n) { match = pred->fn(pred, rec); stack[top++] = match; @@ -559,10 +559,8 @@ static void __free_preds(struct event_filter *filter) int i; if (filter->preds) { - for (i = 0; i < filter->a_preds; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); kfree(filter->preds); filter->preds = NULL; } @@ -572,7 +570,6 @@ static void __free_preds(struct event_filter *filter) static void reset_preds(struct event_filter *filter) { - struct filter_pred *pred; int n_preds = filter->n_preds; int i; @@ -580,10 +577,8 @@ static void reset_preds(struct event_filter *filter) if (!filter->preds) return; - for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; - pred->fn = filter_pred_none; - } + for (i = 0; i < n_preds; i++) + filter->preds[i].fn = filter_pred_none; } static void filter_disable_preds(struct ftrace_event_call *call) @@ -658,19 +653,11 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return -EINVAL; for (i = 0; i < n_preds; i++) { - pred = filter->preds[i]; - if (!pred) - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - goto oom; + pred = &filter->preds[i]; pred->fn = filter_pred_none; - filter->preds[i] = pred; } return 0; - oom: - __free_preds(filter); - return -ENOMEM; } static int init_filter(struct ftrace_event_call *call) @@ -730,8 +717,8 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, } idx = filter->n_preds; - filter_clear_pred(filter->preds[idx]); - err = filter_set_pred(filter->preds[idx], pred, fn); + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(&filter->preds[idx], pred, fn); if (err) return err; -- cgit v1.2.3-70-g09d2 From f76690afd05e3e163149310bdcd30234f93b3a7a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:53:06 -0500 Subject: tracing/filter: Free pred array on disabling of filter When a filter is disabled, free the preds. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index b6c910642a1..2f5458e244a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1388,6 +1388,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!strcmp(strstrip(filter_string), "0")) { filter_disable_preds(call); + reset_preds(call->filter); + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_preds(call->filter); remove_filter_string(call->filter); goto out_unlock; } -- cgit v1.2.3-70-g09d2 From 61e9dea20e1ada886cc49a9ec6fe3c6ac0de7324 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 22:54:33 -0500 Subject: tracing/filter: Use a tree instead of stack for filter_match_preds() Currently the filter_match_preds() requires a stack to push and pop the preds to determine if the filter matches the record or not. This has two drawbacks: 1) It requires a stack to store state information. As this is done in fast paths we can't allocate the storage for this stack, and we can't use a global as it must be re-entrant. The stack is stored on the kernel stack and this greatly limits how many preds we may allow. 2) All conditions are calculated even when a short circuit exists. a || b will always calculate a and b even though a was determined to be true. Using a tree we can walk a constant structure that will save the state as we go. The algorithm is simply: pred = root; do { switch (move) { case MOVE_DOWN: if (OR or AND) { pred = left; continue; } if (pred == root) break; match = pred->fn(); pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; case MOVE_UP_FROM_LEFT: /* Only OR or AND can be a parent */ if (match && OR || !match && AND) { /* short circuit */ if (pred == root) break; pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } pred = pred->right; move = MOVE_DOWN; continue; case MOVE_UP_FROM_RIGHT: if (pred == root) break; pred = pred->parent; move = left child ? MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT; continue; } done = 1; } while (!done); This way there's no strict limit to how many preds we allow and it also will short circuit the logical operations when possible. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 +- kernel/trace/trace_events_filter.c | 231 +++++++++++++++++++++++++++++-------- 2 files changed, 194 insertions(+), 46 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 254d04a84ec..bba34a72c78 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -664,6 +664,7 @@ struct event_filter { int n_preds; /* Number assigned */ int a_preds; /* allocated */ struct filter_pred *preds; + struct filter_pred *root; char *filter_string; }; @@ -675,6 +676,9 @@ struct event_subsystem { int nr_events; }; +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) + struct filter_pred; struct regex; @@ -704,7 +708,10 @@ struct filter_pred { int offset; int not; int op; - int pop_n; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; }; extern struct list_head ftrace_common_fields; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2f5458e244a..10390491b6d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -123,6 +123,11 @@ struct filter_parse_state { } operand; }; +struct pred_stack { + struct filter_pred **preds; + int index; +}; + #define DEFINE_COMPARISON_PRED(type) \ static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ @@ -357,52 +362,95 @@ static void filter_build_regex(struct filter_pred *pred) pred->not ^= not; } +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1, top = 0, val1 = 0, val2 = 0; - int stack[MAX_FILTER_PRED]; + int match = -1; + enum move_type move = MOVE_DOWN; struct filter_pred *preds; struct filter_pred *pred; + struct filter_pred *root; int n_preds = ACCESS_ONCE(filter->n_preds); - int i; + int done = 0; /* no filter is considered a match */ if (!n_preds) return 1; /* - * n_preds and filter->preds is protect with preemption disabled. + * n_preds, root and filter->preds are protect with preemption disabled. */ preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; - for (i = 0; i < n_preds; i++) { - pred = &preds[i]; - if (!pred->pop_n) { + pred = root; + + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* keep going to leaf node */ + pred = &preds[pred->left]; + continue; + } match = pred->fn(pred, rec); - stack[top++] = match; + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* Check for short circuits */ + if ((match && pred->op == OP_OR) || + (!match && pred->op == OP_AND)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); continue; } - if (pred->pop_n > top) { - WARN_ON_ONCE(1); - return 0; - } - val1 = stack[--top]; - val2 = stack[--top]; - switch (pred->op) { - case OP_AND: - match = val1 && val2; - break; - case OP_OR: - match = val1 || val2; - break; - default: - WARN_ONCE(1, "filter op is not AND or OR"); - } - stack[top++] = match; - } + done = 1; + } while (!done); - return stack[--top]; + return match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -539,10 +587,58 @@ static void filter_clear_pred(struct filter_pred *pred) pred->regex.len = 0; } -static int filter_set_pred(struct filter_pred *dest, +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, struct filter_pred *src, filter_pred_fn_t fn) { + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + *dest = *src; if (src->field_name) { dest->field_name = kstrdup(src->field_name, GFP_KERNEL); @@ -550,8 +646,25 @@ static int filter_set_pred(struct filter_pred *dest, return -ENOMEM; } dest->fn = fn; + dest->index = idx; - return 0; + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + dest->left = left->index; + dest->right = right->index; + left->parent = dest->index; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. + */ + dest->left = FILTER_PRED_INVALID; + + return __push_pred_stack(stack, dest); } static void __free_preds(struct event_filter *filter) @@ -574,6 +687,7 @@ static void reset_preds(struct event_filter *filter) int i; filter->n_preds = 0; + filter->root = NULL; if (!filter->preds) return; @@ -707,6 +821,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, filter_pred_fn_t fn) { int idx, err; @@ -718,7 +833,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, idx = filter->n_preds; filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(&filter->preds[idx], pred, fn); + err = filter_set_pred(filter, idx, stack, pred, fn); if (err) return err; @@ -803,6 +918,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_call *call, struct event_filter *filter, struct filter_pred *pred, + struct pred_stack *stack, bool dry_run) { struct ftrace_event_field *field; @@ -812,13 +928,10 @@ static int filter_add_pred(struct filter_parse_state *ps, fn = pred->fn = filter_pred_none; - if (pred->op == OP_AND) { - pred->pop_n = 2; + if (pred->op == OP_AND) goto add_pred_fn; - } else if (pred->op == OP_OR) { - pred->pop_n = 2; + else if (pred->op == OP_OR) goto add_pred_fn; - } field = find_event_field(call, pred->field_name); if (!field) { @@ -867,7 +980,7 @@ static int filter_add_pred(struct filter_parse_state *ps, add_pred_fn: if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, fn); + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); return 0; } @@ -1248,6 +1361,7 @@ static int replace_preds(struct ftrace_event_call *call, char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ int err; int n_preds = 0; @@ -1262,9 +1376,12 @@ static int replace_preds(struct ftrace_event_call *call, return err; if (!dry_run) { - err = __alloc_preds(filter, n_preds); + err = __alloc_pred_stack(&stack, n_preds); if (err) return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; } n_preds = 0; @@ -1276,14 +1393,16 @@ static int replace_preds(struct ftrace_event_call *call, operand2 = elt->operand; else { parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } continue; } if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); - return -ENOSPC; + err = -ENOSPC; + goto fail; } if (elt->op == OP_AND || elt->op == OP_OR) { @@ -1293,22 +1412,44 @@ static int replace_preds(struct ftrace_event_call *call, if (!operand1 || !operand2) { parse_error(ps, FILT_ERR_MISSING_FIELD, 0); - return -EINVAL; + err = -EINVAL; + goto fail; } pred = create_pred(elt->op, operand1, operand2); add_pred: - if (!pred) - return -ENOMEM; - err = filter_add_pred(ps, call, filter, pred, dry_run); + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); filter_free_pred(pred); if (err) - return err; + goto fail; operand1 = operand2 = NULL; } - return 0; + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + filter->root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + } + + err = 0; +fail: + __free_pred_stack(&stack); + return err; } static int replace_system_preds(struct event_subsystem *system, -- cgit v1.2.3-70-g09d2 From 55719274188f13cff9e3bd11fdd4c0e7617cd03d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:12:05 -0500 Subject: tracing/filter: Optimize short ciruit check The test if we should break out early for OR and AND operations can be optimized by comparing the current result with (pred->op == OP_OR) That is if the result is true and the op is an OP_OR, or if the result is false and the op is not an OP_OR (thus an OP_AND) we can break out early in either case. Otherwise we continue processing. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 10390491b6d..0a3e0502b50 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -426,9 +426,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) pred->parent, &move); continue; case MOVE_UP_FROM_LEFT: - /* Check for short circuits */ - if ((match && pred->op == OP_OR) || - (!match && pred->op == OP_AND)) { + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { if (pred == root) break; pred = get_pred_parent(pred, preds, -- cgit v1.2.3-70-g09d2 From ec126cac23945de12eb2d103374e1f7ee97c5595 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:14:25 -0500 Subject: tracing/filter: Check the created pred tree Since the filter walks a tree to determine if a match is made or not, if the tree was incorrectly created, it could cause an infinite loop. Add a check to walk the entire tree before assigning it as a filter to make sure the tree is correct. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 72 +++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0a3e0502b50..91c9cdcb040 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1358,6 +1358,68 @@ static int count_preds(struct filter_parse_state *ps) return n_preds; } +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + int max; + + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + max = 3 * filter->n_preds; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + if (WARN_ON(count++ > max)) + return -EINVAL; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1366,6 +1428,7 @@ static int replace_preds(struct ftrace_event_call *call, { char *operand1 = NULL, *operand2 = NULL; struct filter_pred *pred; + struct filter_pred *root; struct postfix_elt *elt; struct pred_stack stack = { }; /* init to NULL */ int err; @@ -1442,7 +1505,7 @@ add_pred: if (!pred) return -EINVAL; /* This item is where we start from in matching */ - filter->root = pred; + root = pred; /* Make sure the stack is empty */ pred = __pop_pred_stack(&stack); if (WARN_ON(pred)) { @@ -1450,6 +1513,13 @@ add_pred: filter->root = NULL; goto fail; } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; } err = 0; -- cgit v1.2.3-70-g09d2 From 43cd414552d8137157e926e46361678ea867e476 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:16:51 -0500 Subject: tracing/filter: Optimize filter by folding the tree There are many cases that a filter will contain multiple ORs or ANDs together near the leafs. Walking up and down the tree to get to the next compare can be a waste. If there are several ORs or ANDs together, fold them into a single pred and allocate an array of the conditions that they check. This will speed up the filter by linearly walking an array and can still break out if a short circuit condition is met. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 12 +- kernel/trace/trace_events_filter.c | 233 +++++++++++++++++++++++++++++++++++-- 2 files changed, 235 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bba34a72c78..d754330934b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -678,6 +678,7 @@ struct event_subsystem { #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) struct filter_pred; struct regex; @@ -704,7 +705,16 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - char *field_name; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; int offset; int not; int op; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 91c9cdcb040..2403ce5b650 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -381,6 +381,42 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, return pred; } +/* + * A series of AND or ORs where found together. Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int type; + int match; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { @@ -414,11 +450,16 @@ int filter_match_preds(struct event_filter *filter, void *rec) case MOVE_DOWN: /* only AND and OR have children */ if (pred->left != FILTER_PRED_INVALID) { - /* keep going to leaf node */ - pred = &preds[pred->left]; - continue; - } - match = pred->fn(pred, rec); + /* If ops is set, then it was folded. */ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); /* If this pred is the only pred */ if (pred == root) break; @@ -659,17 +700,34 @@ static int filter_set_pred(struct event_filter *filter, left = __pop_pred_stack(stack); if (!left || !right) return -EINVAL; - dest->left = left->index; - dest->right = right->index; - left->parent = dest->index; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. + */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; right->parent = dest->index | FILTER_PRED_IS_RIGHT; - } else + } else { /* * Make dest->left invalid to be used as a quick * way to know this is a leaf node. */ dest->left = FILTER_PRED_INVALID; + /* All leafs allow folding the parent ops. */ + dest->index |= FILTER_PRED_FOLD; + } + return __push_pred_stack(stack, dest); } @@ -1420,6 +1478,158 @@ static int check_pred_tree(struct event_filter *filter, return 0; } +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + return 1; + count++; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return count; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int children; + int done = 0; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + + pred = root; + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + if (WARN_ON(count == children)) + return -EINVAL; + pred->index &= ~FILTER_PRED_FOLD; + root->ops[count++] = pred->index; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. + */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int done = 0; + int err; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->index & FILTER_PRED_FOLD) { + err = fold_pred(preds, pred); + if (err) + return err; + /* Folded nodes are like leafs */ + } else if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + static int replace_preds(struct ftrace_event_call *call, struct event_filter *filter, struct filter_parse_state *ps, @@ -1517,6 +1727,11 @@ add_pred: if (err) goto fail; + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + /* We don't set root until we know it works */ barrier(); filter->root = root; -- cgit v1.2.3-70-g09d2 From 4a3d27e98a7f2682e96d6f863752e0424b00d691 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:19:49 -0500 Subject: tracing/filter: Move MAX_FILTER_PRED to local tracing directory The MAX_FILTER_PRED is only needed by the kernel/trace/*.c files. Move it to kernel/trace/trace.h. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 - kernel/trace/trace.h | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 47e3997f7b5..1a99e7939c2 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -208,7 +208,6 @@ struct ftrace_event_call { #define PERF_MAX_TRACE_SIZE 2048 -#define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 256 /* Should handle KSYM_SYMBOL_LEN */ extern void destroy_preds(struct ftrace_event_call *call); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d754330934b..fbff872f8db 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,6 +680,8 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) +#define MAX_FILTER_PRED 32 + struct filter_pred; struct regex; -- cgit v1.2.3-70-g09d2 From bf93f9ed3a2cb89eb7e58851139d3be375b98027 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 27 Jan 2011 23:21:34 -0500 Subject: tracing/filter: Increase the max preds to 2^14 Now that the filter logic does not require to save the pred results on the stack, we can increase the max number of preds we allow. As the preds are index by a short value, and we use the MSBs as flags we can increase the max preds to 2^14 (16384) which should be way more than enough. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fbff872f8db..856e73cc1d3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -680,7 +680,14 @@ struct event_subsystem { #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) -#define MAX_FILTER_PRED 32 +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. + */ +#define MAX_FILTER_PRED 16384 struct filter_pred; struct regex; -- cgit v1.2.3-70-g09d2 From 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:25:46 -0500 Subject: tracing/filter: Swap entire filter of events When creating a new filter, instead of allocating the filter to the event call first and then processing the filter, it is easier to process a temporary filter and then just swap it with the call filter. By doing this, it simplifies the code. A filter is allocated and processed, when it is done, it is swapped with the call filter, synchronize_sched() is called to make sure all callers are done with the old filter (filters are called with premption disabled), and then the old filter is freed. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 251 +++++++++++++++++++++---------------- 1 file changed, 146 insertions(+), 105 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2403ce5b650..f5d335d28d0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -425,10 +425,15 @@ int filter_match_preds(struct event_filter *filter, void *rec) struct filter_pred *preds; struct filter_pred *pred; struct filter_pred *root; - int n_preds = ACCESS_ONCE(filter->n_preds); + int n_preds; int done = 0; /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + if (!n_preds) return 1; @@ -509,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos) static void remove_filter_string(struct event_filter *filter) { + if (!filter) + return; + kfree(filter->filter_string); filter->filter_string = NULL; } @@ -568,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps, void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { - struct event_filter *filter = call->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = call->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -581,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { - struct event_filter *filter = system->filter; + struct event_filter *filter; mutex_lock(&event_mutex); + filter = system->filter; if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else @@ -745,26 +755,9 @@ static void __free_preds(struct event_filter *filter) filter->n_preds = 0; } -static void reset_preds(struct event_filter *filter) -{ - int n_preds = filter->n_preds; - int i; - - filter->n_preds = 0; - filter->root = NULL; - if (!filter->preds) - return; - - for (i = 0; i < n_preds; i++) - filter->preds[i].fn = filter_pred_none; -} - -static void filter_disable_preds(struct ftrace_event_call *call) +static void filter_disable(struct ftrace_event_call *call) { - struct event_filter *filter = call->filter; - call->flags &= ~TRACE_EVENT_FL_FILTERED; - reset_preds(filter); } static void __free_filter(struct event_filter *filter) @@ -777,11 +770,16 @@ static void __free_filter(struct event_filter *filter) kfree(filter); } +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. + */ void destroy_preds(struct ftrace_event_call *call) { __free_filter(call->filter); call->filter = NULL; - call->flags &= ~TRACE_EVENT_FL_FILTERED; } static struct event_filter *__alloc_filter(void) @@ -789,11 +787,6 @@ static struct event_filter *__alloc_filter(void) struct event_filter *filter; filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!filter) - return ERR_PTR(-ENOMEM); - - filter->n_preds = 0; - return filter; } @@ -838,46 +831,28 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) return 0; } -static int init_filter(struct ftrace_event_call *call) -{ - if (call->filter) - return 0; - - call->flags &= ~TRACE_EVENT_FL_FILTERED; - call->filter = __alloc_filter(); - if (IS_ERR(call->filter)) - return PTR_ERR(call->filter); - - return 0; -} - -static int init_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; - int err; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - err = init_filter(call); - if (err) - return err; + filter_disable(call); + remove_filter_string(call->filter); } - - return 0; } -static void filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_filters(struct event_subsystem *system) { struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { if (strcmp(call->class->system, system->name) != 0) continue; - - filter_disable_preds(call); - remove_filter_string(call->filter); + __free_filter(call->filter); + call->filter = NULL; } } @@ -1743,88 +1718,129 @@ fail: return err; } +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + static int replace_system_preds(struct event_subsystem *system, struct filter_parse_state *ps, char *filter_string) { struct ftrace_event_call *call; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); bool fail = true; int err; list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; if (strcmp(call->class->system, system->name) != 0) continue; - /* try to see if the filter can be applied */ - err = replace_preds(call, filter, ps, filter_string, true); + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(call, NULL, ps, filter_string, true); if (err) goto fail; } - /* set all filter pred counts to zero */ list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + struct event_filter *filter; if (strcmp(call->class->system, system->name) != 0) continue; - reset_preds(filter); - } + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; - /* - * Since some of the preds may be used under preemption - * we need to wait for them to finish before we may - * reallocate them. - */ - synchronize_sched(); + list_add_tail(&filter_item->list, &filter_list); - list_for_each_entry(call, &ftrace_events, list) { - struct event_filter *filter = call->filter; + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; - if (strcmp(call->class->system, system->name) != 0) - continue; + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); + if (err) + goto fail_mem; - /* really apply the filter */ - filter_disable_preds(call); err = replace_preds(call, filter, ps, filter_string, false); - if (err) - filter_disable_preds(call); - else { + if (err) { + filter_disable(call); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; - replace_filter_string(filter, filter_string); - } + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = call->filter; + call->filter = filter_item->filter; + filter_item->filter = filter; + fail = false; } if (fail) goto fail; + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. + */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } return 0; fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; } int apply_event_filter(struct ftrace_event_call *call, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + struct event_filter *tmp; + int err = 0; mutex_lock(&event_mutex); - err = init_filter(call); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { - filter_disable_preds(call); - reset_preds(call->filter); + filter_disable(call); + filter = call->filter; + if (!filter) + goto out_unlock; + call->filter = NULL; /* Make sure the filter is not being used */ synchronize_sched(); - __free_preds(call->filter); - remove_filter_string(call->filter); + __free_filter(filter); goto out_unlock; } @@ -1833,29 +1849,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (!ps) goto out_unlock; - filter_disable_preds(call); - replace_filter_string(call->filter, filter_string); + filter = __alloc_filter(); + if (!filter) { + kfree(ps); + goto out_unlock; + } + + replace_filter_string(filter, filter_string); parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); if (err) { - append_filter_err(ps, call->filter); + append_filter_err(ps, filter); goto out; } - /* - * Make sure all the pred counts are zero so that - * no task is using it when we reallocate the preds array. - */ - reset_preds(call->filter); - synchronize_sched(); - - err = replace_preds(call, call->filter, ps, filter_string, false); - if (err) - append_filter_err(ps, call->filter); - else + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + append_filter_err(ps, filter); + } else call->flags |= TRACE_EVENT_FL_FILTERED; out: + /* + * Always swap the call filter with the new filter + * even if there was an error. If there was an error + * in the filter, we disable the filter and show the error + * string + */ + tmp = call->filter; + call->filter = filter; + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); @@ -1868,18 +1896,21 @@ out_unlock: int apply_subsystem_event_filter(struct event_subsystem *system, char *filter_string) { - int err; struct filter_parse_state *ps; + struct event_filter *filter; + int err = 0; mutex_lock(&event_mutex); - err = init_subsystem_preds(system); - if (err) - goto out_unlock; - if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(system); + __free_filter(filter); goto out_unlock; } @@ -1888,7 +1919,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!ps) goto out_unlock; - replace_filter_string(system->filter, filter_string); + filter = __alloc_filter(); + if (!filter) + goto out; + + replace_filter_string(filter, filter_string); + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). + */ + __free_filter(system->filter); + system->filter = filter; parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); @@ -1945,7 +1986,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; filter = __alloc_filter(); - if (IS_ERR(filter)) { + if (!filter) { err = PTR_ERR(filter); goto out_unlock; } -- cgit v1.2.3-70-g09d2 From 4defe682d81a4960b6840ee4ed1a36f9db77c7bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 Feb 2011 23:29:06 -0500 Subject: tracing/filter: Remove synchronize_sched() from __alloc_preds() Because the filters are processed first and then activated (added to the call), we no longer need to worry about the preds of the filter in __alloc_preds() being used. As the filter that is allocating preds is not activated yet. Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f5d335d28d0..3249b4f77ef 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -795,33 +795,17 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) struct filter_pred *pred; int i; - if (filter->preds) { - if (filter->a_preds < n_preds) { - /* - * We need to reallocate. - * We should have already have zeroed out - * the pred count and called synchronized_sched() - * to make sure no one is using the preds. - */ - if (WARN_ON_ONCE(filter->n_preds)) { - /* We need to reset it now */ - filter->n_preds = 0; - synchronize_sched(); - } - __free_preds(filter); - } - } + if (filter->preds) + __free_preds(filter); + + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - if (!filter->preds) { - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); - filter->a_preds = n_preds; - } if (!filter->preds) return -ENOMEM; - if (WARN_ON(filter->a_preds < n_preds)) - return -EINVAL; + filter->a_preds = n_preds; + filter->n_preds = 0; for (i = 0; i < n_preds; i++) { pred = &filter->preds[i]; -- cgit v1.2.3-70-g09d2