From c0268e8d1f450e286fc55e77f53a9ede6b72acab Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 24 Oct 2013 10:10:51 -0300 Subject: perf script python: Fix mem leak due to missing Py_DECREFs on dict entries We are using the Python scripting interface in perf to extract kernel events relevant for performance analysis of HPC codes. We noticed that the "perf script" call allocates a significant amount of memory (in the order of several 100 MiB) during its run, e.g. 125 MiB for a 25 MiB input file: $> perf record -o perf.data -a -R -g fp \ -e power:cpu_frequency -e sched:sched_switch \ -e sched:sched_migrate_task -e sched:sched_process_exit \ -e sched:sched_process_fork -e sched:sched_process_exec \ -e cycles -m 4096 --freq 4000 $> /usr/bin/time perf script -i perf.data -s dummy_script.py 0.84user 0.13system 0:01.92elapsed 51%CPU (0avgtext+0avgdata 125532maxresident)k 73072inputs+0outputs (57major+33086minor)pagefaults 0swaps Upon further investigation using the valgrind massif tool, we noticed that Python objects that are created in trace-event-python.c via PyString_FromString*() (and their Integer and Long counterparts) are never freed. The reason for this seems to be missing Py_DECREF calls on the objects that are returned by these functions and stored in the Python dictionaries. The Python dictionaries do not steal references (as opposed to Python tuples and lists) but instead add their own reference. Hence, the reference that is returned by these object creation functions is never released and the memory is leaked. (see [1,2]) The attached patch fixes this by wrapping all relevant calls to PyDict_SetItemString() and decrementing the reference counter immediately after the Python function call. 
This reduces the allocated memory to a reasonable amount: $> /usr/bin/time perf script -i perf.data -s dummy_script.py 0.73user 0.05system 0:00.79elapsed 99%CPU (0avgtext+0avgdata 49132maxresident)k 0inputs+0outputs (0major+14045minor)pagefaults 0swaps For comparison, with a 120 MiB input file the memory consumption reported by time drops from almost 600 MiB to 146 MiB. The patch has been tested using Linux 3.8.2 with Python 2.7.4 and Linux 3.11.6 with Python 2.7.5. Please let me know if you need any further information. [1] http://docs.python.org/2/c-api/tuple.html#PyTuple_SetItem [2] http://docs.python.org/2/c-api/dict.html#PyDict_SetItemString Signed-off-by: Joseph Schuchart Reviewed-by: Tom Zanussi Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Tom Zanussi Link: http://lkml.kernel.org/r/1381468543-25334-4-git-send-email-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- .../util/scripting-engines/trace-event-python.c | 37 ++++++++++++++-------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'tools/perf/util/scripting-engines/trace-event-python.c') diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index cc75a3cef38..95d91a0b23a 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -56,6 +56,17 @@ static void handler_call_die(const char *handler_name) Py_FatalError("problem in Python trace event handler"); } +/* + * Insert val into into the dictionary and decrement the reference counter. + * This is necessary for dictionaries since PyDict_SetItemString() does not + * steal a reference, as opposed to PyTuple_SetItem(). 
+ */ +static void pydict_set_item_string_decref(PyObject *dict, const char *key, PyObject *val) +{ + PyDict_SetItemString(dict, key, val); + Py_DECREF(val); +} + static void define_value(enum print_arg_type field_type, const char *ev_name, const char *field_name, @@ -279,11 +290,11 @@ static void python_process_tracepoint(union perf_event *perf_event PyTuple_SetItem(t, n++, PyInt_FromLong(pid)); PyTuple_SetItem(t, n++, PyString_FromString(comm)); } else { - PyDict_SetItemString(dict, "common_cpu", PyInt_FromLong(cpu)); - PyDict_SetItemString(dict, "common_s", PyInt_FromLong(s)); - PyDict_SetItemString(dict, "common_ns", PyInt_FromLong(ns)); - PyDict_SetItemString(dict, "common_pid", PyInt_FromLong(pid)); - PyDict_SetItemString(dict, "common_comm", PyString_FromString(comm)); + pydict_set_item_string_decref(dict, "common_cpu", PyInt_FromLong(cpu)); + pydict_set_item_string_decref(dict, "common_s", PyInt_FromLong(s)); + pydict_set_item_string_decref(dict, "common_ns", PyInt_FromLong(ns)); + pydict_set_item_string_decref(dict, "common_pid", PyInt_FromLong(pid)); + pydict_set_item_string_decref(dict, "common_comm", PyString_FromString(comm)); } for (field = event->format.fields; field; field = field->next) { if (field->flags & FIELD_IS_STRING) { @@ -313,7 +324,7 @@ static void python_process_tracepoint(union perf_event *perf_event if (handler) PyTuple_SetItem(t, n++, obj); else - PyDict_SetItemString(dict, field->name, obj); + pydict_set_item_string_decref(dict, field->name, obj); } if (!handler) @@ -370,21 +381,21 @@ static void python_process_general_event(union perf_event *perf_event if (!handler || !PyCallable_Check(handler)) goto exit; - PyDict_SetItemString(dict, "ev_name", PyString_FromString(perf_evsel__name(evsel))); - PyDict_SetItemString(dict, "attr", PyString_FromStringAndSize( + pydict_set_item_string_decref(dict, "ev_name", PyString_FromString(perf_evsel__name(evsel))); + pydict_set_item_string_decref(dict, "attr", PyString_FromStringAndSize( (const char 
*)&evsel->attr, sizeof(evsel->attr))); - PyDict_SetItemString(dict, "sample", PyString_FromStringAndSize( + pydict_set_item_string_decref(dict, "sample", PyString_FromStringAndSize( (const char *)sample, sizeof(*sample))); - PyDict_SetItemString(dict, "raw_buf", PyString_FromStringAndSize( + pydict_set_item_string_decref(dict, "raw_buf", PyString_FromStringAndSize( (const char *)sample->raw_data, sample->raw_size)); - PyDict_SetItemString(dict, "comm", + pydict_set_item_string_decref(dict, "comm", PyString_FromString(thread->comm)); if (al->map) { - PyDict_SetItemString(dict, "dso", + pydict_set_item_string_decref(dict, "dso", PyString_FromString(al->map->dso->name)); } if (al->sym) { - PyDict_SetItemString(dict, "symbol", + pydict_set_item_string_decref(dict, "symbol", PyString_FromString(al->sym->name)); } -- cgit v1.2.3-70-g09d2 From b9c5143a012a543c4ee872498d6dbae5c10beb2e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Sep 2013 14:46:56 +0200 Subject: perf tools: Use an accessor to read thread comm As the thread comm is going to be implemented by way of a more complicated data structure than just a pointer to a string from the thread struct, convert the readers of comm to use an accessor instead of accessing it directly. The accessor will later be overridden to support an enhanced comm implementation. 
Signed-off-by: Frederic Weisbecker Tested-by: Jiri Olsa Cc: Jiri Olsa Cc: David Ahern Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-wr683zwy94hmj4ibogmnv9ce@git.kernel.org [ Rename thread__comm_curr() to thread__comm_str() ] Signed-off-by: Namhyung Kim [ Fixed up some minor const pointer issues ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-kmem.c | 2 +- tools/perf/builtin-lock.c | 2 +- tools/perf/builtin-sched.c | 16 ++++++++-------- tools/perf/builtin-script.c | 6 +++--- tools/perf/tests/hists_link.c | 2 +- tools/perf/ui/browsers/hists.c | 10 +++++----- tools/perf/util/event.c | 4 ++-- tools/perf/util/scripting-engines/trace-event-perl.c | 2 +- tools/perf/util/scripting-engines/trace-event-python.c | 4 ++-- tools/perf/util/sort.c | 11 ++++++----- tools/perf/util/thread.c | 7 ++++++- tools/perf/util/thread.h | 1 + 12 files changed, 37 insertions(+), 30 deletions(-) (limited to 'tools/perf/util/scripting-engines/trace-event-python.c') diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 1126382659a..a28970f7ddf 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -315,7 +315,7 @@ static int process_sample_event(struct perf_tool *tool __maybe_unused, return -1; } - dump_printf(" ... thread: %s:%d\n", thread->comm, thread->tid); + dump_printf(" ... 
thread: %s:%d\n", thread__comm_str(thread), thread->tid); if (evsel->handler.func != NULL) { tracepoint_handler f = evsel->handler.func; diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 33c7253295b..35f9aaa565c 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -767,7 +767,7 @@ static void dump_threads(void) while (node) { st = container_of(node, struct thread_stat, rb); t = perf_session__findnew(session, st->tid); - pr_info("%10d: %s\n", st->tid, t->comm); + pr_info("%10d: %s\n", st->tid, thread__comm_str(t)); node = rb_next(node); }; } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index ddb5dc15be1..a81ab1828aa 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -737,12 +737,12 @@ static int replay_fork_event(struct perf_sched *sched, if (verbose) { printf("fork event\n"); - printf("... parent: %s/%d\n", parent->comm, parent->tid); - printf("... child: %s/%d\n", child->comm, child->tid); + printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid); + printf("... 
child: %s/%d\n", thread__comm_str(child), child->tid); } - register_pid(sched, parent->tid, parent->comm); - register_pid(sched, child->tid, child->comm); + register_pid(sched, parent->tid, thread__comm_str(parent)); + register_pid(sched, child->tid, thread__comm_str(child)); return 0; } @@ -1077,7 +1077,7 @@ static int latency_migrate_task_event(struct perf_sched *sched, if (!atoms) { if (thread_atoms_insert(sched, migrant)) return -1; - register_pid(sched, migrant->tid, migrant->comm); + register_pid(sched, migrant->tid, thread__comm_str(migrant)); atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid); if (!atoms) { pr_err("migration-event: Internal tree error"); @@ -1111,13 +1111,13 @@ static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_ /* * Ignore idle threads: */ - if (!strcmp(work_list->thread->comm, "swapper")) + if (!strcmp(thread__comm_str(work_list->thread), "swapper")) return; sched->all_runtime += work_list->total_runtime; sched->all_count += work_list->nb_atoms; - ret = printf(" %s:%d ", work_list->thread->comm, work_list->thread->tid); + ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid); for (i = 0; i < 24 - ret; i++) printf(" "); @@ -1334,7 +1334,7 @@ static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel, printf(" %12.6f secs ", (double)timestamp/1e9); if (new_shortname) { printf("%s => %s:%d\n", - sched_in->shortname, sched_in->comm, sched_in->tid); + sched_in->shortname, thread__comm_str(sched_in), sched_in->tid); } else { printf("\n"); } diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 0ae88c2538a..b866cc8c387 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -291,11 +291,11 @@ static void print_sample_start(struct perf_sample *sample, if (PRINT_FIELD(COMM)) { if (latency_format) - printf("%8.8s ", thread->comm); + printf("%8.8s ", thread__comm_str(thread)); else if (PRINT_FIELD(IP) && 
symbol_conf.use_callchain) - printf("%s ", thread->comm); + printf("%s ", thread__comm_str(thread)); else - printf("%16s ", thread->comm); + printf("%16s ", thread__comm_str(thread)); } if (PRINT_FIELD(PID) && PRINT_FIELD(TID)) diff --git a/tools/perf/tests/hists_link.c b/tools/perf/tests/hists_link.c index b51abcb2c24..4475b0ff76e 100644 --- a/tools/perf/tests/hists_link.c +++ b/tools/perf/tests/hists_link.c @@ -421,7 +421,7 @@ static void print_hists(struct hists *hists) he = rb_entry(node, struct hist_entry, rb_node_in); pr_info("%2d: entry: %-8s [%-8s] %20s: period = %"PRIu64"\n", - i, he->thread->comm, he->ms.map->dso->short_name, + i, thread__comm_str(he->thread), he->ms.map->dso->short_name, he->ms.sym->name, he->stat.period); i++; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 7ef36c36047..a91b6b21941 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1255,7 +1255,7 @@ static int hists__browser_title(struct hists *hists, char *bf, size_t size, if (thread) printed += scnprintf(bf + printed, size - printed, ", Thread: %s(%d)", - (thread->comm_set ? thread->comm : ""), + (thread->comm_set ? thread__comm_str(thread) : ""), thread->tid); if (dso) printed += scnprintf(bf + printed, size - printed, @@ -1578,7 +1578,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, if (thread != NULL && asprintf(&options[nr_options], "Zoom %s %s(%d) thread", (browser->hists->thread_filter ? "out of" : "into"), - (thread->comm_set ? thread->comm : ""), + (thread->comm_set ? 
thread__comm_str(thread) : ""), thread->tid) > 0) zoom_thread = nr_options++; @@ -1598,7 +1598,7 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, struct symbol *sym; if (asprintf(&options[nr_options], "Run scripts for samples of thread [%s]", - browser->he_selection->thread->comm) > 0) + thread__comm_str(browser->he_selection->thread)) > 0) scripts_comm = nr_options++; sym = browser->he_selection->ms.sym; @@ -1701,7 +1701,7 @@ zoom_out_thread: sort_thread.elide = false; } else { ui_helpline__fpush("To zoom out press <- or -> + \"Zoom out of %s(%d) thread\"", - thread->comm_set ? thread->comm : "", + thread->comm_set ? thread__comm_str(thread) : "", thread->tid); browser->hists->thread_filter = thread; sort_thread.elide = true; @@ -1717,7 +1717,7 @@ do_scripts: memset(script_opt, 0, 64); if (choice == scripts_comm) - sprintf(script_opt, " -c %s ", browser->he_selection->thread->comm); + sprintf(script_opt, " -c %s ", thread__comm_str(browser->he_selection->thread)); if (choice == scripts_symbol) sprintf(script_opt, " -S %s ", browser->he_selection->ms.sym->name); diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 49096ea58a1..7a2842ed53f 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -721,10 +721,10 @@ int perf_event__preprocess_sample(const union perf_event *event, return -1; if (symbol_conf.comm_list && - !strlist__has_entry(symbol_conf.comm_list, thread->comm)) + !strlist__has_entry(symbol_conf.comm_list, thread__comm_str(thread))) goto out_filtered; - dump_printf(" ... thread: %s:%d\n", thread->comm, thread->tid); + dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread->tid); /* * Have we already created the kernel maps for this machine? 
* diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index c0c9795c4f0..d5e5969f6fe 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -273,7 +273,7 @@ static void perl_process_tracepoint(union perf_event *perf_event __maybe_unused, int cpu = sample->cpu; void *data = sample->raw_data; unsigned long long nsecs = sample->time; - char *comm = thread->comm; + const char *comm = thread__comm_str(thread); dSP; diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 95d91a0b23a..53c20e7fd90 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -250,7 +250,7 @@ static void python_process_tracepoint(union perf_event *perf_event int cpu = sample->cpu; void *data = sample->raw_data; unsigned long long nsecs = sample->time; - char *comm = thread->comm; + const char *comm = thread__comm_str(thread); t = PyTuple_New(MAX_FIELDS); if (!t) @@ -389,7 +389,7 @@ static void python_process_general_event(union perf_event *perf_event pydict_set_item_string_decref(dict, "raw_buf", PyString_FromStringAndSize( (const char *)sample->raw_data, sample->raw_size)); pydict_set_item_string_decref(dict, "comm", - PyString_FromString(thread->comm)); + PyString_FromString(thread__comm_str(thread))); if (al->map) { pydict_set_item_string_decref(dict, "dso", PyString_FromString(al->map->dso->name)); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 19b4aa279d1..835e8bdd869 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -42,7 +42,7 @@ static int repsep_snprintf(char *bf, size_t size, const char *fmt, ...) 
return n; } -static int64_t cmp_null(void *l, void *r) +static int64_t cmp_null(const void *l, const void *r) { if (!l && !r) return 0; @@ -63,8 +63,9 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right) static int hist_entry__thread_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { + const char *comm = thread__comm_str(he->thread); return repsep_snprintf(bf, size, "%*s:%5d", width - 6, - he->thread->comm ?: "", he->thread->tid); + comm ?: "", he->thread->tid); } struct sort_entry sort_thread = { @@ -85,8 +86,8 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right) static int64_t sort__comm_collapse(struct hist_entry *left, struct hist_entry *right) { - char *comm_l = left->thread->comm; - char *comm_r = right->thread->comm; + const char *comm_l = thread__comm_str(left->thread); + const char *comm_r = thread__comm_str(right->thread); if (!comm_l || !comm_r) return cmp_null(comm_l, comm_r); @@ -97,7 +98,7 @@ sort__comm_collapse(struct hist_entry *left, struct hist_entry *right) static int hist_entry__comm_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { - return repsep_snprintf(bf, size, "%*s", width, he->thread->comm); + return repsep_snprintf(bf, size, "%*s", width, thread__comm_str(he->thread)); } struct sort_entry sort_comm = { diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index 80d19a08607..56760079565 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -45,6 +45,11 @@ int thread__set_comm(struct thread *thread, const char *comm) return err; } +const char *thread__comm_str(const struct thread *thread) +{ + return thread->comm; +} + int thread__comm_len(struct thread *thread) { if (!thread->comm_len) { @@ -58,7 +63,7 @@ int thread__comm_len(struct thread *thread) size_t thread__fprintf(struct thread *thread, FILE *fp) { - return fprintf(fp, "Thread %d %s\n", thread->tid, thread->comm) + + return fprintf(fp, "Thread %d %s\n", thread->tid, 
thread__comm_str(thread)) + map_groups__fprintf(&thread->mg, verbose, fp); } diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index 4ebbb40d46d..6561ad21d9a 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -35,6 +35,7 @@ static inline void thread__exited(struct thread *thread) int thread__set_comm(struct thread *self, const char *comm); int thread__comm_len(struct thread *self); +const char *thread__comm_str(const struct thread *thread); void thread__insert_map(struct thread *self, struct map *map); int thread__fork(struct thread *self, struct thread *parent); size_t thread__fprintf(struct thread *thread, FILE *fp); -- cgit v1.2.3-70-g09d2