X-Git-Url: http://git.efficios.com/?a=blobdiff_plain;f=kernel%2Fperf_counter.c;h=fcdafa234a5d9626d28d32a5db50494b2222b2f1;hb=0d905bca23aca5c86a10ee101bcd3b1abbd40b25;hp=1773c5d7427dc9e9d57eded65ee3f8b4cf9a4134;hpb=7dd1fcc258b65da718f01e4684a7b9244501a9fb;p=deliverable%2Flinux.git diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1773c5d7427d..fcdafa234a5d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1,13 +1,16 @@ /* * Performance counter core code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. * - * For licencing details see kernel-base/COPYING + * For licensing details see kernel-base/COPYING */ #include +#include #include #include #include @@ -15,13 +18,17 @@ #include #include #include +#include +#include +#include #include #include #include #include #include -#include -#include +#include + +#include /* * Each CPU has a list of per CPU counters: @@ -32,6 +39,12 @@ int perf_max_counters __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; +static atomic_t nr_mmap_tracking __read_mostly; +static atomic_t nr_munmap_tracking __read_mostly; +static atomic_t nr_comm_tracking __read_mostly; + +int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ + /* * Mutex for (sysadmin-configurable) counter reservations: */ @@ -40,8 +53,7 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { return NULL; } @@ -70,8 +82,12 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) */ if (counter->group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); - else + else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } + + list_add_rcu(&counter->event_entry, &ctx->event_list); } static void @@ -80,6 +96,10 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) struct perf_counter *sibling, *tmp; list_del_init(&counter->list_entry); + list_del_rcu(&counter->event_entry); + + if (counter->group_leader != counter) + counter->group_leader->nr_siblings--; /* * If this was a group counter with sibling counters then @@ -103,7 +123,8 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); + counter->tstamp_stopped = ctx->time; + counter->pmu->disable(counter); counter->oncpu = -1; if (!is_software_counter(counter)) @@ -157,8 +178,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); counter_sched_out(counter, cpuctx, ctx); @@ -183,8 +203,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -238,6 +257,55 @@ retry: spin_unlock_irq(&ctx->lock); } +static inline u64 perf_clock(void) +{ + return 
cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_counter_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a counter. + */ +static void update_counter_times(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + u64 run_end; + + if (counter->state < PERF_COUNTER_STATE_INACTIVE) + return; + + counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time; + + counter->total_time_running = run_end - counter->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all counters in a group. + */ +static void update_group_times(struct perf_counter *leader) +{ + struct perf_counter *counter; + + update_counter_times(leader); + list_for_each_entry(counter, &leader->sibling_list, list_entry) + update_counter_times(counter); +} + /* * Cross CPU call to disable a performance counter */ @@ -255,14 +323,15 @@ static void __perf_counter_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * If the counter is on, turn it off. * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + update_context_time(ctx); + update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); else @@ -270,8 +339,7 @@ static void __perf_counter_disable(void *info) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -307,8 +375,10 @@ static void perf_counter_disable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. 
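	 * If it is still INACTIVE, the update_counter_times() call below
	 * records the time it has spent enabled before it is switched OFF.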
*/ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } spin_unlock_irq(&ctx->lock); } @@ -347,12 +417,14 @@ counter_sched_in(struct perf_counter *counter, */ smp_wmb(); - if (counter->hw_ops->enable(counter)) { + if (counter->pmu->enable(counter)) { counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; return -EAGAIN; } + counter->tstamp_running += ctx->time - counter->tstamp_stopped; + if (!is_software_counter(counter)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -373,9 +445,11 @@ static int is_software_only_group(struct perf_counter *leader) if (!is_software_counter(leader)) return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) if (!is_software_counter(counter)) return 0; + return 1; } @@ -410,6 +484,17 @@ static int group_can_go_on(struct perf_counter *counter, return can_add_hw; } +static void add_counter_to_ctx(struct perf_counter *counter, + struct perf_counter_context *ctx) +{ + list_add_counter(counter, ctx); + ctx->nr_counters++; + counter->prev_state = PERF_COUNTER_STATE_OFF; + counter->tstamp_enabled = ctx->time; + counter->tstamp_running = ctx->time; + counter->tstamp_stopped = ctx->time; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -432,8 +517,8 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); /* * Protect the list operation against NMI by disabling the @@ -441,9 +526,7 @@ static void __perf_install_in_context(void *info) */ perf_flags = hw_perf_save_disable(); - list_add_counter(counter, ctx); - ctx->nr_counters++; - counter->prev_state = PERF_COUNTER_STATE_OFF; + add_counter_to_ctx(counter, ctx); /* * Don't put the counter on if it is disabled or if @@ -471,8 +554,10 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } if (!err && !ctx->task && cpuctx->max_pertask) @@ -481,8 +566,7 @@ static void __perf_install_in_context(void *info) unlock: hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -533,10 +617,8 @@ retry: * can add the counter safely, if it the call above did not * succeed. 
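	 * add_counter_to_ctx() also stamps tstamp_enabled/running/stopped
	 * with ctx->time, so time accounting starts from "now".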
*/ - if (list_empty(&counter->list_entry)) { - list_add_counter(counter, ctx); - ctx->nr_counters++; - } + if (list_empty(&counter->list_entry)) + add_counter_to_ctx(counter, ctx); spin_unlock_irq(&ctx->lock); } @@ -559,13 +641,14 @@ static void __perf_counter_enable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -587,13 +670,14 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } unlock: - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -644,12 +728,21 @@ static void perf_counter_enable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; + } out: spin_unlock_irq(&ctx->lock); } +static void perf_counter_refresh(struct perf_counter *counter, int refresh) +{ + atomic_add(refresh, &counter->event_limit); + perf_counter_enable(counter); +} + /* * Enable a counter and all its children. */ @@ -678,6 +771,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; + update_context_time(ctx); flags = hw_perf_save_disable(); if (ctx->nr_active) { @@ -704,10 +798,15 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct pt_regs *regs; if (likely(!cpuctx->task_ctx)) return; + update_context_time(ctx); + + regs = task_pt_regs(task); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -779,6 +878,8 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; + ctx->timestamp = perf_clock(); + flags = hw_perf_save_disable(); /* @@ -799,8 +900,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. 
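		 * update_group_times() runs first so the group still reports
		 * the time it was enabled once it is put into the ERROR state.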
*/ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_ERROR; + } } list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -868,12 +971,9 @@ int perf_counter_task_disable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); - /* force the update of the task clock: */ - __task_delta_exec(curr, 1); - perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -884,15 +984,15 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) + if (counter->state != PERF_COUNTER_STATE_ERROR) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } } hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); return 0; } @@ -909,12 +1009,9 @@ int perf_counter_task_enable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); - /* force the update of the task clock: */ - __task_delta_exec(curr, 1); - perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -928,6 +1025,8 @@ int perf_counter_task_enable(void) if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -936,7 +1035,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); return 0; } @@ -970,18 +1069,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &curr->perf_counter_ctx; - const int rotate_percpu = 0; - if (rotate_percpu) - perf_counter_cpu_sched_out(cpuctx); + perf_counter_cpu_sched_out(cpuctx); perf_counter_task_sched_out(curr, cpu); - if (rotate_percpu) - rotate_ctx(&cpuctx->ctx); + rotate_ctx(&cpuctx->ctx); rotate_ctx(ctx); - if (rotate_percpu) - perf_counter_cpu_sched_in(cpuctx, cpu); + perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); } @@ -991,11 +1086,15 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - curr_rq_lock_irq_save(&flags); - counter->hw_ops->read(counter); - curr_rq_unlock_irq_restore(&flags); + local_irq_save(flags); + if (ctx->is_active) + update_context_time(ctx); + counter->pmu->read(counter); + update_counter_times(counter); + local_irq_restore(flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -1007,71 +1106,13 @@ static u64 perf_counter_read(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __read, counter, 1); + } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); } return atomic64_read(&counter->count); } -/* - * Cross CPU call to switch performance data pointers - */ -static void __perf_switch_irq_data(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 
- struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task) { - if (cpuctx->task_ctx != ctx) - return; - spin_lock(&ctx->lock); - } - - /* Change the pointer NMI safe */ - atomic_long_set((atomic_long_t *)&counter->irqdata, - (unsigned long) counter->usrdata); - counter->usrdata = oldirqdata; - - if (ctx->task) - spin_unlock(&ctx->lock); -} - -static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - struct task_struct *task = ctx->task; - - if (!task) { - smp_call_function_single(counter->cpu, - __perf_switch_irq_data, - counter, 1); - return counter->usrdata; - } - -retry: - spin_lock_irq(&ctx->lock); - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - counter->irqdata = counter->usrdata; - counter->usrdata = oldirqdata; - spin_unlock_irq(&ctx->lock); - return oldirqdata; - } - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_switch_irq_data, counter); - /* Might have failed, because task was scheduled out */ - if (counter->irqdata == oldirqdata) - goto retry; - - return counter->usrdata; -} - static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1089,7 +1130,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (!capable(CAP_SYS_ADMIN)) + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) @@ -1133,6 +1174,33 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ctx; } +static void free_counter_rcu(struct rcu_head *head) +{ + struct perf_counter *counter; + + counter = container_of(head, struct perf_counter, rcu_head); + kfree(counter); +} + +static void perf_pending_sync(struct perf_counter *counter); + +static void free_counter(struct perf_counter *counter) +{ + perf_pending_sync(counter); + + if (counter->hw_event.mmap) + atomic_dec(&nr_mmap_tracking); + if (counter->hw_event.munmap) + atomic_dec(&nr_munmap_tracking); + if (counter->hw_event.comm) + atomic_dec(&nr_comm_tracking); + + if (counter->destroy) + counter->destroy(counter); + + call_rcu(&counter->rcu_head, free_counter_rcu); +} + /* * Called when the last reference to the file is gone. 
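 * free_counter() defers the final kfree() through RCU, since the counter
 * may still be reached by lock-free walkers of ctx->event_list.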
*/ @@ -1151,7 +1219,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - kfree(counter); + free_counter(counter); put_context(ctx); return 0; @@ -1163,10 +1231,8 @@ static int perf_release(struct inode *inode, struct file *file) static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 cntval; - - if (count != sizeof(cntval)) - return -EINVAL; + u64 values[3]; + int n; /* * Return end-of-file for a read on a counter that is in @@ -1177,127 +1243,47 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return 0; mutex_lock(&counter->mutex); - cntval = perf_counter_read(counter); + values[0] = perf_counter_read(counter); + n = 1; + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); mutex_unlock(&counter->mutex); - return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); -} - -static ssize_t -perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) -{ - if (!usrdata->len) - return 0; + if (count < n * sizeof(u64)) + return -EINVAL; + count = n * sizeof(u64); - count = min(count, (size_t)usrdata->len); - if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) + if (copy_to_user(buf, values, count)) return -EFAULT; - /* Adjust the counters */ - usrdata->len -= count; - if (!usrdata->len) - usrdata->rd_idx = 0; - else - usrdata->rd_idx += count; - return count; } -static ssize_t -perf_read_irq_data(struct perf_counter *counter, - char __user *buf, - size_t count, - int nonblocking) -{ - struct perf_data *irqdata, *usrdata; - DECLARE_WAITQUEUE(wait, current); - ssize_t res, res2; - - irqdata = counter->irqdata; - usrdata = counter->usrdata; - - if (usrdata->len + irqdata->len >= count) - goto read_pending; - - if (nonblocking) - return -EAGAIN; - - spin_lock_irq(&counter->waitq.lock); - __add_wait_queue(&counter->waitq, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (usrdata->len + irqdata->len >= count) - break; - - if (signal_pending(current)) - break; - - if (counter->state == PERF_COUNTER_STATE_ERROR) - break; - - spin_unlock_irq(&counter->waitq.lock); - schedule(); - spin_lock_irq(&counter->waitq.lock); - } - __remove_wait_queue(&counter->waitq, &wait); - __set_current_state(TASK_RUNNING); - spin_unlock_irq(&counter->waitq.lock); - - if (usrdata->len + irqdata->len < count && - counter->state != PERF_COUNTER_STATE_ERROR) - return -ERESTARTSYS; -read_pending: - mutex_lock(&counter->mutex); - - /* Drain pending data first: */ - res = perf_copy_usrdata(usrdata, buf, count); - if (res < 0 || res == count) - goto out; - - /* Switch irq buffer: */ - usrdata = perf_switch_irq_data(counter); - res2 = perf_copy_usrdata(usrdata, buf + res, count - res); - if (res2 < 0) { - if (!res) - res = -EFAULT; - } else { - res += res2; - } -out: - mutex_unlock(&counter->mutex); - - return res; -} - static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - return perf_read_hw(counter, buf, count); - - case PERF_RECORD_IRQ: - case PERF_RECORD_GROUP: - return 
perf_read_irq_data(counter, buf, count, - file->f_flags & O_NONBLOCK); - } - return -EINVAL; + return perf_read_hw(counter, buf, count); } static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; - unsigned int events = 0; - unsigned long flags; + struct perf_mmap_data *data; + unsigned int events = POLL_HUP; - poll_wait(file, &counter->waitq, wait); + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + events = atomic_xchg(&data->poll, 0); + rcu_read_unlock(); - spin_lock_irqsave(&counter->waitq.lock, flags); - if (counter->usrdata->len || counter->irqdata->len) - events |= POLLIN; - spin_unlock_irqrestore(&counter->waitq.lock, flags); + poll_wait(file, &counter->waitq, wait); return events; } @@ -1314,129 +1300,1084 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_DISABLE: perf_counter_disable_family(counter); break; + case PERF_COUNTER_IOC_REFRESH: + perf_counter_refresh(counter, arg); + break; default: err = -ENOTTY; } return err; } -static const struct file_operations perf_fops = { - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, -}; - /* - * Generic software counter infrastructure + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. */ - -static void perf_swcounter_update(struct perf_counter *counter) +void perf_counter_update_userpage(struct perf_counter *counter) { - struct hw_perf_counter *hwc = &counter->hw; - u64 prev, now; - s64 delta; + struct perf_mmap_data *data; + struct perf_counter_mmap_page *userpg; -again: - prev = atomic64_read(&hwc->prev_count); - now = atomic64_read(&hwc->count); - if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) - goto again; + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; - delta = now - prev; + userpg = data->user_page; - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. 
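+	 * The ->lock field is bumped once before and once after the update,
+	 * so a reader that sees the same even value on both sides of its
+	 * reads knows the snapshot is consistent.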
+ */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = counter->hw.idx; + userpg->offset = atomic64_read(&counter->count); + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + userpg->offset -= atomic64_read(&counter->hw.prev_count); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); } -static void perf_swcounter_set_period(struct perf_counter *counter) +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct hw_perf_counter *hwc = &counter->hw; - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + struct perf_counter *counter = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - } + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; - if (unlikely(left <= 0)) { - left += period; - atomic64_add(period, &hwc->period_left); + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; + + if ((unsigned)nr > data->nr_pages) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); } + get_page(vmf->page); + ret = 0; +unlock: + rcu_read_unlock(); - atomic64_set(&hwc->prev_count, -left); - atomic64_set(&hwc->count, -left); + return ret; } -static void perf_swcounter_save_and_restart(struct perf_counter *counter) +static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) { - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); -} + struct perf_mmap_data *data; + unsigned long size; + int i; -static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; + WARN_ON(atomic_read(&counter->mmap_count)); - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; - *p = data; - irqdata->len += sizeof(u64); + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; } + + data->nr_pages = nr_pages; + + rcu_assign_pointer(counter->data, data); + + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; } -static void perf_swcounter_handle_group(struct perf_counter *sibling) +static void __perf_mmap_data_free(struct rcu_head *rcu_head) { - struct perf_counter *counter, *group_leader = sibling->group_leader; + struct perf_mmap_data *data = container_of(rcu_head, + struct perf_mmap_data, rcu_head); + int i; - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - perf_swcounter_update(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.type); - perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); - } + free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + free_page((unsigned long)data->data_pages[i]); + kfree(data); } -static void perf_swcounter_interrupt(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +static void 
perf_mmap_data_free(struct perf_counter *counter) { - perf_swcounter_save_and_restart(counter); + struct perf_mmap_data *data = counter->data; - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; + WARN_ON(atomic_read(&counter->mmap_count)); - case PERF_RECORD_IRQ: - perf_swcounter_store_irq(counter, instruction_pointer(regs)); - break; + rcu_assign_pointer(counter->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} - case PERF_RECORD_GROUP: - perf_swcounter_handle_group(counter); - break; - } +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; - if (nmi) { - counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); - } else - wake_up(&counter->waitq); + atomic_inc(&counter->mmap_count); } -static int perf_swcounter_match(struct perf_counter *counter, - enum hw_event_types event, - struct pt_regs *regs) +static void perf_mmap_close(struct vm_area_struct *vma) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return 0; + struct perf_counter *counter = vma->vm_file->private_data; - if (counter->hw_event.raw) + if (atomic_dec_and_mutex_lock(&counter->mmap_count, + &counter->mmap_mutex)) { + vma->vm_mm->locked_vm -= counter->data->nr_pages + 1; + perf_mmap_data_free(counter); + mutex_unlock(&counter->mmap_mutex); + } +} + +static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_counter *counter = file->private_data; + unsigned long vma_size; + unsigned long nr_pages; + unsigned long locked, lock_limit; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. 
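+	 * perf_output_copy() depends on this: it selects the destination
+	 * page with "(offset >> PAGE_SHIFT) & (nr_pages - 1)".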
+ */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + mutex_lock(&counter->mmap_mutex); + if (atomic_inc_not_zero(&counter->mmap_count)) { + if (nr_pages != counter->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + locked = vma->vm_mm->locked_vm; + locked += nr_pages + 1; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(counter->data); + ret = perf_mmap_data_alloc(counter, nr_pages); + if (ret) + goto unlock; + + atomic_set(&counter->mmap_count, 1); + vma->vm_mm->locked_vm += nr_pages + 1; +unlock: + mutex_unlock(&counter->mmap_mutex); + + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct perf_counter *counter = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &counter->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ + wake_up_all(&counter->waitq); + + if (counter->pending_kill) { + kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); + counter->pending_kill = 0; + } +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. 
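+ * An empty list is marked with the PENDING_TAIL sentinel instead of NULL,
+ * so entry->next == NULL always means "not queued" and the cmpxchg() in
+ * perf_pending_queue() can reject double-queueing.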
+ */ + +static void perf_pending_counter(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + if (counter->pending_disable) { + counter->pending_disable = 0; + perf_counter_disable(counter); + } + + if (counter->pending_wakeup) { + counter->pending_wakeup = 0; + perf_counter_wakeup(counter); + } +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) + return; + + entry->func = func; + + head = &get_cpu_var(perf_pending_head); + + do { + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); + + set_perf_counter_pending(); + + put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ + struct perf_pending_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; + + list = list->next; + + func = entry->func; + entry->next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + func(entry); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return counter->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ + wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ + __perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + +/* + * Output + */ + +struct perf_output_handle { + struct perf_counter *counter; + struct perf_mmap_data *data; + unsigned int offset; + unsigned int head; + int wakeup; + int nmi; + int overflow; + int locked; + unsigned long flags; +}; + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->data->poll, POLL_IN); + + if (handle->nmi) { + handle->counter->pending_wakeup = 1; + perf_pending_queue(&handle->counter->pending, + perf_pending_counter); + } else + perf_counter_wakeup(handle->counter); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. 
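+ * data->lock holds the id of the CPU that currently owns the buffer; an
+ * NMI hitting on that same CPU skips the spin and leaves publishing the
+ * new head to the interrupted, outer writer.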
+ */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int head, cpu; + + if (handle->wakeup) + data->wakeup_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_xchg(&data->wakeup_head, 0))) { + data->user_page->data_head = head; + handle->wakeup = 1; + } + + /* + * NMI can happen here, which means we can miss a wakeup_head update. + */ + + cpu = atomic_xchg(&data->lock, 0); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_read(&data->wakeup_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + goto again; + } + + if (handle->wakeup) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + +static int perf_output_begin(struct perf_output_handle *handle, + struct perf_counter *counter, unsigned int size, + int nmi, int overflow) +{ + struct perf_mmap_data *data; + unsigned int offset, head; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto out; + + handle->data = data; + handle->counter = counter; + handle->nmi = nmi; + handle->overflow = overflow; + + if (!data->nr_pages) + goto fail; + + perf_output_lock(handle); + + do { + offset = head = atomic_read(&data->head); + head += size; + } while (atomic_cmpxchg(&data->head, offset, head) != offset); + + handle->offset = offset; + handle->head = head; + handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); + + return 0; + +fail: + perf_output_wakeup(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +static void perf_output_copy(struct perf_output_handle *handle, + void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + WARN_ON_ONCE(handle->offset > handle->head); +} + +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + +static void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_counter *counter = handle->counter; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = counter->hw_event.wakeup_events; + + if (handle->overflow && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + handle->wakeup = 1; + } + } + + perf_output_unlock(handle); + rcu_read_unlock(); +} + +static void 
perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs, u64 addr) +{ + int ret; + u64 record_type = counter->hw_event.record_type; + struct perf_output_handle handle; + struct perf_event_header header; + u64 ip; + struct { + u32 pid, tid; + } tid_entry; + struct { + u64 event; + u64 counter; + } group_entry; + struct perf_callchain_entry *callchain = NULL; + int callchain_size = 0; + u64 time; + + header.type = 0; + header.size = sizeof(header); + + header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc |= user_mode(regs) ? + PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; + + if (record_type & PERF_RECORD_IP) { + ip = instruction_pointer(regs); + header.type |= PERF_RECORD_IP; + header.size += sizeof(ip); + } + + if (record_type & PERF_RECORD_TID) { + /* namespace issues */ + tid_entry.pid = current->group_leader->pid; + tid_entry.tid = current->pid; + + header.type |= PERF_RECORD_TID; + header.size += sizeof(tid_entry); + } + + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= PERF_RECORD_TIME; + header.size += sizeof(u64); + } + + if (record_type & PERF_RECORD_ADDR) { + header.type |= PERF_RECORD_ADDR; + header.size += sizeof(u64); + } + + if (record_type & PERF_RECORD_GROUP) { + header.type |= PERF_RECORD_GROUP; + header.size += sizeof(u64) + + counter->nr_siblings * sizeof(group_entry); + } + + if (record_type & PERF_RECORD_CALLCHAIN) { + callchain = perf_callchain(regs); + + if (callchain) { + callchain_size = (1 + callchain->nr) * sizeof(u64); + + header.type |= PERF_RECORD_CALLCHAIN; + header.size += callchain_size; + } + } + + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); + if (ret) + return; + + perf_output_put(&handle, header); + + if (record_type & PERF_RECORD_IP) + perf_output_put(&handle, ip); + + if (record_type & PERF_RECORD_TID) + perf_output_put(&handle, tid_entry); + + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + + if (record_type & PERF_RECORD_ADDR) + perf_output_put(&handle, addr); + + if (record_type & PERF_RECORD_GROUP) { + struct perf_counter *leader, *sub; + u64 nr = counter->nr_siblings; + + perf_output_put(&handle, nr); + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->pmu->read(sub); + + group_entry.event = sub->hw_event.config; + group_entry.counter = atomic64_read(&sub->count); + + perf_output_put(&handle, group_entry); + } + } + + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + + perf_output_end(&handle); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event; +}; + +static void perf_counter_comm_output(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + perf_output_put(&handle, comm_event->event); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_counter_comm_match(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + if (counter->hw_event.comm && + comm_event->event.header.type == PERF_EVENT_COMM) + return 1; + + return 0; +} + +static void perf_counter_comm_ctx(struct 
perf_counter_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_comm_match(counter, comm_event)) + perf_counter_comm_output(counter, comm_event); + } + rcu_read_unlock(); +} + +static void perf_counter_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + unsigned int size; + char *comm = comm_event->task->comm; + + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event.header.size = sizeof(comm_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); +} + +void perf_counter_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + + if (!atomic_read(&nr_comm_tracking)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + .event = { + .header = { .type = PERF_EVENT_COMM, }, + .pid = task->group_leader->pid, + .tid = task->pid, + }, + }; + + perf_counter_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct file *file; + char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event; +}; + +static void perf_counter_mmap_output(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + perf_output_put(&handle, mmap_event->event); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle); +} + +static int perf_counter_mmap_match(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + if (counter->hw_event.mmap && + mmap_event->event.header.type == PERF_EVENT_MMAP) + return 1; + + if (counter->hw_event.munmap && + mmap_event->event.header.type == PERF_EVENT_MUNMAP) + return 1; + + return 0; +} + +static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_mmap_match(counter, mmap_event)) + perf_counter_mmap_output(counter, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct file *file = mmap_event->file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + char *name; + + if (file) { + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event.header.size = sizeof(mmap_event->event) + size; + + cpuctx = 
&get_cpu_var(perf_cpu_context); + perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + + kfree(buf); +} + +void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_tracking)) + return; + + mmap_event = (struct perf_mmap_event){ + .file = file, + .event = { + .header = { .type = PERF_EVENT_MMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + +void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_munmap_tracking)) + return; + + mmap_event = (struct perf_mmap_event){ + .file = file, + .event = { + .header = { .type = PERF_EVENT_MUNMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + +/* + * Generic counter overflow handling. + */ + +int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs, u64 addr) +{ + int events = atomic_read(&counter->event_limit); + int ret = 0; + + counter->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&counter->event_limit)) { + ret = 1; + counter->pending_kill = POLL_HUP; + if (nmi) { + counter->pending_disable = 1; + perf_pending_queue(&counter->pending, + perf_pending_counter); + } else + perf_counter_disable(counter); + } + + perf_counter_output(counter, nmi, regs, addr); + return ret; +} + +/* + * Generic software counter infrastructure + */ + +static void perf_swcounter_update(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 prev, now; + s64 delta; + +again: + prev = atomic64_read(&hwc->prev_count); + now = atomic64_read(&hwc->count); + if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + +static void perf_swcounter_set_period(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->irq_period; + + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_add(period, &hwc->period_left); + } + + atomic64_set(&hwc->prev_count, -left); + atomic64_set(&hwc->count, -left); +} + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_counter *counter; + struct pt_regs *regs; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->pmu->read(counter); + + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. 
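+	 * task_pt_regs(current) yields the register frame saved on kernel
+	 * entry, i.e. the task's user-space state.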
+ */ + if ((counter->hw_event.exclude_kernel || !regs) && + !counter->hw_event.exclude_user) + regs = task_pt_regs(current); + + if (regs) { + if (perf_counter_overflow(counter, 0, regs, 0)) + ret = HRTIMER_NORESTART; + } + + hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + + return ret; +} + +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs, u64 addr) +{ + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); + if (perf_counter_overflow(counter, nmi, regs, addr)) + /* soft-disable the counter */ + ; + +} + +static int perf_swcounter_match(struct perf_counter *counter, + enum perf_event_types type, + u32 event, struct pt_regs *regs) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return 0; + + if (perf_event_raw(&counter->hw_event)) + return 0; + + if (perf_event_type(&counter->hw_event) != type) return 0; - if (counter->hw_event.type != event) + if (perf_event_id(&counter->hw_event) != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1448,45 +2389,79 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct pt_regs *regs, u64 addr) +{ + int neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) + perf_swcounter_overflow(counter, nmi, regs, addr); +} + static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) + enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_counter *counter; - unsigned long flags; - int neg; - if (list_empty(&ctx->counter_list)) + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; - spin_lock_irqsave(&ctx->lock, flags); - - /* - * XXX: make counter_list RCU safe - */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (perf_swcounter_match(counter, event, regs)) { - neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) - perf_swcounter_interrupt(counter, nmi, regs); - } + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_swcounter_match(counter, type, event, regs)) + perf_swcounter_add(counter, nr, nmi, regs, addr); } + rcu_read_unlock(); +} - spin_unlock_irqrestore(&ctx->lock, flags); +static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; } -void perf_swcounter_event(enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) +static void __perf_swcounter_event(enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swcounter_recursion_context(cpuctx); + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); - perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); - if (cpuctx->task_ctx) - perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + nr, nmi, regs, addr); + if (cpuctx->task_ctx) { + perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, + nr, nmi, regs, addr); + } + + barrier(); + (*recursion)--; +out: 
put_cpu_var(perf_cpu_context); } +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); +} + static void perf_swcounter_read(struct perf_counter *counter) { perf_swcounter_update(counter); @@ -1503,18 +2478,16 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } +static const struct pmu perf_ops_generic = { + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, +}; + /* * Software counter: cpu wall time clock */ -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - - atomic64_set(&counter->hw.prev_count, cpu_clock(cpu)); - return 0; -} - static void cpu_clock_perf_counter_update(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); @@ -1527,8 +2500,26 @@ static void cpu_clock_perf_counter_update(struct perf_counter *counter) atomic64_add(now - prev, &counter->count); } +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + if (hwc->irq_period) { + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { + hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -1537,7 +2528,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) cpu_clock_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_clock = { +static const struct pmu perf_ops_cpu_clock = { .enable = cpu_clock_perf_counter_enable, .disable = cpu_clock_perf_counter_disable, .read = cpu_clock_perf_counter_read, @@ -1547,122 +2538,62 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { * Software counter: task time clock */ -/* - * Called from within the scheduler: - */ -static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) -{ - struct task_struct *curr = counter->task; - u64 delta; - - delta = __task_delta_exec(curr, update); - - return curr->se.sum_exec_runtime + delta; -} - static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) { u64 prev; s64 delta; - prev = atomic64_read(&counter->hw.prev_count); - - atomic64_set(&counter->hw.prev_count, now); - + prev = atomic64_xchg(&counter->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &counter->count); } -static void task_clock_perf_counter_read(struct perf_counter *counter) +static int task_clock_perf_counter_enable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 1); + struct hw_perf_counter *hwc = &counter->hw; + u64 now; - task_clock_perf_counter_update(counter, now); -} + now = counter->ctx->time; -static int task_clock_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - task_clock_perf_counter_val(counter, 0)); + atomic64_set(&hwc->prev_count, now); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + if (hwc->irq_period) { + 
__hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 0); - - task_clock_perf_counter_update(counter, now); -} - -static const struct hw_perf_counter_ops perf_ops_task_clock = { - .enable = task_clock_perf_counter_enable, - .disable = task_clock_perf_counter_disable, - .read = task_clock_perf_counter_read, -}; - -/* - * Software counter: page faults - */ - -static const struct hw_perf_counter_ops perf_ops_page_faults = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, -}; - -/* - * Software counter: context switches - */ - -static u64 get_context_switches(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->nvcsw + curr->nivcsw; - return cpu_nr_switches(smp_processor_id()); -} - -static void context_switches_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_context_switches(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; + hrtimer_cancel(&counter->hw.hrtimer); + task_clock_perf_counter_update(counter, counter->ctx->time); - atomic64_add(delta, &counter->count); } -static void context_switches_perf_counter_read(struct perf_counter *counter) +static void task_clock_perf_counter_read(struct perf_counter *counter) { - context_switches_perf_counter_update(counter); -} + u64 time; -static int context_switches_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_context_switches(counter)); - return 0; -} + if (!in_nmi()) { + update_context_time(counter->ctx); + time = counter->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - counter->ctx->timestamp; + time = counter->ctx->time + delta; + } -static void context_switches_perf_counter_disable(struct perf_counter *counter) -{ - context_switches_perf_counter_update(counter); + task_clock_perf_counter_update(counter, time); } -static const struct hw_perf_counter_ops perf_ops_context_switches = { - .enable = context_switches_perf_counter_enable, - .disable = context_switches_perf_counter_disable, - .read = context_switches_perf_counter_read, +static const struct pmu perf_ops_task_clock = { + .enable = task_clock_perf_counter_enable, + .disable = task_clock_perf_counter_disable, + .read = task_clock_perf_counter_read, }; /* @@ -1711,17 +2642,57 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { +static const struct pmu perf_ops_cpu_migrations = { .enable = cpu_migrations_perf_counter_enable, .disable = cpu_migrations_perf_counter_disable, .read = cpu_migrations_perf_counter_read, }; -static const struct hw_perf_counter_ops * -sw_perf_counter_init(struct perf_counter *counter) +#ifdef CONFIG_EVENT_PROFILE +void perf_tpcounter_event(int event_id) +{ + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); +} +EXPORT_SYMBOL_GPL(perf_tpcounter_event); + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void 
tp_perf_counter_destroy(struct perf_counter *counter) +{ + ftrace_profile_disable(perf_event_id(&counter->hw_event)); +} + +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +{ + int event_id = perf_event_id(&counter->hw_event); + int ret; + + ret = ftrace_profile_enable(event_id); + if (ret) + return NULL; + + counter->destroy = tp_perf_counter_destroy; + counter->hw.irq_period = counter->hw_event.irq_period; + + return &perf_ops_generic; +} +#else +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +{ + return NULL; +} +#endif + +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; - const struct hw_perf_counter_ops *hw_ops = NULL; + const struct pmu *pmu = NULL; struct hw_perf_counter *hwc = &counter->hw; /* @@ -1731,48 +2702,42 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->hw_event.type) { + switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv)) - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: - if (counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv) - break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. */ if (counter->ctx->task) - hw_ops = &perf_ops_task_clock; + pmu = &perf_ops_task_clock; else - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel)) - hw_ops = &perf_ops_page_faults; - break; + case PERF_COUNT_PAGE_FAULTS_MIN: + case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_context_switches; + pmu = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_cpu_migrations; - break; - default: + pmu = &perf_ops_cpu_migrations; break; } - if (hw_ops) + if (pmu) hwc->irq_period = hw_event->irq_period; - return hw_ops; + return pmu; } /* @@ -1785,12 +2750,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, struct perf_counter *group_leader, gfp_t gfpflags) { - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; struct perf_counter *counter; + long err; counter = kzalloc(sizeof(*counter), gfpflags); if (!counter) - return NULL; + return ERR_PTR(-ENOMEM); /* * Single counters are their own group leaders, with an @@ -1801,35 +2767,64 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mutex); INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->event_entry); INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); + mutex_init(&counter->mmap_mutex); + INIT_LIST_HEAD(&counter->child_list); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->hw_event = *hw_event; - counter->wakeup_pending = 0; counter->group_leader = group_leader; - counter->hw_ops = NULL; + counter->pmu = NULL; counter->ctx = ctx; 
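/*
 * Illustration, not part of the patch: the surrounding hunks switch
 * perf_counter_alloc() from returning a bare NULL on failure to the
 * kernel's ERR_PTR convention (ERR_PTR(-ENOMEM), ERR_PTR(err)), and
 * callers such as perf_counter_open() and inherit_counter() now test
 * IS_ERR()/PTR_ERR() so a precise errno can be propagated.  A minimal,
 * self-contained sketch of that convention follows; "struct widget",
 * widget_alloc() and widget_create() are hypothetical names used only
 * for this example.
 */

#include <linux/err.h>
#include <linux/slab.h>

struct widget {
	int id;
};

static struct widget *widget_alloc(int id)
{
	struct widget *w;

	if (id < 0)
		return ERR_PTR(-EINVAL);	/* invalid argument */

	w = kzalloc(sizeof(*w), GFP_KERNEL);
	if (!w)
		return ERR_PTR(-ENOMEM);	/* allocation failed */

	w->id = id;
	return w;				/* plain pointer on success */
}

static int widget_create(int id)
{
	struct widget *w = widget_alloc(id);

	if (IS_ERR(w))
		return PTR_ERR(w);		/* hand the errno to the caller */

	kfree(w);
	return 0;
}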
counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; - hw_ops = NULL; - if (!hw_event->raw && hw_event->type < 0) - hw_ops = sw_perf_counter_init(counter); - else - hw_ops = hw_perf_counter_init(counter); + pmu = NULL; + + if (perf_event_raw(hw_event)) { + pmu = hw_perf_counter_init(counter); + goto done; + } + + switch (perf_event_type(hw_event)) { + case PERF_TYPE_HARDWARE: + pmu = hw_perf_counter_init(counter); + break; + + case PERF_TYPE_SOFTWARE: + pmu = sw_perf_counter_init(counter); + break; + + case PERF_TYPE_TRACEPOINT: + pmu = tp_perf_counter_init(counter); + break; + } +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); - if (!hw_ops) { + if (err) { kfree(counter); - return NULL; + return ERR_PTR(err); } - counter->hw_ops = hw_ops; + + counter->pmu = pmu; + + if (counter->hw_event.mmap) + atomic_inc(&nr_mmap_tracking); + if (counter->hw_event.munmap) + atomic_inc(&nr_munmap_tracking); + if (counter->hw_event.comm) + atomic_inc(&nr_comm_tracking); return counter; } @@ -1901,10 +2896,10 @@ SYSCALL_DEFINE5(perf_counter_open, goto err_put_context; } - ret = -EINVAL; counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, GFP_KERNEL); - if (!counter) + ret = PTR_ERR(counter); + if (IS_ERR(counter)) goto err_put_context; ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); @@ -1947,6 +2942,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); ctx->task = task; } @@ -1975,15 +2971,14 @@ inherit_counter(struct perf_counter *parent_counter, child_counter = perf_counter_alloc(&parent_counter->hw_event, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); - if (!child_counter) - return NULL; + if (IS_ERR(child_counter)) + return child_counter; /* * Link it up in the child's context: */ child_counter->task = child; - list_add_counter(child_counter, child_ctx); - child_ctx->nr_counters++; + add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; /* @@ -2028,15 +3023,17 @@ static int inherit_group(struct perf_counter *parent_counter, { struct perf_counter *leader; struct perf_counter *sub; + struct perf_counter *child_ctr; leader = inherit_counter(parent_counter, parent, parent_ctx, child, NULL, child_ctx); - if (!leader) - return -ENOMEM; + if (IS_ERR(leader)) + return PTR_ERR(leader); list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { - if (!inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx)) - return -ENOMEM; + child_ctr = inherit_counter(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); } return 0; } @@ -2053,6 +3050,10 @@ static void sync_child_counter(struct perf_counter *child_counter, * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_counter->count); + atomic64_add(child_counter->total_time_enabled, + &parent_counter->child_total_time_enabled); + atomic64_add(child_counter->total_time_running, + &parent_counter->child_total_time_running); /* * Remove this counter from the parent's list @@ -2087,6 +3088,7 @@ __perf_counter_exit_task(struct task_struct *child, if (child != current) { wait_task_inactive(child, 0); list_del_init(&child_counter->list_entry); + update_counter_times(child_counter); } else { struct perf_cpu_context *cpuctx; unsigned long flags; @@ -2098,19 
+3100,20 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); perf_flags = hw_perf_save_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); group_sched_out(child_counter, cpuctx, child_ctx); + update_counter_times(child_counter); list_del_init(&child_counter->list_entry); child_ctx->nr_counters--; hw_perf_restore(perf_flags); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); } parent_counter = child_counter->parent; @@ -2125,10 +3128,10 @@ __perf_counter_exit_task(struct task_struct *child, list_entry) { if (sub->parent) { sync_child_counter(sub, sub->parent); - kfree(sub); + free_counter(sub); } } - kfree(child_counter); + free_counter(child_counter); } } @@ -2262,15 +3265,12 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { .notifier_call = perf_cpu_notify, }; -static int __init perf_counter_init(void) +void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); - - return 0; } -early_initcall(perf_counter_init); static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) {
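/*
 * Illustration, not part of the patch: earlier in this diff,
 * cpu_clock_perf_counter_enable() and task_clock_perf_counter_enable()
 * arm a per-counter hrtimer (hwc->hrtimer, callback
 * perf_swcounter_hrtimer) when the counter is enabled and cancel it on
 * disable, which is what turns these pure software clocks into sampling
 * counters.  A minimal sketch of that periodic-hrtimer pattern follows;
 * it uses the public hrtimer_start()/hrtimer_forward_now() helpers
 * rather than the __hrtimer_start_range_ns() call the patch itself
 * makes, and sample_timer, sample_period_ns and take_sample() are
 * hypothetical names for the example only.
 */

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/types.h>

static struct hrtimer sample_timer;
static u64 sample_period_ns = 10000;	/* roughly mirrors the 10000 ns floor above */

static void take_sample(void)
{
	/* ... read the counter, record an event, etc. ... */
}

static enum hrtimer_restart sample_fn(struct hrtimer *timer)
{
	take_sample();
	/* re-arm relative to now so the timer keeps firing periodically */
	hrtimer_forward_now(timer, ns_to_ktime(sample_period_ns));
	return HRTIMER_RESTART;
}

static void sample_start(void)
{
	hrtimer_init(&sample_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	sample_timer.function = sample_fn;
	hrtimer_start(&sample_timer, ns_to_ktime(sample_period_ns),
		      HRTIMER_MODE_REL);
}

static void sample_stop(void)
{
	hrtimer_cancel(&sample_timer);	/* waits for a running callback to finish */
}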