Linux Perf Event Open硬件事件采样与Ring Buffer
2026/6/14 12:25:05 网站建设 项目流程

Linux perf_event_open 硬件事件采样与 ring_buffer

一、系统调用入口与 perf_event 分配

perf_event_open 是 Linux 性能监控的核心系统调用,定义在 kernel/events/core.c:

/*
 * perf_event_open() system call entry (simplified walkthrough).
 *
 * Copies the attribute block from user space, runs the permission check,
 * allocates the perf_event and binds it to a new file descriptor.
 *
 * Returns the new file descriptor on success or a negative errno.
 */
SYSCALL_DEFINE5(perf_event_open,
		struct perf_event_attr __user *, attr_uptr,
		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
	struct perf_event *event;
	struct perf_event_attr attr;
	/*
	 * NOTE(review): the pid -> task lookup is not shown in this
	 * walkthrough; NULL stands in for "no target task" - confirm
	 * against the full implementation.
	 */
	struct task_struct *task = NULL;
	int event_fd;
	int err;

	/* Copy the attributes from user space. */
	if (copy_from_user(&attr, attr_uptr, sizeof(attr)))
		return -EFAULT;

	/*
	 * Permission check (CAP_SYS_ADMIN or perf_event_paranoid policy).
	 * A failure must abort the call instead of being ignored.
	 */
	err = perf_event_paranoid_check(&attr);
	if (err)
		return err;

	/*
	 * Core allocation. perf_event_alloc() takes seven arguments; the
	 * trailing NULLs are group_leader, parent_event, overflow_handler
	 * and context.
	 */
	event = perf_event_alloc(&attr, cpu, task, NULL, NULL, NULL, NULL);
	if (IS_ERR(event))
		return PTR_ERR(event);

	/*
	 * anon_inode_getfd() creates the file AND installs the descriptor
	 * in the current fdtable, so no separate fd_install() is needed.
	 * O_RDWR lets user space create the shared, writable mapping it
	 * needs in order to advance data_tail.
	 */
	event_fd = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
	if (event_fd < 0) {
		perf_event_release_kernel(event);
		return event_fd;
	}

	return event_fd;
}

二、perf_event_alloc 与 PMU 初始化

perf_event_alloc 分配 struct perf_event 结构体,初始化硬件事件上下文:

/*
 * Allocate and initialise a struct perf_event.
 *
 * @attr:             event attributes, copied into the new event
 * @cpu:              target CPU, or a negative value for "any CPU"
 * @task:             target task (unused in this simplified version)
 * @group_leader:     group leader (unused in this simplified version)
 * @parent_event:     parent event (unused in this simplified version)
 * @overflow_handler: overflow callback (unused in this simplified version)
 * @context:          callback context (unused in this simplified version)
 *
 * Returns the new event, or an ERR_PTR() on failure. On failure nothing
 * is leaked: the partially initialised event is returned to the cache.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
		 struct task_struct *task,
		 struct perf_event *group_leader,
		 struct perf_event *parent_event,
		 perf_overflow_handler_t overflow_handler,
		 void *context)
{
	struct perf_event *event;
	int node;
	int err;

	/* Allocate on the NUMA node of the target CPU when one is given. */
	node = (cpu >= 0) ? cpu_to_node(cpu) : NUMA_NO_NODE;

	/* Zeroed allocation so that every field not set below starts at 0. */
	event = kmem_cache_alloc_node(perf_event_cache,
				      GFP_KERNEL | __GFP_ZERO, node);
	if (!event)
		return ERR_PTR(-ENOMEM);

	/* Core fields. */
	event->attr = *attr;
	event->cpu = cpu;
	event->state = PERF_EVENT_STATE_INACTIVE;

	/* Counter-overflow sampling: seed the period bookkeeping. */
	if (attr->sample_period) {
		event->hw.sample_period = attr->sample_period;
		event->hw.last_period = attr->sample_period;
		local64_set(&event->hw.period_left, attr->sample_period);
	}

	/*
	 * The ring buffer itself is only allocated at mmap() time; here we
	 * just make sure the event starts without one.
	 */
	if ((attr->sample_type & PERF_SAMPLE_RAW) ||
	    attr->watermark || attr->wakeup_events) {
		event->rb = NULL;
		atomic_set(&event->rb_refcount, 0);
	}

	/* Find the PMU that handles this event and let it set up event->hw. */
	event->pmu = perf_init_event(event);
	if (IS_ERR(event->pmu)) {
		err = PTR_ERR(event->pmu);
		goto err_free;
	}

	return event;

err_free:
	kmem_cache_free(perf_event_cache, event);
	return ERR_PTR(err);
}

三、Ring Buffer 的数据结构

perf ring buffer 是通过 mmap 映射到用户态的数据区域,结构体为 struct perf_buffer:

/*
 * Ring buffer backing a perf event's mmap() area: one header page shared
 * with user space followed by nr_pages data pages.
 */
struct perf_buffer {
	/* Index of the data page currently being written. */
	int page;
	/* Total number of data pages. */
	int nr_pages;
	/* Non-zero when old records may be overwritten. */
	int overwritable;
	/* Header page shared with user space. */
	struct perf_event_mmap_page *user_page;
	/* Pointers to the data pages (flexible array member). */
	unsigned long *data_pages[];
};

mmap 时分配 ring buffer 的路径:

static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_event *event = file->private_data;
unsigned long nr_pages;
struct perf_buffer *rb;

/* 检查权限和模式 */
if (vma->vm_flags & VM_WRITE)
return -EINVAL; /* 用户态只读 */

/* 计算页数 */
nr_pages = (vma->vm_end - vma->vm_start - PAGE_SIZE) >> PAGE_SHIFT;

/* 分配 ring buffer */
rb = rb_alloc(nr_pages,
event->attr.watermark ? PERF_RB_WATERMARK : 0,
event->attr.write_backward ? PERF_RB_WRITE_BACKWARD : 0);

/* 初始化用户态头部区域 */
rb->user_page->data_head = 0;
rb->user_page->data_tail = 0;
rb->user_page->data_offset = PAGE_SIZE; /* 数据区在头部页之后 */
rb->user_page->data_size = nr_pages * PAGE_SIZE;

event->rb = rb;

/* 将 ring buffer 映射到用户空间 */
vm_insert_page(vma, vma->vm_start, virt_to_page(rb->user_page));
for (i = 0; i < nr_pages; i++)
vm_insert_page(vma, vma->vm_start + (i + 1) * PAGE_SIZE,
virt_to_page(rb->data_pages[i]));

return 0;
}

四、硬件采样路径:PMU 中断到 ring buffer 写入

硬件性能计数器溢出时触发 NMI 或中断,调用 perf_event_overflow:

void perf_event_overflow(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
/* 1. 取当前 ring buffer */
struct perf_buffer *rb = rcu_dereference(event->rb);
if (!rb)
return;

/* 2. 计算下一次采样周期 */
perf_event_update_userpage(event);

/* 3. 将采样数据写入 ring buffer */
int ret = perf_output_begin(&handle, event,
perf_sample_size(data));
if (ret)
return; /* buffer 满且不可覆盖 */

/* 4. 写入 event type */
perf_output_put(&handle, data->type);

/* 5. 根据 sample_type 写入各字段 */
if (event->attr.sample_type & PERF_SAMPLE_IP)
perf_output_put(&handle, data->ip);

if (event->attr.sample_type & PERF_SAMPLE_TID)
perf_output_put(&handle, data->tid_entry);

if (event->attr.sample_type & PERF_SAMPLE_TIME)
perf_output_put(&handle, data->time);

if (event->attr.sample_type & PERF_SAMPLE_CPU)
perf_output_put(&handle, data->cpu_entry);

if (event->attr.sample_type & PERF_SAMPLE_RAW)
perf_output_put(&handle, data->raw);

/* 6. 提交,刷新 data_head */
perf_output_end(&handle);
}

perf_output_begin 的核心逻辑:

/*
 * Reserve @size bytes in @event's ring buffer and initialise @handle.
 *
 * Returns 0 on success. Returns -ENOSPC when the event has no ring buffer,
 * when @size exceeds the whole buffer, or when the buffer is not
 * overwritable and the reader has not freed enough space.
 */
int perf_output_begin(struct perf_output_handle *handle,
		      struct perf_event *event, unsigned int size)
{
	struct perf_buffer *rb = rcu_dereference(event->rb);
	int wakeup_events = event->attr.wakeup_events;
	unsigned long capacity;
	unsigned long head;

	/* The buffer may not exist (never mapped, or already gone). */
	if (!rb)
		return -ENOSPC;

	/*
	 * A record larger than the buffer can never fit. Rejecting it here
	 * also keeps "capacity - size" below from wrapping around.
	 */
	capacity = (unsigned long)rb->nr_pages * PAGE_SIZE;
	if (size > capacity)
		return -ENOSPC;

	/* data_head is a plain 64-bit field in the shared page, not a local_t. */
	head = READ_ONCE(rb->user_page->data_head);

	/*
	 * Overwritable buffers write unconditionally and clobber old data.
	 * Otherwise the bytes in flight (head - tail) plus the new record
	 * must fit in the buffer.
	 */
	if (!rb->overwritable) {
		unsigned long tail = READ_ONCE(rb->user_page->data_tail);

		if (head - tail > capacity - size)
			return -ENOSPC;
	}

	/*
	 * Record the reservation. NOTE(review): offset starts at the current
	 * head; the write helpers are presumably what advance it before
	 * perf_output_end() publishes it - confirm.
	 */
	handle->rb = rb;
	handle->event = event;
	handle->size = size;
	handle->offset = head;
	handle->wakeup = wakeup_events;

	return 0;
}

五、用户态读取采样数据

用户态通过 mmap 的 ring buffer 读取采样数据:

struct perf_event_mmap_page *header = mmap(...);
u64 data_tail = header->data_tail;
u64 data_head = READ_ONCE(header->data_head);

/* 确保 data_tail 不跨 cache line */
smp_rmb(); /* 读屏障,保证 data_head 之前的数据可见 */

while (data_tail != data_head) {
struct perf_event_header *ehdr;

/* 取当前事件头部 */
ehdr = (struct perf_event_header *)(data + (data_tail & mask));

/* 处理事件 */
process_sample_event(ehdr);

/* 前进 data_tail */
data_tail += ehdr->size;

/* 绕过 buffer 尾部环回 */
if (data_tail >= rb->data_size)
data_tail -= rb->data_size;
}

/* 更新用户态的 data_tail */
header->data_tail = data_tail;

内核写入 data_head 时使用 smp_store_release 保证正确排序:

static void perf_output_end(struct perf_output_handle *handle)
{
struct perf_buffer *rb = handle->rb;

/* 写入最后一条记录后更新 data_head */
smp_store_release(&rb->user_page->data_head, handle->offset);

/* 根据 watermark 和 wakeup_events 触发信号 */
if (handle->wakeup) {
handle->wakeup--;
if (!handle->wakeup) {
/* 唤醒用户态等待进程 */
wake_up(&event->waitq);
if (event->pending_kill)
kill_fasync(&event->fasync, SIGIO, POLL_IN);
}
}
}

六、硬件 PMU 驱动的计数器配置

x86 架构上 Intel PMU 的驱动代码在较新的内核中位于 arch/x86/events/intel/core.c(旧版内核中为 arch/x86/kernel/cpu/perf_event_intel.c):

static int intel_pmu_hw_config(struct perf_event *event)
{
/* 根据 attr->config 选择 PMC 编号 */
if (event->attr.type == PERF_TYPE_HARDWARE) {
switch (event->attr.config) {
case PERF_COUNT_HW_CPU_CYCLES:
event->hw.config = ARCH_PERFMON_EVENTSEL_OS |
ARCH_PERFMON_EVENTSEL_INT |
x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES);
break;
case PERF_COUNT_HW_INSTRUCTIONS:
event->hw.config = ARCH_PERFMON_EVENTSEL_OS |
ARCH_PERFMON_EVENTSEL_INT |
x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS);
break;
}
}

/* 写入 MSR 寄存器 */
wrmsrl(event->hw.event_base, event->hw.config);
return 0;
}

PMU 溢出时,x86 的 NMI handler 查找对应的 perf_event:

static int intel_pmu_handle_irq(struct pt_regs *regs)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int handled = 0;

/* 读取 IA32_PERF_GLOBAL_STATUS 寄存器 */
u64 status = wrmsrl(MSR_CORE_PERF_GLOBAL_STATUS, 0);

/* 遍历所有 PMC,找到溢出的计数器 */
for_each_set_bit(bit, (unsigned long *)&status, x86_pmu.num_events) {
struct perf_event *event = cpuc->events[bit];

/* 读取新计数器值,计算溢出次数 */
u64 new_count = x86_pmu_event_read(event);

/* 调用 perf_event_overflow 写入 ring buffer */
perf_event_overflow(event, &data, regs);

/* 重新加载计数器 */
wrmsrl(event->hw.event_base + 1,
-event->hw.sample_period);
handled = 1;
}

/* 确认中断 */
wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, status);
return handled;
}

七、采样频率控制与节流

perf 的 interrupt throttle 机制防止采样中断过多:

/*
 * Throttle an event whose sample handling took too long: stop it, mark it
 * off, and arm a timer to re-enable it one tick later.
 *
 * @event:      the event that was just sampled
 * @sample_len: time spent handling the sample (assumed to be in
 *              nanoseconds, matching the 1 ms limit below - TODO confirm)
 */
static void perf_sample_event_took(struct perf_event *event, u64 sample_len)
{
	const u64 throttle_ns = 1000000;	/* hard limit: 1 ms */

	if (sample_len <= throttle_ns)
		return;

	/* Sampling itself is too expensive: pause the event. */
	event->pmu->stop(event, PERF_EF_UPDATE);
	event->state = PERF_EVENT_STATE_OFF;
	event->pending_disable = 1;

	/*
	 * Re-enable after one tick. The original armed the timer for 1 ns,
	 * which fires almost immediately and defeats the throttle.
	 */
	hrtimer_start(&event->hw.timer, ns_to_ktime(TICK_NSEC),
		      HRTIMER_MODE_REL);
}

八、mmap 页面布局总结

perf ring buffer mmap 的完整布局(偏移量以 4 KiB 页大小为例,共 1 + nr_pages 页):

Offset Content
------ -------
0x0000 struct perf_event_mmap_page (1 page)
0x1000 数据页 0
0x2000 数据页 1
...
0xN000 数据页 N-1 (nr_pages)

用户态通过 data_tail/data_head 协议实现生产者-消费者模型:内核写入数据后推进 data_head,用户态读取后推进 data_tail。两者都是单调递增的字节偏移量,本身不回绕,访问数据区时才按 data_size 取模(data_size 为 2 的幂时可用掩码)。

需要专业的网站建设服务?

联系我们获取免费的网站建设咨询和方案报价,让我们帮助您实现业务目标

立即咨询