Skip to content

Commit 6cd59aa

Browse files
jserv and hankluo6 committed
Use much faster getcpu() via vDSO
vDSO (virtual dynamic shared object) is exported by the Linux kernel into every userspace program and is designed to speed up certain system calls. On Linux/x86_64, getcpu() can be called via the vDSO, which makes getcpu() much faster. The faster getcpu() invocation is beneficial when retrieving NUMA node information. Benchmarking[1] on an AMD Ryzen Threadripper 2990WX 32-Core Processor: getcpu: syscall: 103 nsec/call; getcpu: vdso: 18 nsec/call. We cannot use dlsym to resolve the vDSO symbol "__vdso_getcpu" directly because it would cause recursive malloc calls when MI_DEBUG_FULL is enabled. [1] https://github.com/nathanlynch/vdsotest Co-authored-by: Chin-Hao Lo <hankluo6@gmail.com> Signed-off-by: Jim Huang <jserv@biilabs.io>
1 parent 076f815 commit 6cd59aa

1 file changed

Lines changed: 87 additions & 3 deletions

File tree

src/os.c

Lines changed: 87 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,81 @@ void _mi_os_init() {
214214
os_alloc_granularity = 16;
215215
}
216216
#else
217+
#if defined(__linux__) && defined(__x86_64__)
218+
#include <elf.h>
219+
#include <sys/auxv.h>
220+
// Signature of the vDSO `getcpu` entry point; the caller in this file passes
// (&cpu, &node, NULL).
typedef int (*getcpu_vdso_t)(unsigned*, unsigned*, void*);

// Resolved `__vdso_getcpu` entry point; stays NULL when it cannot be resolved.
static getcpu_vdso_t mi_os_getcpu_vdso = NULL;

// Parsed view of the vDSO image. Being a static object it starts out zeroed;
// `nbucket`, `bucket` and `chain` are only filled in once `mi_os_vdso_init`
// has located every table it needs, so `nbucket == 0` means "not usable".
static struct vdso_info {
  uintptr_t load_addr;        // address of the vDSO ELF header (AT_SYSINFO_EHDR)
  uintptr_t load_offset;      // added to ELF virtual addresses to get runtime addresses
  Elf64_Sym* symtab;          // ELF symbol table (DT_SYMTAB)
  const char* symstrings;     // symbol string table (DT_STRTAB)
  const Elf64_Word* bucket;   // DT_HASH bucket array
  const Elf64_Word* chain;    // DT_HASH chain array
  Elf64_Word nbucket;         // number of entries in `bucket`
} vdso_info;

// Locate the vDSO image through the auxiliary vector and record its symbol
// table, string table and DT_HASH table in `vdso_info`.
// Returns early, leaving the hash fields zeroed, when there is no vDSO, when
// it is not a 64-bit ELF image, or when one of the required tables is missing.
static void mi_os_vdso_init(void) {
  unsigned long base = getauxval(AT_SYSINFO_EHDR);
  if (!base) return;

  vdso_info.load_addr = base;
  Elf64_Ehdr* hdr = (Elf64_Ehdr*)base;
  if (hdr->e_ident[EI_CLASS] != ELFCLASS64) return;

  // we need the load offset and the dynamic table from the segment table
  Elf64_Phdr* pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff);
  Elf64_Dyn* dyn = NULL;
  bool found_vaddr = false;
  for (size_t i = 0; i < hdr->e_phnum; i++) {
    if (pt[i].p_type == PT_LOAD && !found_vaddr) {
      // only the first PT_LOAD segment determines the load offset
      found_vaddr = true;
      vdso_info.load_offset = base + (uintptr_t)pt[i].p_offset - (uintptr_t)pt[i].p_vaddr;
    }
    else if (pt[i].p_type == PT_DYNAMIC) {
      dyn = (Elf64_Dyn*)(base + pt[i].p_offset);
    }
  }
  if (!found_vaddr || !dyn) return;

  // walk the dynamic table for the string, symbol and hash tables
  Elf64_Word* hash = NULL;
  vdso_info.symstrings = NULL;
  vdso_info.symtab = NULL;
  for (size_t i = 0; dyn[i].d_tag != DT_NULL; i++) {
    switch (dyn[i].d_tag) {
      case DT_STRTAB:
        vdso_info.symstrings = (const char*)((uintptr_t)dyn[i].d_un.d_ptr + vdso_info.load_offset);
        break;
      case DT_SYMTAB:
        vdso_info.symtab = (Elf64_Sym*)((uintptr_t)dyn[i].d_un.d_ptr + vdso_info.load_offset);
        break;
      case DT_HASH:
        hash = (Elf64_Word*)((uintptr_t)dyn[i].d_un.d_ptr + vdso_info.load_offset);
        break;
      default:
        break;
    }
  }
  if (!vdso_info.symstrings || !vdso_info.symtab || !hash) return;

  // DT_HASH layout: hash[0] is the bucket count, the buckets start at hash[2]
  // and the chain array follows them; hash[1] is not needed here.
  vdso_info.nbucket = hash[0];
  vdso_info.bucket = &hash[2];
  vdso_info.chain = &hash[vdso_info.nbucket + 2];
}

// Look up `__vdso_getcpu` in the tables recorded by `mi_os_vdso_init`.
// Returns the runtime address of the function, or NULL when the tables were
// not set up (initialization returned early) or the symbol is not present.
static void* mi_os_vdso_get_sym(void) {
  const char* name = "__vdso_getcpu";
  // Precomputed System V ELF hash of "__vdso_getcpu".
  const Elf64_Word name_hash = 11538501;

  // Without a parsed hash table there is nothing to search; this also avoids
  // a modulo by zero and NULL dereferences when `mi_os_vdso_init` bailed out.
  if (vdso_info.nbucket == 0 || vdso_info.bucket == NULL || vdso_info.chain == NULL ||
      vdso_info.symtab == NULL || vdso_info.symstrings == NULL) {
    return NULL;
  }

  for (Elf64_Word idx = vdso_info.bucket[name_hash % vdso_info.nbucket];
       idx != STN_UNDEF;
       idx = vdso_info.chain[idx]) {
    Elf64_Sym* sym = &vdso_info.symtab[idx];
    // Check for a defined global or weak function with the right name
    if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC) continue;
    if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && ELF64_ST_BIND(sym->st_info) != STB_WEAK) continue;
    if (sym->st_shndx == SHN_UNDEF) continue;
    if (strcmp(name, vdso_info.symstrings + sym->st_name) != 0) continue;
    return (void*)(vdso_info.load_offset + sym->st_value);
  }
  return NULL;
}
291+
#endif
217292
void _mi_os_init() {
218293
// get the page size
219294
long result = sysconf(_SC_PAGESIZE);
@@ -222,6 +297,11 @@ void _mi_os_init() {
222297
os_alloc_granularity = os_page_size;
223298
}
224299
large_os_page_size = 2*MiB; // TODO: can we query the OS for this?
300+
#if defined(__linux__) && defined(__x86_64__)
301+
// set up symbols exported by vDSO (virtual dynamic shared object)
302+
mi_os_vdso_init();
303+
mi_os_getcpu_vdso = (getcpu_vdso_t)mi_os_vdso_get_sym();
304+
#endif
225305
}
226306
#endif
227307

@@ -1173,9 +1253,13 @@ static size_t mi_os_numa_node_countx(void) {
11731253
#include <stdio.h> // access
11741254

11751255
static size_t mi_os_numa_nodex(void) {
1176-
#ifdef SYS_getcpu
1177-
unsigned long node = 0;
1178-
unsigned long ncpu = 0;
1256+
#if defined(SYS_getcpu)
1257+
unsigned int node = 0, ncpu = 0;
1258+
#if defined(__x86_64__)
1259+
if (mi_likely(mi_os_getcpu_vdso != NULL)) {
1260+
return (mi_os_getcpu_vdso(&ncpu, &node, NULL) != -1) ? node : 0;
1261+
}
1262+
#endif
11791263
long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
11801264
if (err != 0) return 0;
11811265
return node;

0 commit comments

Comments
 (0)