ENTRY(entry_SYSCALL_64) TRACE_IRQS_OFF subq $FRAME_SIZE, %rsp /* Reserve space for pt_regs */ MOV_LDX(regs, %rsp) /* Save user stack pointer */ cmpl $(nr_syscalls),%eax /* syscall number valid? */ jae badsys
/*
 * Load the syscall table pointer into r10 from a global variable.
 * We stash it in memory at boot time to work around boot loader
 * address randomization.
 *
 *	movq sys_call_table(,%rax,8),%r10
 *
 * can be replaced with this:
 *
 *	leaq sys_call_table(%rip),%r10
 *	movq (%r10,%rax,8),%r10
 */
/* arch/x86/include/asm/ptrace.h */
/*
 * Register frame saved on the kernel stack on entry to the kernel.
 *
 * All legacy registers use the short, prefix-less names (bp, bx, ax, ...,
 * orig_ax) so that every field follows one naming scheme.
 */
struct pt_regs {
	/*
	 * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
	 * unless syscall needs a complete, fully filled "struct pt_regs".
	 */
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
	unsigned long bp;
	unsigned long bx;

	/* These regs are callee-clobbered. Always saved on kernel entry. */
	unsigned long r11;
	unsigned long r10;	/* 4th argument passed from the program to the kernel. */
	unsigned long r9;	/* 6th argument passed from the program to the kernel. */
	unsigned long r8;	/* 5th argument passed from the program to the kernel. */
	unsigned long ax;	/* System call number passed from the program to the kernel. */
	unsigned long cx;	/* Address of the instruction following the program's syscall. */
	unsigned long dx;	/* 3rd argument passed from the program to the kernel. */
	unsigned long si;	/* 2nd argument passed from the program to the kernel. */
	unsigned long di;	/* 1st argument passed from the program to the kernel. */

	/*
	 * On syscall entry, this is syscall#. On CPU exception, this is error code.
	 * On hw interrupt, it's IRQ number:
	 */
	unsigned long orig_ax;	/* System call number. */

	/*
	 * Return frame for iretq.
	 * State that has to be restored when returning from kernel mode to user mode.
	 */
	unsigned long ip;	/* Address of the instruction following the program's syscall. */
	unsigned long cs;	/* User-mode code segment. */
	unsigned long flags;	/* User-mode CPU flags. */
	unsigned long sp;	/* User-mode top-of-stack address (the stack grows downward). */
	unsigned long ss;	/* User-mode stack segment. */
	/* top of stack page */
};
3.3 do_syscall_64
do_syscall_64 函数是 Linux 内核中的关键函数之一,它的主要功能是处理 64 位系统调用。当用户程序通过 syscall 指令(快速系统调用指令,而非软件中断)发起系统调用请求时,内核入口代码保存好现场后,会将控制转移到 do_syscall_64 函数来执行相应的操作。
具体而言,do_syscall_64 函数完成以下主要功能:
获取系统调用号:从当前进程的 CPU 寄存器或栈中获取系统调用号,以确定用户程序请求执行哪个特定的系统调用。
/* arch/x86/entry/common.c */
#ifdef CONFIG_X86_64
/*
 * Excerpt of the 64-bit system call dispatcher: nr selects the handler and
 * regs is the saved register frame handed to it. The "..." marks code that
 * the excerpt leaves out.
 */
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
	struct thread_info *ti;
	...
	/*
	 * NB: Native and x32 syscalls are dispatched from the same
	 * table. The only functional difference is the x32 bit in
	 * regs->orig_ax, which changes the behavior of some syscalls.
	 */
	nr &= __SYSCALL_MASK;
	if (likely(nr < NR_syscalls)) {
		nr = array_index_nospec(nr, NR_syscalls);
		/*
		 * Call the function that corresponds to the system call number
		 * through the system call jump table.
		 * The return value is stored in regs->ax; that value is finally
		 * placed in the rax register and handed back to user space.
		 */
		regs->ax = sys_call_table[nr](regs);
	}
# An entry is either a bare function name or "function/qualifier".
# real_entry is the part before the first '/'; qualifier is the part after
# it, or empty when the entry contains no '/'.
real_entry="${entry%%/*}"
case "$entry" in
*/*)
	qualifier=${entry#*/}
	;;
*)
	qualifier=
	;;
esac
/* Forwards all of its arguments unchanged to asmlinkage_protect(). */
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)

/*
 * Generates the three functions behind one system call definition.
 *
 *   sys##name   - prototype using the __SC_DECL form of the argument list;
 *                 made an alias of SyS##name through SYSCALL_ALIAS().
 *   SyS##name   - wrapper defined here; takes the __SC_LONG form of the
 *                 argument list, calls SYSC##name with the __SC_CAST form,
 *                 then applies __SC_TEST and __PROTECT before returning the
 *                 result.
 *   SYSC##name  - static inline function holding the real implementation.
 *
 * The expansion deliberately ends with the SYSC##name function header and no
 * body: the brace-enclosed block written after the macro invocation becomes
 * the body of SYSC##name.
 *
 * x is handed to __MAP and __PROTECT (presumably the argument count; the
 * __MAP and __SC_* helpers are defined elsewhere - confirm there).
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	SYSCALL_ALIAS(sys##name, SyS##name);				\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
/*
 * sys_read carries the properly typed prototype; the alias attribute makes
 * it another name for SyS_read.
 */
asmlinkage long sys_read(unsigned int fd, char __user * buf, size_t count)
	__attribute__((alias(__stringify(SyS_read))));

/* Forward declarations: the typed implementation and the all-long wrapper. */
static inline long SYSC_read(unsigned int fd, char __user * buf, size_t count);
asmlinkage long SyS_read(long int fd, long int buf, long int count);

/*
 * Wrapper that receives every argument as a long int, casts each one back to
 * its declared type and forwards the call to SYSC_read().
 *
 * Returns whatever SYSC_read() returns.
 */
asmlinkage long SyS_read(long int fd, long int buf, long int count)
{
	long ret = SYSC_read((unsigned int) fd, (char __user *) buf, (size_t) count);
	/* Invoked after the call with the argument count, the result and the raw arguments. */
	asmlinkage_protect(3, ret, fd, buf, count);
	return ret;
}
static inline long SYSC_read(unsigned int fd, char __user * buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; /* ... */
// file: include/linux/export.h
/*
 * Export symbols from the kernel to modules.  Forked from module.h
 * to reduce the amount of pointless cruft we feed to gcc when only
 * exporting a simple symbol or two.
 *
 * Try not to add #includes here.  It slows compilation and makes kernel
 * hackers place grumpy comments in header files.
 */

/* Indirect, so macros are expanded before pasting. */
#define VMLINUX_SYMBOL(x) __VMLINUX_SYMBOL(x)
#define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x)

/*
 * Second-level helpers used by the two macros above:
 * __VMLINUX_SYMBOL(x) expands to x itself, and __VMLINUX_SYMBOL_STR(x)
 * turns x into a string literal with the # operator.
 */
#define __VMLINUX_SYMBOL(x) x
#define __VMLINUX_SYMBOL_STR(x) #x
# # 64-bit system call numbers and entry vectors # # The format is: # <number> <abi> <name> <entry point> # # The abi is "common", "64" or "x32" for this file. # 0 common read sys_read 1 common write sys_write 2 common open sys_open 3 common close sys_close 4 common stat sys_newstat ......
/*
 * Non-implemented system calls get redirected here.
 *
 * Takes no arguments and always returns -ENOSYS.
 */
asmlinkage long sys_ni_syscall(void)
{
	return -ENOSYS;
}

/* Pointer to a function that takes no arguments and returns nothing. */
typedef void (*sys_call_ptr_t)(void);
/* Declared here with a void(void) signature so that it can be stored in the table below. */
extern void sys_ni_syscall(void);

/*
 * System call table with __NR_syscall_max + 1 slots, indexed by system call
 * number.
 *
 * The range designator first points every slot at sys_ni_syscall. The
 * included <asm/syscalls_64.h> then presumably supplies the initializers for
 * the implemented system calls, overriding that default - the header's
 * contents are not visible here, confirm against the generated file.
 */
const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
	/*
	 * Smells like a compiler bug -- it doesn't work
	 * when the & below is removed.
	 */
	[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
/* * Register setup: * rax system call number * rdi arg0 * rcx return address for syscall/sysret, C arg3 * rsi arg1 * rdx arg2 * r10 arg3 (--> moved to rcx for C) * r8 arg4 * r9 arg5 * r11 eflags for syscall/sysret, temporary for C * r12-r15,rbp,rbx saved by C code, not touched. * * Interrupts are off on entry. * Only called from user space. * * XXX if we had a free scratch register we could save the RSP into the stack frame * and report it properly in ps. Unfortunately we haven't. * * When user can change the frames always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. */
void syscall_init(void) { /* * LSTAR and STAR live in a bit strange symbiosis. * They both write to the same internal register. STAR allows to * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); wrmsrl(MSR_LSTAR, system_call); wrmsrl(MSR_CSTAR, ignore_sysret); ......