VFS介绍 在跟踪流程open流程之前我们首先了解一下VFS,Linux 内核中的 VFS(Virtual File System)是一个抽象层,它隐藏了基础文件系统(如 ext4、NTFS 等)的实现细节,并提供一个统一的文件系统接口给用户空间和内核使用。VFS 可以让应用程序在使用不同的底层文件系统时不必了解底层文件系统的细节,这样可以方便用户使用并提高文件系统的灵活性、可扩展性。 VFS 在内核中的位置是 fs/vfs 目录,它包含了各种文件系统操作的定义和实现,例如文件创建、文件打开、读写文件等。VFS 定义了一系列文件相关的结构体,如文件描述符(file descriptor)、超级块(superblock)、索引节点(inode)等,这些结构体描述了一个文件在内核中的属性和状态,并提供了相应的操作函数。这些操作函数在不同的底层文件系统中都具有类似的功能,但底层实现可能会不同。 VFS 也提供了一些虚拟文件系统,比如 tmpfs、proc、sysfs 等,它们在内存中创建了一个文件系统,并提供了一些虚拟文件和目录,用于映射内核状态或特定设备的信息。 总的来说,VFS 是一个非常重要的组件,在 Linux 内核中起到了连接用户空间和底层文件系统的桥梁作用。 open系统调用定义 open系统调用定义在内核源码目录fs/open.c里面 SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { if (force_o_largefile()) flags |= O_LARGEFILE; return do_sys_open(dfd, filename, flags, mode); } 参数: dfd 文件描述符 filename 文件具体路径名字 flags打开文件标志位,方式 mode 权限控制位 open跟踪 由上面系统调用定义可以知道,不管调用那种方式打开文件,最后都会调用到一个函数中去do_sys_open中。 long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_flags op; int fd = build_open_flags(flags, mode, &op); struct filename *tmp; if (fd) return fd; tmp = getname(filename); //1 if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(flags);//2 if (fd >= 0) { struct file *f = do_filp_open(dfd, tmp, &op);//3 if (IS_ERR(f)) { put_unused_fd(fd); fd = PTR_ERR(f); } else { fsnotify_open(f); fd_install(fd, f);// 4 } } putname(tmp); return fd; //5 } 流程说明: 1.从用户空间获取文件名字,路径; 2.根据当前进程打开文件描述表(也就是一个文件描述符数组 stdin stdout stderr类似)获取 一个空闲的文件描述符fd; 3.调用真正打开文件流程; 4.把当前进程的打开文件与fd安装到,进程描述表中,类似让fd与file绑定; 5.把当前打开文件获取到描述符返回用户空间; 接下来重点跟踪do_filp_open,它是打开文件程序主流程 struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; set_nameidata(&nd, dfd, pathname); filp = path_openat(&nd, op, flags | LOOKUP_RCU);//1 if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return filp; } 调用到path_openat函数中 static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { const char *s; struct file *file; int opened = 0; int error; file = get_empty_filp(); //1 if (IS_ERR(file)) return file; file->f_flags = op->open_flag; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file, &opened); goto out2; } if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); if (!error) opened |= FILE_OPENED; goto out2; } s = path_init(nd, flags);//2 if (IS_ERR(s)) { put_filp(file); return ERR_CAST(s); } while (!(error = link_path_walk(s, nd)) && (error = do_last(nd, file, op, &opened)) > 0) {//3 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); s = trailing_symlink(nd); if (IS_ERR(s)) { error = PTR_ERR(s); break; } } terminate_walk(nd); out2: if (!(opened & FILE_OPENED)) { BUG_ON(!error); put_filp(file); } if (unlikely(error)) { if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } file = ERR_PTR(error); } return file; }
3. do_last() 是 Linux 内核在进行打开文件操作时的一个重要函数,定义在 fs/namei.c 中。 它的作用是在 VFS 中进行文件名到文件描述符的转换,并构建一个表示打开文件的 struct file 结构体; /* * Handle the last step of open() */ static int do_last(struct nameidata *nd, struct file *file, const struct open_flags *op, int *opened) { struct dentry *dir = nd->path.dentry; kuid_t dir_uid = nd->inode->i_uid; umode_t dir_mode = nd->inode->i_mode; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; int acc_mode = op->acc_mode; unsigned seq; struct inode *inode; struct path path; int error; nd->flags &= ~LOOKUP_PARENT; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); if (unlikely(error)) return error; goto finish_open; } if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; /* we _can_ be in RCU mode here */ error = lookup_fast(nd, &path, &inode, &seq); if (likely(error > 0)) goto finish_lookup; if (error < 0) return error; BUG_ON(nd->inode != dir->d_inode); BUG_ON(nd->flags & LOOKUP_RCU); } else { /* create side of things */ /* * This will *only* deal with leaving RCU mode - LOOKUP_JUMPED * has been cleared when we got to the last component we are * about to look up */ error = complete_walk(nd); if (error) return error; audit_inode(nd->name, dir, LOOKUP_PARENT); /* trailing slashes? */ if (unlikely(nd->last.name[nd->last.len])) return -EISDIR; } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { error = mnt_want_write(nd->path.mnt); if (!error) got_write = true; /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write, opened); if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (error <= 0) { if (error) goto out; if ((*opened & FILE_CREATED) || !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; audit_inode(nd->name, file->f_path.dentry, 0); goto opened; } if (*opened & FILE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; will_truncate = false; acc_mode = 0; path_to_nameidata(&path, nd); goto finish_open_created; } /* * If atomic_open() acquired write access it is dropped now due to * possible mount and symlink following (this might be optimized away if * necessary...) */ if (got_write) { mnt_drop_write(nd->path.mnt); got_write = false; } error = follow_managed(&path, nd); if (unlikely(error < 0)) return error; if (unlikely(d_is_negative(path.dentry))) { path_to_nameidata(&path, nd); return -ENOENT; } /* * create/update audit record if it already exists. */ audit_inode(nd->name, path.dentry, 0); if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { path_to_nameidata(&path, nd); return -EEXIST; } seq = 0; /* out of RCU mode, so the value doesn't matter */ inode = d_backing_inode(path.dentry); finish_lookup: error = step_into(nd, &path, 0, inode, seq); if (unlikely(error)) return error; finish_open: /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) return error; audit_inode(nd->name, nd->path.dentry, 0); if (open_flag & O_CREAT) { error = -EISDIR; if (d_is_dir(nd->path.dentry)) goto out; error = may_create_in_sticky(dir_mode, dir_uid, d_backing_inode(nd->path.dentry)); if (unlikely(error)) goto out; } error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; if (!d_is_reg(nd->path.dentry)) will_truncate = false; if (will_truncate) { error = mnt_want_write(nd->path.mnt); if (error) goto out; got_write = true; } finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file, current_cred()); if (error) goto out; *opened |= FILE_OPENED; opened: error = open_check_o_direct(file); if (!error) error = ima_file_check(file, op->acc_mode, *opened); if (!error && will_truncate) error = handle_truncate(file); out: if (unlikely(error) && (*opened & FILE_OPENED)) fput(file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (got_write) mnt_drop_write(nd->path.mnt); return error; } 代码比较长,我们只需要抓住主干,vfs_open finish_open_created: error = may_open(&nd->path, acc_mode, open_flag);//这里会有权限检查 if (error) goto out; BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file, current_cred()); if (error) goto out; *opened |= FILE_OPENED; error = lookup_open(nd, &path, file, op, got_write, opened) lookup_open()是 Linux 内核中处理文件打开的一个重要函数,定义在文件 fs/namei.c 中。该函数的作用是根据给定的路径名(由 nameidata结构体表示)查找文件或者目录并打开它,最终返回一个打开的文件描述符。 首先,lookup_open() 函数调用 path_lookupat() 函数来查找路径名所对应的 inode, 这个 inode 中记录了该文件详细信息和路径,如果查找不到相关的 inode,表示文件不存在,函数返回一个错误值(例如,ENOENT)。接下来,文件系统会带上 file->f_path.dentry->d_inode(dentry 表示“路径”中的目录项)与操作标记 op 进入 combine_open() 函数,进一步进行打开文件操作。如果找到了文件 inode,lookup_open() 函数会检查文件访问权限并打开文件。如果打开文件失败,函数会返回一个错误指针,否则返回打开的文件描述符。 需要注意的是,lookup_open() 函数的作用是打开一个文件(或目录),而不是创建文件。若需要创建文件,应该使用稍有不同的函数,如 vfs_create() 或 vfs_mkdir(). vfs_open是直接调用到了虚拟文件系统的标准接口去 int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); file->f_path = *path; return do_dentry_open(file, d_backing_inode(dentry), NULL, cred); } inode与file file_operations三者数据结构的关系 struct inode、struct file 和 struct file_operations 是 Linux 内核中与文件相关的三个重要结构体,它们之间存在紧密的关系: struct inode结构体描述一个文件在内核中的属性和状态,例如文件所有者、文件权限、文件大小等。每个打开的文件都有一个对应的 struct inode结构体。struct file 结构体的 f_inode成员指向对应的 struct inode 结构体。 struct file结构体表示一个打开的文件描述符,它包含了一些与文件操作相关的信息,例如当前文件位置、读写操作指针、文件打开方式等。struct file_operations结构体包含了一些操作文件的函数指针,如 read()、write()、open()、release() 等。struct file 结构体中的 f_op 成员指向对应的 struct file_operations结构体。 当应用程序通过系统调用打开一个文件时,Linux 内核会首先在文件系统中查找对应的 struct inode结构体。然后,内核会为打开的文件创建一个对应的 struct file 结构体,并将其与 struct inode结构体关联起来。最后,内核会为打开的文件选择一个合适的 struct file_operations结构体,它包含了对该文件进行操作的函数指针。 因此,struct inode、struct file 和 struct file_operations都是文件系统 VFS 层面的结构体,它们分别表示文件的属性和状态、文件描述符以及文件操作的接口函数。这三个结构体是 Linux 内核中文件操作的基础。 struct file struct file { union { struct llist_node fu_llist; struct rcu_head fu_rcuhead; } f_u; struct path f_path; struct inode *f_inode; /* cached value *///关键成员 const struct file_operations *f_op; ////关键成员 /* * Protects f_ep_links, f_flags. * Must not be taken from IRQ context. */ spinlock_t f_lock; enum rw_hint f_write_hint; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; const struct cred *f_cred; struct file_ra_state f_ra; u64 f_version; #ifdef CONFIG_SECURITY void *f_security; #endif /* needed for tty driver, and maybe others */ void *private_data; #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_operations struct file_operations { struct module *owner; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64); ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *, u64); } __randomize_layout; struct inode struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; kgid_t i_gid; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif /* Stat data, not accessed from path walking */ unsigned long i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; struct timespec i_atime; struct timespec i_mtime; struct timespec i_ctime; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; unsigned int i_blkbits; enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif /* Misc */ unsigned long i_state; struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ /* foreign inode detection, see wbc_detach_inode() */ int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; u64 i_version; atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #ifdef CONFIG_IMA atomic_t i_readcount; /* struct files open RO */ #endif const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ struct file_lock_context *i_flctx; struct address_space i_data; struct list_head i_devices; union { struct pipe_inode_info *i_pipe; struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; }; __u32 i_generation; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_info *i_crypt_info; #endif void *i_private; /* fs or device private pointer */ } __randomize_layout; 继续跟踪主流程 static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *), const struct cred *cred) { static const struct file_operations empty_fops = {}; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; path_get(&f->f_path); f->f_inode = inode; f->f_mapping = inode->i_mapping; /* Ensure that we skip any errors that predate opening of the file */ f->f_wb_err = filemap_sample_wb_err(f->f_mapping); if (unlikely(f->f_flags & O_PATH)) { f->f_mode = FMODE_PATH; f->f_op = &empty_fops; return 0; } /* Any file opened for execve()/uselib() has to be a regular file. */ if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) { error = -EACCES; goto cleanup_file; } if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); if (unlikely(error)) goto cleanup_file; error = __mnt_want_write(f->f_path.mnt); if (unlikely(error)) { put_write_access(inode); goto cleanup_file; } f->f_mode |= FMODE_WRITER; } /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop);//1 if (unlikely(WARN_ON(!f->f_op))) { error = -ENODEV; goto cleanup_all; } error = security_file_open(f, cred); //2 if (error) goto cleanup_all; error = break_lease(locks_inode(f), f->f_flags); if (error) goto cleanup_all; if (!open) open = f->f_op->open; if (open) { error = open(inode, f);//3 if (error) goto cleanup_all; } if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(inode); if ((f->f_mode & FMODE_READ) && likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; f->f_write_hint = WRITE_LIFE_NOT_SET; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping); return 0; cleanup_all: fops_put(f->f_op); if (f->f_mode & FMODE_WRITER) { put_write_access(inode); __mnt_drop_write(f->f_path.mnt); } cleanup_file: path_put(&f->f_path); f->f_path.mnt = NULL; f->f_path.dentry = NULL; f->f_inode = NULL; return error; }
安全模块的 hook 函数在内核中扮演着重要的角色。它可以进行访问权限的检查、进程识别以及文件标签等相关的安全检查。在 Linux 中,安全模块 hook 函数作为内核和用户空间的接口,可以定制一些安全策略以允许或拒绝对系统的访问。 需要注意的是,在进行文件访问控制时,除了安全模块 hook 函数外,还有 SELinux、AppArmor 等不同的安全模块可以用于提供强大的安全保护,这些模块也可以通过其相应的 hook 函数来进行安全检查。 到此,打开文件的整个流程分析介绍。小弟才疏学浅,如有错误请大神指正!!!!!!! |
|