文件系统专栏 | 之文件系统挂载

mynotebook 2022-08-31 发布于湖南

展开全文

ext4文件系统挂载

大家可以使用以下命令挂载一个u盘到 /mnt目录下：

mount -t ext4  /dev/sda1 /mnt

其中mount这个应用程序就是使用了mount函数进行系统调用，其系统调用为：

/*
 * mount(2) system call entry point.
 *
 * Forwards its five user-supplied arguments unchanged to ksys_mount()
 * and returns whatever ksys_mount() returns.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
  char __user *, type, unsigned long, flags, void __user *, data)
{
 return ksys_mount(dev_name, dir_name, type, flags, data);
}

ksys_mount函数参数含义：dev_name：设备名字；dir_name：挂载目录；type：文件系统类型；flags：挂载标志位；data：挂载选项。

ksys_mount函数分析，文件在fs/namespace.c中：

/*
 * ksys_mount - kernel-side implementation of the mount system call.
 * @dev_name: user pointer to the device name
 * @dir_name: user pointer to the mount directory (passed through as-is)
 * @type:     user pointer to the filesystem type name
 * @flags:    mount flags
 * @data:     user pointer to the mount options
 *
 * Obtains kernel copies of @type, @dev_name and @data through
 * copy_mount_string()/copy_mount_options(), hands them to do_mount(),
 * and then frees the copies in reverse order of acquisition.
 *
 * Returns the result of do_mount(), or the error encoded in the first
 * copy that failed.
 */
int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
        unsigned long flags, void __user *data)
{
 char *fs_name;
 char *source;
 void *opts;
 int err;

 /* Filesystem type name: nothing to undo yet if this fails. */
 fs_name = copy_mount_string(type);
 if (IS_ERR(fs_name))
  return PTR_ERR(fs_name);

 /* Device name. */
 source = copy_mount_string(dev_name);
 if (IS_ERR(source)) {
  err = PTR_ERR(source);
  goto free_type;
 }

 /* Mount options. */
 opts = copy_mount_options(data);
 if (IS_ERR(opts)) {
  err = PTR_ERR(opts);
  goto free_dev;
 }

 /* The actual mount work happens here. */
 err = do_mount(source, dir_name, fs_name, flags, opts);

 kfree(opts);
free_dev:
 kfree(source);
free_type:
 kfree(fs_name);
 return err;
}

ksys_mount一开始从用户态复制各种信息到内核，然后调用do_mount来执行真正挂载操作：

/*
 * do_mount - validate the request, translate the flags and dispatch to the
 * handler for the requested kind of mount operation.
 * @dev_name:  device name (kernel pointer)
 * @dir_name:  user pointer to the mount directory, resolved here
 * @type_page: filesystem type name
 * @flags:     MS_* flags from the caller
 * @data_page: mount options; if non-NULL its byte at PAGE_SIZE - 1 is zeroed
 *
 * Returns 0 on success or a negative errno. The path resolved from
 * @dir_name is released with path_put() before returning on every path
 * that got past user_path().
 */
long do_mount(const char *dev_name, const char __user *dir_name,
  const char *type_page, unsigned long flags, void *data_page)
{
 struct path path;
 unsigned int mnt_flags = 0, sb_flags;
 int retval = 0;

 /* Discard magic */
 if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
  flags &= ~MS_MGC_MSK;

 /* Basic sanity checks */
 /* Force a terminating zero at the last byte of the option page. */
 if (data_page)
  ((char *)data_page)[PAGE_SIZE - 1] = 0;

 if (flags & MS_NOUSER)
  return -EINVAL;

 /* Resolve the user-supplied directory name into `path`. */
 retval = user_path(dir_name, &path);
 if (retval)
  return retval;

 /*
  * Security hook followed by the may_mount()/may_mandlock() permission
  * checks; the first failure wins and skips straight to cleanup.
  */
 retval = security_sb_mount(dev_name, &path,
       type_page, flags, data_page);
 if (!retval && !may_mount())
  retval = -EPERM;
 if (!retval && (flags & SB_MANDLOCK) && !may_mandlock())
  retval = -EPERM;
 if (retval)
  goto dput_out;

 /* Default to relatime unless MS_NOATIME was requested. */
 if (!(flags & MS_NOATIME))
  mnt_flags |= MNT_RELATIME;

 /* Separate the per-mountpoint flags (MS_* -> MNT_*). */
 if (flags & MS_NOSUID)
  mnt_flags |= MNT_NOSUID;
 if (flags & MS_NODEV)
  mnt_flags |= MNT_NODEV;
 if (flags & MS_NOEXEC)
  mnt_flags |= MNT_NOEXEC;
 if (flags & MS_NOATIME)
  mnt_flags |= MNT_NOATIME;
 if (flags & MS_NODIRATIME)
  mnt_flags |= MNT_NODIRATIME;
 /* Must come after the atime bits above: strictatime clears them. */
 if (flags & MS_STRICTATIME)
  mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
 if (flags & MS_RDONLY)
  mnt_flags |= MNT_READONLY;

 /*
  * On remount with no explicit atime flag, preserve the atime bits of
  * the existing mount instead of applying the default chosen above.
  */
 if ((flags & MS_REMOUNT) &&
     ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
         MS_STRICTATIME)) == 0)) {
  mnt_flags &= ~MNT_ATIME_MASK;
  mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
 }

 /* Keep only the bits that are passed on as superblock flags. */
 sb_flags = flags & (SB_RDONLY |
       SB_SYNCHRONOUS |
       SB_MANDLOCK |
       SB_DIRSYNC |
       SB_SILENT |
       SB_POSIXACL |
       SB_LAZYTIME |
       SB_I_VERSION);

 
 if (flags & MS_REMOUNT)/* remount of an existing mount */
  retval = do_remount(&path, flags, sb_flags, mnt_flags,
        data_page);
 else if (flags & MS_BIND)/* bind mount */
  retval = do_loopback(&path, dev_name, flags & MS_REC);
 else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))/* shared/private/slave/unbindable change on the mount */
  retval = do_change_type(&path, flags);
 else if (flags & MS_MOVE)/* move an existing mount to a new path */
  retval = do_move_mount(&path, dev_name);
 else/* the ordinary case: mount a new filesystem */
  retval = do_new_mount(&path, type_page, sb_flags, mnt_flags,
          dev_name, data_page);
dput_out:
 path_put(&path);
 return retval;
}

do_mount先做一些基本检查和安全相关的工作，并把flags转换成挂载标志mnt_flags和超级块标志sb_flags，然后根据flags参数来判断需要重新挂载、绑定挂载、修改挂载传播类型（shared/private/slave/unbindable）、移动挂载路径，还是我们最常用的普通挂载。普通挂载的操作函数是do_new_mount：

static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
   int mnt_flags, const char *name, void *data)
{
 struct file_system_type *type;
 struct vfsmount *mnt;
 int err;

 if (!fstype)
  return -EINVAL;

 type = get_fs_type(fstype);//根据文件系统名字查找文件系统类型
 if (!type)
  return -ENODEV;

 //主要准备好一个完整的mount结构
 mnt = vfs_kern_mount(type, sb_flags, name, data);
 //如果此文件系统还有子类型（多见于FUSE），设置子文件系统类型名
 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
     !mnt->mnt_sb->s_subtype)
  mnt = fs_set_subtype(mnt, fstype);

 put_filesystem(type);
 if (IS_ERR(mnt))
  return PTR_ERR(mnt);

 //判断mount结构体是否允许访问，主要处理命名空间的问题和挂载掩盖问题
 if (mount_too_revealing(mnt, &mnt_flags)) {
  mntput(mnt);
  return -EPERM;
 }

 //确定父文件系统的挂载点，并且挂载上去
 err = do_add_mount(real_mount(mnt), path, mnt_flags);
 if (err)
  mntput(mnt);
 return err;
}

do_new_mount挂载函数首先根据文件系统名字查找文件系统类型，然后使用vfs_kern_mount构建一个完整的mount结构体，并且在mount_too_revealing函数中检查这个挂载在当前挂载命名空间中是否会暴露过多内容（是则返回-EPERM），最后通过do_add_mount把它添加到挂载树中。以下几个函数都是比较重要的：

vfs_kern_mount
mount_too_revealing
do_add_mount

我们下面先分析第一个函数vfs_kern_mount：

struct vfsmount *
vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
{
 struct mount *mnt;
 struct dentry *root;

 if (!type)
  return ERR_PTR(-ENODEV);

 //分配一个新的struct mount结构体，并初始化里面成员内容
 mnt = alloc_vfsmnt(name);
 if (!mnt)
  return ERR_PTR(-ENOMEM);

 if (flags & SB_KERNMOUNT)
  mnt->mnt.mnt_flags = MNT_INTERNAL;

 //调用具体文件系统的mount回调函数type->mount，继续挂载操作
 root = mount_fs(type, flags, name, data);
 if (IS_ERR(root)) {
  mnt_free_id(mnt);
  free_vfsmnt(mnt);
  return ERR_CAST(root);
 }

 //配置struct mount结构体参数
 mnt->mnt.mnt_root = root;
 mnt->mnt.mnt_sb = root->d_sb;
 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
 mnt->mnt_parent = mnt;
 lock_mount_hash();
 
 //把挂载描述符添加到超级块的挂载实例链表中
 list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
 unlock_mount_hash();
 return &mnt->mnt;
}

vfs_kern_mount首先会分配一个新的struct mount结构体，并初始化里面成员内容，然后通过mount_fs函数回调ext4这个文件系统的mount回调函数，这个回调函数在上一节说得很清楚，可以回去看看，最后配置struct mount结构体参数，并且把挂载描述符添加到超级块的挂载实例链表中后返回。mount_fs函数在文件fs/super.c中：

struct dentry *
mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
{
 struct dentry *root;
 struct super_block *sb;
 char *secdata = NULL;
 int error = -ENOMEM;

 //二进制的mount date需要copy过来
 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
  secdata = alloc_secdata();
  if (!secdata)
   goto out;

  error = security_sb_copy_data(data, secdata);
  if (error)
   goto out_free_secdata;
 }

 //回调file_system_type的mount函数
 root = type->mount(type, flags, name, data);
 if (IS_ERR(root)) {
  error = PTR_ERR(root);
  goto out_free_secdata;
 }
 sb = root->d_sb;
 BUG_ON(!sb);
 WARN_ON(!sb->s_bdi);

 /*
  * Write barrier is for super_cache_count(). We place it before setting
  * SB_BORN as the data dependency between the two functions is the
  * superblock structure contents that we just set up, not the SB_BORN
  * flag.
  */
 smp_wmb();//smp写屏障
 sb->s_flags |= SB_BORN;

 //安全相关
 error = security_sb_kern_mount(sb, flags, secdata);
 if (error)
  goto out_sb;

 /*
  * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
  * but s_maxbytes was an unsigned long long for many releases. Throw
  * this warning for a little while to try and catch filesystems that
  * violate this rule.
  */
 WARN((sb->s_maxbytes < 0), '%s set sb->s_maxbytes to '
  'negative value (%lld)\n', type->name, sb->s_maxbytes);

 up_write(&sb->s_umount);
 free_secdata(secdata);
 return root;
out_sb:
 dput(root);
 deactivate_locked_super(sb);
out_free_secdata:
 free_secdata(secdata);
out:
 return ERR_PTR(error);
}

可以看到mount_fs函数首先根据文件系统是否使用二进制挂载数据（FS_BINARY_MOUNTDATA）来判断要不要申请一块安全数据区存放挂载选项的副本，然后就回调file_system_type的mount函数，这个函数在上一章说得很详细，感兴趣的可以去看一下；之后用smp写屏障保证超级块内容的写入先于SB_BORN标志的设置，最后是安全相关的超级块挂载检查。我们下面分析第二个函数mount_too_revealing，在fs/namespace.c文件中：

static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
{
 const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
 unsigned long s_iflags;

 if (ns->user_ns == &init_user_ns)
  return false;

 /* mount结构体如果不允许访问直接返回失败 */
 s_iflags = mnt->mnt_sb->s_iflags;
 if (!(s_iflags & SB_I_USERNS_VISIBLE))
  return false;

 //如果超级块忽略物理设备或者忽略可执行文件则直接返回ok
 if ((s_iflags & required_iflags) != required_iflags) {
  WARN_ONCE(1, 'Expected s_iflags to contain 0x%lx\n',
     required_iflags);
  return true;
 }
 
 //判断mount结构体是否可以访问
 return !mnt_already_visible(ns, mnt, new_mnt_flags);
}

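mount_too_revealing返回true表示这个挂载会暴露过多内容，do_new_mount会因此返回-EPERM；返回false表示可以继续挂载。它首先判断当前挂载命名空间是否属于初始用户命名空间，是则直接返回false；然后查看超级块是否带有SB_I_USERNS_VISIBLE标志，没有这个标志说明该文件系统不需要做可见性检查，也直接返回false；接着检查超级块是否同时带有SB_I_NOEXEC和SB_I_NODEV标志，缺少任何一个就打印警告并返回true；最后使用mnt_already_visible判断当前命名空间中是否已经存在一个完全可见的同类型挂载：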

/*
 * mnt_already_visible - look for an existing, fully visible mount of the
 * same filesystem type in a mount namespace.
 * @ns:            namespace whose mount list is scanned
 * @new:           the candidate new mount
 * @new_mnt_flags: in/out flags of the new mount; on success the
 *                 MNT_LOCK_READONLY/MNT_LOCK_ATIME bits of the matching
 *                 mount are ORed in
 *
 * Returns true as soon as one mount passes every check below, false if
 * none does. The scan runs with namespace_sem held for reading.
 */
static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
    int *new_mnt_flags)
{
 int new_flags = *new_mnt_flags;
 struct mount *mnt;
 bool visible = false;

 down_read(&namespace_sem);/* take namespace_sem for reading */
 /* Outer loop: every mount in the namespace. */
 list_for_each_entry(mnt, &ns->list, mnt_list) {
  struct mount *child;
  int mnt_flags;

  /* Only mounts of the same filesystem type are candidates. */
  if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
   continue;

  /*
   * This mount is not fully visible if its root directory is not
   * the root directory of the filesystem; try the next one.
   */
  if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
   continue;

  /* A local view of the mount flags */
  mnt_flags = mnt->mnt.mnt_flags;

  /* A read-only superblock is treated as a locked read-only mount. */
  if (sb_rdonly(mnt->mnt.mnt_sb))
   mnt_flags |= MNT_LOCK_READONLY;

  /* Verify the mount flags are equal to or more permissive
   * than the proposed new mount.
   */
  /* Locked read-only here but new mount writable: not a match. */
  if ((mnt_flags & MNT_LOCK_READONLY) &&
      !(new_flags & MNT_READONLY))
   continue;
  /* Locked atime settings here that differ from the new mount's. */
  if ((mnt_flags & MNT_LOCK_ATIME) &&
      ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
   continue;

  /* This mount is not fully visible if there are any
   * locked child mounts that cover anything except for
   * empty directories.
   */
  /* Inner loop: child mounts of this candidate. */
  list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
   struct inode *inode = child->mnt_mountpoint->d_inode;
   /* Only locked children matter; skip the others. */
   if (!(child->mnt.mnt_flags & MNT_LOCKED))
    continue;
   /* Locked child on a non-empty directory: reject this candidate. */
   if (!is_empty_dir_inode(inode))
    goto next;
  }
  /* Preserve the locked attributes */
  *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
            MNT_LOCK_ATIME);
  visible = true;
  goto found;/* one fully visible mount is enough; stop the outer loop */
 next: ;
 }
found:
 up_read(&namespace_sem);/* release namespace_sem */
 return visible;
}

mnt_already_visible主要遍历命名空间下的所有挂载结构体，寻找一个同类型文件系统且完全可见的mount结构体，只要找到一个就可以返回成功了。所以，它在遍历中首先跳过文件系统类型不同的挂载，再查看mount结构体的根目录是不是超级块的根目录，不是则跳过；然后验证挂载标志的权限，新挂载比已有挂载权限更宽松的也跳过；再遍历其子挂载结构体，确定所有被锁定（MNT_LOCKED）的子挂载都只覆盖在空目录上，就可以确定这个挂载是完全可见的。这样子，新的mount结构体就可以挂载到这个命名空间的挂载树上了。mnt_already_visible的主要作用是保证新挂载不会让当前命名空间看到比已有挂载更多的内容。下面的函数，也就是第三个函数do_add_mount，就是把新的mount结构体挂载到这个命名空间的挂载树上的操作函数：

/*
 * do_add_mount - attach a new mount at the location described by @path.
 * @newmnt:    the mount to attach
 * @path:      target location; lock_mount() may advance it to the root of
 *             the topmost mount already stacked there
 * @mnt_flags: flags stored into @newmnt after MNT_INTERNAL_FLAGS is masked
 *
 * Returns 0 on success, the error from lock_mount(), -EINVAL or -EBUSY
 * from the checks below, or the result of graft_tree().
 */
static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
{
 struct mountpoint *mp;
 struct mount *parent;
 int err;

 mnt_flags &= ~MNT_INTERNAL_FLAGS;

 /*
  * More than taking locks: lock_mount() also walks to the topmost
  * mount on @path and returns the mountpoint for that directory.
  */
 mp = lock_mount(path);
 if (IS_ERR(mp))
  return PTR_ERR(mp);

 /* Get the struct mount for path->mnt; it becomes the parent. */
 parent = real_mount(path->mnt);
 err = -EINVAL;
 /* Validation starts here; check_mnt() vets the parent mount first. */
 if (unlikely(!check_mnt(parent))) {
  /* that's acceptable only for automounts done in private ns */
  if (!(mnt_flags & MNT_SHRINKABLE))
   goto unlock;
  /* ... and for those we'd better have mountpoint still alive */
  if (!parent->mnt_ns)
   goto unlock;
 }

 err = -EBUSY;
 /* Refuse the same filesystem on the same mount point */
 if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
     path->mnt->mnt_root == path->dentry)
  goto unlock;

 err = -EINVAL;
 /* The root of the new mount must not be a symlink. */
 if (d_is_symlink(newmnt->mnt.mnt_root))
  goto unlock;

 newmnt->mnt.mnt_flags = mnt_flags;
 /* Graft newmnt onto the mount tree under parent at mp. */
 err = graft_tree(newmnt, parent, mp);

unlock:
 unlock_mount(mp);
 return err;
}

do_add_mount首先通过lock_mount上锁，并且在lock_mount寻找一个合适的挂载点，然后对于以下两种情况：1.在相同挂载点上挂载相同的文件系统；2.新文件系统的挂载实例的根inode是一个符号链接，这两种情况返回错误。最后通过graft_tree把newmnt加入到全局文件系统树中，并且在graft_tree处理了mount结构体、mount的父结构体、挂载点之间的关系，让他们两两指向。接下来我们看看lock_mount函数：

/*
 * lock_mount - find the topmost mount at @path and lock it for mounting.
 * @path: target location; updated in place when something is already
 *        mounted there
 *
 * On success returns the mountpoint for the final dentry, with that
 * dentry's inode lock and the namespace lock still held. On failure
 * returns an ERR_PTR with both locks released.
 */
static struct mountpoint *lock_mount(struct path *path)
{
 struct vfsmount *mnt;
 struct dentry *dentry = path->dentry;/* dentry of the target directory */
retry:
 inode_lock(dentry->d_inode);/* lock the directory's inode */
 if (unlikely(cant_mount(dentry))) {/* directory cannot be mounted on */
  inode_unlock(dentry->d_inode);
  return ERR_PTR(-ENOENT);
 }
 namespace_lock();/* take the namespace lock */
 mnt = lookup_mnt(path);/* is something mounted on this path? */
 if (likely(!mnt)) {/* nothing mounted here: this is the final target */
  struct mountpoint *mp = get_mountpoint(dentry);/* get or create its mountpoint */
  if (IS_ERR(mp)) {
   namespace_unlock();/* failure: drop the namespace lock */
   inode_unlock(dentry->d_inode);/* ... and the inode lock */
   return mp;/* propagate the error pointer */
  }
  return mp;/* success: both locks stay held */
 }
 /* Something is mounted here: drop the locks and descend into it. */
 namespace_unlock();
 inode_unlock(path->dentry->d_inode);
 path_put(path);
 path->mnt = mnt;/* path now refers to the mount that was found */
 dentry = path->dentry = dget(mnt->mnt_root);/* ... at that mount's root dentry */
 goto retry;/* repeat in case another mount is stacked on top */
}

lock_mount函数首先找到挂载目录的dentry，然后判断该目录是否能被挂载，如果不可以则直接返回一个错误。之后通过lookup_mnt函数查找挂载在路径path上的第一个子mount：如果这个mount不为空，说明该目录上已经挂载了文件系统，则把path切换到这个mount的根目录dentry，重复上面的操作，直到找到一个没有被挂载的目录（mount为空）；最后通过get_mountpoint得到一个挂载点，并且返回挂载点。下面看get_mountpoint是怎么从dentry目录获取挂载点的：

/*
 * get_mountpoint - find or create the struct mountpoint for a dentry.
 * @dentry: directory that is about to be mounted on
 *
 * Returns the mountpoint found by lookup_mountpoint() or a newly
 * allocated one added to the mountpoint hash, ERR_PTR(-ENOENT) for an
 * unlinked dentry, ERR_PTR(-ENOMEM) on allocation failure, or the error
 * from d_set_mounted() as an ERR_PTR.
 */
static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
 struct mountpoint *mp, *new = NULL;
 int ret;

 if (d_mountpoint(dentry)) {/* dentry is already marked as a mountpoint */
  /* An unlinked dentry cannot be used. */
  if (d_unlinked(dentry))
   return ERR_PTR(-ENOENT);
mountpoint:
  read_seqlock_excl(&mount_lock);/* lock mount_lock for the lookup */
  mp = lookup_mountpoint(dentry);/* search the mountpoint hash table */
  read_sequnlock_excl(&mount_lock);/* unlock mount_lock */
  if (mp)
   goto done;/* found an existing mountpoint: return it */
 }

 /*
  * Nothing in the hash: allocate a new entry. The !new guard avoids a
  * second allocation when this point is reached again after the
  * -EBUSY retry below.
  */
 if (!new)
  new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
 if (!new)
  return ERR_PTR(-ENOMEM);


 /* Mark the dentry as mounted. */
 ret = d_set_mounted(dentry);

 /* Someone else set it first: go back and look the entry up again. */
 if (ret == -EBUSY)
  goto mountpoint;

 /* Any other error: return it encoded as a pointer. */
 mp = ERR_PTR(ret);
 if (ret)
  goto done;

 /* Add the new mountpoint to the hash table */
 read_seqlock_excl(&mount_lock);
 new->m_dentry = dentry;
 new->m_count = 1;
 hlist_add_head(&new->m_hash, mp_hash(dentry));
 INIT_HLIST_HEAD(&new->m_list);
 read_sequnlock_excl(&mount_lock);

 mp = new;
 new = NULL;
done:
 /* Frees an unused allocation; new is NULL once it has been published. */
 kfree(new);
 return mp;
}

get_mountpoint首先判断dentry是否已经是挂载点，如果是，就使用lookup_mountpoint函数从mountpoint hash表中查找挂载点，找到了直接返回找到的挂载点；找不到说明mountpoint hash表中没有，需要分配一个，然后通过d_set_mounted设置挂载点目录项的标志：如果返回-EBUSY说明其他人已经设置过，回到前面重新查找；如果返回其他错误，则通过ERR_PTR这个宏把错误码作为指针返回；最后将新的挂载点加入到mountpoint hash表中，并且返回刚刚分配的挂载点。挂载操作就到此结束了。

全部调用过程：

ksys_mount
 ↓
do_mount
 ↓
user_path
 ↓
do_new_mount
 ↓
 vfs_kern_mount
  alloc_vfsmnt
  mount_fs
 mount_too_revealing
  mnt_already_visible
 do_add_mount
  lock_mount
  real_mount
  graft_tree