分享

Linux源码剖析——匿名共享内存shmem原理

 waston 2024-12-29 发布于上海

如下问题如果都清楚了就不用看本文了:

1. shmem ram文件系统的初始化流程是怎样的

2. shmem思想上想复用基于文件的操作流程,实现上shmem也引入了一个文件,那么类似文件open会生成struct file,shmem的struct file怎么生成的

3. shmem的physical page是怎么创建的,page属性是如何的(迁移属性,_refcount,_mapcount等)。

4. shmem page怎样回收的

概述

进程间共享匿名内存有两种重要的方式,其一是mmap设置MAP_SHARED | MAP_ANONYMOUS创建的匿名虚拟地址区域,这片区域fork执行之后,由父子进程共享(注意:MAP_PRIVATE区域fork后是写时复制,并不共享)。其二是主动调用shmget/shmat相关接口。此外android操作系统的ashmem匿名共享内存也是基于linux内核的shmem实现。本文我们将从源码角度剖析内核shmem的设计和实现原理。

shmem设计的基本思想

当vma中的页面对应磁盘文件时,系统在缺页的时候为了读取一页会调用vm_area_struct->vm_ops中的fault函数(filemap_fault)。要把一个page写回磁盘设备使用inode->i_mapping->a_ops或者page->mapping->a_ops在address_space_operations找到相应的writepage函数。当进行普通文件操作,如mmap(),read()和write()时,系统会通过struct file *filp的file_operations指针f_op进行相应的函数调用。f_op在文件open时候从inode->i_fop设置:

fs/open.c: static int do_dentry_open

上面的流程很清晰,不过却无法处理匿名页的情况,因为匿名页没有对应一个文件,所以为复用上述清晰的逻辑Linux引入了基于RAM文件系统的文件,vma都由这个文件系统中的一个文件作为后援。

同时,shmem中的page也要考虑如何回收:即页面回收时会写入swap分区当中。
 

虚拟文件系统初始化

 linux系统启动的过程中会调用到shmem_init初始化虚拟文件系统,linux给shmem创建虚拟文件,必然就有文件对应的inode,shmem文件系统创建的inode附带shmem_inode_info结构,这个结构含有文件系统的私有信息,SHMEM_I()函数以inode为参数,返回shmem_inode_info,而shmem_init_inode_cache就是初始化shmem_inode_info的kmem_cache:

 shmem_inode_info定义在<linux/shmem_fs.h>:

 各个字段的含义:

lock : 数据结构并发访问保护的自旋锁。

flags: 相关标志,详见linux mm.h中的介绍。

alloced: shmem创建的pages的数量。

swapped: 当前inode有很多pages,其中写入swapcache交换缓存的page数量。

shrinklist: 与huge page相关,暂不分析。

swaplist: shmem_inode_info结构体通过该字段挂到shmem.c中shmem_swaplist双向链表中,这个挂接过程是在shmem_writepage中实现,后面源码会分析到。也就是说inode对应的page要写入swapcache的时候,就要将shmem_inode_info挂到shmem_swaplist。

注册文件系统

shmem函数指针结构体

shmem中定义了address_space_operations结构体shmem_aops和vm_operations_struct shmem_vm_ops,分别对应缺页处理和写页面到swapcache中,定义分别如下:

  1. static const struct vm_operations_struct shmem_vm_ops = {
  2. .fault = shmem_fault, /* page-fault handler for shmem-backed VMAs */
  3. .map_pages = filemap_map_pages,
  4. #ifdef CONFIG_NUMA
  5. .set_policy = shmem_set_policy,
  6. .get_policy = shmem_get_policy,
  7. #endif
  8. };
  9. static const struct address_space_operations shmem_aops = {
  10. .writepage = shmem_writepage, /* moves a page into the swap cache on reclaim */
  11. .set_page_dirty = __set_page_dirty_no_writeback,
  12. #ifdef CONFIG_TMPFS
  13. .write_begin = shmem_write_begin,
  14. .write_end = shmem_write_end,
  15. #endif
  16. #ifdef CONFIG_MIGRATION
  17. .migratepage = migrate_page,
  18. #endif
  19. .error_remove_page = generic_error_remove_page,
  20. };

匿名VMA使用shmem_vm_ops作为vm_operations_struct,所以缺页中断时会调用shmem_fault分配物理page。

文件和索引节点操作需要两个数据结构file_operations和inode_operations,分别定义如下:

  1. static const struct file_operations shmem_file_operations = {
  2. .mmap = shmem_mmap,
  3. .get_unmapped_area = shmem_get_unmapped_area,
  4. #ifdef CONFIG_TMPFS
  5. .llseek = shmem_file_llseek,
  6. .read_iter = shmem_file_read_iter,
  7. .write_iter = generic_file_write_iter, /* entry point for user-space writes into shmem */
  8. .fsync = noop_fsync,
  9. .splice_read = generic_file_splice_read,
  10. .splice_write = iter_file_splice_write,
  11. .fallocate = shmem_fallocate,
  12. #endif
  13. };
  14. static const struct inode_operations shmem_inode_operations = {
  15. .getattr = shmem_getattr,
  16. .setattr = shmem_setattr,
  17. #ifdef CONFIG_TMPFS_XATTR
  18. .listxattr = shmem_listxattr,
  19. .set_acl = simple_set_acl,
  20. #endif
  21. };

用户态空间向shmem 内存中写入数据就可以调用shmem_file_operations的write_iter接口。

shmem虚拟文件系统创建文件(类似普通文件的open过程)

shmem设计仿照了普通文件的流程,创建普通文件的时候,内核态会初始化struct file结构体和文件相应的inode,shmem这里使用虚拟文件系统也是类似流程,本小节描述shmem对应的文件的创建流程,具体实现函数为:shmem_file_setup,该函数主要创建shmem的struct file和inode结构体 。

  1. static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
  2. unsigned long flags, unsigned int i_flags)
  3. {
  4. struct inode *inode;
  5. struct file *res;
  6. ...
  7. inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
  8. flags);
  9. if (unlikely(!inode)) {
  10. shmem_unacct_size(flags, size);
  11. return ERR_PTR(-ENOSPC);
  12. }
  13. inode->i_flags |= i_flags;
  14. inode->i_size = size;
  15. clear_nlink(inode); /* It is unlinked */
  16. res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
  17. if (!IS_ERR(res))
  18. //the new file's f_op is set to shmem_file_operations
  19. res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
  20. &shmem_file_operations);
  21. if (IS_ERR(res))
  22. iput(inode);
  23. return res;
  24. }

shmem_get_inode创建inode;alloc_file_pseudo创建file对象。

shmem_get_inode函数:

  1. static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
  2. umode_t mode, dev_t dev, unsigned long flags)
  3. {
  4. struct inode *inode;
  5. struct shmem_inode_info *info;
  6. struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
  7. ino_t ino;
  8. if (shmem_reserve_inode(sb, &ino))
  9. return NULL;
  10. inode = new_inode(sb);
  11. if (inode) {
  12. ...
  13. switch (mode & S_IFMT) {
  14. ...
  15. case S_IFREG:
  16. inode->i_mapping->a_ops = &shmem_aops; /* wire up shmem's page-cache ops */
  17. inode->i_op = &shmem_inode_operations;
  18. inode->i_fop = &shmem_file_operations;
  19. mpol_shared_policy_init(&info->policy,
  20. shmem_get_sbmpol(sbinfo));
  21. break;
  22. ...
  23. }
  24. lockdep_annotate_inode_mutex_key(inode);
  25. } else
  26. shmem_free_inode(sb); /* release the inode reservation on allocation failure */
  27. return inode;
  28. }

shmem的物理page创建

发生缺页中断的时候,如果vma->vm_ops->fault存在,do_fault文件缺页处理函数中会调用该fault函数,具体可以参考Linux mmap系统调用视角看缺页中断_nginux的博客-CSDN博客

所以shmem的缺页中断会调用shmem_vm_ops 中的fault函数,即shmem_fault。核心函数是shmem_getpage_gfp:负责分配新页或者在swapcache或者swap分区中找到该页。

  1. static vm_fault_t shmem_fault(struct vm_fault *vmf)
  2. {
  3. struct vm_area_struct *vma = vmf->vma;
  4. struct inode *inode = file_inode(vma->vm_file);
  5. gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
  6. enum sgp_type sgp;
  7. int err;
  8. vm_fault_t ret = VM_FAULT_LOCKED;
  9. ...
  10. 处理与fallocate相关逻辑,暂不分析。
  11. ...
  12. sgp = SGP_CACHE;
  13. if ((vma->vm_flags & VM_NOHUGEPAGE) ||
  14. test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
  15. sgp = SGP_NOHUGE;
  16. else if (vma->vm_flags & VM_HUGEPAGE)
  17. sgp = SGP_HUGE;
  18. //core routine of fault handling: allocates the physical page
  19. err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
  20. gfp, vma, vmf, &ret);
  21. if (err)
  22. return vmf_error(err);
  23. return ret;
  24. }

shmem_getpage_gfp

  1. /*
  2. * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
  3. *
  4. * If we allocate a new one we do not mark it dirty. That's up to the
  5. * vm. If we swap it in we mark it dirty since we also free the swap
  6. * entry since a page cannot live in both the swap and page cache.
  7. *
  8. * vmf and fault_type are only supplied by shmem_fault:
  9. * otherwise they are NULL.
  10. */
  11. static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
  12. struct page **pagep, enum sgp_type sgp, gfp_t gfp,
  13. struct vm_area_struct *vma, struct vm_fault *vmf,
  14. vm_fault_t *fault_type)
  15. {
  16. struct address_space *mapping = inode->i_mapping;
  17. struct shmem_inode_info *info = SHMEM_I(inode);
  18. ...
  19. //look up the cache behind mapping first; if not found there, try the swap cache and the swap area
  20. page = find_lock_entry(mapping, index);
  21. if (xa_is_value(page)) {
  22. error = shmem_swapin_page(inode, index, &page,
  23. sgp, gfp, vma, fault_type);
  24. if (error == -EEXIST)
  25. goto repeat;
  26. *pagep = page;
  27. return error;
  28. }
  29. if (page && sgp == SGP_WRITE)
  30. mark_page_accessed(page);
  31. ...
  32. alloc_huge:
  33. page = shmem_alloc_and_acct_page(gfp, inode, index, true);
  34. if (IS_ERR(page)) {
  35. alloc_nohuge:
  36. //ultimately calls alloc_page to create the physical page; internally calls __SetPageSwapBacked(page);
  37. page = shmem_alloc_and_acct_page(gfp, inode,
  38. index, false);
  39. }
  40. ...
  41. if (sgp == SGP_WRITE)
  42. __SetPageReferenced(page);
  43. //insert the new page into the cache space of mapping; also sets the page->mapping and page->index fields
  44. error = shmem_add_to_page_cache(page, mapping, hindex,
  45. NULL, gfp & GFP_RECLAIM_MASK,
  46. charge_mm);
  47. if (error)
  48. goto unacct;
  49. //add the page to the proper LRU list; for shmem that is the inactive anon LRU,
  50. //because the kernel decides which LRU via page_is_file_lru, which tests
  51. //!PageSwapBacked: if it holds, the page goes to the file LRU, otherwise the anon LRU.
  52. lru_cache_add(page);
  53. alloced = true;
  54. ...
  55. }

 要点:

shmem_alloc_and_acct_page最终调用alloc_page创建物理page,内部会调用__SetPageSwapBacked(page); 即shmem page是SwapBacked

shmem_add_to_page_cache:将新建的page加入mapping对应的缓存空间,设置了page->mapping和page->index字段,同时增加了NR_FILE_PAGES和NR_SHMEM计数:

 lru_cache_add:page加入相应的lru链表,shmem是inactive anon lru链表因为内核最终判定是加入哪个lru是通过page_is_file_lru,该函数:!PageSwapBacked,如果满足即file lru,否则 anon lru。

 shmem页面回写

shmem的物理page在内存紧张的时候会进行回收,由于不像file-backed page可以写回磁盘,shmem的流程某些程度上类似anon page,会通过pageout换出到交换分区,其调用栈如下:

  1. #0 shmem_writepage (page=0xffffea0000008000, wbc=0xffff888004857440) at mm/shmem.c:1371
  2. #1 0xffffffff8135e671 in pageout (page=0xffffea0000008000, mapping=0xffff888000d78858) at mm/vmscan.c:830
  3. #2 0xffffffff8134f168 in shrink_page_list (page_list=0xffff888004857850, pgdat=0xffff888007fda000, sc=0xffff888004857d90, ttu_flags=(unknown: 0), stat=0xffff888004857890, ignore_references=false)
  4. at mm/vmscan.c:1355
  5. #3 0xffffffff81351477 in shrink_inactive_list (nr_to_scan=1, lruvec=0xffff888005c6a000, sc=0xffff888004857d90, lru=LRU_INACTIVE_ANON) at mm/vmscan.c:1962
  6. #4 0xffffffff81352312 in shrink_list (lru=LRU_INACTIVE_ANON, nr_to_scan=1, lruvec=0xffff888005c6a000, sc=0xffff888004857d90) at mm/vmscan.c:2172
  7. #5 0xffffffff81352c97 in shrink_lruvec (lruvec=0xffff888005c6a000, sc=0xffff888004857d90) at mm/vmscan.c:2467
  8. #6 0xffffffff813533f1 in shrink_node_memcgs (pgdat=0xffff888007fda000, sc=0xffff888004857d90) at mm/vmscan.c:2655
  9. #7 0xffffffff81353b0a in shrink_node (pgdat=0xffff888007fda000, sc=0xffff888004857d90) at mm/vmscan.c:2772
  10. #8 0xffffffff81355cd8 in kswapd_shrink_node (pgdat=0xffff888007fda000, sc=0xffff888004857d90) at mm/vmscan.c:3514
  11. #9 0xffffffff813561ac in balance_pgdat (pgdat=0xffff888007fda000, order=0, highest_zoneidx=0) at mm/vmscan.c:3672
  12. #10 0xffffffff81356ae4 in kswapd (p=0xffff888007fda000) at mm/vmscan.c:3930
  13. #11 0xffffffff811a2249 in kthread (_create=<optimized out>) at kernel/kthread.c:292

shmem_writepage在回收页面时将shmem写到swapcache当中:

  1. /*
  2. * Move the page from the page cache to the swap cache.
  3. */
  4. static int shmem_writepage(struct page *page, struct writeback_control *wbc)
  5. {
  6. struct shmem_inode_info *info;
  7. struct address_space *mapping;
  8. struct inode *inode;
  9. swp_entry_t swap;
  10. pgoff_t index;
  11. VM_BUG_ON_PAGE(PageCompound(page), page);
  12. BUG_ON(!PageLocked(page));
  13. mapping = page->mapping;
  14. index = page->index;
  15. inode = mapping->host;
  16. info = SHMEM_I(inode);
  17. if (info->flags & VM_LOCKED)
  18. goto redirty;
  19. if (!total_swap_pages)
  20. goto redirty;
  21. ...
  22. //grab a free slot in the swap area.
  23. swap = get_swap_page(page);
  24. if (!swap.val)
  25. goto redirty;
  26. /*
  27. * Add inode to shmem_unuse()'s list of swapped-out inodes,
  28. * if it's not already there. Do it now before the page is
  29. * moved to swap cache, when its pagelock no longer protects
  30. * the inode from eviction. But don't unlock the mutex until
  31. * we've incremented swapped, because shmem_unuse_inode() will
  32. * prune a !swapped inode from the swaplist under this mutex.
  33. */
  34. mutex_lock(&shmem_swaplist_mutex);
  35. //hook the current shmem_inode_info onto shmem_swaplist
  36. if (list_empty(&info->swaplist))
  37. list_add(&info->swaplist, &shmem_swaplist);
  38. //add the page to the swap-cache address_space
  39. if (add_to_swap_cache(page, swap,
  40. __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
  41. NULL) == 0) {
  42. spin_lock_irq(&info->lock);
  43. shmem_recalc_inode(inode);
  44. info->swapped++;
  45. spin_unlock_irq(&info->lock);
  46. swap_shmem_alloc(swap);
  47. //remove the page from the page-cache space
  48. shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
  49. mutex_unlock(&shmem_swaplist_mutex);
  50. BUG_ON(page_mapped(page));
  51. //start writing to the swap area: either to disk or to zram compressed memory.
  52. swap_writepage(page, wbc);
  53. return 0;
  54. }
  55. ...
  56. return 0;
  57. }

注意:

  • swap_writepage会调用set_page_writeback设置page状态为writeback,也就是说page正在回写,这影响/proc/meminfo Writeback统计,也就是说不管是匿名页写回交换分区(或者压缩zram),还是write系统调用page cache向磁盘文件回写,都将统计到NR_WRITEBACK中,影响proc/meminfo的Writeback字段统计。

    本站是提供个人知识管理的网络存储空间,所有内容均由用户发布,不代表本站观点。请注意甄别内容中的联系方式、诱导购买等信息,谨防诈骗。如发现有害或侵权内容,请点击一键举报。
    转藏 分享 献花(0

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多