
The page fault handler

 Liucw2012 2012-09-24

Copyright notice: when reposting, please credit the original source and author with a hyperlink and keep this notice.
http://wanderer-zjhit./logs/157601763.html

Key point: user-mode page faults are usually caused by one of the following four situations:
The physical page is not present and the page-table entry is 0: the contents have never been read from disk into physical memory and no virtual-to-physical mapping exists yet; a new page is allocated or the data is read in (demand paging).
The physical page is not present and the pte's file bit (_PAGE_FILE) is set: a non-linear file mapping; the remaining bits of the entry give the page's offset within the file, and the page is read back in through the page cache.
The physical page is not present, the pte is non-zero and the file bit is clear: an anonymous page that has been swapped out; the remaining bits of the entry give its slot in the swap area, from which it is read back in.
The physical page is present, the virtual memory region is writable, but the page-table entry is not: this is the copy-on-write situation set up when a new process is created. Whichever of the parent and child first tries to write a shared page frame takes a fault; the kernel copies that frame into a new one, points the writer's pte at the copy and marks it writable, and decrements the old frame's reference count. The old frame stays write-protected, so when the other process later writes to it the kernel checks whether that process is now the frame's only user; if so, the frame is simply marked writable for that process by updating its pte, and no copy is made.
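Before walking through the kernel code, a small user-space illustration of the first case (demand paging) may help. This is my own stand-alone sketch, not kernel code: it counts the minor faults taken when freshly mmap'ed anonymous pages are touched for the first time.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/resource.h>

int main(void)
{
 const size_t pages = 256, psize = 4096;
 struct rusage before, after;
 char *p;

 /* anonymous private mapping: the VMA exists, but every pte is still empty */
 p = mmap(NULL, pages * psize, PROT_READ | PROT_WRITE,
          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 if (p == MAP_FAILED)
  return 1;

 getrusage(RUSAGE_SELF, &before);
 for (size_t i = 0; i < pages; i++)
  p[i * psize] = 1;   /* first touch of each page -> do_anonymous_page */
 getrusage(RUSAGE_SELF, &after);

 printf("touched %zu pages, took about %ld minor faults\n",
        pages, after.ru_minflt - before.ru_minflt);
 munmap(p, pages * psize);
 return 0;
}

Each write lands on an empty pte, so the kernel takes the pte_none() branch shown later in handle_pte_fault and maps in a fresh zeroed page.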

1 The page fault handler do_page_fault
The error_code argument carries the fault error code pushed by the CPU; the faulting virtual address itself is saved in the cr2 register.
Bit        value 0                 value 1
0          page not present        protection fault
1          read access             write access
2          kernel mode             user mode
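These bits correspond to the PF_* constants used in arch/x86/mm/fault.c (PF_PROT, PF_WRITE, PF_USER, plus PF_RSVD and PF_INSTR for reserved-bit and instruction-fetch faults). Below is a small stand-alone sketch of decoding an error code; the helper function is made up purely for illustration.

#include <stdio.h>

/* bit definitions mirroring the PF_* constants in arch/x86/mm/fault.c */
#define PF_PROT  (1UL << 0) /* 0: page not present, 1: protection fault */
#define PF_WRITE (1UL << 1) /* 0: read access,      1: write access     */
#define PF_USER  (1UL << 2) /* 0: kernel mode,      1: user mode        */
#define PF_RSVD  (1UL << 3) /* reserved bit set in a paging entry       */
#define PF_INSTR (1UL << 4) /* fault on an instruction fetch            */

/* hypothetical helper, not part of the kernel */
static void describe_fault(unsigned long error_code)
{
 printf("%s, %s access, %s mode\n",
        (error_code & PF_PROT)  ? "protection fault" : "page not present",
        (error_code & PF_WRITE) ? "write" : "read",
        (error_code & PF_USER)  ? "user" : "kernel");
}

int main(void)
{
 describe_fault(PF_USER | PF_WRITE);   /* e.g. a user-mode copy-on-write fault */
 describe_fault(0);                    /* kernel read of a missing page        */
 return 0;
}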

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
 struct vm_area_struct *vma;
 struct task_struct *tsk;
 unsigned long address;
 struct mm_struct *mm;
 int write;
 int fault;

 tsk = current;
 mm = tsk->mm;

 /* Get the faulting address: */
 address = read_cr2(); /* the faulting address is saved in the cr2 register */

 /*
  * For reference, read_cr2() resolves to:
  *
  *   static inline unsigned long native_read_cr2(void)
  *   {
  *    unsigned long val;
  *    asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
  *    return val;
  *   }
  *
  * Detect and handle instructions that would cause a page fault for
  * both a tracked kernel page and a userspace page.
  */
 if (kmemcheck_active(regs))
  kmemcheck_hide(regs);
 prefetchw(&mm->mmap_sem);

 if (unlikely(kmmio_fault(regs, address)))
  return;

 /*
  * We fault-in kernel-space virtual memory on-demand. The
  * 'reference' page table is init_mm.pgd.
  *
  * NOTE! We MUST NOT take any locks for this case. We may
  * be in an interrupt or a critical region, and should
  * only copy the information from the master page table,
  * nothing more.
  *
  * This verifies that the fault happens in kernel space
  * (error_code & 4) == 0, and that the fault was not a
  * protection error (error_code & 9) == 0.
  */
 if (unlikely(fault_in_kernel_space(address))) { /* the faulting address lies in kernel space */
  if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) { /* fault happened in kernel mode and is not a protection fault */
   if (vmalloc_fault(address) >= 0) /* vmalloc fault: sync this entry with the kernel master page table */
    return;

   if (kmemcheck_fault(regs, address, error_code))
    return;
  }

  /* Can handle a stale RO->RW TLB: */
  if (spurious_fault(error_code, address))
   return;

  /* kprobes don't want to hook the spurious faults: */
  if (notify_page_fault(regs))
   return;
  /*
   * Don't take the mm semaphore here. If we fixup a prefetch
   * fault we could otherwise deadlock:
   */
  bad_area_nosemaphore(regs, error_code, address);

  return;
 }

 /* kprobes don't want to hook the spurious faults: */
 if (unlikely(notify_page_fault(regs)))
  return;
 /*
  * It's safe to allow irq's after cr2 has been saved and the
  * vmalloc fault has been handled.
  *
  * User-mode registers count as a user access even for any
  * potential system fault or CPU buglet:
  */
 if (user_mode_vm(regs)) {
  local_irq_enable();
  error_code |= PF_USER;
 } else {
  if (regs->flags & X86_EFLAGS_IF)
   local_irq_enable();
 }

 if (unlikely(error_code & PF_RSVD))
  pgtable_bad(regs, error_code, address);

 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);

 /*
  * If we're in an interrupt, have no user context or are running
  * in an atomic region then we must not take the fault:
  */
 if (unlikely(in_atomic() || !mm)) { /* in an atomic context, or no mm (kernel thread with no user address space): the fault cannot be handled here */
  bad_area_nosemaphore(regs, error_code, address);
  return;
 }

 /*
  * When running in the kernel we expect faults to occur only to
  * addresses in user space.  All other faults represent errors in
  * the kernel and should generate an OOPS.  Unfortunately, in the
  * case of an erroneous fault occurring in a code path which already
  * holds mmap_sem we will deadlock attempting to validate the fault
  * against the address space.  Luckily the kernel only validly
  * references user space from well defined areas of code, which are
  * listed in the exceptions table.
  *
  * As the vast majority of faults will be valid we will only perform
  * the source reference check when there is a possibility of a
  * deadlock. Attempt to lock the address space, if we cannot we then
  * validate the source. If this is invalid we can skip the address
  * space check, thus avoiding the deadlock:
  */
 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
  if ((error_code & PF_USER) == 0 &&
      !search_exception_tables(regs->ip)) {
   bad_area_nosemaphore(regs, error_code, address);
   return;
  }
  down_read(&mm->mmap_sem);
 } else {
  /*
   * The above down_read_trylock() might have succeeded in
   * which case we'll have missed the might_sleep() from
   * down_read():
   */
  might_sleep();
 }

 vma = find_vma(mm, address); /* the fault is in user space and the process has an address space: look up the VMA */
 if (unlikely(!vma)) { /* no VMA at or above this address: the address is not mapped into the process */
  bad_area(regs, error_code, address);
  return;
 }
 if (likely(vma->vm_start <= address)) /* the address lies inside this VMA */
  goto good_area;
 if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
  bad_area(regs, error_code, address);
  return;
 }
 if (error_code & PF_USER) {
  /*
   * Accessing the stack below %sp is always a bug.
   * The large cushion allows instructions like enter
   * and pusha to work. ("enter $65535, $31" pushes
   * 32 pointers and then decrements %sp by 65535.)
   */
  if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
   bad_area(regs, error_code, address);
   return;
  }
 }
 if (unlikely(expand_stack(vma, address))) { /* grow the stack VMA down to cover the address */
  bad_area(regs, error_code, address);
  return;
 }

 /*
  * Ok, we have a good vm_area for this memory access, so
  * we can handle it..
  */
good_area:
 write = error_code & PF_WRITE;

 if (unlikely(access_error(error_code, write, vma))) { /* a write fault on a VMA that forbids writing, or a read fault on a VMA allowing neither read, write nor execute, is an access error */
  bad_area_access_error(regs, error_code, address);
  return;
 }

 /*
  * If for any reason at all we couldn't handle the fault,
  * make sure we exit gracefully rather than endlessly redo
  * the fault:
  */
 fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);

 if (unlikely(fault & VM_FAULT_ERROR)) { /* handle_mm_fault itself can fail, e.g. while setting up the new page */
  mm_fault_error(regs, error_code, address, fault);
  return;
 }

 if (fault & VM_FAULT_MAJOR) { /* major fault: the data had to be read from a block device */
  tsk->maj_flt++;
  perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
         regs, address);
 } else {   /* minor fault: the data was already in memory */
  tsk->min_flt++;
  perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
         regs, address);
 }

 check_v8086_mode(regs, address, tsk);

 up_read(&mm->mmap_sem);
}
2 If the faulting address lies in kernel space, the fault happened in kernel mode and it is not a protection fault, the current process's kernel page tables are repaired against the init ("reference") page tables by vmalloc_fault(address) in arch/x86/mm/fault.c.
    The function reads the current process's page global directory via the cr3 register, looks up the corresponding entry in the init process's page tables (swapper_pg_dir), and copies that kernel entry into the entry that caused the fault.
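A simplified sketch of that synchronization, modelled on the shape of the 32-bit vmalloc_fault/vmalloc_sync_one pair of this kernel generation (error handling, paravirt hooks and the 64-bit variant are left out, so treat it as an outline rather than the real function):

/* outline only: propagate the kernel ("reference") mapping for a vmalloc
 * address from init_mm's page tables into the page tables the CPU is
 * currently using */
static int vmalloc_fault_sketch(unsigned long address)
{
 pgd_t *pgd, *pgd_k;
 pud_t *pud, *pud_k;
 pmd_t *pmd, *pmd_k;
 pte_t *pte_k;

 if (address < VMALLOC_START || address >= VMALLOC_END)
  return -1;                       /* not a vmalloc address */

 /* current page tables: whatever cr3 points at */
 pgd   = (pgd_t *)__va(read_cr3()) + pgd_index(address);
 /* reference page tables: init_mm.pgd, i.e. swapper_pg_dir */
 pgd_k = init_mm.pgd + pgd_index(address);
 if (!pgd_present(*pgd_k))
  return -1;

 pud   = pud_offset(pgd, address);
 pud_k = pud_offset(pgd_k, address);
 if (!pud_present(*pud_k))
  return -1;

 pmd   = pmd_offset(pud, address);
 pmd_k = pmd_offset(pud_k, address);
 if (!pmd_present(*pmd_k))
  return -1;

 if (!pmd_present(*pmd))
  set_pmd(pmd, *pmd_k);            /* copy the kernel entry into this process */

 /* the pte level is shared with init_mm, so it only has to be present there */
 pte_k = pte_offset_kernel(pmd_k, address);
 if (!pte_present(*pte_k))
  return -1;
 return 0;
}

Only the upper-level entry is copied; the kernel-space pte pages themselves are shared by all processes, so once the entry points at them the fault is resolved without taking any locks.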
3 If the fault happened in user space, a vm_area_struct covers the faulting address, and that region permits the attempted access (read or write, as recorded in error_code), the fault is resolved by handle_mm_fault:
/*
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,unsigned long address, unsigned int flags)
{
 pgd_t *pgd;
 pud_t *pud;
 pmd_t *pmd;
 pte_t *pte;

 __set_current_state(TASK_RUNNING);

 count_vm_event(PGFAULT); /* bump the page-fault event counter */

 if (unlikely(is_vm_hugetlb_page(vma)))
  return hugetlb_fault(mm, vma, address, flags);

 pgd = pgd_offset(mm, address);
 pud = pud_alloc(mm, pgd, address);
 if (!pud)
  return VM_FAULT_OOM;
 pmd = pmd_alloc(mm, pud, address);
 if (!pmd)
  return VM_FAULT_OOM;
 pte = pte_alloc_map(mm, pmd, address); /* walk (and, if needed, allocate) down to the pte for this address */
 if (!pte)
  return VM_FAULT_OOM;

 return handle_pte_fault(mm, vma, address, pte, pmd, flags); /* establish the mapping between the physical page and the pte */
}
4 Resolving user-space page faults: handle_pte_fault
/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
  struct vm_area_struct *vma, unsigned long address,
  pte_t *pte, pmd_t *pmd, unsigned int flags)
{
 pte_t entry;
 spinlock_t *ptl;

 entry = *pte;
 if (!pte_present(entry)) { /* bit 0 of the pte is 0: the physical page is not present */
  if (pte_none(entry)) { /* case 1: the pte is entirely empty; the page has never been mapped in */
   if (vma->vm_ops) {
    if (likely(vma->vm_ops->fault))
     return do_linear_fault(mm, vma, address,
      pte, pmd, flags, entry);
   }
   return do_anonymous_page(mm, vma, address,
       pte, pmd, flags); /* no vm_ops->fault handler: anonymous demand paging */
  }
  if (pte_file(entry)) /* case 2: the pte is non-empty and its file bit is set: non-linear file mapping */
   return do_nonlinear_fault(mm, vma, address,
     pte, pmd, flags, entry);
  /* case 3: the pte holds a swap entry (file bit clear): anonymous page, read it back from the swap area */
  return do_swap_page(mm, vma, address, pte, pmd, flags, entry);
 }

 ptl = pte_lockptr(mm, pmd);
 spin_lock(ptl);
 if (unlikely(!pte_same(*pte, entry)))
  goto unlock;
 /* case 4: the VMA is writable but the pte is not: copy-on-write, copy the page and remap */
 if (flags & FAULT_FLAG_WRITE) {
  if (!pte_write(entry))
   return do_wp_page(mm, vma, address,
     pte, pmd, ptl, entry); /* copy-on-write; do_wp_page also makes the resulting pte writable */
  entry = pte_mkdirty(entry); /* VMA and pte are both writable: just mark the page dirty */
 }
 entry = pte_mkyoung(entry);
 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
  update_mmu_cache(vma, address, entry);
 }
 else { /* the access flags did not change: the fault was a pure protection fault */
  /*
   * This is needed only for protection faults but the arch code
   * is not yet telling us if this is a protection fault or not.
   * This still avoids useless tlb flushes for .text page faults
   * with threads.
   */
  if (flags & FAULT_FLAG_WRITE)
   flush_tlb_page(vma, address);
 }
unlock:
 pte_unmap_unlock(pte, ptl);
 return 0;
}
Return values:
VM_FAULT_MAJOR: the fault forced the current process to sleep, typically because data had to be copied in from disk.
VM_FAULT_MINOR: the fault was handled without blocking the current process.
VM_FAULT_OOM: there was not enough memory.
VM_FAULT_SIGBUS: any other error; the process is sent SIGBUS.
4-0 From the user-space fault path above we can see the four situations that cause a fault ("physical page not present" corresponds to the lowest bit of the pte being 0):
The physical page is not present and the pte is 0: the contents have never been read from disk into physical memory and no virtual-to-physical mapping exists yet; a new page is allocated or the data is read in.
The physical page is not present and the pte's file bit is set: a non-linear file mapping; the remaining bits give the page's offset within the file, and the page is read in through the page cache.
The physical page is not present, the pte is non-zero and the file bit is clear: an anonymous page that was swapped out; the remaining bits give its slot in the swap area, from which it is read back in.
The physical page is present, the VMA is writable, but the pte is not: the copy-on-write case created by fork; the page is copied and the writer's pte is remapped to the copy.
4-1 Copy-on-write: do_wp_page
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
  unsigned long address, pte_t *page_table, pmd_t *pmd,
  spinlock_t *ptl, pte_t orig_pte)
{
 struct page *old_page, *new_page;
 pte_t entry;
 int reuse = 0, ret = 0;
 int page_mkwrite = 0;
 struct page *dirty_page = NULL;

 old_page = vm_normal_page(vma, address, orig_pte); /* get the struct page of the physical page behind the pte */
 if (!old_page) {
  /*
   * VM_MIXEDMAP !pfn_valid() case
   *
   * We should not cow pages in a shared writeable mapping.
   * Just mark the pages writable as we can't do any dirty
   * accounting on raw pfn maps.
   */
  if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
         (VM_WRITE|VM_SHARED))
   goto reuse;
  goto gotten;
 }

 /*
  * Take out anonymous pages first, anonymous shared vmas are
  * not dirty accountable.
  */
 if (PageAnon(old_page) && !PageKsm(old_page)) {
  if (!trylock_page(old_page)) {
   page_cache_get(old_page);
   pte_unmap_unlock(page_table, ptl);
   lock_page(old_page);
   page_table = pte_offset_map_lock(mm, pmd, address,
        &ptl);
   if (!pte_same(*page_table, orig_pte)) {
    unlock_page(old_page);
    page_cache_release(old_page);
    goto unlock;
   }
   page_cache_release(old_page);
  }
  reuse = reuse_swap_page(old_page);
  unlock_page(old_page);
 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
     (VM_WRITE|VM_SHARED))) {
  /*
   * Only catch write-faults on shared writable pages,
   * read-only shared pages can get COWed by
   * get_user_pages(.write=1, .force=1).
   */
  if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
   struct vm_fault vmf;
   int tmp;

   vmf.virtual_address = (void __user *)(address &
        PAGE_MASK);
   vmf.pgoff = old_page->index;
   vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
   vmf.page = old_page;

   /*
    * Notify the address space that the page is about to
    * become writable so that it can prohibit this or wait
    * for the page to get into an appropriate state.
    *
    * We do this without the lock held, so that it can
    * sleep if it needs to.
    */
   page_cache_get(old_page);
   pte_unmap_unlock(page_table, ptl);

    tmp = vma->vm_ops->page_mkwrite(vma, &vmf); /* tell the filesystem that this shared page is about to become writable */
   if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
    ret = tmp;
    goto unwritable_page;
   }
   if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
    lock_page(old_page);
    if (!old_page->mapping) {
     ret = 0; /* retry the fault */
     unlock_page(old_page);
     goto unwritable_page;
    }
   } else
    VM_BUG_ON(!PageLocked(old_page));

   /*
    * Since we dropped the lock we need to revalidate
    * the PTE as someone else may have changed it.  If
    * they did, we just return, as we can count on the
    * MMU to tell us if they didn't also make it writable.
    */
   page_table = pte_offset_map_lock(mm, pmd, address,
        &ptl);
    if (!pte_same(*page_table, orig_pte)) { /* we may have slept above, so re-check that the pte is unchanged */
    unlock_page(old_page);
    page_cache_release(old_page);
    goto unlock;
   }

   page_mkwrite = 1;
  }
  dirty_page = old_page;
  get_page(dirty_page);
  reuse = 1;
 }

 if (reuse) {
reuse:
  flush_cache_page(vma, address, pte_pfn(orig_pte));
  entry = pte_mkyoung(orig_pte);
  entry = maybe_mkwrite(pte_mkdirty(entry), vma);
  if (ptep_set_access_flags(vma, address, page_table, entry,1))
   update_mmu_cache(vma, address, entry);
  ret |= VM_FAULT_WRITE;
  goto unlock;
 }

 /*
  * Ok, we need to copy. Oh, well..
  */
 page_cache_get(old_page);
gotten:
 pte_unmap_unlock(page_table, ptl);

 if (unlikely(anon_vma_prepare(vma)))
  goto oom;

 if (is_zero_pfn(pte_pfn(orig_pte))) {
  new_page = alloc_zeroed_user_highpage_movable(vma, address);
  if (!new_page)
   goto oom;
 } else {
   new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); /* allocate a fresh page */
  if (!new_page)
   goto oom;
   cow_user_page(new_page, old_page, address, vma); /* copy the old page's contents into the new page */
 }
 __SetPageUptodate(new_page); /* set the PG_uptodate bit in page->flags */

 /*
  * Don't let another task, with possibly unlocked vma,
  * keep the mlocked page.
  */
 if ((vma->vm_flags & VM_LOCKED) && old_page) {
  lock_page(old_page); /* for LRU manipulation */
  clear_page_mlock(old_page);
  unlock_page(old_page);
 }

 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
  goto oom_free_new;

 /*
  * Re-check the pte - we dropped the lock
  */
 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 if (likely(pte_same(*page_table, orig_pte))) {
  if (old_page) {
   if (!PageAnon(old_page)) {
    dec_mm_counter(mm, file_rss);
    inc_mm_counter(mm, anon_rss);
   }
  } else
   inc_mm_counter(mm, anon_rss);
  flush_cache_page(vma, address, pte_pfn(orig_pte));
   entry = mk_pte(new_page, vma->vm_page_prot);   /* build the pte value for the new page */
   entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* mark it dirty and make it writable again */
  /*
   * Clear the pte entry and flush it first, before updating the
   * pte with the new entry. This will avoid a race condition
   * seen in the presence of one thread doing SMC and another
   * thread doing COW.
   */
  ptep_clear_flush(vma, address, page_table);
   page_add_new_anon_rmap(new_page, vma, address); /* hook the new page into the vma's anonymous reverse mapping (anon_vma) */
  /*
   * We call the notify macro here because, when using secondary
   * mmu page tables (such as kvm shadow page tables), we want the
   * new page to be mapped directly into the secondary page table.
   */
   set_pte_at_notify(mm, address, page_table, entry); /* install the new pte in the faulting process's page table */
  update_mmu_cache(vma, address, entry);
  if (old_page) {
    page_remove_rmap(old_page); /* drop the old read-only page's reverse mapping (decrement its _mapcount) */
  }

  /* Free the old page.. */
  new_page = old_page;
  ret |= VM_FAULT_WRITE;
 } else
  mem_cgroup_uncharge_page(new_page);

 if (new_page)
  page_cache_release(new_page);
 if (old_page)
  page_cache_release(old_page);
unlock:
 pte_unmap_unlock(page_table, ptl);
 if (dirty_page) {
  /*
   * Yes, Virginia, this is actually required to prevent a race
   * with clear_page_dirty_for_io() from clearing the page dirty
   * bit after it clear all dirty ptes, but before a racing
   * do_wp_page installs a dirty pte.
   *
   * do_no_page is protected similarly.
   */
  if (!page_mkwrite) {
   wait_on_page_locked(dirty_page);
   set_page_dirty_balance(dirty_page, page_mkwrite);
  }
  put_page(dirty_page);
  if (page_mkwrite) {
   struct address_space *mapping = dirty_page->mapping;

   set_page_dirty(dirty_page);
   unlock_page(dirty_page);
   page_cache_release(dirty_page);
   if (mapping) {
    /*
     * Some device drivers do not set page.mapping
     * but still dirty their pages
     */
    balance_dirty_pages_ratelimited(mapping);
   }
  }

  /* file_update_time outside page_lock */
  if (vma->vm_file)
   file_update_time(vma->vm_file);
 }
 return ret;
...
}
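The effect of do_wp_page is easy to watch from user space. The following stand-alone sketch (my own illustration, not kernel code) writes to fork-shared anonymous pages in the child and counts the minor faults the copies cost; the parent's data stays untouched.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/resource.h>

int main(void)
{
 const size_t pages = 64, psize = 4096;
 char *buf = malloc(pages * psize);
 struct rusage before, after;
 pid_t pid;

 memset(buf, 'A', pages * psize);      /* fault the pages in before fork() */

 pid = fork();
 if (pid == 0) {                       /* child: the shared ptes are now write-protected */
  getrusage(RUSAGE_SELF, &before);
  memset(buf, 'B', pages * psize);     /* roughly one COW fault per page */
  getrusage(RUSAGE_SELF, &after);
  printf("child: %ld minor faults for %zu copied pages\n",
         after.ru_minflt - before.ru_minflt, pages);
  _exit(0);
 }
 wait(NULL);
 printf("parent still sees '%c': the child wrote to its own copies\n", buf[0]);
 free(buf);
 return 0;
}

Whether the parent's side of a page is later un-write-protected in place (the reuse path at the top of do_wp_page) or copied as well depends on who writes next and how many users the frame still has.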
How the reverse-mapping data structure for the new physical page is built: page_add_new_anon_rmap(new_page, vma, address) ends up in __page_set_anon_rmap:
/**
 * __page_set_anon_rmap - setup new anonymous rmap
 * @page: the page to add the mapping to
 * @vma: the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_set_anon_rmap(struct page *page,struct vm_area_struct *vma, unsigned long address)
{
 struct anon_vma *anon_vma = vma->anon_vma;

 BUG_ON(!anon_vma);
 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 page->mapping = (struct address_space *) anon_vma;

 page->index = linear_page_index(vma, address);

 /*
  * nr_mapped state can be updated without turning off
  * interrupts because it is not modified via interrupt.
  */
 __inc_zone_page_state(page, NR_ANON_PAGES);
}
Analysis: each vm_area_struct covers many physical pages. When the first physical page is allocated for a region, an anon_vma list head is created so that the page can later be reverse-mapped to every process using it, and every vm_area_struct sharing such pages points at it through its anon_vma field. When a further physical page is allocated for the same linear address region, its page->mapping field is simply pointed at that same anon_vma. Reverse-mapping such a page then means walking the list headed by the anon_vma that page->mapping refers to, which finds every region (and hence every process) sharing the same anonymous mapping; the pages can share one anon_vma because they belong to the same virtual region of the process. This is the "object-based" reverse-mapping scheme.
    If the page belongs to a file mapping (the low bits of page->mapping are 0, the field points to an address_space, i.e. the page maps a file), the mapping regions are found by looking up page->index (the page's offset within the file) in the address_space's i_mmap priority search tree. If it is an anonymous mapping, the low bit of page->mapping is 1 and the pointer refers to an anon_vma, which heads the list of the virtual memory regions of all processes sharing that anonymous page, so every such region can be walked.
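A small sketch of that encoding (the constant value matches the kernel's PAGE_MAPPING_ANON; the helper functions themselves are only illustrative):

#define PAGE_MAPPING_ANON 1   /* low bit of page->mapping marks an anonymous page */

/* illustrative helpers, not the kernel's own */
static inline int page_is_anon(struct page *page)
{
 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

static inline struct anon_vma *page_anon_vma_sketch(struct page *page)
{
 if (!page_is_anon(page))
  return NULL;   /* file-backed: page->mapping is really an address_space */
 return (struct anon_vma *)((unsigned long)page->mapping - PAGE_MAPPING_ANON);
}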
