Key point: user-mode page faults are generally caused by one of four situations:
1. The physical page is not present and the PTE is zero: the data has never been read from disk into physical memory and no virtual-to-physical mapping exists yet; the page must be newly allocated or read in.
2. The physical page is not present and the PTE's file bit is set (checked by pte_file()): this is a non-linear file mapping, and the remaining PTE bits record the page's offset within the file; the page is read in through the page cache.
3. The physical page is not present, the PTE is non-zero and the file bit is clear: the page is an anonymous page that was swapped out, and the remaining PTE bits record its offset in the swap area; it is read back in from swap.
4. The physical page is present, the VMA is writable, but the PTE is write-protected: this is the copy-on-write case created when a new process is forked. Whenever either the parent or the child tries to write a shared page frame, the CPU raises a fault; the kernel copies the frame to a new one, points the faulting process's PTE at the copy and marks it writable, and decrements the reference count of the original frame, which stays write-protected. When the other process later writes the page, the kernel checks whether that process is now the sole user of the frame; if so, it simply marks the frame writable for that process by updating its PTE, so the page can be written without another copy. (A small user-space demonstration follows below.)
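To make the copy-on-write case concrete, here is a minimal user-space sketch (written for this write-up, not taken from the kernel source): it maps a private anonymous page, forks, and writes to the page in the child. The child's write takes the write-protection fault path ending in do_wp_page() described below, and afterwards parent and child see independent copies.

/* cow_demo.c - minimal sketch of copy-on-write behaviour after fork(). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>

int main(void)
{
    /* Private anonymous mapping: one page, shared read-only after fork(). */
    char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) { perror("mmap"); return 1; }

    strcpy(buf, "parent data");

    pid_t pid = fork();
    if (pid == 0) {
        /* Child: this write faults, the kernel copies the page frame
         * (do_wp_page) and redirects the child's PTE to the new copy. */
        strcpy(buf, "child data");
        printf("child  sees: %s\n", buf);
        _exit(0);
    }

    wait(NULL);
    /* Parent still sees its own data: the child's write did not touch
     * the parent's page frame. */
    printf("parent sees: %s\n", buf);
    munmap(buf, 4096);
    return 0;
}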
1 The page fault handler. The error_code argument describes the fault, and the faulting virtual address is saved in the CR2 register. error_code bit layout:
bit 0: 0 = page not present, 1 = protection violation
bit 1: 0 = read access, 1 = write access
bit 2: 0 = fault in kernel mode, 1 = fault in user mode
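The PF_* masks tested in the code below correspond to these bits. In this era of the kernel they are defined in arch/x86/mm/fault.c roughly as follows (PF_RSVD and PF_INSTR cover the reserved-bit and instruction-fetch conditions not listed in the table above):

enum x86_pf_error_code {
    PF_PROT  = 1 << 0,  /* 0: no page found, 1: protection fault */
    PF_WRITE = 1 << 1,  /* 0: read access,   1: write access */
    PF_USER  = 1 << 2,  /* 0: kernel-mode,   1: user-mode access */
    PF_RSVD  = 1 << 3,  /* use of a reserved page-table bit detected */
    PF_INSTR = 1 << 4,  /* fault was an instruction fetch */
};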
/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
    struct vm_area_struct *vma;
    struct task_struct *tsk;
    unsigned long address;
    struct mm_struct *mm;
    int write;
    int fault;

    tsk = current;
    mm = tsk->mm;
    /* Get the faulting address: */
    address = read_cr2();   /* the faulting address is saved in the CR2 register */
    /*
     * read_cr2() ultimately reads the CR2 register:
     *
     *   static inline unsigned long native_read_cr2(void)
     *   {
     *       unsigned long val;
     *       asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
     *       return val;
     *   }
     */

    /*
     * Detect and handle instructions that would cause a page fault for
     * both a tracked kernel page and a userspace page.
     */
    if (kmemcheck_active(regs))
        kmemcheck_hide(regs);
    prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address))) return;
    /*
     * We fault-in kernel-space virtual memory on-demand. The
     * 'reference' page table is init_mm.pgd.
     *
     * NOTE! We MUST NOT take any locks for this case. We may
     * be in an interrupt or a critical region, and should
     * only copy the information from the master page table,
     * nothing more.
     *
     * This verifies that the fault happens in kernel space
     * (error_code & 4) == 0, and that the fault was not a
     * protection error (error_code & 9) == 0.
     */
    if (unlikely(fault_in_kernel_space(address))) {     /* the faulting address lies in kernel space */
        if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {    /* fault raised in kernel mode and not caused by a protection error */
            if (vmalloc_fault(address) >= 0)    /* vmalloc fault: synchronize the process's kernel page tables with the master (init) page tables */
                return;
            if (kmemcheck_fault(regs, address, error_code))
                return;
        }

        /* Can handle a stale RO->RW TLB: */
        if (spurious_fault(error_code, address))
            return;

        /* kprobes don't want to hook the spurious faults: */
        if (notify_page_fault(regs))
            return;
        /*
         * Don't take the mm semaphore here. If we fixup a prefetch
         * fault we could otherwise deadlock:
         */
        bad_area_nosemaphore(regs, error_code, address);

        return;
    }
    /* kprobes don't want to hook the spurious faults: */
    if (unlikely(notify_page_fault(regs)))
        return;
    /*
     * It's safe to allow irq's after cr2 has been saved and the
     * vmalloc fault has been handled.
     *
     * User-mode registers count as a user access even for any
     * potential system fault or CPU buglet:
     */
    if (user_mode_vm(regs)) {
        local_irq_enable();
        error_code |= PF_USER;
    } else {
        if (regs->flags & X86_EFLAGS_IF)
            local_irq_enable();
    }

    if (unlikely(error_code & PF_RSVD))
        pgtable_bad(regs, error_code, address);
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
    /*
     * If we're in an interrupt, have no user context or are running
     * in an atomic region then we must not take the fault:
     */
    if (unlikely(in_atomic() || !mm)) {     /* in an atomic context, or no mm: a pure kernel thread has no user address space, so the fault cannot be handled here */
        bad_area_nosemaphore(regs, error_code, address);
        return;
    }
    /*
     * When running in the kernel we expect faults to occur only to
     * addresses in user space. All other faults represent errors in
     * the kernel and should generate an OOPS. Unfortunately, in the
     * case of an erroneous fault occurring in a code path which already
     * holds mmap_sem we will deadlock attempting to validate the fault
     * against the address space. Luckily the kernel only validly
     * references user space from well defined areas of code, which are
     * listed in the exceptions table.
     *
     * As the vast majority of faults will be valid we will only perform
     * the source reference check when there is a possibility of a
     * deadlock. Attempt to lock the address space, if we cannot we then
     * validate the source. If this is invalid we can skip the address
     * space check, thus avoiding the deadlock:
     */
    if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
        if ((error_code & PF_USER) == 0 &&
            !search_exception_tables(regs->ip)) {
            bad_area_nosemaphore(regs, error_code, address);
            return;
        }
        down_read(&mm->mmap_sem);
    } else {
        /*
         * The above down_read_trylock() might have succeeded in
         * which case we'll have missed the might_sleep() from
         * down_read():
         */
        might_sleep();
    }
    vma = find_vma(mm, address);    /* the fault is in user space and the process has an address space: look up the VMA */
    if (unlikely(!vma)) {   /* the address is not mapped into any VMA of the process: unmapped-address error */
        bad_area(regs, error_code, address);
        return;
    }
    if (likely(vma->vm_start <= address))   /* the address lies inside this VMA */
        goto good_area;
    if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
        bad_area(regs, error_code, address);
        return;
    }
    if (error_code & PF_USER) {
        /*
         * Accessing the stack below %sp is always a bug.
         * The large cushion allows instructions like enter
         * and pusha to work. ("enter $65535, $31" pushes
         * 32 pointers and then decrements %sp by 65535.)
         */
        if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
            bad_area(regs, error_code, address);
            return;
        }
    }
    if (unlikely(expand_stack(vma, address))) {     /* grow the stack VMA downwards to cover the address */
        bad_area(regs, error_code, address);
        return;
    }
    /*
     * Ok, we have a good vm_area for this memory access, so
     * we can handle it..
     */
good_area:
    write = error_code & PF_WRITE;

    if (unlikely(access_error(error_code, write, vma))) {   /* a write fault on a VMA that may not be written, or a read fault on a VMA that allows neither read, write nor execute: access error */
        bad_area_access_error(regs, error_code, address);
        return;
    }
    /*
     * If for any reason at all we couldn't handle the fault,
     * make sure we exit gracefully rather than endlessly redo
     * the fault:
     */
    /* the permission checks have passed; handle_mm_fault() does the real work */
    fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
    if (unlikely(fault & VM_FAULT_ERROR)) {     /* resolving the fault can itself fail, e.g. while allocating the new page */
        mm_fault_error(regs, error_code, address, fault);
        return;
    }
    if (fault & VM_FAULT_MAJOR) {   /* major fault: the data had to be read from a block device */
        tsk->maj_flt++;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address);
    } else {    /* minor fault: the data was already in memory */
        tsk->min_flt++;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address);
    }
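    /*
     * Note added for this write-up (not part of the kernel source): the
     * maj_flt/min_flt counters incremented here are visible from user
     * space, e.g. as ru_majflt/ru_minflt returned by
     * getrusage(RUSAGE_SELF, &ru), or in /proc/<pid>/stat, which is a
     * convenient way to observe whether an access took the major or the
     * minor fault path.
     */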
check_v8086_mode(regs, address, tsk);
    up_read(&mm->mmap_sem);
}

2 If the faulting address lies in kernel space, the fault was raised in kernel mode and was not caused by a protection error, the process's kernel page tables are repaired from the reference (init) page tables by calling vmalloc_fault(address) in arch/x86/mm/fault.c. That function first reads the current process's page directory from the CR3 register, then looks up the corresponding entry in the init process's page directory (swapper_pg_dir), and copies it into the entry that caused the fault.

3 If the fault occurred in user space, a vm_area_struct exists that covers the faulting address, and that VMA permits the attempted access (read or write, as encoded in error_code), the fault is resolved by handle_mm_fault:

/*
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
__set_current_state(TASK_RUNNING);
    count_vm_event(PGFAULT);    /* bump the global page-fault statistics counter */
    if (unlikely(is_vm_hugetlb_page(vma)))
        return hugetlb_fault(mm, vma, address, flags);

    pgd = pgd_offset(mm, address);
    pud = pud_alloc(mm, pgd, address);
    if (!pud)
        return VM_FAULT_OOM;
    pmd = pmd_alloc(mm, pud, address);
    if (!pmd)
        return VM_FAULT_OOM;
    pte = pte_alloc_map(mm, pmd, address);  /* walk (and if necessary allocate) the page-table levels to obtain the pte entry for this virtual address */
    if (!pte)
        return VM_FAULT_OOM;
    return handle_pte_fault(mm, vma, address, pte, pmd, flags);    /* establish the mapping between the physical page and this pte entry */
}

4 Fixing up user-space page faults

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures). The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
        struct vm_area_struct *vma, unsigned long address,
        pte_t *pte, pmd_t *pmd, unsigned int flags)
{
    pte_t entry;
    spinlock_t *ptl;
    entry = *pte;
    if (!pte_present(entry)) {  /* present bit (bit 0) clear: the physical page is not in memory */
        if (pte_none(entry)) {  /* case 1: the pte is completely empty, the page has never been set up, load it from scratch */
            if (vma->vm_ops) {
                if (likely(vma->vm_ops->fault))
                    return do_linear_fault(mm, vma, address,
                            pte, pmd, flags, entry);
            }
            return do_anonymous_page(mm, vma, address,
                            pte, pmd, flags);   /* fall back to the anonymous-page handler */
        }
        if (pte_file(entry))    /* case 2: the file bit is set and the pte carries a file offset: non-linear mapping */
            return do_nonlinear_fault(mm, vma, address,
                    pte, pmd, flags, entry);
        /* case 3: the pte is non-empty but the file bit is clear: the page was swapped out, read it back from the swap area */
        return do_swap_page(mm, vma, address,
                    pte, pmd, flags, entry);
    }
    ptl = pte_lockptr(mm, pmd);
    spin_lock(ptl);
    if (unlikely(!pte_same(*pte, entry)))
        goto unlock;
    /* case 4: the VMA is writable but the pte is not: copy-on-write, copy the page and remap */
    if (flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry))
            return do_wp_page(mm, vma, address,
                    pte, pmd, ptl, entry);  /* break the write protection (copy-on-write) */
        entry = pte_mkdirty(entry);     /* both the VMA and the pte allow writing: just mark the page dirty */
    }
    entry = pte_mkyoung(entry);
    if (ptep_set_access_flags(vma, address, pte, entry,
                    flags & FAULT_FLAG_WRITE)) {
        update_mmu_cache(vma, address, entry);
    } else {    /* the fault was a pure protection fault */
        /*
         * This is needed only for protection faults but the arch code
         * is not yet telling us if this is a protection fault or not.
         * This still avoids useless tlb flushes for .text page faults
         * with threads.
         */
        if (flags & FAULT_FLAG_WRITE)
            flush_tlb_page(vma, address);
    }
unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
}

Return values:
VM_FAULT_MAJOR: the fault forced the current process to sleep, typically because data had to be read in from disk
VM_FAULT_MINOR: the fault was handled without blocking the current process
VM_FAULT_OOM: not enough memory
VM_FAULT_SIGBUS: any other error

4-0 From the user-space fault handling above, the four causes of a page fault can be seen. Physical page not present (present bit of the pte = 0):
- pte == 0: the data has never been read from disk into memory and no virtual-to-physical mapping exists; allocate or read in the page.
- pte non-zero, file bit set: non-linear mapping, the remaining pte bits give the page's offset within the file; read it in through the page cache.
- pte non-zero, file bit clear: the anonymous page was swapped out, the remaining pte bits give its offset in the swap area; read it back from swap.
Physical page present, VMA writable but pte write-protected: created by forking a new process; copy-on-write duplicates the page frame and then remaps the pte.

4-1 Copy-on-write in detail
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pte_t *page_table, pmd_t *pmd,
        spinlock_t *ptl, pte_t orig_pte)
{
    struct page *old_page, *new_page;
    pte_t entry;
    int reuse = 0, ret = 0;
    int page_mkwrite = 0;
    struct page *dirty_page = NULL;
    old_page = vm_normal_page(vma, address, orig_pte);  /* get the struct page of the physical page currently mapped here */
    if (!old_page) {
        /*
         * VM_MIXEDMAP !pfn_valid() case
         *
         * We should not cow pages in a shared writeable mapping.
         * Just mark the pages writable as we can't do any dirty
         * accounting on raw pfn maps.
         */
        if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                     (VM_WRITE|VM_SHARED))
            goto reuse;
        goto gotten;
    }
    /*
     * Take out anonymous pages first, anonymous shared vmas are
     * not dirty accountable.
     */
    if (PageAnon(old_page) && !PageKsm(old_page)) {
        if (!trylock_page(old_page)) {
            page_cache_get(old_page);
            pte_unmap_unlock(page_table, ptl);
            lock_page(old_page);
            page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
            if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
                page_cache_release(old_page);
                goto unlock;
            }
            page_cache_release(old_page);
        }
        reuse = reuse_swap_page(old_page);
        unlock_page(old_page);
    } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                    (VM_WRITE|VM_SHARED))) {
        /*
         * Only catch write-faults on shared writable pages,
         * read-only shared pages can get COWed by
         * get_user_pages(.write=1, .force=1).
         */
        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
            struct vm_fault vmf;
            int tmp;
            vmf.virtual_address = (void __user *)(address & PAGE_MASK);
            vmf.pgoff = old_page->index;
            vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
            vmf.page = old_page;
            /*
             * Notify the address space that the page is about to
             * become writable so that it can prohibit this or wait
             * for the page to get into an appropriate state.
             *
             * We do this without the lock held, so that it can
             * sleep if it needs to.
             */
            page_cache_get(old_page);
            pte_unmap_unlock(page_table, ptl);
            tmp = vma->vm_ops->page_mkwrite(vma, &vmf);     /* shared writable file mapping: the page is not copied, the filesystem is only told it is about to become writable */
            if (unlikely(tmp &
                    (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                ret = tmp;
                goto unwritable_page;
            }
            if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
                lock_page(old_page);
                if (!old_page->mapping) {
                    ret = 0; /* retry the fault */
                    unlock_page(old_page);
                    goto unwritable_page;
                }
            } else
                VM_BUG_ON(!PageLocked(old_page));
            /*
             * Since we dropped the lock we need to revalidate
             * the PTE as someone else may have changed it. If
             * they did, we just return, as we can count on the
             * MMU to tell us if they didn't also make it writable.
             */
            page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
            if (!pte_same(*page_table, orig_pte)) {     /* the process may have slept above, so re-check the pte here */
                unlock_page(old_page);
                page_cache_release(old_page);
                goto unlock;
            }
            page_mkwrite = 1;
        }
        dirty_page = old_page;
        get_page(dirty_page);
        reuse = 1;
    }

    if (reuse) {
reuse:
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = pte_mkyoung(orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (ptep_set_access_flags(vma, address, page_table, entry, 1))
            update_mmu_cache(vma, address, entry);
        ret |= VM_FAULT_WRITE;
        goto unlock;
    }
    /*
     * Ok, we need to copy. Oh, well..
     */
    page_cache_get(old_page);
gotten:
    pte_unmap_unlock(page_table, ptl);

    if (unlikely(anon_vma_prepare(vma)))
        goto oom;

    if (is_zero_pfn(pte_pfn(orig_pte))) {
        new_page = alloc_zeroed_user_highpage_movable(vma, address);
        if (!new_page)
            goto oom;
    } else {
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);  /* allocate a new page frame */
        if (!new_page)
            goto oom;
        cow_user_page(new_page, old_page, address, vma);    /* copy the contents of the old page into the new one */
    }
    __SetPageUptodate(new_page);    /* set the up-to-date bit in page->flags */
    /*
     * Don't let another task, with possibly unlocked vma,
     * keep the mlocked page.
     */
    if ((vma->vm_flags & VM_LOCKED) && old_page) {
        lock_page(old_page);    /* for LRU manipulation */
        clear_page_mlock(old_page);
        unlock_page(old_page);
    }

    if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
        goto oom_free_new;
    /*
     * Re-check the pte - we dropped the lock
     */
    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    if (likely(pte_same(*page_table, orig_pte))) {
        if (old_page) {
            if (!PageAnon(old_page)) {
                dec_mm_counter(mm, file_rss);
                inc_mm_counter(mm, anon_rss);
            }
        } else
            inc_mm_counter(mm, anon_rss);
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = mk_pte(new_page, vma->vm_page_prot);    /* build the pte value for the new page */
        entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* restore the writable (and dirty) bits in the pte */
        /*
         * Clear the pte entry and flush it first, before updating the
         * pte with the new entry. This will avoid a race condition
         * seen in the presence of one thread doing SMC and another
         * thread doing COW.
         */
        ptep_clear_flush(vma, address, page_table);
        page_add_new_anon_rmap(new_page, vma, address);     /* hook the new page into the anonymous reverse mapping of this vm_area_struct */
        /*
         * We call the notify macro here because, when using secondary
         * mmu page tables (such as kvm shadow page tables), we want the
         * new page to be mapped directly into the secondary page table.
         */
        set_pte_at_notify(mm, address, page_table, entry);  /* install the new pte for the faulting process */
        update_mmu_cache(vma, address, entry);
        if (old_page) {
            page_remove_rmap(old_page);     /* drop this process's reverse mapping to the old read-only page; its _mapcount is decremented */
        }
        /* Free the old page.. */
        new_page = old_page;
        ret |= VM_FAULT_WRITE;
    } else
        mem_cgroup_uncharge_page(new_page);

    if (new_page)
        page_cache_release(new_page);
    if (old_page)
        page_cache_release(old_page);
unlock:
    pte_unmap_unlock(page_table, ptl);
    if (dirty_page) {
        /*
         * Yes, Virginia, this is actually required to prevent a race
         * with clear_page_dirty_for_io() from clearing the page dirty
         * bit after it clear all dirty ptes, but before a racing
         * do_wp_page installs a dirty pte.
         *
         * do_no_page is protected similarly.
         */
        if (!page_mkwrite) {
            wait_on_page_locked(dirty_page);
            set_page_dirty_balance(dirty_page, page_mkwrite);
        }
        put_page(dirty_page);
        if (page_mkwrite) {
            struct address_space *mapping = dirty_page->mapping;
            set_page_dirty(dirty_page);
            unlock_page(dirty_page);
            page_cache_release(dirty_page);
            if (mapping) {
                /*
                 * Some device drivers do not set page.mapping
                 * but still dirty their pages
                 */
                balance_dirty_pages_ratelimited(mapping);
            }
        }
        /* file_update_time outside page_lock */
        if (vma->vm_file)
            file_update_time(vma->vm_file);
    }
    return ret;
    ...
}

How the reverse-mapping data structure for the new physical page is set up: page_add_new_anon_rmap(new_page, vma, address) ends up in:

/**
 * __page_set_anon_rmap - setup new anonymous rmap
 * @page:    the page to add the mapping to
 * @vma:     the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_set_anon_rmap(struct page *page,
    struct vm_area_struct *vma, unsigned long address)
{
    struct anon_vma *anon_vma = vma->anon_vma;
    BUG_ON(!anon_vma);
    anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
    /*
     * nr_mapped state can be updated without turning off
     * interrupts because it is not modified via interrupt.
     */
    __inc_zone_page_state(page, NR_ANON_PAGES);
}

Analysis: a vm_area_struct covers many physical pages. When the first physical page of an anonymous region is mapped in, an anon_vma list head is created so that all sharers of the page can be found later; every vm_area_struct sharing the region points to it through its anon_vma field. When further physical pages are mapped into the same linear address range, their page->mapping is simply pointed at the same anon_vma. Reverse-mapping such a page then means walking the anon_vma list to reach every vm_area_struct, and hence every process, that shares the anonymous mapping; this works because the pages belong to the same virtual memory region, and it is an "object-based" form of reverse mapping.

Whether a page belongs to a file mapping or an anonymous mapping is encoded in page->mapping: if the low bits are 0, the pointer refers to an address_space structure, i.e. the page is mapped from a file, and the page's VMAs are found in the address_space's priority search tree (prio_tree_root) using page->index, the page's offset within the file; if the lowest bit (PAGE_MAPPING_ANON) is 1, the pointer (with that bit masked off) refers to an anon_vma structure, the list head of all virtual memory regions sharing this anonymous physical page, which lets the kernel walk every such region.
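A minimal sketch of how that encoding is read back, modelled on the kernel's own PageAnon() test (PAGE_MAPPING_ANON is the real flag used above; the helper names anon_vma_of() and file_mapping_of() are made up here purely for illustration):

/* Sketch only: decoding the low bit of page->mapping. */
static inline int page_is_anon(struct page *page)
{
    /* same test PageAnon() performs in the kernel */
    return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

static inline struct anon_vma *anon_vma_of(struct page *page)
{
    /* valid only when page_is_anon(page): mask off the tag bit */
    return (struct anon_vma *)((unsigned long)page->mapping &
                               ~PAGE_MAPPING_ANON);
}

static inline struct address_space *file_mapping_of(struct page *page)
{
    /* valid only when !page_is_anon(page): the pointer is used as-is,
     * and page->index gives the page's offset within the file */
    return page->mapping;
}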