Key point: user-mode page faults are generally caused by one of four situations:
1. The physical page is not present and the PTE is zero: the data has never been read from disk into physical memory and no virtual-to-physical mapping exists yet; the page must be newly allocated or read in.
2. The physical page is not present and the PTE's file bit is set (checked by pte_file()): this is a non-linear file mapping, and the remaining PTE bits record the page's offset within the file; the page is read in through the page cache.
3. The physical page is not present, the PTE is non-zero and the file bit is clear: the page is an anonymous page that was swapped out, and the remaining PTE bits record its offset in the swap area; it is read back in from swap.
4. The physical page is present, the VMA is writable, but the PTE is write-protected: this is the copy-on-write case created when a new process is forked. Whenever either the parent or the child tries to write a shared page frame, the CPU raises a fault; the kernel copies the frame to a new one, points the faulting process's PTE at the copy and marks it writable, and decrements the reference count of the original frame, which stays write-protected. When the other process later writes the page, the kernel checks whether that process is now the sole user of the frame; if so, it simply marks the frame writable for that process by updating its PTE, so the page can be written without another copy. (A small user-space demonstration follows below.)
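To make the copy-on-write case concrete, here is a minimal user-space sketch (written for this write-up, not taken from the kernel source): it maps a private anonymous page, forks, and writes to the page in the child. The child's write takes the write-protection fault path ending in do_wp_page() described below, and afterwards parent and child see independent copies.

/* cow_demo.c - minimal sketch of copy-on-write behaviour after fork(). */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>

int main(void)
{
    /* Private anonymous mapping: one page, shared read-only after fork(). */
    char *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED) { perror("mmap"); return 1; }

    strcpy(buf, "parent data");

    pid_t pid = fork();
    if (pid == 0) {
        /* Child: this write faults, the kernel copies the page frame
         * (do_wp_page) and redirects the child's PTE to the new copy. */
        strcpy(buf, "child data");
        printf("child  sees: %s\n", buf);
        _exit(0);
    }

    wait(NULL);
    /* Parent still sees its own data: the child's write did not touch
     * the parent's page frame. */
    printf("parent sees: %s\n", buf);
    munmap(buf, 4096);
    return 0;
}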
1 The page fault handler. The error_code argument describes the fault, and the faulting virtual address is saved in the CR2 register. error_code bit layout:
bit 0: 0 = page not present, 1 = protection violation
bit 1: 0 = read access, 1 = write access
bit 2: 0 = fault in kernel mode, 1 = fault in user mode
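The PF_* masks tested in the code below correspond to these bits. In this era of the kernel they are defined in arch/x86/mm/fault.c roughly as follows (PF_RSVD and PF_INSTR cover the reserved-bit and instruction-fetch conditions not listed in the table above):

enum x86_pf_error_code {
    PF_PROT  = 1 << 0,  /* 0: no page found, 1: protection fault */
    PF_WRITE = 1 << 1,  /* 0: read access,   1: write access */
    PF_USER  = 1 << 2,  /* 0: kernel-mode,   1: user-mode access */
    PF_RSVD  = 1 << 3,  /* use of a reserved page-table bit detected */
    PF_INSTR = 1 << 4,  /* fault was an instruction fetch */
};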
/*
 * This routine handles page faults. It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 */
dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
    struct vm_area_struct *vma;
    struct task_struct *tsk;
    unsigned long address;
    struct mm_struct *mm;
    int write;
    int fault;

    tsk = current;
    mm = tsk->mm;
    /* Get the faulting address: */
    address = read_cr2();   /* the faulting address is saved in the CR2 register */
    /*
     * read_cr2() ultimately reads the CR2 register:
     *
     *   static inline unsigned long native_read_cr2(void)
     *   {
     *       unsigned long val;
     *       asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
     *       return val;
     *   }
     */

    /*
     * Detect and handle instructions that would cause a page fault for
     * both a tracked kernel page and a userspace page.
     */
    if (kmemcheck_active(regs))
        kmemcheck_hide(regs);
    prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address))) return;
    /*
     * We fault-in kernel-space virtual memory on-demand. The
     * 'reference' page table is init_mm.pgd.
     *
     * NOTE! We MUST NOT take any locks for this case. We may
     * be in an interrupt or a critical region, and should
     * only copy the information from the master page table,
     * nothing more.
     *
     * This verifies that the fault happens in kernel space
     * (error_code & 4) == 0, and that the fault was not a
     * protection error (error_code & 9) == 0.
     */
    if (unlikely(fault_in_kernel_space(address))) {     /* the faulting address lies in kernel space */
        if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {    /* fault raised in kernel mode and not caused by a protection error */
            if (vmalloc_fault(address) >= 0)    /* vmalloc fault: synchronize the process's kernel page tables with the master (init) page tables */
                return;
            if (kmemcheck_fault(regs, address, error_code))
                return;
        }

        /* Can handle a stale RO->RW TLB: */
        if (spurious_fault(error_code, address))
            return;

        /* kprobes don't want to hook the spurious faults: */
        if (notify_page_fault(regs))
            return;
        /*
         * Don't take the mm semaphore here. If we fixup a prefetch
         * fault we could otherwise deadlock:
         */
        bad_area_nosemaphore(regs, error_code, address);

        return;
    }
    /* kprobes don't want to hook the spurious faults: */
    if (unlikely(notify_page_fault(regs)))
        return;
    /*
     * It's safe to allow irq's after cr2 has been saved and the
     * vmalloc fault has been handled.
     *
     * User-mode registers count as a user access even for any
     * potential system fault or CPU buglet:
     */
    if (user_mode_vm(regs)) {
        local_irq_enable();
        error_code |= PF_USER;
    } else {
        if (regs->flags & X86_EFLAGS_IF)
            local_irq_enable();
    }

    if (unlikely(error_code & PF_RSVD))
        pgtable_bad(regs, error_code, address);
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
    /*
     * If we're in an interrupt, have no user context or are running
     * in an atomic region then we must not take the fault:
     */
    if (unlikely(in_atomic() || !mm)) {     /* in an atomic context, or no mm: a pure kernel thread has no user address space, so the fault cannot be handled here */
        bad_area_nosemaphore(regs, error_code, address);
        return;
    }
    /*
     * When running in the kernel we expect faults to occur only to
     * addresses in user space. All other faults represent errors in
     * the kernel and should generate an OOPS. Unfortunately, in the
     * case of an erroneous fault occurring in a code path which already
     * holds mmap_sem we will deadlock attempting to validate the fault
     * against the address space. Luckily the kernel only validly
     * references user space from well defined areas of code, which are
     * listed in the exceptions table.
     *
     * As the vast majority of faults will be valid we will only perform
     * the source reference check when there is a possibility of a
     * deadlock. Attempt to lock the address space, if we cannot we then
     * validate the source. If this is invalid we can skip the address
     * space check, thus avoiding the deadlock:
     */
    if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
        if ((error_code & PF_USER) == 0 &&
            !search_exception_tables(regs->ip)) {
            bad_area_nosemaphore(regs, error_code, address);
            return;
        }
        down_read(&mm->mmap_sem);
    } else {
        /*
         * The above down_read_trylock() might have succeeded in
         * which case we'll have missed the might_sleep() from
         * down_read():
         */
        might_sleep();
    }
    vma = find_vma(mm, address);    /* the fault is in user space and the process has an address space: look up the VMA */
    if (unlikely(!vma)) {   /* the address is not mapped into any VMA of the process: unmapped-address error */
        bad_area(regs, error_code, address);
        return;
    }
    if (likely(vma->vm_start <= address))   /* the address lies inside this VMA */
        goto good_area;
    if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
        bad_area(regs, error_code, address);
        return;
    }
    if (error_code & PF_USER) {
        /*
         * Accessing the stack below %sp is always a bug.
         * The large cushion allows instructions like enter
         * and pusha to work. ("enter $65535, $31" pushes
         * 32 pointers and then decrements %sp by 65535.)
         */
        if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
            bad_area(regs, error_code, address);
            return;
        }
    }
    if (unlikely(expand_stack(vma, address))) {     /* grow the stack VMA downwards to cover the address */
        bad_area(regs, error_code, address);
        return;
    }
    /*
     * Ok, we have a good vm_area for this memory access, so
     * we can handle it..
     */
good_area:
    write = error_code & PF_WRITE;

    if (unlikely(access_error(error_code, write, vma))) {   /* a write fault on a VMA that may not be written, or a read fault on a VMA that allows neither read, write nor execute: access error */
        bad_area_access_error(regs, error_code, address);
        return;
    }
    /*
     * If for any reason at all we couldn't handle the fault,
     * make sure we exit gracefully rather than endlessly redo
     * the fault:
     */
    /* the permission checks have passed; handle_mm_fault() does the real work */
    fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0);
    if (unlikely(fault & VM_FAULT_ERROR)) {     /* resolving the fault can itself fail, e.g. while allocating the new page */
        mm_fault_error(regs, error_code, address, fault);
        return;
    }
    if (fault & VM_FAULT_MAJOR) {   /* major fault: the data had to be read from a block device */
        tsk->maj_flt++;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address);
    } else {    /* minor fault: the data was already in memory */
        tsk->min_flt++;
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address);
    }
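    /*
     * Note added for this write-up (not part of the kernel source): the
     * maj_flt/min_flt counters incremented here are visible from user
     * space, e.g. as ru_majflt/ru_minflt returned by
     * getrusage(RUSAGE_SELF, &ru), or in /proc/<pid>/stat, which is a
     * convenient way to observe whether an access took the major or the
     * minor fault path.
     */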
check_v8086_mode(regs, address, tsk);
    up_read(&mm->mmap_sem);
}

2 If the faulting address lies in kernel space, the fault was raised in kernel mode and was not caused by a protection error, the process's kernel page tables are repaired from the reference (init) page tables by calling vmalloc_fault(address) in arch/x86/mm/fault.c. That function first reads the current process's page directory from the CR3 register, then looks up the corresponding entry in the init process's page directory (swapper_pg_dir), and copies it into the entry that caused the fault.

3 If the fault occurred in user space, a vm_area_struct exists that covers the faulting address, and that VMA permits the attempted access (read or write, as encoded in error_code), the fault is resolved by handle_mm_fault:

/*
 * By the time we get here, we already hold the mm semaphore
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, unsigned int flags)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;
__set_current_state(TASK_RUNNING);
    count_vm_event(PGFAULT);    /* bump the global page-fault statistics counter */
    if (unlikely(is_vm_hugetlb_page(vma)))
        return hugetlb_fault(mm, vma, address, flags);

    pgd = pgd_offset(mm, address);
    pud = pud_alloc(mm, pgd, address);
    if (!pud)
        return VM_FAULT_OOM;
    pmd = pmd_alloc(mm, pud, address);
    if (!pmd)
        return VM_FAULT_OOM;
    pte = pte_alloc_map(mm, pmd, address);  /* walk (and if necessary allocate) the page-table levels to obtain the pte entry for this virtual address */
    if (!pte)
        return VM_FAULT_OOM;
    return handle_pte_fault(mm, vma, address, pte, pmd, flags);    /* establish the mapping between the physical page and this pte entry */
}

4 Fixing up user-space page faults

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures). The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_sem (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_sem still held, but pte unmapped and unlocked.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
        struct vm_area_struct *vma, unsigned long address,
        pte_t *pte, pmd_t *pmd, unsigned int flags)
{
    pte_t entry;
    spinlock_t *ptl;
    entry = *pte;
    if (!pte_present(entry)) {  /* present bit (bit 0) clear: the physical page is not in memory */
        if (pte_none(entry)) {  /* case 1: the pte is completely empty, the page has never been set up, load it from scratch */
            if (vma->vm_ops) {
                if (likely(vma->vm_ops->fault))
                    return do_linear_fault(mm, vma, address,
                            pte, pmd, flags, entry);
            }
            return do_anonymous_page(mm, vma, address,
                            pte, pmd, flags);   /* fall back to the anonymous-page handler */
        }
        if (pte_file(entry))    /* case 2: the file bit is set and the pte carries a file offset: non-linear mapping */
            return do_nonlinear_fault(mm, vma, address,
                    pte, pmd, flags, entry);
        /* case 3: the pte is non-empty but the file bit is clear: the page was swapped out, read it back from the swap area */
        return do_swap_page(mm, vma, address,
                    pte, pmd, flags, entry);
    }
    ptl = pte_lockptr(mm, pmd);
    spin_lock(ptl);
    if (unlikely(!pte_same(*pte, entry)))
        goto unlock;
    /* case 4: the VMA is writable but the pte is not: copy-on-write, copy the page and remap */
    if (flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry))
            return do_wp_page(mm, vma, address,
                    pte, pmd, ptl, entry);  /* break the write protection (copy-on-write) */
        entry = pte_mkdirty(entry);     /* both the VMA and the pte allow writing: just mark the page dirty */
    }
    entry = pte_mkyoung(entry);
    if (ptep_set_access_flags(vma, address, pte, entry,
                    flags & FAULT_FLAG_WRITE)) {
        update_mmu_cache(vma, address, entry);
    } else {    /* the fault was a pure protection fault */
        /*
         * This is needed only for protection faults but the arch code
         * is not yet telling us if this is a protection fault or not.
         * This still avoids useless tlb flushes for .text page faults
         * with threads.
         */
        if (flags & FAULT_FLAG_WRITE)
            flush_tlb_page(vma, address);
    }
unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
}

Return values:
VM_FAULT_MAJOR: the fault forced the current process to sleep, typically because data had to be read in from disk
VM_FAULT_MINOR: the fault was handled without blocking the current process
VM_FAULT_OOM: not enough memory
VM_FAULT_SIGBUS: any other error

4-0 From the user-space fault handling above, the four causes of a page fault can be seen. Physical page not present (present bit of the pte = 0):
- pte == 0: the data has never been read from disk into memory and no virtual-to-physical mapping exists; allocate or read in the page.
- pte non-zero, file bit set: non-linear mapping, the remaining pte bits give the page's offset within the file; read it in through the page cache.
- pte non-zero, file bit clear: the anonymous page was swapped out, the remaining pte bits give its offset in the swap area; read it back from swap.
Physical page present, VMA writable but pte write-protected: created by forking a new process; copy-on-write duplicates the page frame and then remaps the pte.

4-1 Copy-on-write in detail
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pte_t *page_table, pmd_t *pmd,
        spinlock_t *ptl, pte_t orig_pte)
{
    struct page *old_page, *new_page;
    pte_t entry;
    int reuse = 0, ret = 0;
    int page_mkwrite = 0;
    struct page *dirty_page = NULL;
    old_page = vm_normal_page(vma, address, orig_pte);  /* get the struct page of the physical page currently mapped here */
    if (!old_page) {
        /*
         * VM_MIXEDMAP !pfn_valid() case
         *
         * We should not cow pages in a shared writeable mapping.
         * Just mark the pages writable as we can't do any dirty
         * accounting on raw pfn maps.
         */
        if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                     (VM_WRITE|VM_SHARED))
            goto reuse;
        goto gotten;
    }
    /*
     * Take out anonymous pages first, anonymous shared vmas are
     * not dirty accountable.
     */
    if (PageAnon(old_page) && !PageKsm(old_page)) {
        if (!trylock_page(old_page)) {
            page_cache_get(old_page);
            pte_unmap_unlock(page_table, ptl);
            lock_page(old_page);
            page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
            if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
                page_cache_release(old_page);
                goto unlock;
            }
            page_cache_release(old_page);
        }
        reuse = reuse_swap_page(old_page);
        unlock_page(old_page);
    } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                    (VM_WRITE|VM_SHARED))) {
        /*
         * Only catch write-faults on shared writable pages,
         * read-only shared pages can get COWed by
         * get_user_pages(.write=1, .force=1).
         */
        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
            struct vm_fault vmf;
            int tmp;
            vmf.virtual_address = (void __user *)(address & PAGE_MASK);
            vmf.pgoff = old_page->index;
            vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
            vmf.page = old_page;
            /*
             * Notify the address space that the page is about to
             * become writable so that it can prohibit this or wait
             * for the page to get into an appropriate state.
             *
             * We do this without the lock held, so that it can
             * sleep if it needs to.
             */
            page_cache_get(old_page);
            pte_unmap_unlock(page_table, ptl);
            tmp = vma->vm_ops->page_mkwrite(vma, &vmf);     /* shared writable file mapping: the page is not copied, the filesystem is only told it is about to become writable */
            if (unlikely(tmp &
                    (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                ret = tmp;
                goto unwritable_page;
            }
            if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
                lock_page(old_page);
                if (!old_page->mapping) {
                    ret = 0; /* retry the fault */
                    unlock_page(old_page);
                    goto unwritable_page;
                }
            } else
                VM_BUG_ON(!PageLocked(old_page));
            /*
             * Since we dropped the lock we need to revalidate
             * the PTE as someone else may have changed it. If
             * they did, we just return, as we can count on the
             * MMU to tell us if they didn't also make it writable.
             */
            page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
            if (!pte_same(*page_table, orig_pte)) {     /* the process may have slept above, so re-check the pte here */
                unlock_page(old_page);
                page_cache_release(old_page);
                goto unlock;
            }
            page_mkwrite = 1;
        }
        dirty_page = old_page;
        get_page(dirty_page);
        reuse = 1;
    }

    if (reuse) {
reuse:
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = pte_mkyoung(orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (ptep_set_access_flags(vma, address, page_table, entry, 1))
            update_mmu_cache(vma, address, entry);
        ret |= VM_FAULT_WRITE;
        goto unlock;
    }
    /*
     * Ok, we need to copy. Oh, well..
     */
    page_cache_get(old_page);
gotten:
    pte_unmap_unlock(page_table, ptl);

    if (unlikely(anon_vma_prepare(vma)))
        goto oom;

    if (is_zero_pfn(pte_pfn(orig_pte))) {
        new_page = alloc_zeroed_user_highpage_movable(vma, address);
        if (!new_page)
            goto oom;
    } else {
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);  /* allocate a new page frame */
        if (!new_page)
            goto oom;
        cow_user_page(new_page, old_page, address, vma);    /* copy the contents of the old page into the new one */
    }
    __SetPageUptodate(new_page);    /* set the up-to-date bit in page->flags */
    /*
     * Don't let another task, with possibly unlocked vma,
     * keep the mlocked page.
     */
    if ((vma->vm_flags & VM_LOCKED) && old_page) {
        lock_page(old_page);    /* for LRU manipulation */
        clear_page_mlock(old_page);
        unlock_page(old_page);
    }

    if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
        goto oom_free_new;
    /*
     * Re-check the pte - we dropped the lock
     */
    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    if (likely(pte_same(*page_table, orig_pte))) {
        if (old_page) {
            if (!PageAnon(old_page)) {
                dec_mm_counter(mm, file_rss);
                inc_mm_counter(mm, anon_rss);
            }
        } else
            inc_mm_counter(mm, anon_rss);
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = mk_pte(new_page, vma->vm_page_prot);    /* build the pte value for the new page */
        entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* restore the writable (and dirty) bits in the pte */
        /*
         * Clear the pte entry and flush it first, before updating the
         * pte with the new entry. This will avoid a race condition
         * seen in the presence of one thread doing SMC and another
         * thread doing COW.
         */
        ptep_clear_flush(vma, address, page_table);
        page_add_new_anon_rmap(new_page, vma, address);     /* hook the new page into the anonymous reverse mapping of this vm_area_struct */
        /*
         * We call the notify macro here because, when using secondary
         * mmu page tables (such as kvm shadow page tables), we want the
         * new page to be mapped directly into the secondary page table.
         */
        set_pte_at_notify(mm, address, page_table, entry);  /* install the new pte for the faulting process */
        update_mmu_cache(vma, address, entry);
        if (old_page) {
            page_remove_rmap(old_page);     /* drop this process's reverse mapping to the old read-only page; its _mapcount is decremented */
        }
        /* Free the old page.. */
        new_page = old_page;
        ret |= VM_FAULT_WRITE;
    } else
        mem_cgroup_uncharge_page(new_page);

    if (new_page)
        page_cache_release(new_page);
    if (old_page)
        page_cache_release(old_page);
unlock:
    pte_unmap_unlock(page_table, ptl);
    if (dirty_page) {
        /*
         * Yes, Virginia, this is actually required to prevent a race
         * with clear_page_dirty_for_io() from clearing the page dirty
         * bit after it clear all dirty ptes, but before a racing
         * do_wp_page installs a dirty pte.
         *
         * do_no_page is protected similarly.
         */
        if (!page_mkwrite) {
            wait_on_page_locked(dirty_page);
            set_page_dirty_balance(dirty_page, page_mkwrite);
        }
        put_page(dirty_page);
        if (page_mkwrite) {
            struct address_space *mapping = dirty_page->mapping;
            set_page_dirty(dirty_page);
            unlock_page(dirty_page);
            page_cache_release(dirty_page);
            if (mapping) {
                /*
                 * Some device drivers do not set page.mapping
                 * but still dirty their pages
                 */
                balance_dirty_pages_ratelimited(mapping);
            }
        }
        /* file_update_time outside page_lock */
        if (vma->vm_file)
            file_update_time(vma->vm_file);
    }
    return ret;
    ...
}

How the reverse-mapping data structure for the new physical page is set up: page_add_new_anon_rmap(new_page, vma, address) ends up in:

/**
 * __page_set_anon_rmap - setup new anonymous rmap
 * @page:    the page to add the mapping to
 * @vma:     the vm area in which the mapping is added
 * @address: the user virtual address mapped
 */
static void __page_set_anon_rmap(struct page *page,
    struct vm_area_struct *vma, unsigned long address)
{
    struct anon_vma *anon_vma = vma->anon_vma;
    BUG_ON(!anon_vma);
    anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
    page->mapping = (struct address_space *) anon_vma;
page->index = linear_page_index(vma, address);
    /*
     * nr_mapped state can be updated without turning off
     * interrupts because it is not modified via interrupt.
     */
    __inc_zone_page_state(page, NR_ANON_PAGES);
}

Analysis: a vm_area_struct covers many physical pages. When the first physical page of an anonymous region is mapped in, an anon_vma list head is created so that all sharers of the page can be found later; every vm_area_struct sharing the region points to it through its anon_vma field. When further physical pages are mapped into the same linear address range, their page->mapping is simply pointed at the same anon_vma. Reverse-mapping such a page then means walking the anon_vma list to reach every vm_area_struct, and hence every process, that shares the anonymous mapping; this works because the pages belong to the same virtual memory region, and it is an "object-based" form of reverse mapping.

Whether a page belongs to a file mapping or an anonymous mapping is encoded in page->mapping: if the low bits are 0, the pointer refers to an address_space structure, i.e. the page is mapped from a file, and the page's VMAs are found in the address_space's priority search tree (prio_tree_root) using page->index, the page's offset within the file; if the lowest bit (PAGE_MAPPING_ANON) is 1, the pointer (with that bit masked off) refers to an anon_vma structure, the list head of all virtual memory regions sharing this anonymous physical page, which lets the kernel walk every such region.
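A minimal sketch of how that encoding is read back, modelled on the kernel's own PageAnon() test (PAGE_MAPPING_ANON is the real flag used above; the helper names anon_vma_of() and file_mapping_of() are made up here purely for illustration):

/* Sketch only: decoding the low bit of page->mapping. */
static inline int page_is_anon(struct page *page)
{
    /* same test PageAnon() performs in the kernel */
    return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}

static inline struct anon_vma *anon_vma_of(struct page *page)
{
    /* valid only when page_is_anon(page): mask off the tag bit */
    return (struct anon_vma *)((unsigned long)page->mapping &
                               ~PAGE_MAPPING_ANON);
}

static inline struct address_space *file_mapping_of(struct page *page)
{
    /* valid only when !page_is_anon(page): the pointer is used as-is,
     * and page->index gives the page's offset within the file */
    return page->mapping;
}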