在研究THP(透明大页)之前,我们先来复习一下Linux的四级页表:pgd、pud、pmd、pte,它们分别称为页全局目录、页上级目录、页中间目录、页表。下面我们先看一下怎么把一个虚拟地址(addr)转换成具体的物理页:
/*
 * Print the page-table entries that translate @address, one level at a
 * time (PGD, PUD, PMD, PTE), starting from the table that CR3 points to.
 *
 * The walk stops quietly (just terminating the line) as soon as an entry
 * is not present or, at the PUD/PMD levels, is a large mapping with no
 * lower-level table. If bad_address() rejects the pointer to any entry,
 * "BAD" is printed instead.
 */
static void dump_pagetable(unsigned long address)
{
	/* Virtual address of the top-level table currently loaded in CR3. */
	pgd_t *top_table = __va(read_cr3() & PHYSICAL_PAGE_MASK);
	pgd_t *pgd_entry = top_table + pgd_index(address);
	pud_t *pud_entry;
	pmd_t *pmd_entry;
	pte_t *pte_entry;

	if (bad_address(pgd_entry))
		goto unreadable;
	printk("PGD %lx ", pgd_val(*pgd_entry));

	if (pgd_present(*pgd_entry)) {
		pud_entry = pud_offset(pgd_entry, address);
		if (bad_address(pud_entry))
			goto unreadable;
		printk("PUD %lx ", pud_val(*pud_entry));

		/* Descend only through a present, non-large PUD entry. */
		if (pud_present(*pud_entry) && !pud_large(*pud_entry)) {
			pmd_entry = pmd_offset(pud_entry, address);
			if (bad_address(pmd_entry))
				goto unreadable;
			printk("PMD %lx ", pmd_val(*pmd_entry));

			/* Descend only through a present, non-large PMD entry. */
			if (pmd_present(*pmd_entry) && !pmd_large(*pmd_entry)) {
				pte_entry = pte_offset_kernel(pmd_entry, address);
				if (bad_address(pte_entry))
					goto unreadable;
				printk("PTE %lx", pte_val(*pte_entry));
			}
		}
	}

	/* Normal end of the dump: finish the output line. */
	printk("\n");
	return;

unreadable:
	printk("BAD\n");
}
拿到pte之后,pa = (pte_val(*pte) & PAGE_MASK) | (address & ~PAGE_MASK); 即得到物理地址(严格来说,x86_64上pte的高位还带有NX等标志位,取页帧地址时应使用PTE_PFN_MASK而不是PAGE_MASK)
在一个linux 64位操作系统当中,pgd ,pud,pmd,pte都为9位,而offset为12位。一个4k页(2的12次方)对应的页表分级图如下
同理,一个pmd项覆盖的地址范围为2的21次方字节,故对应的页大小为2MB,其分页结构如下
上面我们复习了一下Linux内存的寻址方式,接下来我们看一下如何分配出一个大页。镜头拉回到缺页处理(mm fault)的入口函数。在Linux中提供大页有两种方式:一种是hugetlbfs,另一种是THP,本文研究THP
int handle_mm_fault(struct vm_area_struct *mva, unsigned long address,
unsigned int flags)
{
int ret;
__set_current_state(TASK_RUNNING);
count_vm_event(PGFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
/*
* Enable the memcg OOM handling for faults triggered in user
* space. Kernel faults are handled more gracefully.
*/
if (flags & FAULT_FLAG_USER)
mem_cgroup_oom_enable();
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
return VM_FAULT_SIGSEGV;
if (unlikely(is_vm_hugetlb_page(vma)))
//通过hugetlbfs来提供大页
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
if (flags & FAULT_FLAG_USER) {
mem_cgroup_oom_disable();
/*
* The task may have entered a memcg OOM situation but
* if the allocation error was handled gracefully (no
* VM_FAULT_OOM), there is no need to kill anything.
* Just clean up the OOM state peacefully.
*/
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
}
/*
那么具体的调用关系如下:handle_mm_fault->__handle_mm_fault->create_huge_pmd->do_huge_pmd_anonymous_page。下面看一下do_huge_pmd_anonymous_page的具体逻辑,以下为部分代码的节选
/*
 * do_huge_pmd_anonymous_page - try to satisfy an anonymous fault with a
 * PMD-sized (huge) mapping. This is an excerpt of the kernel function.
 * @mm:      address space the fault belongs to
 * @vma:     VMA covering @address
 * @address: faulting virtual address
 * @pmd:     PMD slot that would hold the huge mapping
 * @flags:   FAULT_FLAG_* bits describing the access
 *
 * Returns VM_FAULT_FALLBACK when the huge-page-aligned range does not fit
 * inside the VMA or no huge (zero) page could be obtained, VM_FAULT_OOM
 * when a preparatory allocation fails, the handle_userfault() result (or 0)
 * on the huge-zero-page path, and otherwise whatever
 * __do_huge_pmd_anonymous_page() returns.
 */
int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
			       unsigned long address, pmd_t *pmd,
			       unsigned int flags)
{
	struct page *page;
	/*
	 * Round the faulting address down to a huge-page boundary.
	 * NOTE(review): the original annotation gives HPAGE_PMD_MASK as
	 * 0xffe00000; on a 64-bit kernel the upper bits would presumably be
	 * set as well - confirm.
	 */
	unsigned long haddr = address & HPAGE_PMD_MASK;

	/* The whole huge-page range must lie inside the VMA. */
	if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
		return VM_FAULT_FALLBACK;

	/* Prepare an anon_vma for this VMA. */
	if (unlikely(anon_vma_prepare(vma)))
		return VM_FAULT_OOM;

	/* Add the VMA to the list scanned by the khugepaged thread. */
	if (unlikely(khugepaged_enter(vma)))
		return VM_FAULT_OOM;

	/* Fault caused by a read, and use of the huge zero page is enabled. */
	if (!(flags & FAULT_FLAG_WRITE) &&
	    transparent_hugepage_use_zero_page()) {
		spinlock_t *ptl;
		pgtable_t pgtable;
		struct page *zero_page;
		bool set;
		int ret;

		pgtable = pte_alloc_one(mm, haddr);
		if (unlikely(!pgtable))
			return VM_FAULT_OOM;

		zero_page = get_huge_zero_page();
		if (unlikely(!zero_page)) {
			pte_free(mm, pgtable);
			count_vm_event(THP_FAULT_FALLBACK);
			return VM_FAULT_FALLBACK;
		}

		ptl = pmd_lock(mm, pmd);
		ret = 0;
		set = false;
		if (pmd_none(*pmd)) {
			if (userfaultfd_missing(vma)) {
				/* Drop the PMD lock before calling handle_userfault(). */
				spin_unlock(ptl);
				ret = handle_userfault(vma, address, flags,
						       VM_UFFD_MISSING);
				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
			} else {
				set_huge_zero_page(pgtable, mm, vma,
						   haddr, pmd,
						   zero_page);
				spin_unlock(ptl);
				set = true;
			}
		} else {
			/* The PMD is already populated; install nothing. */
			spin_unlock(ptl);
		}

		/* Zero page not installed: give back what was acquired above. */
		if (!set) {
			pte_free(mm, pgtable);
			put_huge_zero_page();
		}
		return ret;
	}

	/*
	 * Allocate a contiguous block of order HPAGE_PMD_ORDER (9, per the
	 * original annotation) from the buddy allocator.
	 */
	page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
				  vma, haddr, numa_node_id(), 0);
	if (unlikely(!page)) {
		count_vm_event(THP_FAULT_FALLBACK);
		return VM_FAULT_FALLBACK;
	}

	return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page,
					    flags);
}
接下来看一下__do_huge_pmd_anonymous_page核心逻辑
/*
 * Core steps of __do_huge_pmd_anonymous_page (excerpt; the enclosing
 * function and its declarations are not shown).
 */
/* Build a huge PMD entry for the page from the VMA's page protection. */
entry = mk_huge_pmd(page, vma->vm_page_prot);
/* Mark the entry dirty; maybe_pmd_mkwrite() presumably adds write permission only if the VMA allows it. */
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
/* Register the new anonymous page in the reverse map for this VMA at haddr. */
page_add_new_anon_rmap(page, vma, haddr, true);
/* Commit the page's charge to its memory cgroup. */
mem_cgroup_commit_charge(page, memcg, false, true);
/* Put the page on the active or the unevictable LRU list. */
lru_cache_add_active_or_unevictable(page, vma);
/* Deposit the pre-allocated page table alongside the huge PMD. */
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
/* Finally install the entry into the PMD slot. */
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
1.生成带_PAGE_PSE标志的pmd项 2.为pmd项设置PAGE_DIRTY和PAGE_RW 3.将page与vma的anon_vma建立反向映射(rmap) 4.向mem cgroup提交一个page的消费(charge) 5.将page加入到active或者unevictable lru上 6.存储(deposit)hugepage对应的页表 7.将pmd的值更新为entry
1.4k页使用四级页表,而2M页只用到三级页表。故4k页的page_shift为12,2M页的page_shift为21 2.THP的生成依赖于系统中的连续内存,具体是order为9的连续内存。所以,如果没有足够的连续内存,也就无法生成大页