A Study of THP Huge Pages

The Linux multi-level page table structure

Before digging into THP (transparent huge pages), let's review the four levels of Linux page tables: pgd, pud, pmd, and pte, i.e. the Page Global Directory, Page Upper Directory, Page Middle Directory, and Page Table. First, let's see how a virtual address is walked down to a concrete physical page, using dump_pagetable() from the x86 fault-handling code (arch/x86/mm/fault.c):

static void dump_pagetable(unsigned long address)
{
    /* CR3 holds the physical address of the pgd; convert it to a
     * virtual address and index it with the top 9 address bits. */
    pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
    pgd_t *pgd = base + pgd_index(address);
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    if (bad_address(pgd))
        goto bad;

    printk("PGD %lx ", pgd_val(*pgd));

    if (!pgd_present(*pgd))
        goto out;

    pud = pud_offset(pgd, address);
    if (bad_address(pud))
        goto bad;

    printk("PUD %lx ", pud_val(*pud));
    /* a large pud maps a 1G page: the walk stops at this level */
    if (!pud_present(*pud) || pud_large(*pud))
        goto out;

    pmd = pmd_offset(pud, address);
    if (bad_address(pmd))
        goto bad;

    printk("PMD %lx ", pmd_val(*pmd));
    /* a large pmd maps a 2M page: there is no pte level below it */
    if (!pmd_present(*pmd) || pmd_large(*pmd))
        goto out;

    pte = pte_offset_kernel(pmd, address);
    if (bad_address(pte))
        goto bad;

    printk("PTE %lx", pte_val(*pte));
out:
    printk("\n");
    return;
bad:
    printk("BAD\n");
}

Once we have the pte, the physical address follows directly: pa = (pte_val(*pte) & PAGE_MASK) | (address & ~PAGE_MASK);
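To make the arithmetic concrete, here is a small user-space sketch (plain C, my illustration rather than kernel code) that applies the same formula to a hypothetical pte value and faulting address, assuming 4 KB pages:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_MASK  (~((1UL << PAGE_SHIFT) - 1))

int main(void)
{
    /* Hypothetical values, for illustration only. */
    unsigned long pte  = 0x00000000a3d45867UL; /* frame number + flag bits */
    unsigned long addr = 0x00007f36c9b21abcUL; /* faulting virtual address */

    /* Mask the low flag bits out of the pte, keep the in-page offset
     * from the virtual address, and OR the two together -- exactly the
     * formula above. (A real x86-64 pte also carries high bits such as
     * NX, which the kernel strips with PTE_PFN_MASK; ignored here.) */
    unsigned long pa = (pte & PAGE_MASK) | (addr & ~PAGE_MASK);

    printf("pa = 0x%lx\n", pa); /* prints pa = 0xa3d45abc */
    return 0;
}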

How 2M huge pages and 4k pages differ in the Linux page table hierarchy

On 64-bit Linux with 4-level paging, the pgd, pud, pmd, and pte indices are 9 bits each and the in-page offset is 12 bits, so a 48-bit virtual address splits as 9 + 9 + 9 + 9 + 12. A 4k page (2^12 bytes) is resolved through all four levels.

By the same logic, one pmd entry covers 2^21 bytes (the 9 pte bits merge into the offset), so the corresponding page size is 2 MB and the walk stops at the pmd level.
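As a quick illustration (a user-space sketch assuming the 48-bit, 4-level layout described above), the indices for both page sizes can be extracted like this:

#include <stdio.h>

int main(void)
{
    unsigned long addr = 0x00007f36c9b21abcUL; /* hypothetical address */

    /* 4k page: four 9-bit indices plus a 12-bit offset. */
    printf("4k: pgd=%lu pud=%lu pmd=%lu pte=%lu off=0x%lx\n",
           (addr >> 39) & 0x1ff, (addr >> 30) & 0x1ff,
           (addr >> 21) & 0x1ff, (addr >> 12) & 0x1ff,
           addr & 0xfffUL);

    /* 2M page: the pte level disappears; its 9 bits merge into the
     * offset, giving a 21-bit in-page offset (2^21 = 2 MB). */
    printf("2M: pgd=%lu pud=%lu pmd=%lu off=0x%lx\n",
           (addr >> 39) & 0x1ff, (addr >> 30) & 0x1ff,
           (addr >> 21) & 0x1ff, addr & 0x1fffffUL);
    return 0;
}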

Allocating a THP

Having reviewed how Linux resolves addresses, let's look at how a huge page is allocated, starting back at the memory-fault entry point. Linux provides huge pages in two ways: hugetlbfs and THP; this article studies THP.

int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
        unsigned int flags)
{
    int ret;

    __set_current_state(TASK_RUNNING);

    count_vm_event(PGFAULT);
    mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);

    /* do counter updates before entering really critical section. */
    check_sync_rss_stat(current);

    /*
     * Enable the memcg OOM handling for faults triggered in user
     * space.  Kernel faults are handled more gracefully.
     */
    if (flags & FAULT_FLAG_USER)
        mem_cgroup_oom_enable();

    if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                        flags & FAULT_FLAG_INSTRUCTION,
                        flags & FAULT_FLAG_REMOTE))
        return VM_FAULT_SIGSEGV;

    if (unlikely(is_vm_hugetlb_page(vma)))
        /* huge pages provided by hugetlbfs */
        ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
    else
        ret = __handle_mm_fault(vma, address, flags);

    if (flags & FAULT_FLAG_USER) {
        mem_cgroup_oom_disable();
        /*
         * The task may have entered a memcg OOM situation but
         * if the allocation error was handled gracefully (no
         * VM_FAULT_OOM), there is no need to kill anything.
         * Just clean up the OOM state peacefully.
         */
        if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
            mem_cgroup_oom_synchronize(false);
    }

    return ret;
}

The concrete call chain is: handle_mm_fault -> __handle_mm_fault -> create_huge_pmd -> do_huge_pmd_anonymous_page. Let's look at the logic of do_huge_pmd_anonymous_page; the following is a partial excerpt.
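Between those two ends of the chain, the decision to try the huge-page path first looks roughly like the following condensed sketch (paraphrased; the exact form varies across kernel versions): when the pmd entry is still empty and the vma is THP-eligible, create_huge_pmd() is tried, and only a VM_FAULT_FALLBACK result drops the fault down to the ordinary 4k pte path.

/* Condensed, paraphrased sketch of the branch inside __handle_mm_fault();
 * not verbatim kernel source. */
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
    ret = create_huge_pmd(...);   /* ends up in do_huge_pmd_anonymous_page() */
    if (!(ret & VM_FAULT_FALLBACK))
        return ret;               /* huge pmd installed, or a hard error */
}
/* otherwise fall through and map ordinary 4k ptes */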

int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                   unsigned long address, pmd_t *pmd,
                   unsigned int flags)
{
    struct page *page;
    /* HPAGE_PMD_MASK clears the low 21 bits, aligning down to 2 MB */
    unsigned long haddr = address & HPAGE_PMD_MASK;

    if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
        return VM_FAULT_FALLBACK;
    /* prepare an anon_vma for this vma */
    if (unlikely(anon_vma_prepare(vma)))
        return VM_FAULT_OOM;
    /* add this vma to the khugepaged scan list */
    if (unlikely(khugepaged_enter(vma)))
        return VM_FAULT_OOM;
    /* if the fault was caused by a read, map the shared huge zero page */
    if (!(flags & FAULT_FLAG_WRITE) &&
            transparent_hugepage_use_zero_page()) {
        spinlock_t *ptl;
        pgtable_t pgtable;
        struct page *zero_page;
        bool set;
        int ret;
        pgtable = pte_alloc_one(mm, haddr);
        if (unlikely(!pgtable))
            return VM_FAULT_OOM;
        zero_page = get_huge_zero_page();
        if (unlikely(!zero_page)) {
            pte_free(mm, pgtable);
            count_vm_event(THP_FAULT_FALLBACK);
            return VM_FAULT_FALLBACK;
        }
        ptl = pmd_lock(mm, pmd);
        ret = 0;
        set = false;
        if (pmd_none(*pmd)) {
            if (userfaultfd_missing(vma)) {
                spin_unlock(ptl);
                ret = handle_userfault(vma, address, flags,
                               VM_UFFD_MISSING);
                VM_BUG_ON(ret & VM_FAULT_FALLBACK);
            } else {
                set_huge_zero_page(pgtable, mm, vma,
                           haddr, pmd,
                           zero_page);
                spin_unlock(ptl);
                set = true;
            }
        } else
            spin_unlock(ptl);
        if (!set) {
            pte_free(mm, pgtable);
            put_huge_zero_page();
        }
        return ret;
    }

    /* allocate an HPAGE_PMD_ORDER (order-9) contiguous block from the
     * buddy allocator */
    page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
            vma, haddr, numa_node_id(), 0);
    if (unlikely(!page)) {
        count_vm_event(THP_FAULT_FALLBACK);
        return VM_FAULT_FALLBACK;
    }
    return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page,
                        flags);
}
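Two practical notes on this function: the read-fault branch that maps the shared huge zero page can be switched off via /sys/kernel/mm/transparent_hugepage/use_zero_page, and the THP_FAULT_FALLBACK events counted above are exported as thp_fault_fallback in /proc/vmstat, which makes it easy to observe how often the order-9 allocation fails when memory is fragmented.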

Next, the core logic of __do_huge_pmd_anonymous_page:

        entry = mk_huge_pmd(page, vma->vm_page_prot);         /* build pmd with _PAGE_PSE set */
        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);   /* set the DIRTY and RW bits */
        page_add_new_anon_rmap(page, vma, haddr, true);       /* anon-rmap the page at pmd level */
        mem_cgroup_commit_charge(page, memcg, false, true);   /* charge the page to the memcg */
        lru_cache_add_active_or_unevictable(page, vma);       /* add to active/unevictable LRU */
        pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); /* deposit the pte table */
        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);       /* install the huge pmd */

Step by step, this:
1. Builds the pmd value with _PAGE_PSE set (mk_huge_pmd).
2. Sets the DIRTY and RW bits on the pmd.
3. Maps the page into the vma's anonymous rmap at pmd granularity.
4. Commits the page charge to the mem cgroup.
5. Adds the page to the active or unevictable LRU.
6. Deposits the huge page's preallocated pte page table (kept on hand so a later split of the huge pmd back into 4k ptes cannot fail for lack of memory).
7. Updates the pmd entry to the new value.

Summary

1. A 4k page is reached through four levels of page tables, while a 2M page uses only three; accordingly, the page shift is 12 for a 4k page and 21 for a 2M page.
2. Generating a THP depends on contiguous system memory, specifically an order-9 contiguous block, so when no such contiguous memory is available, no huge page can be created.
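To see this fault path from user space, here is a small sketch (my addition, not from the original analysis): it maps an anonymous region, aligns it to a 2 MB boundary, asks for THP with madvise(MADV_HUGEPAGE), and touches the memory to trigger the fault. Whether a huge page is actually installed depends on the mode in /sys/kernel/mm/transparent_hugepage/enabled and, per point 2 above, on an order-9 block being available; check the AnonHugePages field in /proc/self/smaps to verify.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define HPAGE_SIZE (2UL << 20) /* 2 MB */

int main(void)
{
    /* Over-allocate so a 2 MB-aligned sub-range can be carved out:
     * a huge pmd can only map a range aligned to HPAGE_PMD_SIZE. */
    size_t len = 2 * HPAGE_SIZE;
    char *raw = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (raw == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    char *buf = (char *)(((unsigned long)raw + HPAGE_SIZE - 1)
                         & ~(HPAGE_SIZE - 1));

    /* Mark the range as a THP candidate (honored in "madvise" or
     * "always" mode). */
    if (madvise(buf, HPAGE_SIZE, MADV_HUGEPAGE))
        perror("madvise");

    /* The first write faults the range; ideally this runs the
     * do_huge_pmd_anonymous_page() path and installs one huge pmd. */
    memset(buf, 0x5a, HPAGE_SIZE);

    printf("touched %p; grep AnonHugePages /proc/self/smaps to check\n",
           (void *)buf);
    munmap(raw, len);
    return 0;
}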
