alloc_pages()函数分析

alloc_pages()函数用来给进程分配块，分配块所用的就是伙伴算法。

如果给进程分配块，调用的是get_page_from_freelist()。在分配块的过程中，我们要实施一些策略，也就是，假如可用内存的数量不足的话，要回收一些内存。我们在上一次分析物理内存管理时，在pglist_data数据结构中也看到了用于异步回收内存的内核线程（kswapd）。

在《深入理解Linux虚拟内存管理》的书中，有这样的一幅图：

从这幅图中我们可以看到，在哪些时候唤醒kswapd来进行异步回收内存，在什么时候进行同步回收内存。

其实alloc_pages()就是围绕着这幅图来设计算法的。

分配的具体步骤是：

1、用page_low作为阈值（就是分配结束后，要最少还有page_low数量的内存存在），来进行分配。如果分配成功，很好。如果分配不成功，那么就唤醒kswapd()来进行异步回收内存。

2、第一个步骤里面其实已经在某种程度上达到我们的目的了，也就是能够控制内存分配的速度了。因此，降低阈值。同时，还要看一下进程要求分配内存的分配策略，假若进程是不处于中断处理程序中的实时进程，或者该进程不能被阻塞，那么这个时候，要在最低阈值标准的基础上，再次降低阈值，即page_min-=page_min/4。如果进程要求分配内存的策略还有加强（设置了__GFP_HIGH），那么继续减小阈值：page_min-=page_min/2。阈值减小后，再次进行分配。

3、如果第二步分配成功了，不错；如果没有分配成功，那么就比较麻烦了，就需要启动同步回收内存的机制了。但在启动同步内存回收之前，我们要看一下进程的属性：假若进程本身正在回收内存（设置了PF_MEMALLOC），或者进程正在撤销以释放内存（设置了TIF_MEMDIE），那么就需要特殊对待了，如果分配策略允许，就不再设置阈值，直接分配。如果没有分配成功，看一下这个进程是不是很特殊：如果它不允许分配失败（__GFP_NOFAIL），就要不停地循环，不停地去尝试，直到分配成功。反之，分配失败，就不再往下尝试了。因为，在这里，进程本身就是在回收内存，或者进程本身就在撤销以释放内存，这已经是底线了。

4、第3步，是一个特殊的情况。那么对于一般的进程在第二步较低阈值的情况下仍然没有分配成功，就开始同步回收内存了。方法就是把进程的标志位设置为PF_MEMALLOC,然后把进程回收的状态也改变了，对于不再活跃的SLAB也给回收了。然后就开始同步回收内存了。注意，这里同步回收的话，进程实际上是被阻塞的，如果进程本身的性质不允许阻塞的话，那么在第三步实际上就OVER了。这一步的回收内存，要比异步回收内存的力度更大。将所有不活动的脏的页给写回磁盘，让他们变成干净的页，然后归入可分配的空闲页框队列，这一步需要的时间比较长。

5、如果在第4步中，我们同步回收到内存了，那么，就重复一下第二步的分配方法，给进程分配内存。如果不成功（汗，到这里还不成功），那么就同时看两点：第一点，看分配策略是否允许重试；第二点，看进程需要分配的块为多大，如果小于等于8个页框（order<=3），那么就休眠一会儿后，重新跳到第4步，再次同步回收内存，再次分配。否则不允许重试的话，就不要再做其他的努力了。OVER吧。

6、第5步的假设是，同步回收内存成功了。那么如果同步回收内存没有成功呢？这时十有八九是需要KILL进程来腾出内存了。但内核采取了一种很委婉的方法，就是先把阈值设为最大的page_high再分配一次。如果分配成功了，那运气真好；如果分配仍然没有成功，那么就名正言顺地调用out_of_memory()，之后跳回restart重新开始分配。


/*
 * __alloc_pages - allocate a block of 2^order pages from the given zonelist.
 *
 * Stages, as implemented below:
 *   1. Try get_page_from_freelist() with ALLOC_WMARK_LOW.
 *   2. On failure, wake kswapd for every zone in the list and retry with
 *      ALLOC_WMARK_MIN, optionally relaxed by ALLOC_HARDER / ALLOC_HIGH.
 *   3. If the current task has PF_MEMALLOC or TIF_MEMDIE set (and we are not
 *      in interrupt context), try once more with ALLOC_NO_WATERMARKS unless
 *      __GFP_NOMEMALLOC forbids it; loop here if __GFP_NOFAIL is set.
 *   4. If the caller may wait, run try_to_free_pages() synchronously and
 *      retry; if reclaim made no progress, make a final ALLOC_WMARK_HIGH
 *      attempt, call out_of_memory() and restart.
 *   5. Depending on order and the retry flags, sleep briefly and go back to
 *      step 4, or give up.
 *
 * Returns the page obtained, or the null result of the last failed attempt
 * (after printing a rate-limited warning unless __GFP_NOWARN is set).
 */

909 struct page * fastcall

910 __alloc_pages(gfp_t gfp_mask, unsigned int order,

911         struct zonelist *zonelist)

912 {

913     const gfp_t wait = gfp_mask & __GFP_WAIT;/* non-zero when the caller passed __GFP_WAIT */

914     struct zone **z;

915     struct page *page;

916     struct reclaim_state reclaim_state;

917     struct task_struct *p = current;

918     int do_retry;

919     int alloc_flags;

920     int did_some_progress;

921

922     might_sleep_if(wait);/* only callers that passed __GFP_WAIT are flagged as possibly sleeping */

923

924 restart:

925     z = zonelist->zones;  /* the list of zones suitable for gfp_mask */

926

927     if (unlikely(*z == NULL)) {

928         /* Should this ever happen?? */

929         return NULL;

930     }

931

932     page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,

933                 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);

934     if (page)

935         goto got_pg;

936

937     /*

938      * End of the first stage: allocation attempt with the ALLOC_WMARK_LOW threshold.

939      * */

940

941     do {

942         wakeup_kswapd(*z, order);

943     } while (*(++z));

944

945     /*

946      * OK, we're below the kswapd watermark and have kicked background

947      * reclaim. Now things get more complex, so set up alloc_flags according

948      * to how we want to proceed.

949      *

950      * The caller may dip into page reserves a bit more if the caller

951      * cannot run direct reclaim, or if the caller has realtime scheduling

952      * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will

953      * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).

954      */

955     alloc_flags = ALLOC_WMARK_MIN;

956     if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)

957         alloc_flags |= ALLOC_HARDER;

958     if (gfp_mask & __GFP_HIGH)

959         alloc_flags |= ALLOC_HIGH;

960     if (wait)

961         alloc_flags |= ALLOC_CPUSET;/* cpuset check kept only for callers that may wait; !wait requests skip it (see comment below) */

962

963     /*

964      * Go through the zonelist again. Let __GFP_HIGH and allocations

965      * coming from realtime tasks go deeper into reserves.

966      *

967      * This is the last chance, in general, before the goto nopage.

968      * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.

969      * See also cpuset_zone_allowed() comment in kernel/cpuset.c.

970      */

971     page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);

972     if (page)

973         goto got_pg;

974

975     /*

976      * End of the second allocation round.

977      * */

978

979     /* This allocation should allow future memory freeing. */

980

981     if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))/* PF_MEMALLOC is set below during synchronous reclaim; TIF_MEMDIE presumably marks a task being killed to free memory -- confirm */

982             && !in_interrupt()) {/* i.e. the task itself is on the memory-freeing path and we are not in interrupt context */

983         if (!(gfp_mask & __GFP_NOMEMALLOC)) {/* __GFP_NOMEMALLOC: caller forbids this no-watermark attempt */

984 nofail_alloc:/* reached only when __GFP_NOMEMALLOC is not set */

985             /* go through the zonelist yet again, ignoring mins */

986             page = get_page_from_freelist(gfp_mask, order,

987                 zonelist, ALLOC_NO_WATERMARKS);/* attempt without any watermark threshold */

988             if (page)

989                 goto got_pg;

990             if (gfp_mask & __GFP_NOFAIL) {/* with __GFP_NOFAIL, wait and repeat until the allocation succeeds */

991                 blk_congestion_wait(WRITE, HZ/50);

992                 goto nofail_alloc;

993             }

994         }

995         goto nopage;/* no-watermark attempt forbidden, or it failed without __GFP_NOFAIL: give up */

996     }

997

998     /*

999      * The attempt above applies only when the task is itself on the memory-freeing path; otherwise execution continues below. */

1002

1003     /* Atomic allocations - we can't balance anything */

1004     if (!wait)/* caller cannot block: fail now; callers that may block get further attempts below */

1005         goto nopage;

1006

1007 rebalance:

1008     cond_resched();/* blocking is allowed here, so give other tasks a chance to run first */

1009

1010     /* We now go into synchronous reclaim */

1011     cpuset_memory_pressure_bump();/* entering synchronous reclaim; the reclaim itself is the try_to_free_pages() call below */

1012     p->flags |= PF_MEMALLOC;

1013     reclaim_state.reclaimed_slab = 0;

1014     p->reclaim_state = &reclaim_state;

1015

1016                  /*

1017      * The next step tries to free pages synchronously; its return value

1018      * records whether any progress was made. */

1019     did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);

1020

1021     p->reclaim_state = NULL;

1022     p->flags &= ~PF_MEMALLOC;

1023

1024     cond_resched();

1025

1026     /* 1. If reclaim made progress, retry the allocation with the same alloc_flags. */

1027     /* 2. If it made no progress and the flags allow it, try once at the high watermark, then call out_of_memory() and restart. */

1028

1029     if (likely(did_some_progress)) {/* did_some_progress was set by try_to_free_pages() above */

1030         page = get_page_from_freelist(gfp_mask, order,

1031                         zonelist, alloc_flags);

1032         if (page)

1033             goto got_pg;

1034     } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {/* no progress, __GFP_FS is set and __GFP_NORETRY is NOT set */

1035         /* Reclaim freed nothing: make one last attempt before calling out_of_memory(). */

1036         /*

1037          * Go through the zonelist yet one more time, keep

1038          * very high watermark here, this is only to catch

1039          * a parallel oom killing, we must fail if we're still

1040          * under heavy pressure.

1041          */

1042         page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,

1043                 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);

1044         if (page)

1045             goto got_pg;

1046

1047         out_of_memory(zonelist, gfp_mask, order);/* invoke the OOM handler (presumably kills a task to free memory -- confirm) */

1048         goto restart;

1049     }

1050

1051     /*

1052      * Don't let big-order allocations loop unless the caller explicitly

1053      * requests that.  Wait for some write requests to complete then retry.

1054      *

1055      * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order

1056      * <= 3, but that may not be true in other implementations.

1057      */

1058     do_retry = 0;

1059     if (!(gfp_mask & __GFP_NORETRY)) {

1060         if ((order <= 3) || (gfp_mask & __GFP_REPEAT))/* small requests (order <= 3, i.e. at most 8 pages) or __GFP_REPEAT: retry */

1061             do_retry = 1;

1062         if (gfp_mask & __GFP_NOFAIL)

1063             do_retry = 1;

1064     }

1065     if (do_retry) {/* try again */

1066         blk_congestion_wait(WRITE, HZ/50);/* wait with a timeout of HZ/50 ticks before retrying */

1067         goto rebalance;/* back to synchronous reclaim */

1068     }

1069

1070 nopage:

1071     if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {

1072         printk(KERN_WARNING "%s: page allocation failure."

1073             " order:%d, mode:0x%x\n",

1074             p->comm, order, gfp_mask);

1075         dump_stack();

1076         show_mem();

1077     }

1078 got_pg:

1079     return page;

1080 }

搜索此博客

Linux Stuff

alloc_pages()函数分析

评论

发表评论

此博客中的热门博文

Linux/ARM Page Table Entry 属性设置分析

提交了30次才AC －－－【附】POJ 2488解题报告

n个进程共享m个资源得死锁问题证明