How Linux/ARM initialize fallback-zone-lists

本文按照《Linux内核源代码情景分析》的方式分析Linux/ARM如何初始化fallback-zone-lists,并按照缩进的方式组织函数调用. 没有说清楚的地方敬请指正.

zonelist准确说,应该是fallback zone list.其存在的意义:当内存管理单元在某个区分配可用内存页的时候,如果内存不足,则会在其他区进行分配。其他区可能不止一个,先后顺序就定义在某个node节点的node_zonelists成员变量上。并且,当某个节点的内存不足时,可以分配其他节点的内存。

在V2.6.23以及之前的内核中,每个node节点拥有(node's number* zones per_node)个fall back zonelists.其排序规则是:
1.对于UMA,假设其zone type有HIGHMEM、NORMAL、DMA,其节点的zone_lists的情况是:zone_lists[ZONE_DMA]={DMA,NULL},zone_lists[ZONE_NORMAL]={NORMAL,DMA,NULL},zone_lists[ZONE_HIGHMEM]={HIGHMEM,NORMAL,DMA,NULL}.
2.对于NUMA,节点有A、B、C、D四个,zone type有HIGHMEM、NORMAL、DMA,那么以节点B为例,其zone_lists为:
B.zone_lists[ZONE_HIGHMEM]={B.HIGHMEM,B.NORMAL,B.DMA,C.HIGHMEM,C.NORMAL,C.DMA,D.HIGHMEM,D.NORMAL,D.DMA,A.HIGHMEM,A.NORMAL,A.DMA,NULL}
这一排序规则在《Professional Linux Kernel Architecture》中有非常准确的描述。
但是,在后续的Kernel中,每个节点拥有的多个zone_lists被合并为至多两个(NUMA是2个,UMA是1个),其相应的patch以及对应的讨论在:https://lkml.org/lkml/2007/8/8/302

本文的代码分析是针对于V3.3的Kernel Code.

start_kernel->build_all_zonelists
3349 /*
3350  * Called with zonelists_mutex held always
3351  * unless system_state == SYSTEM_BOOTING.
3352  */
3353 void __ref build_all_zonelists(void *data)
3354 {
3355         set_zonelist_order();
3356
3357         if (system_state == SYSTEM_BOOTING) {
3358                 __build_all_zonelists(NULL);
3359                 mminit_verify_zonelist();
3360                 cpuset_init_current_mems_allowed();

3355行,设置zonelist的排序规则。当在分配页框的时候,如果要分配区域中的页框数量不足,则从候补区中查找页。候补区的排队顺序依照current_zonelist_order设置的值。不同的值代表不同的意义。如下:
    start_kernel->build_all_zonelists->set_zonelist_order:
    3226 static void set_zonelist_order(void)
    3227 {
    3228         current_zonelist_order = ZONELIST_ORDER_ZONE;
    3229 }
    ZONELIST_ORDER_ZONE代表将来zonelist的排序规则。不过,对于UMA来说,ZONELIST_ORDER_NODE和ZONELIST_ORDER_ZONE产生的zonelist是一样的。
    2827 /*
    2828  *  zonelist_order:
    2829  *  0 = automatic detection of better ordering.
    2830  *  1 = order by ([node] distance, -zonetype)
    2831  *  2 = order by (-zonetype, [node] distance)
    2832  *
    2833  *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
    2834  *  the same zonelist. So only NUMA can configure this param.
    2835  */
    2836 #define ZONELIST_ORDER_DEFAULT  0
    2837 #define ZONELIST_ORDER_NODE     1
    2838 #define ZONELIST_ORDER_ZONE     2
3357行,system_state是没有显式初始化的全局变量,其默认值是0.由于system_state是个枚举变量,值0对应的枚举值就是SYSTEM_BOOTING.
    108 enum system_states system_state __read_mostly;
    109 EXPORT_SYMBOL(system_state);
3358行,建立zonelists.每个zone_type对应于pgdat->node_zonelists[]数组的一个元素。每个元素含有一个zoneref数组(下面代码中的_zonerefs[],其每一项都带有指向zone的指针)。先来看下相关数据结构,再看处理过程:
     603 struct zonelist {
     604         struct zonelist_cache *zlcache_ptr;                  // NULL or &zlcache
     605         struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
     606 #ifdef CONFIG_NUMA
     607         struct zonelist_cache zlcache;                       // optional ...
     608 #endif
     609 };
     636 typedef struct pglist_data {
     637         struct zone node_zones[MAX_NR_ZONES];
     638         struct zonelist node_zonelists[MAX_ZONELISTS];
                ...
     668}
     对于UMA体系结构,
     573 #define MAX_ZONELISTS 1
     对于NUMA体系结构,
     498 /*
     499  * The NUMA zonelists are doubled because we need zonelists that restrict the
     500  * allocations to a single node for GFP_THISNODE.
     501  *
     502  * [0]  : Zonelist with fallback
     503  * [1]  : No fallback (GFP_THISNODE)
     504  */
     505 #define MAX_ZONELISTS 2

     493 /* Maximum number of zones on a zonelist */
     494 #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

    start_kernel->build_all_zonelists->__build_all_zonelists
    3301 static __init_refok int __build_all_zonelists(void *data)
    3302 {
    3303         int nid;
    3304         int cpu;
    3305
    3306 #ifdef CONFIG_NUMA
    3307         memset(node_load, 0, sizeof(node_load));
    3308 #endif
    3309         for_each_online_node(nid) {
    3310                 pg_data_t *pgdat = NODE_DATA(nid);
    3311
    3312                 build_zonelists(pgdat);
    3313                 build_zonelist_cache(pgdat);
    3314         }
    3315
    3316         /*
    3317          * Initialize the boot_pagesets that are going to be used
    3318          * for bootstrapping processors. The real pagesets for
    3319          * each zone will be allocated later when the per cpu
    3320          * allocator is available.
    3321          *
    3322          * boot_pagesets are used also for bootstrapping offline
    3323          * cpus if the system is already booted because the pagesets
    3324          * are needed to initialize allocators on a specific cpu too.
    3325          * F.e. the percpu allocator needs the page allocator which
    3326          * needs the percpu allocator in order to allocate its pagesets
    3327          * (a chicken-egg dilemma).
    3328          */
    3329         for_each_possible_cpu(cpu) {
    3330                 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
    3331
    3332 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
    3333                 /*
    3334                  * We now know the "local memory node" for each node--
    3335                  * i.e., the node of the first zone in the generic zonelist.
    3336                  * Set up numa_mem percpu variable for on-line cpus.  During
    3337                  * boot, only the boot cpu should be on-line;  we'll init the
    3338                  * secondary cpus' numa_mem as they come on-line.  During
    3339                  * node/memory hotplug, we'll fixup all on-line cpus.
    3340                  */
    3341                 if (cpu_online(cpu))
    3342                         set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
    3343 #endif
    3344         }
    3345
    3346         return 0;
    3347 }
    3312行,
        start_kernel->build_all_zonelists->__build_all_zonelists->build_zonelists:
        3231 static void build_zonelists(pg_data_t *pgdat)
        3232 {
        3233         int node, local_node;
        3234         enum zone_type j;
        3235         struct zonelist *zonelist;
        3236
        3237         local_node = pgdat->node_id;
        3238
        3239         zonelist = &pgdat->node_zonelists[0];
        3240         j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
        3241
        3242         /*
        3243          * Now we build the zonelist so that it contains the zones
        3244          * of all the other nodes.
        3245          * We don't want to pressure a particular node, so when
        3246          * building the zones for node N, we make sure that the
        3247          * zones coming right after the local ones are those from
        3248          * node N+1 (modulo N)
        3249          */
        3250         for (node = local_node + 1; node < MAX_NUMNODES; node++) {
        3251                 if (!node_online(node))
        3252                         continue;
        3253                 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
        3254                                                         MAX_NR_ZONES - 1);
        3255         }
        3256         for (node = 0; node < local_node; node++) {
        3257                 if (!node_online(node))
        3258                         continue;
        3259                 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
        3260                                                         MAX_NR_ZONES - 1);
        3261         }
        3262
        3263         zonelist->_zonerefs[j].zone = NULL;
        3264         zonelist->_zonerefs[j].zone_idx = 0;
        3265 }
        该函数实现的结果是:
        如果是UMA,则:zone_lists[0]={ZONE_HIGHMEM,ZONE_NORMAL,ZONE_DMA,NULL}
        如果是NUMA,对于多个节点。A,B,C则:
        B.zone_lists[0]={B.ZONE_HIGHMEM,B.ZONE_NORMAL,B.ZONE_DMA,C.ZONE_HIGHMEM,C.ZONE_NORMAL,C.ZONE_DMA,A.ZONE_HIGHMEM,A.ZONE_NORMAL,A.ZONE_DMA,NULL}(NULL表示结束,见3263~3264行)[1]
     3313行,
        start_kernel->build_all_zonelists->__build_all_zonelists->build_zonelist_cache
        3267 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
        3268 static void build_zonelist_cache(pg_data_t *pgdat)
        3269 {
        3270         pgdat->node_zonelists[0].zlcache_ptr = NULL;
        3271 }
     3329~3330行,涉及到冷热页,具体查看:物理内存管理之冷热页。
3359行,依赖于CONFIG_DEBUG_MEMORY_INIT
3360行,依赖于CONFIG_CPUSETS,此又依赖于Control Group Support
3361         } else {
3362                 /* we have to stop all cpus to guarantee there is no user
3363                    of zonelist */
3364 #ifdef CONFIG_MEMORY_HOTPLUG
3365                 if (data)
3366                         setup_zone_pageset((struct zone *)data);
3367 #endif
3368                 stop_machine(__build_all_zonelists, NULL, NULL);
3369                 /* cpuset refresh routine should be here */
3370         }
3371         vm_total_pages = nr_free_pagecache_pages();
3372         /*
3373          * Disable grouping by mobility if the number of pages in the
3374          * system is too low to allow the mechanism to work. It would be
3375          * more accurate, but expensive to check per-zone. This check is
3376          * made on memory-hotadd so a system can start with mobility
3377          * disabled and enable it later
3378          */
3379         if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3380                 page_group_by_mobility_disabled = 1;
3381         else
3382                 page_group_by_mobility_disabled = 0;
3383
3384         printk("Built %i zonelists in %s order, mobility grouping %s.  "
3385                 "Total pages: %ld\n",
3386                         nr_online_nodes,
3387                         zonelist_order_name[current_zonelist_order],
3388                         page_group_by_mobility_disabled ? "off" : "on",
3389                         vm_total_pages);
3390 #ifdef CONFIG_NUMA
3391         printk("Policy zone: %s\n", zone_names[policy_zone]);
3392 #endif
3393 }
3371行计算满足各个区最低要求的情况下,剩余物理内存的页框数。
3379~3382行,根据得到的剩余内存页框数,判断是否启用按可移动性(mobility)对页进行分组:如果页框数少于(pageblock_nr_pages * MIGRATE_TYPES),则关闭该机制。这个是Linux的一个反页碎片机制,将剩余内存分类连接到各个空闲链表上。分类情况为:
     38 #define MIGRATE_UNMOVABLE     0
     39 #define MIGRATE_RECLAIMABLE   1
     40 #define MIGRATE_MOVABLE       2
     41 #define MIGRATE_PCPTYPES      3 /* the number of types on the pcp lists */
     42 #define MIGRATE_RESERVE       3
     43 #define MIGRATE_ISOLATE       4 /* can't allocate from here */
     44 #define MIGRATE_TYPES         5

注意:
1. 这一点和<Professional Linux Kernel Architecture>,Wolfgang Mauerer,Wiley Publishing.Inc.P166~168.所述的完全不同。我反复看代码确认在v3.3 Kernel中确实是这样的。^_^
2. 蓝色部分是2012/8/21号新加内容

评论

此博客中的热门博文

提交了30次才AC ---【附】POJ 2488解题报告

n个进程共享m个资源得死锁问题证明