How Linux/ARM initializes fallback-zone-lists

本文按照《Linux内核源代码情景分析》的方式分析Linux/ARM如何初始化fallback-zone-lists,并按照缩进的方式组织函数调用. 没有说清楚的地方敬请指正.

zonelist准确说，应该是fallback zone list.其存在的意义：当内存管理单元在某个区分配可用内存页的时候，如果内存不足，则会在其他区进行分配。其他区可能不止一个，先后顺序就定义在某个node节点的node_zonelists成员变量上。并且，当某个节点的内存不足时，可以分配其他节点的内存。

在V2.6.23以及之前的内核中，每个node节点为每种zone type各维护一个fallback zonelist，每个zonelist最多包含(节点数 * 每个节点的zone数)个zone.其排序规则是：
1.对于UMA，假设其zone type有HIGHMEM、NORMAL、DMA，其节点的zone_lists的情况是：zone_lists[ZONE_DMA]={DMA,NULL},zone_lists[ZONE_NORMAL]={NORMAL,DMA,NULL},zone_lists[ZONE_HIGHMEM]={HIGHMEM,NORMAL,DMA,NULL}.
2.对于NUMA，假设节点有A、B、C三个，zone type有HIGHMEM、NORMAL、DMA,那么以节点B为例，其zone_lists为：
B.zone_lists[ZONE_HIGHMEM]={B.HIGHMEM,B.NORMAL,B.DMA,C.HIGHMEM,C.NORMAL,C.DMA,A.HIGHMEM,A.NORMAL,A.DMA}
这一排序规则在《Professional Linux Kernel Architecture》中有非常准确的描述。
但是，在后续的Kernel中，每个节点拥有的多个zone_lists被合并（NUMA保留2个，UMA只保留1个），其相应的patch以及对应的讨论在：https://lkml.org/lkml/2007/8/8/302

本文的代码分析是针对于V3.3的Kernel Code.

start_kernel->build_all_zonelists

3349 /*

3350 * Called with zonelists_mutex held always

3351 * unless system_state == SYSTEM_BOOTING.

3352 */

3353 void __ref build_all_zonelists(void *data)

3354 {

3355 set_zonelist_order();

3356

3357 if (system_state == SYSTEM_BOOTING) {

3358 __build_all_zonelists(NULL);

3359 mminit_verify_zonelist();

3360 cpuset_init_current_mems_allowed();

3355行，设置zonelist的排序规则。当在分配页框的时候，如果要分配区域中的页框数量不足，则从候补区中查找页。候补区的排队顺序依照current_zonelist_order设置的值。不同的值代表不同的意义。如下：

start_kernel->build_all_zonelists->set_zonelist_order:

3226 static void set_zonelist_order(void)

3227 {

3228 current_zonelist_order = ZONELIST_ORDER_ZONE;

3229 }

ZONELIST_ORDER_ZONE表示zonelist先按zone type、再按节点距离排序（对应下面注释中的取值2）。不过，对于UMA来说，ZONELIST_ORDER_NODE和ZONELIST_ORDER_ZONE生成的zonelist是一样的。

2827 /*

2828 * zonelist_order:

2829 * 0 = automatic detection of better ordering.

2830 * 1 = order by ([node] distance, -zonetype)

2831 * 2 = order by (-zonetype, [node] distance)

2832 *

2833 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create

2834 * the same zonelist. So only NUMA can configure this param.

2835 */

2836 #define ZONELIST_ORDER_DEFAULT 0

2837 #define ZONELIST_ORDER_NODE 1

2838 #define ZONELIST_ORDER_ZONE 2

3357行，system_state是没有显式初始化的全局变量，因此默认值是0；又由于它是枚举变量，而枚举system_states的第一个成员SYSTEM_BOOTING的值正是0，所以在启动阶段其值为SYSTEM_BOOTING.

108 enum system_states system_state __read_mostly;

109 EXPORT_SYMBOL(system_state);

3358行，建立zonelists.每个zonelist对应于pgdat->node_zonelists[]数组的一个元素。每个元素都包含一个struct zoneref数组（下面代码中的_zonerefs[]），数组的每一项记录一个zone指针及其zone_idx。先来看下相关数据结构，再看处理过程：

603 struct zonelist {

604 struct zonelist_cache *zlcache_ptr; // NULL or &zlcache

605 struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];

606 #ifdef CONFIG_NUMA

607 struct zonelist_cache zlcache; // optional ...

608 #endif

609 };

636 typedef struct pglist_data {

637 struct zone node_zones[MAX_NR_ZONES];

638 struct zonelist node_zonelists[MAX_ZONELISTS];

...

668 } pg_data_t;

对于UMA体系结构，

573 #define MAX_ZONELISTS 1

对于NUMA体系结构，

498 /*

499 * The NUMA zonelists are doubled because we need zonelists that restrict the

500 * allocations to a single node for GFP_THISNODE.

501 *

502 * [0] : Zonelist with fallback

503 * [1] : No fallback (GFP_THISNODE)

504 */

505 #define MAX_ZONELISTS 2

493 /* Maximum number of zones on a zonelist */

494 #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

start_kernel->build_all_zonelists->__build_all_zonelists

3301 static __init_refok int __build_all_zonelists(void *data)

3302 {

3303 int nid;

3304 int cpu;

3305

3306 #ifdef CONFIG_NUMA

3307 memset(node_load, 0, sizeof(node_load));

3308 #endif

3309 for_each_online_node(nid) {

3310 pg_data_t *pgdat = NODE_DATA(nid);

3311

3312 build_zonelists(pgdat);

3313 build_zonelist_cache(pgdat);

3314 }

3315

3316 /*

3317 * Initialize the boot_pagesets that are going to be used

3318 * for bootstrapping processors. The real pagesets for

3319 * each zone will be allocated later when the per cpu

3320 * allocator is available.

3321 *

3322 * boot_pagesets are used also for bootstrapping offline

3323 * cpus if the system is already booted because the pagesets

3324 * are needed to initialize allocators on a specific cpu too.

3325 * F.e. the percpu allocator needs the page allocator which

3326 * needs the percpu allocator in order to allocate its pagesets

3327 * (a chicken-egg dilemma).

3328 */

3329 for_each_possible_cpu(cpu) {

3330 setup_pageset(&per_cpu(boot_pageset, cpu), 0);

3331

3332 #ifdef CONFIG_HAVE_MEMORYLESS_NODES

3333 /*

3334 * We now know the "local memory node" for each node--

3335 * i.e., the node of the first zone in the generic zonelist.

3336 * Set up numa_mem percpu variable for on-line cpus. During

3337 * boot, only the boot cpu should be on-line; we'll init the

3338 * secondary cpus' numa_mem as they come on-line. During

3339 * node/memory hotplug, we'll fixup all on-line cpus.

3340 */

3341 if (cpu_online(cpu))

3342 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));

3343 #endif

3344 }

3345

3346 return 0;

3347 }

看3312行，

start_kernel->build_all_zonelists->__build_all_zonelists->build_zonelists:

3231 static void build_zonelists(pg_data_t *pgdat)

3232 {

3233 int node, local_node;

3234 enum zone_type j;

3235 struct zonelist *zonelist;

3236

3237 local_node = pgdat->node_id;

3238

3239 zonelist = &pgdat->node_zonelists[0];

3240 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);

3241

3242 /*

3243 * Now we build the zonelist so that it contains the zones

3244 * of all the other nodes.

3245 * We don't want to pressure a particular node, so when

3246 * building the zones for node N, we make sure that the

3247 * zones coming right after the local ones are those from

3248 * node N+1 (modulo N)

3249 */

3250 for (node = local_node + 1; node < MAX_NUMNODES; node++) {

3251 if (!node_online(node))

3252 continue;

3253 j = build_zonelists_node(NODE_DATA(node), zonelist, j,

3254 MAX_NR_ZONES - 1);

3255 }

3256 for (node = 0; node < local_node; node++) {

3257 if (!node_online(node))

3258 continue;

3259 j = build_zonelists_node(NODE_DATA(node), zonelist, j,

3260 MAX_NR_ZONES - 1);

3261 }

3262

3263 zonelist->_zonerefs[j].zone = NULL;

3264 zonelist->_zonerefs[j].zone_idx = 0;

3265 }

该函数实现的结果是：

如果是UMA，则：zone_lists[0]={ZONE_HIGHMEM,ZONE_NORMAL,ZONE_DMA,NULL}。

如果是NUMA，对于多个节点。A,B,C则：

B.zone_lists[0]={B.ZONE_HIGHMEM,B.ZONE_NORMAL,B.ZONE_DMA,C.ZONE_HIGHMEM,C.ZONE_NORMAL,C.ZONE_DMA,A.ZONE_HIGHMEM,A.ZONE_NORMAL,A.ZONE_DMA,NULL}(NULL表示结束，3263～3264行)[1]

看3313行，

start_kernel->build_all_zonelists->__build_all_zonelists->build_zonelist_cache

3267 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */

3268 static void build_zonelist_cache(pg_data_t *pgdat)

3269 {

3270 pgdat->node_zonelists[0].zlcache_ptr = NULL;

3271 }

3329～3330行，涉及到冷热页（per-CPU pageset）,具体查看：物理内存管理之冷热页。

3359行，依赖于CONFIG_DEBUG_MEMORY_INIT。

3360行，依赖于CONFIG_CPUSETS，此又依赖于Control Group Support。

3361 } else {

3362 /* we have to stop all cpus to guarantee there is no user

3363 of zonelist */

3364 #ifdef CONFIG_MEMORY_HOTPLUG

3365 if (data)

3366 setup_zone_pageset((struct zone *)data);

3367 #endif

3368 stop_machine(__build_all_zonelists, NULL, NULL);

3369 /* cpuset refresh routine should be here */

3370 }

3371 vm_total_pages = nr_free_pagecache_pages();

3372 /*

3373 * Disable grouping by mobility if the number of pages in the

3374 * system is too low to allow the mechanism to work. It would be

3375 * more accurate, but expensive to check per-zone. This check is

3376 * made on memory-hotadd so a system can start with mobility

3377 * disabled and enable it later

3378 */

3379 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))

3380 page_group_by_mobility_disabled = 1;

3381 else

3382 page_group_by_mobility_disabled = 0;

3383

3384 printk("Built %i zonelists in %s order, mobility grouping %s. "

3385 "Total pages: %ld\n",

3386 nr_online_nodes,

3387 zonelist_order_name[current_zonelist_order],

3388 page_group_by_mobility_disabled ? "off" : "on",

3389 vm_total_pages);

3390 #ifdef CONFIG_NUMA

3391 printk("Policy zone: %s\n", zone_names[policy_zone]);

3392 #endif

3393 }

3371行计算满足各个区最低要求的情况下，剩余物理内存的页框数。

3379～3382行，根据得到的剩余内存页框数，判断是否启用按可移动性（mobility）分组的页分类：当页框数少于pageblock_nr_pages * MIGRATE_TYPES时关闭该机制，否则开启。这个是Linux的一个反碎片机制，将剩余内存分类连接到各个空闲链表上。分类情况为：

38 #define MIGRATE_UNMOVABLE 0

39 #define MIGRATE_RECLAIMABLE 1

40 #define MIGRATE_MOVABLE 2

41 #define MIGRATE_PCPTYPES 3 /* the number of types on the pcp lists */

42 #define MIGRATE_RESERVE 3

43 #define MIGRATE_ISOLATE 4 /* can't allocate from here */

44 #define MIGRATE_TYPES 5

注意：

1. 这一点和<Professional Linux Kernel Architecture>,Wolfgang Mauerer,Wiley Publishing.Inc.P166～168.所述的完全不同。我反复看代码确认在v3.3的Kernel中确实是这样的。^_^
2. 蓝色部分是2012/8/21号新加内容

搜索此博客

Linux Stuff

How Linux/ARM initialize fallback-zone-lists

评论

发表评论

此博客中的热门博文

Linux/ARM Page Table Entry 属性设置分析

由RFE指令引发的一串故事

提交了30次才AC －－－【附】POJ 2488解题报告