How Linux/ARM initialize fallback-zone-lists
本文按照《Linux内核源代码情景分析》的方式分析Linux/ARM如何初始化fallback-zone-lists,并按照缩进的方式组织函数调用. 没有说清楚的地方敬请指正.
zonelist准确说,应该是fallback zone list.其存在的意义:当内存管理单元在某个区分配可用内存页的时候,如果内存不足,则会在其他区进行分配。其他区可能不止一个,先后顺序就定义在某个node节点的node_zonelists成员变量上。并且,当某个节点的内存不足时,可以分配其他节点的内存。
在V2.6.23以及之前的内核中,每个node节点拥有(node's number* zones per_node)个fall back zonelists.其排序规则是:
1.对于UMA,假设其zone type有HIGHMEM、NORMAL、DMA,其节点的zone_lists的情况是:zone_lists[ZONE_DMA]={DMA,NULL},zone_lists[ZONE_NORMAL]={NORMAL,DMA,NULL},zone_lists[ZONE_HIGHMEM]={HIGHMEM,NORMAL,DMA}.
2.对于NUMA,节点有A、B、C、D四个,zone type有HIGHMEM、NORMAL、DMA,那么其节点的zone_lists为:
B.zone_lists[ZONE_HIGHMEM]={B.HIGHMEM,B.NORMAL,B.DMA,C.HIGHMEM,C.NORMAL,C.DMA,A.HIGHMEM,A.NORMAL,A.DMA}
这一排序规则在《Professional Linux Kernel Architecture》中有非常准确的描述。
但是,在后续的Kernel中,将每个节点拥有的多个zone_lists合并为两个(对于NUMA是2个,UMA是1个),其相应的patch以及对应的讨论在:https://lkml.org/lkml/2007/8/8/302
本文的代码分析是针对于V3.3的Kernel Code.
start_kernel->build_all_zonelists
3349 /*
3350 * Called with zonelists_mutex held always
3351 * unless system_state == SYSTEM_BOOTING.
3352 */
3353 void __ref build_all_zonelists(void *data)
3354 {
3355 set_zonelist_order();
3356
3357 if (system_state == SYSTEM_BOOTING) {
3358 __build_all_zonelists(NULL);
3359 mminit_verify_zonelist();
3360 cpuset_init_current_mems_allowed();
3355行,设置zonelist的排序规则。当在分配页框的时候,如果要分配区域中的页框数量不足,则从候补区中查找页。候补区的排队顺序依照current_zonelist_order设置的值。不同的值代表不同的意义。如下:
start_kernel->build_all_zonelists->set_zonelist_order:
3226 static void set_zonelist_order(void)
3227 {
3228 current_zonelist_order = ZONELIST_ORDER_ZONE;
3229 }
ZONELIST_ORDER_ZONE代表将来zonelist的排序规则。不过,对于UMA来说,ZONELIST_ORDER_NODE和ZONELIST_ORDER_ZONE是一样的。
2827 /*
2828  * zonelist_order:
2829  * 0 = automatic detection of better ordering.
2830  * 1 = order by ([node] distance, -zonetype)
2831  * 2 = order by (-zonetype, [node] distance)
2832  *
2833  * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
2834  * the same zonelist. So only NUMA can configure this param.
2835  */
2836 #define ZONELIST_ORDER_DEFAULT 0
2837 #define ZONELIST_ORDER_NODE 1
2838 #define ZONELIST_ORDER_ZONE 2
3357行,system_state的默认值是0.由于system_state是个枚举变量,因此其值为SYSTEM_BOOTING.
108 enum system_states system_state __read_mostly;
109 EXPORT_SYMBOL(system_state);
3358行,建立zonelists.每个zone_type对应于pgdata->node_zonelists[]数组的一个元素。每个元素由有一个指针数组(下面代码中的_zonerefs[])构成。先来看下相关数据结构,再看处理过程:
603 struct zonelist {
604     struct zonelist_cache *zlcache_ptr;  // NULL or &zlcache
605     struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
606 #ifdef CONFIG_NUMA
607 struct zonelist_cache zlcache; // optional ...
608 #endif
609 };
636 typedef struct pglist_data {
637 struct zone node_zones[MAX_NR_ZONES];
638 struct zonelist node_zonelists[MAX_ZONELISTS];
...
668}
对于UMA体系结构,
573 #define MAX_ZONELISTS 1
对于NUMA体系结构,
498 /*
499  * The NUMA zonelists are doubled because we need zonelists that restrict the
500  * allocations to a single node for GFP_THISNODE.
501  *
502  * [0] : Zonelist with fallback
503  * [1] : No fallback (GFP_THISNODE)
504  */
505 #define MAX_ZONELISTS 2
493 /* Maximum number of zones on a zonelist */
494 #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
start_kernel->build_all_zonelists->__build_all_zonelists
3301 static __init_refok int __build_all_zonelists(void *data)
3302 {
3303 int nid;
3304 int cpu;
3305
3306 #ifdef CONFIG_NUMA
3307 memset(node_load, 0, sizeof(node_load));
3308 #endif
3309 for_each_online_node(nid) {
3310 pg_data_t *pgdat = NODE_DATA(nid);
3311
3312 build_zonelists(pgdat);
3313 build_zonelist_cache(pgdat);
3314 }
3315
3316 	/*
3317 	 * Initialize the boot_pagesets that are going to be used
3318 	 * for bootstrapping processors. The real pagesets for
3319 	 * each zone will be allocated later when the per cpu
3320 	 * allocator is available.
3321 	 *
3322 	 * boot_pagesets are used also for bootstrapping offline
3323 	 * cpus if the system is already booted because the pagesets
3324 	 * are needed to initialize allocators on a specific cpu too.
3325 	 * F.e. the percpu allocator needs the page allocator which
3326 	 * needs the percpu allocator in order to allocate its pagesets
3327 	 * (a chicken-egg dilemma).
3328 	 */
3329 for_each_possible_cpu(cpu) {
3330 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
3331
3332 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
3333 		/*
3334 		 * We now know the "local memory node" for each node--
3335 		 * i.e., the node of the first zone in the generic zonelist.
3336 		 * Set up numa_mem percpu variable for on-line cpus. During
3337 		 * boot, only the boot cpu should be on-line; we'll init the
3338 		 * secondary cpus' numa_mem as they come on-line. During
3339 		 * node/memory hotplug, we'll fixup all on-line cpus.
3340 		 */
3341 if (cpu_online(cpu))
3342 set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
3343 #endif
3344 }
3345
3346 return 0;
3347 }
看3312行,
start_kernel->build_all_zonelists->__build_all_zonelists->build_zonelists:
3231 static void build_zonelists(pg_data_t *pgdat)
3232 {
3233 int node, local_node;
3234 enum zone_type j;
3235 struct zonelist *zonelist;
3236
3237 local_node = pgdat->node_id;
3238
3239 zonelist = &pgdat->node_zonelists[0];
3240 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
3241
3242 	/*
3243 	 * Now we build the zonelist so that it contains the zones
3244 	 * of all the other nodes.
3245 	 * We don't want to pressure a particular node, so when
3246 	 * building the zones for node N, we make sure that the
3247 	 * zones coming right after the local ones are those from
3248 	 * node N+1 (modulo N)
3249 	 */
3250 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3251 if (!node_online(node))
3252 continue;
3253 		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3254 							MAX_NR_ZONES - 1);
3255 }
3256 for (node = 0; node < local_node; node++) {
3257 if (!node_online(node))
3258 continue;
3259 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
3260 MAX_NR_ZONES - 1);
3261 }
3262
3263 zonelist->_zonerefs[j].zone = NULL;
3264 zonelist->_zonerefs[j].zone_idx = 0;
3265 }
该函数实现的结果是:
如果是UMA,则:zone_lists[0]={ZONE_HIGHMEM,ZONE_NORMAL,ZONE_DMA,NULL}。
如果是NUMA,对于多个节点。A,B,C则:
B.zone_lists[0]={B.ZONE_HIGHMEM,B.ZONE_NORMAL,B.ZONE_DMA,C.ZONE_HIGHMEM,C.ZONE_NORMAL,C.ZONE_DMA,A.ZONE_HIGHMEM,A.ZONE_NORMAL,A.ZONE_DMA,NULL}(NULL表示结束,3263~3264行)[1]
看3313行,
start_kernel->build_all_zonelists->__build_all_zonelists->build_zonelist_cache
3267 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
3268 static void build_zonelist_cache(pg_data_t *pgdat)
3269 {
3270 pgdat->node_zonelists[0].zlcache_ptr = NULL;
3271 }
3329~3330行,涉及到冷热页表,具体查看:物理内存管理之冷热页。
3359行,依赖于CONFIG_DEBUG_MEMORY_INIT。
3360行,依赖于CONFIG_CPUSETS,此又依赖于Control Group Support。
3361 	} else {
3362 		/* we have to stop all cpus to guarantee there is no user
3363 		   of zonelist */
3364 #ifdef CONFIG_MEMORY_HOTPLUG
3365 		if (data)
3366 			setup_zone_pageset((struct zone *)data);
3367 #endif
3368 		stop_machine(__build_all_zonelists, NULL, NULL);
3369 		/* cpuset refresh routine should be here */
3370 	}
3371 	vm_total_pages = nr_free_pagecache_pages();
3372 	/*
3373 	 * Disable grouping by mobility if the number of pages in the
3374 	 * system is too low to allow the mechanism to work. It would be
3375 	 * more accurate, but expensive to check per-zone. This check is
3376 	 * made on memory-hotadd so a system can start with mobility
3377 	 * disabled and enable it later
3378 	 */
3379 	if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
3380 		page_group_by_mobility_disabled = 1;
3381 	else
3382 		page_group_by_mobility_disabled = 0;
3383
3384 	printk("Built %i zonelists in %s order, mobility grouping %s.  "
3385 		"Total pages: %ld\n",
3386 			nr_online_nodes,
3387 			zonelist_order_name[current_zonelist_order],
3388 			page_group_by_mobility_disabled ? "off" : "on",
3389 			vm_total_pages);
3390 #ifdef CONFIG_NUMA
3391 	printk("Policy zone: %s\n", zone_names[policy_zone]);
3392 #endif
3393 }
3371行计算满足各个区最低要求的情况下,剩余物理内存的页框数。
3379~3382行,根据得到的剩余内存页框数,判断是否启用MOVABLE类型的页分类。这是Linux的一个反页碎片(anti-fragmentation)机制,将空闲内存按迁移类型分类链接到各个空闲链表上。分类情况为:
38 #define MIGRATE_UNMOVABLE 0
39 #define MIGRATE_RECLAIMABLE 1
40 #define MIGRATE_MOVABLE 2
41 #define MIGRATE_PCPTYPES 3 /* the number of types on the pcp lists */
42 #define MIGRATE_RESERVE 3
43 #define MIGRATE_ISOLATE 4 /* can't allocate from here */
44 #define MIGRATE_TYPES 5
注意:
1. 这一点和《Professional Linux Kernel Architecture》(Wolfgang Mauerer, Wiley Publishing Inc., P166~168)所述的完全不同。我反复看代码确认在v3.3的Kernel中确实是这样的。^_^
2. 蓝色部分是2012/8/21号新加内容
评论
发表评论