这里只说linux内核的物理内存管理。看了很多讲解内存的资料,但是自己总结的时候总感觉无从下手,这里就从实际的物理内存分配接口开始吧。
kmalloc 分配的是物理上连续的内存空间,它不负责把分配的内存空间清零。它能分配多大的呢?上限是KMALLOC_MAX_SIZE(见下文kmalloc_sizes.h中的CACHE列表)。并且它只能分配低端内存:默认从ZONE_NORMAL分配,带GFP_DMA标志时从ZONE_DMA分配,不能分配high里的。一般情况下内存被分为三个zone:NORMAL、DMA、HIGH.
这个函数是建立在slab分配器的基础上的:通过cache,而cache又通过slab分配obj。 在开始分析kmalloc函数之前,我们需要说明一下linux内核物理内存的分配函数API: __get_free_pages会调用alloc_pages,它的特点是不能从HIGHMEM分配内存,分配2的幂个连续物理页面。它有简化模式(只分配一个page)__get_free_page,而get_zeroed_page接口分配的页面内容会被填充为0。从dma分配可以调用__get_dma_pages(它本质上也是调用__get_free_pages)。而终极接口alloc_pages可以从任何zone里申请内存,当然前提是设置对应的flags。 参考内核:linux3.18.13 参考书籍:《linux内核设计与实现》《linux设备驱动程序》《深入理解linux设备驱动内核机制》 下面我们就说说kmalloc(关于分配时候的flags这里不讨论,具体可以参考资料)。我们先看头文件:
#include <linux/slab.h>
而关于它的具体实现我们看slab.h中
- #ifdef CONFIG_SLUB
- #include <linux/slub_def.h>
- #elif defined(CONFIG_SLOB)
- #include <linux/slob_def.h>
- #else
- #include <linux/slab_def.h>
- #endif
一般系统默认#include <linux/slab_def.h>,其中kmalloc的实现如下:
- static __always_inline void *kmalloc(size_t size, gfp_t flags)
- {
- struct kmem_cache *cachep;
- void *ret;
-
- if (__builtin_constant_p(size)) {
- int i = 0;
-
- if (!size)
- return ZERO_SIZE_PTR;
-
- #define CACHE(x) \
- if (size <= x) \
- goto found; \
- else \
- i++;
- #include <linux/kmalloc_sizes.h> //这里查询申请的size落在哪个范围:从32起大致按2倍递增,每比较一次i加1。
- #undef CACHE
- return NULL;
- found:
- #ifdef CONFIG_ZONE_DMA
- if (flags & GFP_DMA)
- cachep = malloc_sizes[i].cs_dmacachep; //很明显如果定义了dma,并且设置了dma标志则优先从dma cache里申请。malloc_sizes的初始化在slab.c里。可以具体分析一下。
- else
- #endif
- cachep = malloc_sizes[i].cs_cachep; //从指定的cache链表分配内存,不浪费空间。
-
- ret = kmem_cache_alloc_trace(cachep, flags, size);
-
- return ret;
- }
- return __kmalloc(size, flags);
- }
这里可以补充一下kmalloc_sizes.h的代码:
- #if (PAGE_SIZE == 4096)
- CACHE(32)
- #endif
- CACHE(64)
- #if L1_CACHE_BYTES < 64
- CACHE(96)
- #endif
- CACHE(128)
- #if L1_CACHE_BYTES < 128
- CACHE(192)
- #endif
- CACHE(256)
- CACHE(512)
- CACHE(1024)
- CACHE(2048)
- CACHE(4096)
- CACHE(8192)
- CACHE(16384)
- CACHE(32768)
- CACHE(65536)
- CACHE(131072)
- #if KMALLOC_MAX_SIZE >= 262144
- CACHE(262144)
- #endif
- #if KMALLOC_MAX_SIZE >= 524288
- CACHE(524288)
- #endif
- #if KMALLOC_MAX_SIZE >= 1048576
- CACHE(1048576)
- #endif
- #if KMALLOC_MAX_SIZE >= 2097152
- CACHE(2097152)
- #endif
- #if KMALLOC_MAX_SIZE >= 4194304
- CACHE(4194304)
- #endif
- #if KMALLOC_MAX_SIZE >= 8388608
- CACHE(8388608)
- #endif
- #if KMALLOC_MAX_SIZE >= 16777216
- CACHE(16777216)
- #endif
- #if KMALLOC_MAX_SIZE >= 33554432
- CACHE(33554432)
- #endif
我们看到函数开头需要说明一下:
__builtin_constant_p 是编译器gcc内置函数,用于判断一个值是否为编译时常量,如果是常数,函数返回1,否则返回0。此内置函数的典型用法是在宏中用于手动编译时优化。显然如果size不是编译时常数,则直接用__kmalloc(size, flags);申请内存;如果size是常数,则在编译时就通过kmalloc_sizes.h中的CACHE列表确定下标i。
它查询需要分配的内存在哪个系统cache然后调用
- #ifdef CONFIG_TRACING
- extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
- #else
- static __always_inline void *
- kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
- {
- return kmem_cache_alloc(cachep, flags);
- }
- #endif
我们看具体代码:
- /**
- * kmem_cache_alloc - Allocate an object
- * @cachep: The cache to allocate from.
- * @flags: See kmalloc().
- *
- * Allocate an object from this cache. The flags are only relevant
- * if the cache has no available objects.
- */
- void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
- {
- void *ret = slab_alloc(cachep, flags, _RET_IP_);
-
- trace_kmem_cache_alloc(_RET_IP_, ret, // 跟踪调试会用到
- cachep->object_size, cachep->size, flags);
-
- return ret;
- }
它实际的分配是slab_alloc:
- static __always_inline void *
- slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
- {
- unsigned long save_flags;
- void *objp;
-
- flags &= gfp_allowed_mask; // 说明在gfp.h中 ,如下
- /*
- * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
- * GFP flags are used before interrupts are enabled. Once interrupts are
- * enabled, it is set to __GFP_BITS_MASK while the system is running. During
- * hibernation, it is used by PM to avoid I/O during memory allocation while
- * devices are suspended.
- */
- extern gfp_t gfp_allowed_mask;
-
- lockdep_trace_alloc(flags); // 调试用
-
- if (slab_should_failslab(cachep, flags))
- return NULL;
-
- cachep = memcg_kmem_get_cache(cachep, flags);
-
- cache_alloc_debugcheck_before(cachep, flags);
- local_irq_save(save_flags);
- objp = __do_cache_alloc(cachep, flags);
- local_irq_restore(save_flags);
- objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
- kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
- flags);
- prefetchw(objp);
-
- if (likely(objp))
- kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
-
- if (unlikely((flags & __GFP_ZERO) && objp))
- memset(objp, 0, cachep->object_size);
-
- return objp;
- }
它调用objp = __do_cache_alloc(cachep, flags); 除了检查一些标志等继续调用
____cache_alloc(cachep, flags);
它是一个统一的接口 (有检测numa和uma ,linux默认是uma 除非指定了numa)
- static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
- {
- void *objp;
- struct array_cache *ac;
- bool force_refill = false;
-
- check_irq_off();
-
- ac = cpu_cache_get(cachep);
- if (likely(ac->avail)) {
- ac->touched = 1;
- objp = ac_get_obj(cachep, ac, flags, false);
-
- /*
- * Allow for the possibility all avail objects are not allowed
- * by the current flags
- */
- if (objp) {
- STATS_INC_ALLOCHIT(cachep);
- goto out;
- }
- force_refill = true;
- }
-
- STATS_INC_ALLOCMISS(cachep);
- objp = cache_alloc_refill(cachep, flags, force_refill);
- /*
- * the 'ac' may be updated by cache_alloc_refill(),
- * and kmemleak_erase() requires its correct value.
- */
- ac = cpu_cache_get(cachep);
-
- out:
- /*
- * To avoid a false negative, if an object that is in one of the
- * per-CPU caches is leaked, we need to make sure kmemleak doesn't
- * treat the array pointers as a reference to the object.
- */
- if (objp)
- kmemleak_erase(&ac->entry[ac->avail]);
- return objp;
- }
这里我们假定是第一次使用分配内存,那么根据在kmem_cache_init中对malloc_sizes[]的初始化,在kmalloc的时候返回的kmalloc_cache指针指向的cache中用到这样一个函数:
- static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
- {
- if (slab_state >= FULL)
- return enable_cpucache(cachep, gfp);
-
- if (slab_state == DOWN) {
- /*
- * Note: Creation of first cache (kmem_cache).
- * The setup_list3s is taken care
- * of by the caller of __kmem_cache_create
- */
- cachep->array[smp_processor_id()] = &initarray_generic.cache;
- slab_state = PARTIAL;
- } else if (slab_state == PARTIAL) {
- /*
- * Note: the second kmem_cache_create must create the cache
- * that's used by kmalloc(24), otherwise the creation of
- * further caches will BUG().
- */
- cachep->array[smp_processor_id()] = &initarray_generic.cache;
-
- /*
- * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
- * the second cache, then we need to set up all its list3s,
- * otherwise the creation of further caches will BUG().
- */
- set_up_list3s(cachep, SIZE_AC);
- if (INDEX_AC == INDEX_L3)
- slab_state = PARTIAL_L3;
- else
- slab_state = PARTIAL_ARRAYCACHE;
- } else {
- /* Remaining boot caches */
- cachep->array[smp_processor_id()] =
- kmalloc(sizeof(struct arraycache_init), gfp);
-
- if (slab_state == PARTIAL_ARRAYCACHE) {
- set_up_list3s(cachep, SIZE_L3);
- slab_state = PARTIAL_L3;
- } else {
- int node;
- for_each_online_node(node) {
- cachep->nodelists[node] =
- kmalloc_node(sizeof(struct kmem_list3),
- gfp, node);
- BUG_ON(!cachep->nodelists[node]);
- kmem_list3_init(cachep->nodelists[node]);
- }
- }
- }
- cachep->nodelists[numa_mem_id()]->next_reap =
- jiffies + REAPTIMEOUT_LIST3 +
- ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
- cpu_cache_get(cachep)->avail = 0;
- cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; // 1
- cpu_cache_get(cachep)->batchcount = 1;
- cpu_cache_get(cachep)->touched = 0;
- cachep->batchcount = 1;
- cachep->limit = BOOT_CPUCACHE_ENTRIES;
- return 0;
- }
我们知道不论array被赋了什么值,最后都要初始化avail等值.
所以如果array不可用(avail为0),那么就会调用cache_alloc_refill;当然如果array可用,那么直接返回申请到的obj的内存指针.
- static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
- bool force_refill)
- {
- int batchcount;
- struct kmem_list3 *l3;
- struct array_cache *ac;
- int node;
-
- check_irq_off();
- node = numa_mem_id();
- if (unlikely(force_refill))
- goto force_grow;
- retry:
- ac = cpu_cache_get(cachep);
- batchcount = ac->batchcount;
- if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
- /*
- * If there was little recent activity on this cache, then
- * perform only a partial refill. Otherwise we could generate
- * refill bouncing.
- */
- batchcount = BATCHREFILL_LIMIT;
- }
- l3 = cachep->nodelists[node];
-
- BUG_ON(ac->avail > 0 || !l3);
- spin_lock(&l3->list_lock);
-
- /* See if we can refill from the shared array */
- if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
- l3->shared->touched = 1;
- goto alloc_done;
- }
-
- while (batchcount > 0) {
- struct list_head *entry;
- struct slab *slabp;
- /* Get slab alloc is to come from. */
- entry = l3->slabs_partial.next;
- if (entry == &l3->slabs_partial) {
- l3->free_touched = 1;
- entry = l3->slabs_free.next;
- if (entry == &l3->slabs_free)
- goto must_grow;
- }
-
- slabp = list_entry(entry, struct slab, list);
- check_slabp(cachep, slabp);
- check_spinlock_acquired(cachep);
-
- /*
- * The slab was either on partial or free list so
- * there must be at least one object available for
- * allocation.
- */
- BUG_ON(slabp->inuse >= cachep->num);
-
- while (slabp->inuse < cachep->num && batchcount--) {
- STATS_INC_ALLOCED(cachep);
- STATS_INC_ACTIVE(cachep);
- STATS_SET_HIGH(cachep);
-
- ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
- node));
- }
- check_slabp(cachep, slabp);
-
- /* move slabp to correct slabp list: */
- list_del(&slabp->list);
- if (slabp->free == BUFCTL_END)
- list_add(&slabp->list, &l3->slabs_full);
- else
- list_add(&slabp->list, &l3->slabs_partial);
- }
-
- must_grow:
- l3->free_objects -= ac->avail;
- alloc_done:
- spin_unlock(&l3->list_lock);
-
- if (unlikely(!ac->avail)) {
- int x;
- force_grow:
- x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); // grow成功返回 1
-
- /* cache_grow can reenable interrupts, then ac could change. */
- ac = cpu_cache_get(cachep);
- node = numa_mem_id();
-
- /* no objects in sight? abort */
- if (!x && (ac->avail == 0 || force_refill))
- return NULL;
-
- if (!ac->avail) /* objects refilled by interrupt? */
- goto retry;
- }
- ac->touched = 1;
-
- return ac_get_obj(cachep, ac, flags, force_refill);
- }
由于第一次使用nodelist上slab链表都为空,所以must_grow
它调用cache_grow,这个函数首先计算了slab着色处理。然后调用kmem_getpages申请page,大小根据cache->gfporder,它返回申请pages的虚拟地址.
- /*
- * Grow (by 1) the number of slabs within a cache. This is called by
- * kmem_cache_alloc() when there are no active objs left in a cache.
- */
- static int cache_grow(struct kmem_cache *cachep,
- gfp_t flags, int nodeid, void *objp)
- {
- struct slab *slabp;
- size_t offset;
- gfp_t local_flags;
- struct kmem_list3 *l3;
-
- /*
- * Be lazy and only check for valid flags here, keeping it out of the
- * critical path in kmem_cache_alloc().
- */
- BUG_ON(flags & GFP_SLAB_BUG_MASK);
- local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
-
- /* Take the l3 list lock to change the colour_next on this node */
- check_irq_off();
- l3 = cachep->nodelists[nodeid];
- spin_lock(&l3->list_lock);
-
- /* Get colour for the slab, and cal the next value. */
- offset = l3->colour_next; // default 0
- l3->colour_next++;
- if (l3->colour_next >= cachep->colour)
- l3->colour_next = 0;
- spin_unlock(&l3->list_lock);
-
- offset *= cachep->colour_off; // first time ,offset is 0 ;
-
- if (local_flags & __GFP_WAIT)
- local_irq_enable();
-
- /*
- * The test for missing atomic flag is performed here, rather than
- * the more obvious place, simply to reduce the critical path length
- * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
- * will eventually be caught here (where it matters).
- */
- kmem_flagcheck(cachep, flags);
-
- /*
- * Get mem for the objs. Attempt to allocate a physical page from
- * 'nodeid'.
- */
- if (!objp)
- objp = kmem_getpages(cachep, local_flags, nodeid);
- if (!objp)
- goto failed;
-
- /* Get slab management. */
- slabp = alloc_slabmgmt(cachep, objp, offset,
- local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
- if (!slabp)
- goto opps1;
-
- slab_map_pages(cachep, slabp, objp);
-
- cache_init_objs(cachep, slabp);
-
- if (local_flags & __GFP_WAIT)
- local_irq_disable();
- check_irq_off();
- spin_lock(&l3->list_lock);
-
- /* Make slab active. */
- list_add_tail(&slabp->list, &(l3->slabs_free)); // 把新申请的slab添加到nodelist的slabs_free链表。
- STATS_INC_GROWN(cachep);
- l3->free_objects += cachep->num; //初始化可用的对象即每个slab可以包含的obj数目
- spin_unlock(&l3->list_lock);
- return 1;
- opps1:
- kmem_freepages(cachep, objp);
- failed:
- if (local_flags & __GFP_WAIT)
- local_irq_disable();
- return 0;
- }
而slab着色跟硬件高速缓存有关,目的是尽量避免缓存行冲突导致的不命中,提高效率(cache line问题)。可以参考《深入理解计算机系统》。
具体操作见:
- /*
- * Get the memory for a slab management obj.
- * For a slab cache when the slab descriptor is off-slab, slab descriptors
- * always come from malloc_sizes caches. The slab descriptor cannot
- * come from the same cache which is getting created because,
- * when we are searching for an appropriate cache for these
- * descriptors in kmem_cache_create, we search through the malloc_sizes array.
- * If we are creating a malloc_sizes cache here it would not be visible to
- * kmem_find_general_cachep till the initialization is complete.
- * Hence we cannot have slabp_cache same as the original cache.
- */
- static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
- int colour_off, gfp_t local_flags,
- int nodeid)
- {
- struct slab *slabp;
-
- if (OFF_SLAB(cachep)) {
- // 关于OFF_SLAB问题 可以看代码:
-
- CFLGS_OFF_SLAB 在__kmem_cache_create
- /*
- * Determine if the slab management is 'on' or 'off' slab.
- * (bootstrapping cannot cope with offslab caches so don't do
- * it too early on. Always use on-slab management when
- * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
- */
- if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
- !(flags & SLAB_NOLEAKTRACE))
- /*
- * Size is large, assume best to place the slab management obj
- * off-slab (should allow better packing of objs).
- */
- flags |= CFLGS_OFF_SLAB;
-
- /* Slab management obj is off-slab. */
- slabp = kmem_cache_alloc_node(cachep->slabp_cache,
- local_flags, nodeid);
- /*
- * If the first object in the slab is leaked (it's allocated
- * but no one has a reference to it), we want to make sure
- * kmemleak does not treat the ->s_mem pointer as a reference
- * to the object. Otherwise we will not report the leak.
- */
- kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
- local_flags);
- if (!slabp)
- return NULL;
- } else {
- slabp = objp + colour_off; // 在__kmem_cache_create中cachep->colour_off = cache_line_size();
- // 在cache.h中#define cache_line_size() L1_CACHE_BYTES; 一般为32B 大小.
- // cachep->colour = left_over / cachep->colour_off;
- colour_off += cachep->slab_size;
- }
- slabp->inuse = 0; // num of objs active in slab
- slabp->colouroff = colour_off; //第一个obj相对page地址的偏移
- slabp->s_mem = objp + colour_off; //第一个obj的地址
- slabp->nodeid = nodeid;
- slabp->free = 0;
- return slabp;
- }
我们看看另外一个很重要的操作:
- static void cache_init_objs(struct kmem_cache *cachep,
- struct slab *slabp)
- {
- int i;
-
- for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, slabp, i);
- #if DEBUG
- /* need to poison the objs? */
- if (cachep->flags & SLAB_POISON)
- poison_obj(cachep, objp, POISON_FREE);
- if (cachep->flags & SLAB_STORE_USER)
- *dbg_userword(cachep, objp) = NULL;
-
- if (cachep->flags & SLAB_RED_ZONE) {
- *dbg_redzone1(cachep, objp) = RED_INACTIVE;
- *dbg_redzone2(cachep, objp) = RED_INACTIVE;
- }
- /*
- * Constructors are not allowed to allocate memory from the same
- * cache which they are a constructor for. Otherwise, deadlock.
- * They must also be threaded.
- */
- if (cachep->ctor && !(cachep->flags & SLAB_POISON))
- cachep->ctor(objp + obj_offset(cachep));
-
- if (cachep->flags & SLAB_RED_ZONE) {
- if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " end of an object");
- if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
- slab_error(cachep, "constructor overwrote the"
- " start of an object");
- }
- if ((cachep->size % PAGE_SIZE) == 0 &&
- OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
- kernel_map_pages(virt_to_page(objp),
- cachep->size / PAGE_SIZE, 0);
- #else
- if (cachep->ctor)
- cachep->ctor(objp); // 根据构造函数初始化对象
- #endif
- slab_bufctl(slabp)[i] = i + 1; // init bufctl数组 1、2、3、4 ..... 最后一个设置成为BUFCTL_END
- }
- slab_bufctl(slabp)[i - 1] = BUFCTL_END;
- }