Python内存管理与垃圾回收机制

内存管理

Python解释器由c语言开发完成,py中所有的操作最终都由底层的c语言来实现并完成

  1. 两个重要结构体:
// include/object.h
#define _PyObject_HEAD_EXTRA            \ //前后指针,用于构造双向队列
    struct _object *_ob_next;           \
    struct _object *_ob_prev;
// 宏定义     
#define PyObject_HEAD       PyObject ob_base;
#define PyObject_VAR_HEAD      PyVarObject ob_base;

typedef struct _object {
    _PyObject_HEAD_EXTRA // 用于构造双向链表
    Py_ssize_t ob_refcnt;  // 引用计数器
    struct _typeobject *ob_type;    // 数据类型
} PyObject;
 
 
typedef struct {
    PyObject ob_base;   // PyObject对象,包含3个元素
    Py_ssize_t ob_size; /* Number of items in variable part,即:元素个数 */
} PyVarObject;

Python中所有类型创建对象时,底层都是与PyObject和PyVarObject结构体实现,

一般情况下由单个元素组成对象内部会使用PyObject结构体(float)、

由多个元素组成的对象内部会使用PyVarObject结构体(str/int/list/dict/tuple/set/自定义类),

因为由多个元素组成的话是需要为其维护一个 ob_size(内部元素个数)

//include/floatobject.h
typedef struct {
    PyObject_HEAD
    double ob_fval;
} PyFloatObject;
//include/longobject.h
// longintrepr.h
struct _longobject {
    PyObject_VAR_HEAD
    digit ob_digit[1];
};
// longobject.h
/* Long (arbitrary precision) integer object interface */
typedef struct _longobject PyLongObject; /* Revealed in longintrepr.h */
/*
1. python3中没有long类型,只有int类型,但py3内部的int是基于long实现。
2. python3中对int/long长度没有限制,因其内部不是用long存储而是使用类似于“字符串”存储。
*/
//include/bytesobject.h
typedef struct {
    PyObject_VAR_HEAD
    Py_hash_t ob_shash;
    char ob_sval[1];
    /* Invariants:
     *     ob_sval contains space for 'ob_size+1' elements.
     *     ob_sval[ob_size] == 0.
     *     ob_shash is the hash of the string or -1 if not computed yet.
     */
} PyBytesObject;
//include/listobject.h
typedef struct {
    PyObject_VAR_HEAD
      
    /* Vector of pointers to list elements.  list[0] is ob_item[0], etc. */
    PyObject **ob_item;
  
    /* ob_item contains space for 'allocated' elements.  The number
     * currently in use is ob_size.
     * Invariants:
     *     0 <= ob_size <= allocated
     *     len(list) == ob_size
     *     ob_item == NULL implies ob_size == allocated == 0
     * list.sort() temporarily sets allocated to -1 to detect mutations.
     *
     * Items must normally not be NULL, except during construction when
     * the list is not yet visible outside the function that builds it.
     */
    Py_ssize_t allocated;
} PyListObject;
//include/tupleobject.h
typedef struct {
    PyObject_VAR_HEAD
    PyObject *ob_item[1];

    /* ob_item contains space for 'ob_size' elements.
     * Items must normally not be NULL, except during construction when
     * the tuple is not yet visible outside the function that builds it.
     */
} PyTupleObject;
//include/dictobject.h
typedef struct {
    PyObject_HEAD
    Py_ssize_t ma_used;
    PyDictKeysObject *ma_keys;
    PyObject **ma_values;
} PyDictObject;
//include/setobject.h
typedef struct {
    PyObject_HEAD

    Py_ssize_t fill;            /* Number active and dummy entries*/
    Py_ssize_t used;            /* Number active entries */

    /* The table contains mask + 1 slots, and that's a power of 2.
     * We store the mask instead of the size because the mask is more
     * frequently needed.
     */
    Py_ssize_t mask;

    /* The table points to a fixed-size smalltable for small tables
     * or to additional malloc'ed memory for bigger tables.
     * The table pointer is never NULL which saves us from repeated
     * runtime null-tests.
     */
    setentry *table;
    Py_hash_t hash;             /* Only used by frozenset objects */
    Py_ssize_t finger;          /* Search finger for pop() */

    setentry smalltable[PySet_MINSIZE];
    PyObject *weakreflist;      /* List of weak references */
} PySetObject;
//自定义类 include/object.h
typedef struct _typeobject {
    PyObject_VAR_HEAD
    const char *tp_name; /* For printing, in format "<module>.<name>" */
    Py_ssize_t tp_basicsize, tp_itemsize; /* For allocation */

    /* Methods to implement standard operations */
    ...
    
} PyTypeObject;

Python3只保留int类型,但此时的int就是Python2中的long类型

  1. 内存管理

创建float对象时代码执行逻辑:

/* Special free list
   free_list is a singly-linked list of available PyFloatObjects, linked
   via abuse of their ob_type members.
*/
static PyFloatObject *free_list = NULL;
static int numfree = 0;
  
PyObject *
PyFloat_FromDouble(double fval)
{
    PyFloatObject *op = free_list;
    /* 检查free_list是否为空。如果不为空,说明有可用的空闲对象,
    从列表中取出一个对象(通过修改free_list指针和减少numfree计数来实现),并跳过内存分配步骤 */
    if (op != NULL) {
        free_list = (PyFloatObject *) Py_TYPE(op);
        numfree--;
    } else {
          
        // 第一步:根据float类型大小,为float对象开辟内存。
        op = (PyFloatObject*) PyObject_MALLOC(sizeof(PyFloatObject));
        if (!op)
            return PyErr_NoMemory();
    }
  
    // 第二步:在为float对象开辟的内存中进行初始化。
    /* Inline PyObject_New */
    (void)PyObject_INIT(op, &PyFloat_Type);
      
    // 第三步:将值赋值到float对象开辟的内存中。
    op->ob_fval = fval;
  
    // 第四步:返回已经创建的float对象的内存地址(引用/指针)
    return (PyObject *) op;
} 

**第一步:**根据float类型所需的内存大小,为其开辟内存。

//Objects/obmalloc.c
static PyMemAllocatorEx _PyObject = {
#ifdef PYMALLOC_DEBUG
    &_PyMem_Debug.obj, PYDBG_FUNCS
#else
    NULL, PYOBJ_FUNCS
#endif
    };



void *
PyObject_Malloc(size_t size)
{
    /* see PyMem_RawMalloc() */
    if (size > (size_t)PY_SSIZE_T_MAX)
        return NULL;

    // 开辟内存
    return _PyObject.malloc(_PyObject.ctx, size);
}
//PyMemAllocatorEx的方法说明
Customize Memory Allocators
===========================

.. versionadded:: 3.4

.. c:type:: PyMemAllocatorEx

   Structure used to describe a memory block allocator. The structure has
   four fields:

   +----------------------------------------------------------+---------------------------------------+
   | Field                                                    | Meaning                               |
   +==========================================================+=======================================+
   | ``void *ctx``                                            | user context passed as first argument |
   +----------------------------------------------------------+---------------------------------------+
   | ``void* malloc(void *ctx, size_t size)``                 | allocate a memory block               |
   +----------------------------------------------------------+---------------------------------------+
   | ``void* calloc(void *ctx, size_t nelem, size_t elsize)`` | allocate a memory block initialized   |
   |                                                          | with zeros                            |
   +----------------------------------------------------------+---------------------------------------+
   | ``void* realloc(void *ctx, void *ptr, size_t new_size)`` | allocate or resize a memory block     |
   +----------------------------------------------------------+---------------------------------------+
   | ``void free(void *ctx, void *ptr)``                      | free a memory block                   |
   +----------------------------------------------------------+---------------------------------------+

   .. versionchanged:: 3.5
      The :c:type:`PyMemAllocator` structure was renamed to
      :c:type:`PyMemAllocatorEx` and a new ``calloc`` field was added.

**第二步:**对新开辟的内存中进行类型和引用的初始化

//include/objimpl.h
/* Macros trading binary compatibility for speed. See also pymem.h.
   Note that these macros expect non-NULL object pointers.*/
#define PyObject_INIT(op, typeobj) \
    ( Py_TYPE(op) = (typeobj), _Py_NewReference((PyObject *)(op)), (op) )
//Objects/object.c
/* Head of circular doubly-linked list of all objects.  These are linked
 * together via the _ob_prev and _ob_next members of a PyObject, which
 * exist only in a Py_TRACE_REFS build.
 */
static PyObject refchain = {&refchain, &refchain};

/* Insert op at the front of the list of all objects.  If force is true,
 * op is added even if _ob_prev and _ob_next are non-NULL already.  If
 * force is false amd _ob_prev or _ob_next are non-NULL, do nothing.
 * force should be true if and only if op points to freshly allocated,
 * uninitialized memory, or you've unlinked op from the list and are
 * relinking it into the front.
 * Note that objects are normally added to the list via _Py_NewReference,
 * which is called by PyObject_Init.  Not all objects are initialized that
 * way, though; exceptions include statically allocated type objects, and
 * statically allocated singletons (like Py_True and Py_None).
 */
void
_Py_AddToAllObjects(PyObject *op, int force)
{

    if (force || op->_ob_prev == NULL) {
        op->_ob_next = refchain._ob_next;
        op->_ob_prev = &refchain;
        refchain._ob_next->_ob_prev = op;
        refchain._ob_next = op;
    }
}

void
_Py_NewReference(PyObject *op)
{
    _Py_INC_REFTOTAL;

    // 对新开辟的内存中的的引用计数器初始化为1。
    op->ob_refcnt = 1;

    // 将新开辟的内存的指针添加到一个双向链表refchain中。
    _Py_AddToAllObjects(op, 1);

    _Py_INC_TPALLOCS(op);
}

float类型每次创建对象时都会把对象放到 refchain 的双向链表中

float对象引用:

在给对象创建新引用时,会对其引用计数器+1

//include/object.h
#define Py_INCREF(op) (                         \
    _Py_INC_REFTOTAL  _Py_REF_DEBUG_COMMA       \
    ((PyObject *)(op))->ob_refcnt++)

销毁float对象:

val = 3.14
del val
The macros Py_INCREF(op) and Py_DECREF(op) are used to increment or decrement
reference counts.  Py_DECREF calls the object's deallocator function when
the refcount falls to 0; for
objects that don't contain references to other objects or heap memory
this can be the standard function free().  Both macros can be used
wherever a void expression is allowed.  The argument must not be a
NULL pointer.  If it may be NULL, use Py_XINCREF/Py_XDECREF instead.
The macro _Py_NewReference(op) initialize reference counts to 1, and
in special builds (Py_REF_DEBUG, Py_TRACE_REFS) performs additional
bookkeeping appropriate to the special build.

//include/object.h 第一步
#define Py_DECREF(op)                                   \
    do {                                                \
        PyObject *_py_decref_tmp = (PyObject *)(op);    \
        if (_Py_DEC_REFTOTAL  _Py_REF_DEBUG_COMMA       \
        --(_py_decref_tmp)->ob_refcnt != 0)             \
            _Py_CHECK_REFCNT(_py_decref_tmp)            \
        else                                            \
        _Py_Dealloc(_py_decref_tmp);                    \
    } while (0)
//Objects/object.c 第二步
void _Py_Dealloc(PyObject *op)
{
    // 第一步:调用float类型的tp_dealloc,进行内存的销毁
    destructor dealloc = Py_TYPE(op)->tp_dealloc;

    // 第二步:在refchain双向链表中移除
    _Py_ForgetReference(op);

    (*dealloc)(op);
}

调用float类型的tp_dealloc进行内存的销毁。

按理此过程说应该直接将对象内存销毁,但float内部有缓存机制,所以他的执行流程是这样的:

  • float内部缓存的内存个数已经大于等于100,那么在执行del val的语句时,内存中就会直接删除此对象。
  • 未达到100时,那么执行 del val语句,不会真的在内存中销毁对象,而是将对象放到一个free_list的单链表中,以便以后的对象使用

总结:

float对象在创建对象时会把为其开辟内存并初始化引用计数器为1,然后将其加入到名为 refchain 的双向链表中;float对象在增加引用时,会执行 Py_INCREF在内部会让引用计数器+1;最后执行销毁float对象时,会先判断float内部free_list中缓存的个数,如果已达到300个,则直接在内存中销毁,否则不会真正销毁而是加入free_list单链表中,以后后续对象使用,销毁动作的最后再在refchain中移除即可

垃圾回收机制

引用计数器为主,标记清除和分代回收为辅

  1. 引用计数器:

每个对象内部都维护了一个值,该值记录这此对象被引用的次数,如果次数为0,则Python垃圾回收机制会自动清除此对象

//实例
import sys
 
# 在内存中创建一个字符串对象"武沛齐",对象引用计数器的值为:1
nick_name = '武沛齐'
 
# 应该输入2,实际输出2,因为getrefcount方法时把 nick_name 当做参数传递了,引发引用计数器+1,所以打印时值为:2
# 注意:getrefcount 函数执行完毕后,会自动-1,所以本质上引用计数器还是1.
print(sys.getrefcount(nick_name))
 
# 变量 real_name 也指向的字符串对象"武沛齐",即:引用计数器再 +1,所以值为:2
real_name = nick_name
 
# 应该输出2,实际输出3. 因为getrefcount方法时把 real_name 当做参数传递了,引发引用计数器+1,所以打印时值为:3
# 注意:getrefcount 函数执行完毕后,会自动-1,所以本质上引用计数器还是2.
print(sys.getrefcount(nick_name))
 
# 删除reald_name变量,并让其指向对象中的引用计数器-1
del real_name
 
# 应该输出1,实际输出2,因为getrefcount方法时把 real_name 当做参数传递了,引发引用计数器+1,所以打印时值为:2.
print(sys.getrefcount(nick_name))
 
// 实际运行时 Python 解释器内部对字符串的额外引用,会导致调用 sys.getrefcount 时,实际输出的值都比预期高
 
 
 
 
# ############ getrefcount 注释信息 ############
'''
def getrefcount(p_object): # real signature unknown; restored from __doc__
    """
    getrefcount(object) -> integer
     
    Return the reference count of object.  The count returned is generally
    one higher than you might expect, because it includes the (temporary)
    reference as an argument to getrefcount().
    """
    return 0
'''
  1. 循环引用问题:
import gc
import objgraph
class Foo(object):
    def __init__(self):
        self.data = None
# 在内存创建两个对象,即:引用计数器值都是1
obj1 = Foo()
obj2 = Foo()
# 两个对象循环引用,导致内存中对象的应用+1,即:引用计数器值都是2
obj1.data = obj2
obj2.data = obj1
# 删除变量,并将引用计数器-1。
del obj1
del obj2
# 关闭垃圾回收机制,因为python的垃圾回收机制是:引用计数器、标记清除、分代回收 配合已解决循环引用的问题,关闭他便于之后查询内存中未被释放对象。
gc.disable()
 
# 至此,由于循环引用导致内存中创建的obj1和obj2两个对象引用计数器不为0,无法被垃圾回收机制回收。
# 所以,内存中Foo类的对象就还显示有2个。
print(objgraph.count('Foo'))

# gc.collect() 可以主动触发垃圾回收; 
  1. 标记清除&分代回收

针对 lists, tuples, instances, classes, dictionaries, and functions 类型,每创建一个对象都会将对象放到一个双向链表中,每个对象中都有 _ob_next 和 _ob_prev 指针,用于挂靠到链表中

/* Nothing is actually declared to be a PyObject, but every pointer to
 * a Python object can be cast to a PyObject*.  This is inheritance built
 * by hand.  Similarly every pointer to a variable-size Python object can,
 * in addition, be cast to PyVarObject*.
 */
typedef struct _object {
    _PyObject_HEAD_EXTRA # 双向链表
    Py_ssize_t ob_refcnt;
    struct _typeobject *ob_type;
} PyObject;
 
typedef struct {
    PyObject ob_base;
    Py_ssize_t ob_size; /* Number of items in variable part */
} PyVarObject;
 
 
/* Define pointers to support a doubly-linked list of all live heap objects. */
#define _PyObject_HEAD_EXTRA            \
    struct _object *_ob_next;           \
    struct _object *_ob_prev;

随着对象的创建,该双向链表上的对象会越来越多。

当对象个数超过 700个 时,Python解释器就会进行垃圾回收。

当代码中主动执行gc.collect()命令时,Python解释器就会进行垃圾回收。

import gc
gc.collect()

标记-清除(Mark-and-Sweep)

标记-清除是一种直接的垃圾回收算法,其工作原理分为两个主要阶段:

标记(Mark)阶段:

从根集合(root set)开始,这包括全局变量、局部变量、活跃线程的栈等。
遍历这些根对象,并标记所有从根可达的对象(即,如果从一个根对象出发,通过一系列引用可以到达的对象)。
递归地标记所有从已标记对象可达的其他对象。

清除(Sweep)阶段:

遍历堆中的所有对象。
对于那些未被标记的对象(即,从根集合不可达的对象),认为是垃圾。
回收这些垃圾对象的内存空间。

分代回收(Generational Collection)

分代回收是一种优化技术,它基于这样一个观察:大多数对象很快变得不可达(即,成为垃圾),而只有少数对象存活时间较长。因此,将对象分为不同的“代”(generation),并根据这些代的特性来调整垃圾回收的策略,可以显著提高垃圾回收的效率。

在 Python 中,对象被分为三代(通常是 0、1、2 代,但具体实现可能有所不同):

0 代:新创建的对象被分配到 0 代。
1 代:经过一次垃圾回收后仍然存活的对象被提升到 1 代。
2 代:再经过一次垃圾回收后仍然存活的对象被提升到 2 代。

分代回收的策略通常包括:

更频繁地检查年轻代:因为年轻代中的对象更容易成为垃圾,所以更频繁地对它们进行垃圾回收可以提高效率。
更少地检查老年代:老年代中的对象存活时间较长,因此不需要那么频繁地检查。
使用不同的垃圾回收算法:对于不同的代,可能会使用不同的垃圾回收算法或策略。例如,对于年轻代,可能会使用更高效的算法,如复制算法(Copying GC),而对于老年代,则可能使用标记-清除算法或分段标记-清除算法(Segregated Fit)。
Python 中的实现
Python 的垃圾回收机制是自动的,用户通常不需要直接干预。但是,Python 提供了几个与垃圾回收相关的函数和设置,允许用户在一定程度上控制或查询垃圾回收的行为。例如,gc.collect() 函数可以手动触发垃圾回收,gc.disable() 和 gc.enable() 函数可以禁用和启用自动垃圾回收。

Python解释器在垃圾回收时,会遍历链表中的每个对象,如果存在循环引用,就将存在循环引用的对象的引用计数器 -1,同时Python解释器也会将计数器等于0(可回收)和不等于0(不可回收)的一分为二,把计数器等于0的所有对象进行回收,把计数器不为0的对象放到另外一个双向链表表

# 默认情况下三个阈值为 (700,10,10) ,也可以主动去修改默认阈值。
import gc
 
gc.set_threshold(threshold0[, threshold1[, threshold2]])