Linux kernel 笔记 (7) ——内存地址类型

Linux中有以下几种内存地址类型:

(1)User virtual addresses
用户空间程序(user-space program)所看到和使用的地址。这些都是虚拟地址(virtual address),需要经过转化后,才能变成实际的内存物理地址。每个进程都拥有自己的虚拟地址空间。

(2)Physical addresses
实际的内存物理地址,也即是CPU用来访问物理内存的地址。

(3)Bus addresses
总线地址。外围设备所看到的物理内存地址。通常情况下,这个地址就是实际的内存物理地址。但是如果系统支持IOMMU的话,总线地址和实际的内存物理地址之间需要做一个映射。

(4)Kernel logic addresses
内核逻辑地址。内核逻辑地址会部分或全部地映射到物理内存地址,所以看起来就像实际的物理内存地址一样,并且这种映射是线性的,一对一的。在许多体系结构上,内核逻辑地址和物理内存地址只差一个常量偏移。内核虚拟地址的值通常存储在指针类型或unsigned long类型的变量里。kmalloc函数返回的就是内核逻辑地址。

(5)Kernel virtual addresses
内核虚拟地址。内核虚拟地址同内核逻辑地址相似的地方是都是把内核地址空间和实际物理内存的地址空间做映射,不同之处在于内核虚拟地址不要求这种映射是线性的,一对一的。内核逻辑地址都是内核虚拟地址,但是反过来则不成立。内核虚拟地址的值通常存储在指针类型变量里。vmallockmap返回的都是内核虚拟地址。

__pa(定义在<asm/page.h>)宏用来把内核逻辑地址映射为实际的内存物理地址;__va则用来把实际的内存物理地址映射为内核逻辑地址(仅限于low memory)。

Linux kernel IOMMU代码分析笔记(3)

看一下dmar_table_init这个函数:

int __init dmar_table_init(void)
{
    static int dmar_table_initialized;
    int ret;

    if (dmar_table_initialized == 0) {
        ret = parse_dmar_table();
        if (ret < 0) {
            if (ret != -ENODEV)
                pr_info("parse DMAR table failure.\n");
        } else  if (list_empty(&dmar_drhd_units)) {
            pr_info("No DMAR devices found\n");
            ret = -ENODEV;
        }

        if (ret < 0)
            dmar_table_initialized = ret;
        else
            dmar_table_initialized = 1;
    }

    return dmar_table_initialized < 0 ? dmar_table_initialized : 0;
}

dmar_table_init其实就是调用了parse_dmar_table这个函数来解析 DMAR ACPI tableparse_dmar_table代码如下:

static int __init
parse_dmar_table(void)
{
    struct acpi_table_dmar *dmar;
    struct acpi_dmar_header *entry_header;
    int ret = 0;
    int drhd_count = 0;

    /*
     * Do it again, earlier dmar_tbl mapping could be mapped with
     * fixed map.
     */
    dmar_table_detect();

    /*
     * ACPI tables may not be DMA protected by tboot, so use DMAR copy
     * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
     */
    dmar_tbl = tboot_get_dmar_table(dmar_tbl);

    dmar = (struct acpi_table_dmar *)dmar_tbl;
    if (!dmar)
        return -ENODEV;

    if (dmar->width < PAGE_SHIFT - 1) {
        pr_warn("Invalid DMAR haw\n");
        return -EINVAL;
    }

    pr_info("Host address width %d\n", dmar->width + 1);

    entry_header = (struct acpi_dmar_header *)(dmar + 1);
    while (((unsigned long)entry_header) <
            (((unsigned long)dmar) + dmar_tbl->length)) {
        /* Avoid looping forever on bad ACPI tables */
        if (entry_header->length == 0) {
            pr_warn("Invalid 0-length structure\n");
            ret = -EINVAL;
            break;
        }

        dmar_table_print_dmar_entry(entry_header);

        switch (entry_header->type) {
        case ACPI_DMAR_TYPE_HARDWARE_UNIT:
            drhd_count++;
            ret = dmar_parse_one_drhd(entry_header);
            break;
        case ACPI_DMAR_TYPE_RESERVED_MEMORY:
            ret = dmar_parse_one_rmrr(entry_header);
            break;
        case ACPI_DMAR_TYPE_ATSR:
            ret = dmar_parse_one_atsr(entry_header);
            break;
        case ACPI_DMAR_HARDWARE_AFFINITY:
#ifdef CONFIG_ACPI_NUMA
            ret = dmar_parse_one_rhsa(entry_header);
#endif
            break;
        case ACPI_DMAR_TYPE_ANDD:
            ret = dmar_parse_one_andd(entry_header);
            break;
        default:
            pr_warn("Unknown DMAR structure type %d\n",
                entry_header->type);
            ret = 0; /* for forward compatibility */
            break;
        }
        if (ret)
            break;

        entry_header = ((void *)entry_header + entry_header->length);
    }
    if (drhd_count == 0)
        pr_warn(FW_BUG "No DRHD structure found in DMAR table\n");
    return ret;
}

parse_dmar_table函数中重要的工作有以下两点:
(1)检查host机器地址总线宽度:

if (dmar->width < PAGE_SHIFT - 1) {
        pr_warn("Invalid DMAR haw\n");
        return -EINVAL;
}

(2)遍历并解析DMAR table中的每一项,并加到相应的列表中:

while (((unsigned long)entry_header) <
            (((unsigned long)dmar) + dmar_tbl->length)) {
            ......
}

DMA Remapping Hardware单元为例:

static int __init
dmar_parse_one_drhd(struct acpi_dmar_header *header)
{
    struct acpi_dmar_hardware_unit *drhd;
    struct dmar_drhd_unit *dmaru;
    int ret = 0;

    drhd = (struct acpi_dmar_hardware_unit *)header;
    dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL);
    if (!dmaru)
        return -ENOMEM;

    dmaru->hdr = header;
    dmaru->reg_base_addr = drhd->address;
    dmaru->segment = drhd->segment;
    dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
    dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
                          ((void *)drhd) + drhd->header.length,
                          &dmaru->devices_cnt);
    if (dmaru->devices_cnt && dmaru->devices == NULL) {
        kfree(dmaru);
        return -ENOMEM;
    }

    ret = alloc_iommu(dmaru);
    if (ret) {
        dmar_free_dev_scope(&dmaru->devices,
                    &dmaru->devices_cnt);
        kfree(dmaru);
        return ret;
    }
    dmar_register_drhd_unit(dmaru);
    return 0;
}

最后dmar_register_drhd_unit这个函数会把解析好的DMA Remapping Hardware单元加到dmar_drhd_units这个列表中。

Linux kernel 笔记 (6) ——__init和__initdata

__init__initdata定义在include/linux/init.h

/* These macros are used to mark some functions or 
 * initialized data (doesn't apply to uninitialized data)
 * as `initialization' functions. The kernel can take this
 * as hint that the function is used only during the initialization
 * phase and free up used memory resources after
 *
 * Usage:
 * For functions:
 * 
 * You should add __init immediately before the function name, like:
 *
 * static void __init initme(int x, int y)
 * {
 *    extern int z; z = x * y;
 * }
 *
 * If the function has a prototype somewhere, you can also add
 * __init between closing brace of the prototype and semicolon:
 *
 * extern int initialize_foobar_device(int, int, int) __init;
 *
 * For initialized data:
 * You should insert __initdata between the variable name and equal
 * sign followed by value, e.g.:
 *
 * static int init_variable __initdata = 0;
 * static const char linux_logo[] __initconst = { 0x32, 0x36, ... };
 *
 * Don't forget to initialize data not at file scope, i.e. within a function,
 * as gcc otherwise puts the data into the bss section and not into the init
 * section.
 * 
 * Also note, that this data cannot be "const".
 */

/* These are for everybody (although not all archs will actually
   discard it in modules) */
#define __init      __section(.init.text) __cold notrace
#define __initdata  __section(.init.data)
#define __initconst __constsection(.init.rodata)

__init修饰函数时表明函数只在kernel初始化阶段使用(函数放在.init.text区),在初始化完成后,这些函数所占用的内存就可以回收利用。举例如下:

static int __init parse_dmar_table(void)
{
    ....
}

__initdata作用类似于__init,只不过它修饰变量,并且存放在.init.data区。

Linux kernel IOMMU代码分析笔记(2)

系统BIOS会检查平台的remapping硬件功能,并且会在主机的系统地址空间内定位memory-mapped remapping硬件寄存器。BIOS通过DMA Remapping Reporting (DMAR) ACPI table向系统软件报告remapping硬件单元信息。

除了RSDPFACS,所有的ACPI table定义都包含一个共同的header定义:

struct acpi_table_header {
    char signature[ACPI_NAME_SIZE]; /* ASCII table signature */
    u32 length;     /* Length of table in bytes, including this header */
    u8 revision;        /* ACPI Specification minor version number */
    u8 checksum;        /* To make sum of entire table == 0 */
    char oem_id[ACPI_OEM_ID_SIZE];  /* ASCII OEM identification */
    char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];  /* ASCII OEM table identification */
    u32 oem_revision;   /* OEM revision number */
    char asl_compiler_id[ACPI_NAME_SIZE];   /* ASCII ASL compiler vendor ID */
    u32 asl_compiler_revision;  /* ASL compiler version */
};

DMA Remapping table定义如下(可以看到包含有acpi_table_header):

struct acpi_table_dmar {
    struct acpi_table_header header;    /* Common ACPI table header */
    u8 width;       /* Host Address Width */
    u8 flags;
    u8 reserved[10];
};

对所有的DMA Remapping结构体,都会包含一个type和一个length

/* DMAR subtable header */

struct acpi_dmar_header {
    u16 type;
    u16 length;
};

DMA Remapping Hardware单元(类型为0)为例:

/* 0: Hardware Unit Definition */

struct acpi_dmar_hardware_unit {
    struct acpi_dmar_header header;
    u8 flags;
    u8 reserved;
    u16 segment;
    u64 address;        /* Register Base Address */
};

其它的还有acpi_dmar_reserved_memoryacpi_dmar_atsr等等定义。

Linux kernel 笔记 (5) ——spin_lock_irqsave

Linux kernel中最基本的lock原语就是spin lock(自旋锁)。实例代码如下:

static DEFINE_SPINLOCK(xxx_lock);

unsigned long flags;

spin_lock_irqsave(&xxx_lock, flags);
... critical section here ..
spin_unlock_irqrestore(&xxx_lock, flags);

以上代码总是安全的(包括可以用在中断处理函数),并可以在UPSMP上都可以正常工作。

Linux kernel 笔记 (4) ——memory barrier

由于编译器和处理器可以打乱程序的执行顺序,所以有些时候,我们需要“memory barrier”来保证内存访问指令的执行顺序(loadstore指令):

(1)rmb():提供一个读(load)操作的“memory barrier”,保证读操作不会跨越rmb()进行reorder。即rmb()之前的读(load)操作不会reorder rmb()之后,而rmb()之后的读(load)操作不会reorder rmb()之前。

(2)wmb():提供一个写(store)操作的“memory barrier”,同rmb类似,不过wmb()是保证写操作不会跨越wmb()进行reorder

(3)mb():保证读写操作都不会跨越mb()进行reorder

(4)read_barrier_depends()rmb()的一个变种,但仅保证有依赖关系的读操作不会跨越rmb()进行reorder

其它还有smp_rmb()smp_wmb()搜索。

UP VS SMP

UP(Uni-Processor):系统只有一个处理器单元,即单核CPU系统。

SMP(Symmetric Multi-Processors):系统有多个处理器单元。各个处理器之间共享总线,内存等等。在操作系统看来,各个处理器之间没有区别。

要注意,这里提到的“处理器单元”是指“logic CPU”,而不是“physical CPU”。举个例子,如果一个“physical CPU”包含2core,并且一个core包含2hardware thread。则一个“处理器单元”就是一个hardware thread

Linux kernel IOMMU代码分析笔记(1)

Linux kernel代码版本是3.10

intel-iommu.h头文件定义了root-entry table address寄存器:

#define DMAR_RTADDR_REG 0x20    /* Root entry table */

DMAR_RTADDR_REG只在iommu_set_root_entry这个函数中使用(这个函数作用是更新root-entry table的地址):

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
    void *addr;
    u32 sts;
    unsigned long flag;

    addr = iommu->root_entry;

    raw_spin_lock_irqsave(&iommu->register_lock, flag);
    dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

    writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

    /* Make sure hardware complete it */
    IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
              readl, (sts & DMA_GSTS_RTPS), sts);

    raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

DMAR_RTADDR_REG存储的是root-entry table的物理地址(virt_to_phys()函数把virtual address转成physical address)。整个root-entry table4KiB大小,并且要求起始地址是“页面对齐的(page-alignedpage长度是4KiB)”,所以DMAR_RTADDR_REG12 bits0。更新DMAR_RTADDR_REG代码如下:

dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

更新完DMAR_RTADDR_REG寄存器,还要把global command寄存器的SRTPSet Root Table Pointer)位置1

writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

最后读取Global Status寄存器的RTPSRoot Table Pointer Status)位,确认更新成功:

IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                  readl, (sts & DMA_GSTS_RTPS), sts);

Linux kernel 笔记 (3) ——page和zone

MMU(Memory Management Unit)用来管理内存的最小单位是一个物理页面(physical page,在32-bit系统上通常每个页面为4k64-bit系统上为8k)。Linux kernel使用struct page结构体(定义在<linux/mm_types.h>)来代表每个物理页面:

struct page {
    /* First double word block */
    unsigned long flags;        /* Atomic flags, some possibly
                     * updated asynchronously */
    struct address_space *mapping;  /* If low bit clear, points to
                     * inode address_space, or NULL.
                     * If page mapped as anonymous
                     * memory, low bit is set, and
                     * it points to anon_vma object:
                     * see PAGE_MAPPING_ANON below.
                     */
    ......
}  

受制于一些硬件限制,有些物理页面不能用来进行某种操作。所以kernel把页面分成不同的zone
ZONE_DMA:用来进行DMA操作的页面;
ZONE_DMA32:也是用来进行DMA操作的页面,不过仅针对32-bit设备;
ZONE_NORMAL:包含常规的,用来映射的页面;
ZONE_HIGHMEM:包含不能永久地映射到kernel的地址空间(address space)的页面。
还有其它的zone定义(例如:ZONE_MOVABLE)。zone_type定义在 <linux/mmzone.h>文件里:

enum zone_type {
#ifdef CONFIG_ZONE_DMA
    /*
     * ZONE_DMA is used when there are devices that are not able
     * to do DMA to all of addressable memory (ZONE_NORMAL). Then we
     * carve out the portion of memory that is needed for these devices.
     * The range is arch specific.
     *
     * Some examples
     *
     * Architecture     Limit
     * ---------------------------
     * parisc, ia64, sparc  <4G
     * s390         <2G
     * arm          Various
     * alpha        Unlimited or 0-16MB.
     *
     * i386, x86_64 and multiple other arches
     *          <16M.
     */
    ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
    /*
     * x86_64 needs two ZONE_DMAs because it supports devices that are
     * only able to do DMA to the lower 16M but also 32 bit devices that
     * can only do DMA areas below 4G.
     */
    ZONE_DMA32,
#endif
    /*
     * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
     * performed on pages in ZONE_NORMAL if the DMA devices support
     * transfers to all addressable memory.
     */
    ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
    /*
     * A memory area that is only addressable by the kernel through
     * mapping portions into its own address space. This is for example
     * used by i386 to allow the kernel to address the memory beyond
     * 900MB. The kernel will set up special mappings (page
     * table entries on i386) for each page that the kernel needs to
     * access.
     */
    ZONE_HIGHMEM,
#endif
    ZONE_MOVABLE,
    __MAX_NR_ZONES
};  

如果CPU体系结构允许DMA操作访问任何地址空间,那就没有ZONE_DMA。同理,Intel X86_64处理器可以访问所有的内存空间,也就不存在ZONE_HIGHMEM。 关于“High memory”,可以参考这篇文章

Linux kernel 笔记 (2) ——kernel代码的一些特性

Linux kernel代码相对于user-space程序的一些特性:
(1)kernel代码不能访问标准C程序库(libc)和头文件(例如打印使用printk,而不是printf);
(2)kernel代码是用GNU C实现的,不是ANSI C(比如:内联函数,内联汇编,分支预测等等);
(3)kernel程序没有内存保护;
(4)避免在kernel代码中引入浮点运算;
(5)kernel程序为每个进程预分配的堆栈空间很小;
(6)因为kernel程序是可抢占的(preemptive),支持symmetrical multiprocessing (SMP)和异步中断(asynchronous interrupt),所以开发kernel程序时要时刻考虑并发和同步问题;
(7)可移植性对于kernel代码非常重要。