Linux kernel 笔记 (18)——current变量

kernel代码中有一个current变量,它是一个指针,用来指向执行当前这段kernel代码的进程。举个例子,当一个进程执行open系统调用时,在kernel中,就可以用current来访问这个进程。current定义在<asm/current.h>中,以X86平台为例:

#ifndef __ASSEMBLY__
struct task_struct;

DECLARE_PER_CPU(struct task_struct *, current_task);

/*
 * Fetch the task_struct pointer stored in the per-CPU variable
 * current_task and hand it back to the caller.
 */
static __always_inline struct task_struct *get_current(void)
{
    struct task_struct *tsk = this_cpu_read_stable(current_task);

    return tsk;
}

/* current expands to a get_current() call, so code can use it like a pointer variable. */
#define current get_current()

#endif /* __ASSEMBLY__ */

可以看到current变量实际上是一个指向struct task_struct的指针,而struct task_struct则保存了关于进程的信息。

 

Linux kernel IOMMU代码分析笔记(12)——page-table entry的相关代码定义

DMA请求的地址转换如下图所示:

1

page-table entry格式如下:

2

因为每个page-table entry是8(2^3)个byte,而一个4KiB(2^12 byte)的页表可以容纳2^12 / 2^3 = 2^9 = 512个entry,所以上面的转换图中每一级只要9位(12 - 3 = 9)就可以了。

GAW的定义:

Guest Address Width: Physical addressability limit within a partition (virtual machine)

可以理解为从虚拟机角度看到的物理地址宽度。举个例子,如果一个虚拟机只能访问2G内存,那么GAW就是31(2^31 byte = 2GiB)。

AGAW的定义:Adjusted Guest Address Width。为了保证9bit长度的步长转化,GAW和AGAW之间的转换伪代码如下:

R = (GAW - 12) MOD 9;
if (R == 0) {
    AGAW = GAW;
} else {
    AGAW = GAW + 9 - R;
}
if (AGAW > 64)
    AGAW = 64;

对应的函数是guestwidth_to_adjustwidth

/*
 * Convert a guest address width into an adjusted guest address width.
 *
 * For gaw >= 12 the result is gaw rounded up so that (result - 12) is a
 * multiple of 9, and the result is never larger than 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
    int rem = (gaw - 12) % 9;
    int width = gaw;

    /* Pad up to the next 9-bit step when gaw is not already on one. */
    if (rem != 0)
        width += 9 - rem;

    return width > 64 ? 64 : width;
}

AGAW的最小长度是30bit,参考以下规范定义(context-entry格式里的内容):

• 000b: 30-bit AGAW (2-level page table)
• 001b: 39-bit AGAW (3-level page table)
• 010b: 48-bit AGAW (4-level page table)
• 011b: 57-bit AGAW (5-level page table)
• 100b: 64-bit AGAW (6-level page table)

所以可以看到kernel中agaw的一些转换代码会用到30和2这些数字:

/* Map an agaw value to its page-table level count: agaw plus 2. */
static inline int agaw_to_level(int agaw)
{
    int level = 2 + agaw;

    return level;
}

static inline int agaw_to_width(int agaw)
{
    return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

/*
 * Map an address width in bits to an agaw value: the number of
 * LEVEL_STRIDE steps, rounded up, needed beyond the first 30 bits.
 */
static inline int width_to_agaw(int width)
{
    int extra_bits = width - 30;

    return DIV_ROUND_UP(extra_bits, LEVEL_STRIDE);
}

 

Linux kernel 笔记 (17)——提交Linux Kernel IOMMU patch的注意事项

以下是提交Linux Kernel IOMMU patch的注意事项:

(1)Patch主题前缀:
IOMMU相关:<arch>/<iommu>
Intel VT-d相关:iommu/vt-d
举例如下:

iommu/vt-d: Enhance intel-iommu driver to support DMAR unit hotplug

(2)IOMMU patch每行不超过60个字符长度。

Linux kernel 笔记 (16)——clflush_cache_range函数

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:  virtual start address
 * @size:   number of bytes to flush
 *
 * Flushes every cache line that overlaps [@vaddr, @vaddr + @size), each
 * exactly once. A @size of 0 flushes nothing.
 *
 * clflushopt is an unordered instruction which needs fencing with mfence or
 * sfence to avoid ordering issues.
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
    const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
    /*
     * Start from the beginning of the cache line containing vaddr so that
     * stepping by clflush_size visits each overlapping line once.
     * NOTE(review): the mask assumes clflush_size is a power of two.
     */
    void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
    void *vend = vaddr + size;

    /* Empty range: do not touch the line preceding vaddr. */
    if (p >= vend)
        return;

    mb();

    for (; p < vend; p += clflush_size)
        clflushopt(p);

    mb();
}

clflush_cache_range()函数用来把从虚拟地址vaddr起始的、长度为size的这段内存所对应的cache line置为无效,各级包含这些cache line的cache系统都会失效。

Linux kernel IOMMU代码分析笔记(11)——root_entry的相关代码定义

root_entry在3.10版本的相关定义:

/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
    u64 val;    /* bits 0-63: present bit and context pointer */
    u64 rsvd1;  /* bits 64-127: reserved */
};
/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/* Report whether bit 0 (the present bit) of the root entry is set. */
static inline bool root_present(struct root_entry *root)
{
    return (root->val & 1) != 0;
}
/* Set bit 0 (the present bit) of the root entry, leaving other bits alone. */
static inline void set_root_present(struct root_entry *root)
{
    root->val = root->val | 1;
}
/*
 * Store a new context pointer in the root entry.
 *
 * @root:  root entry to update
 * @value: new pointer; only the bits selected by VTD_PAGE_MASK are used
 *
 * The previous pointer bits are cleared before the new ones are OR-ed in,
 * so calling this on an entry that already holds a pointer replaces it
 * instead of mixing the two addresses. Bits outside VTD_PAGE_MASK
 * (including the present bit) are preserved.
 */
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
    root->val &= ~VTD_PAGE_MASK;
    root->val |= value & VTD_PAGE_MASK;
}

root_entry在mainstream版本的相关定义:

/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 *
 * The upper 64 bits are kept as 'hi' rather than a reserved field because
 * root_entry_uctp() reads them: bit 0 as a present bit and the bits
 * selected by VTD_PAGE_MASK as an upper context-table pointer.
 */
struct root_entry {
    u64 lo;     /* bits 0-63, read by root_entry_lctp() */
    u64 hi;     /* bits 64-127, read by root_entry_uctp() */
};
/* Number of root entries that fit in one VT-d page. */
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Return the Lower Context Table Pointer (LCTP) held in a root_entry:
 * the VTD_PAGE_MASK bits of 'lo' when its present bit (bit 0) is set,
 * otherwise 0.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
    return (re->lo & 1) ? (re->lo & VTD_PAGE_MASK) : 0;
}

/*
 * Return the Upper Context Table Pointer (UCTP) held in a root_entry:
 * the VTD_PAGE_MASK bits of 'hi' when its present bit (bit 0) is set,
 * otherwise 0.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
    return (re->hi & 1) ? (re->hi & VTD_PAGE_MASK) : 0;
}

VTD_PAGE_MASK的相关定义:

/*
 * The VT-d hardware page size is fixed at 4KiB, whatever page size the
 * host itself uses.
 */
#define VTD_PAGE_SHIFT      12
#define VTD_PAGE_SIZE       (1UL << VTD_PAGE_SHIFT)
#define VTD_PAGE_MASK       (~(u64)0 << VTD_PAGE_SHIFT)
/* Round addr up to the next VT-d page boundary. */
#define VTD_PAGE_ALIGN(addr)    (((addr) + (VTD_PAGE_SIZE - 1)) & VTD_PAGE_MASK)

所以root_entry_lctp得到的是Context Table的物理地址。

Root entry的格式如下:

1

Extended root entry的格式如下:

2

Root entry和extended root entry都占16个byte(16 * 8 = 128 bit),而HAW代表这个平台的Host Address Width。一共有256个root entry或extended root entry(4096 / 16 = 256)。

参考资料:
Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(10)——[PATCH] iommu/vt-d: Load old data structures only in kdump kernel

kernel mainstream的intel-iommu.c代码中:

static int __init init_dmars(void)
{
    ......
    if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
        iommu_disable_translation(iommu);
        clear_translation_pre_enabled(iommu);
        pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
            iommu->name);
    }
    ......
}

translation_pre_enabled函数如下:

/* Report whether VTD_FLAG_TRANS_PRE_ENABLED is set in iommu->flags. */
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
    return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED) != 0;
}

VTD_FLAG_TRANS_PRE_ENABLED赋值是在init_translation_status函数中:

/*
 * Read the register at DMAR_GSTS_REG and, when its DMA_GSTS_TES bit is
 * set, record that in iommu->flags as VTD_FLAG_TRANS_PRE_ENABLED.
 */
static void init_translation_status(struct intel_iommu *iommu)
{
    u32 status = readl(iommu->reg + DMAR_GSTS_REG);

    if (!(status & DMA_GSTS_TES))
        return;

    iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

DMAR_GSTS_REG(Global status register)中的DMA_GSTS_TES(Translation Enable Status)位表明是否开启了DMA Remapping功能。

所以if (translation_pre_enabled(iommu) && !is_kdump_kernel())这段代码的含义是:如果这个iommu硬件单元已经开启了DMA Remapping功能,但是当前运行的kernel不是kdump kernel(即不是因为前一个kernel panic而重启的kernel),则当前iommu硬件的状态不能被认为是正确的,所以要关闭这个硬件单元的地址转换(iommu_disable_translation,对应DMAR_GSTS_REG(Global status register)中的状态),并且重置iommu->flags(clear_translation_pre_enabled)。

同理,在intel_irq_remapping.c中,也有类似代码:

static int intel_setup_irq_remapping(struct intel_iommu *iommu)
{
    ......
    if (ir_pre_enabled(iommu)) {
        if (iommu_load_old_irte(iommu))
            pr_err("Failed to copy IR table for %s from previous kernel\n",
                   iommu->name);
        else
            pr_info("Copied IR table for %s from previous kernel\n",
                iommu->name);
    }
    ......
}

static int iommu_load_old_irte(struct intel_iommu *iommu)
{
    ......
    if (!is_kdump_kernel()) {
        ......
    }
    ......
 }

参考资料:
[PATCH 04/17] iommu/vt-d: Load old data structures only in kdump kernel
Intel ® Virtualization Technology for Directed I/O

Linux kernel 笔记 (14)——is_kdump_kernel函数

#ifdef CONFIG_CRASH_DUMP
/*
 * is_kdump_kernel() tells whether this kernel was booted after a panic of
 * the previous kernel. The check is whether the previous kernel passed an
 * elf core header address on the command line, i.e. whether
 * elfcorehdr_addr differs from ELFCORE_ADDR_MAX.
 *
 * Having CONFIG_CRASH_DUMP=y alone is not enough for a return value of 1;
 * the kernel must also be booting after a panic of the previous kernel.
 */

static inline int is_kdump_kernel(void)
{
    return elfcorehdr_addr != ELFCORE_ADDR_MAX;
}
#else /* !CONFIG_CRASH_DUMP */
/* Without CONFIG_CRASH_DUMP this kernel is never a kdump kernel. */
static inline int is_kdump_kernel(void)
{
    return 0;
}
#endif /* CONFIG_CRASH_DUMP */

is_kdump_kernel用来检查当前运行的kernel是不是由于之前运行的kernel panic了,而重启的kernel。如果没有配置CONFIG_CRASH_DUMP,则总是返回0。

Linux kernel IOMMU代码分析笔记(9)——EIM,IR和QI

支持IOMMU的硬件单元的Extended Capability Register有三个关联的位:

EIM(Extended Interrupt Mode):在X86_64平台,0表示支持xAPIC,1表示支持x2APIC;Itanium平台这一位没意义。并且这一位只有在IR位设置为1才有效。

IR(Interrupt Remapping support):1表示支持Interrupt remapping,0表示不支持。硬件单元支持Interrupt remapping,也必须支持QI。

QI(Queued Invalidation support):1表示支持Queued Invalidation,0表示不支持。

参考资料:
Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(8)——intel_enable_irq_remapping(2)

上文

(4)

for_each_iommu(iommu, drhd) {
    /*
     * If the queued invalidation is already initialized,
     * shouldn't disable it.
     */
    if (iommu->qi)
        continue;

    /*
     * Clear previous faults.
     */
    dmar_fault(-1, iommu);

    /*
     * Disable intr remapping and queued invalidation, if already
     * enabled prior to OS handover.
     */
    iommu_disable_irq_remapping(iommu);

    dmar_disable_qi(iommu);
}

上面代码含义是如果硬件单元的queued invalidation还没有初始化,则清掉之前的fault,并且disable IRQ remapping和queued invalidation。

(5)

/*
 * check for the Interrupt-remapping support
 */
for_each_iommu(iommu, drhd) {
    if (!ecap_ir_support(iommu->ecap))
        continue;

    if (eim && !ecap_eim_support(iommu->ecap)) {
        printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
               " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
        goto error;
    }
}

上述代码含义是:对于支持IRQ remapping的硬件单元,如果系统要使用x2APIC模式(eim非0),而该硬件单元不支持EIM(即不支持x2APIC模式),就失败。

(6)

/*
 * Enable queued invalidation for all the DRHD's.
 */
for_each_iommu(iommu, drhd) {
    int ret = dmar_enable_qi(iommu);

    if (ret) {
        printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
               " invalidation, ecap %Lx, ret %d\n",
               drhd->reg_base_addr, iommu->ecap, ret);
        goto error;
    }
}

/*
 * Setup Interrupt-remapping for all the DRHD's now.
 */
for_each_iommu(iommu, drhd) {
    if (!ecap_ir_support(iommu->ecap))
        continue;

    if (intel_setup_irq_remapping(iommu, eim))
        goto error;

    setup = 1;
}

if (!setup)
    goto error;

上述代码分别为每个硬件单元enable queued invalidation和IRQ remapping。

(7)

irq_remapping_enabled = 1;

/*
 * VT-d has a different layout for IO-APIC entries when
 * interrupt remapping is enabled. So it needs a special routine
 * to print IO-APIC entries for debugging purposes too.
 */
x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries;

pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");

return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;

error:
    /*
     * handle error condition gracefully here!
     */

if (x2apic_present)
    pr_warn("Failed to enable irq remapping.  You are vulnerable to irq-injection attacks.\n");

return -1;

如果成功的话,返回IRQ_REMAP_XAPIC_MODE(0)或IRQ_REMAP_X2APIC_MODE(1),否则返回-1。

参考资料:
"BIOS Considerations" in Intel ® Virtualization Technology for Directed I/O