Linux kernel IOMMU代码分析笔记(12)——page-table entry的相关代码定义

DMA请求的地址转换如下图所示:

1

page-table entry格式如下:

2

因为每个page-table entry82^3)个byte,所以上面的转化图中只要9位就可以了(12 - 32^12 = 4KiB)。

GAW的定义:

Guest Address Width: Physical addressability limit within a partition (virtual machine)

可以理解为从虚拟机角度看到的物理地址宽度。举个例子,如果一个虚拟机只能访问2G内存,那么GAW就是31

AGAW的定义:Adjusted Guest Address Width。为了保证9bit长度的步长转化,GAWAGAW之间的转换伪代码如下:

R = (GAW - 12) MOD 9;
if (R == 0) {
    AGAW = GAW;
} else {
    AGAW = GAW + 9 - R;
}
if (AGAW > 64)
    AGAW = 64;

对应的函数是guestwidth_to_adjustwidth

static inline int guestwidth_to_adjustwidth(int gaw)
{
    int agaw;
    int r = (gaw - 12) % 9;

    if (r == 0)
        agaw = gaw;
    else
        agaw = gaw + 9 - r;
    if (agaw > 64)
        agaw = 64;
    return agaw;
}

AGAW的最小长度是30bit,参考以下规范定义(context-entry格式里的内容):

• 000b: 30-bit AGAW (2-level page table)
• 001b: 39-bit AGAW (3-level page table)
• 010b: 48-bit AGAW (4-level page table)
• 011b: 57-bit AGAW (5-level page table)
• 100b: 64-bit AGAW (6-level page table)

所以可以看到kernelagaw的一些转换代码会用到302这些数字:

static inline int agaw_to_level(int agaw)
{
    return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
    return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
    return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

 

Linux kernel IOMMU代码分析笔记(11)——root_entry的相关代码定义

root_entry3.10版本的相关定义:

/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
    u64 val;
    u64 rsvd1;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
static inline bool root_present(struct root_entry *root)
{
    return (root->val & 1);
}
static inline void set_root_present(struct root_entry *root)
{
    root->val |= 1;
}
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
    root->val |= value & VTD_PAGE_MASK;
}

root_entrymainstream版本的相关定义:

/*
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
    u64 lo;
    u64 hi;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
    if (!(re->lo & 1))
        return 0;

    return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
    if (!(re->hi & 1))
        return 0;

    return re->hi & VTD_PAGE_MASK;
}

VTD_PAGE_MASK的相关定义:

/*
 * VT-d hardware uses 4KiB page size regardless of host page size.
 */
#define VTD_PAGE_SHIFT      (12)
#define VTD_PAGE_SIZE       (1UL << VTD_PAGE_SHIFT)
#define VTD_PAGE_MASK       (((u64)-1) << VTD_PAGE_SHIFT)
#define VTD_PAGE_ALIGN(addr)    (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)

所以root_entry_lctp得到的是Context Table的物理地址。

Root entry的格式如下:

1

Extended root entry的格式如下:

2

Root entryextended root entry都占16byte16 * 8 = 128),而HAW代表这个平台的Host Address Width,一共有256root entryextended root entry4096/16 = 256)。

参考资料:
Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(10)——[PATCH] iommu/vt-d: Load old data structures only in kdump kernel

kernel mainstreamintel-iommu.c代码中:

static int __init init_dmars(void)
{
    ......
    if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
        iommu_disable_translation(iommu);
        clear_translation_pre_enabled(iommu);
        pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
            iommu->name);
    }
    ......
}

translation_pre_enabled函数如下:

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
    return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

VTD_FLAG_TRANS_PRE_ENABLED赋值是在init_translation_status函数中:

static void init_translation_status(struct intel_iommu *iommu)
{
    u32 gsts;

    gsts = readl(iommu->reg + DMAR_GSTS_REG);
    if (gsts & DMA_GSTS_TES)
        iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

DMAR_GSTS_REG(Global status register)DMA_GSTS_TES(Translation Enable Status)表明是否开启了DMA Remapping功能。

所以if (translation_pre_enabled(iommu) && !is_kdump_kernel())这段代码含义是如果这个iommu硬件单元已经开启了DMA Remapping功能,但是当前运行的kernel不是rebootkernel,则当前iommu硬件状态是不能被认为是正确的,所以要把DMAR_GSTS_REG(Global status register)寄存器,和iommu->flags都要重置(clear_translation_pre_enabled)。

同理,在intel_irq_remapping.c中,也有类似代码:

static int intel_setup_irq_remapping(struct intel_iommu *iommu)
{
    ......
    if (ir_pre_enabled(iommu)) {
        if (iommu_load_old_irte(iommu))
            pr_err("Failed to copy IR table for %s from previous kernel\n",
                   iommu->name);
        else
            pr_info("Copied IR table for %s from previous kernel\n",
                iommu->name);
    }
    ......
}

static int iommu_load_old_irte(struct intel_iommu *iommu)
{
    ......
    if (!is_kdump_kernel()) {
        ......
    }
    ......
 }

参考资料:
[PATCH 04/17] iommu/vt-d: Load old data structures only in kdump kernel
Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(9)——EIM,IR和QI

支持IOMMU的硬件单元的Extended Capability Register有三个关联的位:

EIMExtended Interrupt Mode):在X86_64平台,0表示支持xAPIC1表示支持x2APICItanium平台这一位没意义。并且这一位只有在IR位设置为1才有效。

IRInterrupt Remapping support):1表示支持Interrupt remapping0表示不支持。硬件单元支持Interrupt remapping,也必须支持QI

QIQueued Invalidation support):1支持Queued Invalidation0表示不支持。

参考资料:
Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(8)——intel_enable_irq_remapping(2)

上文

(4)

for_each_iommu(iommu, drhd) {
    /*
     * If the queued invalidation is already initialized,
     * shouldn't disable it.
     */
    if (iommu->qi)
        continue;

    /*
     * Clear previous faults.
     */
    dmar_fault(-1, iommu);

    /*
     * Disable intr remapping and queued invalidation, if already
     * enabled prior to OS handover.
     */
    iommu_disable_irq_remapping(iommu);

    dmar_disable_qi(iommu);
}

上面代码含义是如果硬件单元的queued invalidation还没有初始化,则清掉之前的fault,并且disable IRQ remappingqueued invalidation

(5)

/*
 * check for the Interrupt-remapping support
 */
for_each_iommu(iommu, drhd) {
    if (!ecap_ir_support(iommu->ecap))
        continue;

    if (eim && !ecap_eim_support(iommu->ecap)) {
        printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
               " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
        goto error;
    }
}

上述代码含义是如果硬件支持IRQ remapping并且系统支持x2APIC模式,如果硬件不支持x2APIC模式,就失败。

(6)

/*
 * Enable queued invalidation for all the DRHD's.
 */
for_each_iommu(iommu, drhd) {
    int ret = dmar_enable_qi(iommu);

    if (ret) {
        printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
               " invalidation, ecap %Lx, ret %d\n",
               drhd->reg_base_addr, iommu->ecap, ret);
        goto error;
    }
}

/*
 * Setup Interrupt-remapping for all the DRHD's now.
 */
for_each_iommu(iommu, drhd) {
    if (!ecap_ir_support(iommu->ecap))
        continue;

    if (intel_setup_irq_remapping(iommu, eim))
        goto error;

    setup = 1;
}

if (!setup)
    goto error;

上述代码分别为每个硬件单元enable queued invalidationIRQ remapping

(7)

irq_remapping_enabled = 1;

/*
 * VT-d has a different layout for IO-APIC entries when
 * interrupt remapping is enabled. So it needs a special routine
 * to print IO-APIC entries for debugging purposes too.
 */
x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries;

pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");

return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;

error:
    /*
     * handle error condition gracefully here!
     */

if (x2apic_present)
    pr_warn("Failed to enable irq remapping.  You are vulnerable to irq-injection attacks.\n");

return -1;

如果成功的话,返回IRQ_REMAP_XAPIC_MODE(0)IRQ_REMAP_X2APIC_MODE(1),否则返回-1

参考资料:
"BIOS Considerations" in *Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(7)——intel_enable_irq_remapping(1)

看一下intel_enable_irq_remapping的代码:

static int __init intel_enable_irq_remapping(void)
{
    struct dmar_drhd_unit *drhd;
    struct intel_iommu *iommu;
    bool x2apic_present;
    int setup = 0;
    int eim = 0;

    x2apic_present = x2apic_supported();

    if (parse_ioapics_under_ir() != 1) {
        printk(KERN_INFO "Not enable interrupt remapping\n");
        goto error;
    }

    if (x2apic_present) {
        pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");

        eim = !dmar_x2apic_optout();
        if (!eim)
            printk(KERN_WARNING
                "Your BIOS is broken and requested that x2apic be disabled.\n"
                "This will slightly decrease performance.\n"
                "Use 'intremap=no_x2apic_optout' to override BIOS request.\n");
    }

    for_each_iommu(iommu, drhd) {
        /*
         * If the queued invalidation is already initialized,
         * shouldn't disable it.
         */
        if (iommu->qi)
            continue;

        /*
         * Clear previous faults.
         */
        dmar_fault(-1, iommu);

        /*
         * Disable intr remapping and queued invalidation, if already
         * enabled prior to OS handover.
         */
        iommu_disable_irq_remapping(iommu);

        dmar_disable_qi(iommu);
    }

    /*
     * check for the Interrupt-remapping support
     */
    for_each_iommu(iommu, drhd) {
        if (!ecap_ir_support(iommu->ecap))
            continue;

        if (eim && !ecap_eim_support(iommu->ecap)) {
            printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
                   " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
            goto error;
        }
    }

    /*
     * Enable queued invalidation for all the DRHD's.
     */
    for_each_iommu(iommu, drhd) {
        int ret = dmar_enable_qi(iommu);

        if (ret) {
            printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
                   " invalidation, ecap %Lx, ret %d\n",
                   drhd->reg_base_addr, iommu->ecap, ret);
            goto error;
        }
    }

    /*
     * Setup Interrupt-remapping for all the DRHD's now.
     */
    for_each_iommu(iommu, drhd) {
        if (!ecap_ir_support(iommu->ecap))
            continue;

        if (intel_setup_irq_remapping(iommu, eim))
            goto error;

        setup = 1;
    }

    if (!setup)
        goto error;

    irq_remapping_enabled = 1;

    /*
     * VT-d has a different layout for IO-APIC entries when
     * interrupt remapping is enabled. So it needs a special routine
     * to print IO-APIC entries for debugging purposes too.
     */
    x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries;

    pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");

    return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;

error:
    /*
     * handle error condition gracefully here!
     */

    if (x2apic_present)
        pr_warn("Failed to enable irq remapping.  You are vulnerable to irq-injection attacks.\n");

    return -1;
}

(1)

x2apic_present = x2apic_supported();

查看系统是否支持x2APIC模式。

(2)

if (parse_ioapics_under_ir() != 1) {
    printk(KERN_INFO "Not enable interrupt remapping\n");
    goto error;
}

parse_ioapics_under_ir函数如下:

/*
 * Finds the assocaition between IOAPIC's and its Interrupt-remapping
 * hardware unit.
 */
static int __init parse_ioapics_under_ir(void)
{
    struct dmar_drhd_unit *drhd;
    struct intel_iommu *iommu;
    int ir_supported = 0;
    int ioapic_idx;

    for_each_iommu(iommu, drhd)
        if (ecap_ir_support(iommu->ecap)) {
            if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu))
                return -1;

            ir_supported = 1;
        }

    if (!ir_supported)
        return 0;

    for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
        int ioapic_id = mpc_ioapic_id(ioapic_idx);
        if (!map_ioapic_to_ir(ioapic_id)) {
            pr_err(FW_BUG "ioapic %d has no mapping iommu, "
                   "interrupt remapping will be disabled\n",
                   ioapic_id);
            return -1;
        }
    }

    return 1;
}

ir_parse_ioapic_hpet_scope函数的作用是解析device scope类型是IOAPICHPET (High Precision Event Timer)

static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
                      struct intel_iommu *iommu)
{
    struct acpi_dmar_hardware_unit *drhd;
    struct acpi_dmar_device_scope *scope;
    void *start, *end;

    drhd = (struct acpi_dmar_hardware_unit *)header;

    start = (void *)(drhd + 1);
    end = ((void *)drhd) + header->length;

    while (start < end) {
        scope = start;
        if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
            if (ir_ioapic_num == MAX_IO_APICS) {
                printk(KERN_WARNING "Exceeded Max IO APICS\n");
                return -1;
            }

            printk(KERN_INFO "IOAPIC id %d under DRHD base "
                   " 0x%Lx IOMMU %d\n", scope->enumeration_id,
                   drhd->address, iommu->seq_id);

            ir_parse_one_ioapic_scope(scope, iommu);
        } else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) {
            if (ir_hpet_num == MAX_HPET_TBS) {
                printk(KERN_WARNING "Exceeded Max HPET blocks\n");
                return -1;
            }

            printk(KERN_INFO "HPET id %d under DRHD base"
                   " 0x%Lx\n", scope->enumeration_id,
                   drhd->address);

            ir_parse_one_hpet_scope(scope, iommu);
        }
        start += scope->length;
    }

    return 0;
}

ir_parse_one_ioapic_scope为例(ir_parse_one_hpet_scope类似):

static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
                      struct intel_iommu *iommu)
{
    struct acpi_dmar_pci_path *path;
    u8 bus;
    int count;

    bus = scope->bus;
    path = (struct acpi_dmar_pci_path *)(scope + 1);
    count = (scope->length - sizeof(struct acpi_dmar_device_scope))
        / sizeof(struct acpi_dmar_pci_path);

    while (--count > 0) {
        /*
         * Access PCI directly due to the PCI
         * subsystem isn't initialized yet.
         */
        bus = read_pci_config_byte(bus, path->device, path->function,
                       PCI_SECONDARY_BUS);
        path++;
    }

    ir_ioapic[ir_ioapic_num].bus   = bus;
    ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->device, path->function);
    ir_ioapic[ir_ioapic_num].iommu = iommu;
    ir_ioapic[ir_ioapic_num].id    = scope->enumeration_id;
    ir_ioapic_num++;
}

可以看到,实际上是通过递归访问path得到IOAPIC信息的过程:bus好,对应的iommu设备单元,等等。

for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
    int ioapic_id = mpc_ioapic_id(ioapic_idx);
    if (!map_ioapic_to_ir(ioapic_id)) {
        pr_err(FW_BUG "ioapic %d has no mapping iommu, "
               "interrupt remapping will be disabled\n",
               ioapic_id);
        return -1;
    }
}

这段代码则是检查IOAPIC是否都有对应的IOMMU

(3)

if (x2apic_present) {
        pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");

        eim = !dmar_x2apic_optout();
        if (!eim)
            printk(KERN_WARNING
                "Your BIOS is broken and requested that x2apic be disabled.\n"
                "This will slightly decrease performance.\n"
                "Use 'intremap=no_x2apic_optout' to override BIOS request.\n");
    }  

dmar_x2apic_optout函数实现如下:

static int __init dmar_x2apic_optout(void)
{
    struct acpi_table_dmar *dmar;
    dmar = (struct acpi_table_dmar *)dmar_tbl;
    if (!dmar || no_x2apic_optout)
        return 0;
    return dmar->flags & DMAR_X2APIC_OPT_OUT;
}

这个函数的返回值表示系统是否使用X2APIC功能(1表示不使用,0表示使用)。

参考资料:
"BIOS Considerations" in *Intel ® Virtualization Technology for Directed I/Oc

Linux kernel IOMMU代码分析笔记(6)——intel_irq_remapping_supported

看一下intel_irq_remapping_supported的代码:

static int __init intel_irq_remapping_supported(void)
{
    struct dmar_drhd_unit *drhd;
    struct intel_iommu *iommu;

    if (disable_irq_remap)
        return 0;
    if (irq_remap_broken) {
        printk(KERN_WARNING
            "This system BIOS has enabled interrupt remapping\n"
            "on a chipset that contains an erratum making that\n"
            "feature unstable.  To maintain system stability\n"
            "interrupt remapping is being disabled.  Please\n"
            "contact your BIOS vendor for an update\n");
        add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
        disable_irq_remap = 1;
        return 0;
    }

    if (!dmar_ir_support())
        return 0;

    for_each_iommu(iommu, drhd)
        if (!ecap_ir_support(iommu->ecap))
            return 0;

    return 1;
}

disable_irq_remapirq_remap_broken定义在irq_remapping.c中,用来判断是否支持IRQ remapping这个功能。

dmar_ir_support定义在dmar.c中:

/*
 * Check interrupt remapping support in DMAR table description.
 */
int __init dmar_ir_support(void)
{
    struct acpi_table_dmar *dmar;
    dmar = (struct acpi_table_dmar *)dmar_tbl;
    if (!dmar)
        return 0;
    return dmar->flags & 0x1;
}

flags的第0位标示平台是否支持IRQ remapping这个功能。

    for_each_iommu(iommu, drhd)
        if (!ecap_ir_support(iommu->ecap))
            return 0;

这段代码检查DMA Remapping Hardware Unit的所代表的IOMMU单元是否支持IRQ remapping这个功能。

如果检查都通过了,返回1,表示支持IRQ remapping这个功能。

参考资料:
"BIOS Considerations" in *Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(5)——Interrupt Remapping初始化相关部分

irq_remap_ops定义在drivers\iommu\irq_remapping.h中:

struct irq_remap_ops {
    /* Check whether Interrupt Remapping is supported */
    int (*supported)(void);

    /* Initializes hardware and makes it ready for remapping interrupts */
    int  (*prepare)(void);

    /* Enables the remapping hardware */
    int  (*enable)(void);

    /* Disables the remapping hardware */
    void (*disable)(void);

    /* Reenables the remapping hardware */
    int  (*reenable)(int);

    /* Enable fault handling */
    int  (*enable_faulting)(void);

    /* IO-APIC setup routine */
    int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *,
                  unsigned int, int,
                  struct io_apic_irq_attr *);

    /* Set the CPU affinity of a remapped interrupt */
    int (*set_affinity)(struct irq_data *data, const struct cpumask *mask,
                bool force);

    /* Free an IRQ */
    int (*free_irq)(int);

    /* Create MSI msg to use for interrupt remapping */
    void (*compose_msi_msg)(struct pci_dev *,
                unsigned int, unsigned int,
                struct msi_msg *, u8);

    /* Allocate remapping resources for MSI */
    int (*msi_alloc_irq)(struct pci_dev *, int, int);

    /* Setup the remapped MSI irq */
    int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int);

    /* Setup interrupt remapping for an HPET MSI */
    int (*setup_hpet_msi)(unsigned int, unsigned int);
};

extern struct irq_remap_ops intel_irq_remap_ops;
extern struct irq_remap_ops amd_iommu_irq_ops;

可以看到,在结构体中定义了一系列的函数指针,每个函数的作用都写得很清楚。

针对intel处理器的结构体定义在drivers\iommu\intel_irq_remapping.c

struct irq_remap_ops intel_irq_remap_ops = {
    .supported      = intel_irq_remapping_supported,
    .prepare        = dmar_table_init,
    .enable         = intel_enable_irq_remapping,
    .disable        = disable_irq_remapping,
    .reenable       = reenable_irq_remapping,
    .enable_faulting    = enable_drhd_fault_handling,
    .setup_ioapic_entry = intel_setup_ioapic_entry,
    .set_affinity       = intel_ioapic_set_affinity,
    .free_irq       = free_irte,
    .compose_msi_msg    = intel_compose_msi_msg,
    .msi_alloc_irq      = intel_msi_alloc_irq,
    .msi_setup_irq      = intel_msi_setup_irq,
    .setup_hpet_msi     = intel_setup_hpet_msi,
};

AMD相关的暂不讨论。

enable_IR_x2apic函数(arch\x86\kernel\apic)中,包含着irq_remap_ops中初始化相关函数的调用:

void __init enable_IR_x2apic(void)
{
    ......
    /* Make sure irq_remap_ops are initialized */
    setup_irq_remapping_ops();

    hardware_init_ret = irq_remapping_prepare();
    if (hardware_init_ret && !x2apic_supported())
        return;

    ......   
    if (hardware_init_ret)
        ret = -1;
    else
        ret = enable_IR();

    ......
}

(1)

void __init setup_irq_remapping_ops(void)
{
    remap_ops = &intel_irq_remap_ops;

#ifdef CONFIG_AMD_IOMMU
    if (amd_iommu_irq_ops.prepare() == 0)
        remap_ops = &amd_iommu_irq_ops;
#endif
}

这个函数会让全局变量remap_ops这个指针指向intel_irq_remap_opsamd_iommu_irq_ops

(2)

int __init irq_remapping_prepare(void)
{
    if (!remap_ops || !remap_ops->prepare)
        return -ENODEV;

    return remap_ops->prepare();
}

irq_remapping_prepare会调用prepare函数(Intel平台即dmar_table_init)。

(3)

int __init enable_IR(void)
{
#ifdef CONFIG_IRQ_REMAP
    if (!irq_remapping_supported()) {
        pr_debug("intr-remapping not supported\n");
        return -1;
    }

    if (!x2apic_preenabled && skip_ioapic_setup) {
        pr_info("Skipped enabling intr-remap because of skipping "
            "io-apic setup\n");
        return -1;
    }

    return irq_remapping_enable();
#endif
    return -1;
}

enable_IR中会调用irq_remapping_supportedirq_remapping_enable

int irq_remapping_supported(void)
{
    if (disable_irq_remap)
        return 0;

    if (!remap_ops || !remap_ops->supported)
        return 0;

    return remap_ops->supported();
}

int __init irq_remapping_enable(void)
{
    int ret;

    if (!remap_ops || !remap_ops->enable)
        return -ENODEV;

    ret = remap_ops->enable();

    if (irq_remapping_enabled)
        irq_remapping_modify_x86_ops();

    return ret;
}

会先后调用supportedIntel平台即为intel_irq_remapping_supported)和enable函数(Intel平台即为intel_enable_irq_remapping)。

irq_remapping_modify_x86_ops会把其它函数赋值给相应的全局函数指针:

static void __init irq_remapping_modify_x86_ops(void)
{
    x86_io_apic_ops.disable     = irq_remapping_disable_io_apic;
    x86_io_apic_ops.set_affinity    = set_remapped_irq_affinity;
    x86_io_apic_ops.setup_entry = setup_ioapic_remapped_entry;
    x86_io_apic_ops.eoi_ioapic_pin  = eoi_ioapic_pin_remapped;
    x86_msi.setup_msi_irqs      = irq_remapping_setup_msi_irqs;
    x86_msi.setup_hpet_msi      = setup_hpet_msi_remapped;
    x86_msi.compose_msi_msg     = compose_remapped_msi_msg;
}

Linux kernel IOMMU代码分析笔记(4)——fault处理函数

dmar_fault是发生DMA remapping faultinterrupt remapping fault时的处理函数。代码如下:

irqreturn_t dmar_fault(int irq, void *dev_id)
{
    struct intel_iommu *iommu = dev_id;
    int reg, fault_index;
    u32 fault_status;
    unsigned long flag;

    raw_spin_lock_irqsave(&iommu->register_lock, flag);
    fault_status = readl(iommu->reg + DMAR_FSTS_REG);
    if (fault_status)
        pr_err("DRHD: handling fault status reg %x\n", fault_status);

    /* TBD: ignore advanced fault log currently */
    if (!(fault_status & DMA_FSTS_PPF))
        goto unlock_exit;

    fault_index = dma_fsts_fault_record_index(fault_status);
    reg = cap_fault_reg_offset(iommu->cap);
    while (1) {
        u8 fault_reason;
        u16 source_id;
        u64 guest_addr;
        int type;
        u32 data;

        /* highest 32 bits */
        data = readl(iommu->reg + reg +
                fault_index * PRIMARY_FAULT_REG_LEN + 12);
        if (!(data & DMA_FRCD_F))
            break;

        fault_reason = dma_frcd_fault_reason(data);
        type = dma_frcd_type(data);

        data = readl(iommu->reg + reg +
                fault_index * PRIMARY_FAULT_REG_LEN + 8);
        source_id = dma_frcd_source_id(data);

        guest_addr = dmar_readq(iommu->reg + reg +
                fault_index * PRIMARY_FAULT_REG_LEN);
        guest_addr = dma_frcd_page_addr(guest_addr);
        /* clear the fault */
        writel(DMA_FRCD_F, iommu->reg + reg +
            fault_index * PRIMARY_FAULT_REG_LEN + 12);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

        dmar_fault_do_one(iommu, type, fault_reason,
                source_id, guest_addr);

        fault_index++;
        if (fault_index >= cap_num_fault_regs(iommu->cap))
            fault_index = 0;
        raw_spin_lock_irqsave(&iommu->register_lock, flag);
    }

    writel(DMA_FSTS_PFO | DMA_FSTS_PPF, iommu->reg + DMAR_FSTS_REG);

unlock_exit:
    raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
    return IRQ_HANDLED;
}  

分析一下这段代码:

(1)

fault_status = readl(iommu->reg + DMAR_FSTS_REG);
if (fault_status)
    pr_err("DRHD: handling fault status reg %x\n", fault_status);

/* TBD: ignore advanced fault log currently */
if (!(fault_status & DMA_FSTS_PPF))
    goto unlock_exit;

fault_index = dma_fsts_fault_record_index(fault_status);

DMAR_FSTS_REGFault Status Register,其中PPFPrimary Pending Fault)是否置位表明fault recording registers是否还有fault信息。而fault_index则是第一个含有fault信息的fault recording registers的索引。

(2)

reg = cap_fault_reg_offset(iommu->cap);

计算fault recording registers的地址偏移量。

(3)接下来的while循环会读取包含fault信息的fault recording registers

(4)

    data = readl(iommu->reg + reg +
            fault_index * PRIMARY_FAULT_REG_LEN + 12);
    if (!(data & DMA_FRCD_F))
        break;

    fault_reason = dma_frcd_fault_reason(data);
    type = dma_frcd_type(data);

    data = readl(iommu->reg + reg +
            fault_index * PRIMARY_FAULT_REG_LEN + 8);
    source_id = dma_frcd_source_id(data);

    guest_addr = dmar_readq(iommu->reg + reg +
            fault_index * PRIMARY_FAULT_REG_LEN);
    guest_addr = dma_frcd_page_addr(guest_addr);
    /* clear the fault */
    writel(DMA_FRCD_F, iommu->reg + reg +
        fault_index * PRIMARY_FAULT_REG_LEN + 12);

fault recording registers读取fault信息。需要注意的是,由于X86平台是小端模式,所以寄存器的高位内容会位于内存的高地址空间。另外,每读取完一个fault recording register信息,要把DMA_FRCD_F写回寄存器,用来表明软件已经读完了。

(5)dmar_fault_do_one是格式化打印的fault信息,其代码如下:

static int dmar_fault_do_one(struct intel_iommu *iommu, int type,
        u8 fault_reason, u16 source_id, unsigned long long addr)
{
    const char *reason;
    int fault_type;

    reason = dmar_get_fault_reason(fault_reason, &fault_type);

    if (fault_type == INTR_REMAP)
        pr_err("INTR-REMAP: Request device [[%02x:%02x.%d] "
               "fault index %llx\n"
            "INTR-REMAP:[fault reason %02d] %s\n",
            (source_id >> 8), PCI_SLOT(source_id & 0xFF),
            PCI_FUNC(source_id & 0xFF), addr >> 48,
            fault_reason, reason);
    else
        pr_err("DMAR:[%s] Request device [%02x:%02x.%d] "
               "fault addr %llx \n"
               "DMAR:[fault reason %02d] %s\n",
               (type ? "DMA Read" : "DMA Write"),
               (source_id >> 8), PCI_SLOT(source_id & 0xFF),
               PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
    return 0;
}

(6)

writel(DMA_FSTS_PFO | DMA_FSTS_PPF, iommu->reg + DMAR_FSTS_REG);

最后表明软件已处理完所有的fault信息。

Linux kernel IOMMU代码分析笔记(3)

看一下dmar_table_init这个函数:

int __init dmar_table_init(void)
{
    static int dmar_table_initialized;
    int ret;

    if (dmar_table_initialized == 0) {
        ret = parse_dmar_table();
        if (ret < 0) {
            if (ret != -ENODEV)
                pr_info("parse DMAR table failure.\n");
        } else  if (list_empty(&dmar_drhd_units)) {
            pr_info("No DMAR devices found\n");
            ret = -ENODEV;
        }

        if (ret < 0)
            dmar_table_initialized = ret;
        else
            dmar_table_initialized = 1;
    }

    return dmar_table_initialized < 0 ? dmar_table_initialized : 0;
}

dmar_table_init其实就是调用了parse_dmar_table这个函数来解析 DMAR ACPI tableparse_dmar_table代码如下:

static int __init
parse_dmar_table(void)
{
    struct acpi_table_dmar *dmar;
    struct acpi_dmar_header *entry_header;
    int ret = 0;
    int drhd_count = 0;

    /*
     * Do it again, earlier dmar_tbl mapping could be mapped with
     * fixed map.
     */
    dmar_table_detect();

    /*
     * ACPI tables may not be DMA protected by tboot, so use DMAR copy
     * SINIT saved in SinitMleData in TXT heap (which is DMA protected)
     */
    dmar_tbl = tboot_get_dmar_table(dmar_tbl);

    dmar = (struct acpi_table_dmar *)dmar_tbl;
    if (!dmar)
        return -ENODEV;

    if (dmar->width < PAGE_SHIFT - 1) {
        pr_warn("Invalid DMAR haw\n");
        return -EINVAL;
    }

    pr_info("Host address width %d\n", dmar->width + 1);

    entry_header = (struct acpi_dmar_header *)(dmar + 1);
    while (((unsigned long)entry_header) <
            (((unsigned long)dmar) + dmar_tbl->length)) {
        /* Avoid looping forever on bad ACPI tables */
        if (entry_header->length == 0) {
            pr_warn("Invalid 0-length structure\n");
            ret = -EINVAL;
            break;
        }

        dmar_table_print_dmar_entry(entry_header);

        switch (entry_header->type) {
        case ACPI_DMAR_TYPE_HARDWARE_UNIT:
            drhd_count++;
            ret = dmar_parse_one_drhd(entry_header);
            break;
        case ACPI_DMAR_TYPE_RESERVED_MEMORY:
            ret = dmar_parse_one_rmrr(entry_header);
            break;
        case ACPI_DMAR_TYPE_ATSR:
            ret = dmar_parse_one_atsr(entry_header);
            break;
        case ACPI_DMAR_HARDWARE_AFFINITY:
#ifdef CONFIG_ACPI_NUMA
            ret = dmar_parse_one_rhsa(entry_header);
#endif
            break;
        case ACPI_DMAR_TYPE_ANDD:
            ret = dmar_parse_one_andd(entry_header);
            break;
        default:
            pr_warn("Unknown DMAR structure type %d\n",
                entry_header->type);
            ret = 0; /* for forward compatibility */
            break;
        }
        if (ret)
            break;

        entry_header = ((void *)entry_header + entry_header->length);
    }
    if (drhd_count == 0)
        pr_warn(FW_BUG "No DRHD structure found in DMAR table\n");
    return ret;
}

parse_dmar_table函数中重要的工作有以下两点:
(1)检查host机器地址总线宽度:

if (dmar->width < PAGE_SHIFT - 1) {
        pr_warn("Invalid DMAR haw\n");
        return -EINVAL;
}

(2)遍历并解析DMAR table中的每一项,并加到相应的列表中:

while (((unsigned long)entry_header) <
            (((unsigned long)dmar) + dmar_tbl->length)) {
            ......
}

DMA Remapping Hardware单元为例:

static int __init
dmar_parse_one_drhd(struct acpi_dmar_header *header)
{
    struct acpi_dmar_hardware_unit *drhd;
    struct dmar_drhd_unit *dmaru;
    int ret = 0;

    drhd = (struct acpi_dmar_hardware_unit *)header;
    dmaru = kzalloc(sizeof(*dmaru), GFP_KERNEL);
    if (!dmaru)
        return -ENOMEM;

    dmaru->hdr = header;
    dmaru->reg_base_addr = drhd->address;
    dmaru->segment = drhd->segment;
    dmaru->include_all = drhd->flags & 0x1; /* BIT0: INCLUDE_ALL */
    dmaru->devices = dmar_alloc_dev_scope((void *)(drhd + 1),
                          ((void *)drhd) + drhd->header.length,
                          &dmaru->devices_cnt);
    if (dmaru->devices_cnt && dmaru->devices == NULL) {
        kfree(dmaru);
        return -ENOMEM;
    }

    ret = alloc_iommu(dmaru);
    if (ret) {
        dmar_free_dev_scope(&dmaru->devices,
                    &dmaru->devices_cnt);
        kfree(dmaru);
        return ret;
    }
    dmar_register_drhd_unit(dmaru);
    return 0;
}

最后dmar_register_drhd_unit这个函数会把解析好的DMA Remapping Hardware单元加到dmar_drhd_units这个列表中。