Linux kernel IOMMU代码分析笔记(12)——page-table entry的相关代码定义

DMA请求的地址转换如下图所示:

1

page-table entry格式如下:

2

因为每个page-table entry占8($2^3$)个byte,一个4KiB($2^{12}$ byte)的页表可以容纳$2^{12} / 2^{3} = 2^{9}$个entry,所以上面的转换图中每一级只要9位($12 - 3 = 9$)就可以了。

GAW的定义:

Guest Address Width: Physical addressability limit within a partition (virtual machine)

可以理解为从虚拟机角度看到的物理地址宽度。举个例子,如果一个虚拟机只能访问2G内存,那么GAW就是31

AGAW的定义:Adjusted Guest Address Width。为了保证9bit长度的步长转化,GAW和AGAW之间的转换伪代码如下:

R = (GAW - 12) MOD 9;
if (R == 0) {
    AGAW = GAW;
} else {
    AGAW = GAW + 9 - R;
}
if (AGAW > 64)
    AGAW = 64;

对应的函数是guestwidth_to_adjustwidth

/*
 * Convert a guest address width into an adjusted guest address width.
 *
 * The width is rounded up so that (agaw - 12) is a multiple of 9, and the
 * result is capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
    int rem = (gaw - 12) % 9;
    int adjusted = gaw;

    /* Pad up to the next 9-bit boundary above the 12-bit page offset. */
    if (rem != 0)
        adjusted += 9 - rem;

    return adjusted > 64 ? 64 : adjusted;
}

AGAW的最小长度是30bit,参考以下规范定义(context-entry格式里的内容):

• 000b: 30-bit AGAW (2-level page table)
• 001b: 39-bit AGAW (3-level page table)
• 010b: 48-bit AGAW (4-level page table)
• 011b: 57-bit AGAW (5-level page table)
• 100b: 64-bit AGAW (6-level page table)

所以可以看到kernel中agaw的一些转换代码会用到30和2这些数字:

/* Map an AGAW encoding to its page-table depth: encoding 0 means 2 levels. */
static inline int agaw_to_level(int agaw)
{
    int level = 2 + agaw;

    return level;
}

static inline int agaw_to_width(int agaw)
{
    return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

/*
 * Map an address width in bits to an AGAW encoding: the number of
 * LEVEL_STRIDE-sized steps above the 30-bit minimum, rounded up.
 */
static inline int width_to_agaw(int width)
{
    int bits_above_min = width - 30;

    return DIV_ROUND_UP(bits_above_min, LEVEL_STRIDE);
}

 

Linux kernel 笔记 (16)——clflush_cache_range函数

/**
 * clflush_cache_range - flush a cache range with clflush
 * @vaddr:  virtual start address
 * @size:   number of bytes to flush
 *
 * clflushopt is an unordered instruction which needs fencing with mfence or
 * sfence to avoid ordering issues.
 *
 * The walk starts at the cache line containing @vaddr and stops at the first
 * line boundary at or after @vaddr + @size, so each cache line touched by the
 * range is flushed exactly once. Nothing is flushed when the aligned start is
 * not below the end of the range (e.g. @size == 0 with an aligned @vaddr).
 */
void clflush_cache_range(void *vaddr, unsigned int size)
{
    /* The mask below assumes x86_clflush_size is a power of two. */
    const unsigned long line_size = boot_cpu_data.x86_clflush_size;
    void *line = (void *)((unsigned long)vaddr & ~(line_size - 1));
    void *vend = vaddr + size;

    if (line >= vend)
        return;

    mb();

    for (; line < vend; line += line_size)
        clflushopt(line);

    mb();
}

clflush_cache_range()函数用来把从虚拟地址vaddr起始的,长度为size的cache line置为无效,各级包含这个cache line的cache系统都会失效。

Linux kernel IOMMU代码分析笔记(11)——root_entry的相关代码定义

root_entry在3.10版本的相关定义:

/*
 * Root entry (128 bits), stored as two 64-bit words.
 *
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
    u64 val;    /* bits 0-63: Present bit and context-table pointer */
    u64 rsvd1;  /* bits 64-127: reserved */
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/* Report whether the Present bit (bit 0) of the root entry is set. */
static inline bool root_present(struct root_entry *root)
{
    return (root->val & 1) != 0;
}
/* Set the Present bit (bit 0) of the root entry, leaving other bits alone. */
static inline void set_root_present(struct root_entry *root)
{
    root->val = root->val | 1;
}
/*
 * OR the page-aligned part of @value into the root entry. Existing bits are
 * kept, so the pointer field is expected to be clear beforehand.
 */
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
    root->val = root->val | (value & VTD_PAGE_MASK);
}

root_entry在mainstream版本的相关定义:

/*
 * Root entry (128 bits), stored as a low and a high 64-bit word.
 *
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 *
 * NOTE(review): the layout above marks bits 64-127 as reserved, yet
 * root_entry_uctp() reads a present bit and a page-aligned pointer from
 * 'hi' -- presumably for the extended root entry format; confirm.
 */
struct root_entry {
    u64 lo;     /* bits 0-63: read by root_entry_lctp() */
    u64 hi;     /* bits 64-127: read by root_entry_uctp() */
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))

/*
 * Return the Lower Context Table Pointer (LCTP) held in the low word of a
 * root_entry, or 0 when the low word's present bit is clear.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
    if (re->lo & 1)
        return re->lo & VTD_PAGE_MASK;

    return 0;
}

/*
 * Return the Upper Context Table Pointer (UCTP) held in the high word of a
 * root_entry, or 0 when the high word's present bit is clear.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
    if (re->hi & 1)
        return re->hi & VTD_PAGE_MASK;

    return 0;
}

VTD_PAGE_MASK的相关定义:

/*
 * VT-d hardware uses 4KiB page size regardless of host page size.
 *
 * VTD_PAGE_SHIFT: log2 of the VT-d page size.
 * VTD_PAGE_SIZE:  page size in bytes (1 << 12 = 4096).
 * VTD_PAGE_MASK:  64-bit mask that clears the low VTD_PAGE_SHIFT bits.
 * VTD_PAGE_ALIGN: round addr up to the next multiple of VTD_PAGE_SIZE.
 */
#define VTD_PAGE_SHIFT      (12)
#define VTD_PAGE_SIZE       (1UL << VTD_PAGE_SHIFT)
#define VTD_PAGE_MASK       (((u64)-1) << VTD_PAGE_SHIFT)
#define VTD_PAGE_ALIGN(addr)    (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)

所以root_entry_lctp得到的是Context Table的物理地址。

Root entry的格式如下:

1

Extended root entry的格式如下:

2

Root entry和extended root entry都占16个byte(16 * 8 = 128 bit),而HAW代表这个平台的Host Address Width,一共有256个root entry或extended root entry(4096/16 = 256)。

参考资料:
Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(10)——[PATCH] iommu/vt-d: Load old data structures only in kdump kernel

kernel mainstream的intel-iommu.c代码中:

static int __init init_dmars(void)
{
    ......
    if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
        iommu_disable_translation(iommu);
        clear_translation_pre_enabled(iommu);
        pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
            iommu->name);
    }
    ......
}

translation_pre_enabled函数如下:

/* Report whether VTD_FLAG_TRANS_PRE_ENABLED is set in the IOMMU's flags. */
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
    return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED) != 0;
}

VTD_FLAG_TRANS_PRE_ENABLED赋值是在init_translation_status函数中:

/*
 * Read the global status register and, when its DMA_GSTS_TES bit is set,
 * record VTD_FLAG_TRANS_PRE_ENABLED in the IOMMU's flags.
 */
static void init_translation_status(struct intel_iommu *iommu)
{
    u32 status = readl(iommu->reg + DMAR_GSTS_REG);

    if (!(status & DMA_GSTS_TES))
        return;

    iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

DMAR_GSTS_REG(Global status register)的DMA_GSTS_TES(Translation Enable Status)位表明是否开启了DMA Remapping功能。

所以if (translation_pre_enabled(iommu) && !is_kdump_kernel())这段代码含义是:如果这个iommu硬件单元已经开启了DMA Remapping功能,但是当前运行的kernel不是kdump kernel,则当前iommu硬件状态是不能被认为是正确的,所以要关闭硬件上的translation(iommu_disable_translation,对应DMAR_GSTS_REG(Global status register)中的状态),并且把iommu->flags重置(clear_translation_pre_enabled)。

同理,在intel_irq_remapping.c中,也有类似代码:

static int intel_setup_irq_remapping(struct intel_iommu *iommu)
{
    ......
    if (ir_pre_enabled(iommu)) {
        if (iommu_load_old_irte(iommu))
            pr_err("Failed to copy IR table for %s from previous kernel\n",
                   iommu->name);
        else
            pr_info("Copied IR table for %s from previous kernel\n",
                iommu->name);
    }
    ......
}

static int iommu_load_old_irte(struct intel_iommu *iommu)
{
    ......
    if (!is_kdump_kernel()) {
        ......
    }
    ......
 }

参考资料:
[PATCH 04/17] iommu/vt-d: Load old data structures only in kdump kernel
Intel ® Virtualization Technology for Directed I/O

Shark代码分析笔记(3)——shark_init.lua

看一下shark_init.lua这个文件(省去版权信息):

local uv = require("uv")
local ffi = require("ffi")

package.path = package.path .. ";./deps/?.lua"
package.cpath = package.cpath .. ";./deps/?.so"

-- microsecond precision
ffi.cdef[[
typedef long time_t;

typedef struct timeval {
    time_t tv_sec;
    time_t tv_usec;
} timeval;

int gettimeofday(struct timeval *t, void *tzp);
]]

local gettimeofday_struct = ffi.new("timeval")

-- Return the time reported by gettimeofday() as a single number of
-- microseconds (seconds * 1000000 + microseconds).
shark.gettimeofday = function()
  local tv = gettimeofday_struct
  ffi.C.gettimeofday(tv, nil)
  local sec = tonumber(tv.tv_sec)
  local usec = tonumber(tv.tv_usec)
  return sec * 1000000 + usec
end

-- Start a luv timer that uses `interval` as both its initial delay and its
-- repeat period. Each expiry calls callback(timer). Returns the timer handle.
set_interval = function(callback, interval)
  local timer = uv.new_timer()
  uv.timer_start(timer, interval, interval, function()
    callback(timer)
  end)
  return timer
end

-- Start a one-shot luv timer (repeat = 0). On expiry the timer is stopped
-- and closed, then callback(timer) is called. Returns the timer handle.
set_timeout = function(callback, timeout)
  local timer = uv.new_timer()
  uv.timer_start(timer, timeout, 0, function()
    uv.timer_stop(timer)
    uv.close(timer)
    callback(timer)
  end)
  return timer
end

local shark_end_notify_list = {}

-- Append a callback to the list that shark.on_end runs at exit.
shark.add_end_notify = function(callback)
  shark_end_notify_list[#shark_end_notify_list + 1] = callback
end

-- Register `callback` to run on sigint or sigterm. The handler first runs
-- every function added through shark.add_end_notify, then `callback`, and
-- finally exits the process with status 0.
shark.on_end = function(callback)
  local function call_end()
    --notify registered on_end function
    for _, cb in pairs(shark_end_notify_list) do
      cb()
    end

    callback()
    os.exit(0)
  end

  -- Same handler for both signals, installed in the order sigint, sigterm.
  for _, signame in ipairs({"sigint", "sigterm"}) do
    local handle = uv.new_signal()
    uv.signal_start(handle, signame, function()
      call_end()
    end)
  end
end


---------------------------------------------------------------

-- Write `max` characters to stdout: "*" for every position i with i < n,
-- and a space for the remaining positions.
local function fill_line(n, max)
  for i = 1, max do
    io.write(i < n and "*" or " ")
  end
end

-- standard histogram print function
-- all type keys and number value
--
-- t:        table mapping keys (any type) to numeric counts
-- cmp_func: optional comparator called with two counts; when nil, entries
--           are sorted by descending count
-- mode:     "default" prints a header and one bar per entry; any other
--           value prints the key and the count on separate lines
local __print_hist = function(t, cmp_func, mode)
  local stdSum = 0
  local array = {}

  -- Every value counts towards the total, but entries whose key prints as
  -- an empty string are left out of the listing.
  for k, v in pairs(t) do
    stdSum = stdSum + v
    if tostring(k) ~= "" then
      array[#array + 1] = {k = k, v = v}
    end
  end

  table.sort(array, function(v1, v2)
    if cmp_func ~= nil then
      return cmp_func(v1.v, v2.v)
    end
    -- Always hand table.sort an explicit boolean.
    return v1.v > v2.v
  end)

  if mode == "default" then
    io.write("                          value  ---------- Distribution ----------  count\n")
  end

  -- ipairs walks the array in index order; pairs gives no ordering
  -- guarantee, which would defeat the sort above.
  for _, v in ipairs(array) do
    if mode == "default" then
      io.write(string.format("%33s |", tostring(v.k)))
      fill_line(v.v * 34 / stdSum, 34)
      io.write(string.format("| %d\n", v.v))
    else
      io.write(string.format("%s\n%d\n", tostring(v.k), v.v))
    end
  end
end

-- Print table t as a histogram with a header and distribution bars.
function print_hist(t, cmp_func)
  local mode = "default"
  __print_hist(t, cmp_func, mode)
end


-- Print table t as raw key / count lines, without header or bars.
function print_hist_raw(t, cmp_func)
  local mode = "raw"
  __print_hist(t, cmp_func, mode)
end

shark.print_hist = print_hist
shark.print_hist_raw = print_hist_raw

(1)

local uv = require("uv")
local ffi = require("ffi")

package.path = package.path .. ";./deps/?.lua"
package.cpath = package.cpath .. ";./deps/?.so"

require("uv")加载luv模块(在main函数中已经将luaopen_luv注册进了package.preload table),require("ffi")加载luajit中的ffi模块。
然后修改package.path和package.cpath,这样可以找到shark依赖的lua文件和库。

(2)

-- microsecond precision
ffi.cdef[[
typedef long time_t;

typedef struct timeval {
    time_t tv_sec;
    time_t tv_usec;
} timeval;

int gettimeofday(struct timeval *t, void *tzp);
]]

local gettimeofday_struct = ffi.new("timeval")

shark.gettimeofday = function()
  ffi.C.gettimeofday(gettimeofday_struct, nil)
  return tonumber(gettimeofday_struct.tv_sec) * 1000000 +
         tonumber(gettimeofday_struct.tv_usec)
end

ffi.new()返回一个cdata类型的值。shark.gettimeofday返回的是当前时间的微秒(us)值。

(3)

set_interval = function(callback, interval)
  local timer = uv.new_timer()
  local function ontimeout()
    callback(timer)
  end
  uv.timer_start(timer, interval, interval, ontimeout)
  return timer
end

set_timeout = function(callback, timeout)
  local timer = uv.new_timer()
  local function ontimeout()
    uv.timer_stop(timer)
    uv.close(timer)
    callback(timer)
  end
  uv.timer_start(timer, timeout, 0, ontimeout)
  return timer
end

set_interval和set_timeout利用了luv模块中定时器相关函数。set_interval函数会让callback函数间隔interval时间执行。而set_timeout函数会让callback函数在timeout后执行,且只执行一次。要注意这两个函数的时间单位是毫秒(ms)。

(4)

local shark_end_notify_list = {}

shark.add_end_notify = function(callback)
  table.insert(shark_end_notify_list, callback)
end

创建一个名为shark_end_notify_list的table,并且定义一个shark.add_end_notify的函数。这个函数的作用是向shark_end_notify_list添加回调函数(callback),这个table中的函数会在shark.on_end这个函数中调用,也就是在脚本退出时执行收尾工作。

(5)

shark.on_end = function(callback)
  local function call_end()
    --notify registered on_end function
    for _, cb in pairs(shark_end_notify_list) do
      cb()
    end

    callback()
    os.exit(0)
  end
  local sigint = uv.new_signal()
  uv.signal_start(sigint, "sigint", function()
    call_end()
  end)

  local sigterm = uv.new_signal()
  uv.signal_start(sigterm, "sigterm", function()
    call_end()
  end)
end

shark.on_end函数的参数是一个回调函数(callback)。在shark.on_end里定义了一个local函数:call_end。call_end首先会遍历shark_end_notify_list,并把其中的函数都执行一遍,最后调用shark.on_end函数传入的回调函数(callback)。 call_end函数就是shark.on_end函数为sigint和sigterm信号注册的信号处理函数。

(6)

local function fill_line(n, max)
  for i = 1, max do
    if i < n then
      io.write("*")
    else
      io.write(" ")
    end
  end
end

fill_line函数用来输出*,用在接下来的__print_hist函数中。

(7)

-- standard histogram print function
-- all type keys and number value
local __print_hist = function(t, cmp_func, mode)
  local stdSum = 0
  local array = {}

  for k, v in pairs(t) do
    stdSum = stdSum + v
    if tostring(k) ~= "" then
      array[#array + 1] = {k = k, v = v}
    end
  end

  table.sort(array, function(v1, v2)
    if cmp_func ~= nil then
      return cmp_func(v1.v, v2.v)
    else
      if v1.v > v2.v then return true end
    end
  end)

  if mode == "default" then
    io.write("                          value  ---------- Distribution ----------  count\n")
  end

  for k, v in pairs(array) do
    if mode == "default" then
      io.write(string.format("%33s |", tostring(v.k)))
      fill_line(v.v * 34 / stdSum, 34)
      io.write(string.format("| %d\n", v.v))
    else
      io.write(string.format("%s\n%d\n", tostring(v.k), v.v))
    end
  end
end

__print_hist函数用来打印柱状图。

  local stdSum = 0
  local array = {}

  for k, v in pairs(t) do
    stdSum = stdSum + v
    if tostring(k) ~= "" then
      array[#array + 1] = {k = k, v = v}
    end
  end

输入参数t是一个table。以上代码是遍历这个table,并且生成一个新的“array table”。这个array的每个元素又是一个table:包含输入table的key和value。

table.sort(array, function(v1, v2)
    if cmp_func ~= nil then
      return cmp_func(v1.v, v2.v)
    else
      if v1.v > v2.v then return true end
    end
  end)

以上代码是对array进行排序。如果没有输入cmp_func函数参数,就用默认的比较方式。

if mode == "default" then
    io.write("                          value  ---------- Distribution ----------  count\n")
  end

  for k, v in pairs(array) do
    if mode == "default" then
      io.write(string.format("%33s |", tostring(v.k)))
      fill_line(v.v * 34 / stdSum, 34)
      io.write(string.format("| %d\n", v.v))
    else
      io.write(string.format("%s\n%d\n", tostring(v.k), v.v))
    end
  end

以上代码就是打印最后的柱状图了。输出结果类似:

                           value  ---------- Distribution ----------  count
  syscalls:sys_enter_gettimeofday |********                          | 25940
   syscalls:sys_exit_gettimeofday |********                          | 25940
    syscalls:sys_enter_epoll_wait |****                              | 12977
     syscalls:sys_exit_epoll_wait |****                              | 12977
          syscalls:sys_exit_alarm |*                                 | 3917

 

(8)

function print_hist(t, cmp_func)
  __print_hist(t, cmp_func, "default")
end


function print_hist_raw(t, cmp_func)
  __print_hist(t, cmp_func, "raw")
end

shark.print_hist = print_hist
shark.print_hist_raw = print_hist_raw

最后就是把__print_hist封装出shark.print_hist和shark.print_hist_raw两个函数供其它程序调用。

 

 

 

 

Linux kernel IOMMU代码分析笔记(9)——EIM,IR和QI

支持IOMMU的硬件单元的Extended Capability Register有三个关联的位:

EIM(Extended Interrupt Mode):在X86_64平台,0表示支持xAPIC,1表示支持x2APIC;在Itanium平台这一位没意义。并且这一位只有在IR位设置为1时才有效。

IR(Interrupt Remapping support):1表示支持Interrupt remapping,0表示不支持。硬件单元如果支持Interrupt remapping,也必须支持QI。

QI(Queued Invalidation support):1表示支持Queued Invalidation,0表示不支持。

参考资料:
Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(8)——intel_enable_irq_remapping(2)

上文

(4)

for_each_iommu(iommu, drhd) {
    /*
     * If the queued invalidation is already initialized,
     * shouldn't disable it.
     */
    if (iommu->qi)
        continue;

    /*
     * Clear previous faults.
     */
    dmar_fault(-1, iommu);

    /*
     * Disable intr remapping and queued invalidation, if already
     * enabled prior to OS handover.
     */
    iommu_disable_irq_remapping(iommu);

    dmar_disable_qi(iommu);
}

上面代码含义是如果硬件单元的queued invalidation还没有初始化,则清掉之前的fault,并且disable IRQ remapping和queued invalidation。

(5)

/*
 * check for the Interrupt-remapping support
 */
for_each_iommu(iommu, drhd) {
    if (!ecap_ir_support(iommu->ecap))
        continue;

    if (eim && !ecap_eim_support(iommu->ecap)) {
        printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
               " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
        goto error;
    }
}

上述代码含义是:对于支持IRQ remapping的硬件单元,如果系统要使用x2APIC模式(eim不为0),而该硬件单元不支持EIM(即不支持x2APIC模式),就失败。

(6)

/*
 * Enable queued invalidation for all the DRHD's.
 */
for_each_iommu(iommu, drhd) {
    int ret = dmar_enable_qi(iommu);

    if (ret) {
        printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
               " invalidation, ecap %Lx, ret %d\n",
               drhd->reg_base_addr, iommu->ecap, ret);
        goto error;
    }
}

/*
 * Setup Interrupt-remapping for all the DRHD's now.
 */
for_each_iommu(iommu, drhd) {
    if (!ecap_ir_support(iommu->ecap))
        continue;

    if (intel_setup_irq_remapping(iommu, eim))
        goto error;

    setup = 1;
}

if (!setup)
    goto error;

上述代码分别为每个硬件单元enable queued invalidation和IRQ remapping。

(7)

irq_remapping_enabled = 1;

/*
 * VT-d has a different layout for IO-APIC entries when
 * interrupt remapping is enabled. So it needs a special routine
 * to print IO-APIC entries for debugging purposes too.
 */
x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries;

pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");

return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;

error:
    /*
     * handle error condition gracefully here!
     */

if (x2apic_present)
    pr_warn("Failed to enable irq remapping.  You are vulnerable to irq-injection attacks.\n");

return -1;

如果成功的话,返回IRQ_REMAP_XAPIC_MODE(0)或IRQ_REMAP_X2APIC_MODE(1),否则返回-1。

参考资料:
"BIOS Considerations" in Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(7)——intel_enable_irq_remapping(1)

看一下intel_enable_irq_remapping的代码:

/*
 * Enable interrupt remapping on the IOMMUs.
 *
 * Returns IRQ_REMAP_X2APIC_MODE or IRQ_REMAP_XAPIC_MODE (chosen by 'eim') on
 * success, and -1 on any failure.
 */
static int __init intel_enable_irq_remapping(void)
{
    struct dmar_drhd_unit *drhd;
    struct intel_iommu *iommu;
    bool x2apic_present;
    /* Becomes 1 once at least one IR-capable unit has been set up. */
    int setup = 0;
    /* Non-zero when x2apic is supported and the DMAR table does not opt out. */
    int eim = 0;

    x2apic_present = x2apic_supported();

    /* Anything other than 1 (0 or -1) means remapping cannot be enabled. */
    if (parse_ioapics_under_ir() != 1) {
        printk(KERN_INFO "Not enable interrupt remapping\n");
        goto error;
    }

    if (x2apic_present) {
        pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");

        eim = !dmar_x2apic_optout();
        if (!eim)
            printk(KERN_WARNING
                "Your BIOS is broken and requested that x2apic be disabled.\n"
                "This will slightly decrease performance.\n"
                "Use 'intremap=no_x2apic_optout' to override BIOS request.\n");
    }

    for_each_iommu(iommu, drhd) {
        /*
         * If the queued invalidation is already initialized,
         * shouldn't disable it.
         */
        if (iommu->qi)
            continue;

        /*
         * Clear previous faults.
         */
        dmar_fault(-1, iommu);

        /*
         * Disable intr remapping and queued invalidation, if already
         * enabled prior to OS handover.
         */
        iommu_disable_irq_remapping(iommu);

        dmar_disable_qi(iommu);
    }

    /*
     * check for the Interrupt-remapping support
     */
    for_each_iommu(iommu, drhd) {
        if (!ecap_ir_support(iommu->ecap))
            continue;

        /* With EIM requested, every IR-capable unit must support it. */
        if (eim && !ecap_eim_support(iommu->ecap)) {
            printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
                   " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
            goto error;
        }
    }

    /*
     * Enable queued invalidation for all the DRHD's.
     */
    for_each_iommu(iommu, drhd) {
        int ret = dmar_enable_qi(iommu);

        if (ret) {
            printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
                   " invalidation, ecap %Lx, ret %d\n",
                   drhd->reg_base_addr, iommu->ecap, ret);
            goto error;
        }
    }

    /*
     * Setup Interrupt-remapping for all the DRHD's now.
     */
    for_each_iommu(iommu, drhd) {
        if (!ecap_ir_support(iommu->ecap))
            continue;

        if (intel_setup_irq_remapping(iommu, eim))
            goto error;

        setup = 1;
    }

    /* No IR-capable unit was found in the loop above. */
    if (!setup)
        goto error;

    irq_remapping_enabled = 1;

    /*
     * VT-d has a different layout for IO-APIC entries when
     * interrupt remapping is enabled. So it needs a special routine
     * to print IO-APIC entries for debugging purposes too.
     */
    x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries;

    pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");

    return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;

error:
    /*
     * handle error condition gracefully here!
     */

    if (x2apic_present)
        pr_warn("Failed to enable irq remapping.  You are vulnerable to irq-injection attacks.\n");

    return -1;
}

(1)

x2apic_present = x2apic_supported();

查看系统是否支持x2APIC模式。

(2)

if (parse_ioapics_under_ir() != 1) {
    printk(KERN_INFO "Not enable interrupt remapping\n");
    goto error;
}

parse_ioapics_under_ir函数如下:

/*
 * Find the association between each IOAPIC and its interrupt-remapping
 * hardware unit.
 *
 * Returns 1 when every IOAPIC maps to an IOMMU, 0 when no IOMMU reports
 * interrupt-remapping support, and -1 on a parse failure or when an IOAPIC
 * has no mapping.
 */
static int __init parse_ioapics_under_ir(void)
{
    struct dmar_drhd_unit *drhd;
    struct intel_iommu *iommu;
    int found_ir_unit = 0;
    int idx = 0;

    for_each_iommu(iommu, drhd) {
        if (!ecap_ir_support(iommu->ecap))
            continue;

        if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu))
            return -1;

        found_ir_unit = 1;
    }

    if (!found_ir_unit)
        return 0;

    while (idx < nr_ioapics) {
        int ioapic_id = mpc_ioapic_id(idx);

        if (!map_ioapic_to_ir(ioapic_id)) {
            pr_err(FW_BUG "ioapic %d has no mapping iommu, "
                   "interrupt remapping will be disabled\n",
                   ioapic_id);
            return -1;
        }

        idx++;
    }

    return 1;
}

ir_parse_ioapic_hpet_scope函数的作用是解析类型为IOAPIC和HPET (High Precision Event Timer)的device scope:

/*
 * Walk the device-scope entries that follow a DRHD structure and record
 * every IOAPIC and HPET entry for the given IOMMU.
 *
 * Returns 0 on success, or -1 once ir_ioapic_num has reached MAX_IO_APICS or
 * ir_hpet_num has reached MAX_HPET_TBS.
 */
static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
                      struct intel_iommu *iommu)
{
    struct acpi_dmar_hardware_unit *drhd;
    struct acpi_dmar_device_scope *scope;
    void *cursor, *limit;

    drhd = (struct acpi_dmar_hardware_unit *)header;

    /* Scope entries start right after the DRHD and run to header->length. */
    limit = ((void *)drhd) + header->length;

    for (cursor = (void *)(drhd + 1); cursor < limit;
         cursor += scope->length) {
        scope = cursor;

        switch (scope->entry_type) {
        case ACPI_DMAR_SCOPE_TYPE_IOAPIC:
            if (ir_ioapic_num == MAX_IO_APICS) {
                printk(KERN_WARNING "Exceeded Max IO APICS\n");
                return -1;
            }

            printk(KERN_INFO "IOAPIC id %d under DRHD base "
                   " 0x%Lx IOMMU %d\n", scope->enumeration_id,
                   drhd->address, iommu->seq_id);

            ir_parse_one_ioapic_scope(scope, iommu);
            break;

        case ACPI_DMAR_SCOPE_TYPE_HPET:
            if (ir_hpet_num == MAX_HPET_TBS) {
                printk(KERN_WARNING "Exceeded Max HPET blocks\n");
                return -1;
            }

            printk(KERN_INFO "HPET id %d under DRHD base"
                   " 0x%Lx\n", scope->enumeration_id,
                   drhd->address);

            ir_parse_one_hpet_scope(scope, iommu);
            break;

        default:
            /* Other scope types are skipped. */
            break;
        }
    }

    return 0;
}

以ir_parse_one_ioapic_scope为例(ir_parse_one_hpet_scope类似):

/*
 * Resolve the PCI path of one IOAPIC device-scope entry and store the
 * resulting bus, devfn, owning IOMMU and enumeration id in the next free
 * ir_ioapic[] slot.
 */
static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
                      struct intel_iommu *iommu)
{
    struct acpi_dmar_pci_path *path;
    u8 bus;
    int count;

    /* Path elements are laid out directly after the scope header. */
    path = (struct acpi_dmar_pci_path *)(scope + 1);
    count = (scope->length - sizeof(struct acpi_dmar_device_scope))
        / sizeof(struct acpi_dmar_pci_path);
    bus = scope->bus;

    /* Follow every path element except the last to find the final bus. */
    for (count--; count > 0; count--) {
        /*
         * Access PCI directly due to the PCI
         * subsystem isn't initialized yet.
         */
        bus = read_pci_config_byte(bus, path->device, path->function,
                       PCI_SECONDARY_BUS);
        path++;
    }

    ir_ioapic[ir_ioapic_num].bus   = bus;
    ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->device, path->function);
    ir_ioapic[ir_ioapic_num].iommu = iommu;
    ir_ioapic[ir_ioapic_num].id    = scope->enumeration_id;
    ir_ioapic_num++;
}

可以看到,实际上是沿着path逐级访问PCI配置空间,从而得到IOAPIC信息的过程:bus号,devfn,对应的iommu设备单元,等等。

for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
    int ioapic_id = mpc_ioapic_id(ioapic_idx);
    if (!map_ioapic_to_ir(ioapic_id)) {
        pr_err(FW_BUG "ioapic %d has no mapping iommu, "
               "interrupt remapping will be disabled\n",
               ioapic_id);
        return -1;
    }
}

这段代码则是检查IOAPIC是否都有对应的IOMMU

(3)

if (x2apic_present) {
        pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");

        eim = !dmar_x2apic_optout();
        if (!eim)
            printk(KERN_WARNING
                "Your BIOS is broken and requested that x2apic be disabled.\n"
                "This will slightly decrease performance.\n"
                "Use 'intremap=no_x2apic_optout' to override BIOS request.\n");
    }  

dmar_x2apic_optout函数实现如下:

/*
 * Report whether the DMAR table asks for x2apic to be left disabled.
 *
 * Returns 0 when there is no DMAR table or no_x2apic_optout is set;
 * otherwise returns the DMAR_X2APIC_OPT_OUT bit of the table flags.
 */
static int __init dmar_x2apic_optout(void)
{
    struct acpi_table_dmar *dmar = (struct acpi_table_dmar *)dmar_tbl;

    if (!dmar || no_x2apic_optout)
        return 0;

    return dmar->flags & DMAR_X2APIC_OPT_OUT;
}

这个函数的返回值表示BIOS是否要求系统不使用x2APIC功能(非0表示要求不使用,0表示可以使用)。

参考资料:
"BIOS Considerations" in Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(6)——intel_irq_remapping_supported

看一下intel_irq_remapping_supported的代码:

static int __init intel_irq_remapping_supported(void)
{
    struct dmar_drhd_unit *drhd;
    struct intel_iommu *iommu;

    if (disable_irq_remap)
        return 0;
    if (irq_remap_broken) {
        printk(KERN_WARNING
            "This system BIOS has enabled interrupt remapping\n"
            "on a chipset that contains an erratum making that\n"
            "feature unstable.  To maintain system stability\n"
            "interrupt remapping is being disabled.  Please\n"
            "contact your BIOS vendor for an update\n");
        add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
        disable_irq_remap = 1;
        return 0;
    }

    if (!dmar_ir_support())
        return 0;

    for_each_iommu(iommu, drhd)
        if (!ecap_ir_support(iommu->ecap))
            return 0;

    return 1;
}

disable_irq_remap和irq_remap_broken定义在irq_remapping.c中,用来判断是否支持IRQ remapping这个功能。

dmar_ir_support定义在dmar.c中:

/*
 * Check interrupt remapping support in DMAR table description.
 *
 * Returns bit 0 of the DMAR table flags, or 0 when there is no DMAR table.
 */
int __init dmar_ir_support(void)
{
    struct acpi_table_dmar *dmar = (struct acpi_table_dmar *)dmar_tbl;

    return dmar ? (dmar->flags & 0x1) : 0;
}

flags的第0位标示平台是否支持IRQ remapping这个功能。

    for_each_iommu(iommu, drhd)
        if (!ecap_ir_support(iommu->ecap))
            return 0;

这段代码检查DMA Remapping Hardware Unit的所代表的IOMMU单元是否支持IRQ remapping这个功能。

如果检查都通过了,返回1,表示支持IRQ remapping这个功能。

参考资料:
"BIOS Considerations" in Intel ® Virtualization Technology for Directed I/O