Linux kernel 笔记 (14)——is_kdump_kernel函数

#ifdef CONFIG_CRASH_DUMP
/*
 * is_kdump_kernel() checks whether this kernel is booting after a panic of
 * previous kernel or not. This is determined by checking if previous kernel
 * has passed the elf core header address on command line.
 *
 * This is not just a test if CONFIG_CRASH_DUMP is enabled or not. It will
 * return 1 if CONFIG_CRASH_DUMP=y and if kernel is booting after a panic of
 * previous kernel.
 */

static inline int is_kdump_kernel(void)
{
    return (elfcorehdr_addr != ELFCORE_ADDR_MAX) ? 1 : 0;
}
#else /* !CONFIG_CRASH_DUMP */
static inline int is_kdump_kernel(void) { return 0; }
#endif /* CONFIG_CRASH_DUMP */

is_kdump_kernel用来检查当前运行的kernel是不是由于之前运行的kernel panic了,而重启的kernel。如果没有配置CONFIG_CRASH_DUMP,则总是返回0

Linux kernel IOMMU代码分析笔记(9)——EIM,IR和QI

支持IOMMU的硬件单元的Extended Capability Register有三个关联的位:

EIMExtended Interrupt Mode):在X86_64平台,0表示支持xAPIC1表示支持x2APICItanium平台这一位没意义。并且这一位只有在IR位设置为1才有效。

IRInterrupt Remapping support):1表示支持Interrupt remapping0表示不支持。硬件单元支持Interrupt remapping,也必须支持QI

QIQueued Invalidation support):1支持Queued Invalidation0表示不支持。

参考资料:
Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(8)——intel_enable_irq_remapping(2)

上文

(4)

for_each_iommu(iommu, drhd) {
    /*
     * If the queued invalidation is already initialized,
     * shouldn't disable it.
     */
    if (iommu->qi)
        continue;

    /*
     * Clear previous faults.
     */
    dmar_fault(-1, iommu);

    /*
     * Disable intr remapping and queued invalidation, if already
     * enabled prior to OS handover.
     */
    iommu_disable_irq_remapping(iommu);

    dmar_disable_qi(iommu);
}

上面代码含义是如果硬件单元的queued invalidation还没有初始化,则清掉之前的fault,并且disable IRQ remappingqueued invalidation

(5)

/*
 * check for the Interrupt-remapping support
 */
for_each_iommu(iommu, drhd) {
    if (!ecap_ir_support(iommu->ecap))
        continue;

    if (eim && !ecap_eim_support(iommu->ecap)) {
        printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
               " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
        goto error;
    }
}

上述代码含义是如果硬件支持IRQ remapping并且系统支持x2APIC模式,如果硬件不支持x2APIC模式,就失败。

(6)

/*
 * Enable queued invalidation for all the DRHD's.
 */
for_each_iommu(iommu, drhd) {
    int ret = dmar_enable_qi(iommu);

    if (ret) {
        printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
               " invalidation, ecap %Lx, ret %d\n",
               drhd->reg_base_addr, iommu->ecap, ret);
        goto error;
    }
}

/*
 * Setup Interrupt-remapping for all the DRHD's now.
 */
for_each_iommu(iommu, drhd) {
    if (!ecap_ir_support(iommu->ecap))
        continue;

    if (intel_setup_irq_remapping(iommu, eim))
        goto error;

    setup = 1;
}

if (!setup)
    goto error;

上述代码分别为每个硬件单元enable queued invalidationIRQ remapping

(7)

irq_remapping_enabled = 1;

/*
 * VT-d has a different layout for IO-APIC entries when
 * interrupt remapping is enabled. So it needs a special routine
 * to print IO-APIC entries for debugging purposes too.
 */
x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries;

pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");

return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;

error:
    /*
     * handle error condition gracefully here!
     */

if (x2apic_present)
    pr_warn("Failed to enable irq remapping.  You are vulnerable to irq-injection attacks.\n");

return -1;

如果成功的话,返回IRQ_REMAP_XAPIC_MODE(0)IRQ_REMAP_X2APIC_MODE(1),否则返回-1

参考资料:
"BIOS Considerations" in *Intel ® Virtualization Technology for Directed I/O

Linux kernel IOMMU代码分析笔记(7)——intel_enable_irq_remapping(1)

看一下intel_enable_irq_remapping的代码:

static int __init intel_enable_irq_remapping(void)
{
    struct dmar_drhd_unit *drhd;
    struct intel_iommu *iommu;
    bool x2apic_present;
    int setup = 0;
    int eim = 0;

    x2apic_present = x2apic_supported();

    if (parse_ioapics_under_ir() != 1) {
        printk(KERN_INFO "Not enable interrupt remapping\n");
        goto error;
    }

    if (x2apic_present) {
        pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");

        eim = !dmar_x2apic_optout();
        if (!eim)
            printk(KERN_WARNING
                "Your BIOS is broken and requested that x2apic be disabled.\n"
                "This will slightly decrease performance.\n"
                "Use 'intremap=no_x2apic_optout' to override BIOS request.\n");
    }

    for_each_iommu(iommu, drhd) {
        /*
         * If the queued invalidation is already initialized,
         * shouldn't disable it.
         */
        if (iommu->qi)
            continue;

        /*
         * Clear previous faults.
         */
        dmar_fault(-1, iommu);

        /*
         * Disable intr remapping and queued invalidation, if already
         * enabled prior to OS handover.
         */
        iommu_disable_irq_remapping(iommu);

        dmar_disable_qi(iommu);
    }

    /*
     * check for the Interrupt-remapping support
     */
    for_each_iommu(iommu, drhd) {
        if (!ecap_ir_support(iommu->ecap))
            continue;

        if (eim && !ecap_eim_support(iommu->ecap)) {
            printk(KERN_INFO "DRHD %Lx: EIM not supported by DRHD, "
                   " ecap %Lx\n", drhd->reg_base_addr, iommu->ecap);
            goto error;
        }
    }

    /*
     * Enable queued invalidation for all the DRHD's.
     */
    for_each_iommu(iommu, drhd) {
        int ret = dmar_enable_qi(iommu);

        if (ret) {
            printk(KERN_ERR "DRHD %Lx: failed to enable queued, "
                   " invalidation, ecap %Lx, ret %d\n",
                   drhd->reg_base_addr, iommu->ecap, ret);
            goto error;
        }
    }

    /*
     * Setup Interrupt-remapping for all the DRHD's now.
     */
    for_each_iommu(iommu, drhd) {
        if (!ecap_ir_support(iommu->ecap))
            continue;

        if (intel_setup_irq_remapping(iommu, eim))
            goto error;

        setup = 1;
    }

    if (!setup)
        goto error;

    irq_remapping_enabled = 1;

    /*
     * VT-d has a different layout for IO-APIC entries when
     * interrupt remapping is enabled. So it needs a special routine
     * to print IO-APIC entries for debugging purposes too.
     */
    x86_io_apic_ops.print_entries = intel_ir_io_apic_print_entries;

    pr_info("Enabled IRQ remapping in %s mode\n", eim ? "x2apic" : "xapic");

    return eim ? IRQ_REMAP_X2APIC_MODE : IRQ_REMAP_XAPIC_MODE;

error:
    /*
     * handle error condition gracefully here!
     */

    if (x2apic_present)
        pr_warn("Failed to enable irq remapping.  You are vulnerable to irq-injection attacks.\n");

    return -1;
}

(1)

x2apic_present = x2apic_supported();

查看系统是否支持x2APIC模式。

(2)

if (parse_ioapics_under_ir() != 1) {
    printk(KERN_INFO "Not enable interrupt remapping\n");
    goto error;
}

parse_ioapics_under_ir函数如下:

/*
 * Finds the assocaition between IOAPIC's and its Interrupt-remapping
 * hardware unit.
 */
static int __init parse_ioapics_under_ir(void)
{
    struct dmar_drhd_unit *drhd;
    struct intel_iommu *iommu;
    int ir_supported = 0;
    int ioapic_idx;

    for_each_iommu(iommu, drhd)
        if (ecap_ir_support(iommu->ecap)) {
            if (ir_parse_ioapic_hpet_scope(drhd->hdr, iommu))
                return -1;

            ir_supported = 1;
        }

    if (!ir_supported)
        return 0;

    for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
        int ioapic_id = mpc_ioapic_id(ioapic_idx);
        if (!map_ioapic_to_ir(ioapic_id)) {
            pr_err(FW_BUG "ioapic %d has no mapping iommu, "
                   "interrupt remapping will be disabled\n",
                   ioapic_id);
            return -1;
        }
    }

    return 1;
}

ir_parse_ioapic_hpet_scope函数的作用是解析device scope类型是IOAPICHPET (High Precision Event Timer)

static int ir_parse_ioapic_hpet_scope(struct acpi_dmar_header *header,
                      struct intel_iommu *iommu)
{
    struct acpi_dmar_hardware_unit *drhd;
    struct acpi_dmar_device_scope *scope;
    void *start, *end;

    drhd = (struct acpi_dmar_hardware_unit *)header;

    start = (void *)(drhd + 1);
    end = ((void *)drhd) + header->length;

    while (start < end) {
        scope = start;
        if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_IOAPIC) {
            if (ir_ioapic_num == MAX_IO_APICS) {
                printk(KERN_WARNING "Exceeded Max IO APICS\n");
                return -1;
            }

            printk(KERN_INFO "IOAPIC id %d under DRHD base "
                   " 0x%Lx IOMMU %d\n", scope->enumeration_id,
                   drhd->address, iommu->seq_id);

            ir_parse_one_ioapic_scope(scope, iommu);
        } else if (scope->entry_type == ACPI_DMAR_SCOPE_TYPE_HPET) {
            if (ir_hpet_num == MAX_HPET_TBS) {
                printk(KERN_WARNING "Exceeded Max HPET blocks\n");
                return -1;
            }

            printk(KERN_INFO "HPET id %d under DRHD base"
                   " 0x%Lx\n", scope->enumeration_id,
                   drhd->address);

            ir_parse_one_hpet_scope(scope, iommu);
        }
        start += scope->length;
    }

    return 0;
}

ir_parse_one_ioapic_scope为例(ir_parse_one_hpet_scope类似):

static void ir_parse_one_ioapic_scope(struct acpi_dmar_device_scope *scope,
                      struct intel_iommu *iommu)
{
    struct acpi_dmar_pci_path *path;
    u8 bus;
    int count;

    bus = scope->bus;
    path = (struct acpi_dmar_pci_path *)(scope + 1);
    count = (scope->length - sizeof(struct acpi_dmar_device_scope))
        / sizeof(struct acpi_dmar_pci_path);

    while (--count > 0) {
        /*
         * Access PCI directly due to the PCI
         * subsystem isn't initialized yet.
         */
        bus = read_pci_config_byte(bus, path->device, path->function,
                       PCI_SECONDARY_BUS);
        path++;
    }

    ir_ioapic[ir_ioapic_num].bus   = bus;
    ir_ioapic[ir_ioapic_num].devfn = PCI_DEVFN(path->device, path->function);
    ir_ioapic[ir_ioapic_num].iommu = iommu;
    ir_ioapic[ir_ioapic_num].id    = scope->enumeration_id;
    ir_ioapic_num++;
}

可以看到,实际上是通过递归访问path得到IOAPIC信息的过程:bus好,对应的iommu设备单元,等等。

for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) {
    int ioapic_id = mpc_ioapic_id(ioapic_idx);
    if (!map_ioapic_to_ir(ioapic_id)) {
        pr_err(FW_BUG "ioapic %d has no mapping iommu, "
               "interrupt remapping will be disabled\n",
               ioapic_id);
        return -1;
    }
}

这段代码则是检查IOAPIC是否都有对应的IOMMU

(3)

if (x2apic_present) {
        pr_info("Queued invalidation will be enabled to support x2apic and Intr-remapping.\n");

        eim = !dmar_x2apic_optout();
        if (!eim)
            printk(KERN_WARNING
                "Your BIOS is broken and requested that x2apic be disabled.\n"
                "This will slightly decrease performance.\n"
                "Use 'intremap=no_x2apic_optout' to override BIOS request.\n");
    }  

dmar_x2apic_optout函数实现如下:

static int __init dmar_x2apic_optout(void)
{
    struct acpi_table_dmar *dmar;
    dmar = (struct acpi_table_dmar *)dmar_tbl;
    if (!dmar || no_x2apic_optout)
        return 0;
    return dmar->flags & DMAR_X2APIC_OPT_OUT;
}

这个函数的返回值表示系统是否使用X2APIC功能(1表示不使用,0表示使用)。

参考资料:
"BIOS Considerations" in *Intel ® Virtualization Technology for Directed I/Oc

Linux kernel IOMMU代码分析笔记(6)——intel_irq_remapping_supported

看一下intel_irq_remapping_supported的代码:

static int __init intel_irq_remapping_supported(void)
{
    struct dmar_drhd_unit *drhd;
    struct intel_iommu *iommu;

    if (disable_irq_remap)
        return 0;
    if (irq_remap_broken) {
        printk(KERN_WARNING
            "This system BIOS has enabled interrupt remapping\n"
            "on a chipset that contains an erratum making that\n"
            "feature unstable.  To maintain system stability\n"
            "interrupt remapping is being disabled.  Please\n"
            "contact your BIOS vendor for an update\n");
        add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
        disable_irq_remap = 1;
        return 0;
    }

    if (!dmar_ir_support())
        return 0;

    for_each_iommu(iommu, drhd)
        if (!ecap_ir_support(iommu->ecap))
            return 0;

    return 1;
}

disable_irq_remapirq_remap_broken定义在irq_remapping.c中,用来判断是否支持IRQ remapping这个功能。

dmar_ir_support定义在dmar.c中:

/*
 * Check interrupt remapping support in DMAR table description.
 */
int __init dmar_ir_support(void)
{
    struct acpi_table_dmar *dmar;
    dmar = (struct acpi_table_dmar *)dmar_tbl;
    if (!dmar)
        return 0;
    return dmar->flags & 0x1;
}

flags的第0位标示平台是否支持IRQ remapping这个功能。

    for_each_iommu(iommu, drhd)
        if (!ecap_ir_support(iommu->ecap))
            return 0;

这段代码检查DMA Remapping Hardware Unit的所代表的IOMMU单元是否支持IRQ remapping这个功能。

如果检查都通过了,返回1,表示支持IRQ remapping这个功能。

参考资料:
"BIOS Considerations" in *Intel ® Virtualization Technology for Directed I/O

Linux kernel 笔记 (13)——“magic SysRq key”简介

Magic SysRq key”是一种组合键(例如在X86平台,是ALT-SysRq-<command key>),除非kernel被完全锁定(连中断都无法处理),否则就会响应这个组合键。这是一个很好的调试kernel方法。

要使用“magic SysRq key”功能,编译kernelCONFIG_MAGIC_SYSRQ要选择yes:“make menuconfig”->“Kernel hacking”->“Magic SysRq key”。当“magic SysRq key”功能编译进kernel后,可以通过/proc/sys/kernel/sysrq文件控制“magic SysRq key”键的功能。这个文件里的默认值是CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE这个配置项的值。例如:

0:关闭所有“`magic SysRq key`”功能
1:开启所有“`magic SysRq key`”功能
......

要注意/proc/sys/kernel/sysrq只能影响通过键盘使用“magic SysRq key”键的功能。而通过访问/proc/sysrq-trigger使用“magic SysRq key”键的功能则总是允许的。

常用的命令包括:

重启系统 (在系统hung住时特别管用):
echo b > /proc/sysrq-trigger

让系统crash:
echo c > /proc/sysrq-trigger

......

具体请参考kernel文档。

参考资料:
(1)magic sysrq: a linux system debugging technique
(2)sysrq.txt

Linux kernel 笔记 (12)——如何删除内核?

本文介绍如何删除自己编译安装的内核:

(1)在/boot文件夹下删除相关文件:

/boot/vmlinuz*KERNEL-VERSION*
/boot/initrd*KERNEL-VERSION*
/boot/System-map*KERNEL-VERSION*
/boot/config-*KERNEL-VERSION*(如果存在)

(2)删除和这个kernel相关的module文件夹。默认是在/lib/modules这个目录下:

/lib/modules/*KERNEL-VERSION*/

(3)修改grub启动文件:删除相应的menuentry,并记得修改default值。

参考资料:
How to: Linux delete or remove kernel

Linux kernel IOMMU代码分析笔记(5)——Interrupt Remapping初始化相关部分

irq_remap_ops定义在drivers\iommu\irq_remapping.h中:

struct irq_remap_ops {
    /* Check whether Interrupt Remapping is supported */
    int (*supported)(void);

    /* Initializes hardware and makes it ready for remapping interrupts */
    int  (*prepare)(void);

    /* Enables the remapping hardware */
    int  (*enable)(void);

    /* Disables the remapping hardware */
    void (*disable)(void);

    /* Reenables the remapping hardware */
    int  (*reenable)(int);

    /* Enable fault handling */
    int  (*enable_faulting)(void);

    /* IO-APIC setup routine */
    int (*setup_ioapic_entry)(int irq, struct IO_APIC_route_entry *,
                  unsigned int, int,
                  struct io_apic_irq_attr *);

    /* Set the CPU affinity of a remapped interrupt */
    int (*set_affinity)(struct irq_data *data, const struct cpumask *mask,
                bool force);

    /* Free an IRQ */
    int (*free_irq)(int);

    /* Create MSI msg to use for interrupt remapping */
    void (*compose_msi_msg)(struct pci_dev *,
                unsigned int, unsigned int,
                struct msi_msg *, u8);

    /* Allocate remapping resources for MSI */
    int (*msi_alloc_irq)(struct pci_dev *, int, int);

    /* Setup the remapped MSI irq */
    int (*msi_setup_irq)(struct pci_dev *, unsigned int, int, int);

    /* Setup interrupt remapping for an HPET MSI */
    int (*setup_hpet_msi)(unsigned int, unsigned int);
};

extern struct irq_remap_ops intel_irq_remap_ops;
extern struct irq_remap_ops amd_iommu_irq_ops;

可以看到,在结构体中定义了一系列的函数指针,每个函数的作用都写得很清楚。

针对intel处理器的结构体定义在drivers\iommu\intel_irq_remapping.c

struct irq_remap_ops intel_irq_remap_ops = {
    .supported      = intel_irq_remapping_supported,
    .prepare        = dmar_table_init,
    .enable         = intel_enable_irq_remapping,
    .disable        = disable_irq_remapping,
    .reenable       = reenable_irq_remapping,
    .enable_faulting    = enable_drhd_fault_handling,
    .setup_ioapic_entry = intel_setup_ioapic_entry,
    .set_affinity       = intel_ioapic_set_affinity,
    .free_irq       = free_irte,
    .compose_msi_msg    = intel_compose_msi_msg,
    .msi_alloc_irq      = intel_msi_alloc_irq,
    .msi_setup_irq      = intel_msi_setup_irq,
    .setup_hpet_msi     = intel_setup_hpet_msi,
};

AMD相关的暂不讨论。

enable_IR_x2apic函数(arch\x86\kernel\apic)中,包含着irq_remap_ops中初始化相关函数的调用:

void __init enable_IR_x2apic(void)
{
    ......
    /* Make sure irq_remap_ops are initialized */
    setup_irq_remapping_ops();

    hardware_init_ret = irq_remapping_prepare();
    if (hardware_init_ret && !x2apic_supported())
        return;

    ......   
    if (hardware_init_ret)
        ret = -1;
    else
        ret = enable_IR();

    ......
}

(1)

void __init setup_irq_remapping_ops(void)
{
    remap_ops = &intel_irq_remap_ops;

#ifdef CONFIG_AMD_IOMMU
    if (amd_iommu_irq_ops.prepare() == 0)
        remap_ops = &amd_iommu_irq_ops;
#endif
}

这个函数会让全局变量remap_ops这个指针指向intel_irq_remap_opsamd_iommu_irq_ops

(2)

int __init irq_remapping_prepare(void)
{
    if (!remap_ops || !remap_ops->prepare)
        return -ENODEV;

    return remap_ops->prepare();
}

irq_remapping_prepare会调用prepare函数(Intel平台即dmar_table_init)。

(3)

int __init enable_IR(void)
{
#ifdef CONFIG_IRQ_REMAP
    if (!irq_remapping_supported()) {
        pr_debug("intr-remapping not supported\n");
        return -1;
    }

    if (!x2apic_preenabled && skip_ioapic_setup) {
        pr_info("Skipped enabling intr-remap because of skipping "
            "io-apic setup\n");
        return -1;
    }

    return irq_remapping_enable();
#endif
    return -1;
}

enable_IR中会调用irq_remapping_supportedirq_remapping_enable

int irq_remapping_supported(void)
{
    if (disable_irq_remap)
        return 0;

    if (!remap_ops || !remap_ops->supported)
        return 0;

    return remap_ops->supported();
}

int __init irq_remapping_enable(void)
{
    int ret;

    if (!remap_ops || !remap_ops->enable)
        return -ENODEV;

    ret = remap_ops->enable();

    if (irq_remapping_enabled)
        irq_remapping_modify_x86_ops();

    return ret;
}

会先后调用supportedIntel平台即为intel_irq_remapping_supported)和enable函数(Intel平台即为intel_enable_irq_remapping)。

irq_remapping_modify_x86_ops会把其它函数赋值给相应的全局函数指针:

static void __init irq_remapping_modify_x86_ops(void)
{
    x86_io_apic_ops.disable     = irq_remapping_disable_io_apic;
    x86_io_apic_ops.set_affinity    = set_remapped_irq_affinity;
    x86_io_apic_ops.setup_entry = setup_ioapic_remapped_entry;
    x86_io_apic_ops.eoi_ioapic_pin  = eoi_ioapic_pin_remapped;
    x86_msi.setup_msi_irqs      = irq_remapping_setup_msi_irqs;
    x86_msi.setup_hpet_msi      = setup_hpet_msi_remapped;
    x86_msi.compose_msi_msg     = compose_remapped_msi_msg;
}

Linux kernel 笔记 (11)——pr_fmt

kernel代码中,很多.c文件开头都会有pr_fmt的定义,例如drivers\iommu\dmar.c

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* has to precede printk.h */

这个pr_fmt是针对这个特定的模块定义的。看一下include\linux\printk.h

#ifndef pr_fmt
#define pr_fmt(fmt) fmt
#endif

可以看到,如果这个模块没有定义自己的pr_fmt,那么就会使用默认的fmt。自己定义pr_fmt的好处是可以更清晰地打印这个模块的相关log,便于以后调试。