Shark代码分析笔记(4)——bpf.lua(未完待续)

bpf.lua位于bpf文件夹下,其代码如下(省去版权信息):

local ffi = require("ffi")
local C = ffi.C

local bpf = {}

bpf.cargs_extra = ""

---------------------------------------------------------------

bpf.cargs_add = function(s)
  bpf.cargs_extra = s
end

---------------------------------------------------------------

ffi.cdef[[
int load_bpf_file(const char *path);
]]

local function load_bpf_file(path)
  local ret = C.load_bpf_file(path)
  if tonumber(ret) ~= 0 then
    os.exit(-1)
  end
end

local function run_llc_bpf(bc_file, bpfobj_file)
  os.execute("llc-bpf -march=bpf -filetype=obj -o " ..
              bpfobj_file .. " " .. bc_file)
end

---------------------------------------------------------------
local function split(s, delimiter)
  local result = {};
  for match in (s..delimiter):gmatch("(.-)"..delimiter) do
    table.insert(result, match);
  end
  return result;
end

local basic_type_tbl = {
  "char", "u8", "short", "u16", "int", "u32", "long", "long long", "u64"
}

local function get_real_type(typestr)
  for _, v in pairs(basic_type_tbl) do
    if v == typestr then
      return v
    end
  end

  return "cdata"
end

local function process_type(typestr0)
  local typestr, size, size

  typestr, size = string.match(typestr0, "(.*) %[(.*)%]")
  if typestr ~= nil and size ~= nil then
    sizestr = "sizeof("..typestr .. ") * "..size
    typestr = typestr
    if typestr == "char" then
      typestr = "string"
    end
  else
    typestr, size = string.match(typestr0, "(.*)%[(.*)%]")
    if typestr ~= nil and size ~= nil then
      sizestr = "sizeof("..typestr .. ") * "..size
      typestr = typestr
      if typestr == "char" then
        typestr = "string"
      end
    else
      sizestr = "sizeof("..typestr0..")"
      typestr = get_real_type(typestr0)
    end
  end

  return typestr, sizestr
end

local function gen_map_decl(map_type, key_type, val_type, entries, name)
  local key_typestr, key_sizestr = process_type(key_type)
  local val_typestr, val_sizestr = process_type(val_type)

  if not entries then
    entries = 1024;
  end

  local str = string.format("struct bpf_map_def SEC(\"maps\") %s = {\n" ..
                            "\t.name = \"%s\",                 \n" ..
                            "\t.key_type = \"%s\",             \n" ..
                            "\t.val_type = \"%s\",             \n" ..
                            "\t.type = BPF_MAP_TYPE_%s,        \n" ..
                            "\t.key_size = %s,                 \n" ..
                            "\t.value_size = %s,               \n" ..
                            "\t.max_entries = %d,              \n};\n",
        name, name, key_typestr, val_typestr, map_type, key_sizestr,
        val_sizestr, entries)
  io.write(str)
end

local function translate_cdef(s)
  local n = split(s, '\n')
  for _, line in pairs(n) do
    local key_type, val_type, entries, name
    local match = false

    key_type, val_type, name = string.match(line,
             "bpf_map_hash<([^,]*), ([^,]*)> (.*);")
    if key_type and val_type and name then
      gen_map_decl("HASH", key_type, val_type, nil, name)
      match = true
    end

    key_type, val_type, entries, name = string.match(line,
             "bpf_map_hash<([^,]*), ([^,]*), (%d+)> (.*);")
    if key_type and val_type and entries and name then
      gen_map_decl("HASH", key_type, val_type, entries, name)
      match = true
    end

    key_type, val_type, name = string.match(line,
             "bpf_map_array<([^,]*), ([^,]*)> (.*);")
    if key_type and val_type and name then
      gen_map_decl("ARRAY", key_type, val_type, nil, name)
      match = true
    end

    key_type, val_type, entries, name = string.match(line,
             "bpf_map_array<([^,]*), ([^,]*), (%d+)> (.*);")
    if key_type and val_type and entries and name then
      gen_map_decl("ARRAY", key_type, val_type, entries, name)
      match = true
    end

    if not match then
      io.write(line, "\n")
    end
  end

  io.write("\nchar _license[] SEC(\"license\") = \"GPL\";\n")
  io.write("\n#include <linux/version.h>\n")
  io.write("\nu32 _version SEC(\"version\") = LINUX_VERSION_CODE;\n")
end

local function compile(srcfile, objfile)
   local f = io.popen("uname -r", "r")
   local release = f:read()
   local linuxinc = string.format("/lib/modules/%s/build/", release)
   local bpfinc = "bpf/libbpf"

   local clang_cmd = string.format("clang -nostdinc -isystem /usr/lib/gcc/x86_64-linux-gnu/4.8/include -I%s -I%s/arch/x86/include -I%s/arch/x86/include/generated/uapi -I%s/arch/x86/include/generated  -I%s/include -I%s/arch/x86/include/uapi -I%s/arch/x86/include/generated/uapi -I%s/include/uapi -I%s/include/generated/uapi -include %s/include/linux/kconfig.h %s -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign -O2 -emit-llvm -x c -c %s -o %s", bpfinc, linuxinc, linuxinc, linuxinc, linuxinc, linuxinc, linuxinc, linuxinc, linuxinc, linuxinc, bpf.cargs_extra, srcfile, objfile)

  os.execute(clang_cmd)
end

-- builtin cdef function
bpf.cdef = function(s)
  local file

  local srctmp = os.tmpname()
  file = io.open(srctmp, "w")
  io.output(file)
  translate_cdef(s)

  io.close(file)
  io.output(io.stdout)

  -- dump source
  local f = io.open(srctmp, "rb")
  local content = f:read("*all")
  print(content)
  f:close()

  local bctmp = os.tmpname()
  compile(srctmp, bctmp)

  local bpftmp = os.tmpname()
  run_llc_bpf(bctmp, bpftmp)

  -- pass bpf table for bpf object loading, need clear it after load done.
  _G["bpf"] = bpf
  load_bpf_file(bpftmp)

  os.remove(srctmp)
  os.remove(bctmp)
  os.remove(bpftmp)
end

---------------------------------------------------------------

bpf.print_map = function(map)
  local map = map
  for k in pairs(map) do
    print(k, map[k])
  end
end

---------------------------------------------------------------

bpf.copy_map = function(map)
  local new = {}
  local map = map
  for k in pairs(map) do
    new[k] = map[k]
  end
  return new
end

---------------------------------------------------------------

local function fill_line(n, max)
  for i = 1, max do
    if i < n then
      io.write("*")
    else
      io.write(" ")
    end
  end
end

-- print histogram for bpf.var.map
-- Only support number key now.
bpf.print_hist_map = function(t)
  local histo = {}
  local stdSum, max = 0, 0

  for k in pairs(t) do
    if t[k] ~= 0 then
      local k1 = math.pow(2, math.floor(math.log(k, 2)))
      if histo[k1] == nil then histo[k1] = 0 end
      histo[k1] = histo[k1] + t[k]

      stdSum = stdSum + t[k]
      if k1 > max then max = k1 end
    end
  end

  print("        value  ------------- Distribution -------------  count")
  local k = 0
  while k <= max do
    local v = histo[k]
    if v == nil then v = 0 end

    io.write(string.format("%13d |", k))
    fill_line(v * 40 / stdSum, 40)
    io.write(string.format("| %d", v))

    print()

    if k == 0 then k = 1 else k = k * 2 end
  end
end

return bpf

(1)

local ffi = require("ffi")
local C = ffi.C

local bpf = {}

bpf.cargs_extra = ""

以上代码导入ffi模块,并且定义bpf这个tablebpf table有一个cargs_extrakey,所对应的值是一个空字符串。

 

BIOS和UEFI

BIOS(Basic Input/Output System)是固化在主板芯片上的一段代码。电脑启动时,由BIOS负责启动各个硬件单元,并把“控制权”交给操作系统。BIOS使用MBRMaster Boot Record,存储电脑的分区表)来决定使用哪个操作系统。BIOS为用户提供了一个操作硬件的接口。

UEFI(Unified Extensible Firmware Interface)BIOS的继任者。它同BIOS的比较如下:

a)16-bit vs 32/64-bit
BIOS只能工作在16-bit处理器模式上,且只能访问1M内存。而UEFI则可工作在32/64-bit处理器模式上,且可访问更大的内存空间。

b)Booting
MBR限制每个磁盘只能有4个分区,且可以boot的磁盘大小限制在2.2TB。而UEFI使用GUID Partition Table,可以访问更大的磁盘。

c)Extensions
UEFI支持老的extensions(比如,ACPI)。

参考资料:
HTG Explains: Learn How UEFI Will Replace Your PC’s BIOS

 

“Emulation”和“Hardware virtualization”的比较

Emulation”是软件模拟硬件,即在你当前的host机器(举个例子:X86)上运行另一个SPARC平台的的虚拟机。软件需要把SPARC平台指令转化为X86指令,所以速度很慢。(qemu-system-x86_64

Hardware virtualization”是硬件支持虚拟化。即软件直接利用CPU和芯片组等硬件。由于没有指令转化,所以速度很快。当然,host机器和虚拟机是同样的指令集。(qemu-system-x86_64 -enable-kvm

参考资料:
qemu-kvm or qemu-system-x86_64?

 

git branch & merge笔记

In fact, in Git, the act of creating a new branch is simply writing a file in the .git/refs/heads directory that has the SHA-1 of the last commit for that branch.

Creating a branch is nothing more than just writing 40 characters to a file.

Switching to that branch simply means having Git make your working directory look like the tree that SHA-1 points to and updating the HEAD file so each commit from that point on moves that branch pointer forward (in other words, it changes the 40 characters in .git/refs/heads/[current_branch_name] be the SHA-1 of your last commit).

可以看到,在git中,创建一个branch仅仅是在一个文件中加入40个字符的SHA-1值。

Remotes are basically pointers to branches in other peoples copies of the same repository, often on other computers. If you got your repository by cloning it, rather than initializing it, you should have a remote branch of where you copied it from automatically added as origin by default. Which means the tree that was checked out during your initial clone would be referenced as origin/master , which means “the master branch of the origin remote.”

Remotes是指向其它人关于这个repository copy里某个branch的指针。如果你的repository是通过clone其它copy得到的,而不是initialize的,在你的repository里,会自动产生一个origin/masterremote branch指向你copyrepository tree

参考资料:
Git internals

PCI寻址笔记

Linux现在支持多个PCI domain,每个PCI domain管理256bus2^8),每个bus上可以挂载322^5)个设备,每个设备最多支持82^3)个function。同一PCI bus上的设备共享memory locationI/O port的地址空间,而configuration register则不是。每个PCI设备的configuration register256个字节。

执行“lspci -D”命令显示当前系统的PCI设备信息。格式为:domain:bus:dev:func

[root@localhost ~]# lspci -D
0000:00:00.0 Host bridge: Intel Corporation 440FX - 82441FX PMC [Natoma] (rev 02)
0000:00:01.0 ISA bridge: Intel Corporation 82371SB PIIX3 ISA [Natoma/Triton II]
0000:00:01.1 IDE interface: Intel Corporation 82371AB/EB/MB PIIX4 IDE (rev 01)
0000:00:02.0 VGA compatible controller: InnoTek Systemberatung GmbH VirtualBox Graphics Adapter
0000:00:03.0 Ethernet controller: Intel Corporation 82540EM Gigabit Ethernet Controller (rev 02)
0000:00:04.0 System peripheral: InnoTek Systemberatung GmbH VirtualBox Guest Service
0000:00:05.0 Multimedia audio controller: Intel Corporation 82801AA AC'97 Audio Controller (rev 01)
0000:00:06.0 USB controller: Apple Inc. KeyLargo/Intrepid USB
0000:00:07.0 Bridge: Intel Corporation 82371AB/EB/MB PIIX4 ACPI (rev 08)
0000:00:0b.0 USB controller: Intel Corporation 82801FB/FBM/FR/FW/FRW (ICH6 Family) USB2 EHCI Controller
0000:00:0d.0 SATA controller: Intel Corporation 82801HM/HEM (ICH8M/ICH8M-E) SATA Controller [AHCI mode] (rev 02)

闲侃两句工资的事

今天看到AVOS Cloud公司的薪酬体系制度(http://open.avoscloud.com/salary.html),很有感触。借题发挥,闲扯两句。

工资在当今社会已经变成了一个非常敏感的话题,基本上在各个公司都是保密的,不让议论的。在这个问题上,我接触到的一些国企反倒是公开的,不过上层领导有多少不算在工资里的收入,就不得而知了。其实我想了一下,之所以保密这个东西,无非是怕公开出来,会引起员工心理不平衡,进而影响工作效率,公司正常运转等等。比方说,员工A会想,员工B总是偷偷上网,工作效率也不高,凭什么收入比我高?员工B又会想,员工C就会拍马屁,凭什么收入比我高?等等。而关于这些具体事情的细节,究竟孰是孰非,恐怕也没人有精力愿意去调查明白,所以干脆就把这块设立为禁区,谁也别碰。但是这么做真就能把这个问题解决吗?大家心里就不会想这些事了吗?就能相安无事了吗?我觉得未必。我不是HR,也不是学心理学的。但我知道人都不傻子,谁吃亏谁心里明白,保密也好,不让别人议论也罢,也许并一定是最好的方法。如何做最好呢?我也不知道,只是希望公司管理者们能够认真思考一下这个问题。

如何组织一场技术沙龙?

从去年到现在,我参加了几次技术沙龙。其中有组织的很好的,也有不尽如人意的。在这篇文章里,我以举办一次Go语言话题分享为例,谈谈我个人觉得该如何办好一场技术沙龙。

(1)组织

首先说一下沙龙的组织者与参与者之间沟通形式的问题。现在是信息高度发达的互联网时代,对于组织者来说,不仅有像meetup这样专门发布聚会信息的网站,还有各种IM群来让大家讨论,献计献策。而在试用过这些五花八门的工具后,我个人倒是觉得,古老的邮件列表可能是最好的联系方式。原因有以下两点:

a)你现在所使用的IM工具也许在某一天会消失,但是电子邮件不会,它依然是现在世界范围内最广泛使用的沟通工具;
b)邮件列表可以很好地把信息和资料保存起来,方便将来查找。

选好沟通工具后,组织者就可以向大家征集与会议题并展开讨论了。同时组织者还要联系场地和赞助,比如说可以提供一些抽奖奖品或是纪念品等等。这些纪念品不用价值很昂贵,个人觉得一个马克杯,一件T恤,或是一个普通笔记本就可以了。此外,对提供场地和赞助的公司或个人,也可以在沙龙上进行一下适度的宣传或广告。毕竟人家提供了帮助,这样也可以提高赞助者们未来提供更多帮助的积极性。

接下来说一下时间的问题。举办一次沙龙还是比较耗费时间和精力的,所以2~3个月一次就可以了。举办时间最好安排在周六的下午,这样可以方便喜欢周末睡懒觉的朋友。选择周六另外一个原因是:第二天是周日,仍然休息,所以即使这一天弄得太晚,也不用担心明天早起上班。

(2)选题

一般的技术沙龙会安排3~4个小时,而这还包含参与者提问和中间休息的时间,所以我觉得安排4个选题就够了。

第一个选题是“暖场”作用,不一定非要技术相关。可以讲讲Go语言社区这段时间的动态,一些轶闻趣事等等,目的是把大家的兴趣和积极性调动起来。

第二、三个选题应该是和Go语言技术紧密相关的了,可以介绍Go语言的使用技巧和调试方法,当前有哪些比较好的Go语言开源项目,以及各个公司使用Go语言的经验分享等等。

最后一个选题可以是其它技术的话题分享:因为Go语言不是孤立的,而是和它运行的环境紧密结合的,所以可以讲讲Unix系统的知识,运维经验的分享等等。

(3)扫尾

开完沙龙后,大家要帮忙把会场打扫干净,这样将来赞助方才更愿意提供场地设施。组织者也要把一些资料及时地发出来,方便大家参考。

以上就是我个人对如何组织一场技术沙龙的一点拙见,希望能给大家一点帮助。

“Understanding Caching”笔记

本文是“Understanding Caching”的笔记:

(1)什么是cache line

A cache line is the smallest unit of memory that can be transferred to or from a cache. The essential elements that quantify a cache are called the read and write line widths. These signify the minimum amount of data the cache must read or write from the memory or cache below it. Frequently, these quantities are the same, so caches often are quantified simply by the line width. Even if they differ, the longest width usually is called the line width.

(2)inclusiveexclusive

A multilevel cache can be either inclusive or exclusive. Exclusive means a particular cache line may be present in exactly one of the cache levels and no more than one. Inclusive means the line may be present simultaneously in more than one level of cache. Nothing prevents the line widths from being different in differing cache levels.

(3)Write throughwrite back

Write through means the cache may store a copy of the data, but the write must be completed at the next level down before it can be signaled as complete to the layer above. Write back means a write may be considered complete as soon as the data is stored in the cache. For a write back cache, as long as the written data is not transmitted, the cache line is considered dirty, because it ultimately must be written out to the level below.

(4)Coherency

A cache line is termed coherent when the data in the line is identical to the data stored in the main memory being cached. If this is not true, the cache line is termed incoherent.

没有coherency带来的两个问题:
a)所有种类cache都有可能发生(stale data):主存数据更新了,但是cache数据不是最新的。因此会导致读错误。如下图所示:

7105f1.inline

这是一个暂时的错误,因为正确的数据在主存中,只需cache重新读一下即可。

b)这种错误只发生在write back cache中:主存和cache里分别更新了数据,现在要使二者数据一致。由于每次cache要更新一个cache line,所以必然要导致更新的数据出现不一致。如下图所示:

7105f2.inline

Libuv笔记(1)—— 模型

Libuv使用的是异步的(asynchronous),非阻塞(non-blocking),事件驱动的(event-driven)编程模式。它的核心是提供了一个事件循环(event loop),并对感兴趣的事件注册了回调函数(callback)。伪代码如下:

当有事件需要处理:
    取出下一个事件
    如果这个事件注册了回调函数:
        调用回调函数

Linux kernel IOMMU代码分析笔记(12)——page-table entry的相关代码定义

DMA请求的地址转换如下图所示:

1

page-table entry格式如下:

2

因为每个page-table entry82^3)个byte,所以上面的转化图中只要9位就可以了(12 - 32^12 = 4KiB)。

GAW的定义:

Guest Address Width: Physical addressability limit within a partition (virtual machine)

可以理解为从虚拟机角度看到的物理地址宽度。举个例子,如果一个虚拟机只能访问2G内存,那么GAW就是31

AGAW的定义:Adjusted Guest Address Width。为了保证9bit长度的步长转化,GAWAGAW之间的转换伪代码如下:

R = (GAW - 12) MOD 9;
if (R == 0) {
    AGAW = GAW;
} else {
    AGAW = GAW + 9 - R;
}
if (AGAW > 64)
    AGAW = 64;

对应的函数是guestwidth_to_adjustwidth

static inline int guestwidth_to_adjustwidth(int gaw)
{
    int agaw;
    int r = (gaw - 12) % 9;

    if (r == 0)
        agaw = gaw;
    else
        agaw = gaw + 9 - r;
    if (agaw > 64)
        agaw = 64;
    return agaw;
}

AGAW的最小长度是30bit,参考以下规范定义(context-entry格式里的内容):

• 000b: 30-bit AGAW (2-level page table)
• 001b: 39-bit AGAW (3-level page table)
• 010b: 48-bit AGAW (4-level page table)
• 011b: 57-bit AGAW (5-level page table)
• 100b: 64-bit AGAW (6-level page table)

所以可以看到kernelagaw的一些转换代码会用到302这些数字:

static inline int agaw_to_level(int agaw)
{
    return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
    return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
    return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}