FreeBSD kernel 笔记(4)——UIO

UIO相关的结构体和函数定义:

 #include <sys/types.h>
 #include <sys/uio.h>

 struct uio {
     struct  iovec *uio_iov;         /* scatter/gather list */
     int     uio_iovcnt;         /* length of scatter/gather list */
     off_t   uio_offset;         /* offset in target object */
     ssize_t uio_resid;          /* remaining bytes to copy */
     enum    uio_seg uio_segflg;     /* address space */
     enum    uio_rw uio_rw;      /* operation */
     struct  thread *uio_td;         /* owner */
 };

 int
 uiomove(void *buf, int howmuch, struct uio *uiop);

 int
 uiomove_nofault(void *buf, int howmuch, struct uio *uiop);

关于uio结构体需要注意的是:如果uio_iovcnt不为1,可以把uio_iov所指向的struct iovec看成一个连接起来的大buffer。uio_offset指向这个buffer的offset,而uio_resid表明还有多少字节需要copy。在执行read操作时,uio_offset表明已经填充的buffer大小,而uio_resid表明buffer剩余的空间。可以参考这个程序。

uiomove和uiomove_nofault本质上调用的都是uiomove_faultflag函数:

/*
 * Move up to n bytes between the buffer cp and the iovec list described by
 * uio.  For UIO_READ the data goes from cp into the iovecs; otherwise it
 * goes from the iovecs into cp.  The uio is modified in place: consumed
 * iovecs have iov_base/iov_len advanced, empty iovecs are stepped over via
 * uio_iov/uio_iovcnt, and uio_resid/uio_offset track the bytes moved.
 *
 * When nofault is non-zero and the uio targets user space, the thread runs
 * with TDP_NOFAULTING | TDP_RESETSPUR set for the duration of the copy.
 *
 * Returns 0, or the error reported by copyin()/copyout().
 */
static int
uiomove_faultflag(void *cp, int n, struct uio *uio, int nofault)
{
    struct thread *td;
    struct iovec *iov;
    size_t cnt;
    int error, newflags, save;

    td = curthread;
    error = 0;

    /* The direction must be one of the two known operations. */
    KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
    ("uiomove: mode"));
    /* A user-space uio must belong to the thread performing the copy. */
    KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == td,
    ("uiomove proc"));
    if (!nofault)
        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
        "Calling uiomove()");

    /* XXX does it make a sense to set TDP_DEADLKTREAT for UIO_SYSSPACE ? */
    newflags = TDP_DEADLKTREAT;
    if (uio->uio_segflg == UIO_USERSPACE && nofault) {
        /*
         * Fail if a non-spurious page fault occurs.
         */
        newflags |= TDP_NOFAULTING | TDP_RESETSPUR;
    }
    /* Remember the previous flags so they can be restored at "out". */
    save = curthread_pflags_set(newflags);

    /* Stop when the caller's count or the uio's residual is exhausted. */
    while (n > 0 && uio->uio_resid) {
        iov = uio->uio_iov;
        cnt = iov->iov_len;
        if (cnt == 0) {
            /* Current iovec is used up: step to the next one. */
            uio->uio_iov++;
            uio->uio_iovcnt--;
            continue;
        }
        /* Never move more than the caller asked for. */
        if (cnt > n)
            cnt = n;

        switch (uio->uio_segflg) {

        case UIO_USERSPACE:
            maybe_yield();
            if (uio->uio_rw == UIO_READ)
                error = copyout(cp, iov->iov_base, cnt);
            else
                error = copyin(iov->iov_base, cp, cnt);
            /* On failure, skip the bookkeeping for this chunk. */
            if (error)
                goto out;
            break;

        case UIO_SYSSPACE:
            if (uio->uio_rw == UIO_READ)
                bcopy(cp, iov->iov_base, cnt);
            else
                bcopy(iov->iov_base, cp, cnt);
            break;
        case UIO_NOCOPY:
            /* No data is moved; only the accounting below advances. */
            break;
        }
        /* Advance the iovec, the uio totals and the caller's buffer by cnt. */
        iov->iov_base = (char *)iov->iov_base + cnt;
        iov->iov_len -= cnt;
        uio->uio_resid -= cnt;
        uio->uio_offset += cnt;
        cp = (char *)cp + cnt;
        n -= cnt;
    }
out:
    curthread_pflags_restore(save);
    return (error);
}

可以看到这个函数会对传入的uio结构体的内容进行修改。

关于uiomove_nofault()函数,参考如下定义:

The function uiomove_nofault() requires that the buffer and I/O vectors be accessible without incurring a page fault. The source and destination addresses must be physically mapped for read and write access, respectively, and neither the source nor destination addresses may be pageable. Thus, the function uiomove_nofault() can be called from contexts where acquiring virtual memory system locks or sleeping are prohibited.

参考资料:
UIO

FreeBSD中的sysctl函数

FreeBSD中sysctl家族的函数定义:

#include <sys/types.h>
 #include <sys/sysctl.h>

 int
 sysctl(const int *name, u_int namelen, void *oldp, size_t *oldlenp,
 const void *newp, size_t newlen);

 int
 sysctlbyname(const char *name, void *oldp, size_t *oldlenp,
 const void *newp, size_t newlen);

 int
 sysctlnametomib(const char *name, int *mibp, size_t *sizep);

sysctl函数参数中,name和namelen用来表明内核参数ID;oldp和oldlenp用来存储当前内核参数的值;而newp和newlen则用来设置新的内核参数值。如果不需要的话,可以把相应的值置成NULL。
看一下sysctlbyname的实现:

/*
 * Name-based front end to sysctl(): translate the textual name into a
 * numeric OID with sysctlnametomib(), then hand that OID to sysctl()
 * together with the caller's old/new value arguments.
 *
 * Returns -1 if the name cannot be translated; otherwise returns whatever
 * sysctl() returns.
 */
int
sysctlbyname(const char *name, void *oldp, size_t *oldlenp,
    const void *newp, size_t newlen)
{
    int real_oid[CTL_MAXNAME+2];
    size_t oidlen;

    /* Pass in the capacity of real_oid, counted in ints. */
    oidlen = sizeof(real_oid) / sizeof(int);
    if (sysctlnametomib(name, real_oid, &oidlen) < 0)
        return (-1);
    /* oidlen is then used as the OID length for the real sysctl() call. */
    return (sysctl(real_oid, oidlen, oldp, oldlenp, newp, newlen));
}

可以看到,sysctlbyname首先通过sysctlnametomib获得真正的ID,接着调用sysctl完成想要的工作。

参考资料:
SYSCTL(3)
Grokking SYSCTL and the Art of Smashing Kernel Variables

Linux kernel 笔记 (62)——list_head

双向链表是Linux kernel中常用的数据结构,定义如下:

struct list_head {
    struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
    struct list_head name = LIST_HEAD_INIT(name)

static inline void INIT_LIST_HEAD(struct list_head *list)
{
    list->next = list;
    list->prev = list;
}
...

下图选自plka

Capture

从上图可以看出,定义链表需要一个头结点,通过头结点继而可以完成插入,删除元素等操作。来看一个例子(list.c):

struct list_head {
        struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)


int main(void) {
        LIST_HEAD(dev_list);
        return 0;
}

检查gcc预处理的输出:

# gcc -E -P list.c
struct list_head {
 struct list_head *next, *prev;
};
int main(void) {
 struct list_head dev_list = { &(dev_list), &(dev_list) };
 return 0;
}

可以看到,头结点dev_list的prev和next都指向了自己。下面代码达到同样的效果:

struct list_head {
    struct list_head *next, *prev;
};

static inline void INIT_LIST_HEAD(struct list_head *list)
{
    list->next = list;
    list->prev = list;
}

int main(void) {
    struct list_head dev_list;
    INIT_LIST_HEAD(&dev_list);
    return 0;
}

 

getopt和getopt_long

这篇笔记选自Using getopt。一个典型的Unix程序格式如下:

getopt [-dmp] [-s name] -f name file [file ...]

a)d、m、p是可选option,在一个[]中表示它们可以一起使用;

b)[-s name]表示s是一个带参数的可选option
c)-f name表示f是一个带参数的必选option
d)file [file ...]表示程序还需要一个或多个命令行参数。
getopt函数原型如下:

#include <unistd.h>

int getopt(int argc, char * const argv[], const char *optstring);

extern char *optarg;
extern int optind, opterr, optopt;

需要注意以下几点:

a)每次调用getopt后,如果option带参数,optarg指向后面跟着的参数;optind则表示下一次处理的option的index。因此当getopt解析完所有option后,如果optind同argc相同,则表示没有命令行参数。
b)getopt前两个参数直接从main函数参数得到,第三个参数指定如何处理option,例如"df:mps:"。冒号表示前面的option后面需要带参数。如果getopt解析option时遇到不在optstring中的option,返回?;把option全部解析完后返回-1。
下面看一下getopt_long和getopt_long_only(参考getopt(3) – Linux man page):

#include <getopt.h>

int getopt_long(int argc, char * const argv[],
           const char *optstring,
           const struct option *longopts, int *longindex);
int getopt_long_only(int argc, char * const argv[],
        const char *optstring,
        const struct option *longopts, int *longindex);

getopt_long除了可以处理short option外,还可以处理long option(以--开头)。关于struct option定义如下:

struct option {
    const char *name;
    int         has_arg;
    int        *flag;
    int         val;
};
The meanings of the different fields are:
name
is the name of the long option.

has_arg
is: no_argument (or 0) if the option does not take an argument; required_argument (or 1) if the option requires an argument; or optional_argument (or 2) if the option takes an optional argument.  

flag
specifies how results are returned for a long option. If flag is NULL, then getopt_long() returns val. (For example, the calling program may set val to the equivalent short option character.) Otherwise, getopt_long() returns 0, and flag points to a variable which is set to val if the option is found, but left unchanged if the option is not found.

val
is the value to return, or to load into the variable pointed to by flag.

如果flag是NULL,getopt_long会返回val的值,因此通常会把flag置成NULL,把val置成与long option对应的short option。否则getopt_long会返回0,并把val的值赋给flag所指向的变量。

参考下列代码(选自GNU binutils中的size命令)可以更好地了解getopt_long:

#define OPTION_FORMAT (200)
#define OPTION_RADIX (OPTION_FORMAT + 1)
#define OPTION_TARGET (OPTION_RADIX + 1)

static struct option long_options[] =
{
  {"common", no_argument, &show_common, 1},
  {"format", required_argument, 0, OPTION_FORMAT},
  {"radix", required_argument, 0, OPTION_RADIX},
  {"target", required_argument, 0, OPTION_TARGET},
  {"totals", no_argument, &show_totals, 1},
  {"version", no_argument, &show_version, 1},
  {"help", no_argument, &show_help, 1},
  {0, no_argument, 0, 0}
};


 while ((c = getopt_long (argc, argv, "ABHhVvdfotx", long_options,
               (int *) 0)) != EOF)
    switch (c)
      {
      case OPTION_FORMAT:
    switch (*optarg)
      {
      case 'B':
      case 'b':
        berkeley_format = 1;
        break;
      case 'S':
      case 's':
        berkeley_format = 0;
        break;
      default:
        non_fatal (_("invalid argument to --format: %s"), optarg);
        usage (stderr, 1);
      }
    break;

    ......

    case 0:
    break;
    ......
    }

{"format", required_argument, 0, OPTION_FORMAT}的flag是NULL,所以getopt_long返回值是OPTION_FORMAT;根据optarg确定应该使用哪种format。而{"totals", no_argument, &show_totals, 1}的flag不是NULL,getopt_long返回值是0,show_totals的值为1。

getopt_long和getopt_long_only的区别:

getopt_long_only() is like getopt_long(), but '-' as well as "--" can indicate a long option. If an option that starts with '-' (not "--") doesn't match a long option, but does match a short option, it is parsed as a short option instead.

 

Linux kernel 笔记 (47)——操作信号量的函数

操作信号量的函数如下:

#include <linux/semaphore.h>
void down(struct semaphore *sem);
int down_interruptible(struct semaphore *sem);
int down_killable(struct semaphore *sem);
int down_trylock(struct semaphore *sem); 
int down_timeout(struct semaphore *sem, long jiffies);
void up(struct semaphore *sem);

down已经不再推荐使用。

down_interruptible可以被信号打断,因此需要检查返回值:只有返回0,才表明成功获取了信号量。使用down_interruptible例子如下:

if (down_interruptible(&sem)) return -ERESTARTSYS;

down_killable只能被fatal信号打断,这种信号通常用来终止进程,因此down_killable用来保证用户进程可以被杀死,否则一旦有死锁进程,则只能重启系统。

down_trylock是非阻塞版本的down,也要检查返回值。举例如下:

if (file->f_flags & O_NONBLOCK) {
    if (down_trylock(&iosem)) return -EAGAIN;
} else {
    if (down_interruptible(&iosem)) return -ERESTARTSYS;
}

down_timeout用来等待一段时间,中间也不能被信号打断。

up用来释放信号量,不需要提供interrupt版本。

参考资料:
Mutex, semaphore and the proc file system

 

Linux kernel 笔记 (43)——do_sys_open

以下是do_sys_open在kernel 3.12版本的代码:

/*
 * Common implementation behind the open-family system calls: reserve a file
 * descriptor, obtain a struct file for filename via do_filp_open() (dfd is
 * passed through to it unchanged), and bind the two together.
 *
 * Returns the new file descriptor on success; otherwise returns the error
 * value produced by whichever step failed.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
    struct open_flags op;
    /* fd first holds build_open_flags()'s status; non-zero is returned as-is. */
    int fd = build_open_flags(flags, mode, &op);
    struct filename *tmp;

    if (fd)
        return fd;

    /* getname() yields either a struct filename or an error pointer. */
    tmp = getname(filename);
    if (IS_ERR(tmp))
        return PTR_ERR(tmp);

    fd = get_unused_fd_flags(flags);
    if (fd >= 0) {
        struct file *f = do_filp_open(dfd, tmp, &op);
        if (IS_ERR(f)) {
            /* Open failed: give the reserved descriptor back, report the error. */
            put_unused_fd(fd);
            fd = PTR_ERR(f);
        } else {
            fsnotify_open(f);
            /* Associate the descriptor with the opened file. */
            fd_install(fd, f);
        }
    }
    /* Release the name obtained from getname() on both success and failure. */
    putname(tmp);
    return fd;
}

核心部分如下:

a)get_unused_fd_flags得到一个文件描述符;
b)do_filp_open得到一个struct file结构;
c)fd_install把文件描述符和struct file结构关联起来。

struct file包含f_op成员:

struct file {
    ......
    const struct file_operations    *f_op;
    ......
    void            *private_data;
    ......
}

struct file_operations又包含open成员:

struct file_operations {
    ......
    int (*open) (struct inode *, struct file *);
    ......
}

open成员的两个参数:实际文件的inode节点和struct file结构。

open系统调用执行驱动中open方法之前(struct file_operations中的open成员),会将private_data置成NULL,用户可以根据自己的需要设置private_data的值(参考do_dentry_open函数)。

 

openat VS open

从2.6.16版本开始,GNU/Linux引入openat系统调用:

#define _XOPEN_SOURCE 700 /* Or define _POSIX_C_SOURCE >= 200809 */
#include <fcntl.h>
int openat(int  dirfd , const char * pathname , int  flags , ... /* mode_t  mode */);
Returns file descriptor on success, or –1 on error

与open相比,多了一个dirfd参数。关于它的用法,参考以下解释:

If pathname specifies a relative pathname, then it is interpreted relative to the directory referred to by the open file descriptor dirfd, rather than relative to the process’s current working directory.

If pathname specifies a relative pathname, and dirfd contains the special value AT_FDCWD , then pathname is interpreted relative to the process’s current working directory (i.e., the same behavior as open(2)).

If pathname specifies an absolute pathname, then dirfd is ignored.

总结起来,如果pathname是绝对路径,则dirfd参数没用。如果pathname是相对路径,并且dirfd的值不是AT_FDCWD,则pathname的参照物是相对于dirfd指向的目录,而不是进程的当前工作目录;反之,如果dirfd的值是AT_FDCWD,pathname则是相对于进程当前工作目录的相对路径,此时等同于open。参考kernel代码则一目了然:

/*
 * open system call: forwards to do_sys_open() with AT_FDCWD as the
 * directory descriptor.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
    /* Add O_LARGEFILE to the flags when force_o_largefile() says so. */
    if (force_o_largefile())
        flags |= O_LARGEFILE;

    return do_sys_open(AT_FDCWD, filename, flags, mode);
}

/*
 * openat system call: forwards to do_sys_open() with the caller-supplied
 * directory descriptor dfd.
 */
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
        umode_t, mode)
{
    /* Add O_LARGEFILE to the flags when force_o_largefile() says so. */
    if (force_o_largefile())
        flags |= O_LARGEFILE;

    return do_sys_open(dfd, filename, flags, mode);
}

引入openat(及其它at结尾的函数)有以下两个原因:

First, openat() allows an application to avoid race conditions that could occur when using open(2) to open files in directories other than the current working directory. These race conditions result from the fact that some component of the directory prefix given to open(2) could be changed in parallel with the call to open(2). Such races can be avoided by opening a file descriptor for the target directory, and then specifying that file descriptor as the dirfd argument of openat().

Second, openat() allows the implementation of a per-thread “current working directory”, via file descriptor(s) maintained by the application. (This functionality can also be obtained by tricks based on the use of /proc/self/fd/dirfd, but less efficiently.)

参考资料:
openat(2) – Linux man page
The Linux programming interface

 

Linux kernel 笔记 (42)——container_of

container_of定义在<linux/kernel.h>中:

/**
 * container_of - cast a member of a structure out to the containing structure
 * @ptr:    the pointer to the member.
 * @type:   the type of the container struct this is embedded in.
 * @member: the name of the member within the struct.
 *
 */
#define container_of(ptr, type, member) ({          \
    const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
    (type *)( (char *)__mptr - offsetof(type,member) );})

它的功能是通过一个结构体成员的地址,得到结构体的地址。举例如下:

struct st_A
{
        int member_b;
        int member_c;
};

struct st_A a;

container_of(&(a.member_c), struct st_A, member_c)会得到变量a的地址,也就是&a的值。

 

*NIX & Hacking —— 第9期

做一本我感兴趣的杂志,就这么简单!

Assembler

Assembler relaxation

GDB

GDB dashboard

Go

Best practices for a new Go developer
On Go, Portability, and System Interfaces

Kernel

A Tour of Bootloading
GRUB 2 bootloader – Full tutorial
How I ended up writing new real-time kernel
Kernel bypass
Linux Kernel Crash Book

Network

TCP in 30 instructions

RMS

Interviews: RMS Answers Your Questions

Rust

Why Rust?

Tracing

Dynamic Tracing with DTrace & SystemTap

Linux kernel 笔记 (21)——per-CPU变量

per-CPU变量顾名思义,即当你声明一个per-CPU变量时,当前系统上的每个CPU都会有一份当前变量的copy。使用per-CPU变量好处是访问它几乎不需要加锁,因为每个CPU都有一份copy。此外,CPU可以把这个变量放在自己的cache里,访问起来会特别快。定义per-CPU变量方法如下:

DEFINE_PER_CPU(type, name);  

如果per-CPU变量是数组,则定义方式如下:

DEFINE_PER_CPU(type[length], array); 

per-CPU变量可以导出,供其它模块使用:

EXPORT_PER_CPU_SYMBOL(per_cpu_var);
EXPORT_PER_CPU_SYMBOL_GPL(per_cpu_var);

要在其它模块使用per-CPU变量,则需要声明:

DECLARE_PER_CPU(type, name);

访问per-CPU变量可以使用get_cpu_var(var)和put_cpu_var(var)这两个macro:

/* <linux/percpu.h>*/

/*
 * Must be an lvalue. Since @var must be a simple identifier,
 * we force a syntax error here if it isn't.
 */
#define get_cpu_var(var) (*({               \
    preempt_disable();              \
    &__get_cpu_var(var); }))

/*
 * The weird & is necessary because sparse considers (void)(var) to be
 * a direct dereference of percpu variable (var).
 */
#define put_cpu_var(var) do {               \
    (void)&(var);                   \
    preempt_enable();               \
} while (0)

因为kernel线程是允许preemption的,所以在get_cpu_var中需要调用preempt_disable,并且要和put_cpu_var配对使用。

访问另一个CPU的per-CPU变量:

per_cpu(variable, int cpu_id);

参考资料:
Driver porting: per-CPU variables;
Per-CPU Variables