Linux | 我的站点

Linux操作系统的pstack工具

Solaris操作系统提供了pstack工具，用来打印运行程序的线程堆栈信息。RedHat公司发行的Linux操作系统（RHEL，CentOS等等）也提供了pstack工具，只要安装gdb：

# yum install gdb

就会把pstack也一并安装成功。

首先看一下pstack：

# which pstack
/usr/bin/pstack
# ls -lt /usr/bin/pstack
lrwxrwxrwx. 1 root root 6 Nov 19 06:32 /usr/bin/pstack -> gstack

可以看出pstack实际上只是一个指向了gstack的符号链接。再看一下gstack：

# cat /usr/bin/gstack
#!/bin/sh

if test $# -ne 1; then
    echo "Usage: `basename $0 .sh` <process-id>" 1>&2
    exit 1
fi

if test ! -r /proc/$1; then
    echo "Process $1 not found." 1>&2
    exit 1
fi

# GDB doesn't allow "thread apply all bt" when the process isn't
# threaded; need to peek at the process to determine if that or the
# simpler "bt" should be used.

backtrace="bt"
if test -d /proc/$1/task ; then
    # Newer kernel; has a task/ directory.
    if test `/bin/ls /proc/$1/task | /usr/bin/wc -l` -gt 1 2>/dev/null ; then
    backtrace="thread apply all bt"
    fi
elif test -f /proc/$1/maps ; then
    # Older kernel; go by it loading libpthread.
    if /bin/grep -e libpthread /proc/$1/maps > /dev/null 2>&1 ; then
    backtrace="thread apply all bt"
    fi
fi

GDB=${GDB:-/usr/bin/gdb}

# Run GDB, strip out unwanted noise.
# --readnever is no longer used since .gdb_index is now in use.
$GDB --quiet -nx $GDBARGS /proc/$1/exe $1 <<EOF 2>&1 |
set width 0
set height 0
set pagination no
$backtrace
EOF
/bin/sed -n \
    -e 's/^\((gdb) \)*//' \
    -e '/^#/p' \
    -e '/^Thread/p'

可以看到gstack仅仅是一个shell脚本。简单浏览一下这个脚本：

（1）

if test $# -ne 1; then
    echo "Usage: `basename $0 .sh` <process-id>" 1>&2
    exit 1
fi

脚本要求一个参数：进程ID。

（2）

if test ! -r /proc/$1; then
    echo "Process $1 not found." 1>&2
    exit 1
fi

通过检测/proc目录下进程子目录是否可读，来查看相应进程是否存在。

（3）

# GDB doesn't allow "thread apply all bt" when the process isn't
# threaded; need to peek at the process to determine if that or the
# simpler "bt" should be used.

backtrace="bt"
if test -d /proc/$1/task ; then
    # Newer kernel; has a task/ directory.
    if test `/bin/ls /proc/$1/task | /usr/bin/wc -l` -gt 1 2>/dev/null ; then
    backtrace="thread apply all bt"
    fi
elif test -f /proc/$1/maps ; then
    # Older kernel; go by it loading libpthread.
    if /bin/grep -e libpthread /proc/$1/maps > /dev/null 2>&1 ; then
    backtrace="thread apply all bt"
    fi
fi

如果进程只有一个线程，那么使用gdb的“bt”命令打印线程堆栈信息，否则使用“thread apply all bt”命令。

（4）

GDB=${GDB:-/usr/bin/gdb}

# Run GDB, strip out unwanted noise.
# --readnever is no longer used since .gdb_index is now in use.
$GDB --quiet -nx $GDBARGS /proc/$1/exe $1 <<EOF 2>&1 |
set width 0
set height 0
set pagination no
$backtrace
EOF
/bin/sed -n \
    -e 's/^\((gdb) \)*//' \
    -e '/^#/p' \
    -e '/^Thread/p'

最后调用gdb，执行“bt”或“thread apply all bt”命令，并把输出通过管道传给sed工具，由sed过滤掉无关内容后打印出线程堆栈信息。

最后看一个使用pstack的例子：

# pstack 707
Thread 3 (Thread 0x7f69600d8700 (LWP 713)):
#0  0x00007f6968af269d in poll () at ../sysdeps/unix/syscall-template.S:81
#1  0x00007f6969027a84 in g_main_context_iterate.isra.24 () from /lib64/libglib-2.0.so.0
#2  0x00007f6969027bac in g_main_context_iteration () from /lib64/libglib-2.0.so.0
#3  0x00007f6969027be9 in glib_worker_main () from /lib64/libglib-2.0.so.0
#4  0x00007f696904d4f5 in g_thread_proxy () from /lib64/libglib-2.0.so.0
#5  0x00007f696af9fdc5 in start_thread (arg=0x7f69600d8700) at pthread_create.c:308
#6  0x00007f6968afcced in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
Thread 2 (Thread 0x7f695eec3700 (LWP 716)):
#0  0x00007f6968af269d in poll () at ../sysdeps/unix/syscall-template.S:81
#1  0x00007f6969027a84 in g_main_context_iterate.isra.24 () from /lib64/libglib-2.0.so.0
#2  0x00007f6969027dca in g_main_loop_run () from /lib64/libglib-2.0.so.0
#3  0x00007f6969641336 in gdbus_shared_thread_func () from /lib64/libgio-2.0.so.0
#4  0x00007f696904d4f5 in g_thread_proxy () from /lib64/libglib-2.0.so.0
#5  0x00007f696af9fdc5 in start_thread (arg=0x7f695eec3700) at pthread_create.c:308
#6  0x00007f6968afcced in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:113
Thread 1 (Thread 0x7f696c5738c0 (LWP 707)):
#0  0x00007f6968af269d in poll () at ../sysdeps/unix/syscall-template.S:81
#1  0x00007f6969027a84 in g_main_context_iterate.isra.24 () from /lib64/libglib-2.0.so.0
#2  0x00007f6969027dca in g_main_loop_run () from /lib64/libglib-2.0.so.0
#3  0x0000560a080a80a3 in main ()

如果使用的Linux发行版没有pstack这个工具，可以考虑直接把gstack脚本拷贝过去。

strace命令介绍

strace是Linux上的一个很好用的工具，它可以用来输出程序在运行过程中发生的系统调用以及收到的信号的相关信息，因此在调试和诊断问题时有很大的帮助，特别是在程序没有源码，或是在前期做一些粗略的分析时。strace命令格式如下：

strace [options] command [args]

举个例子：

# strace sleep 300
execve("/usr/bin/sleep", ["sleep", "300"], [/* 24 vars */]) = 0
brk(0)                                  = 0x22fa000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f70d1ef8000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
......
--- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=20243, si_uid=0} ---
......

从上面例子可以看出，对于系统调用，比如open，access，strace都会输出详细的参数和返回值，如果发生了错误，也会输出细致的错误信息。而对于接收到的信号，除了输出信息外，还要注意信号信息的前后都加了“---”，以示与系统调用的区别。

以下是一些常用的选项：
（1）-o：把strace执行结果输出到指定文件里：

# strace -o out ls

（2）-t：打印时间：

# strace -t ls
10:30:07 execve("/usr/bin/ls", ["ls"], [/* 24 vars */]) = 0
10:30:07 brk(0)
......

（3）-e：只关注某一系统调用：

# strace -e open ls
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
open("/lib64/libselinux.so.1", O_RDONLY|O_CLOEXEC) = 3
......

（4）-y：显示和文件描述符关联的文件路径：

# strace -y ls
......
fstat(3</etc/ld.so.cache>, {st_mode=S_IFREG|0644, st_size=32951, ...}) = 0
mmap(NULL, 32951, PROT_READ, MAP_PRIVATE, 3</etc/ld.so.cache>, 0) = 0x7fba3db13000
close(3</etc/ld.so.cache>)              = 0
......

（5）-f：追踪运行进程所生成的子进程。

参考资料：
strace(1) – Linux man page；
A swiss army knife of debugging tools。

Linux kernel 笔记（63）——改变启动的kernel

原文在这里。

列出当前系统中可供启动的kernel（系统为CentOS），列表中的条目从0开始编号：

# egrep ^menuentry /etc/grub2.cfg | cut -f 2 -d \'
CentOS Linux (4.8.3) 7 (Core)
CentOS Linux (3.10.0-327.el7.x86_64) 7 (Core)
CentOS Linux (0-rescue-d07a2009dd34415fa45624985dccbdf6) 7 (Core)

使用grub2-set-default改变启动的kernel：

# grub2-set-default 0

如果仅仅想生效一次，可以使用grub2-reboot命令：

# grub2-reboot 0

inode，“hard link”和“symbolic link”

在*nix文件系统上，每个文件的存储实际可以看成包含两部分：inode和实际存储文件内容的数据块。其中inode存储文件的metadata，包含创建时间，访问权限，等等，当然还有指向文件具体数据块的指针。正是通过这个指针，将inode和数据块关联起来。

要注意，inode中并不保存文件的名字。关于文件名字和inode的映射存储在目录文件中。因此，当访问一个文件时，其实是通过这个文件所在的目录文件访问到这个文件的inode信息，继而进行文件操作的。

接下来，看一下hard link，symbolic link和inode之间的关系。首先创建一个文件和指向这个文件的hard link和symbolic link：

# echo 'Hello, World!' > myfile.txt
# ln myfile.txt my-hard-link
# ln -s myfile.txt my-soft-link

查看这3个文件的inode信息：

# ls -ailt my*
325332 lrwxr-xr-x  1 root  wheel  10 Oct 24 05:26 my-soft-link -> myfile.txt
325331 -rw-r--r--  2 root  wheel  14 Oct 24 05:25 my-hard-link
325331 -rw-r--r--  2 root  wheel  14 Oct 24 05:25 myfile.txt

可以看到myfile.txt和my-hard-link其实对应的是同一个inode节点：325331，而my-soft-link对应的是另一个inode节点：325332。接下来删除myfile.txt，然后分别读取my-hard-link和my-soft-link文件内容：

# rm myfile.txt
# ls -ailt my*
325332 lrwxr-xr-x  1 root  wheel  10 Oct 24 05:26 my-soft-link -> myfile.txt
325331 -rw-r--r--  1 root  wheel  14 Oct 24 05:25 my-hard-link
# cat my-hard-link
Hello, World!
# cat my-soft-link
cat: my-soft-link: No such file or directory

可以看到，因为my-hard-link和myfile.txt对应相同的inode节点：325331，因此删除myfile.txt后，仍然可以通过my-hard-link读取325331这个inode节点所对应的文件内容。而my-soft-link仅仅是指向myfile.txt这个文件名字，因此一旦myfile.txt被删除，也就无法读取文件内容了。

参考资料：
Inodes – an Introduction；
What is the difference between a symbolic link and a hard link?。

Linux下使用vmstat命令获得系统CPU的使用状态

本文是使用vmstat命令监控CPU使用的续文。

在Linux下使用vmstat命令可以得到系统CPU的使用状态：

# vmstat
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 2  0      0 1860352    948 131040    0    0  2433   137  252  897  2  7 90  1  0

其中描述CPU状态的是最后5列：

------cpu-----
us sy id wa st
2  7 90  1  0

要注意，上面数字的含义是百分比。即CPU运行user space程序的时间占2%，运行kernel代码的时间占7%，处于idle状态的时间占90%，以此类推。

各列含义如下：

us（user time）：CPU运行user space代码的时间；
sy（system time）：CPU运行kernel代码的时间，比如执行系统调用；
id（idle time）：CPU处于idle状态的时间；
wa（IO-wait time）：CPU处于idle状态，因为所有正在运行的进程都在等待I/O操作完成，因此当前无可以调度的进程；
st（stolen time）：在虚拟化环境中，当前虚拟机的CPU时间被hypervisor“偷走”（分配给其它虚拟机或用于hypervisor自身）的时间。

参考资料：
The precise meaning of I/O wait time in Linux；
Linux Performance Analysis in 60,000 Milliseconds。

CentOS配置静态IP

在VirtualBox里安装CentOS，配置静态IP：

（1）CentOS 6，修改/etc/sysconfig/network-scripts/ifcfg-eth0文件：

......
ONBOOT=yes
BOOTPROTO=static
IPADDR=192.168.1.9
NETMASK=255.255.255.0
GATEWAY=192.168.1.1

（2）CentOS 7，修改/etc/sysconfig/network-scripts/ifcfg-enp0s3文件：

......
BOOTPROTO="static"
ONBOOT="yes"
IPADDR="192.168.1.5"
NETMASK="255.255.255.0"
GATEWAY="192.168.1.1"
DNS1="192.168.1.1"
DNS2="8.8.8.8"

进程的priority和nice

本文选自Difference between nice value and priority in the top output，以Linux系统为例讲解进程的priority和nice。
（1）

The difference is that PR is a real priority of a process at the moment inside of the kernel and NI is just a hint for the kernel what the priority the process should have.

Priority反映当时进程真正的优先级，而nice则是告诉kernel进程应该获得什么样的优先级。

（2）Nice的值从-20到19，-20表示优先级最高。通常情况下，priority = nice + 20，也就是priority的值为0~39。但是上述理论仅仅适用于调度策略是SCHED_OTHER的进程。此外，kernel也有可能只改变priority的值，而nice的值保持不变，这种情况下上述等式同样不适用。

“Page out”和“swap out”

下文摘自Linux performance and tuning guidelines：

The pages are used mainly for two purposes: page cache and process address space. The page cache is pages mapped to a file on disk. The pages that belong to a process address space (called anonymous memory because it is not mapped to any files, and it has no name) are used for heap and stack. When kswapd reclaims pages, it would rather shrink the page cache than page out (or swap out) the pages owned by processes. A large proportion of page cache that is reclaimed and process address space that is reclaimed might depend on the usage scenario and will affect performance. You can take some control of this behavior by using /proc/sys/vm/swappiness.

Page out and swap out: The phrases “page out” and “swap out” are sometimes confusing. The phrase “page out” means take some pages (a part of entire address space) into swap space while “swap out” means taking entire address space into swap space. They are sometimes used interchangeably.

Unix中的zombie进程和orphan进程

Unix中子进程退出后，如果父进程没有使用wait()函数获得子进程的退出状态，则子进程的相关信息仍然会在系统的进程表里占用一席之地，这时的子进程称之为zombie进程。如果父进程先于子进程退出，这时的子进程称之为orphan进程，而init进程则会变成orphan进程的父进程。init进程会定期处理父进程是init的zombie进程。

参考资料：
Zombie process；
Zombie process vs Orphan process。

Linux kernel 笔记（62）——list_head

双向链表是Linux kernel中常用的数据结构，定义如下：

struct list_head {
    struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
    struct list_head name = LIST_HEAD_INIT(name)

static inline void INIT_LIST_HEAD(struct list_head *list)
{
    list->next = list;
    list->prev = list;
}
...

下图选自plka：

从上图可以看出，定义链表需要一个头结点，通过头结点继而可以完成插入，删除元素等操作。来看一个例子（list.c）：

struct list_head {
        struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)


int main(void) {
        LIST_HEAD(dev_list);
        return 0;
}

检查gcc预处理的输出：

# gcc -E -P list.c
struct list_head {
 struct list_head *next, *prev;
};
int main(void) {
 struct list_head dev_list = { &(dev_list), &(dev_list) };
 return 0;
}

可以看到，头结点dev_list的prev和next都指向了自己。下面代码达到同样的效果：

struct list_head {
    struct list_head *next, *prev;
};

static inline void INIT_LIST_HEAD(struct list_head *list)
{
    list->next = list;
    list->prev = list;
}

int main(void) {
    struct list_head dev_list;
    INIT_LIST_HEAD(&dev_list);
    return 0;
}

2024年4月
一	二	三	四	五	六	日
« 12月
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30