The anatomy of uptime&w commands on OpenBSD

On OpenBSD, uptime and w are actually the same program:

$ ls -lt /usr/bin/uptime /usr/bin/w
-r-xr-xr-x  2 root  bin  18136 May 30 12:53 /usr/bin/uptime
-r-xr-xr-x  2 root  bin  18136 May 30 12:53 /usr/bin/w

and the source code is usr.bin/w/w.c.

Compare the outputs of uptime and w:

$ uptime
10:59AM  up 7 days,  1:51, 1 user, load averages: 0.00, 0.00, 0.00
$ w
10:59AM  up 7 days,  1:51, 1 user, load averages: 0.00, 0.00, 0.00
USER    TTY FROM              [email protected]  IDLE WHAT
root     p0 10.217.242.57     9:10AM     0 w

You can see the uptime just displays the first line of w, and w also shows the login users’ information.

w uses clock_gettime to get system up time:

if (clock_gettime(CLOCK_BOOTTIME, &boottime) != -1) {
    ......
} 

and getloadavg to retrieve system load average int the past 1, 5, and 15 minutes:

int
getloadavg(double loadavg[], int nelem)
{
    ......
    mib[0] = CTL_VM;
    mib[1] = VM_LOADAVG;
    size = sizeof(loadinfo);
    if (sysctl(mib, 2, &loadinfo, &size, NULL, 0) < 0)
        return (-1);
    ......
}

The current user login information is kept in /var/run/utmp, and it is composed of utmp struct:

struct utmp {
    char    ut_line[UT_LINESIZE];
    char    ut_name[UT_NAMESIZE];
    char    ut_host[UT_HOSTSIZE];
    time_t  ut_time;
};

utmp.ut_line is the login terminal (remove “tty” prefix); utmp.ut_name is the login user name; utmp.ut_host is the login address and the utmp.ut_timeis the login time. These are the first 4 columns of every line:

USER    TTY FROM              [email protected]  IDLE WHAT
root     p0 10.217.242.57     9:10AM     0 w

The IDLE column displays how long has passed since you last operates on terminal:

if ((ep->idle = now - stp->st_atime) < 0)
        ep->idle = 0;

and WHAT shows the current process.

Use “cat” to concatenate files

cat is a neat command to concatenate files on Unix (please see this post). Let’s see some examples:

# cat 1.txt
1
# cat 2.txt
2
# cat 1.txt 2.txt > new.txt
# cat new.txt
1
2
# cat 1.txt 2.txt >> new.txt
# cat new.txt
1
2
1
2

Please notice if the output file is also the input file, the input file content will be truncated first:

# cat 1.txt
1
# cat 2.txt
2
# cat 1.txt 2.txt > 1.txt
# cat 1.txt
2
# cat 2.txt
2

So the correct appending file method is using >>:

# cat 1.txt
1
# cat 2.txt
2
# cat 2.txt >> 1.txt
# cat 1.txt
1
2
# cat 2.txt
2

Check the implementation of cat in OpenBSD, and the core parts are iterating files and reading content from them:

(1) Iterate every file (raw_args):

void
raw_args(char **argv)
{
    int fd;

    fd = fileno(stdin);
    filename = "stdin";
    do {
        if (*argv) {
            if (!strcmp(*argv, "-"))
                fd = fileno(stdin);
            else if ((fd = open(*argv, O_RDONLY, 0)) < 0) {
                warn("%s", *argv);
                rval = 1;
                ++argv;
                continue;
            }
            filename = *argv++;
        }
        raw_cat(fd);
        if (fd != fileno(stdin))
            (void)close(fd);
    } while (*argv);
}

You need to pay attention that cat use - to identify stdin.

(2) Read content from every file (raw_cat):

void
raw_cat(int rfd)
{
    int wfd;
    ssize_t nr, nw, off;
    static size_t bsize;
    static char *buf = NULL;
    struct stat sbuf;

    wfd = fileno(stdout);
    if (buf == NULL) {
        if (fstat(wfd, &sbuf))
            err(1, "stdout");
        bsize = MAXIMUM(sbuf.st_blksize, BUFSIZ);
        if ((buf = malloc(bsize)) == NULL)
            err(1, "malloc");
    }
    while ((nr = read(rfd, buf, bsize)) != -1 && nr != 0)
        for (off = 0; nr; nr -= nw, off += nw)
            if ((nw = write(wfd, buf + off, (size_t)nr)) == 0 ||
                 nw == -1)
                err(1, "stdout");
    if (nr < 0) {
        warn("%s", filename);
        rval = 1;
    }
}

a) When reading the first file, the cat command uses fstat to get the st_blksize attribute of stdout which is “optimal blocksize for I/O”, then allocates the memory:

    ......
    if (buf == NULL) {
        if (fstat(wfd, &sbuf))
            err(1, "stdout");
        bsize = MAXIMUM(sbuf.st_blksize, BUFSIZ);
        if ((buf = malloc(bsize)) == NULL)
            err(1, "malloc");
    }
    ......

b) Read the content of file and write it into stdout:

    ......
    while ((nr = read(rfd, buf, bsize)) != -1 && nr != 0)
        for (off = 0; nr; nr -= nw, off += nw)
            if ((nw = write(wfd, buf + off, (size_t)nr)) == 0 ||
                 nw == -1)
                err(1, "stdout");
    ......

When read returns 0, it means reaching the end of file. If write doesn’t return the number you want to write, it is also considered as an error.

Benchmark C++ ifstream and mmap

After reading Which is fastest: read, fread, ifstream or mmap?, I try to benchmark C++ ifstream and mmap myself.

The test file is number.txt, and the size is 4GiB:

# ls -alt number.txt
-rw-r--r-- 1 root root 4294967296 Apr  2 13:51 number.txt

The test_ifstream.cpp is like this:

#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

const std::string FILE_NAME = "number.txt";
const std::string RESULT_FILE_NAME = "result.txt";

char chunk[1048576];

int main(void)
{
    std::ifstream ifs(FILE_NAME, std::ios_base::binary);
    if (!ifs) {
        std::cerr << "Error opeing " << FILE_NAME << std::endl;
        exit(1);
    }
    std::ofstream ofs(RESULT_FILE_NAME, std::ios_base::binary);
    if (!ofs) {
        std::cerr << "Error opeing " << RESULT_FILE_NAME << std::endl;
        exit(1);
    }

    std::vector<std::chrono::milliseconds> duration_vec(5);
    for (std::vector<std::chrono::milliseconds>::size_type i = 0; i < duration_vec.size(); i++) {
        unsigned long long res = 0;
        ifs.seekg(0);
        auto begin = std::chrono::system_clock::now();

        for (size_t j = 0; j < 4096; j++) {
            ifs.read(chunk, sizeof(chunk));
            for (size_t k = 0; k < sizeof(chunk); k++) {
                res += chunk[k];
            }
        }
        ofs << res;

        duration_vec[i] = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - begin);
        std::cout<< duration_vec[i].count() << std::endl;
    }

    std::chrono::milliseconds total_time{0};
    for (auto const& v : duration_vec) {
        total_time += v;
    }
    std::cout << "Average exec time: " << total_time.count() / duration_vec.size() << std::endl;
    return 0;
}

The program reads 1MiB(1024 * 1024 = 1048576) every time (the total count is 4096). Use -O2 optimization:

# clang++ -O2 test_ifstream.cpp -o test_ifstream
# ./test_ifstream
57208
57085
57061
57105
57069
Average exec time: 57105

The average execution time is 57105 ms. From the htop output:

1

We can see test_ifstream occupies very little memory.

The following is test_mmap file:

#include <chrono>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>


const std::string FILE_NAME = "number.txt";
const std::string RESULT_FILE_NAME = "result.txt";

int main(void)
{
    int fd = ::open(FILE_NAME.c_str(), O_RDONLY);
    if (fd < 0) {
        std::cerr << "Error opeing " << FILE_NAME << std::endl;
        exit(1);
    }
    std::ofstream ofs(RESULT_FILE_NAME, std::ios_base::binary);
    if (!ofs) {
        std::cerr << "Error opeing " << RESULT_FILE_NAME << std::endl;
        exit(1);
    }

    auto file_size = lseek(fd, 0, SEEK_END);

    std::vector<std::chrono::milliseconds> duration_vec(5);
    for (std::vector<std::chrono::milliseconds>::size_type i = 0; i < duration_vec.size(); i++) {
        lseek(fd, 0, SEEK_SET);
        unsigned long long res = 0;
        auto begin = std::chrono::system_clock::now();

        char *chunk = reinterpret_cast<char*>(mmap(NULL, file_size, PROT_READ, MAP_FILE | MAP_SHARED, fd, 0));
        char *addr = chunk;

        for (size_t j = 0; j < file_size; j++) {
            res += *chunk++;
        }
        ofs << res;

        munmap(addr, file_size);

        duration_vec[i] = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now() - begin);
        std::cout<< duration_vec[i].count() << std::endl;
    }

    std::chrono::milliseconds total_time{0};
    for (auto const& v : duration_vec) {
        total_time += v;
    }
    std::cout << "Average exec time: " << total_time.count() / duration_vec.size() << std::endl;

    ::close(fd);
    return 0;
}

Still use -O2 optimization:

# clang++ -O2 test_mmap.cpp -o test_mmap
# ./test_mmap
57241
57095
57038
57008
57175
Average read time: 57111

We can see the execution time of test_mmap is similar as test_ifstream, whereas test_mmap uses more memory:

2

Be careful of thread stack size

Today, my colleague came across a thread stack overflow core dump:

Capture

From above diagram, we can see only this function’s stack will occupy ~7 MiB space. Check the stack size configuration on system:

$ ulimit -S -s
8192

just 8 MiB. Double the stack size:

$ ulimit -S -s 16384

The program won’t crash.

Reference:
General: How do I change my default limits for stack size, core file size, etc.?.

 

The anatomy of tee program on OpenBSD

The tee command is used to read content from standard input and displays it not only in standard output but also saves to other files simultaneously. The source code of tee in OpenBSD is very simple, and I want to give it an analysis:

(1) tee leverages Singlely-linked List defined in sys/queue.h to manage outputted files (including standard output):

struct list {
    SLIST_ENTRY(list) next;
    int fd;
    char *name;
};
SLIST_HEAD(, list) head;

......

static void
add(int fd, char *name)
{
    struct list *p;
    ......
    SLIST_INSERT_HEAD(&head, p, next);
}

int
main(int argc, char *argv[])
{
    struct list *p;
    ......
    SLIST_INIT(&head);
    ......
    SLIST_FOREACH(p, &head, next) {
        ......
    }
}

To understand it easily, I extract the macros from sys/queue.h and created a file which utilizes the marcos:

#define SLIST_HEAD(name, type)                      \
struct name {                               \
    struct type *slh_first; /* first element */         \
}

#define SLIST_ENTRY(type)                       \
struct {                                \
    struct type *sle_next;  /* next element */          \
}

#define SLIST_FIRST(head)   ((head)->slh_first)
#define SLIST_END(head)     NULL
#define SLIST_EMPTY(head)   (SLIST_FIRST(head) == SLIST_END(head))
#define SLIST_NEXT(elm, field)  ((elm)->field.sle_next)

#define SLIST_FOREACH(var, head, field)                 \
    for((var) = SLIST_FIRST(head);                  \
        (var) != SLIST_END(head);                   \
        (var) = SLIST_NEXT(var, field))

#define SLIST_INIT(head) {                      \
    SLIST_FIRST(head) = SLIST_END(head);                \
}

#define SLIST_INSERT_HEAD(head, elm, field) do {            \
    (elm)->field.sle_next = (head)->slh_first;          \
    (head)->slh_first = (elm);                  \
} while (0)

struct list {
    SLIST_ENTRY(list) next;
    int fd;
    char *name;
};
SLIST_HEAD(, list) head;

int
main(int argc, char *argv[])
{
    struct list *p;
    SLIST_INIT(&head);

    SLIST_INSERT_HEAD(&head, p, next);
    SLIST_FOREACH(p, &head, next) {

    }
}

Then employed gcc‘s pre-processing function:

# gcc -E slist.c
# 1 "slist.c"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "slist.c"
# 30 "slist.c"
struct list {
 struct { struct list *sle_next; } next;
 int fd;
 char *name;
};
struct { struct list *slh_first; } head;

int
main(int argc, char *argv[])
{
 struct list *p;
 { ((&head)->slh_first) = NULL; };

 do { (p)->next.sle_next = (&head)->slh_first; (&head)->slh_first = (p); } while (0);
 for((p) = ((&head)->slh_first); (p) != NULL; (p) = ((p)->next.sle_next)) {

 }
}

It becomes clear now! The head node in list contains only 1 member: slh_first, which points to the first valid node. For the elements in the list, it is embedded with next struct which uses sle_next to refer to next buddy.

(2) By default, tee will overwrite the output files. If you want to append it, use -a option, and the code is as following:

while (*argv) {
    if ((fd = open(*argv, O_WRONLY | O_CREAT |
        (append ? O_APPEND : O_TRUNC), DEFFILEMODE)) == -1) {
        ......
    } 
    ......
}

(3) The next part is the skeleton of saving content to files:

while ((rval = read(STDIN_FILENO, buf, sizeof(buf))) > 0) {
    SLIST_FOREACH(p, &head, next) {
        n = rval;
        bp = buf;
        do {
            if ((wval = write(p->fd, bp, n)) == -1) {
                ......
            }
            bp += wval;
        } while (n -= wval);
    }
}

We need to iterates every opened file descriptor and write contents into it.

(4) Normally, theinterrupt signal will cause tee exit:

# tee
fdkfkdfjk
fdkfkdfjk
^C
#

To disable this feature, use -i option:

# tee -i
fdhfhd
fdhfhd
^C^C

The corresponding code is like this:

......
case 'i':
    (void)signal(SIGINT, SIG_IGN);
    break;