Vladislav Shpilevoy PRO
Database C developer at Tarantool. Backend C++ developer at VirtualMinds.
Lecture 9:
Advanced IO. Non-blocking IO operations. File locking. Multiplexed IO: select, poll, kqueue, epoll.
System programming
Version: 3
Anonymous
int
pipe2(int pipefd[2], int flags);
void *
mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
Named, XSI standard
int
semget(key_t key, int nsems, int semflg);
int
msgget(key_t key, int msgflg);
int
shmget(key_t key, size_t size, int shmflg);
Named, POSIX standard
sem_t *
sem_open(const char *name, int oflag, mode_t mode, unsigned int value);
int
mkfifo(const char *pathname, mode_t mode);
int
socket(int domain, int type, int protocol);
int
bind(int sockfd, const struct sockaddr *addr,
socklen_t addrlen);
int
listen(int sockfd, int backlog);
int
connect(int sockfd, const struct sockaddr *addr,
socklen_t addrlen);
int
accept(int sockfd, struct sockaddr *addr,
socklen_t *addrlen);
Usage
int fd = socket();
connect(fd, remote_addr);
/* Ready to read/write fd. */
Connection to a named socket
Creation of a named socket without connection
int fd = socket();
bind(fd, addr);
/** Ready to read/write fd. */
Creation of a named socket with connection
int fd = socket();
bind(fd, addr);
listen(fd);
while(1) {
int remote_fd = accept(fd);
/*
* Ready to read/write
* remote_fd.
*/
}
Connect() creates a paired socket on the server side, and this pair can interact just like socketpair()
int fd2 = socket();
bind(fd2, addr2);
/** Ready to read/write fd2. */
read/write
send/recv
sendto/recvfrom
Only packet sockets work without connect(), and destination address should be specified for each packet manually
Server
Client
Client
Client
Client
New client
How to work with multiple clients?
int new_client_fd = accept(server_fd);
if (fork() == 0) {
/* Read/write this client. */
...
return 0;
}
/*
* Server continues accepting new
* clients.
*/
Allocate a process for each?
Allocate a thread for each?
int new_client_fd = accept(server_fd);
pthread_t tid;
/*
* Put new client into a separate
* thread.
*/
pthread_create(&tid, NULL,
work_with_client_f,
new_client_fd);
/*
* Server continues accepting new
* clients.
*/
Why can't read from every client right away?
accept(), read(), write() block the thread until a new client/data appears. The other clients won't be served during that time.
int *client_fds = NULL;
int client_count = 0;
while (1) {
int new_cli_fd = accept(server_fd);
add_new_client(&client_fds, &client_count, new_cli_fd);
for (int i = 0; i < client_count; ++i)
interact_with_client(client_fds[i]);
}
1 point
O_NONBLOCK
int fd = open(file_name, flags | O_NONBLOCK);
int old_flags = fcntl(fd, F_GETFL);
fcntl(fd, F_SETFL, old_flags | O_NONBLOCK);
ssize_t rc = read/write/accept/send/recv(fd, ...);
if (rc == -1 && (errno == EAGAIN || errno == EWOULDBLOCK))
/* No data to read, or space to write. */
How to set the flag?
How to use?
or
Good code should check for both errors
/**
 * Switch a file descriptor into non-blocking mode.
 *
 * Reads the current file status flags and writes them back with
 * O_NONBLOCK added, so the other flags are preserved. If the flags
 * can't be read, the descriptor is left untouched and errno keeps
 * the reason set by fcntl().
 *
 * @param fd Descriptor to make non-blocking.
 */
void
make_fd_nonblocking(int fd)
{
	int old_flags = fcntl(fd, F_GETFL);
	/*
	 * F_GETFL returns -1 on failure. OR-ing O_NONBLOCK into -1
	 * would hand an all-ones flag mask to F_SETFL.
	 */
	if (old_flags == -1)
		return;
	fcntl(fd, F_SETFL, old_flags | O_NONBLOCK);
}
/**
 * Demo: the standard input is made non-blocking, so scanf() and
 * read() fail with EAGAIN/EWOULDBLOCK when there is no data yet.
 * After showing both failures the program polls scanf() in a loop
 * until a number is read or a real error/EOF/bad input happens.
 */
int
main(int argc, const char **argv)
{
	(void) argc;
	(void) argv;
	make_fd_nonblocking(STDIN_FILENO);
	int value = 0;
	/*
	 * errno is reset before each call and saved right after it:
	 * a successful call does not clear errno, and the printf()
	 * in between is allowed to change it.
	 */
	errno = 0;
	int rc = scanf("%d", &value);
	int saved_errno = errno;
	printf("scanf rc = %d\n", rc);
	printf("scanf error = %s\n", strerror(saved_errno));

	errno = 0;
	rc = (int) read(STDIN_FILENO, &value, sizeof(value));
	saved_errno = errno;
	printf("read rc = %d\n", rc);
	printf("read error = %s\n", strerror(saved_errno));

	while (1) {
		/*
		 * The previous failed attempt leaves the stream error
		 * indicator set. Drop it before retrying.
		 */
		clearerr(stdin);
		/*
		 * Without the reset a stale EAGAIN from the previous
		 * iteration would keep the loop spinning forever on
		 * EOF or on non-numeric input.
		 */
		errno = 0;
		rc = scanf("%d", &value);
		if (rc > 0 || (errno != EAGAIN &&
			       errno != EWOULDBLOCK))
			break;
	}
	printf("value = %d\n", value);
	return 0;
}
The function makes any descriptor non-blocking
Even the standard input can be made non-blocking
Then scanf() and read() return an error, when no data
Read in a loop until success
$> gcc 1_nonblock.c
$> ./a.out
scanf rc = -1
scanf error = Resource
temporarily unavailable
read rc = -1
read error = Resource
temporarily unavailable
...
100
value = 100
#define LOCK_SH 1 /* Shared lock. */
#define LOCK_EX 2 /* Exclusive lock. */
#define LOCK_NB 4 /* Don't block when locking. */
#define LOCK_UN 8 /* Unlock. */
int
flock(int fd, int operation);
/* Shared lock. */
flock(fd, LOCK_SH);
/* Exclusive lock. */
flock(fd, LOCK_EX);
/* Do not block on flock(). */
flock(fd, LOCK_SH | LOCK_NB);
flock(fd, LOCK_EX | LOCK_NB);
/* Unlock. */
flock(fd, LOCK_UN);
Lock a whole file
/**
 * Interactive flock() demo. Opens (or creates) tmp.txt and executes
 * commands read from the standard input, one word each:
 * excl, exclnb, shared, sharednb, unlock, write.
 * After each command prints "ok" or the error text.
 */
int main()
{
	/* Open or create a file. */
	int fd = open("tmp.txt", O_CREAT | O_RDWR, S_IRWXU);
	if (fd == -1) {
		printf("error = %s\n", strerror(errno));
		return -1;
	}
	char cmd[100];
	/* The field width keeps a long word from overflowing cmd. */
	while (scanf("%99s", cmd) > 0) {
		int rc;
		/* Install or drop a lock. */
		if (strcmp(cmd, "excl") == 0) {
			rc = flock(fd, LOCK_EX);
		} else if (strcmp(cmd, "exclnb") == 0) {
			rc = flock(fd, LOCK_EX | LOCK_NB);
		} else if (strcmp(cmd, "shared") == 0) {
			rc = flock(fd, LOCK_SH);
		} else if (strcmp(cmd, "sharednb") == 0) {
			rc = flock(fd, LOCK_SH | LOCK_NB);
		} else if (strcmp(cmd, "unlock") == 0) {
			rc = flock(fd, LOCK_UN);
		} else if (strcmp(cmd, "write") == 0) {
			rc = (int) write(fd, "data", 4);
		} else {
			printf("unknown command\n");
			continue;
		}
		if (rc == -1)
			printf("error = %s\n", strerror(errno));
		else
			printf("ok\n");
	}
	close(fd);
	return 0;
}
Open or create a file
Install or drop a lock
$> gcc 2_flock.c
$> ./a.out
$> ./a.out
excl
ok
excl
...
unlock
ok
ok
unlock
ok
shared
ok
shared
ok
exclnb
error = Resource temporarily unavailable
unlock
ok
exclnb
ok
sharednb
error = Resource temporarily unavailable
^D
$>
sharednb
ok
$> ./a.out
write
ok
int
flock(int fd, int operation);
Shortcomings:
/* cmd = ... */
#define F_GETLK /* Check if lock exists. */
#define F_SETLK /* Do nonblocking lock. */
#define F_SETLKW /* Do blocking lock. */
int
fcntl(int fd, int cmd, struct flock *lock_def);
struct flock {
off_t l_start;
off_t l_len;
pid_t l_pid;
short l_type;
short l_whence;
};
l_start, l_len
- begin and length of a range
l_whence
- from where to count the range? From the beginning, end, current position of fd descriptor?
l_type
- take read lock (shared), write lock (exclusive), unlock
l_pid
- PID of a process keeping the lock, if any
File
0
100
lock.l_start = 60;
lock.l_whence = SEEK_SET;
lock.l_start = 20;
lock.l_whence = SEEK_CUR;
lock.l_start = -40;
lock.l_whence = SEEK_END;
struct flock lock;
lock.l_len = 30;
lock.l_type = F_WRLCK;
fcntl(fd, F_SETLK, &lock);
start
0
100
fd
start
0
start
60
20
-40
100
40
File
100
struct flock lock;
lock.l_len = 30;
lock.l_type = F_WRLCK;
lock.l_start = 60;
lock.l_whence = SEEK_SET;
fcntl(fd, F_SETLK, &lock);
struct flock lock;
lock.l_len = 20;
lock.l_type = F_WRLCK;
lock.l_start = 10;
lock.l_whence = SEEK_SET;
fcntl(fd, F_SETLK, &lock);
struct flock lock;
lock.l_len = 20;
lock.l_type = F_WRLCK;
lock.l_start = 40;
lock.l_whence = SEEK_SET;
fcntl(fd, F_SETLK, &lock);
struct flock lock;
lock.l_len = 20;
lock.l_type = F_UNLCK;
lock.l_start = 50;
lock.l_whence = SEEK_SET;
fcntl(fd, F_SETLK, &lock);
60
90
10
30
40
50
70
/**
 * Translate a one-letter code into an lseek-style origin:
 * 's' - SEEK_SET, 'e' - SEEK_END, anything else - SEEK_CUR.
 */
int char_to_whence(char whence)
{
	switch (whence) {
	case 's':
		return SEEK_SET;
	case 'e':
		return SEEK_END;
	default:
		return SEEK_CUR;
	}
}
/**
 * Translate a one-letter code into an fcntl() lock type:
 * 'r' - F_RDLCK, 'w' - F_WRLCK, anything else - F_UNLCK.
 */
int char_to_type(char type)
{
	switch (type) {
	case 'r':
		return F_RDLCK;
	case 'w':
		return F_WRLCK;
	default:
		return F_UNLCK;
	}
}
/**
 * Lock or unlock a range of a file with fcntl().
 *
 * @param fd File descriptor to lock.
 * @param block True - wait for the lock (F_SETLKW), false - fail
 *        right away if it can't be taken (F_SETLK).
 * @param cmd Arguments in the form "<type> <whence> <start> <len>",
 *        where type is r/w/u and whence is s/e/c.
 *
 * @retval -1 Error. errno is EINVAL when cmd is missing or can't be
 *         parsed, otherwise it is set by fcntl().
 * @retval other The fcntl() result.
 */
int do_lock(int fd, bool block, char *cmd)
{
	char whence, type;
	int start, len;
	/*
	 * All 4 fields must be parsed, otherwise the variables below
	 * would be used uninitialized. cmd is NULL when the command
	 * line had no arguments at all.
	 */
	if (cmd == NULL ||
	    sscanf(cmd, "%c %c %d %d", &type, &whence, &start, &len) != 4) {
		errno = EINVAL;
		return -1;
	}
	printf("type = %c, whence = %c, start = %d, len = %d\n", type, whence,
	       start, len);
	struct flock fl = {0};
	fl.l_type = char_to_type(type);
	fl.l_whence = char_to_whence(whence);
	fl.l_start = start;
	fl.l_len = len;
	return fcntl(fd, block ? F_SETLKW : F_SETLK, &fl);
}
The code reads lines:
<cmd> <type> <whence> <start> <len>
whence - s (set), e (end), c (cur)
type - r (read lock), w (write lock), u (unlock)
cmd - lock, lockb, unlock, getlock
Implementation of locking/unlocking commands
/**
 * Check if a range of a file is locked, and print who holds the
 * lock.
 *
 * @param fd File descriptor to check.
 * @param cmd Range to check in the form
 *        "<type> <whence> <start> <len>", where type is r/w/u and
 *        whence is s/e/c.
 *
 * @retval 0 Success, the result is printed.
 * @retval -1 Error. errno is EINVAL when cmd is missing or can't be
 *         parsed, otherwise it is set by fcntl().
 */
int
get_lock(int fd, char *cmd)
{
	char whence, type;
	int start, len;
	/*
	 * All 4 fields must be parsed, otherwise the variables below
	 * would be used uninitialized. cmd is NULL when the command
	 * line had no arguments at all.
	 */
	if (cmd == NULL ||
	    sscanf(cmd, "%c %c %d %d", &type, &whence, &start, &len) != 4) {
		errno = EINVAL;
		return -1;
	}
	printf("type = %c, whence = %c, start = %d, len = %d\n", type, whence,
	       start, len);
	/* Choose which range to check. */
	struct flock fl = {0};
	fl.l_type = char_to_type(type);
	fl.l_whence = char_to_whence(whence);
	fl.l_start = start;
	fl.l_len = len;
	/*
	 * F_GETLK rewrites fl: l_type becomes F_UNLCK if the range is
	 * free, otherwise l_pid is the process keeping the lock.
	 */
	if (fcntl(fd, F_GETLK, &fl) == -1)
		return -1;
	if (fl.l_type == F_UNLCK)
		printf("no lock on this region\n");
	else
		printf("process %d holds the lock\n", (int)fl.l_pid);
	return 0;
}
Check if the range has a lock
Choose which range to check
Use fcntl with F_GETLK - the command will store into fl.l_pid pid of the process keeping the lock
If the range is free - fl.l_type will store F_UNLCK
Otherwise in fl.l_pid there is a valid pid
/**
 * Interactive fcntl() range-locks demo. Opens (or creates) tmp.txt
 * and executes the lines read from the standard input:
 * write, lock, lockb, getlock followed by their arguments.
 * After each command prints "ok" or the error text.
 */
int main()
{
	printf("my pid = %d\n", (int) getpid());
	int fd = open("tmp.txt", O_CREAT | O_RDWR, S_IRWXU);
	if (fd == -1) {
		printf("error = %s\n", strerror(errno));
		return -1;
	}
	char *buf = NULL;
	size_t size = 0;
	ssize_t len;
	while ((len = getline(&buf, &size, stdin)) > 0) {
		/* The last line of the input may have no newline. */
		if (buf[len - 1] == '\n')
			buf[len - 1] = 0;
		char *line = buf;
		int rc;
		/*
		 * After strsep() 'line' points at the arguments, or is
		 * NULL when the command has none.
		 */
		char *cmd = strsep(&line, " \n");
		if (strcmp(cmd, "write") == 0) {
			/* Fill the file with something. */
			rc = write_symbols(fd, line);
		} else if (strcmp(cmd, "lock") == 0) {
			rc = do_lock(fd, false, line);
		} else if (strcmp(cmd, "lockb") == 0) {
			rc = do_lock(fd, true, line);
		} else if (strcmp(cmd, "getlock") == 0) {
			rc = get_lock(fd, line);
		} else {
			/* Otherwise 'rc' would be read uninitialized. */
			printf("unknown command\n");
			continue;
		}
		if (rc == -1)
			printf("error = %s\n", strerror(errno));
		else
			printf("ok\n");
	}
	/* getline() allocates the buffer, the caller frees it. */
	free(buf);
	close(fd);
	return 0;
}
Command line implementation
Write command to fill a file with something
Lock/unlock
$> gcc 2_flock.c
$> ./a.out
my pid = 95231
$> ./a.out
my pid = 95229
write 100 a
ok
0
100
lock w(rite) s(et) 10 20
ok
lock r(ead) s 40 20
ok
- write lock
- read lock
lock r s 50 30
ok
lock w s 20 30
error = Resource temporarily unavailable
lockb w s 20 30
...
lock u(nlock) s 20 10
ok
lock u(nlock) s 40 10
ok
ok
lock w s 100 0
ok
write 10 a
ok
110
lock w s 101 9
error = Resource temporarily unavailable
^C
$>
lock w s 100 10
ok
10
20
30
40
50
60
80
#define F_ULOCK /* Unlock locked sections. */
#define F_LOCK /* Lock a section for exclusive use. */
#define F_TLOCK /* Test and lock a section for exclusive use. */
#define F_TEST /* Test a section for locks by other processes. */
int
lockf(int fildes, int function, off_t size);
/*
 * lockf() expressed via fcntl() range locks. The range starts at the
 * current position of the descriptor and is 'size' bytes long.
 */
struct flock fl;
fl.l_start = 0;
fl.l_len = size;
fl.l_whence = SEEK_CUR;
if (function == F_TEST) {
	/* Ask if an exclusive lock could be taken on the range. */
	fl.l_type = F_WRLCK;
	if (fcntl(fd, F_GETLK, &fl) != 0)
		return -1;
	if (fl.l_type == F_UNLCK)
		return 0;
	errno = EAGAIN;
	return -1;
}
if (function == F_ULOCK) {
	fl.l_type = F_UNLCK;
	return fcntl(fd, F_SETLK, &fl);
}
fl.l_type = F_WRLCK;
/*
 * F_LOCK waits until the section becomes free - it is the blocking
 * F_SETLKW. F_TLOCK only tries and fails right away if the section
 * is taken - it is the non-blocking F_SETLK.
 */
return fcntl(fd, function == F_LOCK ? F_SETLKW : F_SETLK, &fl);
Possible implementation:
int
lockf(int fildes, int function, off_t size);
int
fcntl(int fd, int cmd, struct flock *lock_def);
int
flock(int fd, int operation);
Affect each other
Are not inherited at fork()
Belong to process, not to thread. Can take the same lock in one process multiple times
All of them are advisory
Can those locks be used as inter-process mutex? If yes - how? If no - why?
They can be. 1) use one file as a mutex, 2) use any range in a file as a mutex (first byte, for example)
int
lockf(int fildes, int function, off_t size);
int
fcntl(int fd, int cmd, struct flock *lock_def);
int
flock(int fd, int operation);
1 point
Server
Client
Client
Client
Client
How to understand from which socket it is ok to read?
Events, data
/**
 * Switch a file descriptor into non-blocking mode, keeping its other
 * status flags. If the flags can't be read, the descriptor is left
 * untouched and errno keeps the reason set by fcntl().
 *
 * @param fd Descriptor to make non-blocking.
 */
void make_fd_nonblocking(int fd)
{
	int old_flags = fcntl(fd, F_GETFL);
	/*
	 * F_GETFL returns -1 on failure. OR-ing O_NONBLOCK into -1
	 * would hand an all-ones flag mask to F_SETFL.
	 */
	if (old_flags == -1)
		return;
	fcntl(fd, F_SETFL, old_flags | O_NONBLOCK);
}
/* ... */
/* Make 'accept(server_fd)' non-blocking. */
make_fd_nonblocking(server_fd)
int *client_fds = NULL;
int client_count = 0;
while (1) {
int new_cli_fd = accept(server_fd);
if (new_cli_fd != -1) {
/* Make read/write(new_cli_fd) non-blocking. */
make_fd_nonblocking(new_cli_fd);
add_new_client(&client_fds, &client_count, new_cli_fd);
}
for (int i = 0; i < client_count; ++i)
interact_with_client(client_fds[i]);
}
Check all in a queue? - polling
Additional system calls - waste of CPU time
Latency linearly depends on the number of descriptors
Make all descriptors non blocking
Loop infinitely and check for EAGAIN, EWOULDBLOCK
Sockets
Files
Terminals
Futexes
void FD_CLR(fd, fd_set *fdset);
void FD_COPY(fd_set *fdset_orig, fd_set *fdset_copy);
int FD_ISSET(fd, fd_set *fdset);
void FD_SET(fd, fd_set *fdset);
void FD_ZERO(fd_set *fdset);
int select(int nfds, fd_set *readfds, fd_set *writefds,
fd_set *errorfds, struct timeval *timeout);
3 sets of file descriptors: for reading, writing, exceptions
Functions to work with one fd_set - file descriptor set
Value of the maximal descriptor in all 3 sets, plus 1
Select() blocks until an event happens on any descriptor
Returns when an event happens, and removes all descriptors from the fd_sets except those having events
fd_set set;
FD_ZERO(&set);
FD_SET(file_desc1, &set);
FD_SET(file_desc2, &set);
select(MAX(file_desc1, file_desc2) + 1, &set,
NULL, NULL, NULL);
if (FD_ISSET(file_desc1, &set)) {
/* file_desc1 has data to read. */
read(file_desc1, buffer, buf_size);
/* ... */
}
if (FD_ISSET(file_desc2, &set)) {
/* ... */
}
/**
 * Normal client. Connects to 127.0.0.1:12345, sends each number read
 * from the standard input and prints the number received back.
 * Stops on the end of input or when the server closes the connection.
 */
int
main(int argc, const char **argv)
{
	(void) argc;
	(void) argv;
	int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	if (sock == -1) {
		printf("error = %s\n", strerror(errno));
		return -1;
	}
	struct sockaddr_in addr;
	/* Do not pass garbage in the unused fields of the address. */
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(12345);
	inet_aton("127.0.0.1", &addr.sin_addr);
	/* Without a connection each send() below would just fail. */
	if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) == -1) {
		printf("error = %s\n", strerror(errno));
		close(sock);
		return -1;
	}
	int number;
	while (scanf("%d", &number) > 0) {
		if (send(sock, &number, sizeof(number), 0) == -1) {
			printf("error = %s\n", strerror(errno));
			continue;
		}
		printf("Sent %d\n", number);
		number = 0;
		ssize_t rc = recv(sock, &number, sizeof(number), 0);
		if (rc == 0) {
			printf("Closed connection\n");
			break;
		}
		if (rc == -1)
			printf("error = %s\n", strerror(errno));
		else
			printf("Received %d\n", number);
	}
	close(sock);
	return 0;
}
Normal client. Sends numbers, receives them incremented
/**
 * Rebuild a descriptor set from scratch: the server socket plus all
 * the client sockets.
 *
 * @param set Set to reset and fill.
 * @param clients Array of client descriptors.
 * @param client_count Number of items in clients.
 * @param server Server descriptor.
 *
 * @return The biggest descriptor put into the set.
 */
int fill_fdset(fd_set *set, int *clients, int client_count, int server)
{
	int highest = server;
	int idx = 0;

	FD_ZERO(set);
	FD_SET(server, set);
	while (idx < client_count) {
		int fd = clients[idx++];
		FD_SET(fd, set);
		highest = fd > highest ? fd : highest;
	}
	return highest;
}
/**
 * TCP server on 127.0.0.1:12345 which serves all its clients in one
 * thread using select(). Prints "Timeout" each 2 seconds of silence.
 */
int main(int argc, const char **argv)
{
	(void) argc;
	(void) argv;
	/* Creation of a server, listen on a port. */
	int server = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	if (server == -1) {
		printf("error = %s\n", strerror(errno));
		return -1;
	}
	struct sockaddr_in addr;
	/* Do not pass garbage in the unused fields of the address. */
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(12345);
	inet_aton("127.0.0.1", &addr.sin_addr);
	if (bind(server, (struct sockaddr *) &addr, sizeof(addr)) == -1 ||
	    listen(server, 128) == -1) {
		printf("error = %s\n", strerror(errno));
		close(server);
		return -1;
	}
	int client_count = 0, *clients = NULL;
	fd_set readset;
	while(1) {
		/*
		 * Fill readset before each select(). The timeout is
		 * refilled each time too.
		 */
		int max_fd = fill_fdset(&readset, clients, client_count, server);
		struct timeval timeval;
		timeval.tv_sec = 2;
		timeval.tv_usec = 0;
		/*
		 * Select() returns number of descriptors having
		 * events. 0 = timeout, -1 = error.
		 */
		int nfds = select(max_fd + 1, &readset, NULL, NULL, &timeval);
		if (nfds == -1) {
			/* The readset content can't be trusted now. */
			printf("error = %s\n", strerror(errno));
			break;
		}
		if (nfds == 0) {
			printf("Timeout\n");
			continue;
		}
		/*
		 * First check the server socket - it is special. A new
		 * client is signaled as a 'read' event.
		 */
		if (FD_ISSET(server, &readset)) {
			nfds--;
			int client_sock = accept(server, NULL, NULL);
			if (client_sock == -1) {
				printf("error = %s\n", strerror(errno));
			} else {
				/*
				 * Keep the old array if realloc() fails,
				 * not to lose the existing clients.
				 */
				int *new_clients = realloc(clients,
					(client_count + 1) * sizeof(int));
				if (new_clients == NULL) {
					printf("error = %s\n", strerror(errno));
					close(client_sock);
				} else {
					printf("New client\n");
					clients = new_clients;
					clients[client_count++] = client_sock;
				}
			}
		}
		/*
		 * Check each client. Result of select is a hint
		 * allowing not to scan the whole array.
		 */
		for (int i = 0; i < client_count && nfds > 0; ++i) {
			if (! FD_ISSET(clients[i], &readset))
				continue;
			nfds--;
			printf("Interact with fd %d\n", clients[i]);
			int rc = interact(clients[i]);
			if (rc == 0) {
				printf("Client disconnected\n");
				/*
				 * NOTE(review): remove_client() is not
				 * shown here. If it moves another client
				 * into slot i, that client is skipped
				 * until the next select() - confirm.
				 */
				remove_client(&clients, &client_count, i);
			}
		}
	}
	close(server);
	for (int i = 0; i < client_count; ++i)
		close(clients[i]);
	free(clients);
	return 0;
}
Select() returns number of descriptors having events. 0 = timeout
First check the server socket - it is special. A new client is signaled as a 'read' event
Check each client
Result of select is a hint allowing not to scan the whole file sets
$> gcc 4_server_select.c -o server
$> ./server
$> gcc 4_client.c -o client
$> ./client
$> ./client
Timeout
Timeout
New client
New client
100
Sent 100
Received 101
Interact with fd 4
Received 100
Sent 101
200
Sent 200
Received 201
Interact with fd 5
Received 200
Sent 201
^C
$>
Interact with fd 5
Client disconnected
^C
$>
Interact with fd 4
Client disconnected
int
poll(struct pollfd fds[], nfds_t nfds, int timeout);
struct pollfd {
int fd; /* File descriptor. */
short events; /* Events to look for. */
short revents; /* Events returned. */
};
#define POLLERR /* Wait for exceptions. */
#define POLLIN /* Wait for ability to read. */
#define POLLOUT /* Wait for ability to write. */
/* POLLHUP, POLLNVAL, POLLPRI, POLLRDBAND, POLLRDNORM,
POLLWRBAND, POLLWRNORM */
struct pollfd fds[2];
fds[0].fd = file_desc1;
fds[0].events = POLLIN;
fds[1].fd = file_desc2;
fds[1].events = POLLIN | POLLOUT;
poll(fds, 2, 2000);
/*
 * An event is tested with '&'. With '|' the result would be
 * non-zero always, regardless of what is stored in revents.
 */
if (fds[0].revents & POLLIN) {
	/* Can safely read from file_desc1. */
}
if (fds[1].revents & POLLIN) {
	/* Can safely read from file_desc2. */
}
if (fds[1].revents & POLLOUT) {
	/* Can safely write to file_desc2. */
}
In each pollfd a file descriptor is specified and needed events
After return, poll() stores here the happened events
struct pollfd *fds = malloc(sizeof(fds[0]));
int fd_count = 1;
fds[0].fd = server;
fds[0].events = POLLIN;
while(1) {
int nfds = poll(fds, fd_count, 2000);
if (nfds == 0) {
printf("Timeout\n");
continue;
}
if ((fds[0].revents & POLLIN) != 0) {
int client_sock = accept(server, NULL, NULL);
printf("New client\n");
fd_count++;
fds = realloc(fds, fd_count * sizeof(fds[0]));
fds[fd_count - 1].fd = client_sock;
fds[fd_count - 1].events = POLLIN;
nfds--;
}
for (int i = 1; i < fd_count && nfds > 0; ++i) {
if ((fds[i].revents & POLLIN) == 0)
continue;
nfds--;
printf("Interact with fd %d\n", fds[i].fd);
if (interact(fds[i].fd) == 0) {
printf("Client disconnected\n");
remove_client(&fds, &fd_count, i);
}
}
}
Instead of an int array now it is a pollfd array
Poll, like select, returns number of descriptors having events
Addition via putting a new descriptor into the pollfd array
Check which clients have events
int
kqueue(void);
int
kevent(int kq,
const struct kevent *changelist, int nchanges,
struct kevent *eventlist, int nevents,
const struct timespec *timeout);
struct kevent {
uintptr_t ident; /* Identifier for this event. */
int16_t filter; /* Filter for event. */
uint16_t flags; /* General flags. */
uint32_t fflags; /* Filter-specific flags. */
intptr_t data; /* Filter-specific data. */
void *udata; /* Opaque user data identifier. */
};
EV_SET(&kev, ident, filter, flags, fflags, data, udata);
Kernel Events Queue
Create a queue, access it via a "file descriptor"
Manage the queue and extract events
Changes such as "track new event", "stop tracking other event"
Happened events are stored here
kevent can track many system events. In case of IO it is a file descriptor
What to track? Read, write, exceptions, close ... ?
Action - delete, add, change an event
Attach any data to the event. To get it back when the event happens
int kq = kqueue();
Create the queue
struct kevent new_ev;
EV_SET(&new_ev, fd, EVFILT_READ/WRITE/..., EV_ADD, 0, 0, 0);
kevent(kq, &new_ev, 1, 0, 0, NULL);
Track a new event on the descriptor fd. Each event (read, write) needs an own kevent
Delete an event from tracking
struct kevent old_ev;
EV_SET(&old_ev, fd, EVFILT_READ/WRITE/..., EV_DELETE, 0, 0, 0);
kevent(kq, &old_ev, 1, 0, 0, NULL);
Fetch happened events
struct kevent happened_ev;
kevent(kq, NULL, 0, &happened_ev, 1, NULL);
/*
 * The filter field holds a single EVFILT_* value, it is not a bit
 * mask. So it is compared with '=='. A bitwise OR with an EVFILT_*
 * constant would be non-zero for any event.
 */
if (happened_ev.filter == EVFILT_READ) {
	/* Can safely read from happened_ev.ident. */
}
if (happened_ev.filter == EVFILT_WRITE) {
	/* Can safely write to happened_ev.ident. */
}
/* Create the queue. */
int kq = kqueue();
if (kq == -1) {
	printf("error = %s\n", strerror(errno));
	close(server);
	return -1;
}
/* Track "read" event on the server socket. udata is left empty. */
struct kevent new_ev;
EV_SET(&new_ev, server, EVFILT_READ, EV_ADD, 0, 0, 0);
if (kevent(kq, &new_ev, 1, 0, 0, NULL) == -1) {
	printf("error = %s\n", strerror(errno));
	/* The queue is a descriptor too, it has to be closed. */
	close(kq);
	close(server);
	return -1;
}
/* Head of the doubly-linked list of the accepted clients. */
struct peer *peers = NULL;
struct timespec timeout;
timeout.tv_sec = 2;
timeout.tv_nsec = 0;
while(1) {
int nfds = kevent(kq, NULL, 0, &new_ev, 1, &timeout);
if (nfds == 0) {
printf("Timeout\n");
continue;
}
if (nfds == -1) {
printf("error = %s\n", strerror(errno));
break;
}
No explicit array anymore. The tracked events are stored inside the kernel
Track "read" event on the server socket
Receive the events one by one. But for big client count better fetch the events in batches
Accepted clients are stored in a list, each with its own context.
struct peer {
int fd;
struct peer *next;
struct peer *prev;
};
/*
 * The server was registered with empty udata, the peers - with a
 * pointer at their context. So empty udata means a new client.
 */
if (new_ev.udata == NULL) {
	int peer_sock = accept(server, NULL, NULL);
	if (peer_sock == -1) {
		printf("error = %s\n", strerror(errno));
		break;
	}
	printf("New client\n");
	struct peer *p = malloc(sizeof(*p));
	if (p == NULL) {
		/*
		 * The peer can't be registered with NULL udata - its
		 * events would look like the server's ones.
		 */
		printf("error = %s\n", strerror(errno));
		close(peer_sock);
		break;
	}
	/* Register newly accepted client with udata = peer context. */
	EV_SET(&new_ev, peer_sock, EVFILT_READ, EV_ADD,
	       0, 0, p);
	if (kevent(kq, &new_ev, 1, 0, 0, NULL) == -1) {
		printf("error = %s\n", strerror(errno));
		close(peer_sock);
		free(p);
		break;
	}
	/* Put into the head of the list of peers. */
	p->fd = peer_sock;
	p->next = peers;
	p->prev = NULL;
	if (peers != NULL)
		peers->prev = p;
	peers = p;
	continue;
}
For the server I didn't fill `udata` field. This is how I can tell if the event happened with the server or one of the peers.
Register newly accepted client with `udata` = peer context.
Append to the list of peers. To delete them all when the program exits.
struct peer *p = new_ev.udata;
printf("Interact with fd %d\n", p->fd);
int rc = interact(p);
if (rc == -1) {
printf("error = %s\n", strerror(errno));
if (errno != EWOULDBLOCK && errno != EAGAIN)
break;
continue;
}
if ((new_ev.flags & EV_EOF) == 0)
continue;
printf("Client disconnected\n");
if (p->prev != NULL)
p->prev->next = p->next;
if (p->next != NULL)
p->next->prev = p->prev;
if (p == peers)
peers = p->next;
close(p->fd);
free(p);
}
When udata field is filled, it means this is a peer's event. Which peer? Its pointer is saved in udata by me, when I registered it.
EV_EOF is a special flag which means the socket is closed remotely. When set, I can delete the peer and close its socket.
And delete it from the list of peers.
while (peers != NULL) {
struct peer *next = peers->next;
close(peers->fd);
free(peers);
peers = next;
}
close(kq);
close(server);
return 0;
Destroy everything in the end when the server exits.
int
epoll_create(int size);
int
epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
int
epoll_wait(int epfd, struct epoll_event *events,
int maxevents, int timeout);
typedef union epoll_data {
void *ptr;
int fd;
uint32_t u32;
uint64_t u64;
} epoll_data_t;
struct epoll_event {
uint32_t events; /* Epoll events. */
epoll_data_t data; /* User data variable. */
};
Create event queue
Manage queue: add descriptor, delete, change
Wait for events and fetch them
Each descriptor has a set of tracked events
Arbitrary userdata, like in kevent
int ep = epoll_create(12345);
Create the queue. Its size is ignored now, but is kept in the API since times when it mattered
struct epoll_event new_ev;
new_ev.events = EPOLLIN | EPOLLOUT;
new_ev.data.fd/u32/u64/ptr = my_any_data;
epoll_ctl(ep, EPOLL_CTL_ADD, file_desc, &new_ev);
Add a new descriptor for the needed events tracking
Delete a descriptor from the queue
epoll_ctl(ep, EPOLL_CTL_DEL, file_desc, NULL);
/* Create the queue. */
int ep = epoll_create(1);
if (ep == -1) {
	printf("error = %s\n", strerror(errno));
	close(server);
	return -1;
}
/*
 * Register the server for 'read' events. Its data is NULL - this
 * is how its events are told apart from the peers' ones.
 */
struct epoll_event new_ev;
new_ev.data.ptr = NULL;
new_ev.events = EPOLLIN;
if (epoll_ctl(ep, EPOLL_CTL_ADD, server, &new_ev) == -1) {
	printf("error = %s\n", strerror(errno));
	/* The queue is a descriptor too, it has to be closed. */
	close(ep);
	close(server);
	return -1;
}
/* Head of the doubly-linked list of the accepted clients. */
struct peer *peers = NULL;
while(1) {
int nfds = epoll_wait(ep, &new_ev, 1, 2000);
if (nfds == 0) {
printf("Timeout\n");
continue;
}
if (nfds == -1) {
printf("error = %s\n", strerror(errno));
break;
}
Create the queue
Register the server for 'read' events.
Accepted clients are stored in a list, each with its own context.
struct peer {
int fd;
struct peer *next;
struct peer *prev;
};
Receive the events one by one. But for big client count better fetch the events in batches
/*
 * The server was registered with NULL data, the peers - with a
 * pointer at their context. So NULL data means a new client.
 */
if (new_ev.data.ptr == NULL) {
	int peer_sock = accept(server, NULL, NULL);
	if (peer_sock == -1) {
		printf("error = %s\n", strerror(errno));
		break;
	}
	printf("New client\n");
	struct peer *p = malloc(sizeof(*p));
	if (p == NULL) {
		/*
		 * The peer can't be registered with NULL data - its
		 * events would look like the server's ones.
		 */
		printf("error = %s\n", strerror(errno));
		close(peer_sock);
		break;
	}
	/* Register newly accepted client with data = peer context. */
	new_ev.data.ptr = p;
	new_ev.events = EPOLLIN;
	if (epoll_ctl(ep, EPOLL_CTL_ADD, peer_sock,
		      &new_ev) == -1) {
		printf("error = %s\n", strerror(errno));
		/* The socket is not in the list yet, close it here. */
		close(peer_sock);
		free(p);
		break;
	}
	/* Put into the head of the list of peers. */
	p->fd = peer_sock;
	p->next = peers;
	p->prev = NULL;
	if (peers != NULL)
		peers->prev = p;
	peers = p;
	continue;
}
For the server I set `data` field to NULL. This is how I can tell if the event happened with the server or one of the peers.
Register newly accepted client with `data` = peer context.
Append to the list of peers. To delete them all when the program exits.
struct peer *p = new_ev.data.ptr;
printf("Interact with fd %d\n", (int)p->fd);
int rc = interact(p);
if (rc == -1) {
printf("error = %s\n", strerror(errno));
if (errno != EWOULDBLOCK && errno != EAGAIN)
break;
continue;
}
if (rc != 0)
continue;
printf("Client disconnected\n");
epoll_ctl(ep, EPOLL_CTL_DEL, p->fd, NULL);
if (p->prev != NULL)
p->prev->next = p->next;
if (p->next != NULL)
p->next->prev = p->prev;
if (p == peers)
peers = p->next;
close(p->fd);
free(p);
}
When data field is filled, it means this is a peer's event. Which peer? Its pointer is saved in data by me, when I registered it.
When read/recv returns 0, it means the socket is closed remotely.
And delete it from the list of peers.
while (peers != NULL) {
struct peer *next = peers->next;
close(peers->fd);
free(peers);
peers = next;
}
close(ep);
close(server);
return 0;
Destroy everything in the end when the server exits.
int aio_read(struct aiocb *aiocbp);
int aio_write(struct aiocb *aiocbp);
int aio_error(const struct aiocb *aiocbp);
ssize_t aio_return(struct aiocb *aiocbp);
int aio_suspend(const struct aiocb *const list[],
int nent, const struct timespec *timeout);
int aio_cancel(int fildes, struct aiocb *aiocbp);
int lio_listio(int mode, struct aiocb *const aiocb_list[],
int nitems, struct sigevent *sevp);
struct aiocb {
int aio_fildes; /* File descriptor. */
off_t aio_offset; /* File offset. */
volatile void *aio_buf; /* Location of buffer. */
size_t aio_nbytes; /* Length of transfer. */
int aio_reqprio; /* Request priority. */
struct sigevent aio_sigevent; /* Notification method. */
int aio_lio_opcode; /* lio_listio specific. */
};
char buffer[1024];
struct aiocb cb;
memset(&cb, 0, sizeof(cb));
/* The field is called aio_fildes, see struct aiocb. */
cb.aio_fildes = fd;
cb.aio_offset = lseek(fd, 0, SEEK_CUR);
cb.aio_buf = buffer;
cb.aio_nbytes = sizeof(buffer);
aio_read(&cb);
/* Do some non-related work ... */
/* Busy-wait until the request is finished. */
while (aio_error(&cb) == EINPROGRESS) {};
/* aio_return() returns ssize_t, like read(). */
ssize_t result = aio_return(&cb);
char buffer[1024];
struct aiocb cb;
memset(&cb, 0, sizeof(cb));
cb.aio_fieldes = fd;
cb.aio_offset = lseek(fd, 0, SEEK_CUR);
cb.aio_buf = buffer;
cb.aio_nbytes = sizeof(buffer);
aio_read(&cb);
/* Do some non-related work ... */
aio_suspend(&cb, 1, NULL);
int result = aio_return(&cb);
=
char buffer[1024];
read(fd, buffer, sizeof(buffer));
const int chunk_size = 1024;
char buffer[chunk_size * 3];
struct aiocb cb[3];
memset(cb, 0, sizeof(cb));
for (int i = 0; i < 3; ++i) {
cb[i].aio_fieldes = fd;
cb[i].aio_nbytes = chunk_size;
cb[i].aio_lio_opcode = LIO_READ;
}
cb[1].aio_offset = lseek(fd, 0, SEEK_CUR);
cb[2].aio_offset =
cb[1].aio_offset + chunk_size;
cb[3].aio_offset =
cb[2].aio_offset + chunk_size;
cb[1].aio_buf = buffer;
cb[2].aio_buf = buffer + chunk_size;
cb[3].aio_buf = buffer + chunk_size * 2;
lio_listio(LIO_NOWAIT, cb, 3, NULL);
/* Do some non-related work ... */
aio_suspend(cb, 3, NULL);
int result1 = aio_return(&cb[1]);
int result2 = aio_return(&cb[2]);
int result3 = aio_return(&cb[3]);
const int chunk_size = 1024;
char buffer[chunk_size * 3];
read(fd, buffer, chunk_size);
read(fd, buffer + chunk_size,
chunk_size);
read(fd, buffer + 2 * chunk_size,
chunk_size);
=
Why should not use?
What to use instead?
ssize_t
writev(int fildes, const struct iovec *iov, int iovcnt);
ssize_t
readv(int d, const struct iovec *iov, int iovcnt);
struct iovec {
char *iov_base;
size_t iov_len;
};
char buffer[2][512];
struct iovec vec[2];
vec[0].iov_base = buffer[0];
vec[0].iov_len = sizeof(buffer[0]);
vec[1].iov_base = buffer[1];
vec[1].iov_len = sizeof(buffer[1]);
writev(fd, vec, 2);
=
char buffer[2][512];
write(fd, buffer[0], sizeof(buffer[0]));
write(fd, buffer[1], sizeof(buffer[1]));
Saving on number of system calls; no need to copy everything into a monolithic buffer
Descriptors are blocking by default. The O_NONBLOCK flag makes the operations return EAGAIN/EWOULDBLOCK if they can't be completed right now. This is essential when handling many descriptors in one thread.
Files and dirs can be locked by processes. There are 3 APIs for that. But the locks are "advisory". Nonetheless those locks have their usages in real life.
Descriptor multiplexing is the fundamental basis of performant networking. To process huge number of descriptors in each worker thread efficiently.
There are several multiplexing APIs: select(), poll(), kqueue() (Mac, BSD), epoll() (Linux) (and io_uring (Linux)).
POSIX AIO - asynchronous IO in a thread pool. Never use it.
IO batching - readv(), writev(), sendmmsg(), recvmmsg() - can send many buffers at once. Huge performance boost compared to non-batched operations done in a loop
Chat
Need to implement a game lobby chat. It consists of a chat server and a client. The clients connect to the server and each message is broadcasted to all the clients via this server. So the server is like a game lobby. Everyone reads all messages from all the others and the messages are not persisted anyhow. There is a solution template in .h and .c files.
Points: 15 - 25.
Deadline: 2 weeks.
Penalty: -1 for each day after deadline, max -10
Publish your solution on Github and give me the link. Assessment: any way you want - messengers, calls, emails.
Lectures: slides.com/gerold103/decks/sysprog_eng
Next time:
Users and groups. Login. Real and effective user. Access rights for processes, files. Session. Process daemonization.
Press on the heart, if like the lecture
By Vladislav Shpilevoy
Advanced IO. Non-blocking IO operations. File locks: flock, lockf, fcntl. Multiplexed IO: select, poll, kqueue. Async IO: aio_read/write.
Database C developer at Tarantool. Backend C++ developer at VirtualMinds.