Lecture 6:

Threads. Difference from processes. Atomic operations. Synchronisation. Attributes. Multithreaded processes specifics. Kernel representation.

Version: 3

System programming

Education

Lecture plan

  • Threads concept
  • clone() - threads in Linux
  • Volatile
  • Atomic operations
  • Futex - mutexes in Linux
  • Memory barriers
  • pthread - standard threads

Process [1]

0x0

0xffffffff

.text

.data

.bss

.heap

.stack

.env

kernel

0xc0000000

File descriptors

Signal queue

IPC

Process [2]

Task

.text

.data

.bss

.stack

.heap

.stack

.heap

Translation Lookaside Buffer

  • Messages transmission
  • Lock on the shared memory
  • Copying via that memory
  • "Kill" of TLB

Thread [1]

Task

.text

.data

.bss

.stack

.heap

Translation Lookaside Buffer

  • Direct function calls
  • Direct memory access
  • Single address space

call()

call()

Thread [2]

struct task_struct {
	struct thread_info		thread_info;
	volatile long			state;
	void				*stack;
	atomic_t			usage;
	unsigned int			cpu;
	int				prio;
	struct mm_struct		*mm;
	int				exit_state;
	int				exit_code;
	int				exit_signal;
	pid_t				pid;
	struct task_struct              *parent;
	struct list_head		children;
	u64				start_time;
	const struct cred		*cred;
	struct files_struct		*files;
	struct thread_struct		thread;
};

In Linux threads and processes are "the same"

These are shared between threads

Clone() [1]

int
clone(int (*fn)(void *), void *child_stack,
      int flags, void *arg, ...);

Create a new struct task_struct from user space

#define CLONE_CHILD_CLEARTID
#define CLONE_CHILD_SETTID
#define CLONE_FILES
#define CLONE_FS
#define CLONE_IO
#define CLONE_NEWCGROUP
#define CLONE_NEWIPC
#define CLONE_NEWNET
#define CLONE_NEWNS
#define CLONE_NEWPID
#define CLONE_NEWUSER
#define CLONE_NEWUTS
#define CLONE_PARENT
#define CLONE_PARENT_SETTID
#define CLONE_PID
#define CLONE_PTRACE
#define CLONE_SETTLS
#define CLONE_SIGHAND
#define CLONE_STOPPED
#define CLONE_SYSVSEM
#define CLONE_THREAD
#define CLONE_UNTRACED
#define CLONE_VFORK
#define CLONE_VM
CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD

Clone() [2]

/**
 * Start a new thread with clone(), running func(arg) on a freshly
 * malloc()ed stack.
 *
 * @param func  Entry point of the new thread.
 * @param arg   Opaque argument handed to func.
 * @param stack Out: base address of the allocated stack. The caller must
 *              free() it once the thread no longer uses it. Set to NULL
 *              when the function fails.
 *
 * @return The value returned by clone(), or -1 if the stack could not be
 *         allocated or clone() failed.
 */
int
thread_create_clone(int (*func)(void *), void *arg, void **stack)
{
	int stack_size = 65 * 1024;
	*stack = malloc(stack_size);
	if (*stack == NULL)
		return -1;
	/* The stack grows down, so clone() is given its highest address. */
	void *stack_top = (char *) *stack + stack_size;
	int flags = CLONE_VM | CLONE_FS | CLONE_FILES |
		    CLONE_SIGHAND | CLONE_THREAD;
	int tid = clone(func, stack_top, flags, arg);
	if (tid == -1) {
		/* No thread was started, so nobody will ever use the stack. */
		free(*stack);
		*stack = NULL;
	}
	return tid;
}

stack

stack_top

What to share?

  • CLONE_VM - memory
  • CLONE_FS - work directory
  • CLONE_FILES - file descriptor table
  • CLONE_SIGHAND - signal handlers
  • CLONE_THREAD - keep process ID the same (PID)

Takes beginning of the new stack

Created from here

But the stack grows down, so its beginning is at the higher address

Clone() [3]

volatile bool is_finished = false;

int thread_f(void *arg)
{
	pid_t thread_id = gettid();
	pid_t pid = getpid();
	printf("pid %d, tid %d: new thread, arg = %d\n",
	       (int)pid, (int)thread_id,
               *((int *) arg));
	is_finished = true;
	return 0;
}

/*
 * Print the ids of the main thread, start a second thread with
 * thread_create_clone() and wait until that thread sets is_finished
 * before releasing its stack.
 *
 * Returns 0 on success, 1 if the thread could not be started.
 */
int main()
{
	pid_t thread_id = gettid();
	pid_t pid = getpid();
	printf("pid %d, tid %d: main thread\n",
	       (int)pid, (int)thread_id);
	void *stack;
	int arg = 100;
	int ret = thread_create_clone(thread_f,
                                      (void *) &arg,
                                      &stack);
	printf("pid %d, tid %d: clone result = %d\n",
	       (int)pid, (int)thread_id, ret);
	if (ret == -1) {
		/*
		 * No thread exists, so is_finished would never become true
		 * and the wait loop below would spin forever.
		 */
		free(stack);
		return 1;
	}
	/* The new thread runs on 'stack': wait for it before freeing. */
	while (! is_finished)
		sched_yield();
	free(stack);
	return 0;
}

That will be a new thread

Print thread and process ID in both threads

Start the thread and wait for its termination

Don't forget to free the stack

$> gcc 1_clone.c
$> ./a.out

pid 10194, tid 10194:
    main thread

pid 10194, tid 10194:
    clone result = 10195

pid 10194, tid 10195:
    new thread, arg = 100

The same PID

Different TID

Why wait for termination?

Clone() [4]

volatile bool is_finished = false;

int thread_f(void *arg)
{
	/* ... */
}

int main()
{
	/* ... */
	while (! is_finished)
		sched_yield();
	free(stack);
	return 0;
}

What will happen without waiting for the new thread termination?

1) the stack will be freed right during usage

2) termination of the main thread terminates the process

1 point

Volatile [1]

Volatile memory

Registers

DRAM,

SRAM

Non-volatile memory

Magnetic

Flash

bool value = true;

while (value) {
    /*
     * do not change
     * 'value'
     */
}

gcc

while (true) {
    /* ... */
}
struct object obj;
/* ... */
a = a + obj.value;
/* ... */
b = b + obj.value;
/* ... */
volatile bool value = true;

while (value) {
    /*
     * do not change
     * 'value'
     */
}
volatile bool value = true;

while (value) {
    /*
     * do not change
     * 'value'
     */
}
struct object obj;
int tmp = obj.value;
/* ... */
a = a + tmp;
/* ... */
b = b + tmp;
/* ... */
volatile struct object obj;
/* ... */
a = a + obj.value;
/* ... */
b = b + obj.value;
/* ... */
volatile struct object obj;
/* ... */
a = a + obj.value;
/* ... */
b = b + obj.value;
/* ... */

Volatile [2]

void *
thread_f(void *arg)
{
	*((bool *)arg) = true;
	return NULL;
}

int
main()
{
	const bool is_finished = false;
	pthread_t tid;
	pthread_create(&tid, NULL,
                       thread_f,
		       &is_finished);
	while (! is_finished)
		sched_yield();

	pthread_join(tid, NULL);
	return 0;
}
$> clang 2_non_volatile.c
$> ./a.out

The program hangs

$> gobjdump -d a.out
<_main>:
100000f40: push   %rbp
100000f41: mov    %rsp,%rbp
100000f44: sub    $0x20,%rsp
100000f48: lea    -0x10(%rbp),%rdi
100000f4c: xor    %eax,%eax
100000f4e: mov    %eax,%esi
100000f50: lea    -0x37(%rip),%rdx
100000f57: lea    -0x5(%rbp),%rcx
100000f5b: movl   $0x0,-0x4(%rbp)
100000f62: movb   $0x0,-0x5(%rbp)
100000f66: callq  100000f7c <_main+0x3c>
100000f6b: mov    %eax,-0x14(%rbp)
100000f6e: callq  100000f82 <_main+0x42>
100000f73: mov    %eax,-0x18(%rbp)
100000f76: jmpq   100000f6e <_main+0x2e>

This is the loop

With unconditional jump

It means that is_finished was dropped

How so ...?

Volatile [3]

void *
thread_f(void *arg)
{
	*((bool *)arg) = true;
	return NULL;
}

int
main()
{
	const volatile bool
            is_finished = false;
	pthread_t tid;
	pthread_create(&tid, NULL,
                       thread_f,
		       &is_finished);
	while (! is_finished)
		sched_yield();

	pthread_join(tid, NULL);
	return 0;
}
$> clang 3_volatile.c
$> ./a.out
$> # finished
$> gobjdump -d a.out
<_main>:
100000f00: push   %rbp
...
100000f2e: mov    -0x5(%rbp),%al
100000f31: xor    $0xff,%al
100000f33: test   $0x1,%al
100000f35: jne    100000f40 <_main+0x40>
100000f3b: jmpq   100000f4d <_main+0x4d>
100000f40: callq  100000f74 <_main+0x74>
100000f45: mov    %eax,-0x18(%rbp)
100000f48: jmpq   100000f2e <_main+0x2e>
100000f4d: xor    %eax,%eax
100000f4f: mov    %eax,%esi
100000f51: mov    -0x10(%rbp),%rdi
...
100000f66: retq

The loop has changed

The loop beginning and condition check. If passed - go to the loop

If didn't pass, then jump out of the loop

In the end jump back to the beginning

check_condition:
    if (! is_finished)
        goto do_iteration;
    goto end;

do_iteration:
    sched_yield();
    goto check_condition;

end:

Atomicity [1]

volatile bool is_finished = false;
volatile int counter = 0;

int
thread_f(void *arg)
{
	for (int i = 0; i < 100000; ++i)
		counter = counter + 1;
	is_finished = true;
	return 0;
}

int
main()
{
	void *stack;
	thread_create_clone(thread_f, NULL,
                            &stack);
	for (int i = 0; i < 100000; ++i)
		counter = counter + 1;
	while (! is_finished)
		sched_yield();
	printf("counter = %d\n", counter);
	free(stack);
	return 0;
}
$> gcc 4_not_atomic.c
$> ./a.out
counter = 133664
$> ./a.out
counter = 200000
$> ./a.out
counter = 127934
$> ./a.out
counter = 140386

Atomicity [2]

counter = counter + 1;

gcc

mov counter,%eax
add $0x1,%eax
mov %eax,counter

Load from memory -

Increase -

Store into memory -

Thread 1

Thread 2

mov counter,%eax
mov counter,%eax
add $0x1,%eax
mov %eax,counter
add $0x1,%eax
mov %eax,counter

Increases the old value

Atomicity [3]

__sync_add_and_fetch(&counter, 1);

gcc

lock addl $0x1,counter
type __sync_fetch_and_add (type *ptr, type value, ...)
type __sync_fetch_and_sub (type *ptr, type value, ...)

type __sync_add_and_fetch (type *ptr, type value, ...)
type __sync_sub_and_fetch (type *ptr, type value, ...)

type __sync_val_compare_and_swap (type *ptr, type oldval, type newval, ...)
void __atomic_load (type *ptr, type *ret, int memorder)
void __atomic_store (type *ptr, type *val, int memorder)

void __atomic_exchange (type *ptr, type *val, type *ret, int memorder)
bool __atomic_compare_exchange (type *ptr, type *expected, type *desired, ...)

type __atomic_add_fetch (type *ptr, type val, int memorder)
type __atomic_sub_fetch (type *ptr, type val, int memorder)

type __atomic_fetch_add (type *ptr, type val, int memorder)
type __atomic_fetch_sub (type *ptr, type val, int memorder)

Atomicity [4]

volatile bool is_finished = false;
volatile int counter = 0;

int
thread_f(void *arg)
{
	for (int i = 0; i < 100000; ++i)
		__sync_add_and_fetch(&counter, 1);
	is_finished = true;
	return 0;
}

int
main()
{
	void *stack;
	thread_create_clone(thread_f, NULL,
                            &stack);
	for (int i = 0; i < 100000; ++i)
		__sync_add_and_fetch(&counter, 1);
	while (! is_finished)
		sched_yield();
	printf("counter = %d\n", counter);
	free(stack);
	return 0;
}
$> gcc 5_atomic.c
$> ./a.out
counter = 200000
$> ./a.out
counter = 200000
$> ./a.out
counter = 200000
volatile bool is_finished = false;
volatile int counter = 0;

int
thread_f(void *arg)
{
	for (int i = 0; i < 100000; ++i)
		counter = counter + 1;
	is_finished = true;
	return 0;
}

int
main()
{
	void *stack;
	thread_create_clone(thread_f, NULL,
                            &stack);
	for (int i = 0; i < 100000; ++i)
		counter = counter + 1;
	while (! is_finished)
		sched_yield();
	printf("counter = %d\n", counter);
	free(stack);
	return 0;
}

Atomicity [5]

$> clang 6_bad_lock.c
struct complex_thing {
	bool lock;
	char string[128];
};

volatile struct complex_thing thing;
volatile bool start = false;

void *thread_f(void *arg)
{
	int id = (int) arg;
	while (! start) {};
	if (! thing.lock) {
		thing.lock = true;
		for (int i = 0; i < 127; ++i)
			thing.string[i] = 'a' + id;
	}
	return NULL;
}

int main()
{
	pthread_t tid[6];
	for (int i = 0; i < 6; ++i) {
		pthread_create(&tid[i], NULL, thread_f,
                               (void *) i);
        }
	start = true;
	for (int i = 0; i < 6; ++i)
		pthread_join(tid[i], NULL);
	printf("%s\n", thing.string);
	return 0;
}
$> ./a.out
bbaaaaaaaaaaaaaaaaaaaaaa
bbbbbbaaabbaaaaaaaaabaaa
aabaaaabbbabbbbbbbbaaaaa
aaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaa

Another thread may interfere between these lines

Atomicity [6]

$> clang 6_bad_lock.c
struct complex_thing {
	bool lock;
	char string[128];
};

volatile struct complex_thing thing;
volatile bool start = false;

void *thread_f(void *arg)
{
	int id = (int) arg;
	while (! start) {};
	if (! thing.lock) {
		thing.lock = true;
		for (int i = 0; i < 127; ++i)
			thing.string[i] = 'a' + id;
	}
	return NULL;
}

int main()
{
	pthread_t tid[6];
	for (int i = 0; i < 6; ++i) {
		pthread_create(&tid[i], NULL, thread_f,
                               (void *) i);
        }
	start = true;
	for (int i = 0; i < 6; ++i)
		pthread_join(tid[i], NULL);
	printf("%s\n", thing.string);
	return 0;
}
$> ./a.out
aaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaa

$> ./a.out
ffffffffffffffffffffffff
ffffffffffffffffffffffff
ffffffffffffffffffffffff
ffffffffffffffffffffffff
ffffffffffffffffffffffff
fffffff
struct complex_thing {
	bool lock;
	char string[128];
};

volatile struct complex_thing thing;
volatile bool start = false;

void *thread_f(void *arg)
{
	int id = (int) arg;
	while (! start) {};
	if (__sync_bool_compare_and_swap(&thing.lock, 0, 1)) {
		for (int i = 0; i < sizeof(thing.string) - 1; ++i)
			thing.string[i] = 'a' + id;
		__sync_bool_compare_and_swap(&thing.lock, 1, 0);
	}
	return NULL;
}

int main()
{
	pthread_t tid[6];
	for (int i = 0; i < 6; ++i) {
		pthread_create(&tid[i], NULL, thread_f,
                               (void *) i);
        }
	start = true;
	for (int i = 0; i < 6; ++i)
		pthread_join(tid[i], NULL);
	printf("%s\n", thing.string);
	return 0;
}

Spinlock

/**
 * Acquire the spinlock: keep trying to atomically flip *lock from 0 to 1
 * and return as soon as one attempt succeeds.
 */
void
spin_lock(volatile bool *lock)
{
    for (;;) {
        if (__sync_bool_compare_and_swap(lock, 0, 1))
            return;
    }
}

/**
 * Release the spinlock: atomically flip *lock from 1 back to 0.
 * When *lock is not 1 the value is left untouched.
 */
void
spin_unlock(volatile bool *lock)
{
    bool was_locked = __sync_bool_compare_and_swap(lock, 1, 0);
    (void) was_locked;
}

Blocking by means of the CPU, without the kernel and interrupts

If it unblocks fast, then caches and registers are preserved

If it does not unblock fast, then CPU time and electricity are wasted for nothing

Futex [1]

Fast User-space Mutex

Mutex - is not a basic concept of locking in Linux. The base is Futex.

Managed by the kernel

int
futex(int *uaddr, int futex_op, int val,
      const struct timespec *timeout,
      int *uaddr2, int val3);

Futex [2]

int
futex(int *uaddr, int futex_op, int val,
      const struct timespec *timeout,
      int *uaddr2, int val3);

Futex - is an integer

Thread wants to know when this value will be != val

futex(&value, FUTEX_WAIT, wait_not_for);

Another thread may notify about the change 1 or more threads

futex(&value, FUTEX_WAKE, n_to_wake);

Futex [3]

Thread 1

Thread 2

int value = 100;
futex(&value, FUTEX_WAIT, 100,
      NULL, NULL, 0);
/* sleep ... */

Thread sleeps and wants to wakeup when value != 100

value = 0;
futex(&value, FUTEX_WAKE, 1,
      NULL, NULL, 0);

Another thread changes the value to 0 and notifies the waiters.

value != 100;
/* continue ... */

The kernel wakes the thread up

Futex [4]

/**
 * Issue the FUTEX_WAIT operation on *futex through a raw syscall(),
 * passing val as the expected value and no timeout.
 *
 * @param futex Address of the futex word.
 * @param val   Value passed to FUTEX_WAIT.
 *
 * @return Whatever syscall() returned.
 */
int
futex_wait(int *futex, int val)
{
	int rc = syscall(SYS_futex, futex, FUTEX_WAIT, val, NULL, NULL, 0);
	return rc;
}

/**
 * Issue the FUTEX_WAKE operation on *futex through a raw syscall(),
 * asking to wake 1 waiter.
 *
 * @param futex Address of the futex word.
 *
 * @return Whatever syscall() returned.
 */
int
futex_wake(int *futex)
{
	int rc = syscall(SYS_futex, futex, FUTEX_WAKE, 1, NULL, NULL, 0);
	return rc;
}

/**
 * Lock the futex. The word holds only two values: 0 (free) and 1 (locked).
 * First try to switch 0 -> 1 atomically in user space; when that fails,
 * call futex_wait() with the value 1 and try again afterwards.
 */
void
futex_lock(int *futex)
{
	for (;;) {
		int prev = __sync_val_compare_and_swap(futex, 0, 1);
		if (prev == 0)
			return;
		futex_wait(futex, 1);
	}
}

/**
 * Unlock the futex: atomically switch the word from 1 (locked) back to
 * 0 (free), then call futex_wake() on it.
 */
void
futex_unlock(int *futex)
{
	(void) __sync_bool_compare_and_swap(futex, 1, 0);
	(void) futex_wake(futex);
}

No library wrapper - use direct syscall.

Futex does not change the value, only userspace does it, and it needs to be atomic

Fast User-space Mutex - because the first attempt is in userspace using a simple atomic operation

Unlock is the same

Store 2 values only - 0 and 1, like free and locked

Futex [5]

volatile int counter = 0;
volatile bool start = false;
volatile int futex = 0;

void *thread_f(void *futex)
{
	while (! start) {};

	for (int i = 0; i < 100000; ++i) {
		futex_lock((int *) futex);
		counter = counter + 1;
		futex_unlock((int *) futex);
	}
	return NULL;
}

int main()
{
	pthread_t tid[6];
	int futex = 0;
	for (int i = 0; i < 6; ++i) {
		pthread_create(&tid[i], NULL, thread_f,
                               (void *) &futex);
        }
	start = true;
	for (int i = 0; i < 6; ++i)
		pthread_join(tid[i], NULL);
	printf("%d\n", counter);
	return 0;
}
$> gcc 8_futex.c -pthread
$> ./a.out
600000
$> ./a.out
600000
$> ./a.out
600000
$> ./a.out
600000

Memory barriers [1]

read a

write b

read c

The problem:

Source code:

Execution:

write e

read a

write b

read c

write e

> code

The CPU can execute instructions in arbitrary order if they do not depend on each other's data

Memory barriers [2]. Simple error

Because the CPU may reorder instructions, code like this is not allowed (it is not safe):

while (! is_ready)
	usleep(10000);
printf("%d\n", a);
a = 200;
is_ready = true;
a = 100

Thread 1

Thread 2

Both 100 and 200 can be printed. Thread 1 may see is_ready=true earlier than a=200. Because of reordering.

Without a proper protection threads may see each other in absolutely any state

Memory barriers [3]. Complex error

X = 1
r1 = Y

Thread 1

Y = 1
r2 = X

Thread 2

X = 0
Y = 0

In fact in the end r1 and r2 can both become 0, even on x86. These commands consist of instructions:

write(X, 1)
read(register, Y)
write(r1, register)

Thread 1

Thread 2

write(Y, 1)
read(register, X)
write(r2, register)

The reads can finish first

Obviously, in the end either r1, or r2, or both must = 1

Memory barriers [4]. In theory

LoadLoad

LoadStore

StoreStore

StoreLoad

read(B);
     write(A);
A = B;
C = D;
read(D);
     write(C);
read(B);
     write(A);
     read(D);
write(C);
read(B);
     write(A);
read(D);
     write(C);
read(B);
     write(A);
     read(D);
write(C);

Barrier

Barrier

Barrier

Barrier

Memory barriers [5]. In practice

StoreStore

StoreLoad

LoadLoad

LoadStore

Acquire

Release

3 really existing barriers:

  • Acquire (Acquire-Read)
  • Release (Release-Write)
  • Full

Full

Memory barriers [6]. Acquire-Release

Acquire: LoadLoad + LoadStore - no reorderings from 'down' to 'up'

Release: LoadStore + StoreStore - no reorderings from 'up' to 'down'

while (! read_acquire(is_ready))
	usleep(10000);

printf("%d\n", a);
a = 200;


write_release(&is_ready, true);
a = 100

Thread 1

Thread 2

Acquire won't allow variable a to be read before is_ready.

Release won't allow variable a to be written after is_ready.

Memory barriers [7]. Usage

Instructions with a full barrier

type __sync_fetch_and_add (type *ptr, type value, ...)
type __sync_fetch_and_sub (type *ptr, type value, ...)

type __sync_add_and_fetch (type *ptr, type value, ...)
type __sync_sub_and_fetch (type *ptr, type value, ...)

type __sync_val_compare_and_swap (type *ptr, type oldval, type newval, ...)
void __atomic_load (type *ptr, type *ret, int memorder)
void __atomic_store (type *ptr, type *val, int memorder)

void __atomic_exchange (type *ptr, type *val, type *ret, int memorder)
bool __atomic_compare_exchange (type *ptr, type *expected, type *desired, ...)

type __atomic_add_fetch (type *ptr, type val, int memorder)
type __atomic_sub_fetch (type *ptr, type val, int memorder)

type __atomic_fetch_add (type *ptr, type val, int memorder)
type __atomic_fetch_sub (type *ptr, type val, int memorder)

Instructions with a configurable barrier

__ATOMIC_RELAXED, __ATOMIC_CONSUME, __ATOMIC_ACQUIRE,
__ATOMIC_RELEASE, __ATOMIC_ACQ_REL, __ATOMIC_SEQ_CST
memorder:

Memory barriers [8]. Spinlock

/**
 * Acquire the spinlock using an atomic test-and-set with acquire ordering
 * instead of a full barrier.
 */
void
spin_lock(volatile bool *lock)
{
    /*
     * __atomic_test_and_set() returns the PREVIOUS state of *lock: true
     * means somebody else already holds the lock, so keep spinning; false
     * means this call is the one that set it, so the lock is now ours.
     */
    while (__atomic_test_and_set(lock, __ATOMIC_ACQUIRE))
    {}
}

/**
 * Release the spinlock: atomically write 0 into *lock with release
 * ordering.
 */
void
spin_unlock(volatile bool *lock)
{
    __atomic_store_n(lock, false, __ATOMIC_RELEASE);
}

Optimal spinlock without full barriers

Works ~1.5 times faster than the previous version with

__sync_bool_compare_and_swap

Portability

clone();
futex();

Linux-specific

__sync_bool_compare_and_swap();
__sync_val_compare_and_swap();
__sync_add_and_fetch();
/* ... */

GCC-specific

Mac? FreeBSD? ...

clang? MinGW?

Windows-specific

_InterlockedExchange();
_InterlockedCompareExchange();
_InterlockedIncrement();
WaitOnAddress()
/* ... */

pthread [1]

pthread_create();
pthread_join();
pthread_exit();
pthread_mutex_lock();
pthread_mutex_unlock();
/* ... */

GCC on Linux

futex();
clone();
__sync_*();

GCC on Mac

MSVC on Windows

Probably "mach threads"

Not present at all

pthread [2]

int
pthread_create(pthread_t *thread, const pthread_attr_t *attr,
               void *(*start_routine) (void *), void *arg);

pthread_t - thread descriptor

Function to run in the new thread

struct pthread_t {
        int lock;
        pid_t tid;
        void *func_result;
        void *stack;
        size_t stack_size;
};

Lock to protect this structure

Kernel id of this thread

Function result kept for pickup

Thread stack for clone(), which is allocated and will be freed automatically

pthread [3]

volatile bool is_finished = false;

volatile pthread_t child_id;

int
thread_f(void *arg)
{
	child_id = pthread_self();
	is_finished = true;
	return 0;
}

int
main()
{
	void *stack;
	thread_create_clone(thread_f, NULL, &stack);

	while (! is_finished)
		sched_yield();
	
	pthread_join(child_id, NULL);
	free(stack);
	return 0;
}
$> gcc 9_clone_vs_pthread.c \
    -pthread
$> ./a.out
$> # finished ok

A pthread function is called from a thread that was not created by pthread_create()

Resources are freed by pthread_join() - pthread_t is destroyed

pthread [4]

/*
 * Sketch of a possible pthread_join(): wait until the thread is marked
 * finished, free its stack and hand the function result to the caller.
 *
 * retval may be NULL when the caller is not interested in the result.
 * NOTE(review): the struct pthread_t sketch has no is_finished field;
 * it is assumed to exist here - confirm.
 */
int
pthread_join(pthread_t thread, void **retval)
{
        while (! thread.is_finished) {};
        free(thread.stack);
        if (retval != NULL)
                *retval = thread.func_result;
        return 0;
}

pthread [5]

typedef int (*cthread_f)(void *);

struct cthread {
	int returned_code;
	cthread_f func;
	void *arg;
	void *stack;
	bool is_finished;
};

int cthread_runner(void *arg)
{
	struct cthread *thread =
                (struct cthread *) arg;
	thread->returned_code =
                thread->func(thread->arg);
	thread->is_finished = true;
	return 0;
}
void cthread_create(struct cthread *result,
                    cthread_f func, void *arg)
{
	result->returned_code = 0;
	result->func = func;
	result->arg = arg;
	result->is_finished = false;
	thread_create_clone(cthread_runner,
                            (void *) result,
			    &result->stack);
}

int cthread_join(volatile struct
                 cthread *thread)
{
	while (! thread->is_finished)
		sched_yield();
	free(thread->stack);
	return thread->returned_code;
}

Thread descriptor

Stack and return code for cthread_join

Start a thread, but with different function - why?

Because user-defined function won't save its result into cthread, and won't set finish flag

1 point

pthread [6]

int
func(void *arg)
{
	printf("arg = %d\n", *((int *) arg));
	return 200;
}

int
main()
{
	struct cthread thread;
	int arg = 100;
	cthread_create(&thread, func, (void *) &arg);
	int retcode = cthread_join(&thread);
	printf("thread is joined with retcode = %d\n", retcode);
	return 0;
}
$> gcc 10_cthreads_1.c
$> ./a.out
arg = 100
thread is joined with retcode = 200

pthread [7]

void
pthread_exit(void *retval);

How is pthread_exit() implemented?

setjmp()/longjmp()

1 point

pthread [8]

typedef int (*cthread_f)(void *);

struct cthread {
	int returned_code;
	cthread_f func;
	void *arg;
	void *stack;
	bool is_finished;
};


int cthread_runner(void *arg)
{
	struct cthread *thread =
                (struct cthread *) arg;
	thread->returned_code =
                thread->func(thread->arg);
	thread->is_finished = true;
	return 0;
}
void cthread_create(struct cthread *result,
                    cthread_f func, void *arg)
{
	result->returned_code = 0;
	result->func = func;
	result->arg = arg;
	result->is_finished = false;
	thread_create_clone(cthread_runner,
                            (void *) result,
			    &result->stack);
}

int cthread_join(volatile struct
                 cthread *thread)
{
	while (! thread->is_finished)
		sched_yield();
	free(thread->stack);
	return thread->returned_code;
}
typedef int (*cthread_f)(void *);

struct cthread {
	int returned_code;
	cthread_f func;
	void *arg;
	void *stack;
	bool is_finished;
	jmp_buf jmp;
};

int cthread_runner(void *arg)
{
	struct cthread *thread =
                (struct cthread *) arg;
        if (setjmp(thread->jmp) == 0) {
                arg = thread->arg;
		thread->returned_code =
			thread->func(arg);
	}
	thread->is_finished = true;
	return 0;
}
void cthread_exit(struct cthread *thread,
                  int retcode)
{
	thread->returned_code = retcode;
	longjmp(thread->jmp, 1);
}

Stack position is saved in the beginning, and the thread can jump here any moment

pthread [9]

int
pthread_detach(pthread_t thread);

Detaching makes it unnecessary to call pthread_join(): the resources are freed automatically when the thread finishes.

pthread [10]

typedef int (*cthread_f)(void *);

struct cthread {
	int returned_code;
	cthread_f func;
	void *arg;
	void *stack;
	bool is_finished;
	bool is_detached;
	jmp_buf jmp;
};

void cthread_destroy(struct cthread *thread)
{
	printf("thread is destroyed\n");
	free(thread->stack);
}

int cthread_runner(void *arg)
{
	struct cthread *thread =
                (struct cthread *) arg;
	if (setjmp(thread->jmp) == 0) {
                arg = thread->arg;
		thread->returned_code =
			thread->func(arg);
	}
	if (thread->is_detached)
		cthread_destroy(thread);
	thread->is_finished = true;
	return 0;
}
void cthread_detach(struct cthread *thread)
{
	if (thread->is_finished)
		cthread_destroy(thread);
	thread->is_detached = true;
}

void cthread_create(struct cthread *result,
                    cthread_f func, void *arg)
{
	result->returned_code = 0;
	result->func = func;
	result->arg = arg;
	result->is_finished = false;
	result->is_detached = false;
	thread_create_clone(cthread_runner,
                            (void *) result,
			    &result->stack);
}

Flag is_detached is added

pthread [11]

void cthread_detach(struct cthread *thread)
{
	if (thread->is_finished)
		cthread_destroy(thread);
	thread->is_detached = true;
}
void cthread_destroy(struct cthread *thread)
{
	printf("thread is destroyed\n");
	free(thread->stack);
}

int cthread_runner(void *arg)
{
	/* ... */
	if (thread->is_detached)
		cthread_destroy(thread);
	thread->is_finished = true;
	return 0;
}

Find here 2 bugs

1) Stack is freed right in cthread_runner, which works on this stack right now.

2) Flags is_detached and is_finished are accessed not atomically.

1 point for each

pthread [12]

int
cthread_runner(void *arg)
{
	struct cthread *thread = (struct cthread *) arg;
	if (setjmp(thread->jmp) == 0) {
		thread->returned_code =
			thread->func(thread->arg);
	}
	if (thread->is_detached)
		cthread_destroy(thread);
	thread->is_finished = true;
	return 0;
}

Own stack can't be deleted here - it is used even by return

struct cthread_stack {
	void *stack;
	struct cthread_stack *next;
};

The stack does not have to be deleted right now: its deletion can be postponed until the thread is finished. But how to find out whether the thread is really finished, even in the kernel?

pthread [13]

/**
 * Start a new thread with clone() on a freshly malloc()ed stack and ask
 * the kernel to maintain the thread id stored at *tid
 * (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID).
 *
 * @param func  Entry point of the new thread.
 * @param arg   Opaque argument handed to func.
 * @param stack Out: base address of the allocated stack. Set to NULL when
 *              the function fails.
 * @param tid   Address passed to clone() as the child tid location; the
 *              kernel nullifies it when the thread is fully finished.
 *
 * @return The value returned by clone(), or -1 if the stack could not be
 *         allocated or clone() failed.
 */
static inline int
thread_create_clone_tid(int (*func)(void *), void *arg, void **stack,
			pid_t *tid)
{
	int stack_size = 65 * 1024;
	*stack = malloc(stack_size);
	if (*stack == NULL)
		return -1;
	/* The stack grows down, so clone() is given its highest address. */
	void *stack_top = (char *) *stack + stack_size;
	int flags = CLONE_VM | CLONE_FS | CLONE_FILES |
		    CLONE_SIGHAND | CLONE_THREAD |
                    CLONE_CHILD_CLEARTID | CLONE_CHILD_SETTID;
	/* Variadic tail: parent tid address, TLS, child tid address. */
	int rc = clone(func, stack_top, flags, arg, NULL, NULL, tid);
	if (rc == -1) {
		/* No thread was started, so nobody will ever use the stack. */
		free(*stack);
		*stack = NULL;
	}
	return rc;
}

In clone() it is possible to ask to nullify a given address when the thread is fully finished.

struct cthread_stack {
	pid_t tid;
	void *stack;
	struct cthread_stack *next;
};

With each stack a number is associated which the kernel will nullify when the thread (task_struct) is fully finished

pthread [14]

struct cthread {
	int returned_code;
	cthread_f func;
	void *arg;
	void *stack;
	bool is_finished;
	bool is_detached;
	jmp_buf jmp;
};

void cthread_destroy(struct cthread *thread)
{
	printf("thread is destroyed\n");
	free(thread->stack);
}

void cthread_detach(struct cthread *thread)
{
	if (thread->is_finished)
		cthread_destroy(thread);
        sleep(1);
	thread->is_detached = true;
}

int func(void *arg)
{
	printf("thread started\n");
	return 0;
}
int main()
{
	struct cthread thread;
	cthread_create(&thread, func, NULL);
	cthread_detach(&thread);
	while (! thread.is_finished)
		sched_yield();
	printf("detached thread finished\n");
	return 0;
}
$> gcc 11_bad_detach.c
$> ./a.out
thread started
detached thread finished

The stack is not freed

pthread [15]

struct cthread_stack {
	pid_t tid;
	void *stack;
	struct cthread_stack *next;
};

struct cthread_stack *stack_list = NULL;
volatile bool last_stack_lock = false;

struct cthread {
	int returned_code;
	cthread_f func;
	void *arg;
	struct cthread_stack *stack;
	bool lock;
	bool is_finished;
	bool is_detached;
	jmp_buf jmp;
};
void cthread_destroy(struct cthread *thread)
{
	printf("thread is destroyed\n");
	spin_lock(&last_stack_lock);
	struct cthread_stack *iter = stack_list;
	while (iter != NULL) {
		if (iter->tid != 0)
			break;
		struct cthread_stack *next =
                        iter->next;
		free(iter->stack);
		free(iter);
		iter = next;
		printf("a stack is freed\n");
	}
	thread->stack->next = iter;
	stack_list = thread->stack;
	spin_unlock(&last_stack_lock);
}

A list of stacks and a lock to protect it. Each cthread has own stack.

At deletion try to free the stacks, which the kernel allowed to free

Those whose tid != 0 are still used - don't touch them

Put own stack into the list

int cthread_runner(void *arg)
{
	struct cthread *thread =
                (struct cthread *) arg;
	if (setjmp(thread->jmp) == 0) {
                arg = thread->arg;
		thread->returned_code =
			thread->func(arg);
	}
	spin_lock(&thread->lock);
	if (thread->is_detached)
		cthread_destroy(thread);
	thread->is_finished = true;
	spin_unlock(&thread->lock);
	return 0;
}

Operations with flags is_detached, is_finished are now protected

void cthread_detach(struct cthread *thread)
{
	spin_lock(&thread->lock);
	if (thread->is_finished)
		cthread_destroy(thread);
	thread->is_detached = true;
	spin_unlock(&thread->lock);
}

The same

void cthread_create(struct cthread *result, cthread_f func,
	            void *arg)
{
	result->returned_code = 0;
	result->func = func;
	result->arg = arg;
	result->is_finished = false;
	result->is_detached = false;
	result->lock = false;
	result->stack =
                malloc(sizeof(*result->stack));
	result->stack->next = NULL;
        void **stack_p = &result->stack->stack;
        pid_t *tid_p = &result->stack->tid;
	thread_create_clone_tid(cthread_runner,
                                result,
                                stack_p,
                                tid_p);
}

struct cthread_stack is created on heap so as it could survive thread termination. Here the kernel will update tid, when clone() is finished.

pthread [16]

int func(void *arg)
{
	printf("thread started\n");
	return 0;
}

/*
 * Start 10 threads, detach each of them right away and wait until all of
 * them have set their is_finished flag.
 */
int main()
{
	struct cthread thread[10];
	for (int i = 0; i < 10; ++i) {
		cthread_create(&thread[i], func,
                               NULL);
		cthread_detach(&thread[i]);
	}
	/*
	 * Wait for every thread in turn. Restarting the scan with 'i = 0'
	 * inside a for-loop would be followed by '++i', so thread[0] would
	 * never be re-checked and main could finish while it still runs.
	 */
	for (int i = 0; i < 10; ++i) {
		while (! thread[i].is_finished)
			sched_yield();
	}
	printf("detached threads finished\n");
	return 0;
}
$> gcc 12_cthread_detach.c
$> ./a.out
thread started
thread is destroyed
thread started
thread is destroyed
a stack is freed
thread started
thread is destroyed
a stack is freed
thread started
thread is destroyed
a stack is freed
thread started
thread is destroyed
a stack is freed
thread started
thread is destroyed
a stack is freed
thread started
thread is destroyed
a stack is freed
thread started
thread is destroyed
a stack is freed
thread started
thread is destroyed
a stack is freed
thread started
thread is destroyed
a stack is freed
detached threads finished

pthread [17]

pthread_create();
pthread_join();
pthread_detach();

pthread [18]

int
pthread_mutex_destroy(pthread_mutex_t *mutex);

int
pthread_mutex_init(pthread_mutex_t *mutex,
                   const pthread_mutexattr_t *attr);

int
pthread_mutex_lock(pthread_mutex_t *mutex);

int
pthread_mutex_trylock(pthread_mutex_t *mutex);

int
pthread_mutex_unlock(pthread_mutex_t *mutex);

int
pthread_mutex_timedlock(pthread_mutex_t *mutex,
                        const struct timespec *abs_timeout); 

On Linux the mutexes are implemented via futex and atomic operations

pthread [18]

int
pthread_mutex_trylock(pthread_mutex_t *mutex);

How is pthread_mutex_trylock implemented?

{
        /*
         * The compare-and-swap works on the ADDRESS of the lock word.
         * pthread_mutex_trylock() reports success as 0 and an already
         * locked mutex as EBUSY (from <errno.h>), not as a boolean.
         */
        if (__sync_bool_compare_and_swap(&mutex->lock, 0, 1))
                return 0;
        return EBUSY;
}

1 point

pthread [19]

int
pthread_rwlock_destroy(pthread_rwlock_t *rwlock);

int
pthread_rwlock_init(pthread_rwlock_t *rwlock,
                    const pthread_rwlockattr_t *attr);

int
pthread_rwlock_rdlock(pthread_rwlock_t *rwlock);

int
pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock);

int
pthread_rwlock_timedrdlock(pthread_rwlock_t *rwlock,
                           const struct timespec *abs_timeout);

int
pthread_rwlock_timedwrlock(pthread_rwlock_t *rwlock,
                           const struct timespec *abs_timeout);

int
pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock);

int
pthread_rwlock_wrlock(pthread_rwlock_t *rwlock);

int
pthread_rwlock_unlock(pthread_rwlock_t *rwlock);

Any number of read locks

Only one on write

On both rdlock() and wrlock() only one unlock()

pthread [20]

int
pthread_rwlock_rdlock(pthread_rwlock_t *rwlock);

int
pthread_rwlock_wrlock(pthread_rwlock_t *rwlock);

int
pthread_rwlock_unlock(pthread_rwlock_t *rwlock);

Why is there only one unlock()? Why are there no separate rdunlock() and wrunlock()?

/*
 * Possible implementation. The lock itself knows how it is held: if there
 * are readers, one reader leaves; otherwise the writer releases it. That
 * is why a single unlock() is enough.
 */
{
        if (rwlock->readers > 0)
                rwlock->readers--;
        else
                rwlock->writers = 0;
}

1 point

pthread [21]

/* Trivial condition variable: an event flag guarded by a mutex. */
struct condvar {
	/* Mutex guarding is_event_set; taken in condvar_create(). */
	pthread_mutex_t event_lock;
	/* Set by condvar_signal(), cleared by the waiter which saw it. */
	bool is_event_set;
};

/*
 * Publish one event. The caller is expected to hold event_lock
 * (condvar_create() leaves it locked): the flag is raised, the lock
 * is released so a waiter can observe the flag, and then the lock is
 * taken back and the flag is cleared.
 * Nothing here waits for a waiter between the unlock and the lock, so
 * the signaler may re-take the mutex before any waiter gets it.
 */
void condvar_signal(struct condvar *condvar)
{
	printf("condvar signaled\n");
	condvar->is_event_set = true;
	/* Open the window in which a waiter may grab the lock. */
	pthread_mutex_unlock(&condvar->event_lock);

	/* Close the window again and drop the event, consumed or not. */
	pthread_mutex_lock(&condvar->event_lock);
	condvar->is_event_set = false;
	printf("condvar is locked\n");
}

/*
 * Block until an event is observed. Repeatedly takes event_lock and
 * checks the flag under it; the first waiter to see the flag set
 * clears it (consuming the event) and returns. Otherwise the lock is
 * released and the attempt is repeated.
 */
void condvar_wait(struct condvar *condvar)
{
        pthread_mutex_t *lock =
                &condvar->event_lock;
	for (;;) {
		pthread_mutex_lock(lock);
		if (condvar->is_event_set) {
			/* Consume the event so other waiters do not see it. */
			condvar->is_event_set =
                                false;
			pthread_mutex_unlock(lock);
			return;
		}
		/* No event yet - let others in and retry. */
		pthread_mutex_unlock(lock);
	}
}
/*
 * Initialize the condition variable. The mutex is created and locked
 * right away by the calling thread (the event generator), and the
 * event flag starts cleared. The mutex stays locked on return.
 */
void condvar_create(struct condvar *condvar)
{
        /* event_lock is a mutex, so the pointer must be pthread_mutex_t *. */
        pthread_mutex_t *lock =
                &condvar->event_lock;
	pthread_mutex_init(lock, NULL);
	pthread_mutex_lock(lock);
	condvar->is_event_set = false;
}

A trivial condition variable is an event flag protected by a mutex

The lock is taken at initialization by the event generator

The waiting threads also try to take the lock, but are blocked

When a new event appears, the generator frees the lock, and one of the waiters wakes up

pthread [22]

struct condvar condvar;

/*
 * Worker body: endlessly waits for an event on the global condvar and
 * reports each one it receives. The worker number arrives packed into
 * the pointer argument. Never returns.
 */
void *
thread_f(void *arg)
{
	const int worker_id = (int) arg;
	while (1) {
		condvar_wait(&condvar);
		printf("%d processed the event\n", worker_id);
	}
}

/*
 * Demo driver: prepares the condvar, starts the workers, and then
 * signals one event per character read from stdin until EOF.
 */
int
main()
{
	condvar_create(&condvar);
	const int thread_count = 10;
	pthread_t tid[thread_count];
	int idx = 0;
	/* The worker number is passed inside the pointer argument. */
	while (idx < thread_count) {
		pthread_create(&tid[idx], NULL, thread_f, (void *) idx);
		++idx;
	}
	while (getchar() != EOF)
		condvar_signal(&condvar);
	/* thread_f() loops forever, so these joins are not expected to return. */
	for (idx = 0; idx < thread_count; ++idx)
		pthread_join(tid[idx], NULL);
	return 0;
}
$> # Mac
$> clang 13_simple_condvar.c
$> ./a.out

condvar signaled
0 processed the event
condvar is locked

condvar signaled
1 processed the event
condvar is locked

condvar signaled
1 processed the event
condvar is locked

$> # Linux
$> gcc 13_simple_condvar.c \
    -pthread
$> ./a.out

condvar signaled
condvar is locked

condvar signaled
condvar is locked

condvar signaled
condvar is locked

condvar signaled
condvar is locked

pthread [23]

/* Condition variable: an event flag under a mutex plus a waiter counter. */
struct condvar {
	/* Mutex guarding is_event_set; taken in condvar_create(). */
	pthread_mutex_t event_lock;
	/* Set by condvar_signal(), cleared by the waiter which saw it. */
	bool is_event_set;
	/*
	 * Number of threads currently inside condvar_wait(). Updated
	 * and read with __sync atomic builtins, not under the mutex.
	 */
	int wait_count;
};

/*
 * Publish one event, but only if somebody is waiting for it.
 * With waiters present: raise the flag, release the lock, spin until
 * a waiter clears the flag, and then take the lock back. With no
 * waiters the call only prints a message and returns, so the signaler
 * can not get stuck waiting for a consumer which does not exist.
 */
void
condvar_signal(struct condvar *condvar)
{
        int *ptr = &condvar->wait_count;
        pthread_mutex_t *lock =
                &condvar->event_lock;
	/* Adding 0 is used as an atomic read of the counter. */
	int wait_count =
                __sync_fetch_and_add(ptr,
                                     0);
	printf("wait_count = %d\n",
               wait_count);
	if (wait_count == 0) {
		printf("no waiters\n");
		return;
	}
	printf("condvar signaled\n");
	condvar->is_event_set = true;
	pthread_mutex_unlock(lock);

	/*
	 * Busy-wait until some waiter consumes the event. The flag is
	 * a plain bool read here without the lock.
	 * NOTE(review): an optimizing compiler may hoist this read -
	 * confirm the flag does not need to be volatile/atomic.
	 */
	while (condvar->is_event_set) {}

	pthread_mutex_lock(lock);
	/* The waiter has already cleared the flag; kept for symmetry. */
	condvar->is_event_set = false;
	printf("condvar is locked\n");
}
/*
 * Block until an event is observed. The caller registers itself in
 * wait_count first, then repeatedly takes the lock and checks the
 * flag. The waiter that finds the flag set clears it, unregisters
 * itself and returns; the others keep retrying.
 */
void
condvar_wait(struct condvar *condvar)
{
        int *wc = &condvar->wait_count;
        pthread_mutex_t *lock = &condvar->event_lock;
	/* Let condvar_signal() know there is a waiter. */
	__sync_add_and_fetch(wc, 1);
	for (;;) {
		pthread_mutex_lock(lock);
		if (condvar->is_event_set) {
			/* Consume the event; this also releases the signaler's spin. */
			condvar->is_event_set = false;
			__sync_sub_and_fetch(wc, 1);
			pthread_mutex_unlock(lock);
			return;
		}
		/* No event yet - let others in and retry. */
		pthread_mutex_unlock(lock);
	}
}

/*
 * Prepare a condvar for use: the mutex is initialized and locked at
 * once by the calling thread, the event flag is cleared and the
 * waiter counter is reset. The mutex stays locked on return.
 */
void
condvar_create(struct condvar *condvar)
{
	pthread_mutex_init(&condvar->event_lock, NULL);
	pthread_mutex_lock(&condvar->event_lock);
	condvar->wait_count = 0;
	condvar->is_event_set = false;
}

Add wait_count

pthread [24]

struct condvar condvar;

/*
 * Worker body: endlessly waits for an event on the global condvar,
 * reports it, and then sleeps for 100000 microseconds before waiting
 * again. The worker number arrives packed into the pointer argument.
 * Never returns.
 */
void *
thread_f(void *arg)
{
	const int worker_id = (int) arg;
	while (1) {
		condvar_wait(&condvar);
		printf("%d processed the event\n", worker_id);
		usleep(100000);
	}
}

/*
 * Demo driver: prepares the condvar, starts the workers, and then
 * signals one event per character read from stdin until EOF.
 */
int
main()
{
	condvar_create(&condvar);
	const int thread_count = 10;
	pthread_t tid[thread_count];
	for (int i = 0; i < thread_count; ++i) {
		/*
		 * pthread_create() takes a void * argument; the worker
		 * number is packed into it with an explicit cast.
		 */
		pthread_create(&tid[i], NULL,
                               thread_f,
                               (void *) i);
        }
	while (getchar() != EOF)
		condvar_signal(&condvar);
	/* thread_f() loops forever, so these joins are not expected to return. */
	for (int i = 0; i < thread_count; ++i)
		pthread_join(tid[i], NULL);
	return 0;
}
$> gcc 14_condvar.c \
    -pthread
$> ./a.out
wait_count = 10
condvar signaled
2 processed the event
condvar is locked
asdfghjklqwert
wait_count = 10
condvar signaled
4 processed the event
condvar is locked
wait_count = 9
condvar signaled
3 processed the event
condvar is locked
wait_count = 8
...
wait_count = 2
condvar signaled
1 processed the event
condvar is locked
wait_count = 1
condvar signaled
condvar is locked
2 processed the event
wait_count = 0
no waiters
wait_count = 0
no waiters

pthread [25]

int
pthread_cond_destroy(pthread_cond_t *cond);

int
pthread_cond_init(pthread_cond_t *cond,
                  const pthread_condattr_t *attr);

int
pthread_cond_broadcast(pthread_cond_t *cond);

int
pthread_cond_signal(pthread_cond_t *cond);

int
pthread_cond_timedwait(pthread_cond_t *cond,
                       pthread_mutex_t *mutex,
                       const struct timespec *abstime);

int
pthread_cond_wait(pthread_cond_t *cond,
                  pthread_mutex_t *mutex); 

pthread [26]

int
pthread_barrier_destroy(pthread_barrier_t *barrier);

int
pthread_barrier_init(pthread_barrier_t *barrier,
                     const pthread_barrierattr_t *attr,
                     unsigned count);

int
pthread_barrier_wait(pthread_barrier_t *barrier);

N

pthread [27]

static __thread int a = -1;

volatile int ready_to_print = 0;
volatile bool print = false;

/*
 * Stores the number passed via arg into this thread's copy of the
 * __thread variable 'a', reports readiness, waits for the 'print'
 * flag and then prints its own copy of 'a'.
 */
void *
thread_f(void *arg)
{
	a = (int) arg;
	/* Tell main this thread has written its copy. */
	__sync_fetch_and_add(&ready_to_print, 1);
	/* Spin until main allows printing. */
	while (print == false)
		;
	printf("a = %d\n", a);
	return NULL;
}

/*
 * Shows that every thread has its own copy of the __thread variable
 * 'a': main sets its copy to 0, two threads set theirs to 1 and 2,
 * and each one prints the value it wrote itself.
 */
int
main()
{
	pthread_t t1, t2;
	a = 0;
	/* The argument is a void *, so the numbers are cast explicitly. */
	pthread_create(&t1, NULL, thread_f, (void *) 1);
	pthread_create(&t2, NULL, thread_f, (void *) 2);
	/* Adding 0 is an atomic read; wait until both threads are ready. */
	while (__sync_fetch_and_add(&ready_to_print,
                                    0) != 2) {}
	print = true;
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("main a = %d\n", a);
	return 0;
}
$> gcc 15_gcc_thread.c \
    -pthread
$> ./a.out
a = 2
a = 1
main a = 0

Copy of that variable is created for each thread

pthread [28]

.text

.data

.bss

.heap

.stack

.env

kernel

.tdata

+

$> gcc 15_gcc_thread.c \
    -pthread
$> objdump -s a.out
...
Contents of section .tdata:
 200d8c ffffffff
...
int
pthread_create(pthread_t *thread, const pthread_attr_t *attr,
               void *(*start_routine) (void *), void *arg)
{
        copy_tdata();
        /* ... */

pthread [29]

void *
pthread_getspecific(pthread_key_t key);

int
pthread_setspecific(pthread_key_t key,
                    const void *value);

int
pthread_key_create(pthread_key_t *key,
                   void (*destructor)(void*));

pthread [30]

volatile int ready_to_print = 0;
volatile bool print = false;

pthread_key_t key;

/*
 * Stores arg as this thread's value for the shared key, reports
 * readiness, waits for the 'print' flag and then prints the value
 * read back through the same key.
 */
void *
thread_f(void *arg)
{
	pthread_setspecific(key, arg);
	/* Tell main this thread has stored its value. */
	__sync_fetch_and_add(&ready_to_print, 1);
	/* Spin until main allows printing. */
	while (print == false)
		;
	printf("value = %d\n", (int) pthread_getspecific(key));
	return NULL;
}

/*
 * Creates one key, stores 0 under it in the main thread, and lets two
 * threads store 1 and 2 under the same key. After both have finished,
 * prints the value the main thread reads back through the key.
 */
int
main()
{
	pthread_t first, second;
	pthread_key_create(&key, NULL);
	pthread_setspecific(key, (const void *) 0);

	pthread_create(&first, NULL, thread_f, (void *) 1);
	pthread_create(&second, NULL, thread_f, (void *) 2);
	/* Adding 0 is an atomic read; wait until both threads are ready. */
	while (__sync_fetch_and_add(&ready_to_print, 0) != 2)
		;
	print = true;
	pthread_join(first, NULL);
	pthread_join(second, NULL);
	printf("main value = %d\n", (int) pthread_getspecific(key));
	return 0;
}
$> gcc 16_pthread_key.c \
    -pthread
$> ./a.out
value = 2
value = 1
main value = 0

The key is the same for everyone

But different values in different threads

Each thread has own version of the value

pthread [31]

volatile bool finish = false;
volatile bool ptr_is_set = false;
volatile int *a_ptr = NULL;

/*
 * Publishes the address of a local variable through the global
 * a_ptr, so that main can read this thread's stack. The function
 * keeps spinning until 'finish' is set, which keeps the local alive
 * while main reads it.
 */
void *
thread_f(void *arg)
{
	int a = 100;
	/* %p expects a void *, hence the cast. */
	printf("child stack top = %p\n", (void *) &a);
	/* The pointer is stored before the flag announcing it is raised. */
	a_ptr = &a;
	ptr_is_set = true;
	while (! finish) {}
	return NULL;
}

/*
 * Prints the address of a local in main, starts a thread which
 * publishes the address of its own local via a_ptr, and then reads
 * that foreign local directly through the pointer.
 */
int
main()
{
	pthread_t t;
	/* %p expects a void *, hence the cast. */
	printf("main stack top = %p\n", (void *) &t);
	pthread_create(&t, NULL, thread_f, NULL);
	/* Wait until the thread has published the pointer. */
	while (! ptr_is_set) {}
	printf("foreign a = %d\n", *a_ptr);
	/* Only now let the thread leave its function and finish. */
	finish = true;
	pthread_join(t, NULL);
	return 0;
}
$> # Linux
$> gcc 17_thread_stacks.c \
    -pthread
$> ./a.out
main stack top = 0x7ffeffb1fe90
child stack top = 0x7f0c9de9cee4
foreign a = 100

$> # Mac
$> clang 17_thread_stacks.c
$> ./a.out
main stack top = 0x7ffee3a53aa0
child stack top = 0x70000521bee4
foreign a = 100

pthread [32]

int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
                   void *(*start_routine) (void *), void *arg);

int
pthread_attr_init(pthread_attr_t *attr);

int
pthread_attr_destroy(pthread_attr_t *attr);

int
pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize);

int
pthread_attr_getstacksize(const pthread_attr_t *attr, size_t *stacksize);

int
pthread_attr_setstack(pthread_attr_t *attr,
                      void *stackaddr, size_t stacksize);

int
pthread_attr_getstack(const pthread_attr_t *attr,
                      void **stackaddr, size_t *stacksize);

int
pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate);

int
pthread_attr_getdetachstate(const pthread_attr_t *attr,
                            int *detachstate);

Summary

Linux doesn't have threads or processes - it has just tasks which are created with clone(). Threads and processes are just clone-flag combinations.

The volatile keyword protects accesses to a variable from being moved or removed by the compiler. Such variables can change in ways the compiler does not expect.

Atomics allow changing simple variables from multiple threads in a safe way. Without them, threads can corrupt those shared variables.

For protecting non-trivial code blocks there are mutexes and spinlocks. They allow one thread to execute the protected code at a time.

Pthread is a platform-agnostic (except Windows) standard library for working with threads and for their synchronization.

In C there is the keyword __thread (in C++ - thread_local) which gives each thread its own copy of the marked variable.

Practice

Thread pool

You need to implement a thread pool. It takes callbacks and executes them in worker threads. The callbacks are wrapped into "tasks" which have their own API for checking their state, for joining, and for deletion. You are given the documented API, a solution template, and some unit tests which should pass. Note that passing the tests is a necessary but not a sufficient step in doing this homework.

Points: 15 - 25.

Deadline: 3 weeks.

Penalty: -1 for each day after deadline, max -10

Publish your solution on Github and give me the link. Assessment: any way you want - messengers, calls, emails.

Conclusion

Next time:

IPC. Pipe, FIFO. XSI and POSIX. Sockets: domain, network.


Press the heart if you liked the lecture

System programming 6

By Vladislav Shpilevoy

System programming 6

Threads in Linux, kernel representation. Difference from processes. POSIX. Synchronisation: mutex, rw-lock, condition variable, spin lock, barrier. Atomic operations. Attributes of thread and synchronisation object.

  • 1,795