Version: 3

System programming

Lecture 2:

Process. Work modes, memory, resources. Interrupts. Communication with the kernel. System calls.

Education

Lecture plan

  • "Process" overview
  • Fork()
  • Process memory sections
  • File descriptors
  • Pipe()
  • Process interrupts
  • System calls

Linux kernel

Processes

Hardware

Time

Filesystem

IPC

Network

Users

Data structures

Virtualization

Process [1]

.text

.data

.stack

.heap

.stack

.stack

File descriptors

Signal queue

IPC

Memory

int
main(int argc, char **argv)
{
	/* set locale to make iswXXXX function work */
	if (setlocale(LC_CTYPE, "C.UTF-8") == NULL &&
	    setlocale(LC_CTYPE, "en_US.UTF-8") == NULL &&
	    setlocale(LC_CTYPE, "en_US.utf8") == NULL)
		fprintf(stderr, "Failed to set locale to C.UTF-8\n");
	fpconv_check();

	/* Enter interactive mode after executing 'script' */
	bool interactive = false;
	/* Lua interpeter options, e.g. -e and -l */
	int optc = 0;
	char **optv = NULL;
	auto guard = make_scoped_guard([=]{ if (optc) free(optv); });

	static struct option longopts[] = {
		{"help", no_argument, 0, 'h'},
		{"version", no_argument, 0, 'v'},
		{NULL, 0, 0, 0},
	};
	static const char *opts = "+hVvie:l:";

	int ch;
	while ((ch = getopt_long(argc, argv, opts, longopts, NULL)) != -1) {
		switch (ch) {
		case 'V':
		case 'v':
			print_version();
			return 0;

Process is a virtualization mechanism

P1

P2

P1

P2

of processor

of memory

Process [2]

/**
 * 30.09.2018
 * #include/linux/sched.h
 * 618 lines.
 *
 * Abridged excerpt of the kernel's per-task descriptor: only a handful
 * of fields are listed (the "618 lines" note presumably refers to the
 * length of the full definition).
 */
struct task_struct {
	struct thread_info		thread_info;
	/* Task state; init_task in this file sets it to 0 and labels that "Runnable". */
	volatile long			state;
	/* NOTE(review): presumably the kernel-mode stack of the task - confirm. */
	void				*stack;
	atomic_t			usage;
	unsigned int			cpu;
	int				prio;
	/* Memory descriptor of the process (struct mm_struct). */
	struct mm_struct		*mm;
	int				exit_state;
	int				exit_code;
	int				exit_signal;
	pid_t				pid;
	/* parent and children link the tasks into the process tree. */
	struct task_struct              *parent;
	struct list_head		children;
	u64				start_time;
	const struct cred		*cred;
	/* Table of the file descriptors opened by the process. */
	struct files_struct		*files;
	struct thread_struct		thread;
};

Process [3]

Fork [1]

Parent process

Child process

Parent code section

Parent code section

New program's code section

fork();
int
execl/le/lp/v/vp/vP(const char *path,
                    const char *arg0, ...);
pid = wait(&status);
exit();

Fork [2]

vladislav$ ps aux
USER  PID  %CPU %MEM      VSZ    RSS   TT  STAT STARTED      TIME COMMAND
root    1   0,0  0,1  4373080  15616   ??  Ss   31aug18  90:58.65 /sbin/launchd

Mac

vladislav$ ps aux
USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
root         1  0.0  0.2 160256  9636 ?        Ss   oct05   0:06 /sbin/init

Linux

/*
 * Statically initialised task descriptor named init_task. It points to
 * itself as its parent, i.e. nothing sits above it in the process tree.
 */
struct task_struct init_task = {
	.state		= 0, /* Runnable. */
	.stack		= init_stack,
	.usage		= ATOMIC_INIT(2),
	.flags		= PF_KTHREAD,
	.prio		= MAX_PRIO - 20,
	.policy		= SCHED_NORMAL,
	.cpus_allowed	= CPU_MASK_ALL,
	/* No memory descriptor is attached to this task. */
	.mm		= NULL,
	/* Self-references: this task is its own parent. */
	.real_parent	= &init_task,
	.parent		= &init_task,
	.files		= &init_files,
	.signal		= &init_signals,
};
EXPORT_SYMBOL(init_task);
/* The two fields that build the process tree: a link up and a list down. */
struct task_struct {
        struct task_struct *parent;
        struct list_head children;
};

Process tree

Fork [3]

/**
 * Demonstrates fork(): after the call there are two processes, each
 * with its own copy of the memory, and the parent learns the child's
 * exit code through wait().
 *
 * The parent zero-fills a buffer, forks, waits for the child and then
 * prints the first byte of its buffer and the child's exit code. The
 * child overwrites its own copy of the buffer with 1 and terminates
 * with code 100.
 *
 * @return 0 in the parent, 100 in the child, 1 when calloc(), fork()
 *         or wait() fails.
 */
int main()
{
	printf("I am process %d\n", (int) getpid());
	/* Zero-filled before fork(), so both processes start with mem[0] == 0. */
	char *mem = (char *) calloc(1, 100);
	if (mem == NULL) {
		perror("calloc");
		return 1;
	}
	/*
	 * Flush before forking: otherwise text still sitting in the stdio
	 * buffer (e.g. when stdout is a file or a pipe) is inherited by the
	 * child and printed twice.
	 */
	fflush(stdout);
	pid_t child_pid = fork();
	if (child_pid == -1) {
		/* No child exists, so there is nothing to wait for. */
		perror("fork");
		free(mem);
		return 1;
	}
	pid_t my_pid = getpid();
	if (child_pid == 0) {
		printf("%d: I am child, fork returned %d\n",
		       (int) my_pid, (int) child_pid);
		printf("%d: child is terminated with code 100\n",
		       (int) my_pid);
		printf("%d: memory values are set to 1\n", (int) my_pid);
		/* Touches only the child's copy of the buffer. */
		memset(mem, 1, 100);
		return 100;
	}
	printf("%d: I am parent, fork returned %d\n",
	       (int) my_pid, (int) child_pid);
	int stat = 0;
	pid_t wait_result = wait(&stat);
	if (wait_result == -1) {
		/* The status was not filled in; do not decode it. */
		perror("wait");
		free(mem);
		return 1;
	}
	printf("%d: wait returned %d and stat %d\n", (int) my_pid,
	       (int) wait_result, stat);
	printf("%d: memory values are %d\n", (int) my_pid, (int) mem[0]);
	printf("%d: returned child code was %d\n", (int) my_pid,
	       WEXITSTATUS(stat));
	free(mem);
	return 0;
}

Now there are 2 processes

In the child it is true

/**
 * Demonstrates fork(): after the call there are two processes, each
 * with its own copy of the memory, and the parent learns the child's
 * exit code through wait().
 *
 * The parent zero-fills a buffer, forks, waits for the child and then
 * prints the first byte of its buffer and the child's exit code. The
 * child overwrites its own copy of the buffer with 1 and terminates
 * with code 100.
 *
 * @return 0 in the parent, 100 in the child, 1 when calloc(), fork()
 *         or wait() fails.
 */
int main()
{
	printf("I am process %d\n", (int) getpid());
	/* Zero-filled before fork(), so both processes start with mem[0] == 0. */
	char *mem = (char *) calloc(1, 100);
	if (mem == NULL) {
		perror("calloc");
		return 1;
	}
	/*
	 * Flush before forking: otherwise text still sitting in the stdio
	 * buffer (e.g. when stdout is a file or a pipe) is inherited by the
	 * child and printed twice.
	 */
	fflush(stdout);
	pid_t child_pid = fork();
	if (child_pid == -1) {
		/* No child exists, so there is nothing to wait for. */
		perror("fork");
		free(mem);
		return 1;
	}
	pid_t my_pid = getpid();
	if (child_pid == 0) {
		printf("%d: I am child, fork returned %d\n",
		       (int) my_pid, (int) child_pid);
		printf("%d: child is terminated with code 100\n",
		       (int) my_pid);
		printf("%d: memory values are set to 1\n", (int) my_pid);
		/* Touches only the child's copy of the buffer. */
		memset(mem, 1, 100);
		return 100;
	}
	printf("%d: I am parent, fork returned %d\n",
	       (int) my_pid, (int) child_pid);
	int stat = 0;
	pid_t wait_result = wait(&stat);
	if (wait_result == -1) {
		/* The status was not filled in; do not decode it. */
		perror("wait");
		free(mem);
		return 1;
	}
	printf("%d: wait returned %d and stat %d\n", (int) my_pid,
	       (int) wait_result, stat);
	printf("%d: memory values are %d\n", (int) my_pid, (int) mem[0]);
	printf("%d: returned child code was %d\n", (int) my_pid,
	       WEXITSTATUS(stat));
	free(mem);
	return 0;
}
vladislav$ gcc 1_fork.c 
vladislav$ ./a.out
I am process 45601
45601: I am parent, fork returned 45602
45602: I am child, fork returned 0
45602: child is terminated with code 100
45602: memory values are set to 1
45601: wait returned 45602 and stat 25600
45601: memory values are 0
45601: returned child code was 100

The child does something, prints ...

Finishes with code 100

The parent waits for the child termination

Obtains its return code - 100

vladislav$> man wait # or man 2 wait
WIFEXITED(status)
WIFSIGNALED(status)
WIFSTOPPED(status)

WEXITSTATUS(status)
WTERMSIG(status)
WCOREDUMP(status)
WSTOPSIG(status)

Fork [4]

Why is fork() fast even in big processes?

Copy On Write - COW

1 point

Copy on write [1]

Parent

Child

int a = 100;
a = 200;
a = 200;

Copy the page

int a = 100;
int a = 200;

Copy on write [2]

/**
 * Demonstrates fork(): after the call there are two processes, each
 * with its own copy of the memory, and the parent learns the child's
 * exit code through wait().
 *
 * The parent zero-fills a buffer, forks, waits for the child and then
 * prints the first byte of its buffer and the child's exit code. The
 * child overwrites its own copy of the buffer with 1 and terminates
 * with code 100.
 *
 * @return 0 in the parent, 100 in the child, 1 when calloc(), fork()
 *         or wait() fails.
 */
int main()
{
	printf("I am process %d\n", (int) getpid());
	/* Zero-filled before fork(), so both processes start with mem[0] == 0. */
	char *mem = (char *) calloc(1, 100);
	if (mem == NULL) {
		perror("calloc");
		return 1;
	}
	/*
	 * Flush before forking: otherwise text still sitting in the stdio
	 * buffer (e.g. when stdout is a file or a pipe) is inherited by the
	 * child and printed twice.
	 */
	fflush(stdout);
	pid_t child_pid = fork();
	if (child_pid == -1) {
		/* No child exists, so there is nothing to wait for. */
		perror("fork");
		free(mem);
		return 1;
	}
	pid_t my_pid = getpid();
	if (child_pid == 0) {
		printf("%d: I am child, fork returned %d\n",
		       (int) my_pid, (int) child_pid);
		printf("%d: child is terminated with code 100\n",
		       (int) my_pid);
		printf("%d: memory values are set to 1\n", (int) my_pid);
		/* Touches only the child's copy of the buffer. */
		memset(mem, 1, 100);
		return 100;
	}
	printf("%d: I am parent, fork returned %d\n",
	       (int) my_pid, (int) child_pid);
	int stat = 0;
	pid_t wait_result = wait(&stat);
	if (wait_result == -1) {
		/* The status was not filled in; do not decode it. */
		perror("wait");
		free(mem);
		return 1;
	}
	printf("%d: wait returned %d and stat %d\n", (int) my_pid,
	       (int) wait_result, stat);
	printf("%d: memory values are %d\n", (int) my_pid, (int) mem[0]);
	printf("%d: returned child code was %d\n", (int) my_pid,
	       WEXITSTATUS(stat));
	free(mem);
	return 0;
}

The memory is zeroed before fork()

/**
 * Demonstrates fork(): after the call there are two processes, each
 * with its own copy of the memory, and the parent learns the child's
 * exit code through wait().
 *
 * The parent zero-fills a buffer, forks, waits for the child and then
 * prints the first byte of its buffer and the child's exit code. The
 * child overwrites its own copy of the buffer with 1 and terminates
 * with code 100.
 *
 * @return 0 in the parent, 100 in the child, 1 when calloc(), fork()
 *         or wait() fails.
 */
int main()
{
	printf("I am process %d\n", (int) getpid());
	/* Zero-filled before fork(), so both processes start with mem[0] == 0. */
	char *mem = (char *) calloc(1, 100);
	if (mem == NULL) {
		perror("calloc");
		return 1;
	}
	/*
	 * Flush before forking: otherwise text still sitting in the stdio
	 * buffer (e.g. when stdout is a file or a pipe) is inherited by the
	 * child and printed twice.
	 */
	fflush(stdout);
	pid_t child_pid = fork();
	if (child_pid == -1) {
		/* No child exists, so there is nothing to wait for. */
		perror("fork");
		free(mem);
		return 1;
	}
	pid_t my_pid = getpid();
	if (child_pid == 0) {
		printf("%d: I am child, fork returned %d\n",
		       (int) my_pid, (int) child_pid);
		printf("%d: child is terminated with code 100\n",
		       (int) my_pid);
		printf("%d: memory values are set to 1\n", (int) my_pid);
		/* Touches only the child's copy of the buffer. */
		memset(mem, 1, 100);
		return 100;
	}
	printf("%d: I am parent, fork returned %d\n",
	       (int) my_pid, (int) child_pid);
	int stat = 0;
	pid_t wait_result = wait(&stat);
	if (wait_result == -1) {
		/* The status was not filled in; do not decode it. */
		perror("wait");
		free(mem);
		return 1;
	}
	printf("%d: wait returned %d and stat %d\n", (int) my_pid,
	       (int) wait_result, stat);
	printf("%d: memory values are %d\n", (int) my_pid, (int) mem[0]);
	printf("%d: returned child code was %d\n", (int) my_pid,
	       WEXITSTATUS(stat));
	free(mem);
	return 0;
}
vladislav$ gcc 1_fork.c 
vladislav$ ./a.out
I am process 45601
45601: I am parent, fork returned 45602
45602: I am child, fork returned 0
45602: child is terminated with code 100
45602: memory values are set to 1
45601: wait returned 45602 and stat 25600
45601: memory values are 0
45601: returned child code was 100

The child changes the value to 1

The parent is not affected

Copy on write [3]

pid_t
vfork(void);
  • Makes it possible not to bother with COW at all - just create a new process
  • Extra fast - does not depend on parent process page count
  • But the parent is blocked until the child calls exec or exit function

Zombie

Why is wait() needed?

exit();

Zombie-process

wait();

Zombie-process

Parent

vladislav$ ps aux | grep a.out
v.shpilevoy  45758   0,0  0,0  0  0 s000  Z   2:15   0:00.00 (a.out)
/**
 * Zombie demo. The child returns right after fork(); the parent sleeps
 * for 15 seconds before collecting it with wait(), so during that time
 * the finished child shows up in ps with state Z.
 */
int main()
{
	pid_t pid = fork();

	if (pid != 0) {
		/* Parent: leave the terminated child uncollected for a while. */
		sleep(15);
		wait(NULL);
	}
	return 0;
}

Exit

/**
 * Program entry point used instead of main() (the program is built
 * with -nostartfiles / -e). Prints a greeting and terminates the
 * process explicitly, because nothing is there to return to.
 */
void
_start()
{
	fputs("hello, world\n", stdout);
	exit(0);
}

Linux

Mac

$> gcc -nostartfiles 1_5_exit.c
$> ./a.out
hello, world

$>
$> gcc -e __start 1_5_exit.c
$> ./a.out
hello, world

$>
/**
 * Same entry point as before, but with the exit() call disabled on
 * purpose: the function simply returns, and the run shown right after
 * this listing ends with "Segmentation fault (core dumped)".
 */
void
_start()
{
	printf("hello, world\n");
	//exit(0);
}
$> gcc -nostartfiles 1_5_exit.c
$> ./a.out
hello, world
Segmentation fault (core dumped)
$>

Intermediate summary

Process' life:

  • birth in fork(), starting from init (pid 1)
  • fork() turns on COW
  • parent collects return status from wait()
  • fork() - exit() - wait()

Process resources:

  • memory
  • file descriptors
  • IPC

Process address space

0x0

0xffffffff

.text

.data

.bss

.heap

.stack

.env

Section .text

Sections .data and .bss

Section .bss

Section .data

void test_f() {
        static int a = 100;
}

const char *b = NULL;
long c[3] = {1, 2, 3};
static int d = 200;
void test_f() {
        static int a;
}

const char *b;
long c[3];
static int d;

Section .heap

void
free(void *ptr);

void *
malloc(size_t size);

void *
brk(const void *addr);

0x0

0xffffffff

32MB

32MB

16MB

16MB

16MB

16MB

8MB

8MB

8MB

8MB

8MB

8MB

8MB

8MB

Slab-allocator

Slicing of big slabs into smaller allocations and their handing out

malloc(14 * 1024 * 1024)
malloc(8 * 1024 * 1024)
brk();

Section .mmap

.heap

.mmap

< 32Mb

>= 32Mb

malloc();
mmap();
dlopen();

Sections in the kernel [1]

/**
 * 08.10.2018
 * 162 lines.
 *
 * Abridged excerpt of the memory descriptor of a process - it covers
 * all the memory segments of the process (the "162 lines" note
 * presumably refers to the length of the full definition).
 */
struct mm_struct {
	/* List of the memory segments, one struct vm_area_struct each. */
	struct vm_area_struct *mmap;
	/* Size of the process memory. */
	unsigned long task_size;

	/* Borders of the sections: code (.text), data (.data), brk (.heap), stack. */
	unsigned long start_code, end_code;
        unsigned long start_data, end_data;
	unsigned long start_brk, brk;
        unsigned long start_stack;
	/* NOTE(review): arg_* presumably bound the command line arguments; env_* bound .env. */
	unsigned long arg_start, arg_end;
        unsigned long env_start, env_end;
};
/**
 * 08.10.2018
 * 63 lines.
 *
 * Abridged excerpt of the descriptor of one memory segment (the
 * "63 lines" note presumably refers to the length of the full
 * definition).
 */
struct vm_area_struct {
	/* Borders of the segment. */
	unsigned long vm_start;
	unsigned long vm_end;
	/* Neighbouring segments in the list. */
	struct vm_area_struct *vm_next;
        struct vm_area_struct *vm_prev;
	/* Access flags: VM_READ, VM_WRITE, VM_EXEC, VM_SHARED. */
	unsigned long vm_flags;
	/* Content of the segment. */
	struct file * vm_file;
	void * vm_private_data;
};

Process memory - all the segments

Segment list

struct vm_area_struct *mmap;

Process memory size

unsigned long task_size;

Addresses of sections .text, .data, .heap, .stack, .env

unsigned long start_code, end_code;
unsigned long start_data, end_data;
unsigned long start_brk, brk;
unsigned long start_stack;
unsigned long arg_start, arg_end;
unsigned long env_start, env_end;

One segment

unsigned long vm_start;
unsigned long vm_end;

Borders

struct vm_area_struct *vm_next;
struct vm_area_struct *vm_prev;

Neighbours

struct file * vm_file;
void * vm_private_data;

Content

unsigned long vm_flags;

Access flags

#define VM_READ		0x00000001
#define VM_WRITE	0x00000002
#define VM_EXEC		0x00000004
#define VM_SHARED	0x00000008

Sections in the kernel [2]

0x0

0xffffffffffffffff

'Holes' in the virtual address space - no physical mapping. Access attempt = Segmentation Fault

Sections in user space - .text [1]

/* Global without an initializer. */
int uninitialized;

/* Global pointer initialised with the address of a string literal. */
const char *str = "const char *str";
/* Global constant array initialised with the characters themselves. */
const char str2[] = "const char str2[]";

/**
 * Prints the address of a local variable and the value of that
 * variable before anything has been assigned to it, then stores 10
 * into it.
 *
 * The other locals are only declared and initialised - nothing reads
 * them; apparently they exist so that their initialisers can be looked
 * up in the dump of the compiled object.
 */
void test_stack(void)
{
	int a;
	printf("stack top in test_stack: %p\n", &a);
	/* Local pointer to a literal, local constant array, local mutable array. */
	const char *str3 = "const char *str3";
	const char str4[] = "const char str4[]";
	char str5[] = "char str5[]";
	char b = 'x';
	char c = 'x';
	char d = 'x';
	int e = 32;
	int f = 64;
	int g = 128;
	/* Deliberately reads 'a' while it is still unassigned. */
	printf("a = %d\n", a);
	a = 10;
}

/** Prints the address of its own local and calls test_stack() twice. */
int main(void)
{
	int a = 20;
	printf("stack top in main: %p\n", &a);
	test_stack();
	test_stack();
	return 0;
}
vladislav$> gcc -c 2_proc_memory.c -o obj.o
vladislav$> objdump -s -d obj.o
Содержимое раздела .text:
 0000 554889e5 4883ec40 488d3dee 00000048  UH..H..@H.=....H
 0010 8d75e848 8b050000 0000488b 00488945  .u.H......H..H.E
 0020 f8b000e8 00000000 488d3d08 01000048  ........H.=....H
 0030 8d35e400 00004889 75e0488b 35ea0000  .5....H.u.H.5...
 0040 00488975 ec8b0d08 00000089 4df4c645  .H.u........M..E
 0050 df78c645 de78c645 dd78c745 d8200000  .x.E.x.E.x.E. ..
 0060 00c745d4 40000000 c745d080 0000008b  ..E.@....E......
 0070 75e88945 ccb000e8 00000000 c745e80a  u..E.........E..
 0080 00000048 8b3d0000 0000488b 3f488b55  ...H.=....H.?H.U
 0090 f84839d7 8945c80f 85060000 004883c4  .H9..E.......H..
 00a0 405dc3e8 00000000 0f1f8400 00000000  @]..............
 00b0 554889e5 4883ec10 488d3d80 00000048  UH..H...H.=....H
 00c0 8d75f8c7 45fc0000 0000c745 f8140000  .u..E......E....
 00d0 00b000e8 00000000 8945f4e8 00000000  .........E......
 00e0 e8000000 0031c048 83c4105d c3        .....1.H...].   
Содержимое раздела .cstring:
 00ed 636f6e73 74206368 6172202a 73747200  const char *str.
 00fd 73746163 6b20746f 7020696e 20746573  stack top in tes
 010d 745f7374 61636b3a 2025700a 00636f6e  t_stack: %p..con
 011d 73742063 68617220 2a737472 33006368  st char *str3.ch
 012d 61722073 7472355b 5d006120 3d202564  ar str5[].a = %d
 013d 0a007374 61636b20 746f7020 696e206d  ..stack top in m
 014d 61696e3a 2025700a 00                 ain: %p..       
Содержимое раздела .data:
 0158 ed000000 00000000                    ........        
Содержимое раздела .const:
 0160 636f6e73 74206368 61722073 7472325b  const char str2[
 0170 5d000000 00000000 00000000 00000000  ]...............
 0180 636f6e73 74206368 61722073 7472345b  const char str4[
 0190 5d00                                 ].

Mac

Linux

Contents of section .text:
 0000 554889e5 4883ec50 64488b04 25280000  UH..H..PdH..%(..
 0010 00488945 f831c048 8d45b848 89c6488d  .H.E.1.H.E.H..H.
 0020 3d000000 00b80000 0000e800 00000048  =..............H
 0030 8d050000 00004889 45c848b8 636f6e73  ......H.E.H.cons
 0040 74206368 48ba6172 20737472 345b4889  t chH.ar str4[H.
 0050 45e04889 55e866c7 45f05d00 48b86368  E.H.U.f.E.].H.ch
 0060 61722073 74724889 45d4c745 dc355b5d  ar strH.E..E.5[]
 0070 00c645b5 78c645b6 78c645b7 78c745bc  ..E.x.E.x.E.x.E.
 0080 20000000 c745c040 000000c7 45c48000   ....E.@....E...
 0090 00008b45 b889c648 8d3d0000 0000b800  ...E...H.=......
 00a0 000000e8 00000000 c745b80a 00000090  .........E......
 00b0 488b45f8 64483304 25280000 007405e8  H.E.dH3.%(...t..
 00c0 00000000 c9c35548 89e54883 ec106448  ......UH..H...dH
 00d0 8b042528 00000048 8945f831 c0c745f4  ..%(...H.E.1..E.
 00e0 14000000 488d45f4 4889c648 8d3d0000  ....H.E.H..H.=..
 00f0 0000b800 000000e8 00000000 b8000000  ................
 0100 00e80000 0000b800 000000e8 00000000  ................
 0110 b8000000 00488b55 f8644833 14252800  .....H.U.dH3.%(.
 0120 00007405 e8000000 00c9c3             ..t........
Contents of section .text:
UH..H..PdH..%(...H.E.1.H.E.H..H.
=..............H......H.E.H.const
chH.ar str4[H.E.H.U.f.E.].H.char
strH.E..E.5[]..E.x.E.x.E.x.E. ...
.E.@....E......E...H.=...........
....E......H.E.dH3.%(...t........
UH..H...dH..%(...H.E.1..E.....H.E
.H..H.=..........................
.............H.U.dH3.%(...t........

mov    %rax,-0x2c(%rbp)
movl   $0x5d5b35,-0x24(%rbp)
movb   $0x78,-0x4b(%rbp)
movb   $0x78,-0x4a(%rbp)
movb   $0x78,-0x49(%rbp)
movl   $0x20,-0x44(%rbp)
movl   $0x40,-0x40(%rbp)
movl   $0x80,-0x3c(%rbp)

Sections in user space - .text [2]

Where are str, str2, str3? Why?

In .data, because they are all global and initialized

1 point

Sections in user space - .text [3]

const char *s = "abc";



const char s[] = "abc";



char s[] = "abc";

Variable s and constant string "abc".

Constant array s.

Array s.

s = "cde";
s[0] = 'c';

Sections in user space - .text [4]

Why are str2 (.data) and str4 (.text) in different sections?

Because of different visibility scopes: str2 is global and fell into .data.

/* Global pointer initialised with the address of a string literal. */
const char *str = "const char *str";
/* Global constant array initialised with the characters themselves. */
const char str2[] = "const char str2[]";

/**
 * Prints the address of a local variable and the value of that
 * variable before anything has been assigned to it, then stores 10
 * into it. The other locals are only declared and initialised.
 */
void
test_stack()
{
	int a;
	printf("stack top in test_stack: %p\n", &a);
	/* Local pointer to a literal, local constant array, local mutable array. */
	const char *str3 = "const char *str3";
	const char str4[] = "const char str4[]";
	char str5[] = "char str5[]";
	char b = 'x';
	char c = 'x';
	char d = 'x';
	int e = 32;
	int f = 64;
	int g = 128;
	/* Deliberately reads 'a' while it is still unassigned. */
	printf("a = %d\n", a);
	a = 10;
}

1 point

Sections in user space - .stack [1]

Push, call

Return

LIFO

0xffffffff

0x0

Sections in user space - .stack [2]

/* Global without an initializer. */
int uninitialized;
/* Global pointer initialised with the address of a string literal. */
const char *str = "const char *str";
/* Global constant array initialised with the characters themselves. */
const char str2[] = "const char str2[]";

/**
 * Fills a 128-byte local array with zeroes and prints its address.
 * In the run shown after this listing the next test_stack() call
 * prints a = 0.
 */
void another_function(void)
{
	char array[128];
	memset(array, 0, sizeof(array));
	printf("called another function, stack is %p\n", array);
}

/**
 * Prints the address of a local variable and the value of that
 * variable before anything has been assigned to it, then stores 10
 * into it. The other locals are only declared and initialised.
 */
void test_stack(void)
{
	int a;
	printf("stack top in test_stack: %p\n", &a);
	const char *str3 = "const char *str3";
	const char str4[] = "const char str4[]";
	char str5[] = "char str5[]";
	char b = 'x';
	char c = 'x';
	char d = 'x';
	int e = 32;
	int f = 64;
	int g = 128;
	/* Deliberately reads 'a' while it is still unassigned. */
	printf("a = %d\n", a);
	a = 10;
}

/**
 * Calls test_stack() twice in a row, then another_function(), then
 * test_stack() once more, printing the address of its own local first.
 */
int main(void)
{
	int a = 20;
	printf("stack top in main: %p\n", &a);
	test_stack();
	test_stack();
	another_function();
	test_stack();
	return 0;
}
vladislav$> gcc -c 2_proc_memory.c -o obj.o
vladislav$> gcc obj.o
vladislav$> ./a.out
stack top in main: 0x7ffeeaa5e9f8
stack top in test_stack: 0x7ffeeaa5e9c8
a = -358225424
stack top in test_stack: 0x7ffeeaa5e9c8
a = 10
called another function, stack is 0x7ffeeaa5e950
stack top in test_stack: 0x7ffeeaa5e9c8
a = 0

Stack grows down

It is reused

But reused by all functions

/proc/<pid>/maps [1]

/**
 * Prints the memory map of the running process from /proc/<pid>/maps.
 *
 * Before the map is read, the first 100 bytes of "3_fs_proc.c" are
 * mapped read-only with MAP_SHARED, so that a file-backed shared
 * segment is part of the listing. That mapping is optional: when it
 * can not be created, a note goes to stderr and the map is still
 * printed.
 *
 * @return 0 on success; the process exits with status 1 when the maps
 *         file can not be opened.
 */
int main()
{
	int fd_me = open("3_fs_proc.c", O_RDONLY);
	char *shared_mem = (char *) MAP_FAILED;
	if (fd_me != -1) {
		shared_mem = (char *) mmap(NULL, 100, PROT_READ,
					   MAP_FILE | MAP_SHARED, fd_me, 0);
	}
	if (shared_mem == (char *) MAP_FAILED) {
		fprintf(stderr, "can not map 3_fs_proc.c: %s\n",
			strerror(errno));
	}
	char buf[128];

	snprintf(buf, sizeof(buf), "/proc/%d/maps", (int) getpid());
	/* Printed before open() so that nothing runs between open() and reading errno. */
	printf("print %s\n", buf);
	int fd = open(buf, O_RDONLY);
	if (fd == -1) {
		printf("exit %s\n", strerror(errno));
		exit(1);
	}
	ssize_t nbyte;
	/* read() does not terminate the data: print exactly nbyte characters. */
	while ((nbyte = read(fd, buf, sizeof(buf))) > 0)
		printf("%.*s", (int) nbyte, buf);
	printf("\n");
	close(fd);
	/* Release only what was actually acquired. */
	if (shared_mem != (char *) MAP_FAILED)
		munmap(shared_mem, 100);
	if (fd_me != -1)
		close(fd_me);
	return 0;
}
vladislav$> gcc 3_fs_proc.c
vladislav$> ./a.out
sprintf(buf, "/proc/%d/maps", (int) getpid());
int fd = open(buf, O_RDONLY);

/proc/<pid>/maps [2]

vladislav$>./a.out
print /proc/816/maps
55cad3bee000-55cad3bef000 r-xp 00000000 08:01 3670028   /home/vladislav/a.out
55cad3def000-55cad3df0000 r--p 00001000 08:01 3670028   /home/vladislav/a.out
55cad3df0000-55cad3df1000 rw-p 00002000 08:01 3670028   /home/vladislav/a.out
55cad4479000-55cad449a000 rw-p 00000000 00:00 0         [heap]
7fe06c0a2000-7fe06c289000 r-xp 00000000 08:01 1315532   /lib/x86_64-linux-gnu/libc-2.27.so
7fe06c289000-7fe06c489000 ---p 001e7000 08:01 1315532   /lib/x86_64-linux-gnu/libc-2.27.so
7fe06c489000-7fe06c48d000 r--p 001e7000 08:01 1315532   /lib/x86_64-linux-gnu/libc-2.27.so
7fe06c48d000-7fe06c48f000 rw-p 001eb000 08:01 1315532   /lib/x86_64-linux-gnu/libc-2.27.so
7fe06c48f000-7fe06c493000 rw-p 00000000 00:00 0 
7fe06c493000-7fe06c4ba000 r-xp 00000000 08:01 1315504   /lib/x86_64-linux-gnu/ld-2.27.so
7fe06c6a4000-7fe06c6a6000 rw-p 00000000 00:00 0 
7fe06c6b9000-7fe06c6ba000 r--s 00000000 08:01 3670336   /home/vladislav/3_fs_proc.c
7fe06c6ba000-7fe06c6bb000 r--p 00027000 08:01 1315504   /lib/x86_64-linux-gnu/ld-2.27.so
7fe06c6bb000-7fe06c6bc000 rw-p 00028000 08:01 1315504   /lib/x86_64-linux-gnu/ld-2.27.so
7fe06c6bc000-7fe06c6bd000 rw-p 00000000 00:00 0 
7ffcb1886000-7ffcb18a7000 rw-p 00000000 00:00 0         [stack]
7ffcb1989000-7ffcb198c000 r--p 00000000 00:00 0         [vvar]
7ffcb198c000-7ffcb198e000 r-xp 00000000 00:00 0         [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
/*
 * One memory segment of a process, shortened to the fields relevant
 * for the listing above.
 * NOTE(review): each printed line presumably corresponds to one such
 * structure (address range, access flags, backing file) - confirm.
 */
struct vm_area_struct {
	/* Borders of the segment. */
	unsigned long vm_start;
	unsigned long vm_end;
	/* Access flags. */
	unsigned long vm_flags;
	/* Content of the segment. */
	struct file * vm_file;
	void * vm_private_data;
};

.text

.data

.bss

.heap

.mmap

.stack

.env

Shared library segment

mmap(shared)

/**
 * Compares shared and private memory across fork().
 *
 * One byte in an anonymous MAP_SHARED mapping and one byte in a
 * malloc()-ed buffer are set to 55. The child changes both to 56; the
 * parent waits for the child and prints both bytes. Only the change in
 * the shared mapping is visible to the parent.
 *
 * @return 0 on success (in both processes), 1 when mmap(), malloc() or
 *         fork() fails.
 */
int main()
{
	int rc = 0;
	char *shared_mem = (char *) mmap(NULL, 100, PROT_READ | PROT_WRITE,
					 MAP_ANON | MAP_SHARED, -1, 0);
	if (shared_mem == (char *) MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	char *private_mem = (char *) malloc(100);
	if (private_mem == NULL) {
		perror("malloc");
		munmap(shared_mem, 100);
		return 1;
	}
	shared_mem[0] = 55;
	private_mem[0] = 55;
	pid_t child_pid = fork();
	if (child_pid == 0) {
		/* Child: modify both bytes and fall through to the cleanup. */
		shared_mem[0] = 56;
		private_mem[0] = 56;
	} else if (child_pid == -1) {
		perror("fork");
		rc = 1;
	} else {
		/* Parent: let the child finish first, then look at the memory. */
		wait(NULL);
		printf("shared: %d, private: %d\n", (int) shared_mem[0],
		       (int) private_mem[0]);
	}
	munmap(shared_mem, 100);
	free(private_mem);
	return rc;
}
vladislav$> gcc 4_shared_mem.c

vladislav$> ./a.out
shared: 56, private: 55

Private memory - COW

Shared memory - no  COW

The child changes both, and what?

Only shared memory change is visible in the parent

Sections in user space - .env

/**
 * Prints where different kinds of data live: the value of PATH from
 * the environment, the argv array, a local variable and a heap
 * allocation. Then dumps every environment string, one per line.
 *
 * @param argc Number of command line arguments (not used).
 * @param argv Command line arguments; only the address is printed.
 * @param env  NULL-terminated array of "name=value" strings.
 */
int main(int argc, char **argv, char **env)
{
	int a;
	void *m;

	printf("env: %p\n", getenv("PATH"));
	printf("argv: %p\n", argv);
	printf("stack: %p\n", &a);
	m = malloc(100);
	printf("heap: %p\n", m);
	free(m);

	/* Walk the array up to its NULL terminator. */
	for (char **entry = env; *entry != NULL; ++entry)
		printf("%s\n", *entry);
	return 0;
}
vladislav$> gcc 5_argv_env.c

vladislav$> ./a.out
env: 0x7ffda1c1ae8f
argv: 0x7ffda1c194f8
stack: 0x7ffda1c193f0
heap: 0x555d856c5670
LC_PAPER=ru_RU.UTF-8
LC_MONETARY=ru_RU.UTF-8
XDG_MENU_PREFIX=gnome-
LANG=en_US.UTF-8
DISPLAY=:0
GNOME_SHELL_SESSION_MODE=ubuntu
PWD=/home/vladislav
HOME=/home/vladislav

...
char *
getenv(const char *name);

int
setenv(const char *name, const char *value,
       int overwrite);

int
putenv(char *string);

int
unsetenv(const char *name);
"name=value"

Sections in user space - .debug

Contents of section .debug_info:
 0000 8f030000 04000000 00000801 00000000  ................
 0010 0c000000 00000000 00000000 00000000  ................
 0020 00110100 00000000 00000000 00020000  ................
Contents of section .debug_abbrev:
 0000 01110125 0e130b03 0e1b0e11 01120710  ...%............
 0010 17000002 1600030e 3a0b3b0b 49130000  ........:.;.I...
 0020 0324000b 0b3e0b03 0e000004 24000b0b  .$...>......$...
Contents of section .debug_aranges:
 0000 2c000000 02000000 00000800 00000000  ,...............
 0010 00000000 00000000 11010000 00000000  ................
 0020 00000000 00000000 00000000 00000000  ................
Contents of section .debug_line:
 0000 e0000000 0200b300 00000101 fb0e0d00  ................
 0010 01010101 00000001 0000012f 7573722f  .........../usr/
 0020 6c69622f 6763632f 7838365f 36342d6c  lib/gcc/x86_64-l
 0030 696e7578 2d676e75 2f372f69 6e636c75  inux-gnu/7/inclu
 0040 6465002f 7573722f 696e636c 7564652f  de./usr/include/
 0050 7838365f 36342d6c 696e7578 2d676e75  x86_64-linux-gnu
Contents of section .debug_str:
 0000 5f5f6f66 665f7400 5f494f5f 72656164  __off_t._IO_read
 0010 5f707472 005f6368 61696e00 73697a65  _ptr._chain.size
 0020 5f74005f 73686f72 74627566 00474e55  _t._shortbuf.GNU
 0030 20433131 20372e33 2e30202d 6d74756e   C11 7.3.0 -mtun
 0040 653d6765 6e657269 63202d6d 61726368  e=generic -march
 0050 3d783836 2d363420 2d67202d 66737461  =x86-64 -g -fsta
 0060 636b2d70 726f7465 63746f72 2d737472  ck-protector-str
 0070 6f6e6700 5f494f5f 325f315f 73746465  ong._IO_2_1_stde

File descriptors [1]

/* The task descriptor refers to the descriptors opened by the process. */
struct task_struct {
        /* ... */
        struct files_struct *files;
        /* ... */
};

/* Holder of the descriptor table. */
struct files_struct {
	struct fdtable *fdt;
};

/*
 * The descriptor table itself: an ordinary array of struct file
 * pointers. A descriptor number in user space is an index into fd.
 */
struct fdtable {
	/* NOTE(review): presumably the capacity of the fd array - confirm. */
	unsigned int max_fds;
	struct file **fd;
};

/* What a file descriptor is inside the kernel. */
struct file {
	struct path	  f_path;
	struct inode	  *f_inode;
	/* Reference counter - "count" on the diagrams, 2 after dup(). */
	atomic_long_t	  f_count;
	unsigned int 	  f_flags;
	fmode_t		  f_mode;
	/* Position in the file - "pos" on the diagrams. */
	loff_t		  f_pos;
	const struct cred *f_cred;
};

Process keeps opened file descriptors

In a table of descriptors

The table is just an ordinary array

File descriptor in the kernel is a structure. In user space - number, an index in the descriptor array

File descriptors [2]

Predefined standard descriptors

#define STDIN_FILENO 0
#define STDOUT_FILENO 1
#define STDERR_FILENO 2
/**
 * Writes a line to descriptor 1 directly with write() and then prints
 * the numeric value of STDOUT_FILENO.
 */
int main()
{
	char buf[] = "write to 1\n";
	/* sizeof(buf) counts the terminating '\0'; keep it out of the output. */
	write(STDOUT_FILENO, buf, sizeof(buf) - 1);
	printf("stdout fileno = %d\n", STDOUT_FILENO);
	return 0;
}
vladislav$> gcc 6_stdout.c

vladislav$> ./a.out
write to 1
stdout fileno = 1

More descriptors via open().

File descriptors [3]

task 1

task 2

struct file

stdout
fd1
fd1
stdout
pos   = 0
count = 1

struct file

pos   = 0
count = 1

struct inode

struct file: stdout

pos   = 0
count = 1

struct file: stdout

pos   = 0
count = 1

struct inode

struct inode

File descriptors [4]

Why can struct inode stdin/out/err of different processes be different?

Because of different consoles. Each console is a 'file'.

1 point

File descriptors [5]

/**
 * Opens tmp.txt (creating it when needed), waits for a key press and
 * then writes its own pid followed by a space into the file.
 *
 * The file is opened without O_APPEND, so every instance starts
 * writing at offset 0 - this is the appending problem being shown.
 */
int main()
{
	const int self = (int) getpid();
	int out = open("tmp.txt", O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);

	printf("press any key to write my pid %d\n", self);
	/* Gives time to start a second instance before anything is written. */
	getchar();

	dprintf(out, "%d ", self);
	close(out);
	return 0;
}

Problem of appending to a file

X 2

vladislav$> gcc 7_basic_append.c

vladislav$> ./a.out
press any key to write my pid 46152


vladislav$> ./a.out
press any key to write my pid 46153
vladislav$> #press enter
vladislav$> #press enter
vladislav$> cat tmp.txt
46153
open("tmp.txt", O_RDWR | O_CREAT | O_APPEND, S_IRUSR | S_IWUSR);

File descriptors [6]

Why doesn't lseek + write work as a replacement for O_APPEND?

Because of interrupts. A process can be interrupted after lseek before write. Another process can write into the file here.

1 point

Pipe [1]

int
dup(int fildes);

int
dup2(int fildes, int fildes2);

task 1

struct file

fd1
fd2
pos   = 0
count = 2

struct inode

fd2 = dup(fd1);
int
pipe(int fildes[2]);
vladislav$> ps aux | grep audio
_coreaudiod      37568   6048   ??  Ss   пт10   48:17.63 /usr/sbin/coreaudiod

Pipe

Pipe [2]

/**
 * Writes "1 " through a dup()-ed descriptor and then "2 " through the
 * original one. The file ends up with "1 2 ": the second write
 * continues where the first one stopped.
 */
int main()
{
	int original = open("tmp.txt", O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	int duplicate = dup(original);

	/* First write goes through the copy ... */
	dprintf(duplicate, "1 ");
	close(duplicate);

	/* ... the second through the descriptor it was copied from. */
	dprintf(original, "2 ");
	close(original);
	return 0;
}
vladislav$> gcc 8_dup.c

vladislav$> ./a.out

vladislav$> cat tmp.txt
1 2

Pipe [3]

What happens with file descriptors after fork()?

Nothing, they are kept as is. The child and the parent literally share one struct file, as if dup() had been called on every descriptor.

1 point

Pipe [4]

/**
 * Combines dup() and fork(): the child writes its pid through the
 * duplicate descriptor, the parent waits for the child and then writes
 * its own pid through the original descriptor. Each process first
 * closes the descriptor it does not use.
 */
int main()
{
	int original = open("tmp.txt", O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
	int duplicate = dup(original);
	int mine;

	if (fork() == 0) {
		/* Child works with the duplicate. */
		close(original);
		mine = duplicate;
	} else {
		/* Parent works with the original, after the child is done. */
		close(duplicate);
		wait(NULL);
		mine = original;
	}
	dprintf(mine, "%d ", (int) getpid());
	close(mine);
	return 0;
}
vladislav$> gcc 9_fork_dup.c

vladislav$> ./a.out

vladislav$> cat tmp.txt
46297 46296

Pipe [5]

task 1

task 2

struct file

fd1
fd2
pos   = 0
count = 2

struct inode

Pipe [6]

int
pipe(int fildes[2]);

task 1

struct file

fd[0]
fd[1]
pos   = 0
count = 1
flags = read

struct file

pos   = 0
count = 1
flags = write

Pipe [7]

/**
 * Two-way conversation between a parent and its child over two pipes:
 * to_child carries "hello1" from the parent to the child, to_parent
 * carries "hello2" back. Each side closes the pipe ends it does not
 * use and prints what it has read.
 */
int main()
{
	int to_parent[2];
	int to_child[2];
	pipe(to_child);
	pipe(to_parent);
	char buf[16];
	ssize_t nread;
	if (fork() == 0) {
		close(to_parent[0]);
		close(to_child[1]);
		/* Keep one byte for the terminator: read() does not add it. */
		nread = read(to_child[0], buf,
			     sizeof(buf) - 1);
		buf[nread > 0 ? nread : 0] = '\0';
		printf("%d: read %s\n",
                       (int) getpid(), buf);
		write(to_parent[1], "hello2",
                      sizeof("hello2"));
		return 0;
	}
	close(to_parent[1]);
	close(to_child[0]);
	/* The size must be taken from the very literal that is being sent. */
	write(to_child[1], "hello1",
              sizeof("hello1"));
	nread = read(to_parent[0], buf, sizeof(buf) - 1);
	buf[nread > 0 ? nread : 0] = '\0';
	printf("%d: read %s\n", (int) getpid(),
               buf);
	wait(NULL);
	return 0;
}
vladislav$> gcc 10_pipe.c

vladislav$> ./a.out
46312: read hello1
46311: read hello2

Two channels. to_child[1] -> to_child[0], to_parent[1] ->to_parent[0]

The child closes unused descriptors. Now it reads from to_child[0] and writes to to_parent[1].

The parent closes unused descriptors. Now it reads from to_parent[0] and writes to to_child[1].

Pipe [8]

parent

to_parent[0]

to_parent[0]
to_parent[1]
to_child[0]
to_child[1]
pos   = 0
count = 2
flags = read

child

to_parent[0]
to_parent[1]
to_child[0]
to_child[1]
pos   = 0
count = 2
flags = write
pos   = 0
count = 2
flags = read
pos   = 0
count = 2
flags = write

to_parent[1]

to_child[0]

to_child[1]


to_parent[1]
to_child[0]

pos   = 0
count = 1
flags = read
pos   = 0
count = 1
flags = write
to_parent[0]


to_child[1]
pos   = 0
count = 1
flags = write
pos   = 0
count = 1
flags = read

Pipe [9]

/**
 * Shows when the reading side of a pipe gets end of file.
 *
 * The parent writes "100" into the pipe and waits for the child; the
 * child reads from the pipe until read() stops returning data. When
 * the first command line argument is "close", both processes close
 * their write end (channel[1]) as soon as they no longer need it and
 * the child reaches EOF. Without it the write ends stay open, the
 * child never sees EOF and the program hangs.
 *
 * @param argc Number of command line arguments.
 * @param argv argv[1] may be "close" to enable closing of channel[1].
 */
int
main(int argc, const char **argv)
{
	int channel[2];
	int need_close = argc > 1 &&
                         strcmp(argv[1], "close") == 0;
	pipe(channel);
	if (fork() == 0) {
		char buf[16];
		ssize_t nread;
		printf("child: started\n");
		if (need_close) {
			printf("child: close output channel\n");
			close(channel[1]);
		}
		/* read() does not terminate the data: print exactly nread bytes. */
		while ((nread = read(channel[0], buf, sizeof(buf))) > 0)
			printf("child: read %.*s\n", (int) nread, buf);
		printf("child: EOF\n");
		return 0;
	}
	write(channel[1], "100", 3);
	printf("parent: written 100\n");
	if (need_close) {
		printf("parent: close output channel\n");
		close(channel[1]);
	}
	printf("parent: waiting for child termination ...\n");
	wait(NULL);
	return 0;
}
$> gcc 10_5_pipe_close.c
$> ./a.out
parent: written 100
parent: waiting for child termination ...
child: started
child: read 100
^C
$> ./a.out close
parent: written 100
parent: close output channel
parent: waiting for child termination ...
child: started
child: close output channel
child: read 100
child: EOF
$>

Create channel between parent and child - write to channel[1], read from channel[0]

Child reads from channel[0] until end of file

Parent writes to channel[1], waits for child termination, and exits

With command line option close the program closes channel[1], when it is not needed anymore

But it hangs ...

Pipe [10]

/**
 * Feeds the child's standard input from a pipe: the read end is
 * installed as descriptor 0 before fork(), so scanf() in the child
 * reads the number which the parent writes into the pipe.
 *
 * @return 0 on success; 1 in the child when no number could be read.
 */
int main()
{
	int to_child[2];
	pipe(to_child);
	dup2(to_child[0], 0);
	if (fork() == 0) {
		close(to_child[1]);
		int n;
		/* Do not print 'n' unless scanf() has actually stored a value. */
		if (scanf("%d", &n) != 1) {
			fprintf(stderr, "%d: failed to read a number\n",
				(int) getpid());
			return 1;
		}
		printf("%d: read %d\n", (int) getpid(), n);
		return 0;
	}
	close(to_child[0]);
	/* The trailing '\0' is sent too; it ends the number for scanf(). */
	write(to_child[1], "100", sizeof("100"));
	wait(NULL);
	return 0;
}
vladislav$> gcc 11_advanced_pipe.c

vladislav$> ./a.out
46339: read 100

Pipe [11]

/**
 * Runs an interactive python3 in the child with its standard input
 * coming from a pipe, sends it one statement and waits for it to
 * finish after closing the pipe.
 *
 * @return 0 in the parent; -1 in the child when python3 could not be
 *         started.
 */
int main()
{
	int to_child[2];
	pipe(to_child);
	dup2(to_child[0], 0);
	if (fork() == 0) {
		close(to_child[1]);
		/*
		 * The argument list has to end with a null pointer of type
		 * char *: a bare NULL passed through "..." may be an int.
		 */
		execlp("python3", "python3", "-i", (char *) NULL);
		/* exec returns only on failure. */
		perror("execlp");
		return -1;
	}
	close(to_child[0]);
	const char cmd[] = "print(100 + 200)";
	write(to_child[1], cmd, sizeof(cmd));
	/* Closing the write end gives the interpreter EOF on its input. */
	close(to_child[1]);
	wait(NULL);
	return 0;
}
vladislav$> gcc 12_exec.c

vladislav$> ./a.out
Python 3.6.5 (v3.6.5:f59c0932b4, Mar 28 2018, 05:52:31) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> ... 
300
>>>

Pipe [12]

int
execl(const char *path, const char *arg0, ...);

int
execle(const char *path, const char *arg0, ...);

int
execlp(const char *file, const char *arg0, ...);

int
execv(const char *path, char *const argv[]);

int
execvp(const char *file, char *const argv[]);

int
execvP(const char *file, const char *search_path, char *const argv[]);

Other legacy

Fork() copies:

  • struct task_struct
  • memory segment list for COW (not the segments themselves, only pointers to them)
  • user identifiers
  • signal handlers and masks
  • resource restrictions
  • work directory path

Fork() does not copy:

  • file locks
  • signal queue
  • threads

IPC

Mutexes, semaphores

Message queues

Shared memory

A process works with these IPC objects but does not own them. They can outlive their creator.

int
mkfifo(const char *path, mode_t mode);

int
shmget(key_t key, size_t size, int shmflg);

int
semget(key_t key, int nsems, int semflg);

int
msgget(key_t key, int msgflg);

Process interrupts [1]

  • Scheduler
  • Exception
  • Signal

 Process interrupts [2]

  • 0 division
  • Unknown address
  • Protected address

Stack overflow, COW

Handle exception

Foreign or not existing memory, 0 division

Kill process

Process interrupts [3]

int main()
{
	int a = 1; /* Not used by the code; inspected in the debugger session below. */
	int b = 2; /* Same as 'a'. */
	char buffer[100];
	buffer[0] = 3;
	buffer[50] = 4;
	buffer[10000] = 5; /* Deliberate write far outside of buffer[100]: the crash the core dump is taken from. */
	return 0;
}
vladislav$> ulimit -c unlimited

vladislav$> gcc -g 13_core_dump.c

vladislav$> ./a.out
Segmentation fault: 11 (core dumped)
vladislav$> lldb --core /cores/core.46461
(lldb) target create --core "/cores/core.46461"
Core file '/cores/core.46461' (x86_64) was loaded.
(lldb) bt
* thread #1, stop reason = signal SIGSTOP
  * frame #0: 0x0000000102a6ef66 a.out`main at 13_core_dump.c:8
    frame #1: 0x00007fff5531f015 libdyld.dylib`start + 1
(lldb) f 0
frame #0: 0x0000000102a6ef66 a.out`main at 13_core_dump.c:8
   5   		char buffer[100];
   6   		buffer[0] = 3;
   7   		buffer[50] = 4;
-> 8   		buffer[10000] = 5;
   9   		return 0;
   10  	}
(lldb)  p a
(int) $0 = 1
(lldb) p b
(int) $1 = 2
(lldb) p buffer[0]
(char) $2 = '\x03'

System calls [1]. Standards

System call - execution of code in the kernel context

Standards do not describe the 'system call' concept. They describe only the API. "Syscallability" depends on hardware, OS, kernel.

System calls [2]. Numbers

.long sys_ni_syscall	/* old ulimit syscall holder */
.long sys_ni_syscall	/* sys_olduname */
.long sys_umask			/* 60 */
.long sys_chroot
.long sys_ustat
.long sys_dup2
.long sys_getppid
.long sys_getpgrp		/* 65 */
.long sys_setsid
.long sys_sigaction
.long sys_sgetmask
.long sys_ssetmask
.long sys_setreuid16		/* 70 */
.long sys_setregid16
.long sys_sigsuspend
.long sys_sigpending

A part of system call table in the kernel

asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd);
SYSCALL_DEFINE2(dup2, unsigned int, oldfd,
                unsigned int, newfd)
{
        /* ... implementation. */
}

System calls [3]

How does syscall enter the kernel context?

1) Via registers and a special interrupt;

2) Via a special processor instruction;

2 points

System calls [4]

int 0x80
sysenter/sysexit
syscall/sysret

Process

int 0x80

Handler address?

0x0...0x80...

IDT - Interrupt Descriptor Table

Process

syscall
sysenter

System calls [5]. Arguments

%eax  (  %ebx  ,  %ecx  ,  %edx  ,  %esi  ,  %edi  )

System call number

<= 5 arguments

Pointer on stack, when > 5 arguments

0xffffffff

0x0

Process puts arguments and programmatically calls a special interrupt - the kernel catches it and reads the arguments

System calls [6]. VDSO

vladislav$>./a.out
print /proc/816/maps
...
7fe06c6b9000-7fe06c6ba000 r--s 00000000 08:01 3670336   /home/vladislav/3_fs_proc.c
7fe06c6ba000-7fe06c6bb000 r--p 00027000 08:01 1315504   /lib/x86_64-linux-gnu/ld-2.27.so
7fe06c6bb000-7fe06c6bc000 rw-p 00028000 08:01 1315504   /lib/x86_64-linux-gnu/ld-2.27.so
7fe06c6bc000-7fe06c6bd000 rw-p 00000000 00:00 0 
7ffcb1886000-7ffcb18a7000 rw-p 00000000 00:00 0         [stack]
7ffcb1989000-7ffcb198c000 r--p 00000000 00:00 0         [vvar]
7ffcb198c000-7ffcb198e000 r-xp 00000000 00:00 0         [vdso]
ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]

System calls [7]. Windows

User space subsystem, application, service processes

Public and documented DLL libraries

Windows kernel

User space

Kernel space

System calls

Function calls

CreateProcess()
CreateProcessInternalW()
NtCreateUserProcess()

Summary

New processes are always cloned from other processes. There is the single common ancestor for all of them - init-process.

Process = resource virtualization. It hides from us things like physical memory, system calls, interrupts, number of CPU cores, etc.

A process has multiple memory sections for different purposes like main stack, heap, dynamic libraries, global variables, command line, etc. You can see them in /proc/<pid>/maps

File descriptors are not only for files. They can be used to communicate with other processes too, e.g. with pipe() + dup() + dup2(). Be careful how FDs are inherited on fork().

Process communicates with the world via the kernel. To the kernel it talks via the system calls. They work like calling certain functions inside the kernel to use files, network, time, to read certain system-wide info

Conclusion

Memory. Virtual and physical. Cache levels and cache lines. User and kernel memory. False sharing.

Next time:


Press the heart if you like the lecture

System programming 2

By Vladislav Shpilevoy

System programming 2

Work modes: privileged, normal. Memory, its sections. Process resources - descriptors, locks. Process interrupts: exceptions, hardware, scheduling. Work context: user, kernel, signal. Interaction with the kernel - system calls: what are they, how parameters are passed and a result returned.

  • 1,615