0x03 A glance into the kernel: Proc source

Friday, February 28, 2020 • edited Sunday, March 15, 2020


My machine

Linux debian-laptop 5.4.0-4-amd64 #1 SMP Debian 5.4.19-1 (2020-02-13) x86_64 GNU/Linux

Architecture:                    x86_64
CPU op-mode(s):                  32-bit, 64-bit
Byte Order:                      Little Endian
Address sizes:                   39 bits physical, 48 bits virtual
CPU(s):                          4
On-line CPU(s) list:             0-3
Thread(s) per core:              2
Core(s) per socket:              2
Socket(s):                       1
NUMA node(s):                    1
Vendor ID:                       GenuineIntel
CPU family:                      6
Model:                           142
Model name:                      Intel(R) Core(TM) i5-7200U CPU @ 2.50GHz
Stepping:                        9
CPU MHz:                         3100.016
CPU max MHz:                     3100.0000
CPU min MHz:                     400.0000
BogoMIPS:                        5399.81
Virtualization:                  VT-x
L1d cache:                       64 KiB
L1i cache:                       64 KiB
L2 cache:                        512 KiB
L3 cache:                        3 MiB
NUMA node0 CPU(s):               0-3
Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdp
                                 e1gb rdtscp lm constant_tsc art arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monito
                                 r ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
                                  lahf_lm abm 3dnowprefetch cpuid_fault epb invpcid_single ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase t
                                 sc_adjust bmi1 avx2 smep bmi2 erms invpcid mpx rdseed adx smap clflushopt intel_pt xsaveopt xsavec xgetbv1 xsaves dtherm ida arat pln 
                                 pts hwp hwp_notify hwp_act_window hwp_epp md_clear flush_l1d

gcc version 9.2.1 20200224 (Debian 9.2.1-30) 
clang version 9.0.1-8 
Target: x86_64-pc-linux-gnu
Thread model: posix

Description of a process

line 127

We should note this:

 * Below is a key of locks used to protect each member of struct proc.  The
 * lock is indicated by a reference to a specific character in parens in the
 * associated comment.

Proc structure

line 583

The comments are detailed and easy to understand; just reread them whenever you forget something.

Then let’s check line 598

int		p_flag;		/* (c) P_* flags. */
int		p_flag2;	/* (c) P2_* flags. */	

and the flag definitions that follow at line 737

/* These flags are kept in p_flag. */
#define	P_ADVLOCK	0x00001	/* Process may hold a POSIX advisory lock. */
#define	P_CONTROLT	0x00002	/* Has a controlling terminal. */
#define	P_KPROC		0x00004	/* Kernel process. */
#define	P_UNUSED3	0x00008	/* --available-- */
#define	P_PPWAIT	0x00010	/* Parent is waiting for child to exec/exit. */
#define	P_PROFIL	0x00020	/* Has started profiling. */
#define	P_STOPPROF	0x00040	/* Has thread requesting to stop profiling. */
#define	P_HADTHREADS	0x00080	/* Has had threads (no cleanup shortcuts) */
#define	P_SUGID		0x00100	/* Had set id privileges since last exec. */
#define	P_SYSTEM	0x00200	/* System proc: no sigs, stats or swapping. */
#define	P_SINGLE_EXIT	0x00400	/* Threads suspending should exit, not wait. */
#define	P_TRACED	0x00800	/* Debugged process being traced. */
#define	P_WAITED	0x01000	/* Someone is waiting for us. */
#define	P_WEXIT		0x02000	/* Working on exiting. */
#define	P_EXEC		0x04000	/* Process called exec. */
#define	P_WKILLED	0x08000	/* Killed, go to kernel/user boundary ASAP. */
#define	P_CONTINUED	0x10000	/* Proc has continued from a stopped state. */
#define	P_STOPPED_SIG	0x20000	/* Stopped due to SIGSTOP/SIGTSTP. */
#define	P_STOPPED_TRACE	0x40000	/* Stopped because of tracing. */
#define	P_STOPPED_SINGLE 0x80000 /* Only 1 thread can continue (not to user). */
#define	P_PROTECTED	0x100000 /* Do not kill on memory overcommit. */
#define	P_SIGEVENT	0x200000 /* Process pending signals changed. */
#define	P_SINGLE_BOUNDARY 0x400000 /* Threads should suspend at user boundary. */
#define	P_HWPMC		0x800000 /* Process is using HWPMCs */
#define	P_JAILED	0x1000000 /* Process is in jail. */
#define	P_TOTAL_STOP	0x2000000 /* Stopped in stop_all_proc. */
#define	P_INEXEC	0x4000000 /* Process is in execve(). */
#define	P_STATCHILD	0x8000000 /* Child process stopped or exited. */
#define	P_INMEM		0x10000000 /* Loaded into memory. */
#define	P_SWAPPINGOUT	0x20000000 /* Process is being swapped out. */
#define	P_SWAPPINGIN	0x40000000 /* Process is being swapped in. */
#define	P_PPTRACE	0x80000000 /* PT_TRACEME by vforked child. */

#define	P_SHOULDSTOP(p)	((p)->p_flag & P_STOPPED)
#define	P_KILLED(p)	((p)->p_flag & P_WKILLED)

/* These flags are kept in p_flag2. */
#define	P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */
#define	P2_NOTRACE	0x00000002	/* No ptrace(2) attach or coredumps. */
#define	P2_NOTRACE_EXEC 0x00000004	/* Keep P2_NOPTRACE on exec(2). */
#define	P2_AST_SU	0x00000008	/* Handles SU ast for kthreads. */
#define	P2_PTRACE_FSTP	0x00000010 /* SIGSTOP from PT_ATTACH not yet handled. */
#define	P2_TRAPCAP	0x00000020	/* SIGTRAP on ENOTCAPABLE */
#define	P2_ASLR_ENABLE	0x00000040	/* Force enable ASLR. */
#define	P2_ASLR_DISABLE	0x00000080	/* Force disable ASLR. */
#define	P2_ASLR_IGNSTART 0x00000100	/* Enable ASLR to consume sbrk area. */
#define	P2_PROTMAX_ENABLE 0x00000200	/* Force enable implied PROT_MAX. */
#define	P2_PROTMAX_DISABLE 0x00000400	/* Force disable implied PROT_MAX. */
#define	P2_STKGAP_DISABLE 0x00000800	/* Disable stack gap for MAP_STACK */
#define	P2_STKGAP_DISABLE_EXEC 0x00001000 /* Stack gap disabled after exec */

This implementation assigns each flag its own bit, so a single integer field makes full use of its bits, and flags can be tested, set, and cleared with cheap bitwise operations.


  • Address Space Layout Randomization (ASLR)

  • Hardware Performance Monitoring Counter (HWPMC)

  • the td argument in any system call is actually a pointer to the calling thread’s thread structure, which describes the thread

line 625

/* The following fields are all zeroed upon creation in fork. */
#define	p_startzero	p_vmspace
	struct vmspace	*p_vmspace;	/* (b) Address space. */
	u_int		p_swtick;	/* (c) Tick when swapped in or out. */
	u_int		p_cowgen;	/* (c) Generation of COW pointers. */
	struct itimerval p_realtimer;	/* (c) Alarm timer. */
	struct rusage	p_ru;		/* (a) Exit information. */
	struct rusage_ext p_rux;	/* (cu) Internal resource usage. */
	struct rusage_ext p_crux;	/* (c) Internal child resource usage. */
	int		p_profthreads;	/* (c) Num threads in addupc_task. */
	volatile int	p_exitthreads;	/* (j) Number of threads exiting */
	int		p_traceflag;	/* (o) Kernel trace points. */
	struct vnode	*p_tracevp;	/* (c + o) Trace to vnode. */
	struct ucred	*p_tracecred;	/* (o) Credentials to trace with. */
	struct vnode	*p_textvp;	/* (b) Vnode of executable. */
	u_int		p_lock;		/* (c) Proclock (prevent swap) count. */
	struct sigiolst	p_sigiolst;	/* (c) List of sigio sources. */
	int		p_sigparent;	/* (c) Signal to parent on exit. */
	int		p_sig;		/* (n) For core dump/debugger XXX. */
	u_int		p_stops;	/* (c) Stop event bitmask. */
	u_int		p_stype;	/* (c) Stop event type. */
	char		p_step;		/* (c) Process is stopped. */
	u_char		p_pfsflags;	/* (c) Procfs flags. */
	u_int		p_ptevents;	/* (c + e) ptrace() event mask. */
	struct nlminfo	*p_nlminfo;	/* (?) Only used by/for lockd. */
	struct kaioinfo	*p_aioinfo;	/* (y) ASYNC I/O info. */
	struct thread	*p_singlethread;/* (c + j) If single threading this is it */
	int		p_suspcount;	/* (j) Num threads in suspended mode. */
	struct thread	*p_xthread;	/* (c) Trap thread */
	int		p_boundary_count;/* (j) Num threads at user boundary */
	int		p_pendingcnt;	/* how many signals are pending */
	struct itimers	*p_itimers;	/* (c) POSIX interval timers. */
	struct procdesc	*p_procdesc;	/* (e) Process descriptor, if any. */
	u_int		p_treeflag;	/* (e) P_TREE flags */
	int		p_pendingexits; /* (c) Count of pending thread exits. */
	struct filemon	*p_filemon;	/* (c) filemon-specific data. */
	int		p_pdeathsig;	/* (c) Signal from parent on exit. */
/* End area that is zeroed on creation. */
#define	p_endzero	p_magic

/* The following fields are all copied upon creation in fork. */
#define	p_startcopy	p_endzero
	u_int		p_magic;	/* (b) Magic number. */

p_startzero and p_endzero are used in sys/kern/kern_fork.c at line 371

	    __rangeof(struct proc, p_startzero, p_endzero));

This code indirectly depends on __builtin_memset and __builtin_offsetof on my platform, through macros defined in sys/sys/systm.h and sys/sys/cdefs.h

#define bzero(buf, len) __builtin_memset((buf), 0, (len))

#ifdef KCSAN
void	*kcsan_memset(void *, int, size_t);
void	*kcsan_memcpy(void *, const void *, size_t);
void	*kcsan_memmove(void *, const void *, size_t);
int	kcsan_memcmp(const void *, const void *, size_t);
#define bcopy(from, to, len) kcsan_memmove((to), (from), (len))
#define bzero(buf, len) kcsan_memset((buf), 0, (len)) // comment by Kowalski, notice here
#define bcmp(b1, b2, len) kcsan_memcmp((b1), (b2), (len))
#define memset(buf, c, len) kcsan_memset((buf), (c), (len))
#define memcpy(to, from, len) kcsan_memcpy((to), (from), (len))
#define memmove(dest, src, n) kcsan_memmove((dest), (src), (n))
#define memcmp(b1, b2, len) kcsan_memcmp((b1), (b2), (len))
#define bcopy(from, to, len) __builtin_memmove((to), (from), (len))
#define bzero(buf, len) __builtin_memset((buf), 0, (len)) // comment by Kowalski, and here, selected on my platform
#define bcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len))
#define memset(buf, c, len) __builtin_memset((buf), (c), (len))
#define memcpy(to, from, len) __builtin_memcpy((to), (from), (len))
#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n))
#define memcmp(b1, b2, len) __builtin_memcmp((b1), (b2), (len))
#define	__rangeof(type, start, end) \
	(__offsetof(type, end) - __offsetof(type, start))

#if __GNUC_PREREQ__(4, 1)
#define	__offsetof(type, field)	 __builtin_offsetof(type, field) // comment by Kowalski, selected on my platform
#ifndef __cplusplus
#define	__offsetof(type, field) \
	((__size_t)(__uintptr_t)((const volatile void *)&((type *)0)->field))
#define	__offsetof(type, field)					\
  (__offsetof__ (reinterpret_cast <__size_t>			\
                 (&reinterpret_cast <const volatile char &>	\
                  (static_cast<type *> (0)->field))))
CC BY 4.0

0x04 A glance into the kernel: turnstile

0x02 A glance into the kernel: Overview on kernel services

comments powered by Disqus