How to create rootfs for kernel debugging

Compulsory kernel options

CONFIG_E1000=y
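
If the option is missing from your .config, one way to switch it on is the kernel tree's own scripts/config helper (a minimal sketch, assuming the commands are run from the kernel source directory):

./scripts/config --enable CONFIG_E1000   # driver for QEMU's emulated e1000 NIC (the default "-net nic" model on the pc machine)
make olddefconfig                        # re-resolve dependencies after the change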

Create rootfs

readonly BASEDIR=$(readlink -f $(dirname $0))/

QEMU_PATH=${BASEDIR}/
QEMU_BUILD_PATH=${QEMU_PATH}/build/
IMAGE_NAME=qemu-image.img


qemu-img create ${QEMU_PATH}/${IMAGE_NAME} 100g
mkfs.ext4 ${QEMU_PATH}/${IMAGE_NAME}
mkdir -p ${QEMU_PATH}/mount-point.dir
sudo mount -o loop ${QEMU_PATH}/${IMAGE_NAME} ${QEMU_PATH}/mount-point.dir/
sudo apt install debootstrap debian-keyring debian-archive-keyring
# If a wget error occurs during the debootstrap install,
# add proxy settings (http_proxy, https_proxy, ftp_proxy) to /etc/wgetrc
sudo debootstrap --no-check-certificate --arch amd64 lunar ${QEMU_PATH}/mount-point.dir/
# chroot into the freshly debootstrapped rootfs
cd ${QEMU_PATH}/mount-point.dir
sudo chroot .

# After creating rootfs, 
# 1) Change root password
# $ passwd

# 2) (Optional) Add proxy configuration
# Open /etc/profile, and add proxy configuration
# export HTTP_PROXY="http://:/"
# export HTTPS_PROXY="http://:/"
# $ source /etc/profile

# 3) (Optional) Add local Ubuntu repository mirrors
# Open /etc/apt/sources.list, and add repository mirrors

# 4) Package update
# $ apt update
# $ apt upgrade
# $ apt install nano pciutils debian-keyring debian-archive-keyring openssh-server net-tools ifupdown

# 5) Modify sshd config
# $ vi /etc/ssh/sshd_config
# ...
# PermitRootLogin yes
# ...

# 6) Modify network config
# $ vi /etc/network/interfaces
# # add the lines below
# auto ens3
# iface ens3 inet dhcp
# # for q35 machine
# auto enp0s2
# iface enp0s2 inet dhcp
# Alternative network config (netplan):
# $ vi /etc/netplan/00-installer-config.yaml
# # add the lines below
# network:
#  ethernets:
#    ens3:
#      dhcp4: true
#    enp0s2:
#      dhcp4: true
#  version: 2
# 7) Quit
# $ exit
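
After exiting the chroot, unmount the image before booting it in QEMU (a small cleanup sketch using the paths defined above):

cd ${BASEDIR}
sudo umount ${QEMU_PATH}/mount-point.dir/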

After creating the rootfs and compiling the kernel, boot the VM with a script like the following:

SMDK_KERNEL_PATH=${BASEDIR}/Bede-linux/
ROOTFS_PATH=${BASEDIR}
MONITOR_PORT=45454

QEMU_SYSTEM_BINARY=`which qemu-system-x86_64`
BZIMAGE_PATH=${SMDK_KERNEL_PATH}/arch/x86_64/boot/bzImage
INITRD_PATH=/boot/initrd.img-6.4.0+
IMAGE_PATH=${ROOTFS_PATH}/qemu-image.img

# log_error is referenced below but not defined in this snippet; a minimal definition:
function log_error(){
	echo "Error: $*" >&2
}

function print_usage(){
	echo ""
	echo "Usage:"
	echo " $0 [-x vm_index(0-9)]"
	echo ""
}

while getopts "x:" opt; do
	case "$opt" in
		x)
			if [ $OPTARG -lt 0 ] || [ $OPTARG -gt 9 ]; then
				echo "Error: VM count should be 0-9"
				exit 2
			fi
			VMIDX=$OPTARG
			;;
		*)
			print_usage
			exit 2
			;;
	esac
done

if [ -z "${VMIDX}" ]; then
	NET_OPTION="-net user,hostfwd=tcp::2242-:22,hostfwd=tcp::6379-:6379,hostfwd=tcp::11211-:11211 -net nic"
else
	echo "Info: Running VM #${VMIDX}..."
	MONITOR_PORT="4545${VMIDX}"
	IMAGE_PATH=$(echo ${IMAGE_PATH} | sed 's/\.img/-'"${VMIDX}"'.img/')
	MACADDR="52:54:00:12:34:${VMIDX}${VMIDX}"
	TAPNAME="tap${VMIDX}"
	NET_OPTION="-net nic,macaddr=${MACADDR} -net tap,ifname=${TAPNAME},script=no"

	IFCONFIG_TAPINFO=`ifconfig | grep ${TAPNAME}`
	if [ -z "${IFCONFIG_TAPINFO}" ]; then
		log_error "${TAPNAME} SHOULD be up for using network in VM. Run 'setup_bridge.sh' in /path/to/SMDK/lib/qemu/"
		exit 2
	fi
fi

if [ ! -f "${QEMU_SYSTEM_BINARY}" ]; then
	log_error "qemu-system-x86_64 binary does not exist. Run 'build_lib.sh qemu' in /path/to/SMDK/lib/"
	exit 2
fi

if [ ! -f "${BZIMAGE_PATH}" ]; then
	log_error "SMDK kernel image does not exist. Run 'build_lib.sh kernel' in /path/to/SMDK/lib/"
	exit 2
fi

if [ ! -f "${IMAGE_PATH}" ]; then
	log_error "QEMU rootfs ${IMAGE_PATH} does not exist. Run 'create_rootfs.sh' in /path/to/SMDK/lib/qemu/"
	exit 2
fi
#  echo sudo ${QEMU_SYSTEM_BINARY} \
#     -smp 3 \
#     -numa node,cpus=0-2,memdev=mem0,nodeid=0 \
#     -object memory-backend-ram,id=mem0,size=8G \
#     -kernel ${BZIMAGE_PATH} \
# 	-initrd ${INITRD_PATH} \
#     -drive file=${IMAGE_PATH},index=0,media=disk,format=raw \
#     -drive file=${IMAGE1_PATH},index=1,media=disk,format=raw \
#     -enable-kvm \
#     -monitor telnet::${MONITOR_PORT},server,nowait \
#     -serial mon:stdio \
#     -nographic \
#     -append "root=/dev/sda rw console=ttyS0 nokaslr memblock=debug loglevel=7" \
#     -m 8G,slots=4,maxmem=32G \
# 	-device virtio-crypto-pci,id=crypto0,cryptodev=cryptodev0 \
#     -object cryptodev-backend-builtin,id=cryptodev0 \
# 	-object secret,id=sec0,file=./Drywall/passwd.txt \
#     ${NET_OPTION}

${QEMU_SYSTEM_BINARY} \
    -S -s -smp 4 \
    -numa node,cpus=0,memdev=mem0,nodeid=0 \
    -object memory-backend-ram,id=mem0,size=8G \
    -numa node,cpus=1,memdev=mem1,nodeid=1 \
    -object memory-backend-ram,id=mem1,size=8G \
    -numa node,cpus=2,memdev=mem2,nodeid=2 \
    -object memory-backend-ram,id=mem2,size=8G \
    -numa node,cpus=3,memdev=mem3,nodeid=3 \
    -object memory-backend-ram,id=mem3,size=8G \
    -kernel ${BZIMAGE_PATH} \
    -initrd ${INITRD_PATH} \
    -drive file=${IMAGE_PATH},index=0,media=disk,format=raw \
    -serial mon:stdio \
    -nographic \
    -append "root=/dev/sda rw console=ttyS0 memblock=debug loglevel=7 cgroup_no_v1=1" \
    -m 32G,slots=4,maxmem=36G \
    -nic bridge,br=virbr0,model=virtio-net-pci,mac=02:76:7d:d7:1e:3f
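
Because the command passes -S -s, QEMU starts halted and exposes a GDB stub on TCP port 1234, which is the point of this setup. A typical way to attach (assuming the kernel was built with debug info, and ideally with nokaslr appended to the kernel command line so breakpoints resolve):

cd ${SMDK_KERNEL_PATH}
gdb vmlinux \
    -ex 'target remote :1234' \
    -ex 'hbreak start_kernel' \
    -ex 'continue'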

OSDI23 Attendance

TCLocks

pyy is the fourth author.

The lock transfers the execution context rather than the data, because synchronizing shared data is the biggest cost of multi-threaded lock/unlock.
So it uses a delayed, out-of-order lock that proactively interrupts the waiter to obtain an ephemeral stack (which gets prefetched).

Is capturing locks with IP + stack not sound?

Mingyu asked whether a different unlock order could deadlock; the answer was that it is unrelated.

RON

Motivated by CPU topology: topology planning on a single Threadripper. Under the Threadripper CCX topology, TSP is used to schedule the unlock order of a lock; with multiple locks, the same lock is kept within a single CCX.

UB

Profiling-guided: the userspace binary is recompiled/JITed so that everything runs inside the kernel; a syscall becomes a kernel-mode jump, protected with SFI.

~992 cycles

Indirect costs:

  1. L1 TLB pollution
  2. OoO stall
  3. KPTI

Compared with io_uring; it should also be compared against io_uring + codegen to BPF.

Since SFI has online overhead, I think the best approach is still codegen to BPF.

Relational Debugging

Compare the trace of what a good run reaches against what a bad run reaches, to relate the crash site to the root cause. (It stops working once there are many logical relations.)

Larch

An extension to MPC with elliptic curves: honest client enrollment performs precomputation of $f_1(r)(m+kf_2(r))$ and saves the results on the client side for future authentication.

k9db

Soundly exposes the data-ownership graph (a multigraph) to third parties, with annotations that let developers reason about the dependencies.

During storage operations such as GROUP BY, it also operates on the ownership multigraph efficiently.

HeDB

Uses fork with privacy-preserving record & replay, plus concolic construction of fake control-flow inputs to the secured DB, so maintenance is possible while blocking smuggling attacks by the database administrator. Experiments: query rewriting and a concurrent write workload.

LVMT

Proof sharding: a Merkle tree operator over data chunks.

Honeycomb GPU TEE

Static analysis of the GPU binary, rather than growing the TCB and losing IPC, plus a runtime monitor: one component sits inside SEV-SNP and one in a sandbox VM, and the two can be decoupled at deployment. There is a kernel-side driver and a userspace driver. When a GPU kernel is launched, all memory accesses are hooked and encrypted; the bus transfers are assumed reliable, and even if they leak it does not matter.

Core Slicing

This one is from shanyizhou; it uses RISC-V PMP to control the lifecycle of a slice. On Intel, if you cannot change the microcode, you have to make assumptions about the hardware model (especially ACPI).

Duvisor

According to limingyu, the first principle here is Gu Ronghui's idea of lifting the bounded loops that cannot be verified inside the kernel up to a higher layer, so that every layer is easy to verify. The story the paper tells is decoupling KVM from the driver code, with all hardware address acquisition placed on the RISC-V side. In practice, on Intel a software implementation using UIPI plus driver context would be fine; the Rocket Chip PC is also unpopular, which makes it easy to attack. The OSDI '22 rejection reason was that "the exaggerated 2x win over KVM on the Xilinx board comes from QEMU-emulated devices amplifying Duvisor's MMIO advantage over KVM"; it only got in now.

According to the authors, the results from FireSim differ a lot from the Xilinx ones, mainly because the Xilinx board gives no cycle-accurate guarantee for network performance, so in the end only the FireSim numbers were submitted. In principle the generated RTL is the same, so the performance should not differ that much; FireSim has plenty of pitfalls, but its network performance output is said to be stable. The final experiments are very generous to KVM: the second-stage page table advantage is disabled, and KVM gets pass-through interrupts.

This work closes the host-attacking class of vulnerabilities, so similar problems should not appear again.

eZNS

Dynamically allocates zone space.

PM RDMA replication

From lyy's group: it exploits the PCIe hardware queues, the memory controller, and PMem write combining. The insight is, given a controllable KV size, how large the daemon's PCIe writes to PM should be.

SEPH

In workloads that write to PM and DRAM at the same time, find a way to issue the PMEM writes first.

Remote Fork

For the remote Docker cold-start problem, it is better to remote-fork first. The naive approach is CRIU (what about shipping a diff?), so they go straight to an RDMA co-design that makes the remote side aware: on a page fault during migration, first do a one-sided read of the page table.

CXL seems to be exactly the thing to solve cross-memory migration without serialization.

Johnny cache

Amazing.

CXL-ANNS

Experiments on both sides, their RISC-V device and gem5; the main idea is offloading some operators to the CXL controller.

CXL-flash

They wrote a trace-based simulator on top of MQSim; the traces were collected with Valgrind. It is used to design a CXL flash device, half memory and half flash. The talk covered how to design the flash prefetch and MSHR algorithms to fill the gap between load/store and NVMe accesses; the two can even fight each other (cache pollution).

STYP

This work essentially moves the CPU cycles spent on compression onto the SmartNIC's CPU: cgroups, kernel same-page merging (i.e., page deduplication), zswap.

kthread page statistics. ccNUMA? SmartNIC? Does zswap take cycles?

BWOS

Fuming's paper. It uses a block-based split operation that only verifies each block's work stealing to prove parallel computation under task-based scheduling. The problem is that every block needs synchronization to be proven; I am not sure whether such a speed gain affects correctness.

SPOQ

No loops or intrinsics are allowed for verification.

It formalizes LLVM IR; loops are fine-tuned and the proof is generated from the spec. Somehow it would be a good thing to generate a small data set to speed up verifying SystemC.

Sharding the state machine

Divide and conquer the state machine; the two examples are the page cache in SplinterDB and node replication in NrOS.

Exactly-Once DAG

On failure, it checks whether a task has already been executed.

SmartNIC Characterization

A discussion of the data path to the switch from the Arm cores on a BlueField-2 versus from the host CPU; there is both PCIe congestion and throttling by the CPU's DMA controller.

ServiceRouter

A regional scheduler for services.

Ensō

Replaces the descriptor, ring buffer, and packet abstractions with a streaming abstraction. Even against DPDK using the original abstractions, it wins the bandwidth comparison at almost every packet size, so for most applications it is a free lunch.

It does have to maintain IP forwarding and prefetching inside the NIC, which consumes on-chip compute resources. Latency-sensitive, packet-size-aware users will find it of no use.

Design of per-cgroup memory disaggregation

This post will be integrated into yyw's knowledge base.

For an orchestration system, resource management needs to consider at least the following aspects:

  1. An abstraction of the resource model, including:
  • What kinds of resources there are, for example CPU and memory (local vs. remote, which can be transparent to the user), etc.;
  • How to represent these resources with data structures;

  2. Resource scheduling

  • How to describe a workload's resource request (spec), for example, "This container requires 4 cores and 12GB~16GB (4GB local / 8GB-12GB remote) of memory";
  • How to describe the current resource allocation status of a node, such as the amount of allocated/unallocated resources, whether it supports overcommitting, etc.;
  • The scheduling algorithm: how to select the most suitable node for a workload according to its spec;

  3. Resource quota

  • How to ensure that the amount of resources used by a workload does not exceed the preset range (so as not to affect other workloads);
  • How to ensure quotas for workloads and for system/basic services so that the two do not affect each other.

k8s is currently the most popular container orchestration system, so how does it solve these problems?

k8s resource model

With the questions above in mind, let's see how k8s is designed:

  1. Resource model :
    • Abstract resource types such as cpu/memory/device/hugepage;
    • Abstract the concept of node;
  2. Resource Scheduling :
    • The two concepts of request and limit are abstracted, representing respectively the minimum (request) and maximum (limit) resources required by a container;
    • The scheduling algorithm selects the appropriate node for the container according to the amount of resources currently available for allocation (Allocatable) on each node; note that k8s scheduling only looks at requests, not limits.
  3. Resource enforcement :
    • Use cgroups to ensure that the maximum amount of resources used by a workload does not exceed the specified limits at multiple levels.

An example of a resource application (container):

apiVersion: v1
kind: Pod
spec:
  containers:
  - name: busybox
    image: busybox
    resources:
      limits:
        cpu: 500m
        memory: "400Mi"
      requests:
        cpu: 250m
        memory: "300Mi"
    command: ["md5sum"]
    args: ["/dev/urandom"]

Here requests and limits represent the minimum and maximum amounts of required resources, respectively.

  • The CPU unit m is short for millicores, one-thousandth of a core, so cpu: 500m means 0.5 cores are required (see the sketch below for how this maps onto cgroups);
  • The memory units are the familiar ones such as MB and GB (Mi/Gi in the spec above).
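
For intuition about how these numbers are eventually enforced, here is a rough sketch of the cgroup v2 values that the limits above translate into (the kubepods path is illustrative; the real pod cgroup path depends on the cgroup driver and QoS class):

# cpu: 500m with the default 100ms CFS period becomes a 50ms quota per period
echo "50000 100000" > /sys/fs/cgroup/kubepods.slice/<pod-cgroup>/cpu.max
# memory: 400Mi becomes a hard cap of 400 * 1024 * 1024 bytes
echo "419430400" > /sys/fs/cgroup/kubepods.slice/<pod-cgroup>/memory.max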

Node resource abstraction

$ k describe node <node>
...
Capacity:
  cpu:                          48
  mem-hard-eviction-threshold:  500Mi
  mem-soft-eviction-threshold:  1536Mi
  memory:                       263192560Ki
  pods:                         256
Allocatable:
  cpu:                 46
  memory:              258486256Ki
  pods:                256
Allocated resources:
  (Total limits may be over 100 percent, i.e., overcommitted.)
  Resource            Requests     Limits
  --------            --------     ------
  cpu                 800m (1%)    7200m (15%)
  memory              1000Mi (0%)  7324Mi (2%)
  hugepages-1Gi       0 (0%)       0 (0%)
...

Let's look at these parts separately.

Capacity

The total resources of the node (which can be understood simply as its physical configuration); for example, the output above shows that this node has 48 CPUs, about 256GB of memory, and so on.

Allocatable

The total amount of resources that k8s can allocate. Obviously, Allocatable will not exceed Capacity; for example, two CPUs have been set aside above, leaving only 46.

Allocated

The amount of resources this node has allocated so far. Note that the message also says the node may be overcommitted, so the sum may exceed Allocatable, but it will not exceed Capacity.

That Allocatable does not exceed Capacity is easy to understand; but which resources, specifically, are set aside, causing Allocatable < Capacity?

Node resource segmentation (reserved)

Because k8s-related basic services such as kubelet/docker/containerd, as well as operating system processes such as systemd/journald, run on each node, not all of a node's resources can be used to create pods. Therefore, when k8s manages and schedules resources, it needs to carve out the resource usage and enforcement of these basic services separately.

To this end, k8s proposed the Node Allocatable Resources[1] proposal, which is where the terms Capacity and Allocatable above come from. A few notes:

  • If Allocatable is available, the scheduler uses Allocatable; otherwise it uses Capacity;
  • Using Allocatable is not overcommitting; using Capacity is overcommitting.

Calculation formula: [Allocatable] = [NodeCapacity] - [KubeReserved] - [SystemReserved] - [HardEvictionThreshold]
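
A worked example that reproduces the node output above (the reservation values themselves are assumptions chosen to make the arithmetic line up; only Capacity and Allocatable are taken from the output):

# Allocatable(cpu)    = 48 - 1 (kube-reserved) - 1 (system-reserved)      = 46
# Allocatable(memory) = 263192560Ki - 2Gi - 2Gi - 500Mi (hard eviction)
#                     = 263192560Ki - 2097152Ki - 2097152Ki - 512000Ki    = 258486256Ki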

Let’s look at these types separately.

System Reserved

Basic services of the operating system, such as systemd and journald, are outside k8s management. k8s cannot manage the allocation of these resources, but it can enforce limits on them, as we will see later.

Kube Reserved

k8s infrastructure services, including kubelet/docker/containerd, etc. As with the system services above, k8s cannot manage the allocation of these resources, but it can enforce limits on them.

EvictionThreshold (eviction threshold)

When node resources such as memory or disk are about to be exhausted, the kubelet starts evicting pods according to their QoS priority (best effort / burstable / guaranteed); the eviction thresholds reserve resources for this purpose.
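
For reference, the thresholds shown in the node output above (mem-hard-eviction-threshold: 500Mi, mem-soft-eviction-threshold: 1536Mi) would map to kubelet settings along these lines (a sketch; the grace period value is made up):

$ cat /etc/kubernetes/kubelet/config
...
evictionHard:
  memory.available: "500Mi"
evictionSoft:
  memory.available: "1536Mi"
evictionSoftGracePeriod:
  memory.available: "90s"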

Allocatable

Resources available for k8s to create pods.

The above is the basic resource model of k8s. Let's look at a few related configuration parameters.

Kubelet related configuration parameters

kubelet command parameters related to resource reservation (segmentation):

  • --system-reserved=""
  • --kube-reserved=""
  • --qos-reserved=""
  • --reserved-cpus=""

These values can also be set in the kubelet configuration file, for example:

$ cat /etc/kubernetes/kubelet/config
...
systemReserved:
  cpu: "2"  
  memory: "4Gi"

Whether to use dedicated cgroups to enforce quotas on these reserved resources, so that they do not affect each other, is controlled by:

  • --kube-reserved-cgroup=""
  • --system-reserved-cgroup=""

Neither is enabled by default. In practice it is difficult to achieve complete isolation, so system processes and pod processes may affect each other. For example, as of v1.26 k8s does not support IO isolation, so if the IO of a host process (such as log rotation) spikes, or a pod process performs a Java heap dump, all pods on the node can be affected.
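
If you do enable enforcement on those reserved cgroups, the corresponding kubelet config fields look roughly like this (a sketch; field names come from the KubeletConfiguration API, the cgroup names are illustrative, and the cgroups must already exist since the kubelet will not create them):

$ cat /etc/kubernetes/kubelet/config
...
enforceNodeAllocatable: ["pods", "kube-reserved", "system-reserved"]
kubeReservedCgroup: /kubelet.slice
systemReservedCgroup: /system.slice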

That covers the k8s resource model; now we come to the focus of this article: how k8s uses cgroups to limit the resource usage of workloads such as containers, pods, and basic services (enforcement).

k8s cgroup design

cgroup basics

cgroups are a Linux kernel facility that can limit, account for, and isolate the resources (CPU, memory, IO, etc.) used by a group of processes.

There are two versions of cgroup, v1 and v2; for the differences, see Control Group v2. Since it is already 2023, we focus on v2. Roughly, cgroup v1 exposes more knobs (for example memory swappiness) and uses flat, per-controller hierarchies, while v2 exposes a single unified hierarchy with a smaller, consistent set of controllers such as cpuset and memory.

$ mount | grep cgroup
cgroup2 on /sys/fs/cgroup type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recursiveprot)

root@banana:~/CXLMemSim/microbench# ls /sys/fs/cgroup
cgroup.controllers      cpuset.mems.effective  memory.reclaim
cgroup.max.depth        dev-hugepages.mount    memory.stat
cgroup.max.descendants  dev-mqueue.mount       misc.capacity
cgroup.pressure         init.scope             misc.current
cgroup.procs            io.cost.model          sys-fs-fuse-connections.mount
cgroup.stat             io.cost.qos            sys-kernel-config.mount
cgroup.subtree_control  io.pressure            sys-kernel-debug.mount
cgroup.threads          io.prio.class          sys-kernel-tracing.mount
cpu.pressure            io.stat                system.slice
cpu.stat                memory.numa_stat       user.slice
cpuset.cpus.effective   memory.pressure        yyw

root@banana:~/CXLMemSim/microbench# ls /sys/fs/cgroup/yyw
cgroup.controllers      cpu.uclamp.max       memory.oom.group
cgroup.events           cpu.uclamp.min       memory.peak
cgroup.freeze           cpu.weight           memory.pressure
cgroup.kill             cpu.weight.nice      memory.reclaim
cgroup.max.depth        io.pressure          memory.stat
cgroup.max.descendants  memory.current       memory.swap.current
cgroup.pressure         memory.events        memory.swap.events
cgroup.procs            memory.events.local  memory.swap.high
cgroup.stat             memory.high          memory.swap.max
cgroup.subtree_control  memory.low           memory.swap.peak
cgroup.threads          memory.max           memory.zswap.current
cgroup.type             memory.min           memory.zswap.max
cpu.idle                memory.node_limit1   pids.current
cpu.max                 memory.node_limit2   pids.events
cpu.max.burst           memory.node_limit3   pids.max
cpu.pressure            memory.node_limit4   pids.peak
cpu.stat                memory.numa_stat
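
The yyw directory in the listing above is just a manually created leaf cgroup; a minimal sketch of how such a cgroup can be created and capped on cgroup v2 (the memory.node_limit* files appear to come from a patched kernel and are not used here):

cd /sys/fs/cgroup
echo "+cpu +memory" > cgroup.subtree_control   # make the cpu and memory controllers available to children
mkdir yyw
echo 400M > yyw/memory.max                     # hard memory cap
echo "50000 100000" > yyw/cpu.max              # 0.5 core: 50ms quota per 100ms period
echo $$ > yyw/cgroup.procs                     # move the current shell into the cgroup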

The procfs is registered in