# 如需更改tls-san,编辑/etc/systemd/system/k3s.service,在启动命令后添加 --tls-san=<ip>
systemctl daemon-reload
# 在其中一个节点上执行
k3s kubectl --insecure-skip-tls-verify=true delete secret k3s-serving -n kube-system
# 在每个节点上执行
rm -rf /var/lib/rancher/k3s/server/tls/dynamic-cert.json
systemctl restart k3s
- k3s 在线安装server
# 先更改node的hostname,hostname必须唯一
export INSTALL_K3S_VERSION=v1.23.14+k3s1
export INSTALL_K3S_EXEC="--kubelet-arg=max-pods=1000 --kube-apiserver-arg=service-node-port-range=1-65535"
curl -sfL https://get.k3s.io | sh -s - --docker
- 添加worker节点
# 先更改node的hostname,hostname必须唯一
export INSTALL_K3S_VERSION=v1.23.14+k3s1
export INSTALL_K3S_EXEC="--kubelet-arg=max-pods=1000"
# K3S_TOKEN创建在/var/lib/rancher/k3s/server/node-token
cat /var/lib/rancher/k3s/server/node-token
export INSTALL_K3S_VERSION=v1.20.13+k3s1
export INSTALL_K3S_EXEC="--kubelet-arg=max-pods=1000 --kube-apiserver-arg=service-node-port-range=1-65535"
curl -sfL https://get.k3s.io | K3S_URL=https://172.20.176.99:6443 K3S_TOKEN=K10de0bfbb06c5d3f98b8f422abb0b31f8764369e98611d1d13586215ae04b0392d::server:854453a23e9704cb2707272fe62c0af2 sh -s - --docker
- k3s 离线安装 , release note
- 下载tar包,对应的下载位置
sudo mkdir -p /var/lib/rancher/k3s/agent/images/
sudo cp ./k3s-airgap-images-$ARCH.tar /var/lib/rancher/k3s/agent/images/
- 下载k3s,https://github.com/k3s-io/k3s/releases, 放到/usr/local/bin, chmod +x k3s
- 下载k3s安装脚本 https://get.k3s.io/ , 命名为install.sh。
INSTALL_K3S_SKIP_DOWNLOAD=true sh install.sh
INSTALL_K3S_SKIP_DOWNLOAD=true INSTALL_K3S_EXEC='server' K3S_DATASTORE_ENDPOINT='mysql://username:password@tcp(hostname:3306)/database-name' ./install.sh
- k3s 安装选项 , https://docs.rancher.cn/docs/k3s/installation/install-options/server-config/_index#%E9%9B%86%E7%BE%A4%E9%80%89%E9%A1%B9
- kubeconfig 不检验server证书
- cluster:
    insecure-skip-tls-verify: true
    server: https://127.0.0.1:443
  name: my-cluster
# 需要删除如下所示的certificate-authority-data
apiVersion: v1
clusters:
- cluster:
    certificate-authority-data: xxx
    server: https://xxx:6443
  name: cluster1
k3s的证书问题
- 证书位置在/var/lib/rancher/k3s/server/tls/ ,里面存放了各种证书。
- mutatingwebhook的caBundle字段是client-ca.crt的base64值。
cat /var/lib/rancher/k3s/server/tls/client-ca.crt|base64 |tr -d '\n'
安装nvidia-plugin
- 参考https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide
- 下载 nerdctl https://github.com/containerd/nerdctl/releases
- 导出变量 export CONTAINERD_ADDRESS="unix:///run/k3s/containerd/containerd.sock"
- 安装device-plugin ,github项目
- k3s用docker安装。参考k3s用docker做运行时
- 部署k3s参考
k3s安装 nvidia device plugin
curl https://releases.rancher.com/install-docker/20.10.sh | sh
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
&& curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt update -y && sudo apt install nvidia-container-runtime
sudo wget https://k3d.io/v4.4.8/usage/guides/cuda/config.toml.tmpl -O /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
ctr image pull docker.io/nvidia/cuda:11.4.0-base-ubuntu20.04
ctr run --rm --gpus 0 -t docker.io/nvidia/cuda:11.4.0-base-ubuntu20.04 nvidia-smi
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.2/deployments/static/nvidia-device-plugin.yml
# 安装完插件可能需要重启。
cat <<EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu
spec:
  restartPolicy: Never
  containers:
    - name: gpu
      image: "nvidia/cuda:11.4.1-base-ubuntu20.04"
      command: [ "/bin/bash", "-c", "nvidia-smi" ]
      # resources:
      #   limits:
      #     nvidia.com/gpu: 1
EOF
kubectl logs -f gpu
k3d默认的containerd配置模板(使用nvidia runtime),路径为 /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
[plugins.opt]
path = "{{ .NodeConfig.Containerd.Opt }}"
[plugins.cri]
stream_server_address = "127.0.0.1"
stream_server_port = "10010"
{{- if .IsRunningInUserNS }}
disable_cgroup = true
disable_apparmor = true
restrict_oom_score_adj = true
{{end}}
{{- if .NodeConfig.AgentConfig.PauseImage }}
sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
{{end}}
{{- if not .NodeConfig.NoFlannel }}
[plugins.cri.cni]
bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
{{end}}
[plugins.cri.containerd.runtimes.runc]
# ---- changed from 'io.containerd.runc.v2' for GPU support
runtime_type = "io.containerd.runtime.v1.linux"
# ---- added for GPU support
[plugins.linux]
runtime = "nvidia-container-runtime"
{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors."{{$k}}"]
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{end}}
{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins.cri.registry.configs."{{$k}}".auth]
{{ if $v.Auth.Username }}username = "{{ $v.Auth.Username }}"{{end}}
{{ if $v.Auth.Password }}password = "{{ $v.Auth.Password }}"{{end}}
{{ if $v.Auth.Auth }}auth = "{{ $v.Auth.Auth }}"{{end}}
{{ if $v.Auth.IdentityToken }}identitytoken = "{{ $v.Auth.IdentityToken }}"{{end}}
{{end}}
{{ if $v.TLS }}
[plugins.cri.registry.configs."{{$k}}".tls]
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{end}}
{{end}}
{{end}}
containerd配置默认的引擎为nvidia-docker
[plugins.cri.cni]
bin_dir = "/var/lib/rancher/k3s/data/c0830be39589f4503a78572e92ac1ff62de74be5bc69c98a71ff0aac3cc8f847/bin"
conf_dir = "/var/lib/rancher/k3s/agent/etc/cni/net.d"
[plugins.cri.containerd.runtimes.runc]
# ---- changed from 'io.containerd.runc.v2' for GPU support
runtime_type = "io.containerd.runtime.v1.linux"
# ---- added for GPU support
[plugins.linux]
runtime = "nvidia-container-runtime"
docker配置nvidia runtime
/etc/docker/daemon.json
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"args": [],
"path": "nvidia-container-runtime"
}
}
}