1. k3s 证书轮换
# 如需更改 tls-san,编辑 /etc/systemd/system/k3s.service,在启动命令后添加 --tls-san=<ip>(等号两侧不能有空格)

systemctl daemon-reload

# 在其中一个节点上执行
k3s kubectl --insecure-skip-tls-verify=true delete secret k3s-serving -n kube-system

# 在每个节点上执行
rm -rf /var/lib/rancher/k3s/server/tls/dynamic-cert.json
systemctl restart k3s
  1. k3s 在线安装server
# 先更改node的hostname,hostname必须唯一
export INSTALL_K3S_VERSION=v1.23.14+k3s1 
export INSTALL_K3S_EXEC="--kubelet-arg=max-pods=1000 --kube-apiserver-arg=service-node-port-range=1-65535"
curl -sfL https://get.k3s.io | sh -s - --docker
  1. 添加worker节点
# 先更改node的hostname,hostname必须唯一
export INSTALL_K3S_VERSION=v1.23.14+k3s1
export INSTALL_K3S_EXEC="--kubelet-arg=max-pods=1000"

# K3S_TOKEN 保存在 server 节点的 /var/lib/rancher/k3s/server/node-token 文件中,先在 server 节点上查看
cat /var/lib/rancher/k3s/server/node-token
export INSTALL_K3S_VERSION=v1.20.13+k3s1 
export INSTALL_K3S_EXEC="--kubelet-arg=max-pods=1000 --kube-apiserver-arg=service-node-port-range=1-65535"
curl -sfL https://get.k3s.io | K3S_URL=https://172.20.176.99:6443 K3S_TOKEN=K10de0bfbb06c5d3f98b8f422abb0b31f8764369e98611d1d13586215ae04b0392d::server:854453a23e9704cb2707272fe62c0af2 sh -s - --docker
  1. k3s 离线安装 , release note
  2. 下载tar包对应的下载位置
  3. sudo mkdir -p /var/lib/rancher/k3s/agent/images/ && sudo cp ./k3s-airgap-images-$ARCH.tar /var/lib/rancher/k3s/agent/images/
  4. 下载k3s,https://github.com/k3s-io/k3s/releases, 放到/usr/local/bin, chmod +x k3s
  5. 下载k3s安装脚本 https://get.k3s.io/ , 命名为install.sh。
  6. INSTALL_K3S_SKIP_DOWNLOAD=true sh install.sh
  7. INSTALL_K3S_SKIP_DOWNLOAD=true INSTALL_K3S_EXEC='server' K3S_DATASTORE_ENDPOINT='mysql://username:password@tcp(hostname:3306)/database-name' ./install.sh
  8. k3s 安装选项 , https://docs.rancher.cn/docs/k3s/installation/install-options/server-config/_index#%E9%9B%86%E7%BE%A4%E9%80%89%E9%A1%B9
  9. kubeconfig 不检验server证书
- cluster:
    insecure-skip-tls-verify: true
    server: https://127.0.0.1:443
  name: my-cluster

# 需要删除如下所示的 certificate-authority-data 字段
apiVersion: v1
clusters:
- cluster:
    certificate-authority-data: xxx
    server: https://xxx:6443
  name: cluster1

k3s的证书问题

  1. 证书位置在/var/lib/rancher/k3s/server/tls/ ,里面存放了各种证书。
  2. MutatingWebhookConfiguration 的 caBundle 字段是 client-ca.crt 的 base64 编码值,可用如下命令生成: cat /var/lib/rancher/k3s/server/tls/client-ca.crt | base64 | tr -d '\n'

安装nvidia-plugin

  1. 参考https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide
  2. 下载 nerdctl https://github.com/containerd/nerdctl/releases
  3. 导出变量 export CONTAINERD_ADDRESS="unix:///run/k3s/containerd/containerd.sock"
  4. 安装device-plugin ,github项目
  5. k3s用docker安装。参考k3s用docker做运行时
  6. 部署k3s参考

k3s安装 nvidia device plugin


curl https://releases.rancher.com/install-docker/20.10.sh | sh
distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \
      && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
      && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | \
            sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
            sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt update -y && sudo apt install nvidia-container-runtime

sudo wget https://k3d.io/v4.4.8/usage/guides/cuda/config.toml.tmpl -O /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl

ctr image pull  docker.io/nvidia/cuda:11.4.0-base-ubuntu20.04 
ctr  run --rm --gpus 0 -t docker.io/nvidia/cuda:11.4.0-base-ubuntu20.04 nvidia-smi
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.2/deployments/static/nvidia-device-plugin.yml
# 安装完插件可能需要重启。
cat <<EOF | kubectl create -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu
spec:
  restartPolicy: Never
  containers:
    - name: gpu
      image: "nvidia/cuda:11.4.1-base-ubuntu20.04"
      command: [ "/bin/bash", "-c", "nvidia-smi" ]
      # resources:
      #   limits:
      #     nvidia.com/gpu: 1
EOF
kubectl logs -f gpu

让 containerd 默认使用 nvidia runtime 的模板(来自 k3d 文档),保存为 /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl

[plugins.opt]
  path = "{{ .NodeConfig.Containerd.Opt }}"

[plugins.cri]
  stream_server_address = "127.0.0.1"
  stream_server_port = "10010"

{{- if .IsRunningInUserNS }}
  disable_cgroup = true
  disable_apparmor = true
  restrict_oom_score_adj = true
{{end}}

{{- if .NodeConfig.AgentConfig.PauseImage }}
  sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
{{end}}

{{- if not .NodeConfig.NoFlannel }}
[plugins.cri.cni]
  bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
  conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
{{end}}

[plugins.cri.containerd.runtimes.runc]
  # ---- changed from 'io.containerd.runc.v2' for GPU support
  runtime_type = "io.containerd.runtime.v1.linux"

# ---- added for GPU support
[plugins.linux]
  runtime = "nvidia-container-runtime"

{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors."{{$k}}"]
  endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{end}}

{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins.cri.registry.configs."{{$k}}".auth]
  {{ if $v.Auth.Username }}username = "{{ $v.Auth.Username }}"{{end}}
  {{ if $v.Auth.Password }}password = "{{ $v.Auth.Password }}"{{end}}
  {{ if $v.Auth.Auth }}auth = "{{ $v.Auth.Auth }}"{{end}}
  {{ if $v.Auth.IdentityToken }}identitytoken = "{{ $v.Auth.IdentityToken }}"{{end}}
{{end}}
{{ if $v.TLS }}
[plugins.cri.registry.configs."{{$k}}".tls]
  {{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
  {{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
  {{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{end}}
{{end}}
{{end}}

containerd 配置默认运行时为 nvidia-container-runtime

[plugins.cri.cni]
  bin_dir = "/var/lib/rancher/k3s/data/c0830be39589f4503a78572e92ac1ff62de74be5bc69c98a71ff0aac3cc8f847/bin"
  conf_dir = "/var/lib/rancher/k3s/agent/etc/cni/net.d"


[plugins.cri.containerd.runtimes.runc]
  # ---- changed from 'io.containerd.runc.v2' for GPU support
  runtime_type = "io.containerd.runtime.v1.linux"

# ---- added for GPU support
[plugins.linux]
  runtime = "nvidia-container-runtime"

docker配置nvidia runtime

/etc/docker/daemon.json

{
    "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "args": [],
            "path": "nvidia-container-runtime"
        }
    }
}