From 23663922ecc59a84ba591d347fa61439f0d8dd7f Mon Sep 17 00:00:00 2001
From: Nick Adam
Date: Mon, 19 Jan 2026 00:02:08 +0100
Subject: [PATCH] fix: emergency mode due to unused disk

---
 infrastructure/ansible/README.md              | 163 ++++++++++++++++++
 infrastructure/ansible/cleanup_rook.yml       |  73 --------
 infrastructure/ansible/inventory.ini          |   2 +-
 .../ansible/roles/common/tasks/main.yml       |   7 +
 .../ansible/setup_longhorn_disks.yml          |  33 +++-
 5 files changed, 197 insertions(+), 81 deletions(-)
 create mode 100644 infrastructure/ansible/README.md
 delete mode 100644 infrastructure/ansible/cleanup_rook.yml

diff --git a/infrastructure/ansible/README.md b/infrastructure/ansible/README.md
new file mode 100644
index 0000000..2202197
--- /dev/null
+++ b/infrastructure/ansible/README.md
@@ -0,0 +1,163 @@
+# Ansible Infrastructure
+
+Ansible playbooks for managing the infrastructure.
+
+## Prerequisites
+
+### On the management host (localhost)
+
+1. **Ansible installed:**
+   ```bash
+   pip3 install ansible
+   # OR
+   sudo apt install ansible
+   ```
+
+2. **Install the Ansible collections:**
+   ```bash
+   ansible-galaxy collection install -r requirements.yml
+   ```
+
+3. **Python dependencies (for Vault access):**
+
+   **Option A: pipx (recommended for an isolated installation):**
+   ```bash
+   # Install pipx if it is not present yet
+   sudo apt install pipx
+   pipx ensurepath
+
+   # Install hvac into an isolated environment
+   pipx install hvac
+   ```
+
+   **Option B: virtual environment:**
+   ```bash
+   # Create the virtual environment
+   python3 -m venv ~/.venv/ansible
+   source ~/.venv/ansible/bin/activate
+
+   # Install the dependencies
+   pip install -r requirements.txt
+
+   # Run Ansible from the virtual environment
+   # (it has to be activated before every Ansible run)
+   ```
+
+   **Option C: system package (if available):**
+   ```bash
+   sudo apt install python3-hvac
+   ```
+
+   **Option D: --break-system-packages (not recommended):**
+   ```bash
+   pip3 install --break-system-packages -r requirements.txt
+   ```
+
+   **Important:** The `hvac` module is required for the Vault lookups that run on `localhost` (e.g. in `k3s_deploy.yml`).
+
+### On the managed hosts (VMs)
+
+The VMs must be reachable via SSH. The required packages are installed automatically by Ansible.
+
+## Installation
+
+### 1. Install the Ansible collections
+
+```bash
+cd infrastructure/ansible
+ansible-galaxy collection install -r requirements.yml
+```
+
+### 2. Install the Python dependencies
+
+**Option A: pipx (recommended):**
+```bash
+sudo apt install pipx
+pipx ensurepath
+pipx install hvac
+```
+
+**Option B: system package:**
+```bash
+sudo apt install python3-hvac
+```
+
+**Option C: virtual environment:**
+```bash
+python3 -m venv ~/.venv/ansible
+source ~/.venv/ansible/bin/activate
+pip install -r requirements.txt
+# Note: the virtual environment has to be activated before every Ansible run
+```
+
+**Important:** These dependencies are required for the Vault lookups on `localhost`.
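+
+To confirm that `hvac` and the Vault connection work before running the real playbooks, a minimal lookup play can be run against `localhost`. This is only an illustrative sketch: it assumes the `community.hashi_vault` collection is available (installed via `requirements.yml`), and the secret path `secret/data/k3s:token` is a placeholder, not the path actually used in `k3s_deploy.yml`:
+
+```yaml
+# vault_smoke_test.yml: hypothetical helper playbook, not part of this repo
+- name: Verify Vault access from the control node
+  hosts: localhost
+  gather_facts: false
+  tasks:
+    - name: Read a test secret via the hashi_vault lookup (requires hvac)
+      ansible.builtin.debug:
+        msg: >-
+          {{ lookup('community.hashi_vault.hashi_vault', 'secret/data/k3s:token',
+                    url=lookup('env', 'VAULT_ADDR'), token=lookup('env', 'VAULT_TOKEN')) }}
+```
+
+If this fails with `ModuleNotFoundError: No module named 'hvac'`, revisit the installation options above.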
+
+## Usage
+
+### K3s Cluster Deployment
+
+```bash
+export VAULT_TOKEN="<your-vault-token>"
+export VAULT_ADDR="https://10.100.30.11:8200"  # optional, the playbook sets a default
+ansible-playbook -i inventory.ini k3s_deploy.yml
+```
+
+### Docker Apps Deployment
+
+```bash
+ansible-playbook -i inventory.ini deploy.yml
+```
+
+### Longhorn Disk Setup
+
+```bash
+ansible-playbook -i inventory.ini setup_longhorn_disks.yml
+```
+
+## Troubleshooting
+
+### "ModuleNotFoundError: No module named 'hvac'"
+
+**Problem:** The Vault lookup fails because `hvac` is not installed on the control node.
+
+**Solution:**
+```bash
+pip3 install hvac
+# OR
+pip3 install -r requirements.txt
+```
+If `pip3` refuses to install into the system Python, use one of the pipx or virtual environment options from the prerequisites instead.
+
+### "sudo: a password is required" during Vault lookups
+
+**Problem:** A task with `delegate_to: localhost` tries to use sudo on the control node.
+
+**Solution:** Vault tasks should set `ansible_become: false` (already implemented); see the sketch below.
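+
+For reference, a delegated Vault lookup task can look roughly like the following. This is a sketch only; the variable name `k3s_token` and the secret path are illustrative, not the actual task from `k3s_deploy.yml`:
+
+```yaml
+- name: Fetch the k3s token from Vault (illustrative sketch)
+  ansible.builtin.set_fact:
+    k3s_token: "{{ lookup('community.hashi_vault.hashi_vault', 'secret/data/k3s:token') }}"
+  delegate_to: localhost
+  become: false   # equivalent to ansible_become: false, so no sudo on the control node
+  run_once: true
+```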
+
+### "Guest Agent not running" in Proxmox
+
+**Problem:** qemu-guest-agent is not running on the VMs.
+
+**Solution:** It is installed automatically by the `common` role. See [Troubleshooting](../docs/internal/operations/troubleshooting-proxmox-guest-agent.md).
+
+## Structure
+
+```
+ansible/
+├── roles/
+│   ├── common/               # Base setup (Docker, users, CA certs, guest agent)
+│   ├── k3s/                  # K3s cluster setup
+│   └── users/                # User management
+├── deploy.yml                # Docker Apps Deployment (push mode)
+├── k3s_deploy.yml            # K3s Cluster Deployment
+├── setup_longhorn_disks.yml  # Longhorn Disk Setup
+├── inventory.ini             # Production inventory
+├── inventory_local.ini       # Local inventory (for tests)
+├── requirements.yml          # Ansible collections
+└── requirements.txt          # Python dependencies (hvac)
+```
+
+## References
+
+- [Troubleshooting Docs](../../docs/internal/operations/)
+- [K3s Planning](../../K3S_PLANNING.md)
+- [Vault Setup](../../setup_k3s_secrets.sh)
diff --git a/infrastructure/ansible/cleanup_rook.yml b/infrastructure/ansible/cleanup_rook.yml
deleted file mode 100644
index fe7f7e9..0000000
--- a/infrastructure/ansible/cleanup_rook.yml
+++ /dev/null
@@ -1,73 +0,0 @@
----
-- name: Cleanup Rook Ceph Resources (K8s)
-  hosts: k3s_masters[0]
-  become: yes
-  tasks:
-    - name: Delete ArgoCD Applications if they exist
-      shell: kubectl delete application -n argocd rook-ceph-cluster rook-ceph-operator --ignore-not-found
-      ignore_errors: yes
-
-    - name: Delete Rook Ceph Cluster CR
-      shell: kubectl -n rook-ceph delete cephcluster rook-ceph --wait=false --ignore-not-found
-
-    - name: Patch CephCluster finalizer (to force deletion if stuck)
-      shell: |
-        kubectl -n rook-ceph patch cephcluster rook-ceph --type merge -p '{"metadata":{"finalizers": []}}'
-      ignore_errors: yes
-
-    - name: Patch CephBlockPool finalizers
-      shell: |
-        kubectl -n rook-ceph get cephblockpool -o name | xargs -I {} kubectl -n rook-ceph patch {} --type merge -p '{"metadata":{"finalizers": []}}'
-      ignore_errors: yes
-
-    - name: Patch CephObjectStore finalizers
-      shell: |
-        kubectl -n rook-ceph get cephobjectstore -o name | xargs -I {} kubectl -n rook-ceph patch {} --type merge -p '{"metadata":{"finalizers": []}}'
-      ignore_errors: yes
-
-    - name: Patch CephFilesystem finalizers
-      shell: |
-        kubectl -n rook-ceph get cephfilesystem -o name | xargs -I {} kubectl -n rook-ceph patch {} --type merge -p '{"metadata":{"finalizers": []}}'
-      ignore_errors: yes
-
-    - name: Patch all remaining Rook resources finalizers
-      shell: |
-        kubectl api-resources --verbs=list --namespaced -o name | grep ceph.rook.io | xargs -n 1 kubectl get --show-kind --ignore-not-found -n rook-ceph -o name | xargs -r -n 1 kubectl -n rook-ceph patch --type merge -p '{"metadata":{"finalizers": []}}'
-      ignore_errors: yes
-
-    - name: Force delete Namespace rook-ceph (remove finalizers from NS)
-      shell: |
-        kubectl get namespace rook-ceph -o json | jq '.spec.finalizers=[]' | kubectl replace --raw "/api/v1/namespaces/rook-ceph/finalize" -f -
-      ignore_errors: yes
-
-    - name: Delete Rook Ceph Namespace
-      shell: kubectl delete namespace rook-ceph --wait=false --ignore-not-found
-      ignore_errors: yes
-
-    - name: Delete Rook Ceph CRDs (Global cleanup)
-      shell: kubectl delete crd $(kubectl get crd | grep ceph.rook.io | awk '{print $1}')
-      ignore_errors: yes
-
-- name: Cleanup Rook Ceph Data on Nodes
-  hosts: k3s_masters
-  become: yes
-  tasks:
-    - name: Remove /var/lib/rook directory
-      file:
-        path: /var/lib/rook
-        state: absent
-        force: yes
-
-    # WARNING: These commands will WIPE DATA on /dev/sdb
-    - name: Zap Disk sdb
-      shell: sgdisk --zap-all /dev/sdb || true
-      ignore_errors: yes
-
-    - name: WipeFS sdb
-      shell: wipefs -a /dev/sdb || true
-      ignore_errors: yes
-
-    - name: Mapper clean
-      shell: ls /dev/mapper/ceph-* | xargs -I% -- dmsetup remove %
-      ignore_errors: yes
-      failed_when: false
diff --git a/infrastructure/ansible/inventory.ini b/infrastructure/ansible/inventory.ini
index 71c1c97..20f9be3 100644
--- a/infrastructure/ansible/inventory.ini
+++ b/infrastructure/ansible/inventory.ini
@@ -1,6 +1,6 @@
 [docker_hosts]
 vm-docker-apps-301.stabify.de ansible_host=10.100.30.11
-vm-docker-traefik-302.stabify.de ansible_host=10.100.30.12
+# vm-docker-traefik-302 removed (Traefik Edge now runs in the k3s cluster)
 # vm-docker-mailcow-300.stabify.de ansible_host=10.100.30.10
 
 [k3s_masters]
diff --git a/infrastructure/ansible/roles/common/tasks/main.yml b/infrastructure/ansible/roles/common/tasks/main.yml
index 8ac5c25..3122187 100644
--- a/infrastructure/ansible/roles/common/tasks/main.yml
+++ b/infrastructure/ansible/roles/common/tasks/main.yml
@@ -12,9 +12,16 @@
       - ca-certificates
       - gnupg
       - lsb-release
+      - qemu-guest-agent  # Proxmox Guest Agent
     state: present
     update_cache: true
 
+- name: "Start the qemu-guest-agent service"
+  systemd:
+    name: qemu-guest-agent
+    state: started
+    enabled: yes
+
 - name: "Verteile Stabify Root CA"
   copy:
     src: "{{ playbook_dir }}/../../vault-ca.crt"  # Relativ zum Playbook-Root (wenn Push)
diff --git a/infrastructure/ansible/setup_longhorn_disks.yml b/infrastructure/ansible/setup_longhorn_disks.yml
index cb29c15..49f1971 100644
--- a/infrastructure/ansible/setup_longhorn_disks.yml
+++ b/infrastructure/ansible/setup_longhorn_disks.yml
@@ -19,6 +19,11 @@
         enabled: yes
         state: started
 
+    - name: Wait for /dev/sdb to be available
+      wait_for:
+        path: /dev/sdb
+        timeout: 30
+
     - name: Check if /dev/sdb exists
       stat:
         path: /dev/sdb
@@ -26,15 +31,29 @@
 
     - name: Fail if /dev/sdb is missing
       fail:
-        msg: "/dev/sdb was not found on this host!"
+        msg: "/dev/sdb was not found on this host! Ensure the disk is attached in Proxmox/Terraform."
       when: not disk_sdb.stat.exists
 
-    - name: Create ext4 filesystem on /dev/sdb
+    - name: Check if /dev/sdb already has a filesystem
+      command: blkid /dev/sdb
+      register: disk_blkid
+      changed_when: false
+      failed_when: false
+
+    - name: Create ext4 filesystem on /dev/sdb (only if no filesystem exists)
       filesystem:
         fstype: ext4
         dev: /dev/sdb
-      # force: yes # Be careful with force, but since we wiped it, it should be fine.
-      # If filesystem already exists (e.g. from a previous partial run), this is idempotent.
+      when: disk_blkid.rc != 0
+
+    - name: Get UUID of /dev/sdb
+      command: blkid -s UUID -o value /dev/sdb
+      register: disk_uuid
+      changed_when: false
+
+    - name: Display UUID for verification
+      debug:
+        msg: "Disk UUID: {{ disk_uuid.stdout }}"
 
     - name: Create mount point /var/lib/longhorn
       file:
         path: /var/lib/longhorn
         state: directory
         mode: '0755'
 
-    - name: Mount /dev/sdb to /var/lib/longhorn
+    - name: Mount disk using UUID to /var/lib/longhorn
       mount:
         path: /var/lib/longhorn
-        src: /dev/sdb
+        src: "UUID={{ disk_uuid.stdout }}"
         fstype: ext4
         state: mounted
-        opts: defaults,noatime
+        # nofail keeps the node from dropping into emergency mode at boot if the disk is missing
+        opts: defaults,noatime,nofail
 
     - name: Display disk usage for /var/lib/longhorn
       shell: df -h /var/lib/longhorn