fix: emergency mode due to unused disk

2026-01-19 00:02:08 +01:00
parent 156419a77f
commit 23663922ec
5 changed files with 197 additions and 81 deletions

View File

@@ -0,0 +1,163 @@
# Ansible Infrastructure
Ansible playbooks for managing the infrastructure.
## Prerequisites
### On the management host (localhost)
1. **Ansible installed:**
```bash
pip3 install ansible
# OR
sudo apt install ansible
```
2. **Install Ansible collections:**
```bash
ansible-galaxy collection install -r requirements.yml
```
3. **Python dependencies (for Vault access):**
**Option A: pipx (recommended for an isolated installation):**
```bash
# Install pipx if not already present
sudo apt install pipx
pipx ensurepath
# Install hvac in an isolated environment
pipx install hvac
```
**Option B: Virtual Environment:**
```bash
# Create the virtual environment
python3 -m venv ~/.venv/ansible
source ~/.venv/ansible/bin/activate
# Install dependencies
pip install -r requirements.txt
# Use Ansible from the virtual environment
# (must be activated before every Ansible run)
```
**Option C: system package (if available):**
```bash
sudo apt install python3-hvac
```
**Option D: --break-system-packages (not recommended):**
```bash
pip3 install --break-system-packages -r requirements.txt
```
**Wichtig:** The `hvac` module is required for Vault lookups, which run on `localhost` (e.g. in `k3s_deploy.yml`).
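As a quick sanity check that the controller can reach Vault via `hvac`, a minimal sketch (it reuses the `VAULT_*` variables from the usage section below; `verify=False` skips TLS verification and is only for testing, point it at the Stabify root CA instead):
```bash
export VAULT_ADDR="https://10.100.30.11:8200"
export VAULT_TOKEN="<your-vault-token>"
python3 - <<'EOF'
import os
import hvac

# verify=False is only for this quick test; use the Stabify root CA for real runs
client = hvac.Client(url=os.environ["VAULT_ADDR"], token=os.environ["VAULT_TOKEN"], verify=False)
print("Vault reachable, token valid:", client.is_authenticated())
EOF
```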
### On the managed hosts (VMs)
The VMs must be reachable via SSH. The required packages are installed automatically via Ansible.
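A quick connectivity check before the first playbook run (Ansible's ad-hoc `ping` module over SSH):
```bash
ansible -i inventory.ini all -m ping
```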
## Installation
### 1. Install Ansible collections
```bash
cd infrastructure/infrastructure/ansible
ansible-galaxy collection install -r requirements.yml
```
### 2. Install Python dependencies
**Option A: pipx (recommended):**
```bash
sudo apt install pipx
pipx ensurepath
pipx install hvac
```
**Option B: system package:**
```bash
sudo apt install python3-hvac
```
**Option C: Virtual Environment:**
```bash
python3 -m venv ~/.venv/ansible
source ~/.venv/ansible/bin/activate
pip install -r requirements.txt
# Note: the virtual environment must be activated before every Ansible run
```
**Wichtig:** These dependencies are required for Vault lookups on `localhost`.
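To verify the installation, it can help to confirm which Python interpreter Ansible uses on the controller and that `hvac` is importable from it:
```bash
# The "python version" line shows the interpreter Ansible uses on the controller
ansible --version

# hvac must be importable from that interpreter (adjust for venv/pipx setups)
python3 -c "import hvac; print(hvac.__file__)"
```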
## Usage
### K3s Cluster Deployment
```bash
export VAULT_TOKEN="<your-vault-token>"
export VAULT_ADDR="https://10.100.30.11:8200"  # Optional, default is set in the playbook
ansible-playbook -i inventory.ini k3s_deploy.yml
```
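For a cautious first run, the standard `ansible-playbook` flags `--check`, `--diff`, and `--limit` can be combined; note that tasks which depend on registered results may not behave identically in check mode:
```bash
# Dry run against the master group only (group name as defined in inventory.ini)
ansible-playbook -i inventory.ini k3s_deploy.yml --check --diff --limit k3s_masters
```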
### Docker Apps Deployment
```bash
ansible-playbook -i inventory.ini deploy.yml
```
### Longhorn Disk Setup
```bash
ansible-playbook -i inventory.ini setup_longhorn_disks.yml
```
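To verify the result afterwards, an ad-hoc check can be run against the nodes (assuming the playbook targets the `k3s_masters` group and mounts the disk at `/var/lib/longhorn`):
```bash
# Show the block device, the mounted filesystem, and the fstab entry on each node
ansible -i inventory.ini k3s_masters -b -m shell \
  -a "lsblk /dev/sdb && df -h /var/lib/longhorn && grep longhorn /etc/fstab"
```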
## Troubleshooting
### "ModuleNotFoundError: No module named 'hvac'"
**Problem:** The Vault lookup fails because `hvac` is not installed.
**Lösung:**
```bash
pip3 install hvac
# OR
pip3 install -r requirements.txt
```
### "sudo: a password is required" bei Vault-Lookup
**Problem:** Task mit `delegate_to: localhost` versucht sudo zu nutzen.
**Lösung:** Vault-Tasks sollten `ansible_become: false` haben (bereits implementiert).
### "Guest Agent not running" in Proxmox
**Problem:** qemu-guest-agent is not running on the VMs.
**Lösung:** Installed automatically via the `common` role. See [Troubleshooting](../docs/internal/operations/troubleshooting-proxmox-guest-agent.md).
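A quick way to check the agent status on all VMs, and optionally from the Proxmox node (`<vmid>` is a placeholder):
```bash
# From the control node: ask systemd on every VM whether the agent is active
ansible -i inventory.ini all -b -m shell -a "systemctl is-active qemu-guest-agent"

# On the Proxmox node itself: ping the agent of a specific VM
qm agent <vmid> ping
```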
## Structure
```
ansible/
├── roles/
│   ├── common/               # Base setup (Docker, users, CA certs, guest agent)
│ ├── k3s/ # K3s Cluster Setup
│ └── users/ # User Management
├── deploy.yml # Docker Apps Deployment (Push Mode)
├── k3s_deploy.yml # K3s Cluster Deployment
├── setup_longhorn_disks.yml # Longhorn Disk Setup
├── inventory.ini # Production Inventory
├── inventory_local.ini       # Local inventory (for testing)
├── requirements.yml # Ansible Collections
└── requirements.txt # Python Dependencies (hvac)
```
## References
- [Troubleshooting Docs](../../docs/internal/operations/)
- [K3s Planning](../../K3S_PLANNING.md)
- [Vault Setup](../../setup_k3s_secrets.sh)

View File

@@ -1,73 +0,0 @@
---
- name: Cleanup Rook Ceph Resources (K8s)
  hosts: k3s_masters[0]
  become: yes
  tasks:
    - name: Delete ArgoCD Applications if they exist
      shell: kubectl delete application -n argocd rook-ceph-cluster rook-ceph-operator --ignore-not-found
      ignore_errors: yes
    - name: Delete Rook Ceph Cluster CR
      shell: kubectl -n rook-ceph delete cephcluster rook-ceph --wait=false --ignore-not-found
    - name: Patch CephCluster finalizer (to force deletion if stuck)
      shell: |
        kubectl -n rook-ceph patch cephcluster rook-ceph --type merge -p '{"metadata":{"finalizers": []}}'
      ignore_errors: yes
    - name: Patch CephBlockPool finalizers
      shell: |
        kubectl -n rook-ceph get cephblockpool -o name | xargs -I {} kubectl -n rook-ceph patch {} --type merge -p '{"metadata":{"finalizers": []}}'
      ignore_errors: yes
    - name: Patch CephObjectStore finalizers
      shell: |
        kubectl -n rook-ceph get cephobjectstore -o name | xargs -I {} kubectl -n rook-ceph patch {} --type merge -p '{"metadata":{"finalizers": []}}'
      ignore_errors: yes
    - name: Patch CephFilesystem finalizers
      shell: |
        kubectl -n rook-ceph get cephfilesystem -o name | xargs -I {} kubectl -n rook-ceph patch {} --type merge -p '{"metadata":{"finalizers": []}}'
      ignore_errors: yes
    - name: Patch all remaining Rook resources finalizers
      shell: |
        kubectl api-resources --verbs=list --namespaced -o name | grep ceph.rook.io | xargs -n 1 kubectl get --show-kind --ignore-not-found -n rook-ceph -o name | xargs -r -n 1 kubectl -n rook-ceph patch --type merge -p '{"metadata":{"finalizers": []}}'
      ignore_errors: yes
    - name: Force delete Namespace rook-ceph (remove finalizers from NS)
      shell: |
        kubectl get namespace rook-ceph -o json | jq '.spec.finalizers=[]' | kubectl replace --raw "/api/v1/namespaces/rook-ceph/finalize" -f -
      ignore_errors: yes
    - name: Delete Rook Ceph Namespace
      shell: kubectl delete namespace rook-ceph --wait=false --ignore-not-found
      ignore_errors: yes
    - name: Delete Rook Ceph CRDs (Global cleanup)
      shell: kubectl delete crd $(kubectl get crd | grep ceph.rook.io | awk '{print $1}')
      ignore_errors: yes

- name: Cleanup Rook Ceph Data on Nodes
  hosts: k3s_masters
  become: yes
  tasks:
    - name: Remove /var/lib/rook directory
      file:
        path: /var/lib/rook
        state: absent
        force: yes
    # WARNING: These commands will WIPE DATA on /dev/sdb
    - name: Zap Disk sdb
      shell: sgdisk --zap-all /dev/sdb || true
      ignore_errors: yes
    - name: WipeFS sdb
      shell: wipefs -a /dev/sdb || true
      ignore_errors: yes
    - name: Mapper clean
      shell: ls /dev/mapper/ceph-* | xargs -I% -- dmsetup remove %
      ignore_errors: yes
      failed_when: false

View File

@@ -1,6 +1,6 @@
 [docker_hosts]
 vm-docker-apps-301.stabify.de ansible_host=10.100.30.11
-vm-docker-traefik-302.stabify.de ansible_host=10.100.30.12
+# vm-docker-traefik-302 removed (Traefik Edge now runs in the k3s cluster)
 # vm-docker-mailcow-300.stabify.de ansible_host=10.100.30.10
 [k3s_masters]

View File

@@ -12,9 +12,16 @@
       - ca-certificates
       - gnupg
       - lsb-release
+      - qemu-guest-agent  # Proxmox Guest Agent
     state: present
     update_cache: true

+- name: "Start qemu-guest-agent service"
+  systemd:
+    name: qemu-guest-agent
+    state: started
+    enabled: yes
+
 - name: "Distribute Stabify Root CA"
   copy:
     src: "{{ playbook_dir }}/../../vault-ca.crt"  # Relative to the playbook root (when pushing)

View File

@@ -19,6 +19,11 @@
         enabled: yes
         state: started

+    - name: Wait for /dev/sdb to be available
+      wait_for:
+        path: /dev/sdb
+        timeout: 30
+
     - name: Check if /dev/sdb exists
       stat:
         path: /dev/sdb
@@ -26,15 +31,29 @@
     - name: Fail if /dev/sdb is missing
       fail:
-        msg: "/dev/sdb was not found on this host!"
+        msg: "/dev/sdb was not found on this host! Ensure the disk is attached in Proxmox/Terraform."
       when: not disk_sdb.stat.exists

-    - name: Create ext4 filesystem on /dev/sdb
+    - name: Check if /dev/sdb has a filesystem
+      command: blkid /dev/sdb
+      register: disk_blkid
+      changed_when: false
+      failed_when: false
+
+    - name: Create ext4 filesystem on /dev/sdb (only if no filesystem exists)
       filesystem:
         fstype: ext4
         dev: /dev/sdb
-        # force: yes # Be careful with force, but since we wiped it, it should be fine.
+      when: disk_blkid.rc != 0
+      # If filesystem already exists (e.g. from a previous partial run), this is idempotent.
+
+    - name: Get UUID of /dev/sdb
+      command: blkid -s UUID -o value /dev/sdb
+      register: disk_uuid
+      changed_when: false
+
+    - name: Display UUID for verification
+      debug:
+        msg: "Disk UUID: {{ disk_uuid.stdout }}"

     - name: Create mount point /var/lib/longhorn
       file:
@@ -42,13 +61,13 @@
         state: directory
         mode: '0755'

-    - name: Mount /dev/sdb to /var/lib/longhorn
+    - name: Mount disk using UUID to /var/lib/longhorn
       mount:
         path: /var/lib/longhorn
-        src: /dev/sdb
+        src: "UUID={{ disk_uuid.stdout }}"
         fstype: ext4
         state: mounted
-        opts: defaults,noatime
+        opts: defaults,noatime,nofail

     - name: Display disk usage for /var/lib/longhorn
       shell: df -h /var/lib/longhorn