N-Docs

Best Practices

Essential best practices for Proxmox VE deployment, management, and maintenance

Best Practices

Following established best practices ensures optimal performance, security, and reliability of your Proxmox VE infrastructure. This guide covers essential recommendations for all aspects of Proxmox management.

Infrastructure Planning

Hardware Considerations

Proper hardware selection and configuration form the foundation of a reliable virtualization platform.

CPU Requirements
Memory Planning
Storage Strategy

Network Architecture

# /etc/network/interfaces — three bridges separating traffic classes.
# NOTE: bond0/bond1 referenced below are defined in the bonding snippet
# that follows this one.

# Management network (isolated)
auto vmbr0
iface vmbr0 inet static
    address 192.168.1.100/24
    gateway 192.168.1.1
    bridge-ports bond0
    bridge-stp off
    # fd 0: no forwarding delay, ports go active immediately
    bridge-fd 0
    # Management traffic only

# VM network (production)
auto vmbr1
iface vmbr1 inet manual
    bridge-ports bond1
    bridge-stp off
    bridge-fd 0
    # vlan-aware: guests can tag VLANs on this single bridge
    bridge-vlan-aware yes
    # VM traffic with VLAN support

# Storage network (dedicated)
auto vmbr2
iface vmbr2 inet static
    address 10.0.0.100/24
    bridge-ports enp3s0
    bridge-stp off
    bridge-fd 0
    # Storage and replication traffic
# Network bonding for redundancy
auto bond0
iface bond0 inet manual
    bond-slaves enp1s0 enp2s0
    # miimon 100: check link state every 100 ms
    bond-miimon 100
    # active-backup: failover only, no switch support required
    bond-mode active-backup
    bond-primary enp1s0

# LACP bonding for performance
# NOTE: 802.3ad requires matching LACP configuration on the switch.
auto bond1
iface bond1 inet manual
    bond-slaves enp4s0 enp5s0
    bond-miimon 100
    bond-mode 802.3ad
    # layer2+3: hash on MAC+IP to spread flows across members
    bond-xmit-hash-policy layer2+3
# Network tuning for high throughput.
# Use a sysctl.d drop-in instead of appending to /etc/sysctl.conf:
# re-running this snippet then rewrites one file instead of piling up
# duplicate entries.
cat > /etc/sysctl.d/99-network-tuning.conf << 'EOF'
net.core.rmem_max = 134217728
net.core.wmem_max = 134217728
net.ipv4.tcp_rmem = 4096 87380 134217728
net.ipv4.tcp_wmem = 4096 65536 134217728
EOF

# Apply settings (reads /etc/sysctl.d/ as well as sysctl.conf)
sysctl --system

Security Best Practices

Access Control

Implement defense-in-depth security strategies to protect your virtualization infrastructure.

# Create dedicated admin user (realm 'pve' = Proxmox VE authentication)
pveum user add admin@pve --firstname Admin --lastname User --email [email protected]

# Create custom role with limited permissions
pveum role add VMManager --privs "VM.Allocate,VM.Clone,VM.Config.CDROM,VM.Config.CPU,VM.Config.Cloudinit,VM.Config.Disk,VM.Config.HWType,VM.Config.Memory,VM.Config.Network,VM.Config.Options,VM.Console,VM.Monitor,VM.PowerMgmt,VM.Snapshot,VM.Snapshot.Rollback"

# Assign role to user for specific VMs
# (ACL path /vms/100 scopes the role to VM 100 only)
pveum acl modify /vms/100 --users admin@pve --roles VMManager

# Enable two-factor authentication
# NOTE(review): --keys expects the user's TOTP secret in otpauth URI
# form — normally generated via the GUI; confirm before scripting this.
pveum user modify admin@pve --keys "otpauth://totp/..."

Network Security

# Configure the datacenter firewall in a single write.
# NOTE: the original wrote enable/policy lines with `echo >`, then the
# `cat > cluster.fw` heredoc truncated the file and silently discarded
# them. Write everything at once, with the options under the required
# [OPTIONS] section header.
cat > /etc/pve/firewall/cluster.fw << 'EOF'
[OPTIONS]
enable: 1
policy_in: DROP
policy_out: ACCEPT

[group management]
IN SSH(ACCEPT) -source 192.168.1.0/24
IN ACCEPT -p tcp -dport 8006 -source 192.168.1.0/24

[group web-servers]
IN HTTP(ACCEPT)
IN HTTPS(ACCEPT)
IN SSH(ACCEPT) -source 192.168.1.0/24
EOF

# Apply the management group to a node (single write, same reasoning)
cat > /etc/pve/nodes/proxmox1/host.fw << 'EOF'
[OPTIONS]
enable: 1

[RULES]
GROUP management
EOF
# SSH configuration hardening.
# Use a drop-in under sshd_config.d (included by default on Debian 11+)
# so re-running never appends duplicate directives to sshd_config, and
# validate the result before restarting to avoid locking yourself out.
cat > /etc/ssh/sshd_config.d/99-hardening.conf << 'EOF'
# Disable root login
PermitRootLogin no

# Use key-based authentication only
PasswordAuthentication no
PubkeyAuthentication yes

# Limit users
AllowUsers admin

# Change default port
# NOTE: the firewall SSH macro assumes port 22 — open 2222 there first.
Port 2222

# Disable unused features
X11Forwarding no
AllowTcpForwarding no
EOF

# Only restart if the merged configuration parses cleanly
sshd -t && systemctl restart sshd
# Generate custom SSL certificate.
# NOTE: custom certificates belong in pveproxy-ssl.key/.pem —
# pve-ssl.* is issued by the cluster-internal CA, and overwriting it
# breaks node-to-node communication.
openssl req -x509 -newkey rsa:4096 \
    -keyout /etc/pve/local/pveproxy-ssl.key \
    -out /etc/pve/local/pveproxy-ssl.pem -days 365 -nodes

# Reload the web UI so it picks up the new certificate
systemctl restart pveproxy

# Or use Let's Encrypt
pvenode acme account register default [email protected]
pvenode acme plugin add dns cloudflare --data api_token=your_token
pvenode config set --acme domains=proxmox.example.com
pvenode acme cert order

Performance Optimization

Host System Tuning

CPU Optimization
Memory Management
I/O Optimization
# CPU performance tuning (the shell expands the glob, so tee writes the
# governor to every CPU's cpufreq node)
echo 'performance' | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor

# Memory tuning — sysctl.d drop-in avoids duplicate lines on re-run
cat > /etc/sysctl.d/99-memory-tuning.conf << 'EOF'
vm.swappiness = 10
vm.vfs_cache_pressure = 50
EOF
sysctl --system

# I/O scheduler optimization.
# Modern (blk-mq) kernels expose 'none' and 'mq-deadline'; the legacy
# single-queue names 'noop'/'deadline' no longer exist, so the original
# rules would silently fail to apply.
cat > /etc/udev/rules.d/60-ssd-scheduler.rules << 'EOF'
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="0", ATTR{queue/scheduler}="none"
ACTION=="add|change", KERNEL=="sd[a-z]", ATTR{queue/rotational}=="1", ATTR{queue/scheduler}="mq-deadline"
EOF

VM Configuration Best Practices

# Use host CPU type for best performance
# (exposes all host CPU flags to the guest; prevents live migration
# between nodes with different CPU models)
qm set 100 --cpu host

# Enable CPU hotplug
qm set 100 --hotplug cpu,memory,disk,network

# CPU pinning for critical VMs
qm set 100 --cpus 4 --affinity 4,5,6,7

# NUMA topology for large VMs
qm set 100 --numa 1 --cpus 8 --memory 16384
# Disable ballooning for performance-critical VMs
qm set 100 --balloon 0

# Enable huge pages for large VMs
qm set 100 --hugepages 1024

# Memory shares for prioritization
qm set 100 --shares 2000
# Use VirtIO SCSI with multiple queues
# NOTE: each `qm set --scsi0 ...` below replaces the ENTIRE drive
# option string — combine iothread/queues/discard/cache into a single
# invocation in production; these lines show the options individually.
qm set 100 --scsihw virtio-scsi-pci --scsi0 local-lvm:vm-100-disk-0,iothread=1,queues=4

# Enable discard for thin provisioning
qm set 100 --scsi0 local-lvm:vm-100-disk-0,discard=on

# Cache settings based on use case
# Write-back for performance (with UPS)
qm set 100 --scsi0 local-lvm:vm-100-disk-0,cache=writeback

# None for safety (without UPS)
qm set 100 --scsi0 local-lvm:vm-100-disk-0,cache=none

Monitoring and Maintenance

System Monitoring

Proactive monitoring helps identify issues before they impact production workloads.

# Install monitoring tools (-y keeps the install non-interactive)
apt update && apt install -y htop iotop nethogs

# System resource monitoring script
#!/bin/bash
# /usr/local/bin/system-monitor.sh
# Samples CPU, memory, disk and load, appends one log line, and mails
# an alert when CPU usage exceeds the threshold.

LOG_FILE="/var/log/system-monitor.log"
DATE=$(date '+%Y-%m-%d %H:%M:%S')

# CPU usage — LC_ALL=C pins the decimal separator so awk comparison
# below works in any locale; field 2 of the "%Cpu(s):" line is %user
CPU_USAGE=$(LC_ALL=C top -bn1 | awk '/Cpu\(s\)/ {print $2; exit}')

# Memory usage (used/total as a percentage)
MEM_USAGE=$(free | awk '/^Mem/ {printf "%.2f", $3/$2 * 100.0}')

# Disk usage of the root filesystem, numeric percent
DISK_USAGE=$(df -h / | awk 'NR==2 {gsub(/%/, "", $5); print $5}')

# Load average (1/5/15 min, as reported by uptime)
LOAD_AVG=$(uptime | awk -F'load average:' '{print $2}')

echo "$DATE - CPU: ${CPU_USAGE}%, Memory: ${MEM_USAGE}%, Disk: ${DISK_USAGE}%, Load:${LOAD_AVG}" >> "$LOG_FILE"

# Alert if thresholds exceeded — awk handles the float comparison, so
# there is no dependency on bc (not installed by default)
if awk -v u="$CPU_USAGE" 'BEGIN {exit !(u > 80)}'; then
    echo "High CPU usage: $CPU_USAGE%" | mail -s "Alert: High CPU" [email protected]
fi

Automated Maintenance

# Automated update script
#!/bin/bash
# /usr/local/bin/update-system.sh
# Applies pending package updates with a pre-update ZFS snapshot,
# holding kernel packages back for manual review.

# Update package lists
apt update

# Check for available updates
UPDATES=$(apt list --upgradable 2>/dev/null | grep -c upgradable)

if [ "$UPDATES" -gt 0 ]; then
    echo "Updates available: $UPDATES packages"

    # Create system snapshot before updates — only when the root
    # dataset actually exists (skips cleanly on non-ZFS installs)
    if zfs list rpool/ROOT/pve-1 >/dev/null 2>&1; then
        zfs snapshot "rpool/ROOT/pve-1@pre-update-$(date +%Y%m%d)"
    fi

    # apt has no --exclude option; hold the kernel packages so they
    # stay back for manual review, then release the hold afterwards.
    # Word-splitting of $KERNEL_PKGS is intentional (one arg per pkg).
    KERNEL_PKGS=$(dpkg-query -W -f='${Package}\n' 'pve-kernel-*' 2>/dev/null)
    [ -n "$KERNEL_PKGS" ] && apt-mark hold $KERNEL_PKGS
    apt upgrade -y
    [ -n "$KERNEL_PKGS" ] && apt-mark unhold $KERNEL_PKGS

    # Log update completion
    echo "$(date): System updated, $UPDATES packages" >> /var/log/system-updates.log
fi

# Schedule via cron (weekly)
# 0 2 * * 0 /usr/local/bin/update-system.sh
# Configure log rotation
# (vzdump backup log: 12 weekly archives; custom monitor log: 30 daily
# archives; delaycompress keeps the newest rotation readable as text)
cat > /etc/logrotate.d/proxmox-custom << 'EOF'
/var/log/vzdump.log {
    weekly
    rotate 12
    compress
    delaycompress
    missingok
    notifempty
    create 644 root root
}

/var/log/system-monitor.log {
    daily
    rotate 30
    compress
    delaycompress
    missingok
    notifempty
    create 644 root root
}
EOF
# Automated cleanup script
#!/bin/bash
# /usr/local/bin/cleanup-system.sh
# Removes aged backups, logs, caches, temp files and auto-snapshots.

# Clean old backups (keep last 30 days)
find /var/lib/vz/dump -name "*.vma*" -mtime +30 -delete

# Clean old logs
journalctl --vacuum-time=30d

# Clean package cache
apt autoremove -y
apt autoclean

# Clean temporary files
find /tmp -type f -atime +7 -delete
find /var/tmp -type f -atime +7 -delete

# ZFS cleanup (if applicable).
# -H -o name drops the header row the original piped into grep;
# matching "auto" only after '@' avoids destroying snapshots of
# datasets whose *name* merely contains "auto"; xargs -r skips the
# destroy call entirely when nothing matches.
if command -v zfs >/dev/null 2>&1; then
    zfs list -t snapshot -H -o name | grep '@.*auto' | xargs -r -n1 zfs destroy
fi

echo "$(date): System cleanup completed" >> /var/log/cleanup.log

# Schedule daily
# 0 3 * * * /usr/local/bin/cleanup-system.sh

Backup and Recovery

Backup Strategy Implementation

3-2-1 Rule
Testing Schedule
# Comprehensive backup script
#!/bin/bash
# /usr/local/bin/backup-vms.sh
# Backs up all VMs to the configured storage, verifies recent backup
# files exist, and prunes old dumps.

BACKUP_STORAGE="pbs-main"
LOG_FILE="/var/log/backup-custom.log"

# Log a message with the *current* timestamp.
# (The original captured $DATE once at startup, so every entry of a
# long-running backup carried the same stale time.)
log_message() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" >> "$LOG_FILE"
}

# Pre-backup checks: abort early if the target storage is unreachable
if ! pvesm status "$BACKUP_STORAGE" >/dev/null 2>&1; then
    log_message "ERROR: Backup storage $BACKUP_STORAGE not available"
    exit 1
fi

# Backup all VMs — test the command directly instead of inspecting $?
log_message "Starting backup process"
if vzdump --all --storage "$BACKUP_STORAGE" --mode snapshot --compress lzo --mailto [email protected]; then
    log_message "Backup completed successfully"
else
    log_message "ERROR: Backup failed"
    exit 1
fi

# Verify recent backups (dumps created within the last 24 h)
RECENT_BACKUPS=$(find /var/lib/vz/dump -name "*.vma*" -mtime -1 | wc -l)
log_message "Recent backups found: $RECENT_BACKUPS"

# Cleanup old backups
find /var/lib/vz/dump -name "*.vma*" -mtime +7 -delete
log_message "Old backups cleaned up"

High Availability and Clustering

Cluster Best Practices

Proper cluster configuration ensures seamless failover and resource sharing across nodes.

# Cluster network configuration
# Dedicated cluster network recommended
# (address must match the node's ring0_addr in corosync.conf)
auto eth1
iface eth1 inet static
    address 10.0.1.100/24
    # Cluster communication only
# Corosync configuration optimization.
# NOTE: never *append* (>>) a full config to corosync.conf — duplicated
# totem/nodelist sections make the file invalid. Write the complete
# file, and bump config_version on every change so the cluster
# propagates it. Edit via /etc/pve/corosync.conf on ONE node only.
cat > /etc/pve/corosync.conf << 'EOF'
totem {
    version: 2
    cluster_name: production-cluster
    config_version: 2
    transport: knet
    crypto_cipher: aes256
    crypto_hash: sha256
}

nodelist {
    node {
        ring0_addr: 10.0.1.100
        name: node1
        nodeid: 1
    }
    node {
        ring0_addr: 10.0.1.101
        name: node2
        nodeid: 2
    }
    node {
        ring0_addr: 10.0.1.102
        name: node3
        nodeid: 3
    }
}

quorum {
    provider: corosync_votequorum
    two_node: 0
}
EOF

Resource Management

# Create resource pools for organization
pvesh create /pools --poolid production --comment "Production VMs"
pvesh create /pools --poolid development --comment "Development VMs"
pvesh create /pools --poolid testing --comment "Testing Environment"

# Assign VMs to pools
qm set 100 --pool production
qm set 200 --pool development
# Configure HA groups (node:priority — higher number preferred;
# --restricted keeps resources on listed nodes only)
ha-manager groupadd production --nodes node1:2,node2:1,node3:0 --restricted

# Add VMs to HA
ha-manager add vm:100 --group production --max_restart 3 --max_relocate 1

# Configure fencing
ha-manager set vm:100 --state started
# Live migration settings.
# NOTE: the original also appended 'migration_unsecure: 1', which
# contradicts the secure migration type configured here — use one or
# the other; secure (SSH-tunneled) is the recommended default.
echo 'migration: secure,network=10.0.1.0/24' >> /etc/pve/datacenter.cfg

# Bandwidth limiting for migrations (KiB/s)
echo 'bwlimit: migration=100000' >> /etc/pve/datacenter.cfg

Documentation and Change Management

Infrastructure Documentation

Network Diagram
VM Inventory
Procedures

Change Management Process

# Change tracking script
#!/bin/bash
# /usr/local/bin/log-change.sh
# Appends a timestamped, attributed entry to the change log.

CHANGE_LOG="/var/log/infrastructure-changes.log"
DATE=$(date '+%Y-%m-%d %H:%M:%S')
# Dedicated variable name — assigning to USER would clobber the login
# environment variable of the same name for anything run afterwards.
CHANGE_USER=$(whoami)
CHANGE_DESC="$1"

if [ -z "$CHANGE_DESC" ]; then
    echo "Usage: $0 'Description of change'"
    exit 1
fi

echo "$DATE - $CHANGE_USER - $CHANGE_DESC" >> "$CHANGE_LOG"

# Optional: Send notification
echo "Change logged: $CHANGE_DESC" | mail -s "Infrastructure Change" [email protected]

Troubleshooting Guidelines

Common Issues and Solutions

Always create backups or snapshots before attempting major troubleshooting steps.

# Performance diagnostic script
#!/bin/bash
# Dumps a quick CPU / memory / disk / network / process overview.
echo "=== System Performance Diagnostics ==="
echo "Date: $(date)"
echo

echo "=== CPU Information ==="
lscpu | grep -E "(Model name|CPU\(s\)|Thread|Core)"
echo "Load Average: $(uptime | awk -F'load average:' '{print $2}')"
echo

echo "=== Memory Usage ==="
free -h
echo

echo "=== Disk I/O ==="
# iostat ships with the sysstat package, which is not installed by
# default — degrade gracefully instead of erroring out.
if command -v iostat >/dev/null 2>&1; then
    iostat -x 1 3
else
    echo "iostat not available (apt install sysstat)"
fi
echo

echo "=== Network Usage ==="
# Only the Proxmox web UI listener is of interest here
ss -tuln | grep :8006
echo

echo "=== Top Processes ==="
ps aux --sort=-%cpu | head -10
# Network diagnostic script
#!/bin/bash
# Dumps interface, bridge, routing, connectivity and firewall state.
echo "=== Network Diagnostics ==="
echo

echo "=== Interface Status ==="
ip addr show
echo

echo "=== Bridge Configuration ==="
# brctl (bridge-utils) is deprecated and often absent on modern
# Debian/Proxmox installs; fall back to iproute2.
if command -v brctl >/dev/null 2>&1; then
    brctl show
else
    ip -brief link show type bridge
fi
echo

echo "=== Routing Table ==="
ip route show
echo

echo "=== Connectivity Tests ==="
ping -c 3 8.8.8.8
echo

echo "=== Firewall Status ==="
iptables -L -n | head -20
# Storage diagnostic script
#!/bin/bash
# Dumps filesystem, storage backend, ZFS/LVM and disk-health state.
echo "=== Storage Diagnostics ==="
echo

echo "=== Disk Usage ==="
df -h
echo

echo "=== Storage Configuration ==="
pvesm status
echo

echo "=== ZFS Status (if applicable) ==="
zpool status 2>/dev/null || echo "ZFS not configured"
echo

echo "=== LVM Status ==="
vgs
lvs
echo

echo "=== Disk Health ==="
# Include NVMe devices, and skip unmatched globs — bash otherwise
# passes the literal pattern (e.g. "/dev/sd?") to smartctl.
for disk in /dev/sd? /dev/nvme?n?; do
    [ -e "$disk" ] || continue
    echo "Checking $disk:"
    smartctl -H "$disk" 2>/dev/null || echo "SMART not available for $disk"
done

Compliance and Auditing

Security Auditing

# Security audit script
#!/bin/bash
# /usr/local/bin/security-audit.sh
# Appends a basic account / SSH / firewall / login-failure audit to
# the audit log.

AUDIT_LOG="/var/log/security-audit.log"
DATE=$(date '+%Y-%m-%d %H:%M:%S')

echo "$DATE - Starting security audit" >> "$AUDIT_LOG"

# Check for unauthorized users (regular accounts: UID >= 1000)
echo "Checking user accounts..." >> "$AUDIT_LOG"
awk -F: '$3 >= 1000 {print $1}' /etc/passwd >> "$AUDIT_LOG"

# Check SSH configuration
echo "Checking SSH configuration..." >> "$AUDIT_LOG"
grep -E "(PermitRootLogin|PasswordAuthentication)" /etc/ssh/sshd_config >> "$AUDIT_LOG"

# Check firewall status (count of DROP/REJECT rules as a rough signal)
echo "Checking firewall status..." >> "$AUDIT_LOG"
iptables -L | grep -c "DROP\|REJECT" >> "$AUDIT_LOG"

# Check for failed login attempts — /var/log/auth.log does not exist
# on journald-only systems, so fall back to the journal there.
echo "Recent failed logins:" >> "$AUDIT_LOG"
if [ -f /var/log/auth.log ]; then
    grep "Failed password" /var/log/auth.log | tail -10 >> "$AUDIT_LOG"
else
    journalctl -u ssh --no-pager 2>/dev/null | grep "Failed password" | tail -10 >> "$AUDIT_LOG"
fi

echo "$DATE - Security audit completed" >> "$AUDIT_LOG"

Summary

Following these best practices ensures:

  • Reliability: Robust infrastructure with proper redundancy
  • Security: Defense-in-depth approach to system protection
  • Performance: Optimized configurations for maximum efficiency
  • Maintainability: Automated processes and proper documentation
  • Scalability: Infrastructure that grows with your needs

Regular review and updates of these practices ensure your Proxmox VE environment remains secure, efficient, and reliable.