更新日期: 2025年1月
项目: DV IT Infrastructure Platform
基于: 生产环境实际故障排查经验
VPS: 156.67.214.225
# 查看所有容器状态
docker ps -a

# 查看容器日志
docker logs <container_name>

# 查看网络
docker network ls
docker network inspect <network_name>

# 查看Traefik路由
docker logs traefik

# 检查端口
netstat -tlnp | grep :80
netstat -tlnp | grep :443
docker ps | grep traefik
docker logs traefik
# 检查容器是否配置了正确的labels
docker inspect <container_name> | grep -A 10 -B 5 "Labels"
# 检查容器是否在traefik-network中
docker network inspect traefik-network

# 检查容器间网络连通性
docker exec -it <container_name> ping traefik
# 检查acme.json权限
ls -la /opt/discountvapor-it/traefik/acme.json

# 检查证书内容
docker exec traefik ls -la /letsencrypt/
# 将容器加入网络
docker network connect traefik-network <container_name>

# 重启容器
docker restart <container_name>
# 正确的labels配置示例
labels:
  - "traefik.enable=true"
  - "traefik.http.routers.service.rule=Host(`domain.oasisvape.co.nz`)"
  - "traefik.http.routers.service.entrypoints=websecure"
  - "traefik.http.routers.service.tls.certresolver=letsencrypt"
  - "traefik.http.services.service.loadbalancer.server.port=<port>"
  - "traefik.docker.network=traefik-network"
# 修复acme.json权限
chmod 600 /opt/discountvapor-it/traefik/acme.json
chown 1000:1000 /opt/discountvapor-it/traefik/acme.json

# 重启Traefik
docker restart traefik
docker ps | grep erpnext
docker logs discountvapor-it-frontend-1
docker logs discountvapor-it-backend-1
# 检查MariaDB容器
docker logs discountvapor-it-database-1

# 检查数据库连接
docker exec -it discountvapor-it-backend-1 bench doctor
# 检查Redis容器
docker logs discountvapor-it-redis-cache-1
docker logs discountvapor-it-redis-queue-1
# 检查.env文件中的DB_PASSWORD
cat /opt/discountvapor-it/.env

# 重新配置数据库连接
docker exec -it discountvapor-it-configurator-1 bench set-config -g db_host database
docker exec -it discountvapor-it-configurator-1 bench set-config -gp db_port 3306
# 修复数据目录权限
chown -R 1000:1000 /opt/discountvapor-it/data/erpnext/

# 重启ERPNext服务
docker compose -f docker-compose-erpnext.yml restart
# 重新运行配置器
docker compose -f docker-compose-erpnext.yml up configurator

# 重启所有ERPNext服务
docker compose -f docker-compose-erpnext.yml restart
docker logs discountvapor-it-n8n-1
docker inspect discountvapor-it-n8n-1 | grep -A 20 "Env"
# 检查环境变量
docker exec -it discountvapor-it-n8n-1 env | grep N8N
# 正确的n8n配置
# 注意: WEBHOOK_TUNNEL_URL 是旧变量名,已被弃用并在 n8n 1.0 中移除,新版本请使用 WEBHOOK_URL
environment:
  - N8N_HOST=n8n.oasisvape.co.nz
  - N8N_PORT=5678
  - WEBHOOK_URL=https://n8n.oasisvape.co.nz
# 检查PostgreSQL连接
docker exec -it discountvapor-it-n8n-1 ping strapi_postgres

# 检查数据库schema
docker exec -it strapi_postgres psql -U strapi_user -d strapi_db -c "\dn"
docker logs dokuwiki
docker exec -it dokuwiki ls -la /config/dokuwiki/
ls -la /opt/discountvapor-it/data/
chown -R 1000:1000 /opt/discountvapor-it/data/
# 修复权限
chown -R 1000:1000 /opt/discountvapor-it/data/
chown -R 1000:1000 /opt/discountvapor-it/logs/

# 重启DokuWiki
docker restart dokuwiki
# 正确的DokuWiki配置
environment:
  - PUID=1000
  - PGID=1000
  - TZ=Pacific/Auckland
volumes:
  - ./data:/config
  - ./logs:/var/log/dokuwiki
docker logs grafana
docker exec -it grafana ls -la /var/lib/grafana/
ls -la /opt/discountvapor-it/data/grafana/
# 修复Grafana数据目录权限
chown -R 472:472 /opt/discountvapor-it/data/grafana/

# 重启Grafana
docker restart grafana
# 正确的Grafana配置
# 注意: 不要在文档或compose文件中写入明文密码,应通过.env文件注入;
# 此前记录在本文档中的管理员密码应视为已泄露,请立即更换
environment:
  - GF_SECURITY_ADMIN_USER=admin
  - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
  - GF_USERS_ALLOW_SIGN_UP=false
  - TZ=Pacific/Auckland
ls -la /opt/discountvapor-it/traefik/acme.json
docker exec traefik ls -la /letsencrypt/
docker logs traefik | grep -i "certificate\|acme\|error"
nslookup erp.oasisvape.co.nz
nslookup cms.oasisvape.co.nz
# 修复权限
chmod 600 /opt/discountvapor-it/traefik/acme.json
chown 1000:1000 /opt/discountvapor-it/traefik/acme.json

# 重启Traefik
docker restart traefik
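# 删除旧的acme.json(谨慎操作,先备份)
cp /opt/discountvapor-it/traefik/acme.json /opt/discountvapor-it/traefik/acme.json.bak
rm /opt/discountvapor-it/traefik/acme.json

# 重新创建空文件并设置600权限
# (若acme.json是以单文件方式挂载进容器,缺失时Docker会在该路径创建目录,导致Traefik无法写入证书)
touch /opt/discountvapor-it/traefik/acme.json
chmod 600 /opt/discountvapor-it/traefik/acme.json

# 重启Traefik重新申请证书
docker restart traefik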
# 检查DNS记录
dig erp.oasisvape.co.nz
dig cms.oasisvape.co.nz

# 确保A记录指向正确的IP
# 156.67.214.225
ufw status
iptables -L
netstat -tlnp | grep :80
netstat -tlnp | grep :443
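# 开放必要端口
ufw allow 22/tcp
ufw allow 80/tcp
ufw allow 443/tcp

# 关闭其他端口
# 注意: Docker通过ports发布的端口由Docker自己写入的iptables规则放行,不受ufw规则限制;
# 要真正关闭这些端口,应在compose文件中移除端口映射,或改为只绑定127.0.0.1
ufw deny 3000
ufw deny 5678
ufw deny 8080
ufw deny 8081
ufw deny 8180
ufw deny 9090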
# 检查Traefik是否监听80/443端口
docker logs traefik

# 重启Traefik
docker restart traefik
# 检查磁盘使用情况
df -h

# 检查Docker使用情况
docker system df

# 检查大文件
du -sh /opt/discountvapor-it/data/* | sort -hr
# 清理Docker缓存(-a 会删除所有未被容器使用的镜像,谨慎操作)
docker system prune -a

# 清理日志文件
find /opt/discountvapor-it/logs/ -name "*.log" -mtime +7 -delete

# 清理备份文件
find /opt/discountvapor-it/config_backup/ -name "*.bak" -mtime +30 -delete
# 检查内存使用
free -h

# 检查容器内存使用
docker stats --no-stream
# 重启内存占用高的容器
docker restart <container_name>

# 调整容器内存限制
# 在docker-compose.yml中添加
# deploy:
#   resources:
#     limits:
#       memory: 1G
# 检查网络接口
ip addr show

# 检查路由
ip route show

# 检查DNS
cat /etc/resolv.conf
# 重启网络服务
systemctl restart networking

# 重启Docker网络
docker network prune
docker network create traefik-network
# 在Grafana中配置告警规则
- alert: ContainerDown
  expr: up == 0
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: "Container {{ $labels.instance }} is down"
#!/bin/bash
# 设置系统监控脚本

# 检查容器状态(-a 才能列出已退出的容器)
docker ps -a --format "table {{.Names}}\t{{.Status}}" | grep -v "Up"

# 检查磁盘空间(按数值比较使用率,跳过表头)
df -hP | awk 'NR > 1 && $5 + 0 > 80 {print $0}'

# 检查内存使用(使用 -m 输出纯数字,避免带单位的值参与除法)
free -m | awk 'NR==2{if($3/$2 > 0.8) print "Memory usage high: " $3 "/" $2 " MB"}'
# 配置logrotate
cat > /etc/logrotate.d/docker-logs << 'EOF'
/opt/discountvapor-it/logs/*.log {
    daily
    rotate 7
    compress
    delaycompress
    missingok
    notifempty
    create 644 root root
}
EOF
# 1. 停止所有服务
# (-f 每次只接受一个文件,通配符必须逐个文件循环处理)
for f in docker-compose-*.yml; do
  docker compose -f "$f" down
done

# 2. 备份当前状态
tar -czf backup-$(date +%Y%m%d_%H%M%S).tar.gz /opt/discountvapor-it/

# 3. 恢复配置
cp config_backup/*.yml ./
cp config_backup/.env ./

# 4. 重启服务
docker compose -f docker-compose-traefik.yml up -d
docker compose -f docker-compose-erpnext.yml up -d
docker compose -f docker-compose-strapi.yml up -d
docker compose -f docker-compose-n8n.yml up -d
docker compose -f docker-compose-dokuwiki.yml up -d
docker compose up -d
# ERPNext数据库恢复
docker exec -i discountvapor-it-database-1 mysql -u root -p"${DB_PASSWORD}" < backup.sql

# PostgreSQL数据库恢复
docker exec -i strapi_postgres psql -U strapi_user -d strapi_db < backup.sql
最后更新: 2025-01-06
基于生产环境: 156.67.214.225