跳到主要内容

最佳实践

在生产环境中使用 Terraform 需要遵循一系列最佳实践,以确保基础设施的安全性、可维护性和可靠性。本章将介绍 Terraform 的核心最佳实践。

项目结构

推荐的目录结构

terraform-project/
├── modules/ # 可重用模块
│ ├── vpc/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ ├── outputs.tf
│ │ └── README.md
│ ├── compute/
│ └── database/
├── environments/ # 环境配置
│ ├── dev/
│ │ ├── main.tf
│ │ ├── variables.tf
│ │ ├── outputs.tf
│ │ ├── backend.tf
│ │ ├── providers.tf
│ │ └── terraform.tfvars
│ ├── staging/
│ └── prod/
├── policies/ # 策略文件
│ └── sentinel/
├── scripts/ # 辅助脚本
│ ├── plan.sh
│ └── apply.sh
├── .gitignore
├── .terraform-version # tfenv 版本文件
└── README.md

文件命名规范

# 推荐的文件命名
main.tf # 主要资源配置
variables.tf # 变量定义
outputs.tf # 输出定义
providers.tf # Provider 配置
backend.tf # 后端配置
terraform.tfvars # 变量值(包含敏感值时不要提交到版本控制)
locals.tf # 本地值定义
data.tf # 数据源定义

# 避免使用
resource.tf # 太笼统
stuff.tf # 无意义
1.tf, 2.tf # 数字命名

代码风格

使用 terraform fmt

始终使用 terraform fmt 格式化代码:

# 格式化当前目录
terraform fmt

# 递归格式化
terraform fmt -recursive

# 检查格式(CI 中使用)
terraform fmt -check -recursive

代码组织原则

# providers.tf - Provider 配置放在单独文件
terraform {
required_version = ">= 1.0"
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 5.0"
}
}
}

provider "aws" {
region = var.aws_region

default_tags {
tags = {
Environment = var.environment
ManagedBy = "Terraform"
Project = var.project_name
}
}
}

# variables.tf - 变量定义
variable "aws_region" {
description = "AWS 区域"
type = string
default = "us-east-1"
}

variable "environment" {
description = "环境名称"
type = string
}

# locals.tf - 本地值
locals {
common_tags = {
Environment = var.environment
ManagedBy = "Terraform"
}

name_prefix = "${var.project_name}-${var.environment}"
}

# main.tf - 资源配置
resource "aws_vpc" "main" {
cidr_block = var.vpc_cidr

tags = merge(local.common_tags, {
Name = "${local.name_prefix}-vpc"
})
}

命名规范

# 资源命名
resource "aws_instance" "web_server" { # 使用下划线分隔
# ...
}

# 变量命名
variable "instance_type" { # 使用小写字母和下划线
# ...
}

# 标签命名
resource "aws_instance" "web" {
tags = {
Name = "WebServer" # 首字母大写
Environment = "Production" # 首字母大写
ManagedBy = "Terraform"
}
}

变量管理

变量定义最佳实践

# 1. 始终提供描述
variable "instance_type" {
description = "EC2 实例类型"
type = string
default = "t2.micro"
}

# 2. 使用适当的类型
variable "tags" {
description = "资源标签"
type = map(string)
default = {}
}

variable "subnets" {
description = "子网配置"
type = list(object({
name = string
cidr = string
public = bool
}))
}

# 3. 敏感变量标记
variable "db_password" {
description = "数据库密码"
type = string
sensitive = true
}

# 4. 变量验证
variable "environment" {
description = "环境名称"
type = string

validation {
condition = contains(["dev", "staging", "prod"], var.environment)
error_message = "环境必须是 dev、staging 或 prod。"
}
}

variable "instance_count" {
description = "实例数量"
type = number
default = 1

validation {
condition = var.instance_count > 0 && var.instance_count <= 10
error_message = "实例数量必须在 1-10 之间。"
}
}

变量值管理

# terraform.tfvars - 环境特定值
aws_region = "us-east-1"
environment = "dev"
instance_type = "t2.micro"
instance_count = 2

# 敏感值使用环境变量或密钥管理服务
# export TF_VAR_db_password="secretpassword"

状态管理最佳实践

远程状态配置

terraform {
backend "s3" {
bucket = "company-terraform-state"
key = "project/environment/terraform.tfstate"
region = "us-east-1"
encrypt = true
dynamodb_table = "terraform-locks"

# 使用 IAM 角色而不是硬编码凭证
}
}

状态锁定

始终启用状态锁定:

# DynamoDB 表用于锁定
resource "aws_dynamodb_table" "terraform_locks" {
name = "terraform-locks"
billing_mode = "PAY_PER_REQUEST"
hash_key = "LockID"

attribute {
name = "LockID"
type = "S"
}
}

状态备份

# S3 版本控制
resource "aws_s3_bucket_versioning" "terraform_state" {
bucket = aws_s3_bucket.terraform_state.id

versioning_configuration {
status = "Enabled"
}
}

# 生命周期规则
resource "aws_s3_bucket_lifecycle_configuration" "terraform_state" {
bucket = aws_s3_bucket.terraform_state.id

rule {
id = "old-versions"
status = "Enabled"

noncurrent_version_expiration {
noncurrent_days = 90
}
}
}

安全管理

敏感数据处理

# 1. 标记敏感变量
variable "api_key" {
type = string
sensitive = true
}

# 2. 标记敏感输出(sensitive 只会在 CLI 输出中隐藏该值,状态文件中仍以明文保存,因此状态后端必须加密并限制访问)
output "db_password" {
value = aws_db_instance.main.password
sensitive = true
}

# 3. 避免在代码中硬编码敏感信息
# 不好的做法
resource "aws_db_instance" "main" {
password = "hardcoded_password" # 永远不要这样做
}

# 好的做法
resource "aws_db_instance" "main" {
password = var.db_password # 从变量获取
}

最小权限原则

{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": [
"ec2:DescribeInstances",
"ec2:RunInstances",
"ec2:TerminateInstances"
],
"Resource": "*",
"Condition": {
"StringEquals": {
"ec2:Region": "us-east-1"
}
}
},
{
"Effect": "Deny",
"Action": "ec2:TerminateInstances",
"Resource": "*",
"Condition": {
"StringNotEquals": {
"ec2:ResourceTag/Environment": "dev"
}
}
}
]
}

密钥管理

# 使用 AWS Secrets Manager
data "aws_secretsmanager_secret_version" "db_password" {
secret_id = "prod/db/password"
}

resource "aws_db_instance" "main" {
password = data.aws_secretsmanager_secret_version.db_password.secret_string
}

# 或使用 HashiCorp Vault
provider "vault" {
address = "https://vault.example.com"
}

data "vault_generic_secret" "db_credentials" {
path = "secret/db/prod"
}

resource "aws_db_instance" "main" {
username = data.vault_generic_secret.db_credentials.data["username"]
password = data.vault_generic_secret.db_credentials.data["password"]
}

模块开发最佳实践

模块设计原则

# 1. 单一职责
module "vpc" {
source = "./modules/vpc"
# 只处理网络相关资源
}

# 2. 清晰的接口
module "compute" {
source = "./modules/compute"

# 必需的参数
subnet_ids = module.vpc.private_subnet_ids

# 可选的参数(有默认值)
instance_type = "t2.micro"
}

# 3. 完整的文档
# 每个模块都应该有 README.md

模块版本控制

# 使用版本约束
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "~> 5.0" # 兼容 5.x 版本
}

# 锁定到特定版本(生产环境推荐)
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
version = "5.1.2" # 精确版本
}

CI/CD 集成

GitHub Actions 工作流

name: Terraform

on:
push:
branches: [main]
pull_request:
branches: [main]

jobs:
terraform:
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write

steps:
- uses: actions/checkout@v4

- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: "1.6.0"
terraform_wrapper: false

- name: Terraform Format
run: terraform fmt -check -recursive

- name: Terraform Init
run: terraform init

- name: Terraform Validate
run: terraform validate

- name: Terraform Plan
run: terraform plan -out=tfplan

- name: Terraform Apply
if: github.ref == 'refs/heads/main'
run: terraform apply -auto-approve tfplan

预提交钩子

# .pre-commit-config.yaml
repos:
- repo: https://github.com/antonbabenko/pre-commit-terraform
rev: v1.86.0
hooks:
- id: terraform_fmt
- id: terraform_validate
- id: terraform_tflint
- id: terraform_tfsec

变更管理

变更流程

# 1. 创建变更计划
terraform plan -out=tfplan

# 2. 审查变更
terraform show tfplan

# 3. 应用变更(需要审批)
terraform apply tfplan

# 4. 验证变更
terraform show

破坏性变更处理

resource "aws_instance" "web" {
ami = "ami-new"
instance_type = "t2.micro"

lifecycle {
# 创建新资源后再销毁旧资源
create_before_destroy = true

# 防止意外销毁(注意:启用后,任何需要销毁或替换该资源的计划都会直接报错;
# 像更换 AMI 这类会触发替换的变更,需要先临时移除此设置才能执行)
prevent_destroy = true
}
}

监控和审计

启用日志记录

# AWS CloudTrail
resource "aws_cloudtrail" "terraform" {
name = "terraform-audit"
s3_bucket_name = aws_s3_bucket.cloudtrail.id

event_selector {
read_write_type = "All"
include_management_events = true
exclude_management_event_sources = []
}
}

资源标记策略

locals {
mandatory_tags = {
Environment = var.environment
Project = var.project_name
ManagedBy = "Terraform"
CostCenter = var.cost_center
DataClassification = var.data_classification
Owner = var.owner
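# 注意:timestamp() 每次运行都会返回新值,会导致所有使用此标签的资源在每次 plan 时都显示变更;
# 如需保留该标签,请在资源的 lifecycle 中通过 ignore_changes 忽略它
CreatedAt = timestamp()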
}
}

resource "aws_instance" "web" {
ami = data.aws_ami.amazon_linux.id
instance_type = "t2.micro"

tags = local.mandatory_tags
}

性能优化

并行执行

# 增加并行度(默认 10)
terraform apply -parallelism=20

资源依赖优化

# 避免不必要的依赖
resource "aws_instance" "web" {
# 好的做法:只声明必要的依赖
depends_on = [aws_iam_role_policy_attachment.web]
}

# 使用数据源减少依赖
data "aws_ami" "amazon_linux" {
most_recent = true
owners = ["amazon"]
}

状态文件优化

# 按服务分离状态
# networking/terraform.tfstate
# compute/terraform.tfstate
# database/terraform.tfstate

# 使用 terraform_remote_state 引用
data "terraform_remote_state" "networking" {
backend = "s3"
config = {
bucket = "terraform-state"
key = "networking/terraform.tfstate"
region = "us-east-1"
}
}

故障排除

调试技巧

# 启用详细日志
export TF_LOG=DEBUG
export TF_LOG_PATH=terraform.log

# 查看资源图
terraform graph | dot -Tpng > graph.png

# 查看状态
terraform state list
terraform state show <resource>

# 刷新状态(terraform refresh 已被弃用,建议改用 terraform apply -refresh-only,可在写入状态前先审查变更)
terraform refresh

常见问题处理

# 状态锁定问题(强制解锁前,先确认没有其他 Terraform 操作仍持有该锁,否则可能损坏状态)
terraform force-unlock <LOCK_ID>

# 状态不一致
terraform state rm <resource>
terraform import <resource> <id>

# 资源漂移
terraform plan -refresh-only

工具推荐

开发工具

  • tflint: Terraform 代码检查
  • tfsec: 安全扫描
  • checkov: 合规性检查
  • terraform-docs: 自动生成文档
  • terragrunt: Terraform 包装器

使用示例

# tflint
brew install tflint
tflint --init
tflint

# tfsec
brew install tfsec
tfsec

# terraform-docs
brew install terraform-docs
terraform-docs markdown . > README.md

持续改进

定期审查

  1. 代码审查:使用 Pull Request 审查所有变更
  2. 成本审查:定期分析资源成本
  3. 安全审查:定期运行安全扫描
  4. 性能审查:优化慢速资源

学习和更新

  1. 关注 Terraform 版本更新
  2. 学习新的 Provider 功能
  3. 参与社区讨论
  4. 分享最佳实践

总结

遵循这些最佳实践可以帮助你:

  • 提高代码质量和可维护性
  • 增强基础设施安全性
  • 简化团队协作
  • 降低运营风险
  • 提升工作效率

记住,最佳实践不是一成不变的,应根据团队规模、项目复杂度和组织需求进行调整。