Terraform and AWS instance auto-recovery

I have helped a few people with this exact issue (auto-recovery of an AWS instance), so I figured it was time to share just how easy it is to set up.

Why is this useful? It helps with single instances that cannot simply be thrown away and replaced, for example:

  1. DNS server
  2. syslog server
  3. legacy application

By default I suggest keeping everything ephemeral and using this only for very specific cases.

But if you have a need, then it really is this easy. A complete example is below if you wish to test it out.

# AWS region name; interpolated into the CloudWatch recover-action ARN
variable "region" {
  description = "AWS region name, used to build the EC2 recover action ARN"
  default     = "us-east-1"
}

# Public key material registered as the aws_key_pair.
# The default is only a placeholder -- override it with a real public key.
variable "ssh_public_key" {
  description = "SSH public key to register with AWS for instance login"
  default     = "ssh-rsa replace-with-real-key"
}

# Look up the newest Ubuntu 16.04 LTS (xenial) amd64 server AMI published by Canonical
data "aws_ami" "ubuntu" {
  owners      = ["099720109477"] # Canonical
  most_recent = true

  # HVM virtualization only
  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }

  # match the xenial 16.04 hvm-ssd image naming pattern
  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd/ubuntu-xenial-16.04-amd64-server-*"]
  }
}

# Register the SSH public key with AWS; optional, only needed if you want to log in
resource "aws_key_pair" "test_example" {
  public_key = "${var.ssh_public_key}"
  key_name   = "test_example"
}

# Create an instance
# The EC2 instance that the CloudWatch auto-recovery alarm watches.
resource "aws_instance" "test_example" {
  ami           = "${data.aws_ami.ubuntu.id}"
  instance_type = "t2.nano"
  key_name      = "${aws_key_pair.test_example.key_name}"

  # Derive the AZ from var.region so it stays consistent with the recover
  # action ARN, which is also built from var.region. With the default region
  # this still resolves to "us-east-1a".
  availability_zone = "${var.region}a"

  # a private IP address within this VPC
  # NOTE(review): no subnet_id is set, so this address must fall inside the
  # CIDR of the subnet the instance lands in -- confirm, or add subnet_id.
  private_ip = "10.163.22.11"

  # Map-assignment form (`= {`) is accepted by both Terraform 0.11 and 0.12+;
  # the bare block form (`tags {`) is rejected by 0.12+.
  tags = {
    Name = "test-example"
  }

  volume_tags = {
    Name = "test-example"
  }
}

# CloudWatch alarm that watches the instance's system status check and
# triggers the EC2 recover action when it fails.
resource "aws_cloudwatch_metric_alarm" "test_example" {
  alarm_name        = "instance-autorecovery"
  alarm_description = "This metric auto recovers EC2 instances"

  # which metric to watch, scoped to the single instance created above
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed_System"

  # Map-assignment form (`= {`) is accepted by both Terraform 0.11 and 0.12+;
  # the bare block form (`dimensions {`) is rejected by 0.12+.
  dimensions = {
    InstanceId = "${aws_instance.test_example.id}"
  }

  # Alarm when the Minimum of the metric is > 0 for 2 consecutive 60-second
  # periods, i.e. the system status check failed throughout both periods.
  statistic           = "Minimum"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "0.0"
  period              = "60"
  evaluation_periods  = "2"

  # built-in EC2 recover action for the configured region
  alarm_actions = ["arn:aws:automate:${var.region}:ec2:recover"]
}