I have helped a few people with this exact issue (auto-recovery of an AWS EC2 instance), so I figured it was time to share just how easy it is to set up.
Why is this useful?
- DNS server
- syslog server
- legacy application
By default, I suggest keeping everything ephemeral and using auto-recovery only in very specific cases.
But if you do have a need, it really is this easy. A complete example is below if you wish to test it out.
# Complete example: a single EC2 instance plus a CloudWatch alarm whose action
# is the EC2 "recover" automation for that instance.

# our region
variable "region" {
  default = "us-east-1"
}

# replace with real public key
variable "ssh_public_key" {
  default = "ssh-rsa replace-with-real-key"
}

# query for latest Ubuntu 16.04 LTS AMI
data "aws_ami" "ubuntu" {
  most_recent = true

  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd/ubuntu-xenial-16.04-amd64-server-*"]
  }

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }

  owners = ["099720109477"] # Canonical
}

# AWS SSH key for logging in if you wish to
resource "aws_key_pair" "test_example" {
  key_name   = "test_example"
  public_key = "${var.ssh_public_key}"
}

# Create an instance
resource "aws_instance" "test_example" {
  ami           = "${data.aws_ami.ubuntu.id}"
  instance_type = "t2.nano"

  # Derived from var.region so the instance and the alarm's recover ARN stay
  # in the same region; with the default region this is "us-east-1a".
  availability_zone = "${var.region}a"

  key_name = "${aws_key_pair.test_example.key_name}"

  # a private IP address within this VPC
  private_ip = "10.163.22.11"

  # Map attributes are assigned with "=" so the example parses on both
  # Terraform 0.11 and 0.12+.
  tags = {
    Name = "test-example"
  }

  volume_tags = {
    Name = "test-example"
  }
}

# build the CloudWatch auto-recovery alarm and recovery action
resource "aws_cloudwatch_metric_alarm" "test_example" {
  alarm_name        = "instance-autorecovery"
  alarm_description = "This metric auto recovers EC2 instances"

  # Metric watched, scoped to the instance created above.
  namespace   = "AWS/EC2"
  metric_name = "StatusCheckFailed_System"

  dimensions = {
    InstanceId = "${aws_instance.test_example.id}"
  }

  # Alarm when the minimum of the metric is greater than 0.0 for
  # 2 consecutive 60-second periods.
  statistic           = "Minimum"
  comparison_operator = "GreaterThanThreshold"
  threshold           = "0.0"
  period              = "60"
  evaluation_periods  = "2"

  # Action taken when the alarm fires: the EC2 recover automation in our region.
  alarm_actions = ["arn:aws:automate:${var.region}:ec2:recover"]
}