Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions .github/workflows/terraform-mn-vectorization.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# CI/CD for the MN Vectorization Terraform stack.
# Pull requests that touch the infra directory run fmt, init, validate and
# plan; pushes to main run fmt, init, validate and apply (TF_ENV below).
name: "Terraform — MN Vectorization"

on:
  pull_request:
    paths:
      - "mn-vectorization/infra/**"
  push:
    branches:
      - main
    paths:
      - "mn-vectorization/infra/**"

permissions:
  contents: read

# Serialize runs per ref. The init step passes only bucket/key/region to the
# S3 backend (no locking setting), so two overlapping pushes to main could
# otherwise apply against the same state file at the same time. In-progress
# runs are never cancelled: interrupting `terraform apply` midway is unsafe.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: false

env:
  TF_WORKING_DIR: mn-vectorization/infra
  TF_ENV: dev
  AWS_REGION: us-east-1

jobs:
  terraform:
    name: Terraform
    runs-on: ubuntu-latest
    defaults:
      run:
        # Every `run` step executes inside the Terraform root module.
        working-directory: ${{ env.TF_WORKING_DIR }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "~1.5"

      # NOTE: Using static keys for POC. OIDC migration deferred until
      # Nomad provisions IAM OIDC identity provider + role in AWS.
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.MN_VECTORIZATION_AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.MN_VECTORIZATION_AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Terraform Format Check
        run: terraform fmt -check -recursive

      # backend.tf declares an empty S3 backend; its settings are supplied
      # here. -input=false makes a missing setting fail fast instead of
      # prompting, matching the plan/apply steps.
      - name: Terraform Init
        run: |
          terraform init \
            -input=false \
            -backend-config="bucket=sf-terraform-state" \
            -backend-config="key=mn-vectorization/${{ env.TF_ENV }}/terraform.tfstate" \
            -backend-config="region=${{ env.AWS_REGION }}"

      - name: Terraform Validate
        run: terraform validate

      # With -detailed-exitcode, terraform plan exits 0 (no changes),
      # 1 (error) or 2 (changes present). Only 1 should fail the job, so
      # errexit is disabled and the exit code is inspected by hand.
      - name: Terraform Plan
        if: github.event_name == 'pull_request'
        run: |
          set +e
          terraform plan -var-file=environments/${{ env.TF_ENV }}.tfvars -no-color -input=false -detailed-exitcode
          ec=$?
          if [ "$ec" -eq 1 ]; then
            echo "::error::Terraform plan failed"
            exit 1
          fi
          exit 0

      # Applies straight from configuration (no saved plan) on pushes to main.
      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        run: terraform apply -var-file=environments/${{ env.TF_ENV }}.tfvars -auto-approve -input=false
4 changes: 4 additions & 0 deletions mn-vectorization/infra/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Terraform working directory created by `terraform init`
.terraform/
# Local state files and their backups (remote state is kept in the S3 backend)
*.tfstate
*.tfstate.backup
# Saved plan files
*.tfplan
3 changes: 3 additions & 0 deletions mn-vectorization/infra/backend.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Partial S3 backend configuration. The block is intentionally empty:
# bucket, key and region are supplied at `terraform init` time through
# -backend-config arguments (see the Terraform workflow's init step).
terraform {
backend "s3" {}
}
75 changes: 75 additions & 0 deletions mn-vectorization/infra/cloudwatch.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# -----------------------------------------------------
# CloudWatch log groups
# -----------------------------------------------------

# One log group per entry of local.log_groups. Each entry's value is used
# both as the log group name and as its Name tag; retention comes from
# var.log_retention_days.
resource "aws_cloudwatch_log_group" "main" {
  for_each = local.log_groups

  name              = each.value
  retention_in_days = var.log_retention_days

  tags = {
    Name = each.value
  }
}

# -----------------------------------------------------
# CloudWatch alarms
# -----------------------------------------------------

# Alarm 1: Indexing failures (custom metric from Temporal worker)
# Created only when var.is_alarm_enabled is true. Goes into ALARM when the
# Sum of IndexingFailures over a single 300-second period is above 0.
resource "aws_cloudwatch_metric_alarm" "indexing_failures" {
  count = var.is_alarm_enabled ? 1 : 0

  alarm_name        = "${local.name_prefix}_indexing_failures_alarm"
  alarm_description = "Indexing pipeline failure detected"

  # Metric being watched
  namespace   = "MNVectorization/${var.environment}"
  metric_name = "IndexingFailures"
  statistic   = "Sum"
  period      = 300

  # Evaluation rule
  comparison_operator = "GreaterThanThreshold"
  threshold           = 0
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Notify the SNS topic only when one has been configured.
  alarm_actions = var.alarm_sns_topic_arn == "" ? [] : [var.alarm_sns_topic_arn]

  tags = {
    Name = "${local.name_prefix}_indexing_failures_alarm"
  }
}

# Alarm 2: Query latency p99 (custom metric from MCP server)
# Created only when var.is_alarm_enabled is true. Goes into ALARM when the
# Maximum of QueryLatencyP99 is above 30000 for two consecutive 300-second
# periods (the description states this threshold as 30s).
resource "aws_cloudwatch_metric_alarm" "query_latency_p99" {
  count = var.is_alarm_enabled ? 1 : 0

  alarm_name        = "${local.name_prefix}_query_latency_p99_alarm"
  alarm_description = "Query p99 latency exceeds 30s"

  # Metric being watched
  namespace   = "MNVectorization/${var.environment}"
  metric_name = "QueryLatencyP99"
  statistic   = "Maximum"
  period      = 300

  # Evaluation rule
  comparison_operator = "GreaterThanThreshold"
  threshold           = 30000
  evaluation_periods  = 2
  treat_missing_data  = "notBreaching"

  # Notify the SNS topic only when one has been configured.
  alarm_actions = var.alarm_sns_topic_arn == "" ? [] : [var.alarm_sns_topic_arn]

  tags = {
    Name = "${local.name_prefix}_query_latency_p99_alarm"
  }
}

# Alarm 3: DynamoDB throttling (per table)
# One alarm per entry of local.dynamodb_tables, created only when
# var.is_alarm_enabled is true.
#
# The alarm watches ReadThrottleEvents + WriteThrottleEvents rather than
# ThrottledRequests. DynamoDB publishes ThrottledRequests with both the
# TableName and Operation dimensions, so an alarm that specifies TableName
# alone matches no datapoints and (with notBreaching) can never fire. The
# read/write throttle-event metrics are published per table.
resource "aws_cloudwatch_metric_alarm" "dynamodb_throttling" {
  for_each = var.is_alarm_enabled ? local.dynamodb_tables : {}

  alarm_name        = "${local.name_prefix}_${each.key}_throttling_alarm"
  alarm_description = "DynamoDB throttling on ${each.key} table"

  comparison_operator = "GreaterThanThreshold"
  threshold           = 0
  evaluation_periods  = 1
  treat_missing_data  = "notBreaching"

  # Alarm on the combined count. FILL(..., 0) keeps the sum defined when
  # only one of the two metrics reported datapoints in the period.
  metric_query {
    id          = "throttle_events"
    expression  = "FILL(read_throttles, 0) + FILL(write_throttles, 0)"
    label       = "Read + write throttle events"
    return_data = true
  }

  metric_query {
    id = "read_throttles"

    metric {
      namespace   = "AWS/DynamoDB"
      metric_name = "ReadThrottleEvents"
      period      = 60
      stat        = "Sum"

      dimensions = {
        TableName = aws_dynamodb_table.main[each.key].name
      }
    }
  }

  metric_query {
    id = "write_throttles"

    metric {
      namespace   = "AWS/DynamoDB"
      metric_name = "WriteThrottleEvents"
      period      = 60
      stat        = "Sum"

      dimensions = {
        TableName = aws_dynamodb_table.main[each.key].name
      }
    }
  }

  # Notify the SNS topic only when one has been configured.
  alarm_actions = var.alarm_sns_topic_arn != "" ? [var.alarm_sns_topic_arn] : []

  tags = { Name = "${local.name_prefix}_${each.key}_throttling_alarm" }
}
11 changes: 11 additions & 0 deletions mn-vectorization/infra/data.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Lookups of infrastructure that already exists outside this module.

# Existing VPC, selected by var.vpc_id.
data "aws_vpc" "existing" {
id = var.vpc_id
}

# Existing EC2 instance, selected by var.ec2_instance_id. The ID must refer
# to a real instance for this lookup to succeed.
data "aws_instance" "existing" {
instance_id = var.ec2_instance_id
}

# Identity of the credentials Terraform runs with (consumers are not visible
# in this file).
data "aws_caller_identity" "current" {}

# Region the AWS provider is configured for (consumers are not visible in
# this file).
data "aws_region" "current" {}
30 changes: 30 additions & 0 deletions mn-vectorization/infra/dynamodb.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# -----------------------------------------------------
# DynamoDB tables for task state and user ACL
# Created via for_each over local.dynamodb_tables
# -----------------------------------------------------

# One on-demand (PAY_PER_REQUEST) table per entry of local.dynamodb_tables.
# Each entry supplies `hash_key` (declared as a string attribute) and
# `ttl_attr` (null means the table gets no TTL configuration).
resource "aws_dynamodb_table" "main" {
  for_each = local.dynamodb_tables

  name         = "${local.name_prefix}_${each.key}_ddb"
  hash_key     = each.value.hash_key
  billing_mode = "PAY_PER_REQUEST"

  # Only the hash key is declared; its type is always string ("S").
  attribute {
    name = each.value.hash_key
    type = "S"
  }

  # Emit a ttl block only for tables that name a TTL attribute.
  dynamic "ttl" {
    for_each = each.value.ttl_attr == null ? [] : [each.value.ttl_attr]

    content {
      enabled        = true
      attribute_name = ttl.value
    }
  }

  # Point-in-time recovery is switched on for the prod environment only.
  point_in_time_recovery {
    enabled = var.environment == "prod"
  }

  tags = {
    Name = "${local.name_prefix}_${each.key}_ddb"
  }
}
21 changes: 21 additions & 0 deletions mn-vectorization/infra/environments/dev.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Variable values for the dev environment.
# NOTE(review): the Terraform workflow sets TF_ENV=dev and applies this file
# on every push to main, so each value here must be deployable as-is.
environment = "dev"
aws_region = "us-east-1"
billing_tag = "mn-vectorization"

# Existing infrastructure — replace with actual IDs
# NOTE(review): ec2_instance_id is still a placeholder. It feeds the
# data.aws_instance.existing lookup, which presumably fails until a real
# instance ID is set — confirm before merging.
vpc_id = "vpc-385f9a56"
ec2_instance_id = "i-XXXXXXXXXXXXXXXXX"

# CloudWatch
# Alarms are disabled in dev, so the empty SNS topic ARN has no effect here.
log_retention_days = 14
alarm_sns_topic_arn = ""
is_alarm_enabled = false

# MCP Server
mcp_server_port = 3000

# Encryption
is_kms_enabled = false

# S3 lifecycle
# NOTE(review): presumably 0 means "never expire" — confirm against the S3
# lifecycle rule that consumes this variable.
embeddings_expiry_days = 0
22 changes: 22 additions & 0 deletions mn-vectorization/infra/environments/prod.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Variable values for the prod environment.
environment = "prod"
aws_region = "us-east-1"
billing_tag = "mn-vectorization"

# Existing infrastructure — replace with actual IDs
# NOTE(review): both IDs are still placeholders; they feed the
# data.aws_vpc.existing and data.aws_instance.existing lookups.
vpc_id = "vpc-XXXXXXXXXXXXXXXXX"
ec2_instance_id = "i-XXXXXXXXXXXXXXXXX"

# CloudWatch
# Alarms are enabled and send their actions to the SNS topic below.
log_retention_days = 90
alarm_sns_topic_arn = "arn:aws:sns:us-east-1:891612588877:mn-vectorization-alerts"
is_alarm_enabled = true

# MCP Server
mcp_server_port = 3000

# Encryption
# KMS is currently off; the key ARN below is a commented-out placeholder.
is_kms_enabled = false
# kms_key_arn = "arn:aws:kms:us-east-1:891612588877:key/XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX"

# S3 lifecycle
# NOTE(review): presumably 0 means "never expire" — confirm against the S3
# lifecycle rule that consumes this variable.
embeddings_expiry_days = 0
21 changes: 21 additions & 0 deletions mn-vectorization/infra/environments/staging.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Variable values for the staging environment.
environment = "staging"
aws_region = "us-east-1"
billing_tag = "mn-vectorization"

# Existing infrastructure — replace with actual IDs
# NOTE(review): both IDs are still placeholders; they feed the
# data.aws_vpc.existing and data.aws_instance.existing lookups.
vpc_id = "vpc-XXXXXXXXXXXXXXXXX"
ec2_instance_id = "i-XXXXXXXXXXXXXXXXX"

# CloudWatch
# NOTE(review): alarms are enabled but alarm_sns_topic_arn is empty, so the
# alarms in cloudwatch.tf are created with an empty alarm_actions list and
# notify no one. Supply a topic ARN, or set is_alarm_enabled = false until
# one exists.
log_retention_days = 30
alarm_sns_topic_arn = ""
is_alarm_enabled = true
Comment on lines +10 to +12
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Enable alarms only when a notification target exists.

This staging config enables alarms but leaves alarm_sns_topic_arn empty, so alarms will not notify anyone. Consider setting is_alarm_enabled = false until an SNS topic is wired.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@mn-vectorization/infra/environments/staging.tfvars` around lines 10 - 12, The
staging tfvars enables alarms with is_alarm_enabled = true while
alarm_sns_topic_arn is empty; change the configuration so alarms are only
enabled when a notification target exists by either supplying a valid SNS topic
ARN to alarm_sns_topic_arn or setting is_alarm_enabled = false until an SNS
topic is provisioned (update the values for alarm_sns_topic_arn and
is_alarm_enabled accordingly, leaving log_retention_days unchanged).


# MCP Server
mcp_server_port = 3000

# Encryption
is_kms_enabled = false

# S3 lifecycle
# NOTE(review): presumably 0 means "never expire" — confirm against the S3
# lifecycle rule that consumes this variable.
embeddings_expiry_days = 0
Loading
Loading