Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/workflows/pr-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,17 @@ jobs:
permissions:
contents: read

# Checkov-based Terraform security scan, delegated to the reusable workflow.
# Runs with soft-fail enabled until the matrix is fully addressed.
terraform-security:
  name: Terraform Security
  permissions:
    security-events: write
    contents: read
  uses: ./.github/workflows/terraform-security.yml
  with:
    working-directory: infrastructure/terraform
    soft-fail: true

# Terraform test execution with Codecov Test Analytics
terraform-tests:
name: Terraform Tests
Expand Down
63 changes: 63 additions & 0 deletions .github/workflows/terraform-security.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
name: Terraform Security Scan
Comment thread
nguyena2 marked this conversation as resolved.

# Reusable workflow: it is only started through `workflow_call`.
on:
  workflow_call:
    inputs:
      # Forwarded to the Checkov step as its `directory` setting.
      working-directory:
        type: string
        default: infrastructure/terraform
        required: false
        description: Directory passed to checkov via -d
      # Forwarded to both `continue-on-error` and the action's `soft_fail`.
      soft-fail:
        type: boolean
        default: true
        required: false
        description: Whether to continue on Checkov violations

# Workflow-level token scope; jobs that need more declare it themselves.
permissions:
  contents: read

jobs:
  # Scans the requested Terraform directory with Checkov, then publishes the
  # SARIF report to GitHub code scanning and as a build artifact.
  checkov:
    name: Checkov
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # Needed by the upload-sarif step below.
      security-events: write
    steps:
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          persist-credentials: false

      # Checkov writes its SARIF report into this directory.
      - name: Create logs directory
        run: mkdir -p logs

      # The action is pinned to a full commit SHA; keep the trailing version
      # comment in sync with the SHA whenever the pin is bumped.
      - name: Run Checkov
        id: checkov
        uses: bridgecrewio/checkov-action@38a95e98d734de90a74687a8d986d6b06107f342  # v12.2898.0
        # Soft-fail mode keeps the job going even if this step errors.
        continue-on-error: ${{ inputs.soft-fail }}
        with:
          directory: ${{ inputs.working-directory }}
          framework: terraform
          # Formats and destinations are paired positionally:
          # cli -> console, sarif -> logs/checkov.sarif.
          output_format: cli,sarif
          output_file_path: console,logs/checkov.sarif
          soft_fail: ${{ inputs.soft-fail }}
          download_external_modules: false

      # Runs after a failed scan too, but only when a report was actually
      # written; otherwise the upload would fail on the missing file and bury
      # the real error from the Checkov step.
      - name: Upload SARIF to GitHub code scanning
        if: always() && hashFiles('logs/checkov.sarif') != ''
        uses: github/codeql-action/upload-sarif@95e58e9a2cdfd71adc6e0353d5c52f41a045d225  # v4.35.2
        with:
          sarif_file: logs/checkov.sarif
          category: checkov

      # Deliberately unguarded: if no report exists, the explicit
      # `if-no-files-found: warn` leaves a visible warning on the run.
      - name: Upload Checkov SARIF artifact
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
        with:
          name: checkov-sarif
          path: logs/checkov.sarif
          if-no-files-found: warn
          retention-days: 30
22 changes: 22 additions & 0 deletions infrastructure/examples/terraform.tfvars.dev
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,25 @@ should_enable_microsoft_defender = false
// should_deploy_monitor_workspace = true
// should_deploy_ampls = false
// should_deploy_dce = false

// =============================================================================
// Conversion Pipeline (Optional)
// =============================================================================
// Dev profile, tuned for cost: LRS replication, public network access allowed,
// F2 capacity, short raw retention. Flip should_deploy_conversion_pipeline to
// true to provision it.
// =============================================================================

should_deploy_conversion_pipeline = false

conversion_pipeline_config = {
  // Storage account
  storage_replication_type = "LRS"
  should_enable_shared_key = false

  // Networking
  should_enable_public_network_access = true
  should_enable_private_endpoint      = false

  // Fabric
  should_create_fabric_capacity  = true
  should_create_fabric_workspace = false
  fabric_capacity_sku            = "F2"

  // Retention / tiering (days)
  raw_retention_days     = 7
  converted_cool_days    = 30
  converted_archive_days = 90

  // Event Grid
  should_enable_event_grid_dead_letter = true
}
22 changes: 22 additions & 0 deletions infrastructure/examples/terraform.tfvars.prod
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,25 @@ should_enable_purge_protection = true
// should_deploy_monitor_workspace = true
// should_deploy_ampls = true
// should_deploy_dce = true

// =============================================================================
// Conversion Pipeline (Optional)
// =============================================================================
// Production profile: GRS replication, private endpoints required, F32
// capacity, full retention. Flip should_deploy_conversion_pipeline to true to
// provision it.
// =============================================================================

should_deploy_conversion_pipeline = false

conversion_pipeline_config = {
  // Storage account
  storage_replication_type = "GRS"
  should_enable_shared_key = false

  // Networking
  should_enable_public_network_access = false
  should_enable_private_endpoint      = true

  // Fabric
  should_create_fabric_capacity  = true
  should_create_fabric_workspace = false
  fabric_capacity_sku            = "F32"

  // Retention / tiering (days)
  raw_retention_days     = 30
  converted_cool_days    = 30
  converted_archive_days = 90

  // Event Grid
  should_enable_event_grid_dead_letter = true
}
82 changes: 82 additions & 0 deletions infrastructure/examples/terraform.tfvars.staging
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// =============================================================================
// Staging Environment
// =============================================================================
// Mirrors the production shape (private networking, HA backend) while being
// sized for pre-production validation: smaller GPU footprint and F8 Fabric
// capacity.
// =============================================================================

// --- Core configuration ------------------------------------------------------
environment     = "staging"
location        = "westus3"
resource_prefix = "roboticsstg"
instance        = "001"

// --- Resource group ----------------------------------------------------------
should_create_resource_group = true

// --- AKS system node pool ----------------------------------------------------
system_node_pool_vm_size    = "Standard_D8ds_v5"
system_node_pool_node_count = 2

// --- GPU node pool: a single pool on its own subnet --------------------------
node_pools = {
  rtxprogpu = {
    vm_size                 = "Standard_NC128ds_xl_RTXPRO6000BSE_v6"
    subnet_address_prefixes = ["10.0.7.0/24"]
    priority                = "Regular"
    zones                   = []

    // Autoscaling bounds
    should_enable_auto_scaling = true
    min_count                  = 1
    max_count                  = 2

    // GPU scheduling and driver settings
    gpu_driver  = "None"
    node_taints = ["nvidia.com/gpu:NoSchedule"]
    node_labels = {
      "nvidia.com/gpu.deploy.driver" = "false"
    }
  }
}

// --- OSMO backend services (HA) ----------------------------------------------
should_deploy_postgresql = true
should_deploy_redis      = true

// PostgreSQL high availability
postgresql_sku_name = "GP_Standard_D2s_v3"
postgresql_high_availability = {
  enabled                   = true
  standby_availability_zone = "2"
}

// Redis high availability
should_enable_redis_high_availability = true

// --- Network security: fully private -----------------------------------------
should_enable_private_endpoint    = true
should_enable_private_aks_cluster = true

should_enable_public_network_access     = false
should_add_current_user_key_vault_admin = true
should_enable_microsoft_defender        = true
should_enable_purge_protection          = false

// =============================================================================
// Conversion Pipeline (Optional)
// =============================================================================
// Pre-production profile: ZRS replication, private endpoints required, F8
// capacity. Flip should_deploy_conversion_pipeline to true to provision it.
// =============================================================================

should_deploy_conversion_pipeline = false

conversion_pipeline_config = {
  // Storage account
  storage_replication_type = "ZRS"
  should_enable_shared_key = false

  // Networking
  should_enable_public_network_access = false
  should_enable_private_endpoint      = true

  // Fabric
  should_create_fabric_capacity  = true
  should_create_fabric_workspace = false
  fabric_capacity_sku            = "F8"

  // Retention / tiering (days)
  raw_retention_days     = 30
  converted_cool_days    = 30
  converted_archive_days = 90

  // Event Grid
  should_enable_event_grid_dead_letter = true
}
Loading
Loading