# etl.tf

################################################################
# Glue data catalog
################################################################

# Glue Data Catalog database named "raw_db".
# aws_glue_catalog_table.employee is registered in this database.
resource "aws_glue_catalog_database" "data_lake_raw_db" {
  name = "raw_db"
}

# Glue Data Catalog database named "curated_db".
# No table is declared for it in this part of the file.
resource "aws_glue_catalog_database" "data_lake_curated_db" {
  name = "curated_db"
}

# Glue Data Catalog database named "aggregated_db".
# No table is declared for it in this part of the file.
resource "aws_glue_catalog_database" "data_lake_aggregated_db" {
  name = "aggregated_db"
}

# External catalog table "employee" in the raw database. It describes
# comma-delimited text files located under the employee/ prefix of the raw
# bucket and is parsed with LazySimpleSerDe.
resource "aws_glue_catalog_table" "employee" {
  database_name = aws_glue_catalog_database.data_lake_raw_db.name
  name          = "employee"
  table_type    = "EXTERNAL_TABLE"

  storage_descriptor {
    location      = "s3://${module.s3_bucket_raw.bucket.bucket}/employee/"
    compressed    = false
    input_format  = "org.apache.hadoop.mapred.TextInputFormat"
    output_format = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"

    ser_de_info {
      name                  = "LazySimpleSerDe"
      serialization_library = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"

      parameters = {
        "field.delim" = ","
      }
    }

    # One `columns` block is generated per entry, in list order, so the
    # resulting schema keeps the declared column order.
    # NOTE(review): commission_pct is declared as bigint — confirm the source
    # files hold whole numbers rather than fractional values.
    dynamic "columns" {
      for_each = [
        { name = "employee_id", type = "bigint" },
        { name = "first_name", type = "string" },
        { name = "last_name", type = "string" },
        { name = "email", type = "string" },
        { name = "phone_number", type = "string" },
        { name = "hire_date", type = "string" },
        { name = "job_id", type = "string" },
        { name = "salary", type = "bigint" },
        { name = "commission_pct", type = "bigint" },
        { name = "manager_id", type = "bigint" },
        { name = "department_id", type = "bigint" },
      ]

      content {
        name = columns.value.name
        type = columns.value.type
      }
    }
  }
}


################################################################
# Glue jobs for ETL
################################################################

# Private bucket used by the Glue jobs declared in this file: it receives the
# Spline agent jar and is passed to the job modules as `s3_bucket_name`.
# The bucket name is "glue-" followed by local.post_fix.
resource "aws_s3_bucket" "glue" {
  acl    = "private"
  bucket = "glue-${local.post_fix}"
}

# Enables all four S3 public-access-block settings.
# NOTE(review): this resource type is the account-level public access block,
# not a per-bucket one; the "glue" label suggests a bucket-scoped block
# (aws_s3_bucket_public_access_block) may have been intended — confirm.
resource "aws_s3_account_public_access_block" "glue" {
  # ACL-related settings.
  block_public_acls  = true
  ignore_public_acls = true

  # Policy-related settings.
  block_public_policy     = true
  restrict_public_buckets = true
}

# IAM role "GlueExecutionRole". Its trust policy allows the Glue service
# principal (glue.amazonaws.com) to assume the role; both Glue job modules in
# this file receive its ARN as `role_arn`.
resource "aws_iam_role" "glue_execution_role" {
  name = "GlueExecutionRole"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = "sts:AssumeRole"
        Principal = {
          Service = "glue.amazonaws.com"
        }
      }
    ]
  })
}

# Attaches two AWS managed policies to the Glue execution role, one
# attachment instance per policy ARN (the ARN is the for_each key).
# NOTE(review): AmazonS3FullAccess is not limited to the buckets used in this
# file — consider whether a narrower policy would be sufficient.
resource "aws_iam_role_policy_attachment" "glue_service_role" {
  for_each = toset([
    "arn:aws:iam::aws:policy/AmazonS3FullAccess",
    "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole",
  ])

  policy_arn = each.value
  role       = aws_iam_role.glue_execution_role.name
}

# Uploads the file at local.spline_agent_file_path into the Glue bucket under
# lib/<file name>. The etag is the file's MD5, so a change in the local file
# content shows up as a change to this object.
resource "aws_s3_bucket_object" "spline_agent_jar" {
  bucket = aws_s3_bucket.glue.bucket
  key    = "lib/${basename(local.spline_agent_file_path)}"

  source = local.spline_agent_file_path
  etag   = filemd5(local.spline_agent_file_path)
}

# Glue job "RawToCurated_employee_optimize", built from the local
# etl_glue_job module. It runs src/glue/raw_to_curated_employee_optimize.py
# with the Spline agent jar, and is given the employee/ prefix of the curated
# bucket as --OUTPUT_LOCATION.
module "glue_job_raw_to_curated_employee_optimize" {
  source = "./modules/etl_glue_job"

  name             = "RawToCurated_employee_optimize"
  script_file_path = "src/glue/raw_to_curated_employee_optimize.py"
  role_arn         = aws_iam_role.glue_execution_role.arn
  s3_bucket_name   = aws_s3_bucket.glue.bucket

  # Lineage: the uploaded agent jar plus the API Gateway stage invoke URL.
  jars             = "s3://${aws_s3_bucket_object.spline_agent_jar.bucket}/${aws_s3_bucket_object.spline_agent_jar.key}"
  lineage_endpoint = aws_apigatewayv2_stage.data_lineage.invoke_url

  job_parameters = {
    "--OUTPUT_LOCATION" = "s3://${module.s3_bucket_curated.bucket.bucket}/employee/"
  }
}

# Glue job "CuratedToAggregated_employee", built from the local etl_glue_job
# module. It runs src/glue/curated_to_aggregated_employee.py with the Spline
# agent jar, and is given the employee/ prefix of the aggregated bucket as
# --OUTPUT_LOCATION.
module "glue_job_curated_to_aggregated_employee" {
  source = "./modules/etl_glue_job"

  name             = "CuratedToAggregated_employee"
  script_file_path = "src/glue/curated_to_aggregated_employee.py"
  role_arn         = aws_iam_role.glue_execution_role.arn
  s3_bucket_name   = aws_s3_bucket.glue.bucket

  # Lineage: the uploaded agent jar plus the API Gateway stage invoke URL.
  jars             = "s3://${aws_s3_bucket_object.spline_agent_jar.bucket}/${aws_s3_bucket_object.spline_agent_jar.key}"
  lineage_endpoint = aws_apigatewayv2_stage.data_lineage.invoke_url

  job_parameters = {
    "--OUTPUT_LOCATION" = "s3://${module.s3_bucket_aggregated.bucket.bucket}/employee/"
  }
}