# evaluation_pipeline.yml
name: Evaluate LLM Workflow and Post Comment
permissions:
contents: read
pull-requests: write
issues: write
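# Triggered manually; dispatch it from a PR branch so the comparison steps
# below fire, or from main to refresh the baseline scores artifact.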
on: workflow_dispatch
jobs:
evaluation-pipeline:
runs-on: ubuntu-latest
defaults:
run:
working-directory: ./backend
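    # Note: working-directory applies to `run` steps only; `uses` steps
    # (cache, artifact upload) resolve their paths from the repo root.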
steps:
- uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v4
with:
python-version: 3.11.6
- name: Install poetry
uses: abatilo/actions-poetry@v2
      - name: Set up a local virtual environment (if no poetry.toml file exists)
run: |
poetry config virtualenvs.create true --local
poetry config virtualenvs.in-project true --local
      - uses: actions/cache@v3
        name: Cache the virtual environment, keyed on the dependency lock file
        with:
          path: ./backend/.venv
          key: venv-${{ hashFiles('backend/poetry.lock') }}
- name: Install the project dependencies
run: poetry install --with dev --no-interaction --no-root
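      # --run-expensive is assumed to be a repo-defined pytest flag (typically
      # registered in conftest.py) that opts in to the LLM-calling eval cases.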
- name: Run eval pipeline test cases
env:
DATA_DIRECTORY: ./
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: PYTHONPATH=. poetry run pytest tests/evaluation/test_query_flow.py --run-expensive
- name: Upload test results as artifact
        uses: actions/upload-artifact@v4
with:
name: test-results.csv
path: ./backend/test_results.csv
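      # Roll the per-case results up into scores.md, which doubles as the
      # main-branch baseline for PR comparisons below.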
- name: Run score aggregation script
run: poetry run python tests/evaluation/aggregate_scores.py
- name: Save scores as artifact
        uses: actions/upload-artifact@v4
with:
name: scores.md
path: ./backend/scores.md
- name: Find associated pull request
uses: jwalton/gh-find-current-pr@v1
id: findPr
        # Skip on main; on a PR branch this resolves the open PR so results can be posted there
if: github.ref != 'refs/heads/main'
continue-on-error: true
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
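      # Fetch the scores.md artifact from the latest main-branch run of this
      # workflow to serve as the comparison baseline.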
- name: Download main branch scores
if: github.ref != 'refs/heads/main'
uses: dawidd6/action-download-artifact@v6
with:
workflow: evaluation_pipeline.yml
branch: main
name: scores.md
path: ./backend/main-scores
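      # compare_scores.py is assumed to write its report to comparison.md,
      # which the remaining steps read and publish.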
- name: Compare scores and generate report
if: github.ref != 'refs/heads/main'
run: |
poetry run python tests/evaluation/compare_scores.py \
scores.md \
main-scores/scores.md
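      # GITHUB_OUTPUT only takes single-line values directly, so the multiline
      # report is passed through with the delimiter (heredoc-style) syntax.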
- name: Generate markdown comment
if: github.ref != 'refs/heads/main'
id: comparison
run: |
          comparison=$(cat comparison.md)
echo "comparison<<EOF" >> $GITHUB_OUTPUT
echo "$comparison" >> $GITHUB_OUTPUT
echo "EOF" >> $GITHUB_OUTPUT
- name: Post comparison on PR
if: success() && steps.findPr.outputs.number
uses: marocchino/sticky-pull-request-comment@v2
with:
number: ${{ steps.findPr.outputs.number }}
message: |
${{ steps.comparison.outputs.comparison }}
- name: Upload comparison as artifact
if: github.ref != 'refs/heads/main'
        uses: actions/upload-artifact@v4
with:
name: score-comparison
path: ./backend/comparison.md