From 50d787abb442285cdfa7d485a0bacc107a031be9 Mon Sep 17 00:00:00 2001
From: CUK_AIDev
Date: Thu, 5 Dec 2024 00:14:35 +0000
Subject: [PATCH] Add Spark CI Docker image

---
 .github/workflows/docker-build.yml | 77 ++++++++++++++++++++++++++++++++
 docker/Spark.CI.Dockerfile         | 84 +++++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+)
 create mode 100644 .github/workflows/docker-build.yml
 create mode 100644 docker/Spark.CI.Dockerfile

diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
new file mode 100644
index 0000000..1604363
--- /dev/null
+++ b/.github/workflows/docker-build.yml
@@ -0,0 +1,77 @@
+name: Build and Push Docker Image
+
+on:
+  push:
+    branches: [ "main" ]
+    tags: [ 'v*.*.*' ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+      security-events: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ghcr.io/${{ github.repository }}/spark-ci
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=semver,pattern={{version}}
+            type=semver,pattern={{major}}.{{minor}}
+            type=sha,format=long,prefix=
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: docker/Spark.CI.Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          platforms: linux/amd64,linux/arm64
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+
+      # The image only exists in the registry after the build step, so the
+      # scan runs after it and is skipped on pull requests, where nothing
+      # is pushed. The full-SHA tag referenced here comes from the
+      # `type=sha,format=long,prefix=` rule in the metadata step.
+      - name: Run Trivy vulnerability scanner
+        if: github.event_name != 'pull_request'
+        uses: aquasecurity/trivy-action@master
+        with:
+          image-ref: 'ghcr.io/${{ github.repository }}/spark-ci:${{ github.sha }}'
+          format: 'sarif'
+          output: 'trivy-results.sarif'
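+
+      # Assumed wiring for the `security-events: write` permission above:
+      # a sketch that publishes the SARIF report to the repository's
+      # Security tab via the standard github/codeql-action/upload-sarif action.
+      - name: Upload Trivy scan results
+        if: github.event_name != 'pull_request'
+        uses: github/codeql-action/upload-sarif@v3
+        with:
+          sarif_file: 'trivy-results.sarif'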
diff --git a/docker/Spark.CI.Dockerfile b/docker/Spark.CI.Dockerfile
new file mode 100644
index 0000000..878f68c
--- /dev/null
+++ b/docker/Spark.CI.Dockerfile
@@ -0,0 +1,84 @@
+FROM ubuntu:24.04
+
+# alternative Python: 3.11.7
+ARG PYTHON_VERSION=3.9.16
+ARG SPARK_VERSION=3.5.0
+# used only by the commented-out Hadoop install below
+ARG HADOOP_VERSION=3.3.5
+ARG JAVA_VERSION=11.0.24-zulu
+
+# Build dependencies for the pyenv-compiled Python plus general tooling
+RUN apt-get update && apt-get install -y \
+    curl \
+    zip \
+    unzip \
+    build-essential \
+    libssl-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    wget \
+    llvm \
+    libncurses-dev \
+    xz-utils \
+    tk-dev \
+    libxml2-dev \
+    libxmlsec1-dev \
+    libffi-dev \
+    liblzma-dev \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install pyenv
+RUN curl -fsSL https://pyenv.run | bash
+
+# Set up pyenv environment variables
+ENV PYENV_ROOT=/root/.pyenv
+ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
+
+# Build and select Python using pyenv
+RUN pyenv install ${PYTHON_VERSION} && \
+    pyenv global ${PYTHON_VERSION} && \
+    pyenv rehash
+
+# Install sdkman
+RUN curl -s "https://get.sdkman.io" | bash
+
+# Install Java and Spark using sdkman (Hadoop install kept below for reference)
+RUN bash -c "source $HOME/.sdkman/bin/sdkman-init.sh && \
+    sdk install java ${JAVA_VERSION} && \
+    sdk use java ${JAVA_VERSION} && \
+    sdk install spark ${SPARK_VERSION} && \
+    sdk use spark ${SPARK_VERSION}"
+    # && \
+    # sdk install hadoop ${HADOOP_VERSION} && \
+    # sdk use hadoop ${HADOOP_VERSION}"
+
+# Set up environment variables
+ENV JAVA_HOME=/root/.sdkman/candidates/java/current
+ENV SPARK_HOME=/root/.sdkman/candidates/spark/current
+# ENV HADOOP_HOME=/root/.sdkman/candidates/hadoop/current
+ENV PATH=$PATH:$JAVA_HOME/bin:$SPARK_HOME/bin:$SPARK_HOME/sbin
+# :$HADOOP_HOME/bin
+
+# Set up Python tooling: pip, pipx, and Poetry
+RUN python3 -m pip install --upgrade pip && \
+    python3 -m pip install --user pipx && \
+    python3 -m pipx ensurepath
+ENV PATH="/root/.local/bin:$PATH"
+
+RUN pipx install poetry==1.8.4
+
+# Verify installations
+RUN echo "Verifying installations..." && \
+    java -version && \
+    python3 --version && \
+    spark-submit --version && \
+    poetry -V
+    # hadoop version
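+
+# Illustrative local smoke test (not part of the CI run); `spark-ci:local`
+# is just an assumed tag name:
+#   docker build -f docker/Spark.CI.Dockerfile -t spark-ci:local .
+#   docker run --rm spark-ci:local spark-submit --version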