Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release spark 3.4 #128

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ SHELL := /bin/sh

# Set variables if testing locally
ifeq ($(IS_RELEASE_BUILD),)
SPARK_VERSION := 3.3
SPARK_VERSION := 3.4
PROCESSOR := cpu
FRAMEWORK_VERSION := py39
SM_VERSION := 1.0
Expand Down Expand Up @@ -51,7 +51,7 @@ install-container-library: init
# Temporarily bypass py=1.1.0 because pytest-parallel has a dependency on it; however, the module is no longer maintained.
# In the future the pylib will be removed from pytest-parallel's dependencies, and 51457 should only impact the local tests.
# For more info, see https://github.com/pytest-dev/py/issues/287
pipenv run safety check -i 43975 -i 51457 # https://github.com/pyupio/safety
pipenv run safety check -i 43975 -i 51457 -i 39611 # https://github.com/pyupio/safety

build-static-config:
./scripts/fetch-ec2-instance-type-info.sh --region ${REGION} --use-case ${USE_CASE} --spark-version ${SPARK_VERSION} \
Expand Down
51 changes: 27 additions & 24 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,35 +6,38 @@ verify_ssl = true
[dev-packages]

[packages]
tenacity = "==8.0.1"
psutil = "==5.9.0"
click = "==8.1.2"
watchdog = "==0.10.3"
pyyaml = "==5.3.1"
tenacity = "==8.2.3"
psutil = "==5.9.5"
click = "==8.1.7"
watchdog = "==3.0.0"
waitress = "==2.1.2"
types-waitress = "==2.0.6"
requests = "==2.27.1"
types-requests = "==2.27.16"
rsa = "==4.3"
pyasn1 = "==0.4.8"
boto3 = "==1.21.33"
safety = "==2.3.1"
black = "==22.3.0"
mypy = "==0.942"
flake8 = "==4.0.1"
flake8-docstrings = "==1.5.0"
pytest = "==7.1.1"
pytest-cov = "==2.10.0"
pytest-xdist = "==2.5.0"
docker = "==5.0.3"
types-waitress = "==2.1.4.9"
requests = "==2.31.0"
types-requests = "==2.31.0.2"
rsa = "==4.7"
pyasn1 = "==0.5.0"
boto3 = "==1.28.38"
safety = "==2.3.5"
black = "==22.12.0"
mypy = "==1.5.1"
flake8 = "==6.1.0"
flake8-docstrings = "==1.7.0"
pytest = "==7.4.0"
pytest-cov = "==4.1.0"
pytest-xdist = "==3.3.1"
docker = "==6.1.3"
docker-compose = "==1.29.2"
cryptography = "==36.0.2"
typing-extensions = "==4.1.1"
cryptography = "==41.0.3"
typing-extensions = "==4.7.1"
sagemaker = "==2.117.0"
smspark = {editable = true, path = "."}
importlib-metadata = "==4.11.3"
importlib-metadata = "==4.13.0"
pytest-parallel = "==0.1.1"
pytest-rerunfailures = "10.0"
numpy = "==1.22.2"
pytest-rerunfailures = "==12.0"
numpy = "==1.25.2"
py = "==1.11.0"
awscli = "==1.29.38"

[requires]
python_version = "3.9"
2 changes: 1 addition & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions cython_constraint.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Cython<3
4 changes: 2 additions & 2 deletions new_images.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
---
new_images:
- spark: "3.3"
- spark: "3.4"
use-case: "processing"
processors: ["cpu"]
python: ["py39"]
sm_version: "1.2"
sm_version: "1.0"
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[tool.black]
line-length = 120
target-version = ['py39']
48 changes: 48 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import glob
import os

from setuptools import find_packages, setup

# Read the package version from the VERSION file in the current working directory.
# Strip surrounding whitespace so a trailing newline in that file does not end up
# inside the version string handed to setuptools.
with open("VERSION", "r") as version_file:
    version = version_file.read().strip()

# Package metadata and entry points for the smspark library.
setup(
    name="smspark",
    description="Library that enables running Spark Processing jobs on Amazon SageMaker",
    version=version,
    python_requires=">3.7.0",
    # Packages are discovered under src/ (src-layout project).
    packages=find_packages("src"),
    package_dir={"": "src"},
    # Every *.py file directly under src/smspark is also registered as a module,
    # named by its file name without the ".py" extension.
    py_modules=[os.path.splitext(os.path.basename(path))[0] for path in glob.glob("src/smspark/*.py")],
    author="Amazon Web Services",
    url="https://github.com/aws/smspark/",
    license="Apache License 2.0",
    keywords="ML Amazon AWS AI SageMaker Processing Spark",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3.9",
    ],
    setup_requires=["setuptools", "wheel"],
    # Console commands installed with the package.
    entry_points={
        "console_scripts": [
            "smspark-submit=smspark.cli:submit_main",
            "smspark-history-server=smspark.history_server_cli:run_history_server",
        ]
    },
)
46 changes: 24 additions & 22 deletions smsparkbuild/py39/Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,36 +6,38 @@ verify_ssl = true
[dev-packages]

[packages]
tenacity = "==8.0.1"
psutil = "==5.9.0"
click = "==8.1.2"
watchdog = "==0.10.3"
pyyaml = "==5.3.1"
tenacity = "==8.2.3"
psutil = "==5.9.5"
click = "==8.1.7"
watchdog = "==3.0.0"
waitress = "==2.1.2"
types-waitress = "==2.0.6"
types-waitress = "==2.1.4.9"
requests = "==2.31.0"
types-requests = "==2.27.16"
rsa = "==4.9"
pyasn1 = "==0.4.8"
boto3 = "==1.21.33"
types-requests = "==2.31.0.2"
rsa = "==4.7"
pyasn1 = "==0.5.0"
boto3 = "==1.28.38"
safety = "==2.3.5"
black = "==22.3.0"
mypy = "==0.942"
flake8 = "==4.0.1"
flake8-docstrings = "==1.5.0"
pytest = "==7.2.2"
pytest-cov = "==2.10.0"
pytest-xdist = "==3.2.1"
docker = "==5.0.3"
black = "==22.12.0"
mypy = "==1.5.1"
flake8 = "==6.1.0"
flake8-docstrings = "==1.7.0"
pytest = "==7.4.0"
pytest-cov = "==4.1.0"
pytest-xdist = "==3.3.1"
docker = "==6.1.3"
docker-compose = "==1.29.2"
cryptography = "==39.0.2"
typing-extensions = "==4.1.1"
cryptography = "==41.0.3"
typing-extensions = "==4.7.1"
sagemaker = "==2.117.0"
smspark = {editable = true, path = "."}
importlib-metadata = "==4.11.3"
importlib-metadata = "==4.13.0"
pytest-parallel = "==0.1.1"
pytest-rerunfailures = "10.0"
numpy = "==1.22.2"
pytest-rerunfailures = "==12.0"
numpy = "==1.25.2"
py = "==1.11.0"
awscli = "==1.29.38"

[requires]
python_version = "3.9"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
echo "Not implemented"
137 changes: 137 additions & 0 deletions spark/processing/3.4/py3/docker/py39/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
FROM 137112412989.dkr.ecr.us-west-2.amazonaws.com/amazonlinux:2
ARG REGION
ENV AWS_REGION ${REGION}

ENV JAVA_HOME /etc/alternatives/jre

RUN yum clean all \
&& yum update -y \
&& yum install -y awscli bigtop-utils curl gcc gzip unzip zip gunzip tar wget liblapack* libblas* libopencv* libopenblas*

# Taken from EMR https://tiny.amazon.com/1dp4p55nm/codeamazpackAwsCblob8b00src
RUN amazon-linux-extras enable corretto8 nginx1 \
&& yum install -y java-1.8.0-amazon-corretto-devel nginx python-virtualenv \
&& yum remove -y java-1.8.0-openjdk-headless

# Install python 3.9
# Builds CPython ${PYTHON_VERSION} (3.9.12 with the default ARG values) from the python.org
# source tarball after installing the compiler toolchain and the -devel headers it needs.
# The build is installed with `make altinstall`, and the resulting python3.9 / pip3.9
# binaries are then symlinked to /usr/local/bin/python3 and /usr/local/bin/pip3.
# The downloaded tarball and the extracted source tree are deleted at the end of this same
# RUN step.
ARG PYTHON_BASE_VERSION=3.9
ARG PYTHON_WITH_BASE_VERSION=python${PYTHON_BASE_VERSION}
ARG PIP_WITH_BASE_VERSION=pip${PYTHON_BASE_VERSION}
ARG PYTHON_VERSION=${PYTHON_BASE_VERSION}.12
RUN yum -y groupinstall 'Development Tools' \
&& yum -y install openssl-devel bzip2-devel libffi-devel sqlite-devel xz-devel \
&& wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
&& tar xzf Python-${PYTHON_VERSION}.tgz \
&& cd Python-*/ \
&& ./configure --enable-optimizations \
&& make altinstall \
&& echo -e 'alias python3=python3.9\nalias pip3=pip3.9' >> ~/.bashrc \
&& ln -s $(which ${PYTHON_WITH_BASE_VERSION}) /usr/local/bin/python3 \
&& ln -s $(which ${PIP_WITH_BASE_VERSION}) /usr/local/bin/pip3 \
&& cd .. \
&& rm Python-${PYTHON_VERSION}.tgz \
&& rm -rf Python-${PYTHON_VERSION}

# Install nginx. Older amazonlinux:2 images (e.g. 2.0.20200304.0) do not provide nginx,
# so the epel-release repository package is installed first.
# yum installs the release RPM straight from its URL, so it is not downloaded separately
# with wget (that only left an unused .rpm file in the image).
# The yum cache is removed in the same RUN step so it is not stored in this layer.
RUN yum install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm \
 && yum install -y nginx \
 && rm -rf /var/cache/yum

ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# http://blog.stuart.axelbrooke.com/python-3-on-spark-return-of-the-pythonhashseed
ENV PYTHONHASHSEED 0
ENV PYTHONIOENCODING UTF-8
ENV PIP_DISABLE_PIP_VERSION_CHECK 1

# Install EMR Spark/Hadoop
ENV HADOOP_HOME /usr/lib/hadoop
ENV HADOOP_CONF_DIR /usr/lib/hadoop/etc/hadoop
ENV SPARK_HOME /usr/lib/spark

COPY yum/emr-apps.repo /etc/yum.repos.d/emr-apps.repo

# Install hadoop / spark dependencies from EMR's yum repository for Spark optimizations.
# replace placeholder with region in repository URL
RUN sed -i "s/REGION/${AWS_REGION}/g" /etc/yum.repos.d/emr-apps.repo
RUN adduser -N hadoop

# These packages are a subset of what EMR installs in a cluster with the
# "hadoop", "spark", and "hive" applications.
# They include EMR-optimized libraries and extras.
RUN yum install -y aws-hm-client \
aws-java-sdk \
aws-sagemaker-spark-sdk \
emr-goodies \
emr-ruby \
emr-scripts \
emr-s3-select \
emrfs \
hadoop \
hadoop-client \
hadoop-hdfs \
hadoop-hdfs-datanode \
hadoop-hdfs-namenode \
hadoop-httpfs \
hadoop-kms \
hadoop-lzo \
hadoop-yarn \
hadoop-yarn-nodemanager \
hadoop-yarn-proxyserver \
hadoop-yarn-resourcemanager \
hadoop-yarn-timelineserver \
hive \
hive-hcatalog \
hive-hcatalog-server \
hive-jdbc \
hive-server2 \
s3-dist-cp \
spark-core \
spark-datanucleus \
spark-external \
spark-history-server \
spark-python

# Point Spark at proper python binary
ENV PYSPARK_PYTHON=/usr/local/bin/python3.9

# Setup Spark/Yarn/HDFS user as root
ENV PATH="/usr/bin:/opt/program:${PATH}"
ENV YARN_RESOURCEMANAGER_USER="root"
ENV YARN_NODEMANAGER_USER="root"
ENV HDFS_NAMENODE_USER="root"
ENV HDFS_DATANODE_USER="root"
ENV HDFS_SECONDARYNAMENODE_USER="root"

# Set up bootstrapping program and Spark configuration
COPY hadoop-config /opt/hadoop-config
COPY nginx-config /opt/nginx-config
COPY aws-config /opt/aws-config
COPY Pipfile Pipfile.lock setup.py *.whl /opt/program/
ENV PIPENV_PIPFILE=/opt/program/Pipfile
# Use the --system flag so that pipenv installs all packages into the system Python rather
# than into a virtualenv, since Docker containers do not need virtualenvs.
# pipenv is pinned because versions newer than 2022.4.8 fail to build smspark.
RUN /usr/local/bin/python3.9 -m pip install pipenv==2022.4.8 \
&& pipenv install --system \
&& /usr/local/bin/python3.9 -m pip install /opt/program/*.whl

# Setup container bootstrapper
COPY container-bootstrap-config /opt/container-bootstrap-config
RUN chmod +x /opt/container-bootstrap-config/bootstrap.sh \
&& /opt/container-bootstrap-config/bootstrap.sh

# With this config, spark history server will not run as daemon, otherwise there
# will be no server running and container will terminate immediately
ENV SPARK_NO_DAEMONIZE TRUE

WORKDIR $SPARK_HOME

# Install the sagemaker feature store spark connector
# https://docs.aws.amazon.com/sagemaker/latest/dg/batch-ingestion-spark-connector-setup.html
# Feature store connector library currently does not support spark 3.4 so commenting out this line
# RUN /usr/local/bin/python3.9 -m pip install sagemaker-feature-store-pyspark-3.3==1.1.2 --no-binary :all:

ENTRYPOINT ["smspark-submit"]
31 changes: 31 additions & 0 deletions spark/processing/3.4/py3/hadoop-config/core-site.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->

<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://nn_uri/</value>
<description>NameNode URI</description>
</property>
<property>
<name>fs.s3a.aws.credentials.provider</name>
<value>com.amazonaws.auth.DefaultAWSCredentialsProviderChain</value>
<description>AWS S3 credential provider</description>
</property>
<property>
<name>fs.s3.impl</name>
<value>org.apache.hadoop.fs.s3a.S3AFileSystem</value>
<description>s3a filesystem implementation</description>
</property>
<property>
<!-- The key must end in "impl" (fs.AbstractFileSystem.s3a.impl); with any other spelling
     Hadoop does not associate the value with the s3a AbstractFileSystem binding. -->
<name>fs.AbstractFileSystem.s3a.impl</name>
<value>org.apache.hadoop.fs.s3a.S3A</value>
<description>s3a filesystem implementation</description>
</property>
<property>
<name>fs.s3a.connection.maximum</name>
<value>100</value>
<description>s3a filesystem maximum connection</description>
</property>
</configuration>
Loading