From ae46990864bc1c51d3639c054f5471292c453eae Mon Sep 17 00:00:00 2001
From: Juncong Moo
Date: Tue, 7 Mar 2023 08:55:11 +0000
Subject: [PATCH] polish code and add setup script

---
 MANIFEST.in           |  4 +++
 README.md             | 32 +++++++++++++++-------
 energonai/__init__.py |  1 +
 examples/bloom/run.sh |  0
 requirements.txt      | 12 ++++----
 setup.py              | 64 ++++++++++++++++++++++++++++++-------------
 version.txt           |  2 +-
 7 files changed, 79 insertions(+), 36 deletions(-)
 create mode 100644 MANIFEST.in
 mode change 100644 => 100755 examples/bloom/run.sh

diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..c2699c9
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include version.txt
+include requirements.txt
+
+
diff --git a/README.md b/README.md
index 84a1215..a313aa3 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
-
+
+
 # Energon-AI
 
 ![](https://img.shields.io/badge/Made%20with-ColossalAI-blueviolet?style=flat)
@@ -17,23 +18,34 @@ For models trained by [Colossal-AI](https://github.com/hpcaitech/ColossalAI), th
 For single-device models, they require manual coding works to introduce tensor parallelism and pipeline parallelism.
 
-### Installation
-**Install from source**
+## Installation
+
+There are three ways to install energonai.
+
+- **Install from pypi**
+
+``` bash
+pip install energonai
+```
+
+
+- **Install from source**
 ``` bash
 $ git clone git@github.com:hpcaitech/EnergonAI.git
 $ pip install -r requirements.txt
 $ pip install .
 ```
-**Use docker**
+
+- **Use docker**
 ``` bash
 $ docker pull hpcaitech/energon-ai:latest
 ```
 
-### Build an online OPT service in 5 minutes
+## Build an online OPT service in 5 minutes
 
 1. **Download OPT model:**
-   To launch the distributed inference service quickly, you can download the checkpoint of OPT-125M [here](https://huggingface.co/patrickvonplaten/opt_metaseq_125m/blob/main/model/restored.pt). You can get details for loading other sizes of models [here](https://github.com/hpcaitech/EnergonAI/tree/main/examples/opt/script).
+To launch the distributed inference service quickly, you can download the checkpoint of OPT-125M [here](https://huggingface.co/patrickvonplaten/opt_metaseq_125m/blob/main/model/restored.pt). You can get details for loading other sizes of models [here](https://github.com/hpcaitech/EnergonAI/tree/main/examples/opt/script).
 
 2. **Launch an HTTP service:**
    To launch a service, we need to provide python scripts to describe the model type and related configurations, and start an http service.
@@ -55,7 +67,7 @@ For example, set the model class as opt_125M and set the correct checkpoint path
 
 Then open ***https://[ip]:[port]/docs*** in your browser and try out!
 
-### Publication
+## Publication
 You can find technical details in our blog and manuscript:
 
 [Build an online OPT service using Colossal-AI in 5 minutes](https://www.colossalai.org/docs/advanced_tutorials/opt_service/)
@@ -73,8 +85,8 @@ You can find technical details in our blog and manuscript:
 }
 ```
 
-### Contributing
+## Contributing
 
 If interested in making your own contribution to the project, please refer to [Contributing](./CONTRIBUTING.md) for guidance.
 
-Thanks so much!
\ No newline at end of file
+Thanks so much!
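The README's quick-start above ends at opening the interactive `/docs` page. For a scriptable smoke test of a launched service, something like the sketch below works; the `/generation` route, the port, and the `prompt`/`max_tokens` fields are assumptions based on the OPT example, so confirm the actual schema on your service's `/docs` page first:

```python
# Smoke test for a locally launched EnergonAI OPT service.
# Assumptions (verify against http://[ip]:[port]/docs): the example server
# listens on port 8020 and exposes POST /generation taking JSON with
# "prompt" and "max_tokens" fields.
import requests

SERVER = "http://127.0.0.1:8020"

resp = requests.post(
    f"{SERVER}/generation",
    json={"prompt": "Introduce some landmarks in Beijing", "max_tokens": 64},
    timeout=60,
)
resp.raise_for_status()
print(resp.json())
```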
diff --git a/energonai/__init__.py b/energonai/__init__.py
index e2f19d4..8f8899d 100644
--- a/energonai/__init__.py
+++ b/energonai/__init__.py
@@ -4,3 +4,4 @@
 
 __all__ = ['BatchManager', 'launch_engine', 'SubmitEntry', 'TaskEntry',
            'QueueFullError']
+__version__ = '0.0.2'
diff --git a/examples/bloom/run.sh b/examples/bloom/run.sh
old mode 100644
new mode 100755
diff --git a/requirements.txt b/requirements.txt
index 7af25d6..ac86335 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,10 @@
-numpy
+colossalai
 tqdm
 psutil
 packaging
-fastapi~=0.75.1
-uvicorn==0.14
+numpy~=1.23.5
+fastapi~=0.92.0
+uvicorn~=0.20.0
 typer
 redis
 scipy
@@ -12,11 +13,10 @@
 requests
 click
 transformers
 readerwriterlock
---extra-index-url https://download.pytorch.org/whl/cu113
 torch
 torchvision
-torchaudio
-colossalai
+torchaudio
 omegaconf
+hiq-python
 prometheus-fastapi-instrumentator
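Dropping the `--extra-index-url` line from requirements.txt is what makes it safe for setup.py (below) to feed the file straight into `install_requires`: setuptools expects bare requirement specifiers and rejects pip option lines. If option lines ever return to the file, a filtering reader would be needed instead; the `fetch_requirements` below is a hypothetical replacement for the one this patch deletes from setup.py, not code from the patch itself:

```python
# Hypothetical helper (not part of this patch): read requirements.txt for
# setuptools' install_requires while skipping blank lines, comments, and
# pip-only option lines such as '--extra-index-url', which are not valid
# requirement specifiers and would make installation fail.
def fetch_requirements(path):
    requirements = []
    with open(path, 'r') as fd:
        for line in fd:
            line = line.strip()
            if not line or line.startswith('#') or line.startswith('-'):
                continue
            requirements.append(line)
    return requirements
```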
diff --git a/setup.py b/setup.py
index 5160cc4..954a73e 100644
--- a/setup.py
+++ b/setup.py
@@ -1,14 +1,14 @@
 import os
 import subprocess
 import sys
-
+import hiq
 import torch
 from setuptools import setup, find_packages
 from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
 
 # ninja build does not work unless include_dirs are abs path
 this_dir = os.path.dirname(os.path.abspath(__file__))
-build_cuda_ext = True
+build_cuda_ext = torch.cuda.is_available()
 
 if '--no_cuda_ext' in sys.argv:
     sys.argv.remove('--no_cuda_ext')
@@ -16,6 +16,8 @@
 
 
 def get_cuda_bare_metal_version(cuda_dir):
+    if cuda_dir is None or not os.path.exists(cuda_dir + "/bin/nvcc"):
+        return [None]*3
     raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
     output = raw_output.split()
     release_idx = output.index("release") + 1
@@ -35,12 +37,11 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
     print(raw_output + "from " + cuda_dir + "/bin\n")
 
     if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
-        raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " +
-                           "not match the version used to compile Pytorch binaries. " +
-                           "Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) +
-                           "In some cases, a minor-version mismatch will not cause later errors: " +
-                           "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. "
-                           "You can try commenting out this check (at your own risk).")
+        print("*" * 40 + "!!!Warning!!!" + "*" * 40)
+        print(f"The CUDA (nvcc) version ({bare_metal_major}.{bare_metal_minor}) does not match the version ({torch.version.cuda}) used to compile the PyTorch binaries.")
+        print(f"We strongly recommend you reinstall PyTorch compiled with CUDA version {bare_metal_major}.{bare_metal_minor}.")
+        print("In some cases, even a minor-version mismatch can cause subtle errors. Please refer to: https://github.com/NVIDIA/apex/pull/323#discussion_r287021798.")
+        print("*" * 90)
 
 
 def append_nvcc_threads(nvcc_extra_args):
@@ -50,11 +51,6 @@ def append_nvcc_threads(nvcc_extra_args):
     return nvcc_extra_args
 
 
-def fetch_requirements(path):
-    with open(path, 'r') as fd:
-        return [r.strip() for r in fd.readlines()]
-
-
 if not torch.cuda.is_available():
     # https://github.com/NVIDIA/apex/issues/486
     # Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to
@@ -69,14 +65,14 @@ def fetch_requirements(path):
           'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n'
           'If you wish to cross-compile for a single specific architecture,\n'
           'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
-    if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
+    if CUDA_HOME is not None and os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
         _, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME)
         if int(bare_metal_major) == 11:
             os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
         else:
             os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
 
-print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
+print("torch.__version__ = {}".format(torch.__version__))
 TORCH_MAJOR = int(torch.__version__.split('.')[0])
 TORCH_MINOR = int(torch.__version__.split('.')[1])
@@ -160,10 +156,25 @@ def get_version():
         version += f'+torch{torch_version}cu{cuda_version}'
     return version
 
+def package_files(ds):
+    paths = []
+    for d in ds:
+        for (path, directories, filenames) in os.walk(d):
+            for filename in filenames:
+                if '__pycache__' not in str(filename):
+                    paths.append(str(os.path.join(path, filename))[len('energonai/'):])
+    return paths
+extra_files = package_files(['energonai/'])
+
+#print("ext_modules:", ext_modules)
+#print("extra_files:", extra_files)
 setup(
     name='energonai',
-    version=get_version(),
+    maintainer='Juncong Moo;Open Source Community;HPCAiTech',
+    url='https://github.com/hpcaitech/EnergonAI',
+    maintainer_email='juncongmoo@gmail.com',
+    version=hiq.read_file('version.txt')[0],
     packages=find_packages(
         exclude=(
             'benchmark',
@@ -173,17 +184,32 @@ def get_version():
             'examples',
             'tests',
             'scripts',
-            'requirements',
             '*.egg-info',
             'dist',
             'build',
         )),
-    description='Large-scale Model Inference',
+    description='EnergonAI: An Inference System for Large Transformer Models',
+    long_description=hiq.read_file('README.md', by_line=False),
+    long_description_content_type="text/markdown",
     license='Apache Software License 2.0',
     ext_modules=ext_modules,
     cmdclass={'build_ext': BuildExtension} if ext_modules else {},
-    # install_requires=fetch_requirements('requirements.txt'),
+    install_requires=hiq.read_file('requirements.txt'),
     entry_points={
         'console_scripts': ['energonai=energonai.cli:typer_click_object', ],
     },
+    package_data={"energonai": extra_files, "": ['requirements.txt']},
+    classifiers=[
+        'Operating System :: POSIX :: Linux',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: Implementation :: CPython',
+        'Programming Language :: Python :: Implementation :: PyPy',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence',
+    ],
 )
+
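Because setup.py now does `import hiq` at module level, hiq-python must already be installed before `pip install .` can start; listing it in requirements.txt does not help at that point, since pip does not read that file while executing setup.py. The patch also leans on two behaviors of `hiq.read_file`, inferred here purely from its call sites above rather than from hiq's documentation: the default call returns a list of lines (so `[0]` picks out the version string), and `by_line=False` returns the whole file as one string. A minimal local stand-in under those assumptions:

```python
# Minimal stand-in for the hiq.read_file behavior this setup.py relies on
# (inferred from the call sites above, not from hiq's own documentation):
#   read_file(path)                -> list of lines with newlines stripped
#   read_file(path, by_line=False) -> the whole file as a single string
def read_file(path, by_line=True):
    with open(path, 'r', encoding='utf-8') as f:
        if by_line:
            return [line.rstrip('\n') for line in f]
        return f.read()


version = read_file('version.txt')[0]                     # e.g. '0.0.2'
long_description = read_file('README.md', by_line=False)
install_requires = read_file('requirements.txt')
```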
diff --git a/version.txt b/version.txt
index 8a9ecc2..4e379d2 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.0.1
\ No newline at end of file
+0.0.2
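Note that after this patch the release number is maintained by hand in two places: version.txt, which setup.py publishes, and the `__version__ = '0.0.2'` added to energonai/__init__.py. A small guard run before tagging a release, sketched below as a suggestion rather than part of the patch, keeps the two from drifting apart:

```python
# Pre-release guard (suggested, not part of this patch): verify that the
# hard-coded energonai.__version__ matches version.txt, which is the value
# setup.py actually publishes.
import energonai

with open('version.txt', 'r', encoding='utf-8') as f:
    file_version = f.read().strip()

assert energonai.__version__ == file_version, (
    f'version mismatch: energonai.__version__ is {energonai.__version__!r} '
    f'but version.txt says {file_version!r}'
)
print(f'version OK: {file_version}')
```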