diff --git a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml new file mode 100644 index 0000000..ec071d6 --- /dev/null +++ b/.github/workflows/sphinx.yml @@ -0,0 +1,28 @@ +name: "Sphinx: Render docs" + +on: push + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + with: + persist-credentials: false + - name: Build HTML + uses: ammaraskar/sphinx-action@master + with: + docs-folder: "doc/" + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: html-docs + path: doc/build/html/ + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + if: github.ref == 'refs/heads/main' + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: doc/build/html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bb04a20 --- /dev/null +++ b/.gitignore @@ -0,0 +1,168 @@ +# PE +results +amlt +_test +_data + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..31fe7d6 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: +- repo: https://github.com/psf/black + rev: 24.8.0 + hooks: + - id: black + args: [--line-length=119] \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..f9ba8cf --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,9 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9e841e7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/README.md b/README.md new file mode 100644 index 0000000..3af5417 --- /dev/null +++ b/README.md @@ -0,0 +1,43 @@ +# Private Evolution: Differentially Private Synthetic Data via Foundation Model APIs + +This repo is a Python library to **generate differentially private (DP) synthetic data without the need of any ML model training**. 
It is based on the following papers that proposed a new DP synthetic data framework that only utilizes the blackbox inference APIs of foundation models (e.g., Stable Diffusion, GPT models). + +* Differentially Private Synthetic Data via Foundation Model APIs 1: Images + [[paper (ICLR 2024)]](https://openreview.net/forum?id=YEhQs8POIo) [[paper (arxiv)](https://arxiv.org/abs/2305.15560)] + **Authors:** [[Zinan Lin](https://zinanlin.me/)], [[Sivakanth Gopi](https://www.microsoft.com/en-us/research/people/sigopi/)], [[Janardhan Kulkarni](https://www.microsoft.com/en-us/research/people/jakul/)], [[Harsha Nori](https://www.microsoft.com/en-us/research/people/hanori/)], [[Sergey Yekhanin](http://www.yekhanin.org/)] + + +## Documentation +Please refer to the [documentation](https://microsoft.github.io/DPSDA/) for more details, including the installation instructions, usage, and examples. + +## Attention + +The code that was published along with the [paper](https://arxiv.org/abs/2305.15560) has been moved to the [deprecated](https://github.com/microsoft/DPSDA/tree/deprecated) branch on 11/21/2024, which is no longer maintained. The code in the current main branch is a refactored version of the original codebase, which is more modularized and easier to use, with support of more advanced Private Evolution algorithms and APIs. + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require you to agree to a +Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us +the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. + +When you submit a pull request, a CLA bot will automatically determine whether you need to provide +a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions +provided by the bot. You will only need to do this once across all repos using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or +contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow +[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). +Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. +Any use of third-party trademarks or logos are subject to those third-party's policies. + +## Responsible Uses + +This project uses foundation model APIs to create [synthetic data](https://en.wikipedia.org/wiki/Synthetic_data) with [differential privacy](https://en.wikipedia.org/wiki/Differential_privacy) guarantees. Differential privacy (DP) is a formal framework that ensures the output of an algorithm does not reveal too much information about its inputs. Without a formal privacy guarantee, a synthetic data generation algorithm may inadvertently reveal sensitive information about its input datapoints. + +Using synthetic data in downstream applications can carry risk. Synthetic data may not always reflect the true data distribution, and can cause harms in downstream applications. 
Both the dataset and algorithms behind the foundation model APIs may contain various types of bias, leading to potential allocation, representation, and quality-of-service harms. Additionally, privacy violations can still occur if the ε and δ privacy parameters are set inappropriately, or if multiple copies of a sample exist in the seed dataset. It is important to consider these factors carefully before any potential deployments. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..e138ec5 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,41 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). + +If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. + +## Reporting Security Issues + +**Please do not report security vulnerabilities through public GitHub issues.** + +Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). + +If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). + +You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). + +Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: + + * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) + * Full paths of source file(s) related to the manifestation of the issue + * The location of the affected source code (tag/branch/commit or direct URL) + * Any special configuration required to reproduce the issue + * Step-by-step instructions to reproduce the issue + * Proof-of-concept or exploit code (if possible) + * Impact of the issue, including how an attacker might exploit the issue + +This information will help us triage your report more quickly. + +If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. + +## Preferred Languages + +We prefer all communications to be in English. + +## Policy + +Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). + + diff --git a/SUPPORT.md b/SUPPORT.md new file mode 100644 index 0000000..1be2295 --- /dev/null +++ b/SUPPORT.md @@ -0,0 +1,13 @@ +# Support + +## How to file issues and get help + +This project uses GitHub Issues to track bugs and feature requests. 
Please search the existing +issues before filing new issues to avoid duplicates. For new issues, file your bug or +feature request as a new Issue. + +For help and questions about using this project, please contact zinanlin AT microsoft.com. + +## Microsoft Support Policy + +Support for this project is limited to the resources listed above. diff --git a/doc/.gitkeep b/doc/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/build_autodoc.sh b/doc/build_autodoc.sh new file mode 100644 index 0000000..d20cf2f --- /dev/null +++ b/doc/build_autodoc.sh @@ -0,0 +1,2 @@ +sphinx-apidoc -f --module-first -d 3 -o source/api ../pe +make clean html \ No newline at end of file diff --git a/doc/build_doc.sh b/doc/build_doc.sh new file mode 100644 index 0000000..ac9f8ab --- /dev/null +++ b/doc/build_doc.sh @@ -0,0 +1 @@ +make html \ No newline at end of file diff --git a/doc/make.bat b/doc/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/doc/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/doc/requirements.txt b/doc/requirements.txt new file mode 100644 index 0000000..5614a3c --- /dev/null +++ b/doc/requirements.txt @@ -0,0 +1 @@ +sphinx_rtd_theme==3.0.2 \ No newline at end of file diff --git a/doc/serve_doc.sh b/doc/serve_doc.sh new file mode 100644 index 0000000..b6acc2c --- /dev/null +++ b/doc/serve_doc.sh @@ -0,0 +1 @@ +python3 -m http.server 8000 --directory build/html diff --git a/doc/source/api/api.rst b/doc/source/api/api.rst new file mode 100644 index 0000000..fc98e25 --- /dev/null +++ b/doc/source/api/api.rst @@ -0,0 +1,10 @@ +API Reference +=============================== + + +.. toctree:: + :maxdepth: 3 + :caption: Contents: + + modules + diff --git a/doc/source/api/modules.rst b/doc/source/api/modules.rst new file mode 100644 index 0000000..a446077 --- /dev/null +++ b/doc/source/api/modules.rst @@ -0,0 +1,7 @@ +pe +== + +.. 
toctree:: + :maxdepth: 3 + + pe diff --git a/doc/source/api/pe.api.image.improved_diffusion_lib.rst b/doc/source/api/pe.api.image.improved_diffusion_lib.rst new file mode 100644 index 0000000..841044c --- /dev/null +++ b/doc/source/api/pe.api.image.improved_diffusion_lib.rst @@ -0,0 +1,26 @@ +pe.api.image.improved\_diffusion\_lib package +============================================= + +.. automodule:: pe.api.image.improved_diffusion_lib + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.api.image.improved\_diffusion\_lib.gaussian\_diffusion module +---------------------------------------------------------------- + +.. automodule:: pe.api.image.improved_diffusion_lib.gaussian_diffusion + :members: + :undoc-members: + :show-inheritance: + +pe.api.image.improved\_diffusion\_lib.unet module +------------------------------------------------- + +.. automodule:: pe.api.image.improved_diffusion_lib.unet + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.api.image.rst b/doc/source/api/pe.api.image.rst new file mode 100644 index 0000000..c53f742 --- /dev/null +++ b/doc/source/api/pe.api.image.rst @@ -0,0 +1,34 @@ +pe.api.image package +==================== + +.. automodule:: pe.api.image + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 3 + + pe.api.image.improved_diffusion_lib + +Submodules +---------- + +pe.api.image.improved\_diffusion\_api module +-------------------------------------------- + +.. automodule:: pe.api.image.improved_diffusion_api + :members: + :undoc-members: + :show-inheritance: + +pe.api.image.stable\_diffusion\_api module +------------------------------------------ + +.. automodule:: pe.api.image.stable_diffusion_api + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.api.rst b/doc/source/api/pe.api.rst new file mode 100644 index 0000000..a2f2616 --- /dev/null +++ b/doc/source/api/pe.api.rst @@ -0,0 +1,34 @@ +pe.api package +============== + +.. automodule:: pe.api + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 3 + + pe.api.image + +Submodules +---------- + +pe.api.api module +----------------- + +.. automodule:: pe.api.api + :members: + :undoc-members: + :show-inheritance: + +pe.api.util module +------------------ + +.. automodule:: pe.api.util + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.callback.common.rst b/doc/source/api/pe.callback.common.rst new file mode 100644 index 0000000..9c387d0 --- /dev/null +++ b/doc/source/api/pe.callback.common.rst @@ -0,0 +1,26 @@ +pe.callback.common package +========================== + +.. automodule:: pe.callback.common + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.callback.common.compute\_fid module +-------------------------------------- + +.. automodule:: pe.callback.common.compute_fid + :members: + :undoc-members: + :show-inheritance: + +pe.callback.common.save\_checkpoints module +------------------------------------------- + +.. automodule:: pe.callback.common.save_checkpoints + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.callback.image.rst b/doc/source/api/pe.callback.image.rst new file mode 100644 index 0000000..784912c --- /dev/null +++ b/doc/source/api/pe.callback.image.rst @@ -0,0 +1,26 @@ +pe.callback.image package +========================= + +.. 
automodule:: pe.callback.image + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.callback.image.sample\_images module +--------------------------------------- + +.. automodule:: pe.callback.image.sample_images + :members: + :undoc-members: + :show-inheritance: + +pe.callback.image.save\_all\_images module +------------------------------------------ + +.. automodule:: pe.callback.image.save_all_images + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.callback.rst b/doc/source/api/pe.callback.rst new file mode 100644 index 0000000..ae740bf --- /dev/null +++ b/doc/source/api/pe.callback.rst @@ -0,0 +1,27 @@ +pe.callback package +=================== + +.. automodule:: pe.callback + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 3 + + pe.callback.common + pe.callback.image + +Submodules +---------- + +pe.callback.callback module +--------------------------- + +.. automodule:: pe.callback.callback + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.constant.rst b/doc/source/api/pe.constant.rst new file mode 100644 index 0000000..4f56470 --- /dev/null +++ b/doc/source/api/pe.constant.rst @@ -0,0 +1,18 @@ +pe.constant package +=================== + +.. automodule:: pe.constant + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.constant.data module +----------------------- + +.. automodule:: pe.constant.data + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.data.image.rst b/doc/source/api/pe.data.image.rst new file mode 100644 index 0000000..177b224 --- /dev/null +++ b/doc/source/api/pe.data.image.rst @@ -0,0 +1,42 @@ +pe.data.image package +===================== + +.. automodule:: pe.data.image + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.data.image.camelyon17 module +------------------------------- + +.. automodule:: pe.data.image.camelyon17 + :members: + :undoc-members: + :show-inheritance: + +pe.data.image.cat module +------------------------ + +.. automodule:: pe.data.image.cat + :members: + :undoc-members: + :show-inheritance: + +pe.data.image.cifar10 module +---------------------------- + +.. automodule:: pe.data.image.cifar10 + :members: + :undoc-members: + :show-inheritance: + +pe.data.image.image module +-------------------------- + +.. automodule:: pe.data.image.image + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.data.rst b/doc/source/api/pe.data.rst new file mode 100644 index 0000000..27d092f --- /dev/null +++ b/doc/source/api/pe.data.rst @@ -0,0 +1,26 @@ +pe.data package +=============== + +.. automodule:: pe.data + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 3 + + pe.data.image + +Submodules +---------- + +pe.data.data module +------------------- + +.. automodule:: pe.data.data + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.dp.rst b/doc/source/api/pe.dp.rst new file mode 100644 index 0000000..6473b69 --- /dev/null +++ b/doc/source/api/pe.dp.rst @@ -0,0 +1,26 @@ +pe.dp package +============= + +.. automodule:: pe.dp + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.dp.dp module +--------------- + +.. automodule:: pe.dp.dp + :members: + :undoc-members: + :show-inheritance: + +pe.dp.gaussian module +--------------------- + +.. 
automodule:: pe.dp.gaussian + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.embedding.image.rst b/doc/source/api/pe.embedding.image.rst new file mode 100644 index 0000000..1777711 --- /dev/null +++ b/doc/source/api/pe.embedding.image.rst @@ -0,0 +1,18 @@ +pe.embedding.image package +========================== + +.. automodule:: pe.embedding.image + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.embedding.image.inception module +----------------------------------- + +.. automodule:: pe.embedding.image.inception + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.embedding.rst b/doc/source/api/pe.embedding.rst new file mode 100644 index 0000000..fbeee54 --- /dev/null +++ b/doc/source/api/pe.embedding.rst @@ -0,0 +1,26 @@ +pe.embedding package +==================== + +.. automodule:: pe.embedding + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 3 + + pe.embedding.image + +Submodules +---------- + +pe.embedding.embedding module +----------------------------- + +.. automodule:: pe.embedding.embedding + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.histogram.nearest_neighbor_backend.rst b/doc/source/api/pe.histogram.nearest_neighbor_backend.rst new file mode 100644 index 0000000..6377013 --- /dev/null +++ b/doc/source/api/pe.histogram.nearest_neighbor_backend.rst @@ -0,0 +1,26 @@ +pe.histogram.nearest\_neighbor\_backend package +=============================================== + +.. automodule:: pe.histogram.nearest_neighbor_backend + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.histogram.nearest\_neighbor\_backend.faiss module +---------------------------------------------------- + +.. automodule:: pe.histogram.nearest_neighbor_backend.faiss + :members: + :undoc-members: + :show-inheritance: + +pe.histogram.nearest\_neighbor\_backend.sklearn module +------------------------------------------------------ + +.. automodule:: pe.histogram.nearest_neighbor_backend.sklearn + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.histogram.rst b/doc/source/api/pe.histogram.rst new file mode 100644 index 0000000..2316f2d --- /dev/null +++ b/doc/source/api/pe.histogram.rst @@ -0,0 +1,34 @@ +pe.histogram package +==================== + +.. automodule:: pe.histogram + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 3 + + pe.histogram.nearest_neighbor_backend + +Submodules +---------- + +pe.histogram.histogram module +----------------------------- + +.. automodule:: pe.histogram.histogram + :members: + :undoc-members: + :show-inheritance: + +pe.histogram.nearest\_neighbors module +-------------------------------------- + +.. automodule:: pe.histogram.nearest_neighbors + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.logger.rst b/doc/source/api/pe.logger.rst new file mode 100644 index 0000000..5c3b302 --- /dev/null +++ b/doc/source/api/pe.logger.rst @@ -0,0 +1,50 @@ +pe.logger package +================= + +.. automodule:: pe.logger + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.logger.csv\_print module +--------------------------- + +.. automodule:: pe.logger.csv_print + :members: + :undoc-members: + :show-inheritance: + +pe.logger.image\_file module +---------------------------- + +.. 
automodule:: pe.logger.image_file + :members: + :undoc-members: + :show-inheritance: + +pe.logger.log\_print module +--------------------------- + +.. automodule:: pe.logger.log_print + :members: + :undoc-members: + :show-inheritance: + +pe.logger.logger module +----------------------- + +.. automodule:: pe.logger.logger + :members: + :undoc-members: + :show-inheritance: + +pe.logger.matplotlib\_pdf module +-------------------------------- + +.. automodule:: pe.logger.matplotlib_pdf + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.logging.rst b/doc/source/api/pe.logging.rst new file mode 100644 index 0000000..638a63b --- /dev/null +++ b/doc/source/api/pe.logging.rst @@ -0,0 +1,7 @@ +pe.logging package +================== + +.. automodule:: pe.logging + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.metric_item.rst b/doc/source/api/pe.metric_item.rst new file mode 100644 index 0000000..91146e4 --- /dev/null +++ b/doc/source/api/pe.metric_item.rst @@ -0,0 +1,7 @@ +pe.metric\_item package +======================= + +.. automodule:: pe.metric_item + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.population.rst b/doc/source/api/pe.population.rst new file mode 100644 index 0000000..c7f26a6 --- /dev/null +++ b/doc/source/api/pe.population.rst @@ -0,0 +1,26 @@ +pe.population package +===================== + +.. automodule:: pe.population + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.population.pe\_population module +----------------------------------- + +.. automodule:: pe.population.pe_population + :members: + :undoc-members: + :show-inheritance: + +pe.population.population module +------------------------------- + +.. automodule:: pe.population.population + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.rst b/doc/source/api/pe.rst new file mode 100644 index 0000000..5d6227f --- /dev/null +++ b/doc/source/api/pe.rst @@ -0,0 +1,27 @@ +pe package +========== + +.. automodule:: pe + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 3 + + pe.api + pe.callback + pe.constant + pe.data + pe.dp + pe.embedding + pe.histogram + pe.logger + pe.logging + pe.metric_item + pe.population + pe.runner + pe.util diff --git a/doc/source/api/pe.runner.rst b/doc/source/api/pe.runner.rst new file mode 100644 index 0000000..f7b7b13 --- /dev/null +++ b/doc/source/api/pe.runner.rst @@ -0,0 +1,18 @@ +pe.runner package +================= + +.. automodule:: pe.runner + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.runner.pe module +------------------- + +.. automodule:: pe.runner.pe + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/api/pe.util.rst b/doc/source/api/pe.util.rst new file mode 100644 index 0000000..de60c5b --- /dev/null +++ b/doc/source/api/pe.util.rst @@ -0,0 +1,18 @@ +pe.util package +=============== + +.. automodule:: pe.util + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +pe.util.download module +----------------------- + +.. automodule:: pe.util.download + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 0000000..c64d68e --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,56 @@ +# Configuration file for the Sphinx documentation builder. 
+# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "Private Evolution" +copyright = "2024, Zinan Lin" +author = "Zinan Lin" +release = "0.0.1" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [] + +templates_path = ["_templates"] +exclude_patterns = [] + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" # 'alabaster' +html_static_path = ["_static"] + +html_theme_options = {"navigation_depth": 6} + +extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", + "sphinx.ext.napoleon", +] + +# Napoleon settings +napoleon_google_docstring = True +napoleon_numpy_docstring = True +napoleon_include_init_with_doc = True +napoleon_include_private_with_doc = True +napoleon_include_special_with_doc = True +napoleon_use_admonition_for_examples = False +napoleon_use_admonition_for_notes = False +napoleon_use_admonition_for_references = False +napoleon_use_ivar = False +napoleon_use_param = True +napoleon_use_rtype = True + +numfig = True diff --git a/doc/source/getting_started/details/api.rst b/doc/source/getting_started/details/api.rst new file mode 100644 index 0000000..af3ada6 --- /dev/null +++ b/doc/source/getting_started/details/api.rst @@ -0,0 +1,32 @@ +APIs +==== + +API reference: :doc:`/api/pe.api` + +:py:class:`pe.api.api.API` is responsible for implementing the foundation model APIs. It has the following key methods: + +* :py:meth:`pe.api.api.API.random_api`: Randomly generates the synthetic samples for the initial samples of the **Private Evolution** algorithm. +* :py:meth:`pe.api.api.API.variation_api`: Generates the variations of the given synthetic samples for the initial or the next **Private Evolution** iteration. + +Available APIs +-------------- + +Currently, the following APIs are implemented: + +* Images + + * :py:class:`pe.api.image.stable_diffusion_api.StableDiffusion`: The APIs of `Stable Diffusion`_. + * :py:class:`pe.api.image.improved_diffusion_api.ImprovedDiffusion`: The APIs of the `improved diffusion model`_. + +* Text + + * Coming soon! + +Adding Your Own APIs +-------------------- + +To add your own APIs, you need to create a class that inherits from :py:class:`pe.api.api.API` and implements the :py:meth:`pe.api.api.API.random_api` and :py:meth:`pe.api.api.API.variation_api` methods. + + +.. _improved diffusion model: https://github.com/openai/improved-diffusion +.. _Stable Diffusion: https://huggingface.co/CompVis/stable-diffusion-v1-4 diff --git a/doc/source/getting_started/details/callback_and_logger.rst b/doc/source/getting_started/details/callback_and_logger.rst new file mode 100644 index 0000000..cd68f30 --- /dev/null +++ b/doc/source/getting_started/details/callback_and_logger.rst @@ -0,0 +1,36 @@ +Callbacks and Loggers +====================== + +API reference: :doc:`/api/pe.callback` and :doc:`/api/pe.logger`. 
+ +:py:class:`pe.callback.callback.Callback` can be configured to be called after each **Private Evolution** iteration with the synthetic data as the input. It is useful for computing metrics, saving the synthetic samples, monitoring the progress, etc. Each :py:class:`pe.callback.callback.Callback` can return a list of results (float numbers, images, matplotlib plots, etc.) in the form of :py:class:`pe.metric_item.MetricItem` (see :py:mod:`pe.metric_item`). All :py:class:`pe.metric_item.MetricItem` from all :py:class:`pe.callback.callback.Callback` will be passed through each of the :py:class:`pe.logger.logger.Logger` modules, which will then log the results in the desired way. + +Available Callbacks +------------------- + +Currently, the following callbacks are implemented: + +* For any data modality + + * :py:class:`pe.callback.common.compute_fid.ComputeFID`: Computes the FID between the synthetic samples and the private samples. + * :py:class:`pe.callback.common.save_checkpoints.SaveCheckpoints`: Saves the checkpoint of current synthetic samples to files. + +* Images + + * :py:class:`pe.callback.image.sample_images.SampleImages`: Samples some images from each class. + * :py:class:`pe.callback.image.save_all_images.SaveAllImages`: Saves all synthetic images to files. + +* Text + + * Coming soon! + + +Available Loggers +----------------- + +Currently, the following loggers are implemented: + +* :py:class:`pe.logger.csv_print.CSVPrint`: Saves the float results to a CSV file. +* :py:class:`pe.logger.log_print.LogPrint`: Prints the float results to the console and/or files using the logging module. +* :py:class:`pe.logger.image_file.ImageFile`: Saves the images to files. +* :py:class:`pe.logger.matplotlib_pdf.MatplotlibPDF`: Saves the matplotlib plots to PDF files. diff --git a/doc/source/getting_started/details/data.rst b/doc/source/getting_started/details/data.rst new file mode 100644 index 0000000..420a5a9 --- /dev/null +++ b/doc/source/getting_started/details/data.rst @@ -0,0 +1,49 @@ +Data +==== + +API reference: :doc:`/api/pe.data`. + +:py:class:`pe.data.data.Data` is the base class for holding the synthetic samples or the private samples, along with their metadata. Different components are mostly communicated through objects of this class. +:py:class:`pe.data.data.Data` has two key attributes: + +* ``data_frame``: A pandas_ DataFrame that holds the samples. Each row in the DataFrame is a sample, and each column is part of the sample (e.g., the image, the text, the label) and other information of the sample (e.g., its embedding produced by :doc:`embedding`). +* ``metadata``: A OmegaConf_ that holds the metadata of the samples, such as the **Private Evolution** iteration number when the samples are generated, and the label names of the classes. + +Available Datasets +------------------ + +For convenience, some well-known datasets are already packaged as `pe.data.data.Data` classes: + +* Image datasets + + * :py:class:`pe.data.image.cifar10.Cifar10`: The `CIFAR10 dataset`_. + * :py:class:`pe.data.image.camelyon17.Camelyon17`: The `Camelyon17 dataset`_. + * :py:class:`pe.data.image.cat.Cat`: The `Cat dataset`_. + * In addition, you can easily load a custom image dataset from a (nested) directory with the image files using :py:meth:`pe.data.image.image.load_image_folder`. + +* Text datasets + + * Coming soon! 
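+
+For example, the packaged image datasets listed above can be loaded directly and used as the private data. The following is a minimal sketch based on the image examples in this repository (the ``root_dir`` path is illustrative and can point to any writable directory):
+
+.. code-block:: python
+
+    from pe.data.image import Cifar10, Camelyon17
+
+    # Packaged datasets are ready-to-use pe.data.data.Data objects that hold
+    # the private samples (data_frame) and their metadata
+    cifar10_data = Cifar10()
+    camelyon17_data = Camelyon17(root_dir="/tmp/data/")
+
+These objects can then be passed as ``priv_data`` to :py:class:`pe.runner.pe.PE`, as done in the image examples.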
+ +Using Your Own Datasets +----------------------- +To apply **Private Evolution** to your own private dataset, you need to create a :py:class:`pe.data.data.Data` object that holds your dataset, with two parameters, ``data_frame`` and ``metadata``, passed to the constructor: + +* ``data_frame``: A pandas_ DataFrame that holds the samples. Each row in the DataFrame is a sample. The following columns must be included: + + * :py:attr:`pe.constant.data.LABEL_ID_COLUMN_NAME`: The label (class) ID of the sample. The label IDs must be in {0, 1, ..., K-1} if there are K classes. If you are targeting unconditional generation, the values of this column can just be zeros. + + The ``data_frame`` can have any numbers of additional columns that hold the data of the samples, as long as the modules you are using (e.g., :doc:`api`, :doc:`Callbacks `) can recognize them. + +* ``metadata``: A dictionary that holds the metadata of the samples. The following keys must be included: + + * ``label_names``: A list of strings that holds the names of the classes. The length of the list must be equal to K. + + In addition, you can include any other keys that hold the metadata of the samples if needed. + + +.. _OmegaConf: https://omegaconf.readthedocs.io/en/latest/ +.. _pandas: https://pandas.pydata.org/ +.. _Cat dataset: https://www.kaggle.com/datasets/fjxmlzn/cat-cookie-doudou +.. _CIFAR10 dataset: https://www.cs.toronto.edu/~kriz/cifar.html +.. _Camelyon17 dataset: https://camelyon17.grand-challenge.org/ diff --git a/doc/source/getting_started/details/details.rst b/doc/source/getting_started/details/details.rst new file mode 100644 index 0000000..37efb39 --- /dev/null +++ b/doc/source/getting_started/details/details.rst @@ -0,0 +1,16 @@ +Details of the Library +======================= + +.. toctree:: + :maxdepth: 5 + :caption: Contents: + + overview + data + population + api + histogram + dp + embedding + callback_and_logger + runner diff --git a/doc/source/getting_started/details/dp.rst b/doc/source/getting_started/details/dp.rst new file mode 100644 index 0000000..2a3e57c --- /dev/null +++ b/doc/source/getting_started/details/dp.rst @@ -0,0 +1,16 @@ +DP +=== + +API reference: :doc:`/api/pe.dp`. + +:py:class:`pe.dp.dp.DP` is responsible for implementing the differential privacy mechanism. It has the following key methods: + +* :py:meth:`pe.dp.dp.DP.set_epsilon_and_delta`: Set the privacy budget for the differential privacy mechanism. +* :py:meth:`pe.dp.dp.DP.add_noise`: Add noise to the histogram values to achieve differential privacy. + +Available Differential Privacy Mechanisms +----------------------------------------- + +Currently, the following differential privacy mechanisms are implemented: + +* :py:class:`pe.dp.gaussian.Gaussian`: The Gaussian mechanism, which adds Gaussian noise to the histogram values. \ No newline at end of file diff --git a/doc/source/getting_started/details/embedding.rst b/doc/source/getting_started/details/embedding.rst new file mode 100644 index 0000000..acf93ec --- /dev/null +++ b/doc/source/getting_started/details/embedding.rst @@ -0,0 +1,22 @@ +Embeddings +========== + +API reference: :doc:`/api/pe.embedding`. + +:py:class:`pe.embedding.embedding.Embedding` is responsible for computing the embeddings of the (synthetic or private) samples. It has the following key methods/attributes: + +* :py:meth:`pe.embedding.embedding.Embedding.compute_embedding`: Computes the embeddings of the (synthetic or private) samples. 
+* :py:attr:`pe.embedding.embedding.Embedding.column_name`: The column name to be used when saving the embeddings in the ``data_frame`` of `pe.data.data.Data`. + +Available Embeddings +-------------------- + +Currently, the following embeddings are implemented: + +* Images + + * :py:class:`pe.embedding.image.inception.Inception`: The embeddings computed using the Inception model. + +* Text + + * Coming soon! diff --git a/doc/source/getting_started/details/histogram.rst b/doc/source/getting_started/details/histogram.rst new file mode 100644 index 0000000..b7ee4ac --- /dev/null +++ b/doc/source/getting_started/details/histogram.rst @@ -0,0 +1,15 @@ +Histograms +========== + +API reference: :doc:`/api/pe.histogram`. + +:py:class:`pe.histogram.histogram.Histogram` is responsible for generating the histograms over the synthetic samples. It has the following key methods: + +* :py:meth:`pe.histogram.histogram.Histogram.compute_histogram`: Generates the histograms over the synthetic samples using private samples. + +Available Histograms +-------------------- + +Currently, the following histograms are implemented: + +* :py:class:`pe.histogram.nearest_neighbors.NearestNeighbors`: This histogram algorithm projects the synthetic samples and the private samples into an embedding space and computes the nearest neighbor(s) of each private sample in the synthetic samples. The histogram value for each synthetic sample is the number of times it is the nearest neighbor(s) of a private sample. \ No newline at end of file diff --git a/doc/source/getting_started/details/overview.rst b/doc/source/getting_started/details/overview.rst new file mode 100644 index 0000000..7bc59bf --- /dev/null +++ b/doc/source/getting_started/details/overview.rst @@ -0,0 +1,59 @@ +Overview +======== + +.. _workflow: +.. figure:: workflow.jpg + :align: center + :figwidth: 90% + + The workflow of **Private Evolution**. + + +The Private Evolution Algorithm +------------------------------- + +The workflow of `Private Evolution `_ is shown in the above :numref:`workflow`. +The workflow consists of the following steps: + +* Using the foundation model API to randomly generate the initial synthetic samples. +* Iteratively refining the synthetic samples by: + + * Building a histogram over the synthetic samples using the private samples. The histogram value for each synthetic sample represents how similar it is to the private samples. + * Adding noise to the histogram to ensure differential privacy. + * Selecting a subset of the synthetic samples based on the noisy histogram. Those selected samples are expected to be more similar to the private samples. + * Using the foundation model API to generate variations of the selected synthetic samples. + +* Outputting the final synthetic samples. + +Core Design Principles of This Library +-------------------------------------- + +The design principles of this library are: + +* Easy to use. +* Supporting different data modalities (e.g., images, text), different foundation model APIs (e.g., Stable Diffusion, GPT models), different **Private Evolution** algorithms (e.g., PE, Aug-PE), and different evaluation metrics (e.g., FID), all in one framework. +* Easy to add new data modalities, foundation model APIs, **Private Evolution** algorithms, evaluation metrics, etc. + +Towards these goals, the library is designed to be highly modular and extensible, as discussed next. 
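+
+As a preview, one **Private Evolution** iteration composes these components roughly as follows. This is a simplified pseudocode sketch rather than the actual :doc:`runner` implementation; ``build_histogram``, ``add_dp_noise``, and ``select`` are illustrative placeholder names, while ``random_api`` and ``variation_api`` are the actual API methods described in :doc:`api`:
+
+.. code-block:: python
+
+    # Simplified pseudocode of the Private Evolution loop (illustrative names only)
+    synthetic = api.random_api(...)                       # initial synthetic samples
+    for _ in range(num_iterations):
+        histogram = build_histogram(synthetic, private)   # similarity to private samples
+        noisy_histogram = add_dp_noise(histogram)         # differential privacy
+        selected = select(synthetic, noisy_histogram)     # keep the most similar samples
+        synthetic = api.variation_api(selected)           # generate variations
+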
+Core Components of This Library
+--------------------------------
+
+This library provides a set of core components that can be easily customized or replaced. The core components shown in the :numref:`workflow` include:
+
+* :doc:`runner`: Running the whole **Private Evolution** algorithm.
+* :doc:`population`: Generating the initial synthetic samples and the variations of the synthetic samples.
+* :doc:`histogram`: Building the histogram over the synthetic samples.
+* :doc:`dp`: Adding noise to the histogram to ensure differential privacy.
+
+In addition to these components shown in the :numref:`workflow`, the library also has the following core components:
+
+* :doc:`data`: This class holds the synthetic samples or the private samples. Different components are mostly communicated through objects of this class.
+* :doc:`api`: This class implements the foundation model APIs. This class is utilized by :doc:`population` to generate the synthetic samples and the variations of the synthetic samples. It might also be used in some :doc:`histogram` algorithms when building the histogram (e.g., when ``lookahead`` in `PE <paper_>`_ is used).
+* :doc:`embedding`: This class is used to embed the synthetic/private samples into an embedding space. It might be used in some :doc:`histogram` algorithms when building the histogram. It might also be used in some :doc:`metric evaluation callback modules <callback_and_logger>` (e.g., for computing FID).
+* :doc:`callback_and_logger`: The :doc:`runner` can be configured to call a given list of callback modules at the end of each Private Evolution iteration. This is very useful for saving the intermediate results, evaluating the synthetic samples, etc. Since we might want to evaluate multiple metrics (e.g., FID, precision, recall), and for each metric, we might want to log it in different ways (e.g., saving it to a file, printing it to the console, uploading it to WandB), the library abstracts this part into two modules:
+
+  * :doc:`Callbacks <callback_and_logger>`: This module computes the metrics and (optionally) returns the results. (The callback can also return nothing if it does not need the loggers to help with logging the results, e.g., if this callback is for saving the intermediate synthetic samples.)
+  * :doc:`Loggers <callback_and_logger>`: All results returned by the callback modules will be passed through each of the logger modules, which will then log the results in the desired way.
+
+.. _paper: https://arxiv.org/abs/2305.15560
diff --git a/doc/source/getting_started/details/population.rst b/doc/source/getting_started/details/population.rst
new file mode 100644
index 0000000..0396076
--- /dev/null
+++ b/doc/source/getting_started/details/population.rst
@@ -0,0 +1,15 @@
+Population
+==========
+
+API reference: :doc:`/api/pe.population`.
+
+:py:class:`pe.population.population.Population` is responsible for generating the initial synthetic samples and the new synthetic samples for each **Private Evolution** iteration. It has the following key methods:
+
+* :py:meth:`pe.population.population.Population.initial`: Generates the initial synthetic samples.
+* :py:meth:`pe.population.population.Population.next`: Generates the synthetic samples for the next **Private Evolution** iteration.
+
+Available Populations
+---------------------
+
+:py:class:`pe.population.pe_population.PEPopulation` is currently the only implementation of :py:class:`pe.population.population.Population`. It supports the key population algorithms from existing **Private Evolution** papers (https://github.com/fjxmlzn/private-evolution-papers).
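+
+For example, the image examples in this repository construct a :py:class:`pe.population.pe_population.PEPopulation` from a foundation model API as follows. This is a minimal sketch mirroring the CIFAR10 example; the ``variation_degrees``, ``timestep_respacing``, and ``histogram_threshold`` values are specific to that example and should be tuned for your own data:
+
+.. code-block:: python
+
+    from pe.api.image import ImprovedDiffusion270M
+    from pe.population import PEPopulation
+
+    # Foundation model API used to generate the initial samples and their variations
+    api = ImprovedDiffusion270M(
+        variation_degrees=list(range(0, 42, 2)),
+        timestep_respacing="100",
+    )
+    # Population that selects synthetic samples based on the (noisy) histogram
+    # and calls the API to generate variations of the selected samples
+    population = PEPopulation(api=api, histogram_threshold=10)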
+ diff --git a/doc/source/getting_started/details/runner.rst b/doc/source/getting_started/details/runner.rst new file mode 100644 index 0000000..24054b0 --- /dev/null +++ b/doc/source/getting_started/details/runner.rst @@ -0,0 +1,8 @@ +Runner +====== + +API reference: :doc:`/api/pe.runner`. + +:py:class:`pe.runner.pe.PE` manages the main **Private Evolution** algorithm by calling the other components discussed before. It has the following key methods: + +* :py:meth:`pe.runner.pe.PE.run`: Runs the **Private Evolution** algorithm. diff --git a/doc/source/getting_started/details/workflow.jpg b/doc/source/getting_started/details/workflow.jpg new file mode 100644 index 0000000..b684486 Binary files /dev/null and b/doc/source/getting_started/details/workflow.jpg differ diff --git a/doc/source/getting_started/examples.rst b/doc/source/getting_started/examples.rst new file mode 100644 index 0000000..a5d17cd --- /dev/null +++ b/doc/source/getting_started/examples.rst @@ -0,0 +1,30 @@ +Examples +======== + +Here are some examples of how to use the **Private Evolution** library. + +Images +------ + +These examples follow the experimental settings in the paper `Differentially Private Synthetic Data via Foundation Model APIs 1: Images (ICLR 2024) `__. + +* **CIFAR10**: `This example `__ shows how to generate differentially private synthetic images for the `CIFAR10 dataset`_ using the APIs from a pre-trained `ImageNet diffusion model`_. + +* **Camelyon17**: `This example `__ shows how to generate differentially private synthetic images for the `Camelyon17 dataset`_ using the APIs from a pre-trained `ImageNet diffusion model`_. + +* **Cat**: `This example `__ shows how to generate differentially private synthetic images of the `Cat dataset`_ using the APIs from `Stable Diffusion`_. + +Text +---- + +Coming soon! + +.. _ImageNet diffusion model: https://github.com/openai/improved-diffusion +.. _Stable Diffusion: https://huggingface.co/CompVis/stable-diffusion-v1-4 +.. _Cat dataset: https://www.kaggle.com/datasets/fjxmlzn/cat-cookie-doudou +.. _CIFAR10 dataset: https://www.cs.toronto.edu/~kriz/cifar.html +.. _Camelyon17 dataset: https://camelyon17.grand-challenge.org/ +.. _CIFAR10 example: https://github.com/microsoft/DPSDA/blob/main/example/image/cifar10.py +.. _Camelyon17 example: https://github.com/microsoft/DPSDA/blob/main/example/image/camelyon17.py +.. _Cat example: https://github.com/microsoft/DPSDA/blob/main/example/image/cat.py +.. _paper: https://arxiv.org/abs/2305.15560 \ No newline at end of file diff --git a/doc/source/getting_started/getting_started.rst b/doc/source/getting_started/getting_started.rst new file mode 100644 index 0000000..ecd0a76 --- /dev/null +++ b/doc/source/getting_started/getting_started.rst @@ -0,0 +1,13 @@ +Getting Started +=============================== + + +.. toctree:: + :maxdepth: 5 + :caption: Contents: + + intro + installation + examples + details/details + diff --git a/doc/source/getting_started/installation.rst b/doc/source/getting_started/installation.rst new file mode 100644 index 0000000..3bf0ac8 --- /dev/null +++ b/doc/source/getting_started/installation.rst @@ -0,0 +1,40 @@ +Installation +============ + +PIP +--- + +To install the core package of **Private Evolution**, please use the following command: + +.. 
code-block:: bash + + pip install "private-evolution @ git+https://github.com/microsoft/DPSDA.git" + +If you are using **Private Evolution** to generate images, use the following command instead to install the package with the necessary dependencies for image generation: + +.. code-block:: bash + + pip install "private-evolution[image] @ git+https://github.com/microsoft/DPSDA.git" + +Faiss +----- + +**Private Evolution** requires a nearest neighbor search process. By default, it uses the sklearn_ package for this purpose. However, for faster computation, we recommend using the faiss_ package. +To install `faiss 1.8.0`, please use the following command: + +.. code-block:: bash + + conda install -y -c pytorch -c nvidia faiss-gpu=1.8.0 + +Please check out the faiss_ website for the latest information on how to install the package. + +.. + Docker + ------ + + We provide Docker images for **Private Evolution** with all dependencies (including faiss_) pre-installed. To pull the Docker image, please use the following command: + + TODO + +.. _faiss: https://faiss.ai/ +.. _sklearn: https://scikit-learn.org/dev/modules/generated/sklearn.neighbors.NearestNeighbors.html \ No newline at end of file diff --git a/doc/source/getting_started/intro.rst b/doc/source/getting_started/intro.rst new file mode 100644 index 0000000..48b42fd --- /dev/null +++ b/doc/source/getting_started/intro.rst @@ -0,0 +1,40 @@ +What is Private Evolution? +=============================== + +**Private Evolution** (PE in short) is an algorithm for **generating differentially private synthetic data without the need of any ML model training**. + +Given a dataset, **Private Evolution** can generate a new synthetic dataset that is statistically similar to the original dataset, while ensuring a rigorous privacy guarantee called `differential privacy (DP) `_, which implies that the privacy of individuals in the original dataset is protected. It is particularly useful in situations where the original data is sensitive or confidential, such as medical records, financial data, or personal information. The DP synthetic dataset can replace the original data in various use cases where privacy is a concern, for example: + +* Sharing them with other parties for collaboration and research. +* Using them in downstream algorithms (e.g., training ML models) in the normal non-private pipeline. +* Inspecting the data directly for easier product debugging and development. + +Key Features +------------ + +Compared to other DP synthetic data alternatives, **Private Evolution** has the following key features: + +* ✅ **No training needed!** **Private Evolution** only requires the inference APIs of foundation models. Therefore, it can leverage any state-of-the-art black-box models (e.g., GPT-4) and open-source models (e.g., Stable Diffusion, Llama). +* ✅ **Protects privacy even from the API provider.** Even when using APIs from a third-party provider, you can rest assured that the information of individuals in the original dataset is still protected, as all API queries made from **Private Evolution** are also differentially private. +* ✅ **Works across images, text, etc.** **Private Evolution** can generate synthetic data for various data types, including images and text. More data modalities are coming soon! 
+* ✅ **Could even match/beat SoTA training-based methods in data quality.** **Private Evolution** can generate synthetic data that is statistically similar to the original data, and in some cases, it can even match or beat the state-of-the-art training-based methods in data quality even though it does not require any training. + +What This Library Provides +-------------------------- + +**This library is the official Python package of Private Evolution**. It allows you to generate differentially private synthetic data (e.g., images, text) using the **Private Evolution** algorithm. This library is designed to be easy to use, flexible, modular, and extensible. It provides several popular foundation model APIs, and you can easily extend it to work with your own foundation models (and/or APIs), data types, or new **Private Evolution** algorithms if needed. + +The source code of this **Private Evolution** library is available at https://github.com/microsoft/DPSDA. + +Citations +--------- + +If you use **Private Evolution** in your research or work, please cite the following papers: + +.. literalinclude:: pe1.bib + :language: bibtex + +.. literalinclude:: pe2.bib + :language: bibtex + +Please see https://github.com/fjxmlzn/private-evolution-papers for the full list of **Private Evolution** papers done by the community. diff --git a/doc/source/getting_started/pe1.bib b/doc/source/getting_started/pe1.bib new file mode 100644 index 0000000..d78c1d0 --- /dev/null +++ b/doc/source/getting_started/pe1.bib @@ -0,0 +1,6 @@ +@article{lin2023differentially, + title={Differentially private synthetic data via foundation model apis 1: Images}, + author={Lin, Zinan and Gopi, Sivakanth and Kulkarni, Janardhan and Nori, Harsha and Yekhanin, Sergey}, + journal={arXiv preprint arXiv:2305.15560}, + year={2023} +} \ No newline at end of file diff --git a/doc/source/getting_started/pe2.bib b/doc/source/getting_started/pe2.bib new file mode 100644 index 0000000..5bde0ab --- /dev/null +++ b/doc/source/getting_started/pe2.bib @@ -0,0 +1,6 @@ +@article{xie2024differentially, + title={Differentially private synthetic data via foundation model apis 2: Text}, + author={Xie, Chulin and Lin, Zinan and Backurs, Arturs and Gopi, Sivakanth and Yu, Da and Inan, Huseyin A and Nori, Harsha and Jiang, Haotian and Zhang, Huishuai and Lee, Yin Tat and others}, + journal={arXiv preprint arXiv:2403.01749}, + year={2024} +} \ No newline at end of file diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 0000000..1c98d94 --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,18 @@ +.. Private Evolution documentation master file, created by + sphinx-quickstart on Wed Oct 30 22:51:11 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Private Evolution Documentation +=============================== + +The source code of this **Private Evolution** library is available at https://github.com/microsoft/DPSDA. + + +.. 
toctree:: + :maxdepth: 5 + :caption: Contents: + + getting_started/getting_started + api/api + diff --git a/example/image/camelyon17.py b/example/image/camelyon17.py new file mode 100644 index 0000000..79f6674 --- /dev/null +++ b/example/image/camelyon17.py @@ -0,0 +1,64 @@ +from pe.data.image import Camelyon17 +from pe.logging import setup_logging +from pe.runner import PE +from pe.population import PEPopulation +from pe.api.image import ImprovedDiffusion270M +from pe.embedding.image import Inception +from pe.histogram import NearestNeighbors +from pe.callback import SaveCheckpoints +from pe.callback import SampleImages +from pe.callback import ComputeFID +from pe.logger import ImageFile +from pe.logger import CSVPrint +from pe.logger import LogPrint + +import pandas as pd +import os +import numpy as np + +pd.options.mode.copy_on_write = True + + +if __name__ == "__main__": + exp_folder = "results/image/camelyon17" + + setup_logging(log_file=os.path.join(exp_folder, "log.txt")) + + data = Camelyon17(root_dir="/tmp/data/") + api = ImprovedDiffusion270M( + variation_degrees=[0] * 5 + [1] * 5 + [2] * 5 + [3] * 4 + list(range(20, 31)), + timestep_respacing=["ddim10"] * 19 + ["40"] * 11, + ) + embedding = Inception(res=64, batch_size=100) + histogram = NearestNeighbors( + embedding=embedding, + mode="L2", + lookahead_degree=8, + lookahead_log_folder=os.path.join(exp_folder, "lookahead"), + voting_details_log_folder=os.path.join(exp_folder, "voting_details"), + api=api, + backend="faiss", + ) + population = PEPopulation(api=api, histogram_threshold=4) + + save_checkpoints = SaveCheckpoints(os.path.join(exp_folder, "checkpoint")) + sample_images = SampleImages() + compute_fid = ComputeFID(priv_data=data, embedding=embedding) + + image_file = ImageFile(output_folder=exp_folder) + csv_print = CSVPrint(output_folder=exp_folder) + log_print = LogPrint() + + pe_runner = PE( + priv_data=data, + population=population, + histogram=histogram, + callbacks=[save_checkpoints, sample_images, compute_fid], + loggers=[image_file, csv_print, log_print], + ) + pe_runner.run( + num_samples_schedule=[302436] * 30, + delta=3e-6, + noise_multiplier=2 * np.sqrt(2), + checkpoint_path=os.path.join(exp_folder, "checkpoint"), + ) diff --git a/example/image/cat.py b/example/image/cat.py new file mode 100644 index 0000000..0516419 --- /dev/null +++ b/example/image/cat.py @@ -0,0 +1,66 @@ +from pe.data.image import Cat +from pe.logging import setup_logging +from pe.runner import PE +from pe.population import PEPopulation +from pe.api.image import StableDiffusion +from pe.embedding.image import Inception +from pe.histogram import NearestNeighbors +from pe.callback import SaveCheckpoints +from pe.callback import SampleImages +from pe.callback import SaveAllImages +from pe.callback import ComputeFID +from pe.logger import ImageFile +from pe.logger import CSVPrint +from pe.logger import LogPrint + +import pandas as pd +import os +import numpy as np + +pd.options.mode.copy_on_write = True + + +if __name__ == "__main__": + exp_folder = "results/image/cat" + + setup_logging(log_file=os.path.join(exp_folder, "log.txt")) + + data = Cat(root_dir="/tmp/data/") + api = StableDiffusion( + prompt={"cookie": "A photo of ragdoll cat", "doudou": "A photo of ragdoll cat"}, + variation_degrees=list(np.arange(1.0, 0.9, -0.02)) + list(np.arange(0.88, 0.36, -0.04)), + ) + embedding = Inception(res=512, batch_size=100) + histogram = NearestNeighbors( + embedding=embedding, + mode="L2", + lookahead_degree=8, + 
lookahead_log_folder=os.path.join(exp_folder, "lookahead"), + voting_details_log_folder=os.path.join(exp_folder, "voting_details"), + api=api, + backend="faiss", + ) + population = PEPopulation(api=api, histogram_threshold=2) + + save_checkpoints = SaveCheckpoints(os.path.join(exp_folder, "checkpoint")) + sample_images = SampleImages() + save_all_images = SaveAllImages(output_folder=os.path.join(exp_folder, "all_images")) + compute_fid = ComputeFID(priv_data=data, embedding=embedding) + + image_file = ImageFile(output_folder=exp_folder) + csv_print = CSVPrint(output_folder=exp_folder) + log_print = LogPrint() + + pe_runner = PE( + priv_data=data, + population=population, + histogram=histogram, + callbacks=[save_checkpoints, sample_images, compute_fid, save_all_images], + loggers=[image_file, csv_print, log_print], + ) + pe_runner.run( + num_samples_schedule=[200] * 18, + delta=1e-3, + noise_multiplier=2, + checkpoint_path=os.path.join(exp_folder, "checkpoint"), + ) diff --git a/example/image/cifar10.py b/example/image/cifar10.py new file mode 100644 index 0000000..fbe38c9 --- /dev/null +++ b/example/image/cifar10.py @@ -0,0 +1,64 @@ +from pe.data.image import Cifar10 +from pe.logging import setup_logging +from pe.runner import PE +from pe.population import PEPopulation +from pe.api.image import ImprovedDiffusion270M +from pe.embedding.image import Inception +from pe.histogram import NearestNeighbors +from pe.callback import SaveCheckpoints +from pe.callback import SampleImages +from pe.callback import ComputeFID +from pe.logger import ImageFile +from pe.logger import CSVPrint +from pe.logger import LogPrint + +import pandas as pd +import os +import numpy as np + +pd.options.mode.copy_on_write = True + + +if __name__ == "__main__": + exp_folder = "results/image/cifar10" + + setup_logging(log_file=os.path.join(exp_folder, "log.txt")) + + data = Cifar10() + api = ImprovedDiffusion270M( + variation_degrees=list(range(0, 42, 2)), + timestep_respacing="100", + ) + embedding = Inception(res=32, batch_size=100) + histogram = NearestNeighbors( + embedding=embedding, + mode="L2", + lookahead_degree=8, + lookahead_log_folder=os.path.join(exp_folder, "lookahead"), + voting_details_log_folder=os.path.join(exp_folder, "voting_details"), + api=api, + backend="faiss", + ) + population = PEPopulation(api=api, histogram_threshold=10) + + save_checkpoints = SaveCheckpoints(os.path.join(exp_folder, "checkpoint")) + sample_images = SampleImages() + compute_fid = ComputeFID(priv_data=data, embedding=embedding) + + image_file = ImageFile(output_folder=exp_folder) + csv_print = CSVPrint(output_folder=exp_folder) + log_print = LogPrint() + + pe_runner = PE( + priv_data=data, + population=population, + histogram=histogram, + callbacks=[save_checkpoints, sample_images, compute_fid], + loggers=[image_file, csv_print, log_print], + ) + pe_runner.run( + num_samples_schedule=[50000] * 21, + delta=1e-5, + noise_multiplier=5 * np.sqrt(2), + checkpoint_path=os.path.join(exp_folder, "checkpoint"), + ) diff --git a/pe/__init__.py b/pe/__init__.py new file mode 100644 index 0000000..0ac84d2 --- /dev/null +++ b/pe/__init__.py @@ -0,0 +1 @@ +from .runner.pe import PE diff --git a/pe/api/__init__.py b/pe/api/__init__.py new file mode 100644 index 0000000..ae40d11 --- /dev/null +++ b/pe/api/__init__.py @@ -0,0 +1 @@ +from .api import API diff --git a/pe/api/api.py b/pe/api/api.py new file mode 100644 index 0000000..31481e6 --- /dev/null +++ b/pe/api/api.py @@ -0,0 +1,25 @@ +from abc import ABC, abstractmethod + + +class API(ABC): 
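+    # To plug in a custom foundation model, subclass API and implement random_api()
+    # and variation_api(); the built-in image APIs (e.g., ImprovedDiffusion,
+    # StableDiffusion) follow this pattern.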
+ """The abstract class that defines the APIs for the synthetic data generation.""" + + @abstractmethod + def random_api(self, label_name, num_samples): + """The abstract method that generates random synthetic data. + + :param label_name: The name of the label + :type label_name: str + :param num_samples: The number of random samples to generate + :type num_samples: int + """ + ... + + @abstractmethod + def variation_api(self, syn_data): + """The abstract method that generates variations of the synthetic data. + + :param syn_data: The data object of the synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + """ + ... diff --git a/pe/api/image/__init__.py b/pe/api/image/__init__.py new file mode 100644 index 0000000..e26ceab --- /dev/null +++ b/pe/api/image/__init__.py @@ -0,0 +1,3 @@ +from .improved_diffusion_api import ImprovedDiffusion +from .improved_diffusion_api import ImprovedDiffusion270M +from .stable_diffusion_api import StableDiffusion diff --git a/pe/api/image/improved_diffusion_api.py b/pe/api/image/improved_diffusion_api.py new file mode 100644 index 0000000..bc76121 --- /dev/null +++ b/pe/api/image/improved_diffusion_api.py @@ -0,0 +1,379 @@ +import torch +import numpy as np +import pandas as pd +import tempfile +import os + +from pe.api import API +from pe.logging import execution_logger +from pe.data import Data +from pe.constant.data import IMAGE_DATA_COLUMN_NAME +from pe.constant.data import IMAGE_MODEL_LABEL_COLUMN_NAME +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.api.util import ConstantList +from pe.util import download + +from improved_diffusion.script_util import NUM_CLASSES +from .improved_diffusion_lib.unet import create_model +from .improved_diffusion_lib.gaussian_diffusion import create_gaussian_diffusion + + +class ImprovedDiffusion(API): + """The image API that utilizes improved diffusion models from https://arxiv.org/abs/2102.09672.""" + + def __init__( + self, + variation_degrees, + model_path, + model_image_size=64, + num_channels=192, + num_res_blocks=3, + learn_sigma=True, + class_cond=True, + use_checkpoint=False, + attention_resolutions="16,8", + num_heads=4, + num_heads_upsample=-1, + use_scale_shift_norm=True, + dropout=0.0, + diffusion_steps=4000, + sigma_small=False, + noise_schedule="cosine", + use_kl=False, + predict_xstart=False, + rescale_timesteps=False, + rescale_learned_sigmas=False, + timestep_respacing="100", + batch_size=2000, + use_ddim=True, + clip_denoised=True, + use_data_parallel=True, + ): + """Constructor. + See https://github.com/openai/improved-diffusion for the explanation of the parameters not listed here. + + :param variation_degrees: The variation degrees utilized at each PE iteration. If a single int is provided, the + same variation degree will be used for all iterations. + :type variation_degrees: int or list[int] + :param model_path: The path of the model checkpoint + :type model_path: str + :param diffusion_steps: The total number of diffusion steps, defaults to 4000 + :type diffusion_steps: int, optional + :param timestep_respacing: The step configurations for image generation utilized at each PE iteration. If a + single str is provided, the same step configuration will be used for all iterations. 
Defaults to "100" + :type timestep_respacing: str or list[str], optional + :param batch_size: The batch size for image generation, defaults to 2000 + :type batch_size: int, optional + :param use_data_parallel: Whether to use data parallel during image generation, defaults to True + :type use_data_parallel: bool, optional + """ + super().__init__() + self._model = create_model( + image_size=model_image_size, + num_channels=num_channels, + num_res_blocks=num_res_blocks, + learn_sigma=learn_sigma, + class_cond=class_cond, + use_checkpoint=use_checkpoint, + attention_resolutions=attention_resolutions, + num_heads=num_heads, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + dropout=dropout, + ) + self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self._model.load_state_dict(torch.load(model_path, map_location="cpu")) + self._model.to(self._device) + self._model.eval() + all_timestep_respacing = ( + set(timestep_respacing) if isinstance(timestep_respacing, list) else {timestep_respacing} + ) + self._timestep_respacing_to_diffusion = {} + self._timestep_respacing_to_sampler = {} + for sub_timestep_respacing in all_timestep_respacing: + self._timestep_respacing_to_diffusion[sub_timestep_respacing] = create_gaussian_diffusion( + steps=diffusion_steps, + learn_sigma=learn_sigma, + sigma_small=sigma_small, + noise_schedule=noise_schedule, + use_kl=use_kl, + predict_xstart=predict_xstart, + rescale_timesteps=rescale_timesteps, + rescale_learned_sigmas=rescale_learned_sigmas, + timestep_respacing=sub_timestep_respacing, + ) + self._timestep_respacing_to_sampler[sub_timestep_respacing] = Sampler( + model=self._model, diffusion=self._timestep_respacing_to_diffusion[sub_timestep_respacing] + ) + if use_data_parallel: + self._timestep_respacing_to_sampler[sub_timestep_respacing] = torch.nn.DataParallel( + self._timestep_respacing_to_sampler[sub_timestep_respacing] + ) + if isinstance(timestep_respacing, str): + self._timestep_respacing = ConstantList(timestep_respacing) + else: + self._timestep_respacing = timestep_respacing + self._batch_size = batch_size + self._use_ddim = use_ddim + self._image_size = model_image_size + self._clip_denoised = clip_denoised + self._class_cond = class_cond + if isinstance(variation_degrees, int): + self._variation_degrees = ConstantList(variation_degrees) + else: + self._variation_degrees = variation_degrees + + def random_api(self, label_name, num_samples): + """Generating random synthetic data. 
+ + :param label_name: The name of the label, not utilized in this API + :type label_name: str + :param num_samples: The number of random samples to generate + :type num_samples: int + :return: The data object of the generated synthetic data + :rtype: :py:class:`pe.data.data.Data` + """ + execution_logger.info(f"RANDOM API: creating {num_samples} samples for label {label_name}") + samples, labels = sample( + sampler=self._timestep_respacing_to_sampler[self._timestep_respacing[0]], + start_t=0, + num_samples=num_samples, + batch_size=self._batch_size, + use_ddim=self._use_ddim, + image_size=self._image_size, + clip_denoised=self._clip_denoised, + class_cond=self._class_cond, + device=self._device, + ) + samples = _round_to_uint8((samples + 1.0) * 127.5) + samples = samples.transpose(0, 2, 3, 1) + torch.cuda.empty_cache() + data_frame = pd.DataFrame( + { + IMAGE_DATA_COLUMN_NAME: list(samples), + IMAGE_MODEL_LABEL_COLUMN_NAME: list(labels), + } + ) + execution_logger.info(f"RANDOM API: finished creating {num_samples} samples for label {label_name}") + return Data(data_frame=data_frame) + + def variation_api(self, syn_data): + """Generating variations of the synthetic data. + + :param syn_data: The data object of the synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :return: The data object of the variation of the input synthetic data + :rtype: :py:class:`pe.data.data.Data` + """ + execution_logger.info(f"VARIATION API: creating variations for {len(syn_data.data_frame)} samples") + images = np.stack(syn_data.data_frame[IMAGE_DATA_COLUMN_NAME].values) + labels = np.array(syn_data.data_frame[IMAGE_MODEL_LABEL_COLUMN_NAME].values) + iteration = getattr(syn_data.metadata, "iteration", -1) + variation_degree = self._variation_degrees[iteration + 1] + timestep_respacing = self._timestep_respacing[iteration + 1] + + execution_logger.info( + f"VARIATION API parameters: variation_degree={variation_degree}, timestep_respacing={timestep_respacing}, " + f"iteration={iteration}" + ) + + images = images.astype(np.float32) / 127.5 - 1.0 + images = images.transpose(0, 3, 1, 2) + variations, _ = sample( + sampler=self._timestep_respacing_to_sampler[timestep_respacing], + start_t=variation_degree, + start_image=torch.Tensor(images).to(self._device), + labels=(None if not self._class_cond else torch.LongTensor(labels).to(self._device)), + num_samples=images.shape[0], + batch_size=self._batch_size, + use_ddim=self._use_ddim, + image_size=self._image_size, + clip_denoised=self._clip_denoised, + class_cond=self._class_cond, + device=self._device, + ) + variations = _round_to_uint8((variations + 1.0) * 127.5) + variations = variations.transpose(0, 2, 3, 1) + torch.cuda.empty_cache() + data_frame = pd.DataFrame( + { + IMAGE_DATA_COLUMN_NAME: list(variations), + IMAGE_MODEL_LABEL_COLUMN_NAME: list(labels), + } + ) + if LABEL_ID_COLUMN_NAME in syn_data.data_frame.columns: + data_frame[LABEL_ID_COLUMN_NAME] = syn_data.data_frame[LABEL_ID_COLUMN_NAME].values + execution_logger.info(f"VARIATION API: finished creating variations for {len(syn_data.data_frame)} samples") + return Data(data_frame=data_frame, metadata=syn_data.metadata) + + +def sample( + sampler, + num_samples, + start_t, + batch_size, + use_ddim, + image_size, + clip_denoised, + class_cond, + device, + start_image=None, + labels=None, +): + all_images = [] + all_labels = [] + batch_cnt = 0 + cnt = 0 + while cnt < num_samples: + current_batch_size = ( + batch_size if start_image is None else min(batch_size, start_image.shape[0] - batch_cnt * 
batch_size) + ) + current_batch_size = min(num_samples - cnt, current_batch_size) + shape = (current_batch_size, 3, image_size, image_size) + model_kwargs = {} + if class_cond: + if labels is None: + classes = torch.randint( + low=0, + high=NUM_CLASSES, + size=(current_batch_size,), + device=device, + ) + else: + classes = labels[batch_cnt * batch_size : (batch_cnt + 1) * batch_size] + model_kwargs["y"] = classes + sample = sampler( + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + start_t=max(start_t, 0), + start_image=( + None if start_image is None else start_image[batch_cnt * batch_size : (batch_cnt + 1) * batch_size] + ), + use_ddim=use_ddim, + noise=torch.randn(*shape, device=device), + image_size=image_size, + ) + batch_cnt += 1 + + all_images.append(sample.detach().cpu().numpy()) + + if class_cond: + all_labels.append(classes.detach().cpu().numpy()) + + cnt += sample.shape[0] + execution_logger.info(f"Created {cnt} samples") + + all_images = np.concatenate(all_images, axis=0) + all_images = all_images[:num_samples] + + if class_cond: + all_labels = np.concatenate(all_labels, axis=0) + all_labels = all_labels[:num_samples] + else: + all_labels = np.zeros(shape=(num_samples,)) + return all_images, all_labels + + +class Sampler(torch.nn.Module): + """A wrapper around the model and diffusion modules that handles the entire + sampling process, so as to reduce the communiation rounds between GPUs when + using DataParallel. + """ + + def __init__(self, model, diffusion): + super().__init__() + self._model = model + self._diffusion = diffusion + + def forward( + self, + clip_denoised, + model_kwargs, + start_t, + start_image, + use_ddim, + noise, + image_size, + ): + sample_fn = self._diffusion.p_sample_loop if not use_ddim else self._diffusion.ddim_sample_loop + sample = sample_fn( + self._model, + (noise.shape[0], 3, image_size, image_size), + clip_denoised=clip_denoised, + model_kwargs=model_kwargs, + start_t=max(start_t, 0), + start_image=start_image, + noise=noise, + device=noise.device, + ) + return sample + + +def _round_to_uint8(image): + return np.around(np.clip(image, a_min=0, a_max=255)).astype(np.uint8) + + +class ImprovedDiffusion270M(ImprovedDiffusion): + #: The URL of the checkpoint path + CHECKPOINT_URL = "https://openaipublic.blob.core.windows.net/diffusion/march-2021/imagenet64_cond_270M_250K.pt" + + def __init__( + self, + variation_degrees, + model_path=None, + batch_size=2000, + timestep_respacing="100", + use_data_parallel=True, + ): + """The "Class-conditional ImageNet-64 model (270M parameters, trained for 250K iterations)" model from the + Improved Diffusion paper. + + :param variation_degrees: The variation degrees utilized at each PE iteration + :type variation_degrees: list[int] + :param model_path: The path of the model checkpoint. 
If not provided, the checkpoint will be downloaded from + the `CHECKPOINT_URL` + :type model_path: str + :param batch_size: The batch size for image generation, defaults to 2000 + :type batch_size: int, optional + :param timestep_respacing: The step configuration for image generation, defaults to "100" + :type timestep_respacing: str, optional + :param use_data_parallel: Whether to use data parallel during image generation, defaults to True + :type use_data_parallel: bool, optional + """ + if model_path is None or not os.path.exists(model_path): + model_path = self._download_checkpoint(model_path) + super().__init__( + variation_degrees=variation_degrees, + model_path=model_path, + model_image_size=64, + num_channels=192, + num_res_blocks=3, + learn_sigma=True, + class_cond=True, + use_checkpoint=False, + attention_resolutions="16,8", + num_heads=4, + num_heads_upsample=-1, + use_scale_shift_norm=True, + dropout=0.0, + diffusion_steps=4000, + sigma_small=False, + noise_schedule="cosine", + use_kl=False, + predict_xstart=False, + rescale_timesteps=False, + rescale_learned_sigmas=False, + timestep_respacing=timestep_respacing, + batch_size=batch_size, + use_ddim=True, + clip_denoised=True, + use_data_parallel=use_data_parallel, + ) + + def _download_checkpoint(self, model_path): + execution_logger.info(f"Downloading ImprovedDiffusion checkpoint from {self.CHECKPOINT_URL}") + if model_path is None: + model_path = tempfile.mktemp(suffix=".pt") + download(url=self.CHECKPOINT_URL, fname=model_path) + execution_logger.info(f"Finished downloading ImprovedDiffusion checkpoint to {model_path}") + return model_path diff --git a/pe/api/image/improved_diffusion_lib/__init__.py b/pe/api/image/improved_diffusion_lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pe/api/image/improved_diffusion_lib/gaussian_diffusion.py b/pe/api/image/improved_diffusion_lib/gaussian_diffusion.py new file mode 100644 index 0000000..881c441 --- /dev/null +++ b/pe/api/image/improved_diffusion_lib/gaussian_diffusion.py @@ -0,0 +1,309 @@ +""" +This code contains minor edits from the original code at +https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py +and +https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/script_util.py +to support sampling from the middle of the diffusion process with start_t and +start_image arguments. +""" + +import torch as th +from improved_diffusion.respace import SpacedDiffusion +from improved_diffusion.respace import space_timesteps +from improved_diffusion.gaussian_diffusion import _extract_into_tensor +from improved_diffusion import gaussian_diffusion as gd + + +class SkippedSpacedDiffusion(SpacedDiffusion): + def p_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + device=None, + progress=False, + start_t=0, + start_image=None, + ): + """ + Generate samples from the model. + + :param model: the model module. + :param shape: the shape of the samples, (N, C, H, W). + :param noise: if specified, the noise from the encoder to sample. + Should be of the same shape as `shape`. + :param clip_denoised: if True, clip x_start predictions to [-1, 1]. + :param denoised_fn: if not None, a function which applies to the + x_start prediction before it is used to sample. + :param model_kwargs: if not None, a dict of extra keyword arguments to + pass to the model. This can be used for conditioning. 
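+        :param start_t: if > 0, skip the first `start_t` reverse-diffusion steps and
+            start sampling from the corresponding intermediate timestep. Defaults to 0.
+        :param start_image: if specified, this image is noised to the starting
+            timestep via q_sample and then denoised from there, instead of sampling
+            from pure noise. Used by the variation API.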
+ :param device: if specified, the device to create the samples on. + If not specified, use a model parameter's device. + :param progress: if True, show a tqdm progress bar. + :return: a non-differentiable batch of samples. + """ + final = None + for sample in self.p_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + start_t=start_t, + start_image=start_image, + ): + final = sample + return final["sample"] + + def p_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + device=None, + progress=False, + start_t=0, + start_image=None, + ): + """ + Generate samples from the model and yield intermediate samples from + each timestep of diffusion. + + Arguments are the same as p_sample_loop(). + Returns a generator over dicts, where each dict is the return value of + p_sample(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + indices = list(range(self.num_timesteps))[::-1] + indices = indices[start_t:] + if start_image is not None: + t_batch = th.tensor([indices[0]] * img.shape[0], device=device) + img = self.q_sample(start_image, t=t_batch, noise=img) + if progress: + # Lazy import so that we don't depend on tqdm. + from tqdm.auto import tqdm + + indices = tqdm(indices) + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + with th.no_grad(): + out = self.p_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + yield out + img = out["sample"] + + def ddim_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t-1} from the model using DDIM. + + Same usage as p_sample(). + """ + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. + eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"]) + alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape) + alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape) + sigma = eta * th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar)) * th.sqrt(1 - alpha_bar / alpha_bar_prev) + # Equation 12. + noise = th.randn_like(x) + mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_prev) + th.sqrt(1 - alpha_bar_prev - sigma**2) * eps + nonzero_mask = (t != 0).float().view(-1, *([1] * (len(x.shape) - 1))) # no noise when t == 0 + sample = mean_pred + nonzero_mask * sigma * noise + return {"sample": sample, "pred_xstart": out["pred_xstart"]} + + def ddim_reverse_sample( + self, + model, + x, + t, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + eta=0.0, + ): + """ + Sample x_{t+1} from the model using DDIM reverse ODE. + """ + assert eta == 0.0, "Reverse ODE only for deterministic path" + out = self.p_mean_variance( + model, + x, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + ) + # Usually our model outputs epsilon, but we re-derive it + # in case we used x_start or x_prev prediction. 
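+        # DDIM identity: x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps, so
+        # eps = (sqrt(1 / alpha_bar_t) * x_t - pred_xstart) / sqrt(1 / alpha_bar_t - 1),
+        # which is what the two _extract_into_tensor terms below compute.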
+ eps = ( + _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x - out["pred_xstart"] + ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape) + alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape) + + # Equation 12. reversed + mean_pred = out["pred_xstart"] * th.sqrt(alpha_bar_next) + th.sqrt(1 - alpha_bar_next) * eps + + return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]} + + def ddim_sample_loop( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + start_t=0, + start_image=None, + ): + """ + Generate samples from the model using DDIM. + + Same usage as p_sample_loop(). + """ + final = None + for sample in self.ddim_sample_loop_progressive( + model, + shape, + noise=noise, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + device=device, + progress=progress, + eta=eta, + start_t=start_t, + start_image=start_image, + ): + final = sample + return final["sample"] + + def ddim_sample_loop_progressive( + self, + model, + shape, + noise=None, + clip_denoised=True, + denoised_fn=None, + model_kwargs=None, + device=None, + progress=False, + eta=0.0, + start_t=0, + start_image=None, + ): + """ + Use DDIM to sample from the model and yield intermediate samples from + each timestep of DDIM. + + Same usage as p_sample_loop_progressive(). + """ + if device is None: + device = next(model.parameters()).device + assert isinstance(shape, (tuple, list)) + if noise is not None: + img = noise + else: + img = th.randn(*shape, device=device) + indices = list(range(self.num_timesteps))[::-1] + indices = indices[start_t:] + if start_image is not None: + t_batch = th.tensor([indices[0]] * img.shape[0], device=device) + img = self.q_sample(start_image, t=t_batch, noise=img) + if progress: + # Lazy import so that we don't depend on tqdm. 
+ from tqdm.auto import tqdm + + indices = tqdm(indices) + + for i in indices: + t = th.tensor([i] * shape[0], device=device) + with th.no_grad(): + out = self.ddim_sample( + model, + img, + t, + clip_denoised=clip_denoised, + denoised_fn=denoised_fn, + model_kwargs=model_kwargs, + eta=eta, + ) + yield out + img = out["sample"] + + +def create_gaussian_diffusion( + *, + steps=1000, + learn_sigma=False, + sigma_small=False, + noise_schedule="linear", + use_kl=False, + predict_xstart=False, + rescale_timesteps=False, + rescale_learned_sigmas=False, + timestep_respacing="", +): + betas = gd.get_named_beta_schedule(noise_schedule, steps) + if use_kl: + loss_type = gd.LossType.RESCALED_KL + elif rescale_learned_sigmas: + loss_type = gd.LossType.RESCALED_MSE + else: + loss_type = gd.LossType.MSE + if not timestep_respacing: + timestep_respacing = [steps] + return SkippedSpacedDiffusion( + use_timesteps=space_timesteps(steps, timestep_respacing), + betas=betas, + model_mean_type=(gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X), + model_var_type=( + (gd.ModelVarType.FIXED_LARGE if not sigma_small else gd.ModelVarType.FIXED_SMALL) + if not learn_sigma + else gd.ModelVarType.LEARNED_RANGE + ), + loss_type=loss_type, + rescale_timesteps=rescale_timesteps, + ) diff --git a/pe/api/image/improved_diffusion_lib/unet.py b/pe/api/image/improved_diffusion_lib/unet.py new file mode 100644 index 0000000..8b5bfec --- /dev/null +++ b/pe/api/image/improved_diffusion_lib/unet.py @@ -0,0 +1,60 @@ +""" +This code contains minor edits from the original code at +https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/unet.py +and +https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/script_util.py +to avoid calling self.input_blocks.parameters() in the original code, which is +not supported by DataParallel. 
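+Here, FP32UNetModel overrides `inner_dtype` to return torch.float32 directly rather
+than deriving it from the module parameters.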
+""" + +import torch +from improved_diffusion.unet import UNetModel +from improved_diffusion.script_util import NUM_CLASSES + + +class FP32UNetModel(UNetModel): + @property + def inner_dtype(self): + return torch.float32 + + +def create_model( + image_size, + num_channels, + num_res_blocks, + learn_sigma, + class_cond, + use_checkpoint, + attention_resolutions, + num_heads, + num_heads_upsample, + use_scale_shift_norm, + dropout, +): + if image_size == 256: + channel_mult = (1, 1, 2, 2, 4, 4) + elif image_size == 64: + channel_mult = (1, 2, 3, 4) + elif image_size == 32: + channel_mult = (1, 2, 2, 2) + else: + raise ValueError(f"unsupported image size: {image_size}") + + attention_ds = [] + for res in attention_resolutions.split(","): + attention_ds.append(image_size // int(res)) + + return FP32UNetModel( + in_channels=3, + model_channels=num_channels, + out_channels=(3 if not learn_sigma else 6), + num_res_blocks=num_res_blocks, + attention_resolutions=tuple(attention_ds), + dropout=dropout, + channel_mult=channel_mult, + num_classes=(NUM_CLASSES if class_cond else None), + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_heads_upsample=num_heads_upsample, + use_scale_shift_norm=use_scale_shift_norm, + ) diff --git a/pe/api/image/stable_diffusion_api.py b/pe/api/image/stable_diffusion_api.py new file mode 100644 index 0000000..4450383 --- /dev/null +++ b/pe/api/image/stable_diffusion_api.py @@ -0,0 +1,206 @@ +import torch +import numpy as np +import pandas as pd +from diffusers import StableDiffusionPipeline +from diffusers import StableDiffusionImg2ImgPipeline +import json +from tqdm import tqdm + +from pe.api import API +from pe.logging import execution_logger +from pe.data import Data +from pe.constant.data import IMAGE_DATA_COLUMN_NAME +from pe.constant.data import IMAGE_PROMPT_COLUMN_NAME +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.api.util import ConstantList + + +def _to_constant_list_if_needed(value): + if not isinstance(value, list): + value = ConstantList(value) + return value + + +def _round_to_uint8(image): + return np.around(np.clip(image * 255, a_min=0, a_max=255)).astype(np.uint8) + + +class StableDiffusion(API): + """The API that uses the Stable Diffusion model to generate synthetic data.""" + + def __init__( + self, + prompt, + variation_degrees, + width=512, + height=512, + random_api_checkpoint="CompVis/stable-diffusion-v1-4", + random_api_guidance_scale=7.5, + random_api_num_inference_steps=50, + random_api_batch_size=10, + variation_api_checkpoint="CompVis/stable-diffusion-v1-4", + variation_api_guidance_scale=7.5, + variation_api_num_inference_steps=50, + variation_api_batch_size=10, + ): + """Constructor. + + :param prompt: The prompt used for each label name. It can be either a string or a dictionary. If it is a + string, it should be the path to a JSON file that contains the prompt for each label name. If it is a + dictionary, it should be a dictionary that maps each label name to its prompt + :type prompt: str or dict + :param variation_degrees: The variation degrees utilized at each PE iteration. If a single float is provided, + the same variation degree will be used for all iterations. 
+ :type variation_degrees: float or list[float] + :param width: The width of the generated images, defaults to 512 + :type width: int, optional + :param height: The height of the generated images, defaults to 512 + :type height: int, optional + :param random_api_checkpoint: The checkpoint of the random API, defaults to "CompVis/stable-diffusion-v1-4" + :type random_api_checkpoint: str, optional + :param random_api_guidance_scale: The guidance scale of the random API, defaults to 7.5 + :type random_api_guidance_scale: float, optional + :param random_api_num_inference_steps: The number of inference steps of the random API, defaults to 50 + :type random_api_num_inference_steps: int, optional + :param random_api_batch_size: The batch size of the random API, defaults to 10 + :type random_api_batch_size: int, optional + :param variation_api_checkpoint: The checkpoint of the variation API, defaults to + "CompVis/stable-diffusion-v1-4" + :type variation_api_checkpoint: str, optional + :param variation_api_guidance_scale: The guidance scale of the variation API utilized at each PE iteration. If + a single float is provided, the same guidance scale will be used for all iterations. Defaults to 7.5 + :type variation_api_guidance_scale: float or list[float], optional + :param variation_api_num_inference_steps: The number of inference steps of the variation API utilized at each + PE iteration. If a single int is provided, the same number of inference steps will be used for all + iterations. Defaults to 50 + :type variation_api_num_inference_steps: int or list[int], optional + :param variation_api_batch_size: The batch size of the variation API, defaults to 10 + :type variation_api_batch_size: int, optional + :raises ValueError: If the prompt is neither a string nor a dictionary + """ + super().__init__() + self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if isinstance(prompt, str): + with open(prompt, "r") as f: + self._prompt = json.load(f) + elif isinstance(prompt, dict): + self._prompt = prompt + else: + raise ValueError("Prompt must be either a string or a dictionary") + + self._width = width + self._height = height + + self._random_api_checkpoint = random_api_checkpoint + self._random_api_guidance_scale = random_api_guidance_scale + self._random_api_num_inference_steps = random_api_num_inference_steps + self._random_api_batch_size = random_api_batch_size + + self._variation_api_checkpoint = variation_api_checkpoint + self._variation_api_guidance_scale = _to_constant_list_if_needed(variation_api_guidance_scale) + self._variation_api_num_inference_steps = _to_constant_list_if_needed(variation_api_num_inference_steps) + self._variation_api_batch_size = variation_api_batch_size + + self._variation_degrees = _to_constant_list_if_needed(variation_degrees) + + self._random_api_pipe = StableDiffusionPipeline.from_pretrained( + self._random_api_checkpoint, torch_dtype=torch.float16 + ) + self._random_api_pipe.safety_checker = None + self._random_api_pipe = self._random_api_pipe.to(self._device) + + self._variation_api_pipe = StableDiffusionImg2ImgPipeline.from_pretrained( + self._variation_api_checkpoint, torch_dtype=torch.float16 + ) + self._variation_api_pipe.safety_checker = None + self._variation_api_pipe = self._variation_api_pipe.to(self._device) + + def random_api(self, label_name, num_samples): + """Generating random synthetic data. 
+ + :param label_name: The name of the label, not utilized in this API + :type label_name: str + :param num_samples: The number of random samples to generate + :type num_samples: int + :return: The data object of the generated synthetic data + :rtype: :py:class:`pe.data.data.Data` + """ + execution_logger.info(f"RANDOM API: creating {num_samples} samples for label {label_name}") + + prompt = self._prompt[label_name] + max_batch_size = self._random_api_batch_size + images = [] + num_iterations = int(np.ceil(float(num_samples) / max_batch_size)) + for iteration in tqdm(range(num_iterations)): + batch_size = min(max_batch_size, num_samples - iteration * max_batch_size) + images.append( + self._random_api_pipe( + prompt=prompt, + width=self._width, + height=self._height, + num_inference_steps=self._random_api_num_inference_steps, + guidance_scale=self._random_api_guidance_scale, + num_images_per_prompt=batch_size, + output_type="np", + ).images + ) + images = _round_to_uint8(np.concatenate(images, axis=0)) + torch.cuda.empty_cache() + data_frame = pd.DataFrame( + { + IMAGE_DATA_COLUMN_NAME: list(images), + IMAGE_PROMPT_COLUMN_NAME: prompt, + } + ) + execution_logger.info(f"RANDOM API: finished creating {num_samples} samples for label {label_name}") + return Data(data_frame=data_frame) + + def variation_api(self, syn_data): + """Generating variations of the synthetic data. + + :param syn_data: The data object of the synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :return: The data object of the variation of the input synthetic data + :rtype: :py:class:`pe.data.data.Data` + """ + execution_logger.info(f"VARIATION API: creating variations for {len(syn_data.data_frame)} samples") + images = np.stack(syn_data.data_frame[IMAGE_DATA_COLUMN_NAME].values) + prompts = list(syn_data.data_frame[IMAGE_PROMPT_COLUMN_NAME].values) + iteration = getattr(syn_data.metadata, "iteration", -1) + variation_degree = self._variation_degrees[iteration + 1] + guidance_scale = self._variation_api_guidance_scale[iteration + 1] + num_inference_steps = self._variation_api_num_inference_steps[iteration + 1] + + images = images.astype(np.float32) / 127.5 - 1.0 + images = np.transpose(images, (0, 3, 1, 2)) + images = torch.Tensor(images).to(self._device) + max_batch_size = self._variation_api_batch_size + + variations = [] + num_iterations = int(np.ceil(float(images.shape[0]) / max_batch_size)) + for iteration in tqdm(range(num_iterations)): + variations.append( + self._variation_api_pipe( + prompt=prompts[iteration * max_batch_size : (iteration + 1) * max_batch_size], + image=images[iteration * max_batch_size : (iteration + 1) * max_batch_size], + num_inference_steps=num_inference_steps, + strength=variation_degree, + guidance_scale=guidance_scale, + num_images_per_prompt=1, + output_type="np", + ).images + ) + variations = _round_to_uint8(np.concatenate(variations, axis=0)) + + torch.cuda.empty_cache() + data_frame = pd.DataFrame( + { + IMAGE_DATA_COLUMN_NAME: list(variations), + IMAGE_PROMPT_COLUMN_NAME: prompts, + } + ) + if LABEL_ID_COLUMN_NAME in syn_data.data_frame.columns: + data_frame[LABEL_ID_COLUMN_NAME] = syn_data.data_frame[LABEL_ID_COLUMN_NAME].values + execution_logger.info(f"VARIATION API: finished creating variations for {len(syn_data.data_frame)} samples") + return Data(data_frame=data_frame, metadata=syn_data.metadata) diff --git a/pe/api/util.py b/pe/api/util.py new file mode 100644 index 0000000..78bfcff --- /dev/null +++ b/pe/api/util.py @@ -0,0 +1,6 @@ +class ConstantList: + def 
__init__(self, value): + self._value = value + + def __getitem__(self, index): + return self._value diff --git a/pe/callback/__init__.py b/pe/callback/__init__.py new file mode 100644 index 0000000..acdc86e --- /dev/null +++ b/pe/callback/__init__.py @@ -0,0 +1,4 @@ +from .common.save_checkpoints import SaveCheckpoints +from .common.compute_fid import ComputeFID +from .image.sample_images import SampleImages +from .image.save_all_images import SaveAllImages diff --git a/pe/callback/callback.py b/pe/callback/callback.py new file mode 100644 index 0000000..f134fab --- /dev/null +++ b/pe/callback/callback.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod + + +class Callback(ABC): + """The abstract class that defines the callback for the synthetic data generation. These callbacks can be + configured to be called after each PE iteration. + """ + + @abstractmethod + def __call__(self, syn_data): + """This function is called after each PE iteration. + + :param syn_data: The :py:class:`pe.data.data.Data` object of the synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + """ + ... diff --git a/pe/callback/common/__init__.py b/pe/callback/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pe/callback/common/compute_fid.py b/pe/callback/common/compute_fid.py new file mode 100644 index 0000000..8d3dd6d --- /dev/null +++ b/pe/callback/common/compute_fid.py @@ -0,0 +1,52 @@ +import numpy as np +import cleanfid.fid + +from pe.callback.callback import Callback +from pe.metric_item import FloatMetricItem +from pe.logging import execution_logger + + +class ComputeFID(Callback): + """The callback that computes the Frechet Inception Distance (FID) between the private and synthetic data.""" + + def __init__(self, priv_data, embedding): + """Constructor. + + :param priv_data: The private data + :type priv_data: :py:class:`pe.data.data.Data` + :param embedding: The embedding to compute the FID + :type embedding: :py:class:`pe.embedding.embedding.Embedding` + """ + self._priv_data = priv_data + self._embedding = embedding + + self._priv_data = self._embedding.compute_embedding(self._priv_data) + priv_embedding = np.stack(self._priv_data.data_frame[self._embedding.column_name].values, axis=0).astype( + np.float32 + ) + self._real_mu = np.mean(priv_embedding, axis=0) + self._real_sigma = np.cov(priv_embedding, rowvar=False) + + def __call__(self, syn_data): + """This function is called after each PE iteration that computes the FID between the private and synthetic + data. 
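+
+        Both the private and synthetic samples are mapped through the same embedding
+        model, and the FID is the Frechet distance between Gaussians fitted to the
+        two sets of embeddings.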
+ + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :return: The FID between the private and synthetic data + :rtype: list[:py:class:`pe.metric_item.FloatMetricItem`] + """ + execution_logger.info(f"Computing FID ({type(self._embedding).__name__})") + syn_data = self._embedding.compute_embedding(syn_data) + syn_embedding = np.stack(syn_data.data_frame[self._embedding.column_name].values, axis=0).astype(np.float32) + syn_mu = np.mean(syn_embedding, axis=0) + syn_sigma = np.cov(syn_embedding, rowvar=False) + fid = cleanfid.fid.frechet_distance( + mu1=self._real_mu, + sigma1=self._real_sigma, + mu2=syn_mu, + sigma2=syn_sigma, + ) + metric_item = FloatMetricItem(name=f"fid_{type(self._embedding).__name__}", value=fid) + execution_logger.info(f"Finished computing FID ({type(self._embedding).__name__})") + return [metric_item] diff --git a/pe/callback/common/save_checkpoints.py b/pe/callback/common/save_checkpoints.py new file mode 100644 index 0000000..9a0de86 --- /dev/null +++ b/pe/callback/common/save_checkpoints.py @@ -0,0 +1,46 @@ +import os + +from pe.callback.callback import Callback + + +class SaveCheckpoints(Callback): + """The callback that saves checkpoints of the synthetic data.""" + + def __init__( + self, + output_folder, + iteration_format="09d", + ): + """Constructor. + + :param output_folder: The output folder that will be used to save the checkpoints + :type output_folder: str + :param iteration_format: The format of the iteration number, defaults to "09d" + :type iteration_format: str, optional + """ + self._output_folder = output_folder + self._iteration_format = iteration_format + + def __call__(self, syn_data): + """This function is called after each PE iteration that saves checkpoints of the synthetic data. + + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + """ + syn_data.save_checkpoint(self._get_checkpoint_path(syn_data.metadata.iteration)) + + def _get_checkpoint_path(self, iteration): + """Get the checkpoint path. + + :param iteration: The PE iteration number + :type iteration: int + :return: The checkpoint path + :rtype: str + """ + os.makedirs(self._output_folder, exist_ok=True) + iteration_string = format(iteration, self._iteration_format) + checkpoint_path = os.path.join( + self._output_folder, + iteration_string, + ) + return checkpoint_path diff --git a/pe/callback/image/__init__.py b/pe/callback/image/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pe/callback/image/sample_images.py b/pe/callback/image/sample_images.py new file mode 100644 index 0000000..8d60929 --- /dev/null +++ b/pe/callback/image/sample_images.py @@ -0,0 +1,44 @@ +import numpy as np + +from pe.callback.callback import Callback +from pe.constant.data import IMAGE_DATA_COLUMN_NAME +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.metric_item import ImageListMetricItem + + +class SampleImages(Callback): + """The callback that samples images from the synthetic data.""" + + def __init__(self, num_images_per_class=10): + """Constructor. + + :param num_images_per_class: number of images to sample per class, defaults to 10 + :type num_images_per_class: int, optional + """ + self._num_images_per_class = num_images_per_class + + def __call__(self, syn_data): + """This function is called after each PE iteration that samples images from the synthetic data. 
+ + :param syn_data: The :py:class:`pe.data.data.Data` object of the synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :return: A metric item with the list of sampled images + :rtype: list[:py:class:`pe.metric_item.ImageListMetricItem`] + """ + all_image_list = [] + num_classes = len(syn_data.metadata.label_names) + for class_id in range(num_classes): + image_list = syn_data.data_frame[syn_data.data_frame[LABEL_ID_COLUMN_NAME] == class_id][ + IMAGE_DATA_COLUMN_NAME + ] + image_list = image_list.sample(min(self._num_images_per_class, len(image_list))) + all_image_list.extend(image_list) + assert len(image_list) > 0 + if len(image_list) < self._num_images_per_class: + all_image_list.extend(np.zeros_like(image_list[0]) * (self._num_images_per_class - len(image_list))) + metric_item = ImageListMetricItem( + name="image_sample", + value=all_image_list, + num_images_per_row=None if num_classes == 1 else self._num_images_per_class, + ) + return [metric_item] diff --git a/pe/callback/image/save_all_images.py b/pe/callback/image/save_all_images.py new file mode 100644 index 0000000..6429339 --- /dev/null +++ b/pe/callback/image/save_all_images.py @@ -0,0 +1,70 @@ +import imageio +import os +from tqdm import tqdm + +from pe.callback.callback import Callback +from pe.constant.data import IMAGE_DATA_COLUMN_NAME +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.logging import execution_logger + + +class SaveAllImages(Callback): + """The callback that saves all images.""" + + def __init__( + self, + output_folder, + path_format="{iteration:09d}/{label_id}_{label_name}/{index}.png", + tqdm_enabled=True, + ): + """Constructor. + + :param output_folder: The output folder that will be used to save the images + :type output_folder: str + :param path_format: The format of the image paths, defaults to + "{iteration:09d}/{label_id}_{label_name}/{index}.png" + :type path_format: str, optional + :param tqdm_enabled: Whether to show tqdm progress bar when saving the images, defaults to True + :type tqdm_enabled: bool, optional + """ + self._output_folder = output_folder + self._path_format = path_format + self._tqdm_enabled = tqdm_enabled + + def _save_image(self, image, label_name, label_id, index, iteration): + """A helper function that saves an image.""" + path = os.path.join( + self._output_folder, + self._path_format.format( + iteration=iteration, + label_id=label_id, + label_name=label_name, + index=index, + ), + ) + os.makedirs(os.path.dirname(path), exist_ok=True) + imageio.imsave(path, image) + + def __call__(self, syn_data): + """This function is called after each PE iteration that saves all images. 
+ + :param syn_data: The :py:class:`pe.data.data.Data` object of the synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + """ + execution_logger.info("Saving all images") + iterator = range(len(syn_data.data_frame)) + if self._tqdm_enabled: + iterator = tqdm(iterator) + for i in iterator: + image = syn_data.data_frame[IMAGE_DATA_COLUMN_NAME][i] + label_id = int(syn_data.data_frame[LABEL_ID_COLUMN_NAME][i]) + label_name = syn_data.metadata.label_names[label_id] + index = syn_data.data_frame.index[i] + self._save_image( + image=image, + label_name=label_name, + label_id=label_id, + index=index, + iteration=syn_data.metadata.iteration, + ) + execution_logger.info("Finished saving all images") diff --git a/pe/constant/__init__.py b/pe/constant/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pe/constant/data.py b/pe/constant/data.py new file mode 100644 index 0000000..0824790 --- /dev/null +++ b/pe/constant/data.py @@ -0,0 +1,29 @@ +#: The column name of the label ID +LABEL_ID_COLUMN_NAME = "PE.LABEL_ID" + +#: The column name of the clean histogram +CLEAN_HISTOGRAM_COLUMN_NAME = "PE.CLEAN_HISTOGRAM" +#: The column name of the DP histogram +DP_HISTOGRAM_COLUMN_NAME = "PE.DP_HISTOGRAM" +#: The column name of the post-processed (e.g., clipped) DP histogram +POST_PROCESSED_DP_HISTOGRAM_COLUMN_NAME = "PE.POST_PROCESSED_DP_HISTOGRAM" + +#: The column name of the embedding +EMBEDDING_COLUMN_NAME = "PE.EMBEDDING" +#: The column name of the lookahead embedding +LOOKAHEAD_EMBEDDING_COLUMN_NAME = "PE.LOOKAHEAD_EMBEDDING" + +#: The column name of the index of synthetic sample from the previous iteration that generates the current sample +PARENT_SYN_DATA_INDEX_COLUMN_NAME = "PE.PARENT_SYN_DATA_INDEX" +#: The column name of the flag that indicates whether the sample is from the last iteration +FROM_LAST_FLAG_COLUMN_NAME = "PE.FROM_LAST_FLAG" + +#: The column name of the image data +IMAGE_DATA_COLUMN_NAME = "PE.IMAGE" +#: The column name of the image label that is used for the model to generate the image +IMAGE_MODEL_LABEL_COLUMN_NAME = "PE.IMAGE_MODEL_LABEL" +#: The column name of the prompt for the image +IMAGE_PROMPT_COLUMN_NAME = "PE.IMAGE_PROMPT" + +#: The column name of the nearest neighbors voting IDs +HISTOGRAM_NEAREST_NEIGHBORS_VOTING_IDS_COLUMN_NAME = "PE.HISTOGRAM.NEAREST_NEIGHBORS.VOTING_IDS" diff --git a/pe/data/__init__.py b/pe/data/__init__.py new file mode 100644 index 0000000..02a0feb --- /dev/null +++ b/pe/data/__init__.py @@ -0,0 +1 @@ +from .data import Data diff --git a/pe/data/data.py b/pe/data/data.py new file mode 100644 index 0000000..1d2e854 --- /dev/null +++ b/pe/data/data.py @@ -0,0 +1,139 @@ +import os +from omegaconf import OmegaConf +import pandas as pd +from pe.constant.data import LABEL_ID_COLUMN_NAME + + +class Data: + """The class that holds the private data or synthetic data from PE.""" + + def __init__(self, data_frame=None, metadata={}): + """Constructor. + + :param data_frame: A pandas dataframe that holds the data, defaults to None + :type data_frame: :py:class:`pandas.DataFrame`, optional + :param metadata: the metadata of the data, defaults to {} + :type metadata: dict, optional + """ + self.data_frame = data_frame + self.metadata = OmegaConf.create(metadata) + self._data_frame_file_name = "data_frame.pkl" + self._metadata_file_name = "metadata.yaml" + + def __str__(self): + return f"Metadata:\n{self.metadata}\nData frame:\n{self.data_frame}" + + def save_checkpoint(self, path): + """Save the data to a checkpoint. 
+ + :param path: The folder to save the checkpoint + :type path: str + :raises ValueError: If the path is None + :raises ValueError: If the data frame is empty + """ + if path is None: + raise ValueError("Path is None") + if self.data_frame is None: + raise ValueError("Data frame is empty") + os.makedirs(path, exist_ok=True) + self.data_frame.to_pickle(os.path.join(path, self._data_frame_file_name)) + with open(os.path.join(path, self._metadata_file_name), "w") as file: + file.write(OmegaConf.to_yaml(self.metadata)) + + def load_checkpoint(self, path): + """Load data from a checkpoint + + :param path: The folder that contains the checkpoint + :type path: str + :return: Whether the checkpoint is loaded successfully + :rtype: bool + """ + data_frame_path = os.path.join(path, self._data_frame_file_name) + metadata_path = os.path.join(path, self._metadata_file_name) + if not os.path.exists(data_frame_path) or not os.path.exists(metadata_path): + return False + self.data_frame = pd.read_pickle(data_frame_path) + with open(metadata_path, "r") as file: + self.metadata = OmegaConf.create(file.read()) + return True + + def filter_label_id(self, label_id): + """Filter the data frame according to a label id + + :param label_id: The label id that is used to filter the data frame + :type label_id: int + :return: :py:class:`pe.data.data.Data` object with the filtered data frame + :rtype: :py:class:`pe.data.data.Data` + """ + return Data( + data_frame=self.data_frame[self.data_frame[LABEL_ID_COLUMN_NAME] == label_id], + metadata=self.metadata, + ) + + def set_label_id(self, label_id): + """Set the label id for the data frame + + :param label_id: The label id to set + :type label_id: int + """ + self.data_frame[LABEL_ID_COLUMN_NAME] = label_id + + def truncate(self, num_samples): + """Truncate the data frame to a certain number of samples + + :param num_samples: The number of samples to truncate + :type num_samples: int + :return: A new :py:class:`pe.data.data.Data` object with the truncated data frame + :rtype: :py:class:`pe.data.data.Data` + """ + return Data(data_frame=self.data_frame[:num_samples], metadata=self.metadata) + + def random_truncate(self, num_samples): + """Randomly truncate the data frame to a certain number of samples + + :param num_samples: The number of samples to randomly truncate + :type num_samples: int + :return: A new :py:class:`pe.data.data.Data` object with the randomly truncated data frame + :rtype: :py:class:`pe.data.data.Data` + """ + data_frame = self.data_frame.sample(n=num_samples) + return Data(data_frame=data_frame, metadata=self.metadata) + + def merge(self, data): + """Merge the data object with another data object + + :param data: The data object to merge + :type data: :py:class:`pe.data.data.Data` + :raises ValueError: If the metadata of `data` is not the same as the metadata of the current object + :return: The merged data object + :rtype: :py:class:`pe.data.data.Data` + """ + if self.metadata != data.metadata: + raise ValueError("Metadata must be the same") + cols_to_use = data.data_frame.columns.difference(self.data_frame.columns) + if len(cols_to_use) == 0: + return self + data_frame = self.data_frame.join(data.data_frame[cols_to_use]) + return Data(data_frame=data_frame, metadata=self.metadata) + + @classmethod + def concat(cls, data_list, metadata=None): + """Concatenate the data frames of a list of data objects + + :param data_list: The list of data objects to concatenate + :type data_list: list[:py:class:`pe.data.data.Data`] + :param metadata: The metadata 
of the concatenated data. When None, the metadata of the list of data objects + must be the same and will be used. Defaults to None + :type metadata: dict, optional + :raises ValueError: If the metadata of the data objects are not the same + :return: The concatenated data object + :rtype: :py:class:`pe.data.data.Data` + """ + data_frame_list = [data.data_frame for data in data_list] + if metadata is None: + metadata_list = [data.metadata for data in data_list] + # Check that all metadata are the same. + if len(set(metadata_list)) != 1: + raise ValueError("Metadata must be the same") + metadata = metadata_list[0] + return Data(data_frame=pd.concat(data_frame_list), metadata=metadata) diff --git a/pe/data/image/__init__.py b/pe/data/image/__init__.py new file mode 100644 index 0000000..fc1043d --- /dev/null +++ b/pe/data/image/__init__.py @@ -0,0 +1,4 @@ +from .image import load_image_folder +from .cifar10 import Cifar10 +from .camelyon17 import Camelyon17 +from .cat import Cat diff --git a/pe/data/image/camelyon17.py b/pe/data/image/camelyon17.py new file mode 100644 index 0000000..acfc37a --- /dev/null +++ b/pe/data/image/camelyon17.py @@ -0,0 +1,50 @@ +import pandas as pd +from wilds import get_dataset +from tqdm import tqdm +import numpy as np +import torchvision.transforms as T + +from pe.data import Data +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.constant.data import IMAGE_DATA_COLUMN_NAME + +CAMELYON17_LABEL_NAMES = [ + "no_tumor", + "tumor", +] + + +class Camelyon17(Data): + """The Camelyon17 dataset.""" + + def __init__(self, split="train", root_dir="data", res=64): + """Constructor. + + :param split: The split of the dataset. It should be either "train", "val", or "test", defaults to "train" + :type split: str, optional + :param root_dir: The root directory to save the dataset, defaults to "data" + :type root_dir: str, optional + :param res: The resolution of the images, defaults to 64 + :type res: int, optional + :raises ValueError: If the split is invalid + """ + if split not in ["train", "val", "test"]: + raise ValueError(f"Invalid split: {split}") + dataset = get_dataset(dataset="camelyon17", download=True, root_dir=root_dir) + data = dataset.get_subset(split) + transform = T.Resize(res) + + images = [] + labels = [] + for i in tqdm(range(len(data))): + image, label, _ = data[i] + images.append(np.array(transform(image))) + labels.append(label.item()) + data_frame = pd.DataFrame( + { + IMAGE_DATA_COLUMN_NAME: images, + LABEL_ID_COLUMN_NAME: labels, + } + ) + metadata = {"label_names": CAMELYON17_LABEL_NAMES} + super().__init__(data_frame=data_frame, metadata=metadata) diff --git a/pe/data/image/cat.py b/pe/data/image/cat.py new file mode 100644 index 0000000..1c0730b --- /dev/null +++ b/pe/data/image/cat.py @@ -0,0 +1,73 @@ +import pandas as pd +import os +from tqdm import tqdm +import numpy as np +import zipfile +from PIL import Image +import torchvision.transforms as T +from collections import defaultdict + +from pe.data import Data +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.constant.data import IMAGE_DATA_COLUMN_NAME +from pe.util import download + +CAT_LABEL_NAMES = [ + "cookie", + "doudou", +] + + +class Cat(Data): + """The Cat dataset.""" + + #: The URL of the dataset + URL = "https://www.kaggle.com/api/v1/datasets/download/fjxmlzn/cat-cookie-doudou" + + def __init__(self, root_dir="data", res=512): + """Constructor. 
+ + :param root_dir: The root directory to save the dataset, defaults to "data" + :type root_dir: str, optional + :param res: The resolution of the images, defaults to 512 + :type res: int, optional + """ + self._zip_path = os.path.join(root_dir, "cat-cookie-doudou.zip") + self._download() + data = self._read_data() + transform = T.Resize(res) + + images = [] + labels = [] + for label, sub_images in data.items(): + for image in tqdm(sub_images, desc=f"Processing {label} images"): + image = Image.fromarray(image) + image = transform(image) + image = np.array(image) + images.append(image) + labels.append(CAT_LABEL_NAMES.index(label)) + data_frame = pd.DataFrame( + { + IMAGE_DATA_COLUMN_NAME: images, + LABEL_ID_COLUMN_NAME: labels, + } + ) + metadata = {"label_names": CAT_LABEL_NAMES} + super().__init__(data_frame=data_frame, metadata=metadata) + + def _download(self): + """Download the dataset if it does not exist.""" + if not os.path.exists(self._zip_path): + os.makedirs(os.path.dirname(self._zip_path), exist_ok=True) + download(url=self.URL, fname=self._zip_path) + + def _read_data(self): + """Read the data from the zip file.""" + data = defaultdict(list) + with zipfile.ZipFile(self._zip_path) as z: + for name in tqdm(z.namelist(), desc="Reading zip file"): + with z.open(name) as f: + image = Image.open(f) + label = name.split("/")[0] + data[label].append(np.array(image)) + return data diff --git a/pe/data/image/cifar10.py b/pe/data/image/cifar10.py new file mode 100644 index 0000000..c54eaef --- /dev/null +++ b/pe/data/image/cifar10.py @@ -0,0 +1,45 @@ +import torchvision +import tempfile +import pandas as pd + +from pe.data import Data +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.constant.data import IMAGE_DATA_COLUMN_NAME + +CIFAR10_LABEL_NAMES = [ + "plane", + "car", + "bird", + "cat", + "deer", + "dog", + "frog", + "horse", + "ship", + "truck", +] + + +class Cifar10(Data): + """The CIFAR10 dataset.""" + + def __init__(self, split="train"): + """Constructor. + + :param split: The split of the dataset. It should be either "train" or "test", defaults to "train" + :type split: str, optional + :raises ValueError: If the split is invalid + """ + if split not in ["train", "test"]: + raise ValueError(f"Invalid split: {split}") + train = split == "train" + with tempfile.TemporaryDirectory() as tmp_dir: + dataset = torchvision.datasets.CIFAR10(root=tmp_dir, train=train, download=True) + data_frame = pd.DataFrame( + { + IMAGE_DATA_COLUMN_NAME: list(dataset.data), + LABEL_ID_COLUMN_NAME: dataset.targets, + } + ) + metadata = {"label_names": CIFAR10_LABEL_NAMES} + super().__init__(data_frame=data_frame, metadata=metadata) diff --git a/pe/data/image/image.py b/pe/data/image/image.py new file mode 100644 index 0000000..de495a2 --- /dev/null +++ b/pe/data/image/image.py @@ -0,0 +1,125 @@ +import pandas as pd +from PIL import Image as PILImage +import blobfile as bf +from torch.utils.data import Dataset +import torchvision.transforms as T +from torch.utils.data import DataLoader +import torch +import numpy as np + +from pe.data import Data +from pe.logging import execution_logger +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.constant.data import IMAGE_DATA_COLUMN_NAME + + +def _list_image_files_recursively(data_dir): + """List all image files in a directory recursively. 
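Usage of the dataset wrappers above is a one-liner; a sketch with CIFAR-10 (the split choice is only an example, and the dataset is downloaded on the fly):

from pe.data.image import Cifar10

# Wrap the CIFAR-10 test split as a Data object: images go into the image column of
# data.data_frame, class ids into the label id column, and label names into the metadata.
data = Cifar10(split="test")
print(len(data.data_frame), list(data.metadata["label_names"]))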
Adapted from + https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/image_datasets.py + """ + results = [] + for entry in sorted(bf.listdir(data_dir)): + full_path = bf.join(data_dir, entry) + ext = entry.split(".")[-1] + if "." in entry and ext.lower() in ["jpg", "jpeg", "png", "gif"]: + results.append(full_path) + elif bf.isdir(full_path): + results.extend(_list_image_files_recursively(full_path)) + return results + + +class ImageDataset(Dataset): + def __init__(self, folder, transform): + super().__init__() + self.folder = folder + self.transform = transform + + self.local_images = _list_image_files_recursively(folder) + self.local_class_names = [bf.basename(path).split("_")[0] for path in self.local_images] + self.class_names = list(sorted(set(self.local_class_names))) + self.class_name_to_id = {x: i for i, x in enumerate(self.class_names)} + self.local_classes = [self.class_name_to_id[x] for x in self.local_class_names] + + def __len__(self): + return len(self.local_images) + + def __getitem__(self, idx): + path = self.local_images[idx] + with bf.BlobFile(path, "rb") as f: + pil_image = PILImage.open(f) + pil_image.load() + + arr = self.transform(pil_image) + + label = self.local_classes[idx] + return arr, label + + +def load_image_folder(path, image_size, class_cond=True, num_images=-1, num_workers=10, batch_size=1000): + """Load a image dataset from a folder that contains image files. The folder can be nested arbitrarily. The image + file names must be in the format of "{class_name without '_'}_{suffix in any string}.ext". The "ext" can be "jpg", + "jpeg", "png", or "gif". The class names will be extracted from the file names before the first "_". If class_cond + is False, the class names will be ignored and all images will be treated as the same class with class name "None". + + :param path: The path to the root folder that contains the image files + :type path: str + :param image_size: The size of the images. Images will be resized to this size + :type image_size: int + :param class_cond: Whether to treat the loaded dataset as class conditional, defaults to True + :type class_cond: bool, optional + :param num_images: The number of images to load. If -1, load all images. Defaults to -1 + :type num_images: int, optional + :param num_workers: The number of workers to use for loading the images, defaults to 10 + :type num_workers: int, optional + :param batch_size: The batch size to use for loading the images, defaults to 1000 + :type batch_size: int, optional + :return: The loaded data + :rtype: :py:class:`pe.data.data.Data` + """ + transform = T.Compose([T.Resize(image_size), T.CenterCrop(image_size), T.ToTensor()]) + dataset = ImageDataset(folder=path, transform=transform) + loader = DataLoader( + dataset=dataset, + batch_size=batch_size, + shuffle=False, + num_workers=num_workers, + pin_memory=torch.cuda.is_available(), + drop_last=False, + ) + all_samples = [] + all_labels = [] + cnt = 0 + for batch, cond in loader: + all_samples.append(batch.cpu().numpy()) + + if class_cond: + all_labels.append(cond.cpu().numpy()) + + cnt += batch.shape[0] + + execution_logger.info(f"Loaded {cnt} samples.") + if batch.shape[0] < batch_size: + execution_logger.info("Containing incomplete batch. 
Please check whether num_images is as desired.") + + if num_images > 0 and cnt >= num_images: + break + + all_samples = np.concatenate(all_samples, axis=0) + if num_images <= 0: + num_images = all_samples.shape[0] + all_samples = all_samples[:num_images] + all_samples = np.around(np.clip(all_samples * 255, a_min=0, a_max=255)).astype(np.uint8) + all_samples = np.transpose(all_samples, (0, 2, 3, 1)) + if class_cond: + all_labels = np.concatenate(all_labels, axis=0) + all_labels = all_labels[:num_images] + else: + all_labels = np.zeros(shape=all_samples.shape[0], dtype=np.int64) + data_frame = pd.DataFrame( + { + IMAGE_DATA_COLUMN_NAME: list(all_samples), + LABEL_ID_COLUMN_NAME: list(all_labels), + } + ) + metadata = {"label_names": dataset.class_names if class_cond else ["None"]} + return Data(data_frame=data_frame, metadata=metadata)
diff --git a/pe/dp/__init__.py b/pe/dp/__init__.py new file mode 100644 index 0000000..5cbddc4 --- /dev/null +++ b/pe/dp/__init__.py @@ -0,0 +1,2 @@ +from .dp import DP +from .gaussian import Gaussian
diff --git a/pe/dp/dp.py b/pe/dp/dp.py new file mode 100644 index 0000000..284a1ba --- /dev/null +++ b/pe/dp/dp.py @@ -0,0 +1,29 @@ +from abc import ABC, abstractmethod + + +class DP(ABC): + """The abstract class for the Differential Privacy (DP) histogram mechanism.""" + + @abstractmethod + def set_epsilon_and_delta(self, num_iterations, epsilon, delta, noise_multiplier): + """Set the epsilon and delta for the DP mechanism. Either epsilon or noise_multiplier should be None. + + :param num_iterations: The number of PE iterations + :type num_iterations: int + :param epsilon: The epsilon value of DP + :type epsilon: float or None + :param delta: The delta value of DP + :type delta: float + :param noise_multiplier: The noise multiplier of the DP mechanism + :type noise_multiplier: float or None + """ + ... + + @abstractmethod + def add_noise(self, syn_data): + """Add noise to the histogram of synthetic data. + + :param syn_data: The synthetic data to add noise to + :type syn_data: :py:class:`pe.data.data.Data` + """ + ...
diff --git a/pe/dp/gaussian.py b/pe/dp/gaussian.py new file mode 100644 index 0000000..6dd9423 --- /dev/null +++ b/pe/dp/gaussian.py @@ -0,0 +1,172 @@ +import scipy.optimize +import scipy.stats +from scipy.optimize import root_scalar +import numpy as np + +from pe.dp import DP +from pe.logging import execution_logger +from pe.constant.data import CLEAN_HISTOGRAM_COLUMN_NAME +from pe.constant.data import DP_HISTOGRAM_COLUMN_NAME + + +def delta_Gaussian(eps, mu): + """Compute delta of Gaussian mechanism with shift mu or equivalently noise scale 1/mu. + + :param eps: The epsilon value + :type eps: float + :param mu: The mu value + :type mu: float + :return: The delta value + :rtype: float + """ + if mu == 0: + return 0 + if np.isinf(np.exp(eps)): + return 0 + return scipy.stats.norm.cdf(-eps / mu + mu / 2) - np.exp(eps) * scipy.stats.norm.cdf(-eps / mu - mu / 2) + + +def eps_Gaussian(delta, mu, max_epsilon): + """Compute eps of Gaussian mechanism with shift mu or equivalently noise scale 1/mu. + + :param delta: The delta value + :type delta: float + :param mu: The mu value + :type mu: float + :param max_epsilon: The maximum epsilon value to search for + :type max_epsilon: float + :return: The epsilon value + :rtype: float + """ + + def f(x): + return delta_Gaussian(x, mu) - delta + + return root_scalar(f, bracket=[0, max_epsilon], method="brentq").root + + +def compute_epsilon(noise_multiplier, num_steps, delta, max_epsilon=1e7): + """Compute epsilon of Gaussian mechanism. 
+ + :param noise_multiplier: The noise multiplier + :type noise_multiplier: float + :param num_steps: The number of steps + :type num_steps: int + :param delta: The delta value + :type delta: float + :param max_epsilon: The maximum epsilon value to search for, defaults to 1e7 + :type max_epsilon: float, optional + :return: The epsilon value. + :rtype: float + """ + return eps_Gaussian(delta=delta, mu=np.sqrt(num_steps) / noise_multiplier, max_epsilon=max_epsilon) + + +def get_noise_multiplier( + epsilon, + num_steps, + delta, + min_noise_multiplier=1e-1, + max_noise_multiplier=500, + max_epsilon=1e7, +): + """Get noise multiplier of Gaussian mechanism. + + :param epsilon: The epsilon value + :type epsilon: float + :param num_steps: The number of steps + :type num_steps: int + :param delta: The delta value + :type delta: float + :param min_noise_multiplier: The minimum noise multiplier to search for, defaults to 1e-1 + :type min_noise_multiplier: float, optional + :param max_noise_multiplier: The maximum noise multiplier to search for, defaults to 500 + :type max_noise_multiplier: float, optional + :param max_epsilon: The maximum epsilon value to search for, defaults to 1e7 + :type max_epsilon: float, optional + """ + + def objective(x): + return ( + compute_epsilon( + noise_multiplier=x, + num_steps=num_steps, + delta=delta, + max_epsilon=max_epsilon, + ) + - epsilon + ) + + output = root_scalar(objective, bracket=[min_noise_multiplier, max_noise_multiplier], method="brentq") + + if not output.converged: + raise ValueError("Failed to converge") + + return output.root + + +class Gaussian(DP): + """The Gaussian mechanism for Differential Privacy (DP) histogram.""" + + def set_epsilon_and_delta(self, num_iterations, epsilon, delta, noise_multiplier): + """Set the epsilon and delta for the Gaussian mechanism. + + :param num_iterations: The number of PE iterations + :type num_iterations: int + :param epsilon: The epsilon value of DP + :type epsilon: float + :param delta: The delta value of DP + :type delta: float + :param noise_multiplier: The noise multiplier of the DP mechanism + :type noise_multiplier: float + :raises ValueError: If delta is None + :raises ValueError: If both epsilon and noise_multiplier are None or not None + """ + if delta is None: + raise ValueError("Delta should not be None") + if (epsilon is None) == (noise_multiplier is None): + raise ValueError("Either epsilon or noise multiplier should be None") + + self._delta = delta + if epsilon is not None: + self._epsilon = epsilon + if num_iterations == 0: + self._noise_multiplier = 0 + execution_logger.warning( + "Since num_iterations is 0, noise_multiplier is set to 0, and epsilon is ignored." + ) + else: + self._noise_multiplier = get_noise_multiplier( + epsilon=epsilon, + num_steps=num_iterations, + delta=delta, + ) + else: + self._noise_multiplier = noise_multiplier + if num_iterations == 0: + self._epsilon = 0 + execution_logger.warning( + "Since num_iterations is 0, epsilon is set to 0, and noise_multiplier is ignored." + ) + else: + self._epsilon = compute_epsilon( + noise_multiplier=noise_multiplier, + num_steps=num_iterations, + delta=delta, + ) + execution_logger.info( + f"DP epsilon={self._epsilon}, delta={self._delta}, noise_multiplier={self._noise_multiplier}, " + f"num_iterations={num_iterations}." + ) + + def add_noise(self, syn_data): + """Add noise to the histogram of synthetic data. + + :param syn_data: The synthetic data to add noise. 
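To make the accounting functions above concrete, a small sketch with illustrative numbers (10 PE iterations under an overall (1.0, 1e-5)-DP budget):

from pe.dp.gaussian import get_noise_multiplier, compute_epsilon

# Derive the per-iteration noise multiplier for the target budget ...
noise_multiplier = get_noise_multiplier(epsilon=1.0, num_steps=10, delta=1e-5)
# ... and convert it back; this should recover (approximately) the target epsilon.
recovered_epsilon = compute_epsilon(noise_multiplier=noise_multiplier, num_steps=10, delta=1e-5)
print(noise_multiplier, recovered_epsilon)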
The synthetic data should have the + :py:const:`pe.constant.data.CLEAN_HISTOGRAM_COLUMN_NAME` column + :type syn_data: :py:class:`pe.data.data.Data` + :return: The synthetic data with noise added to the histogram. The noisy histogram is stored in the + :py:const:`pe.constant.data.DP_HISTOGRAM_COLUMN_NAME` column + :rtype: :py:class:`pe.data.data.Data` + """ + syn_data.data_frame[DP_HISTOGRAM_COLUMN_NAME] = syn_data.data_frame[ + CLEAN_HISTOGRAM_COLUMN_NAME + ] + np.random.normal(scale=self._noise_multiplier, size=len(syn_data.data_frame)) + return syn_data diff --git a/pe/embedding/__init__.py b/pe/embedding/__init__.py new file mode 100644 index 0000000..b42f9d1 --- /dev/null +++ b/pe/embedding/__init__.py @@ -0,0 +1 @@ +from .embedding import Embedding diff --git a/pe/embedding/embedding.py b/pe/embedding/embedding.py new file mode 100644 index 0000000..ae64be6 --- /dev/null +++ b/pe/embedding/embedding.py @@ -0,0 +1,21 @@ +from abc import ABC, abstractmethod + +from pe.constant.data import EMBEDDING_COLUMN_NAME + + +class Embedding(ABC): + """The abstract class that computes the embedding of samples.""" + + @property + def column_name(self): + """The column name to be used in the data frame.""" + return f"{EMBEDDING_COLUMN_NAME}.{type(self).__name__}" + + @abstractmethod + def compute_embedding(self, data): + """Compute the embedding of samples. + + :param data: The data to compute the embedding + :type data: :py:class:`pe.data.data.Data` + """ + pass diff --git a/pe/embedding/image/__init__.py b/pe/embedding/image/__init__.py new file mode 100644 index 0000000..293879f --- /dev/null +++ b/pe/embedding/image/__init__.py @@ -0,0 +1 @@ +from .inception import Inception diff --git a/pe/embedding/image/inception.py b/pe/embedding/image/inception.py new file mode 100644 index 0000000..c6a8e67 --- /dev/null +++ b/pe/embedding/image/inception.py @@ -0,0 +1,79 @@ +import tempfile +import numpy as np +import torch +import pandas as pd +from tqdm import tqdm + +from cleanfid.inception_torchscript import InceptionV3W +from cleanfid.resize import build_resizer +from cleanfid.resize import make_resizer + +from pe.embedding import Embedding +from pe.constant.data import IMAGE_DATA_COLUMN_NAME +from pe.logging import execution_logger + + +def to_uint8(x, min, max): + x = (x - min) / (max - min) + x = np.around(np.clip(x * 255, a_min=0, a_max=255)).astype(np.uint8) + return x + + +class Inception(Embedding): + """Compute the Inception embedding of images.""" + + def __init__(self, res, device="cuda", batch_size=2000): + """Constructor. + + :param res: The resolution of the images. The images will be resized to (res, res) before computing the + embedding + :type res: int + :param device: The device to use for computing the embedding, defaults to "cuda" + :type device: str, optional + :param batch_size: The batch size to use for computing the embedding, defaults to 2000 + :type batch_size: int, optional + """ + super().__init__() + self._temp_folder = tempfile.TemporaryDirectory() + self._device = device + self._inception = InceptionV3W(path=self._temp_folder.name, download=True, resize_inside=False).to(device) + self._resize_pre = make_resizer( + library="PIL", + quantize_after=False, + filter="bicubic", + output_size=(res, res), + ) + self._resizer = build_resizer("clean") + self._batch_size = batch_size + + def compute_embedding(self, data): + """Compute the Inception embedding of images. 
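The column_name property above is what keeps embeddings from different Embedding subclasses in separate data frame columns; a minimal sketch with a hypothetical dummy subclass:

from pe.embedding import Embedding

class DummyEmbedding(Embedding):
    """Hypothetical no-op embedding, only used to show the column naming convention."""

    def compute_embedding(self, data):
        return data

# Prints the embedding column prefix (pe.constant.data.EMBEDDING_COLUMN_NAME) followed by ".DummyEmbedding".
print(DummyEmbedding().column_name)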
+ + :param data: The data object containing the images + :type data: :py:class:`pe.data.data.Data` + :return: The data object with the computed embedding + :rtype: :py:class:`pe.data.data.Data` + """ + if self.column_name in data.data_frame.columns: + execution_logger.info(f"Embedding: {self.column_name} already computed") + return data + execution_logger.info(f"Embedding: computing {self.column_name} for {len(data.data_frame)} samples") + x = np.stack(data.data_frame[IMAGE_DATA_COLUMN_NAME].values, axis=0) + if x.shape[3] == 1: + x = np.repeat(x, 3, axis=3) + embeddings = [] + for i in tqdm(range(0, len(x), self._batch_size)): + transformed_x = [] + for j in range(i, min(i + self._batch_size, len(x))): + image = x[j] + image = self._resize_pre(image) + image = to_uint8(image, min=0, max=255) + image = self._resizer(image) + transformed_x.append(image) + transformed_x = np.stack(transformed_x, axis=0).transpose((0, 3, 1, 2)) + embeddings.append(self._inception(torch.from_numpy(transformed_x).to(self._device))) + embeddings = torch.cat(embeddings, dim=0) + embeddings = embeddings.cpu().detach().numpy() + data.data_frame[self.column_name] = pd.Series(list(embeddings), index=data.data_frame.index) + execution_logger.info(f"Embedding: finished computing {self.column_name} for {len(data.data_frame)} samples") + return data diff --git a/pe/histogram/__init__.py b/pe/histogram/__init__.py new file mode 100644 index 0000000..7db90c1 --- /dev/null +++ b/pe/histogram/__init__.py @@ -0,0 +1,2 @@ +from .histogram import Histogram +from .nearest_neighbors import NearestNeighbors diff --git a/pe/histogram/histogram.py b/pe/histogram/histogram.py new file mode 100644 index 0000000..1bc7093 --- /dev/null +++ b/pe/histogram/histogram.py @@ -0,0 +1,18 @@ +from abc import ABC, abstractmethod + + +class Histogram(ABC): + """The abstract class for computing the histogram over synthetic samples. The histogram values indicate how good + each synthetic sample is in terms their closeness to the private data. + """ + + @abstractmethod + def compute_histogram(self, priv_data, syn_data): + """Compute the histogram over the synthetic data using the private data. + + :param priv_data: The private data + :type priv_data: :py:class:`pe.data.data.Data` + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + """ + ... diff --git a/pe/histogram/nearest_neighbor_backend/__init__.py b/pe/histogram/nearest_neighbor_backend/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pe/histogram/nearest_neighbor_backend/faiss.py b/pe/histogram/nearest_neighbor_backend/faiss.py new file mode 100644 index 0000000..d4a8ba8 --- /dev/null +++ b/pe/histogram/nearest_neighbor_backend/faiss.py @@ -0,0 +1,40 @@ +import faiss +import torch +import numpy as np + + +def search(syn_embedding, priv_embedding, num_nearest_neighbors, mode): + """Compute the nearest neighbors of the private embedding in the synthetic embedding using FAISS. + + :param syn_embedding: The synthetic embedding + :type syn_embedding: np.ndarray + :param priv_embedding: The private embedding + :type priv_embedding: np.ndarray + :param num_nearest_neighbors: The number of nearest neighbors to search + :type num_nearest_neighbors: int + :param mode: The distance metric to use for finding the nearest neighbors. 
It should be one of the following: + "l2" (l2 distance), "cos_sim" (cosine similarity), "ip" (inner product) + :type mode: str + :raises ValueError: If the mode is unknown + :return: The distances and indices of the nearest neighbors + :rtype: tuple[np.ndarray, np.ndarray] + """ + if mode.lower() == "l2": + index = faiss.IndexFlatL2(syn_embedding.shape[1]) + elif mode.lower() == "ip": + index = faiss.IndexFlatIP(syn_embedding.shape[1]) + elif mode.lower() == "cos_sim": + index = faiss.IndexFlatIP(syn_embedding.shape[1]) + faiss.normalize_L2(syn_embedding) + faiss.normalize_L2(priv_embedding) + else: + raise ValueError(f"Unknown mode: {mode}") + + if torch.cuda.is_available(): + ngpus = faiss.get_num_gpus() + co = faiss.GpuMultipleClonerOptions() + index = faiss.index_cpu_to_all_gpus(index, co, ngpus) + + index.add(syn_embedding) + distances, ids = index.search(priv_embedding, num_nearest_neighbors) + return np.sqrt(distances), ids diff --git a/pe/histogram/nearest_neighbor_backend/sklearn.py b/pe/histogram/nearest_neighbor_backend/sklearn.py new file mode 100644 index 0000000..2b3554f --- /dev/null +++ b/pe/histogram/nearest_neighbor_backend/sklearn.py @@ -0,0 +1,30 @@ +from sklearn.neighbors import NearestNeighbors + + +def search(syn_embedding, priv_embedding, num_nearest_neighbors, mode): + """Compute the nearest neighbors of the private embedding in the synthetic embedding using sklearn. + + :param syn_embedding: The synthetic embedding + :type syn_embedding: np.ndarray + :param priv_embedding: The private embedding + :type priv_embedding: np.ndarray + :param num_nearest_neighbors: The number of nearest neighbors to search + :type num_nearest_neighbors: int + :param mode: The distance metric to use for finding the nearest neighbors. It should be one of the following: + "l2" (l2 distance), "cos_sim" (cosine similarity) + :type mode: str + :raises ValueError: If the mode is unknown + :return: The distances and indices of the nearest neighbors + :rtype: tuple[np.ndarray, np.ndarray] + """ + if mode.lower() == "l2": + metric = "l2" + elif mode.lower() == "cos_sim": + metric = "cosine" + else: + raise ValueError(f"Unknown mode: {mode}") + + nn = NearestNeighbors(n_neighbors=num_nearest_neighbors, metric=metric, algorithm="brute", n_jobs=-1) + nn.fit(syn_embedding) + distances, ids = nn.kneighbors(priv_embedding) + return distances, ids diff --git a/pe/histogram/nearest_neighbors.py b/pe/histogram/nearest_neighbors.py new file mode 100644 index 0000000..d8182c3 --- /dev/null +++ b/pe/histogram/nearest_neighbors.py @@ -0,0 +1,199 @@ +import numpy as np +import os +from collections import Counter +import copy + +from pe.histogram import Histogram +from pe.constant.data import CLEAN_HISTOGRAM_COLUMN_NAME +from pe.constant.data import LOOKAHEAD_EMBEDDING_COLUMN_NAME +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.constant.data import HISTOGRAM_NEAREST_NEIGHBORS_VOTING_IDS_COLUMN_NAME +from pe.logging import execution_logger + + +class NearestNeighbors(Histogram): + """Compute the nearest neighbors histogram. Each private sample will vote for their closest `num_nearest_neighbors` + synthetic samples to construct the histogram. The l2 norm of the votes from each private sample is normalized to 1. + """ + + def __init__( + self, + embedding, + mode, + lookahead_degree, + lookahead_log_folder=None, + voting_details_log_folder=None, + api=None, + num_nearest_neighbors=1, + backend="sklearn", + ): + """Constructor. 
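Both backends above follow the same contract: given synthetic and private embeddings, they return (distances, ids) of shape (num_private_samples, num_nearest_neighbors). A tiny sketch with toy 2-D embeddings using the sklearn backend:

import numpy as np
from pe.histogram.nearest_neighbor_backend.sklearn import search

syn = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]], dtype=np.float32)
priv = np.array([[0.1, 0.0], [0.9, 0.9], [0.0, 0.8]], dtype=np.float32)

distances, ids = search(syn_embedding=syn, priv_embedding=priv, num_nearest_neighbors=1, mode="l2")
print(ids.flatten())  # [0 3 2]: the closest synthetic sample for each private sample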
+ + :param embedding: The :py:class:`pe.embedding.embedding.Embedding` object to compute the embedding of samples + :type embedding: :py:class:`pe.embedding.embedding.Embedding` + :param mode: The distance metric to use for finding the nearest neighbors. It should be one of the following: + "l2" (l2 distance), "cos_sim" (cosine similarity), "ip" (inner product). Not all backends support all + modes + :type mode: str + :param lookahead_degree: The degree of lookahead to compute the embedding of synthetic samples. If it is 0, the + original embedding is used. If it is greater than 0, the embedding of the synthetic samples is computed by + averaging the embeddings of the synthetic samples generated by the variation API for `lookahead_degree` + times + :type lookahead_degree: int + :param lookahead_log_folder: The folder to save the logs of the lookahead. If it is None, the logs are not + saved. Defaults to None + :type lookahead_log_folder: str, optional + :param voting_details_log_folder: The folder to save the logs of the voting details. If it is None, the logs + are not saved. Defaults to None + :type voting_details_log_folder: str, optional + :param api: The API to generate synthetic samples. It should be provided when `lookahead_degree` is greater + than 0. Defaults to None + :type api: :py:class:`pe.api.api.API`, optional + :param num_nearest_neighbors: The number of nearest neighbors to consider for each private sample, defaults to + 1 + :type num_nearest_neighbors: int, optional + :param backend: The backend to use for finding the nearest neighbors. It should be one of the following: + "faiss" (FAISS), "sklearn" (scikit-learn). Defaults to "sklearn". FAISS supports GPU and is much faster + when the number of synthetic samples and/or private samples is large. It requires the installation of + `faiss-gpu` or `faiss-cpu` package. See https://faiss.ai/ + :type backend: str, optional + :raises ValueError: If the `api` is not provided when `lookahead_degree` is greater than 0 + :raises ValueError: If the `backend` is unknown + """ + super().__init__() + self._embedding = embedding + self._mode = mode + self._lookahead_degree = lookahead_degree + self._lookahead_log_folder = lookahead_log_folder + self._voting_details_log_folder = voting_details_log_folder + self._api = api + self._num_nearest_neighbors = num_nearest_neighbors + if self._lookahead_degree > 0 and self._api is None: + raise ValueError("API should be provided when lookahead_degree is greater than 0") + if backend.lower() == "faiss": + from pe.histogram.nearest_neighbor_backend.faiss import search + + self._search = search + elif backend.lower() == "sklearn": + from pe.histogram.nearest_neighbor_backend.sklearn import search + + self._search = search + else: + raise ValueError(f"Unknown backend: {backend}") + + def _log_lookahead(self, syn_data, lookahead_id): + """Log the lookahead data. + + :param syn_data: The lookahead data + :type syn_data: :py:class:`pe.data.data.Data` + :param lookahead_id: The ID of the lookahead + :type lookahead_id: int + """ + if self._lookahead_log_folder is None: + return + labels = set(list(syn_data.data_frame[LABEL_ID_COLUMN_NAME].values)) + assert len(labels) == 1 + label = list(labels)[0] + iteration = syn_data.metadata["iteration"] + log_folder = os.path.join( + self._lookahead_log_folder, f"{iteration}", f"label-id{label}_lookahead{lookahead_id}" + ) + syn_data.save_checkpoint(log_folder) + + def _log_voting_details(self, priv_data, syn_data, ids): + """Log the voting details. 
+ + :param priv_data: The private data + :type priv_data: :py:class:`pe.data.data.Data` + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :param ids: The IDs of the nearest neighbors for each private sample + :type ids: np.ndarray + """ + if self._voting_details_log_folder is None: + return + labels = set(list(priv_data.data_frame[LABEL_ID_COLUMN_NAME].values)) + assert len(labels) == 1 + label = list(labels)[0] + iteration = syn_data.metadata["iteration"] + log_folder = os.path.join(self._voting_details_log_folder, f"{iteration}", f"label-id{label}") + priv_data = copy.deepcopy(priv_data) + priv_data.data_frame[HISTOGRAM_NEAREST_NEIGHBORS_VOTING_IDS_COLUMN_NAME] = list(ids) + priv_data.save_checkpoint(log_folder) + + def _compute_lookahead_embedding(self, syn_data): + """Compute the embedding of synthetic samples with lookahead. + + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :return: The synthetic data with the computed embedding in the column + :py:const:`pe.constant.data.LOOKAHEAD_EMBEDDING_COLUMN_NAME` + :rtype: :py:class:`pe.data.data.Data` + """ + if self._lookahead_degree == 0: + syn_data = self._embedding.compute_embedding(syn_data) + syn_data.data_frame[LOOKAHEAD_EMBEDDING_COLUMN_NAME] = syn_data.data_frame[self._embedding.column_name] + else: + embedding_list = [] + for lookahead_id in range(self._lookahead_degree): + variation_data = self._api.variation_api(syn_data=syn_data) + variation_data = self._embedding.compute_embedding(variation_data) + self._log_lookahead(syn_data=variation_data, lookahead_id=lookahead_id) + embedding_list.append( + np.stack( + variation_data.data_frame[self._embedding.column_name].values, + axis=0, + ) + ) + embedding = np.mean(embedding_list, axis=0) + syn_data.data_frame[LOOKAHEAD_EMBEDDING_COLUMN_NAME] = list(embedding) + self._log_lookahead(syn_data=syn_data, lookahead_id=-1) + + return syn_data + + def compute_histogram(self, priv_data, syn_data): + """Compute the nearest neighbors histogram. 
+ + :param priv_data: The private data + :type priv_data: :py:class:`pe.data.data.Data` + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :return: The private data, possibly with the additional embedding column, and the synthetic data, with the + computed histogram in the column :py:const:`pe.constant.data.CLEAN_HISTOGRAM_COLUMN_NAME` and possibly with + the additional embedding column + :rtype: tuple[:py:class:`pe.data.data.Data`, :py:class:`pe.data.data.Data`] + """ + execution_logger.info( + f"Histogram: computing nearest neighbors histogram for {len(priv_data.data_frame)} private " + f"samples and {len(syn_data.data_frame)} synthetic samples" + ) + + priv_data = self._embedding.compute_embedding(priv_data) + syn_data = self._compute_lookahead_embedding(syn_data) + + priv_embedding = np.stack(priv_data.data_frame[self._embedding.column_name].values, axis=0).astype(np.float32) + syn_embedding = np.stack(syn_data.data_frame[LOOKAHEAD_EMBEDDING_COLUMN_NAME].values, axis=0).astype( + np.float32 + ) + + _, ids = self._search( + syn_embedding=syn_embedding, + priv_embedding=priv_embedding, + num_nearest_neighbors=self._num_nearest_neighbors, + mode=self._mode, + ) + self._log_voting_details(priv_data=priv_data, syn_data=syn_data, ids=ids) + + counter = Counter(list(ids.flatten())) + count = np.zeros(shape=syn_embedding.shape[0], dtype=np.float32) + count[list(counter.keys())] = list(counter.values()) + count /= np.sqrt(self._num_nearest_neighbors) + + syn_data.data_frame[CLEAN_HISTOGRAM_COLUMN_NAME] = count + + execution_logger.info( + f"Histogram: finished computing nearest neighbors histogram for {len(priv_data.data_frame)} private " + f"samples and {len(syn_data.data_frame)} synthetic samples" + ) + + return priv_data, syn_data diff --git a/pe/logger/__init__.py b/pe/logger/__init__.py new file mode 100644 index 0000000..0447e54 --- /dev/null +++ b/pe/logger/__init__.py @@ -0,0 +1,4 @@ +from .csv_print import CSVPrint +from .image_file import ImageFile +from .log_print import LogPrint +from .matplotlib_pdf import MatplotlibPDF diff --git a/pe/logger/csv_print.py b/pe/logger/csv_print.py new file mode 100644 index 0000000..7fd3023 --- /dev/null +++ b/pe/logger/csv_print.py @@ -0,0 +1,109 @@ +import os +import csv +import torch +import numpy as np +from collections import defaultdict + +from .logger import Logger +from pe.metric_item import FloatMetricItem +from pe.metric_item import FloatListMetricItem + + +class CSVPrint(Logger): + """The logger that prints the metrics to CSV files.""" + + def __init__( + self, + output_folder, + path_separator="-", + float_format=".8f", + flush_iteration_freq=1, + ): + """Constructor. 
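The voting step above can be re-created in isolation; a sketch with made-up nearest-neighbor ids for 4 private samples, 3 synthetic samples, and num_nearest_neighbors=2:

import numpy as np
from collections import Counter

ids = np.array([[0, 1], [0, 2], [1, 1], [2, 0]])  # hypothetical votes
counter = Counter(ids.flatten().tolist())
histogram = np.zeros(3, dtype=np.float32)
histogram[list(counter.keys())] = list(counter.values())
histogram /= np.sqrt(2)  # with 2 votes each, every private sample contributes l2 norm 1
print(histogram)  # [3, 3, 2] / sqrt(2)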
+ + :param output_folder: The output folder that will be used to save the CSV files + :type output_folder: str + :param path_separator: The string that will be used to replace '\' and '/' in log names, defaults to "-" + :type path_separator: str, optional + :param float_format: The format of the floating point numbers, defaults to ".8f" + :type float_format: str, optional + :param flush_iteration_freq: The frequency to flush the logs, defaults to 1 + :type flush_iteration_freq: int, optional + """ + self._output_folder = output_folder + os.makedirs(self._output_folder, exist_ok=True) + self._path_separator = path_separator + self._float_format = float_format + self._flush_iteration_freq = flush_iteration_freq + self._clear_logs() + + def _clear_logs(self): + """Clear the logs.""" + self._logs = defaultdict(list) + + def _get_log_path(self, iteration, item): + """Get the log path. + + :param iteration: The PE iteration number + :type iteration: int + :param item: The metric item + :type item: :py:class:`pe.metric_item.MetricItem` + :return: The log path + :rtype: str + """ + log_path = item.name + log_path = log_path.replace("/", self._path_separator) + log_path = log_path.replace("\\", self._path_separator) + log_path = os.path.join(self._output_folder, log_path + ".csv") + return log_path + + def _flush(self): + """Flush the logs.""" + for path in self._logs: + with open(path, "a") as f: + writer = csv.writer(f) + writer.writerows(self._logs[path]) + + def _log_float(self, log_path, iteration, item): + """Log a float metric item. + + :param log_path: The path of the log file + :type log_path: str + :param iteration: The PE iteration number + :type iteration: int + :param item: The float metric item + :type item: :py:class:`pe.metric_item.FloatMetricItem` or :py:class:`pe.metric_item.FloatListMetricItem` + """ + str_iteration = str(iteration) + str_value = item.value + if isinstance(item.value, torch.Tensor): + str_value = item.value.cpu().detach().numpy() + if isinstance(str_value, np.ndarray): + str_value = str_value.tolist() + if isinstance(str_value, list): + str_value = ",".join([format(v, self._float_format) for v in str_value]) + else: + str_value = format(str_value, self._float_format) + self._logs[log_path].append([str_iteration, str_value]) + + def log(self, iteration, metric_items): + """Log the metrics. + + :param iteration: The PE iteration number + :type iteration: int + :param metric_items: The metrics to log + :type metric_items: list[:py:class:`pe.metric_item.MetricItem`] + """ + for item in metric_items: + if not isinstance(item, (FloatMetricItem, FloatListMetricItem)): + continue + log_path = self._get_log_path(iteration, item) + self._log_float(log_path, iteration, item) + if iteration % self._flush_iteration_freq == 0: + self._flush() + self._clear_logs() + + def clean_up(self): + """Clean up the logger.""" + self._flush() + self._clear_logs() diff --git a/pe/logger/image_file.py b/pe/logger/image_file.py new file mode 100644 index 0000000..7b7a8bc --- /dev/null +++ b/pe/logger/image_file.py @@ -0,0 +1,107 @@ +import os +import imageio +import math +import torch +import numpy as np +from torchvision.utils import make_grid + +from .logger import Logger +from pe.metric_item import ImageMetricItem, ImageListMetricItem + + +class ImageFile(Logger): + """The logger that saves images to files.""" + + def __init__( + self, + output_folder, + path_separator="-", + iteration_format="09d", + ): + """Constructor. 
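A short sketch of how CSVPrint above is driven (the output folder name and the metric are hypothetical):

from pe.logger import CSVPrint
from pe.metric_item import FloatMetricItem

logger = CSVPrint(output_folder="logs_demo")
# Appends the row "3,12.30000000" to logs_demo/fid.csv and flushes it.
logger.log(iteration=3, metric_items=[FloatMetricItem(name="fid", value=12.3)])
logger.clean_up()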
+ + :param output_folder: The output folder that will be used to save the images + :type output_folder: str + :param path_separator: The string that will be used to replace '\' and '/' in log names, defaults to "-" + :type path_separator: str, optional + :param iteration_format: The format of the iteration number, defaults to "09d" + :type iteration_format: str, optional + """ + self._output_folder = output_folder + self._path_separator = path_separator + self._iteration_format = iteration_format + + def log(self, iteration, metric_items): + """Log the images. + + :param iteration: The PE iteration number + :type iteration: int + :param metric_items: The images to log + :type metric_items: list[:py:class:`pe.metric_item.ImageMetricItem` or + :py:class:`pe.metric_item.ImageListMetricItem`] + """ + for item in metric_items: + if not isinstance(item, (ImageMetricItem, ImageListMetricItem)): + continue + image_path = self._get_image_path(iteration, item) + if isinstance(item, ImageMetricItem): + self._log_image(image_path, item) + elif isinstance(item, ImageListMetricItem): + self._log_image_list(image_path, item) + + def _get_image_path(self, iteration, item): + """Get the image save path. + + :param iteration: The PE iteration number + :type iteration: int + :param item: The image metric item + :type item: :py:class:`pe.metric_item.ImageMetricItem` or :py:class:`pe.metric_item.ImageListMetricItem` + :return: The image save path + :rtype: str + """ + os.makedirs(self._output_folder, exist_ok=True) + image_name = item.name + image_name = image_name.replace("/", self._path_separator) + image_name = image_name.replace("\\", self._path_separator) + image_folder = os.path.join(self._output_folder, image_name) + os.makedirs(image_folder, exist_ok=True) + iteration_string = format(iteration, self._iteration_format) + image_file_name = f"{iteration_string}.png" + image_path = os.path.join( + image_folder, + image_file_name, + ) + return image_path + + def _log_image(self, image_path, item): + """Log a single image. + + :param image_path: The path to save the image + :type image_path: str + :param item: The image metric item + :type item: :py:class:`pe.metric_item.ImageMetricItem` + """ + image = item.value + if isinstance(image, torch.Tensor): + image = image.cpu().detach().numpy() + imageio.imwrite(image_path, image) + + def _log_image_list(self, image_path, item): + """Log a list of images. + + :param image_path: The path to save the image + :type image_path: str + :param item: The image list metric item + :type item: :py:class:`pe.metric_item.ImageListMetricItem` + """ + images = item.value + num_images_per_row = item.num_images_per_row + if num_images_per_row is None: + num_images_per_row = int(math.sqrt(len(images))) + + if isinstance(images[0], np.ndarray): + images = [torch.from_numpy(image.transpose(2, 0, 1)) for image in images] + + image = make_grid(images, nrow=num_images_per_row).cpu().detach().numpy() + image = image.transpose((1, 2, 0)) + imageio.imwrite(image_path, image) diff --git a/pe/logger/log_print.py b/pe/logger/log_print.py new file mode 100644 index 0000000..00dc8d4 --- /dev/null +++ b/pe/logger/log_print.py @@ -0,0 +1,38 @@ +from .logger import Logger +from pe.metric_item import FloatMetricItem, FloatListMetricItem +from pe.logging import execution_logger + + +class LogPrint(Logger): + """The logger that prints the metrics to the console/file using :py:const:`pe.logging.execution_logger`.""" + + def __init__(self, log_iteration_freq=1): + """Constructor. 
+ + :param log_iteration_freq: The frequency to log the metrics, defaults to 1 + :type log_iteration_freq: int, optional + """ + self._log_iteration_freq = log_iteration_freq + + def log(self, iteration, metric_items): + """Log the metrics to the console/file. + + :param iteration: The PE iteration number + :type iteration: int + :param metric_items: The metrics to log + :type metric_items: list[:py:class:`pe.metric_item.FloatMetricItem` or + :py:class:`pe.metric_item.FloatListMetricItem`] + """ + if iteration % self._log_iteration_freq != 0: + return + metric_items = [item for item in metric_items if isinstance(item, (FloatMetricItem, FloatListMetricItem))] + if len(metric_items) == 0: + return + execution_logger.info(f"Iteration: {iteration}") + for item in metric_items: + if isinstance(item, FloatMetricItem): + value = [item.value] + else: + value = item.value + value = ",".join([f"{v:.8f}" for v in value]) + execution_logger.info(f"\t{item.name}: {value}") diff --git a/pe/logger/logger.py b/pe/logger/logger.py new file mode 100644 index 0000000..1f1c646 --- /dev/null +++ b/pe/logger/logger.py @@ -0,0 +1,20 @@ +from abc import ABC, abstractmethod + + +class Logger(ABC): + """The abstract class for logging the metrics""" + + @abstractmethod + def log(self, iteration, metric_items): + """Log the metrics. + + :param iteration: The PE iteration number + :type iteration: int + :param metric_items: The metrics to log + :type metric_items: list[:py:class:`pe.metric_item.MetricItem`] + """ + ... + + def clean_up(self): + """Clean up the logger.""" + ... diff --git a/pe/logger/matplotlib_pdf.py b/pe/logger/matplotlib_pdf.py new file mode 100644 index 0000000..da61eec --- /dev/null +++ b/pe/logger/matplotlib_pdf.py @@ -0,0 +1,65 @@ +import os + +from .logger import Logger +from pe.metric_item import MatplotlibMetricItem + + +class MatplotlibPDF(Logger): + """The logger that saves Matplotlib figures to PDF files.""" + + def __init__( + self, + output_folder, + path_separator="-", + iteration_format="09d", + ): + """Constructor. + + :param output_folder: The output folder that will be used to save the PDF files + :type output_folder: str + :param path_separator: The string that will be used to replace '\' and '/' in log names, defaults to "-" + :type path_separator: str, optional + :param iteration_format: The format of the iteration number, defaults to "09d" + :type iteration_format: str, optional + """ + self._output_folder = output_folder + os.makedirs(self._output_folder, exist_ok=True) + self._path_separator = path_separator + self._iteration_format = iteration_format + + def log(self, iteration, metric_items): + """Log the Matplotlib figures to PDF files. + + :param iteration: The PE iteration number + :type iteration: int + :param metric_items: The Matplotlib figures to log + :type metric_items: list[:py:class:`pe.metric_item.MatplotlibMetricItem`] + """ + for item in metric_items: + if not isinstance(item, (MatplotlibMetricItem,)): + continue + pdf_path = self._get_pdf_path(iteration, item) + item.value.savefig(pdf_path) + + def _get_pdf_path(self, iteration, item): + """Get the PDF save path. 
+ + :param iteration: The PE iteration number + :type iteration: int + :param item: The Matplotlib figure metric item + :type item: :py:class:`pe.metric_item.MatplotlibMetricItem` + :return: The PDF save path + :rtype: str + """ + image_name = item.name + image_name = image_name.replace("/", self._path_separator) + image_name = image_name.replace("\\", self._path_separator) + image_folder = os.path.join(self._output_folder, image_name) + os.makedirs(image_folder, exist_ok=True) + iteration_string = format(iteration, self._iteration_format) + image_file_name = f"{iteration_string}.pdf" + image_path = os.path.join( + image_folder, + image_file_name, + ) + return image_path diff --git a/pe/logging/__init__.py b/pe/logging/__init__.py new file mode 100644 index 0000000..2cd65c7 --- /dev/null +++ b/pe/logging/__init__.py @@ -0,0 +1,43 @@ +import logging +import os + +#: The logger that will be used to log the execution information +execution_logger = logging.getLogger() + + +def setup_logging( + log_file=None, + datefmt="%m/%d/%Y %H:%M:%S %p", + fmt="%(asctime)s [%(name)s] [%(levelname)-5.5s] %(message)s", + level=logging.INFO, + name="logger", +): + """Setup the logging configuration. + + :param log_file: The log file path, defaults to None + :type log_file: str, optional + :param datefmt: The date format, defaults to "%m/%d/%Y %H:%M:%S %p" + :type datefmt: str, optional + :param fmt: The log format, defaults to "%(asctime)s [%(name)s] [%(levelname)-5.5s] %(message)s" + :type fmt: str, optional + :param level: The log level, defaults to logging.INFO + :type level: int, optional + :param name: The logger name, defaults to "logger" + :type name: str, optional + """ + execution_logger.name = name + + execution_logger.handlers.clear() + execution_logger.setLevel(level) + + log_formatter = logging.Formatter(fmt=fmt, datefmt=datefmt) + + console_handler = logging.StreamHandler() + console_handler.setFormatter(log_formatter) + execution_logger.addHandler(console_handler) + + if log_file is not None: + os.makedirs(os.path.dirname(log_file), exist_ok=True) + file_handler = logging.FileHandler(log_file) + file_handler.setFormatter(log_formatter) + execution_logger.addHandler(file_handler) diff --git a/pe/metric_item/__init__.py b/pe/metric_item/__init__.py new file mode 100644 index 0000000..d5a9103 --- /dev/null +++ b/pe/metric_item/__init__.py @@ -0,0 +1,100 @@ +import matplotlib.pyplot as plt + +scopes = [] + + +class metric_scope(object): + """The context manager to manage the metric scope.""" + + def __init__(self, name): + self._name = name + + def __enter__(self): + scopes.append(self._name) + + def __exit__(self, type, value, traceback): + scopes.pop() + + +class MetricItem(object): + """The base class for the metric item.""" + + def __init__(self, name, value): + """Constructor. + + :param name: The name of the metric item + :type name: str + :param value: The value of the metric item + :type value: object + """ + self._name = "/".join(scopes + [name]) + self._value = value + + @property + def name(self): + """Get the name of the metric item. + + :return: The name of the metric item + :rtype: str + """ + return self._name + + @property + def value(self): + """Get the value of the metric item. 
+ + :return: The value of the metric item + :rtype: object + """ + return self._value + + def clean_up(self): + """Clean up the metric item.""" + pass + + +class MatplotlibMetricItem(MetricItem): + """The metric item for Matplotlib figures.""" + + def clean_up(self): + plt.close(self._value) + + +class FloatMetricItem(MetricItem): + """The metric item for a single float value.""" + + pass + + +class FloatListMetricItem(MetricItem): + """The metric item for a list of float values.""" + + pass + + +class ImageMetricItem(MetricItem): + """The metric item for an image.""" + + pass + + +class ImageListMetricItem(MetricItem): + """The metric item for a list of images.""" + + def __init__(self, num_images_per_row=None, *args, **kwargs): + """Constructor. + + :param num_images_per_row: The number of images per row when saving to the file, defaults to None + :type num_images_per_row: int, optional + """ + super().__init__(*args, **kwargs) + self._num_images_per_row = num_images_per_row + + @property + def num_images_per_row(self): + """Get the number of images per row when saving to the file. + + :return: The number of images per row when saving to the file + :rtype: int or None + """ + return self._num_images_per_row diff --git a/pe/population/__init__.py b/pe/population/__init__.py new file mode 100644 index 0000000..1a5cb0c --- /dev/null +++ b/pe/population/__init__.py @@ -0,0 +1,2 @@ +from .population import Population +from .pe_population import PEPopulation diff --git a/pe/population/pe_population.py b/pe/population/pe_population.py new file mode 100644 index 0000000..24180b5 --- /dev/null +++ b/pe/population/pe_population.py @@ -0,0 +1,145 @@ +import numpy as np + +from .population import Population +from pe.data import Data +from pe.constant.data import DP_HISTOGRAM_COLUMN_NAME +from pe.constant.data import POST_PROCESSED_DP_HISTOGRAM_COLUMN_NAME +from pe.constant.data import PARENT_SYN_DATA_INDEX_COLUMN_NAME +from pe.constant.data import FROM_LAST_FLAG_COLUMN_NAME +from pe.logging import execution_logger + + +class PEPopulation(Population): + """The default population algorithm for Private Evolution.""" + + def __init__( + self, + api, + histogram_threshold, + initial_variation_api_fold=0, + next_variation_api_fold=1, + keep_selected=False, + selection_mode="sample", + ): + """Constructor. + + :param api: The API object that contains the random and variation APIs + :type api: :py:class:`pe.api.api.API` + :param histogram_threshold: The threshold for clipping the histogram + :type histogram_threshold: float + :param initial_variation_api_fold: The number of variations to apply to the initial synthetic data, defaults to + 0 + :type initial_variation_api_fold: int, optional + :param next_variation_api_fold: The number of variations to apply to the next synthetic data, defaults to 1 + :type next_variation_api_fold: int, optional + :param keep_selected: Whether to keep the selected data in the next synthetic data, defaults to False + :type keep_selected: bool, optional + :param selection_mode: The selection mode for selecting the data. It should be one of the following: "sample"( + random sampling proportional to the histogram). 
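The metric_scope context manager above is how metric names seen by the loggers are namespaced; a minimal sketch with hypothetical scope names:

from pe.metric_item import metric_scope, FloatMetricItem

with metric_scope("fid"):
    with metric_scope("iteration_1"):
        item = FloatMetricItem(name="value", value=12.3)
print(item.name)  # fid/iteration_1/value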
Defaults to "sample" + :type selection_mode: str, optional + :raises ValueError: If next_variation_api_fold is 0 and keep_selected is False + """ + super().__init__() + self._api = api + self._histogram_threshold = histogram_threshold + self._initial_variation_api_fold = initial_variation_api_fold + self._next_variation_api_fold = next_variation_api_fold + self._keep_selected = keep_selected + self._selection_mode = selection_mode + if self._next_variation_api_fold == 0 and not self._keep_selected: + raise ValueError( + "next_variation_api_fold should be greater than 0 or keep_selected should be True. Otherwise, next " + "synthetic data will be empty." + ) + + def initial(self, label_name, num_samples): + """Generate the initial synthetic data. + + :param label_name: The label name + :type label_name: str + :param num_samples: The number of samples to generate + :type num_samples: int + :return: The initial synthetic data + :rtype: :py:class:`pe.data.data.Data` + """ + execution_logger.info( + f"Population: generating {num_samples}*{self._initial_variation_api_fold + 1} initial " + f"synthetic samples for label {label_name}" + ) + random_data = self._api.random_api(label_name=label_name, num_samples=num_samples) + variation_data_list = [] + for _ in range(self._initial_variation_api_fold): + variation_data = self._api.variation_api(syn_data=random_data) + variation_data_list.append(variation_data) + data = Data.concat([random_data] + variation_data_list) + execution_logger.info( + f"Population: finished generating {num_samples}*{self._initial_variation_api_fold + 1} initial " + f"synthetic samples for label {label_name}" + ) + return data + + def _post_process_histogram(self, syn_data): + """Post process the histogram of synthetic data (e.g., clipping). + + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :return: The synthetic data with post-processed histogram in the column + :py:const:`pe.constant.data.POST_PROCESSED_DP_HISTOGRAM_COLUMN_NAME` + :rtype: :py:class:`pe.data.data.Data` + """ + count = syn_data.data_frame[DP_HISTOGRAM_COLUMN_NAME].to_numpy() + clipped_count = np.clip(count, a_min=self._histogram_threshold, a_max=None) + clipped_count -= self._histogram_threshold + syn_data.data_frame[POST_PROCESSED_DP_HISTOGRAM_COLUMN_NAME] = clipped_count + return syn_data + + def _select_data(self, syn_data, num_samples): + """Select data from the synthetic data according to `selection_mode`. + + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :param num_samples: The number of samples to select + :type num_samples: int + :raises ValueError: If the selection mode is not supported + :return: The selected data + :rtype: :py:class:`pe.data.data.Data` + """ + if self._selection_mode == "sample": + count = syn_data.data_frame[POST_PROCESSED_DP_HISTOGRAM_COLUMN_NAME].to_numpy() + prob = count / count.sum() + indices = np.random.choice(len(syn_data.data_frame), size=num_samples, p=prob) + new_data_frame = syn_data.data_frame.iloc[indices] + new_data_frame[PARENT_SYN_DATA_INDEX_COLUMN_NAME] = syn_data.data_frame.index[indices] + return Data(data_frame=new_data_frame, metadata=syn_data.metadata) + else: + raise ValueError(f"Selection mode {self._selection_mode} is not supported") + + def next(self, syn_data, num_samples): + """Generate the next synthetic data. 
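The histogram post-processing and selection described above boil down to a clip-shift-sample scheme; a sketch with illustrative numbers:

import numpy as np

dp_histogram = np.array([5.0, 0.4, 2.0, -1.0])  # hypothetical noisy histogram
threshold = 1.0
clipped = np.clip(dp_histogram, a_min=threshold, a_max=None) - threshold  # [4, 0, 1, 0]
prob = clipped / clipped.sum()
# Parents of the next generation are drawn proportionally to the processed histogram.
parents = np.random.choice(len(dp_histogram), size=6, p=prob)
print(parents)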
+ + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :param num_samples: The number of samples to generate + :type num_samples: int + :return: The next synthetic data + :rtype: :py:class:`pe.data.data.Data` + """ + execution_logger.info( + f"Population: generating {num_samples}*{self._next_variation_api_fold} " "next synthetic samples" + ) + syn_data = self._post_process_histogram(syn_data) + selected_data = self._select_data(syn_data, num_samples) + selected_data.data_frame[FROM_LAST_FLAG_COLUMN_NAME] = 1 + variation_data_list = [] + for _ in range(self._next_variation_api_fold): + variation_data = self._api.variation_api(syn_data=selected_data) + variation_data.data_frame[PARENT_SYN_DATA_INDEX_COLUMN_NAME] = selected_data.data_frame[ + PARENT_SYN_DATA_INDEX_COLUMN_NAME + ].values + variation_data.data_frame[FROM_LAST_FLAG_COLUMN_NAME] = 0 + variation_data_list.append(variation_data) + new_syn_data = Data.concat(variation_data_list + ([selected_data] if self._keep_selected else [])) + execution_logger.info( + f"Population: finished generating {num_samples}*{self._next_variation_api_fold} " "next synthetic samples" + ) + return new_syn_data diff --git a/pe/population/population.py b/pe/population/population.py new file mode 100644 index 0000000..2676b91 --- /dev/null +++ b/pe/population/population.py @@ -0,0 +1,27 @@ +from abc import ABC, abstractmethod + + +class Population(ABC): + """The abstract class that generates synthetic data.""" + + @abstractmethod + def initial(self, label_name, num_samples): + """Generate the initial synthetic data. + + :param label_name: The label name + :type label_name: str + :param num_samples: The number of samples to generate + :type num_samples: int + """ + pass + + @abstractmethod + def next(self, syn_data, num_samples): + """Generate the next synthetic data. + + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + :param num_samples: The number of samples to generate + :type num_samples: int + """ + pass diff --git a/pe/runner/__init__.py b/pe/runner/__init__.py new file mode 100644 index 0000000..5d3b4ba --- /dev/null +++ b/pe/runner/__init__.py @@ -0,0 +1 @@ +from .pe import PE diff --git a/pe/runner/pe.py b/pe/runner/pe.py new file mode 100644 index 0000000..f212a61 --- /dev/null +++ b/pe/runner/pe.py @@ -0,0 +1,222 @@ +import numpy as np + +from pe.dp import Gaussian +from pe.data import Data +from pe.constant.data import LABEL_ID_COLUMN_NAME +from pe.logging import execution_logger + + +class PE(object): + """The class that runs the PE algorithm.""" + + def __init__(self, priv_data, population, histogram, dp=None, loggers=[], callbacks=[]): + """Constructor. 
+ + :param priv_data: The private data + :type priv_data: :py:class:`pe.data.data.Data` + :param population: The population algorithm + :type population: :py:class:`pe.population.population.Population` + :param histogram: The histogram algorithm + :type histogram: :py:class:`pe.histogram.histogram.Histogram` + :param dp: The DP algorithm, defaults to None, in which case the Gaussian mechanism + :py:class:`pe.dp.gaussian.Gaussian` is used + :type dp: :py:class:`pe.dp.dp.DP`, optional + :param loggers: The list of loggers, defaults to [] + :type loggers: list[:py:class:`pe.logger.logger.Logger`], optional + :param callbacks: The list of callbacks, defaults to [] + :type callbacks: list[Callable or :py:class:`pe.callback.callback.Callback`], optional + """ + super().__init__() + self._priv_data = priv_data + self._population = population + self._histogram = histogram + if dp is None: + dp = Gaussian() + self._dp = dp + self._loggers = loggers + self._callbacks = callbacks + + def load_checkpoint(self, checkpoint_path): + """Load a checkpoint. + + :param checkpoint_path: The path to the checkpoint + :type checkpoint_path: str + :return: The synthetic data + :rtype: :py:class:`pe.data.data.Data` or None + """ + syn_data = Data() + if not syn_data.load_checkpoint(checkpoint_path): + return None + return syn_data + + def _log_metrics(self, syn_data): + """Log metrics. + + :param syn_data: The synthetic data + :type syn_data: :py:class:`pe.data.data.Data` + """ + if not self._callbacks: + return + metric_items = [] + for callback in self._callbacks: + metric_items.extend(callback(syn_data) or []) + for logger in self._loggers: + logger.log(iteration=syn_data.metadata.iteration, metric_items=metric_items) + for metric_item in metric_items: + metric_item.clean_up() + + def _get_num_samples_per_label_id(self, num_samples, fraction_per_label_id): + """Get the number of samples per label id given the total number of samples + + :param num_samples: The total number of samples + :type num_samples: int + :param fraction_per_label_id: The fraction of samples for each label id. The fraction does not have to be + normalized. When it is None, the fraction is assumed to be the same as the fraction of label ids in the + private data. Defaults to None + :type fraction_per_label_id: list[float], optional + :raises ValueError: If the length of fraction_per_label_id is not the same as the number of labels + :raises ValueError: If the number of samples is so small that the number of samples for some label ids is zero + :return: The number of samples per label id + :rtype: np.ndarray + """ + if fraction_per_label_id is None: + execution_logger.warning( + "fraction_per_label_id is not provided. Assuming the fraction of label ids in private data is public " + "information." 
+ ) + fraction_per_label_id = self._priv_data.data_frame[LABEL_ID_COLUMN_NAME].value_counts().to_dict() + fraction_per_label_id = [ + 0 if i not in fraction_per_label_id else fraction_per_label_id[i] + for i in range(len(self._priv_data.metadata.label_names)) + ] + if len(fraction_per_label_id) != len(self._priv_data.metadata.label_names): + raise ValueError("fraction_per_label_id should have the same length as the number of labels.") + fraction_per_label_id = np.array(fraction_per_label_id) + fraction_per_label_id = fraction_per_label_id / np.sum(fraction_per_label_id) + + target_num_samples_per_label_id = fraction_per_label_id * num_samples + num_samples_per_label_id = np.floor(target_num_samples_per_label_id).astype(int) + num_samples_left = num_samples - np.sum(num_samples_per_label_id) + ids = np.argsort(target_num_samples_per_label_id - num_samples_per_label_id)[::-1] + num_samples_per_label_id[ids[:num_samples_left]] += 1 + assert np.sum(num_samples_per_label_id) == num_samples + if np.any(num_samples_per_label_id == 0): + raise ValueError("num_samples is so small that the number of samples for some label ids is zero.") + return num_samples_per_label_id + + def _clean_up_loggers(self): + """Clean up loggers.""" + for logger in self._loggers: + logger.clean_up() + + def run( + self, + num_samples_schedule, + delta, + epsilon=None, + noise_multiplier=None, + checkpoint_path=None, + save_checkpoint=True, + fraction_per_label_id=None, + ): + """Run the PE algorithm. + + :param num_samples_schedule: The schedule of the number of samples for each PE iteration. The first element is + the number of samples for the initial data, and the rest are the number of samples for each PE iteration. + So the length of the list is the number of PE iterations plus one + :type num_samples_schedule: list[int] + :param delta: The delta value of DP + :type delta: float + :param epsilon: The epsilon value of DP, defaults to None + :type epsilon: float, optional + :param noise_multiplier: The noise multiplier of the DP mechanism, defaults to None + :type noise_multiplier: float, optional + :param checkpoint_path: The path to load and save the checkpoint, defaults to None + :type checkpoint_path: str, optional + :param save_checkpoint: Whether to save the checkpoint, defaults to True + :type save_checkpoint: bool, optional + :param fraction_per_label_id: The fraction of samples for each label id. The fraction does not have to be + normalized. When it is None, the fraction is assumed to be the same as the fraction of label ids in the + private data. Defaults to None + :type fraction_per_label_id: list[float], optional + :return: The synthetic data + :rtype: :py:class:`pe.data.data.Data` + """ + try: + # Set privacy budget. + self._dp.set_epsilon_and_delta( + num_iterations=len(num_samples_schedule) - 1, + epsilon=epsilon, + delta=delta, + noise_multiplier=noise_multiplier, + ) + + # Generate or load initial data. 
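+            # If a usable checkpoint is found we resume from it; otherwise the initial synthetic data
+            # is generated per label so that the label counts follow fraction_per_label_id (or the
+            # private label distribution when it is not provided).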
+ if checkpoint_path is not None and (syn_data := self.load_checkpoint(checkpoint_path)): + execution_logger.info( + f"Loaded checkpoint from {checkpoint_path}, iteration={syn_data.metadata.iteration}" + ) + else: + num_samples_per_label_id = self._get_num_samples_per_label_id( + num_samples=num_samples_schedule[0], + fraction_per_label_id=fraction_per_label_id, + ) + syn_data_list = [] + for label_id, label_name in enumerate(self._priv_data.metadata.label_names): + syn_data = self._population.initial( + label_name=label_name, + num_samples=num_samples_per_label_id[label_id], + ) + syn_data.set_label_id(label_id) + syn_data_list.append(syn_data) + syn_data = Data.concat(syn_data_list, metadata=self._priv_data.metadata) + syn_data.data_frame.reset_index(drop=True, inplace=True) + syn_data.metadata.iteration = 0 + syn_data.metadata.label_names = self._priv_data.metadata.label_names + self._log_metrics(syn_data) + + # Run PE iterations. + for iteration in range(syn_data.metadata.iteration + 1, len(num_samples_schedule)): + execution_logger.info(f"PE iteration {iteration}") + num_samples_per_label_id = self._get_num_samples_per_label_id( + num_samples=num_samples_schedule[iteration], + fraction_per_label_id=fraction_per_label_id, + ) + syn_data_list = [] + priv_data_list = [] + + # Generate synthetic data for each label. + for label_id in range(len(self._priv_data.metadata.label_names)): + execution_logger.info(f"Label {label_id}") + sub_priv_data = self._priv_data.filter_label_id(label_id=label_id) + sub_syn_data = syn_data.filter_label_id(label_id=label_id) + + # DP NN histogram. + sub_priv_data, sub_syn_data = self._histogram.compute_histogram( + priv_data=sub_priv_data, syn_data=sub_syn_data + ) + priv_data_list.append(sub_priv_data) + sub_syn_data = self._dp.add_noise(syn_data=sub_syn_data) + + # Generate next population. 
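+                    # The population uses the noisy histogram attached to sub_syn_data to select
+                    # promising samples and request variations of them (see pe/population for the
+                    # concrete selection and variation logic).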
+ sub_syn_data = self._population.next( + syn_data=sub_syn_data, + num_samples=num_samples_per_label_id[label_id], + ) + sub_syn_data.set_label_id(label_id) + syn_data_list.append(sub_syn_data) + + syn_data = Data.concat(syn_data_list) + syn_data.data_frame.reset_index(drop=True, inplace=True) + syn_data.metadata.iteration = iteration + + new_priv_data = Data.concat(priv_data_list) + self._priv_data = self._priv_data.merge(new_priv_data) + + if save_checkpoint: + syn_data.save_checkpoint(checkpoint_path) + self._log_metrics(syn_data) + finally: + self._clean_up_loggers() + + return syn_data diff --git a/pe/util/__init__.py b/pe/util/__init__.py new file mode 100644 index 0000000..ae25d07 --- /dev/null +++ b/pe/util/__init__.py @@ -0,0 +1 @@ +from .download import download diff --git a/pe/util/download.py b/pe/util/download.py new file mode 100644 index 0000000..aa2ddc9 --- /dev/null +++ b/pe/util/download.py @@ -0,0 +1,21 @@ +import requests +from tqdm import tqdm + + +def download(url: str, fname: str, chunk_size=1024): + """ + From: + https://gist.github.com/yanqd0/c13ed29e29432e3cf3e7c38467f42f51 + """ + resp = requests.get(url, stream=True) + total = int(resp.headers.get("content-length", 0)) + with open(fname, "wb") as file, tqdm( + desc=fname, + total=total, + unit="iB", + unit_scale=True, + unit_divisor=1024, + ) as bar: + for data in resp.iter_content(chunk_size=chunk_size): + size = file.write(data) + bar.update(size) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..643a0ac --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,43 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "private-evolution" +version = "0.0.1" +maintainers = [{ name = "Zinan Lin", email = "zinanlin@microsoft.com" }] +requires-python = ">=3.8" +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", +] +readme = "README.md" +description = "Differentially private synthetic data via foundation model inference APIs." +dependencies = [ + "matplotlib", + "clean-fid", + "omegaconf", + "pandas", + "scikit-learn", +] + +[project.optional-dependencies] +dev = ["pre-commit", "black", "sphinx", "sphinx-rtd-theme"] +image = [ + "blobfile", + "torch", + "imageio", + "clip@git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1", + "diffusers[pytorch]", + "improved-diffusion@git+https://github.com/fjxmlzn/improved-diffusion.git@8f6677c3c47d1c1ad2e22ad2603eaec4cc639805", + "wilds", +] + +[project.urls] +Homepage = "https://github.com/microsoft/DPSDA" +Documentation = "https://github.com/microsoft/DPSDA" +Repository = "https://github.com/microsoft/DPSDA" +"Bug Tracker" = "https://github.com/microsoft/DPSDA/issues" + +[tool.setuptools.packages.find] +exclude = ["doc", "data", "example", "docker*", "amlt", "dist*", "_*"]