Commit

releasing version 0.0.67, updating mha, pallas flash attention and some other kernels, debugging lora
erfanzar committed Jun 13, 2024
1 parent 84c91ee commit 784e874
Showing 90 changed files with 1,928 additions and 1,369 deletions.
2 changes: 1 addition & 1 deletion .idea/FXUtils.iml


2 changes: 1 addition & 1 deletion .idea/misc.xml


13 changes: 4 additions & 9 deletions README.md
@@ -1,7 +1,3 @@
<p align="center">
<img src="logo/light-logo.png" alt="Alt text"/>
</p>

# FJFormer

Embark on a journey of paralleled/unparalleled computational prowess with FJFormer - an arsenal of custom Jax Flax
@@ -15,16 +11,15 @@ checkpoint savers, partitioning tools, and other helpful functions.
The goal of FJFormer is to make your life easier when working with Flax and JAX. Whether you are training a new model,
fine-tuning an existing one, or just exploring the capabilities of these powerful frameworks, FJFormer offers

- FlashAttention on `TPU/GPU` 🧬
- BITComputations for 8,6,4 BIT Flax Models 🤏
- Smart Dataset Loading
- Pallas Kernels for GPU,TPU
- BITComputations for 8,6,4 BIT Flax Models
- Built-in functions and Loss functions
- GPU-Pallas triton like implementation of `Softmax`, `FlashAttention`, `RMSNorm`, `LayerNorm`
- Distributed and sharding Model Loaders and Checkpoint Savers
- Monitoring Utils for *TPU/GPU/CPU* memory `foot-print`
- Optimizers
- Special Optimizers with schedulers and Easy to Use
- Partitioning Utils
- LoRA with `XRapture` 🤠
- LoRA

and a lot of these features are fully documented, so I guess FJFormer has something to offer; it's not just a computation backend for [EasyDel](https://github.com/erfanzar/EasyDel).
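
The distributed loading, checkpointing, and partitioning utilities called out in this list sit on top of JAX's native sharding machinery. For orientation, here is a minimal plain-JAX sketch of that workflow; it uses only public `jax.sharding` APIs, since FJFormer's own helper names are not shown in this diff and are not assumed here.

```python
# Minimal plain-JAX sketch of the sharding/partitioning workflow that
# loaders and partitioning utils build on. Only public jax.sharding APIs
# are used; no FJFormer helper names are assumed.
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# 1D device mesh over whatever accelerators (or CPU devices) are present.
devices = np.array(jax.devices())
mesh = Mesh(devices, axis_names=("dp",))

# Shard a parameter matrix along its first axis across the "dp" mesh axis.
params = jnp.ones((8 * len(devices), 128))
params = jax.device_put(params, NamedSharding(mesh, P("dp", None)))

# jit-compiled code consumes the sharded array directly; XLA inserts any
# collectives that the layout requires.
@jax.jit
def scale(x):
    return x * 2.0

print(scale(params).sharding)
```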
2 changes: 2 additions & 0 deletions docs/generated-bit_quantization-calibration.md
@@ -0,0 +1,2 @@
# bit_quantization.calibration
::: src.fjformer.bit_quantization.calibration
2 changes: 2 additions & 0 deletions docs/generated-bit_quantization-config.md
@@ -0,0 +1,2 @@
# bit_quantization.config
::: src.fjformer.bit_quantization.config
2 changes: 2 additions & 0 deletions docs/generated-bit_quantization-int_numerics.md
@@ -0,0 +1,2 @@
# bit_quantization.int_numerics
::: src.fjformer.bit_quantization.int_numerics
2 changes: 2 additions & 0 deletions docs/generated-bit_quantization-no_numerics.md
@@ -0,0 +1,2 @@
# bit_quantization.no_numerics
::: src.fjformer.bit_quantization.no_numerics
2 changes: 2 additions & 0 deletions docs/generated-bit_quantization-numerics.md
@@ -0,0 +1,2 @@
# bit_quantization.numerics
::: src.fjformer.bit_quantization.numerics
2 changes: 2 additions & 0 deletions docs/generated-bit_quantization-q_dot_general.md
@@ -0,0 +1,2 @@
# bit_quantization.q_dot_general
::: src.fjformer.bit_quantization.q_dot_general
2 changes: 2 additions & 0 deletions docs/generated-bit_quantization-q_flax.md
@@ -0,0 +1,2 @@
# bit_quantization.q_flax
::: src.fjformer.bit_quantization.q_flax
2 changes: 2 additions & 0 deletions docs/generated-bit_quantization-stochastic_rounding.md
@@ -0,0 +1,2 @@
# bit_quantization.stochastic_rounding
::: src.fjformer.bit_quantization.stochastic_rounding
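
The `bit_quantization` modules listed above (calibration, integer numerics, stochastic rounding, `q_dot_general`, and a Flax wrapper) document a quantized dot-general path. As a purely illustrative sketch of the underlying idea, and not the API of these modules, a symmetric int8 matmul in plain `jax.numpy` looks roughly like this:

```python
# Illustrative-only sketch of a symmetric int8 quantized matmul, the kind
# of operation a q_dot_general-style module calibrates and fuses. Plain
# jax.numpy; it does not reproduce fjformer.bit_quantization's actual API.
import jax.numpy as jnp

def quantize_int8(x):
    # Per-tensor symmetric scale: map max |x| onto the int8 range.
    scale = jnp.max(jnp.abs(x)) / 127.0
    q = jnp.clip(jnp.round(x / scale), -127, 127).astype(jnp.int8)
    return q, scale

def int8_matmul(a, b):
    qa, sa = quantize_int8(a)
    qb, sb = quantize_int8(b)
    # Accumulate in int32, then rescale back to float.
    acc = jnp.matmul(qa.astype(jnp.int32), qb.astype(jnp.int32))
    return acc.astype(jnp.float32) * (sa * sb)

a = jnp.linspace(-1.0, 1.0, 8 * 16).reshape(8, 16)
b = jnp.linspace(-0.5, 0.5, 16 * 4).reshape(16, 4)
print(jnp.max(jnp.abs(int8_matmul(a, b) - a @ b)))  # small quantization error
```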
2 changes: 2 additions & 0 deletions docs/generated-pallas_operations-gpu-flash_attention-mha.md
@@ -0,0 +1,2 @@
# pallas_operations.gpu.flash_attention.mha
::: src.fjformer.pallas_operations.gpu.flash_attention.mha
2 changes: 2 additions & 0 deletions docs/generated-pallas_operations-gpu-layer_norm-layer_norm.md
@@ -0,0 +1,2 @@
# pallas_operations.gpu.layer_norm.layer_norm
::: src.fjformer.pallas_operations.gpu.layer_norm.layer_norm
2 changes: 2 additions & 0 deletions docs/generated-pallas_operations-gpu-rms_norm-rms_norm.md
@@ -0,0 +1,2 @@
# pallas_operations.gpu.rms_norm.rms_norm
::: src.fjformer.pallas_operations.gpu.rms_norm.rms_norm
2 changes: 2 additions & 0 deletions docs/generated-pallas_operations-gpu-softmax-softmax.md
@@ -0,0 +1,2 @@
# pallas_operations.gpu.softmax.softmax
::: src.fjformer.pallas_operations.gpu.softmax.softmax
@@ -0,0 +1,2 @@
# pallas_operations.pallas_attention.attention
::: src.fjformer.pallas_operations.pallas_attention.attention
@@ -0,0 +1,2 @@
# pallas_operations.tpu.flash_attention.flash_attention
::: src.fjformer.pallas_operations.tpu.flash_attention.flash_attention
@@ -0,0 +1,2 @@
# pallas_operations.tpu.paged_attention.paged_attention
::: src.fjformer.pallas_operations.tpu.paged_attention.paged_attention
@@ -0,0 +1,2 @@
# pallas_operations.tpu.ring_attention.ring_attention
::: src.fjformer.pallas_operations.tpu.ring_attention.ring_attention
@@ -0,0 +1,2 @@
# pallas_operations.tpu.splash_attention.splash_attention_kernel
::: src.fjformer.pallas_operations.tpu.splash_attention.splash_attention_kernel
@@ -0,0 +1,2 @@
# pallas_operations.tpu.splash_attention.splash_attention_mask
::: src.fjformer.pallas_operations.tpu.splash_attention.splash_attention_mask
@@ -0,0 +1,2 @@
# pallas_operations.tpu.splash_attention.splash_attention_mask_info
::: src.fjformer.pallas_operations.tpu.splash_attention.splash_attention_mask_info
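
The attention kernels documented above (MHA, flash, paged, ring, and splash attention) are fused implementations of scaled dot-product attention. For reference only, here is the naive non-Pallas computation they accelerate, written in plain JAX; it materializes the full attention matrix, which is exactly what the tiled kernels avoid.

```python
# Naive reference for scaled dot-product attention, for orientation only.
# The Pallas kernels above compute the same function with fused, tiled
# memory access; this version builds the full [q_len, kv_len] matrix.
import jax
import jax.numpy as jnp

def reference_attention(q, k, v, mask=None):
    # q: [q_len, d], k/v: [kv_len, d]
    scores = (q @ k.T) / jnp.sqrt(q.shape[-1]).astype(q.dtype)
    if mask is not None:
        scores = jnp.where(mask, scores, -jnp.inf)
    weights = jax.nn.softmax(scores, axis=-1)
    return weights @ v

key = jax.random.PRNGKey(0)
q, k, v = (jax.random.normal(k_, (128, 64)) for k_ in jax.random.split(key, 3))
print(reference_attention(q, k, v).shape)  # (128, 64)
```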
65 changes: 31 additions & 34 deletions mkdocs.yml
@@ -1,15 +1,13 @@
nav:
- Bits:
- Bits: generated-bits-bits.md
- Calibration: generated-bits-calibration.md
- Config: generated-bits-config.md
- Int Numerics: generated-bits-int_numerics.md
- No Numerics: generated-bits-no_numerics.md
- Numerics: generated-bits-numerics.md
- Q Dot General: generated-bits-q_dot_general.md
- Q Flax: generated-bits-q_flax.md
- Qk: generated-bits-qk.md
- Stochastic Rounding: generated-bits-stochastic_rounding.md
- Bit Quantization:
- Calibration: generated-bit_quantization-calibration.md
- Config: generated-bit_quantization-config.md
- Int Numerics: generated-bit_quantization-int_numerics.md
- No Numerics: generated-bit_quantization-no_numerics.md
- Numerics: generated-bit_quantization-numerics.md
- Q Dot General: generated-bit_quantization-q_dot_general.md
- Q Flax: generated-bit_quantization-q_flax.md
- Stochastic Rounding: generated-bit_quantization-stochastic_rounding.md
- Checkpoint:
- Load: generated-checkpoint-_load.md
- Streamer: generated-checkpoint-streamer.md
@@ -30,29 +28,28 @@ nav:
- Pallas Operations:
- Efficient Attention:
- Efficient Attention: generated-pallas_operations-efficient_attention-efficient_attention.md
- Layer Norm:
- Gpu:
- Layer Norm: generated-pallas_operations-layer_norm-gpu-layer_norm.md
- Pallas Flash Attention:
- Attention: generated-pallas_operations-pallas_flash_attention-attention.md
- Ring Attention:
- Ring Attention: generated-pallas_operations-ring_attention-ring_attention.md
- Rms Norm:
- Gpu:
- Rms Norm: generated-pallas_operations-rms_norm-gpu-rms_norm.md
- Softmax:
- Gpu:
- Softmax: generated-pallas_operations-softmax-gpu-softmax.md
- Splash Attention:
- Tpu:
- Splash Attention Kernel: generated-pallas_operations-splash_attention-tpu-splash_attention_kernel.md
- Splash Attention Mask: generated-pallas_operations-splash_attention-tpu-splash_attention_mask.md
- Splash Attention Mask Info: generated-pallas_operations-splash_attention-tpu-splash_attention_mask_info.md
- Tpu Flash Attention:
- Gpu:
- Jax Flash Attn Gpu: generated-pallas_operations-tpu_flash_attention-gpu-jax_flash_attn_gpu.md
- Tpu:
- Jax Flash Attn Tpu: generated-pallas_operations-tpu_flash_attention-tpu-jax_flash_attn_tpu.md
- Gpu:
- Flash Attention:
- Mha: generated-pallas_operations-gpu-flash_attention-mha.md
- Layer Norm:
- Layer Norm: generated-pallas_operations-gpu-layer_norm-layer_norm.md
- Rms Norm:
- Rms Norm: generated-pallas_operations-gpu-rms_norm-rms_norm.md
- Softmax:
- Softmax: generated-pallas_operations-gpu-softmax-softmax.md
- Pallas Attention:
- Attention: generated-pallas_operations-pallas_attention-attention.md
- Tpu:
- Flash Attention:
- Flash Attention: generated-pallas_operations-tpu-flash_attention-flash_attention.md
- Paged Attention:
- Paged Attention: generated-pallas_operations-tpu-paged_attention-paged_attention.md
- Ring Attention:
- Ring Attention: generated-pallas_operations-tpu-ring_attention-ring_attention.md
- Splash Attention:
- Splash Attention Kernel: generated-pallas_operations-tpu-splash_attention-splash_attention_kernel.md
- Splash Attention Mask: generated-pallas_operations-tpu-splash_attention-splash_attention_mask.md
- Splash Attention Mask Info: generated-pallas_operations-tpu-splash_attention-splash_attention_mask_info.md
- Sharding:
- Sharding: generated-sharding-sharding.md
- T5x Partitioning: generated-sharding-t5x_partitioning.md
25 changes: 10 additions & 15 deletions pyproject.toml
@@ -1,15 +1,13 @@
[project]
name = "FJFormer"
authors = [
{ name = "Erfan Zare Chavoshi", email = "Erfanzare810@gmail.com" }
]
requires-python = ">=3.8"
name = "fjformer"
authors = [{ name = "Erfan Zare Chavoshi", email = "Erfanzare810@gmail.com" }]
requires-python = ">=3.9"
readme = "README.md"
version = "0.0.66"
version = "0.0.67"

dependencies = [
"jax>=0.4.23",
"jaxlib>=0.4.23",
"jax>=0.4.29",
"jaxlib>=0.4.29",
"optax~=0.2.2",
"msgpack~=1.0.7",
"ipython~=8.17.2",
@@ -36,18 +34,15 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
]
description = "Embark on a journey of paralleled/unparalleled computational prowess with FJFormer - an arsenal of custom Jax Flax Functions and Utils that elevate your AI endeavors to new heights!"

license = { text = "Apache-2.0" }

keywords = [
"JAX", "Torch", "Deep Learning", "Machine Learning", "Flax", "XLA"
]
keywords = ["JAX", "Deep Learning", "Machine Learning", "Flax", "XLA"]

[build-system]
requires = ["setuptools>=46.4.0", "wheel>=0.34.2"]
build-backend = "setuptools.build_meta"
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"

[tool.setuptools.packages]

4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
jax>=0.4.23
jaxlib>=0.4.23
jax>=0.4.29
jaxlib>=0.4.29
optax~=0.2.2
msgpack~=1.0.7
ipython~=8.17.2
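
The dependency floor moves from `jax`/`jaxlib` 0.4.23 to 0.4.29. A quick way to confirm an existing environment meets the new floor (assuming both packages import cleanly) is:

```python
# Sanity check that the installed jax/jaxlib satisfy the new >=0.4.29 floor.
import jax
import jaxlib

def version_tuple(v: str) -> tuple:
    # Keep only the leading numeric components (handles e.g. "0.4.29.dev").
    parts = []
    for piece in v.split("."):
        if not piece.isdigit():
            break
        parts.append(int(piece))
    return tuple(parts)

print("jax:", jax.__version__, "| jaxlib:", jaxlib.__version__)
assert version_tuple(jax.__version__) >= (0, 4, 29)
assert version_tuple(jaxlib.__version__) >= (0, 4, 29)
```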
0 comments on commit 784e874
