Add TOVA press (#12)

SimJeg · web-flow · commit 3ca0ce42d8e4 · 2024-11-26T13:55:58.000+01:00
* Add TOVA press

* update README

* update docstring

* Address PR comment
diff --git a/README.md b/README.md
@@ -55,10 +55,11 @@ All current presses are training free. We provide the following presses associat
 
 - `RandomPress`: random score
 - `KnormPress`: inverse norm of the key ([paper](https://arxiv.org/abs/2406.11430))
-- `ObservedAttentionPress`: average attention weight observed during in pre-filling phase (similar to [H2O](https://arxiv.org/abs/2306.14048) or [TOVA](https://arxiv.org/abs/2401.06104))
+- `ObservedAttentionPress`: average attention weight observed during the pre-filling phase (similar to [H2O](https://arxiv.org/abs/2306.14048))
 - `SnapKVPress`: average attention weight of the last 64 queries ([paper](https://arxiv.org/abs/2404.14469))
 - `ExpectedAttentionPress` (ours): expected attention weight during the generation phase  (see [this notebook](notebooks/expected_attention.ipynb))
 - `StreamingLLMPress`: keep only the first and last tokens ([paper](https://arxiv.org/abs/2309.17453))
+- `TOVAPress`: attention weight of the last query averaged across heads ([paper](https://arxiv.org/abs/2401.06104))
 
 For a detailed list of existing KV cache compression methods, check [Awesome-KV-Cache-Compression](https://github.com/October2001/Awesome-KV-Cache-Compression) or [Awesome-LLM-Compression](https://github.com/HuangOwen/Awesome-LLM-Compression?tab=readme-ov-file#kv-cache-compression)
 
@@ -186,5 +187,3 @@ press = apply_per_layer_compression(press, compression_ratios=[...])
 
 Check the [demo notebook](notebooks/per_layer_compression_demo.ipynb) for more details.
 </details>
-
-<details><summary> 
diff --git a/kvpress/__init__.py b/kvpress/__init__.py
@@ -11,6 +11,7 @@
 from kvpress.presses.random_press import RandomPress
 from kvpress.presses.snapkv_press import SnapKVPress
 from kvpress.presses.streaming_llm_press import StreamingLLMPress
+from kvpress.presses.tova_press import TOVAPress
 
 __all__ = [
     "BasePress",
@@ -20,6 +21,7 @@
     "RandomPress",
     "SnapKVPress",
     "StreamingLLMPress",
+    "TOVAPress",
     "KVPressTextGenerationPipeline",
     "apply_per_layer_compression",
 ]
diff --git a/kvpress/presses/observed_attention_press.py b/kvpress/presses/observed_attention_press.py
@@ -17,7 +17,7 @@
 class ObservedAttentionPress(BasePress):
     """The observed attention score is defined as the average attention weight over all prompt tokens
     Requires output_attentions=True and attn_implementation="eager" to have access to attentions
-    This approach is related to H2O (https://arxiv.org/abs/2306.14048) and TOVA (https://arxiv.org/abs/2401.06104)
+    This approach is related to H2O (https://arxiv.org/abs/2306.14048).
     """
 
     compression_ratio: float = 0.0
diff --git a/kvpress/presses/tova_press.py b/kvpress/presses/tova_press.py
@@ -0,0 +1,50 @@
+# SPDX-FileCopyrightText: Copyright (c) 1993-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from kvpress.presses.snapkv_press import SnapKVPress
+
+
+@dataclass
+class TOVAPress(SnapKVPress):
+    """
+    TOVA (https://arxiv.org/abs/2401.06104) uses the attention of the last token averaged across heads
+    to estimate the importance of the previous KV pairs. This press was reviewed by Michael Hassid,
+    one of the authors of the TOVA paper.
+
+    Official implementation can be found here: https://github.com/schwartz-lab-NLP/TOVA/blob/main/src/tova_cache.py
+    """
+
+    compression_ratio: float = 0.0
+    window_size: int = 1  # re-use the attention weight computation from SnapKVPress for last token
+
+    def score(
+        self,
+        module: nn.Module,
+        hidden_states: torch.Tensor,
+        keys: torch.Tensor,
+        values: torch.Tensor,
+        attentions: torch.Tensor,
+        kwargs,
+    ) -> torch.Tensor:
+
+        if attentions is not None:
+            attn_weights = attentions[..., -1:, :-1]
+        else:
+            attn_weights = self.compute_window_attention(module, hidden_states, keys)
+
+        # Average across attention heads, then repeat the result once per key/value head (keys.shape[1])
+        scores = attn_weights.mean(1)
+        scores = scores.repeat(1, keys.shape[1], 1)
+
+        # Add back the last token. Use max score to make sure the window is not pruned.
+        # This is a very slight difference from TOVA, which doesn't enforce it, but the
+        # last attention weight is usually very high, so it should not change the results.
+        scores = F.pad(scores, (0, 1), value=scores.max().item())
+
+        return scores
diff --git a/notebooks/wikipedia_demo.ipynb b/notebooks/wikipedia_demo.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -26,6 +26,7 @@
     "    RandomPress,\n",
     "    SnapKVPress,\n",
     "    StreamingLLMPress,\n",
+    "    TOVAPress,\n",
     ")"
    ]
   },
diff --git a/tests/presses/test_presses.py b/tests/presses/test_presses.py
@@ -14,12 +14,13 @@
     RandomPress,
     SnapKVPress,
     StreamingLLMPress,
+    TOVAPress,
 )
 from tests.fixtures import unit_test_model, unit_test_model_output_attention  # noqa: F401
 
 
 def test_presses_run(unit_test_model):  # noqa: F811
-    for cls in [KnormPress, ExpectedAttentionPress, RandomPress, StreamingLLMPress, SnapKVPress]:
+    for cls in [KnormPress, ExpectedAttentionPress, RandomPress, StreamingLLMPress, SnapKVPress, TOVAPress]:
         for compression_ratio in [0.2, 0.4, 0.6, 0.8]:
             press = cls(compression_ratio=compression_ratio)
             if cls == SnapKVPress: