
Commit 4115ba0
Merge remote-tracking branch 'upstream/main' into improve-support-nested-fast-image-proc
2 parents: 8c4f74f + 0f77ca7

109 files changed: +1525 −775 lines


.github/workflows/pr-style-bot.yml (+19)

@@ -0,0 +1,19 @@
+# To run this bot, comment "@bot /style" on a PR
+name: Style Bot
+
+on:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  style:
+    uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
+    with:
+      python_quality_dependencies: "[quality]"
+      style_command_type: "default"
+    secrets:
+      bot_token: ${{ secrets.GITHUB_TOKEN }}

.github/workflows/self-comment-ci.yml (+1 −1)

@@ -29,7 +29,7 @@ jobs:
     runs-on: ubuntu-22.04
     name: Get PR number
     # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
     outputs:
       PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
     steps:

README.md (+5)

@@ -98,7 +98,12 @@ Install Transformers from source if you want the latest changes in the library o
 ```shell
 git clone https://github.com/huggingface/transformers.git
 cd transformers
+
+# pip
 pip install .[torch]
+
+# uv
+uv pip install .[torch]
 ```
 
 ## Quickstart

docs/source/en/generation_strategies.md (+244 −71)

Large diffs are not rendered by default.

docs/source/en/model_doc/albert.md (+1)

@@ -57,6 +57,7 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This
 - Embedding size E is different from hidden size H justified because the embeddings are context independent (one embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V being the vocab size). If E < H, it has less parameters.
 - Layers are split in groups that share parameters (to save memory).
 Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have been swapped or not.
+- The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
 
 ### Using Scaled Dot Product Attention (SDPA)
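For reference, the note added here (and repeated across the model docs below) can be exercised with a short snippet. A minimal sketch, assuming the `albert/albert-base-v2` checkpoint and using `AutoModel` in place of the `XXXModel` placeholder:

```python
# Minimal sketch: head_mask only takes effect with the "eager" attention implementation.
import torch
from transformers import AutoModel, AutoTokenizer

model_id = "albert/albert-base-v2"  # assumed checkpoint, for illustration only
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id, attn_implementation="eager")

inputs = tokenizer("Hello world", return_tensors="pt")
# One row per layer, one column per head: 1.0 keeps a head, 0.0 masks it.
head_mask = torch.ones(model.config.num_hidden_layers, model.config.num_attention_heads)
head_mask[:, 0] = 0.0  # mask the first head in every layer
outputs = model(**inputs, head_mask=head_mask)
print(outputs.last_hidden_state.shape)
```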

docs/source/en/model_doc/bart.md (+1)

@@ -55,6 +55,7 @@ This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The
 * mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token)
 * permute sentences
 * rotate the document to make it start at a specific token
+- The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
 
 ## Implementation Notes

docs/source/en/model_doc/biogpt.md (+1)

@@ -36,6 +36,7 @@ This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The
 - BioGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left.
 - BioGPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next token in a sequence. Leveraging this feature allows BioGPT to generate syntactically coherent text as it can be observed in the run_generation.py example script.
 - The model can take the `past_key_values` (for PyTorch) as input, which is the previously computed key/value attention pairs. Using this (past_key_values or past) value prevents the model from re-computing pre-computed values in the context of text generation. For PyTorch, see past_key_values argument of the BioGptForCausalLM.forward() method for more information on its usage.
+- The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
 
 ### Using Scaled Dot Product Attention (SDPA)

docs/source/en/model_doc/data2vec.md (+1)

@@ -53,6 +53,7 @@ The original code for vision can be found [here](https://github.com/facebookrese
 - For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction
 - For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.
 - For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction.
+- The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
 
 ### Using Scaled Dot Product Attention (SDPA)

docs/source/en/model_doc/gpt_bigcode.md (+4)

@@ -46,8 +46,12 @@ The main differences compared to GPT2.
 - Merge the key and value caches into one (this changes the format of layer_past/ present, does it risk creating problems?)
 - Use the memory layout (self.num_heads, 3, self.head_dim) instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA. (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original openai-community/gpt2 model).
 
+
 You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575)
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 ## Combining Starcoder and Flash Attention 2
 
 First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.

docs/source/en/model_doc/hubert.md (+1 −1)

@@ -50,7 +50,7 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv
 - Hubert is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
 - Hubert model was fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded
 using [`Wav2Vec2CTCTokenizer`].
-
+- The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
 
 ## Using Flash Attention 2

docs/source/en/model_doc/m2m_100.md (+3)

@@ -51,6 +51,9 @@ multilingual it expects the sequences in a certain format: A special language id
 source and target text. The source text format is `[lang_code] X [eos]`, where `lang_code` is source language
 id for source text and target language id for target text, with `X` being the source or target text.
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 The [`M2M100Tokenizer`] depends on `sentencepiece` so be sure to install it before running the
 examples. To install `sentencepiece` run `pip install sentencepiece`.
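For reference, the `[lang_code] X [eos]` format described in this context is handled by the tokenizer's `src_lang`/`tgt_lang` settings. A minimal sketch, assuming the `facebook/m2m100_418M` checkpoint:

```python
# Minimal sketch: the tokenizer inserts the language ids described above.
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="fr")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

# Source text is encoded as "[lang_code] X [eos]" automatically.
inputs = tokenizer("Life is like a box of chocolates.", return_tensors="pt")
# Force the first generated token to be the target language id.
generated = model.generate(**inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```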

docs/source/en/model_doc/mbart.md (+3)

@@ -35,6 +35,9 @@ You can find all the original mBART checkpoints under the [AI at Meta](https://h
 > [!TIP]
 > Click on the mBART models in the right sidebar for more examples of applying mBART to different language tasks.
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 The example below demonstrates how to translate text with [`Pipeline`] or the [`AutoModel`] class.
 
 <hfoptions id="usage">

docs/source/en/model_doc/musicgen.md (+3)

@@ -62,6 +62,9 @@ python src/transformers/models/musicgen/convert_musicgen_transformers.py \
     --checkpoint small --pytorch_dump_folder /output/path --safe_serialization
 ```
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 ## Generation
 
 MusicGen is compatible with two generation modes: greedy and sampling. In practice, sampling leads to significantly

docs/source/en/model_doc/musicgen_melody.md (+3)

@@ -44,6 +44,9 @@ There are two key differences with MusicGen:
 1. The audio prompt is used here as a conditional signal for the generated audio sample, whereas it's used for audio continuation in [MusicGen](https://huggingface.co/docs/transformers/main/en/model_doc/musicgen).
 2. Conditional text and audio signals are concatenated to the decoder's hidden states instead of being used as a cross-attention signal, as in MusicGen.
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 ## Generation
 
 MusicGen Melody is compatible with two generation modes: greedy and sampling. In practice, sampling leads to significantly better results than greedy, thus we encourage sampling mode to be used where possible. Sampling is enabled by default, and can be explicitly specified by setting `do_sample=True` in the call to [`MusicgenMelodyForConditionalGeneration.generate`], or by overriding the model's generation config (see below).

docs/source/en/model_doc/opt.md (+3)

@@ -41,6 +41,9 @@ Tips:
 - OPT has the same architecture as [`BartDecoder`].
 - Contrary to GPT2, OPT adds the EOS token `</s>` to the beginning of every prompt.
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 ## Resources
 
 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OPT. If you're

docs/source/en/model_doc/qwen2_audio.md (+3)

@@ -40,6 +40,9 @@ The abstract from the paper is the following:
 
 `Qwen2-Audio-7B` and `Qwen2-Audio-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 ### Inference
 
 ```python

docs/source/en/model_doc/sew.md (+3)

@@ -46,6 +46,9 @@ This model was contributed by [anton-l](https://huggingface.co/anton-l).
 - SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using
 [`Wav2Vec2CTCTokenizer`].
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 ## Resources
 
 - [Audio classification task guide](../tasks/audio_classification)

docs/source/en/model_doc/unispeech-sat.md (+3)

@@ -54,6 +54,9 @@ found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT).
 decoded using [`Wav2Vec2CTCTokenizer`].
 - UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks.
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 ## Resources
 
 - [Audio classification task guide](../tasks/audio_classification)

docs/source/en/model_doc/unispeech.md (+3)

@@ -49,6 +49,9 @@ found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech).
 - UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be
 decoded using [`Wav2Vec2CTCTokenizer`].
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 ## Resources
 
 - [Audio classification task guide](../tasks/audio_classification)

docs/source/en/model_doc/vilt.md (+5)

@@ -72,6 +72,11 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
 [[autodoc]] ViltImageProcessor
     - preprocess
 
+## ViltImageProcessorFast
+
+[[autodoc]] ViltImageProcessorFast
+    - preprocess
+
 ## ViltProcessor
 
 [[autodoc]] ViltProcessor
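Since this diff documents the new fast image processor, a minimal usage sketch may help; it assumes the `dandelin/vilt-b32-finetuned-vqa` checkpoint and an installed version that already ships `ViltImageProcessorFast`:

```python
# Minimal sketch: request the fast ViLT image processor (assumes it exists in your installed version).
import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa", use_fast=True)

image = Image.fromarray(np.zeros((384, 384, 3), dtype=np.uint8))  # dummy image
batch = processor(images=image, return_tensors="pt")
print(type(processor).__name__, list(batch.keys()))
```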

docs/source/en/model_doc/wav2vec2.md (+3)

@@ -50,6 +50,9 @@ Note: Meta (FAIR) released a new version of [Wav2Vec2-BERT 2.0](https://huggingf
 - Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
 using [`Wav2Vec2CTCTokenizer`].
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 ## Using Flash Attention 2
 
 Flash Attention 2 is an faster, optimized version of the model.
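For reference, the CTC decoding step mentioned in this context looks roughly like the following; a minimal sketch, assuming the `facebook/wav2vec2-base-960h` checkpoint and a 16 kHz mono waveform:

```python
# Minimal sketch: decode Wav2Vec2 CTC output with the processor-wrapped Wav2Vec2CTCTokenizer.
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

model_id = "facebook/wav2vec2-base-960h"  # assumed checkpoint, for illustration
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

waveform = np.zeros(16000, dtype=np.float32)  # stand-in for one second of real 16 kHz audio
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)
# The tokenizer collapses repeated ids and removes CTC blanks during decoding.
print(processor.batch_decode(predicted_ids))
```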

docs/source/en/model_doc/whisper.md (+3)

@@ -32,6 +32,9 @@ rendered properly in your Markdown viewer.
 
 You can find all the original Whisper checkpoints under the [Whisper](https://huggingface.co/collections/openai/whisper-release-6501bba2cf999715fd953013) collection.
 
+> [!NOTE]
+> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+
 > [!TIP]
 > Click on the Whisper models in the right sidebar for more examples of how to apply Whisper to different audio tasks.

docs/source/en/trainer.md (+3 −3)

@@ -372,14 +372,14 @@ accelerate launch \
 
 ### torch.compile
 
-[torch.compile](./perf_torch_compile) can significantly speed up training and reduce computational overhead. Configure your torch.compile settings in [`TrainingArguments`]. Set `torch.compile` to `True`, and select a backend and compile mode.
+[torch.compile](./perf_torch_compile) can significantly speed up training and reduce computational overhead. Configure your torch.compile settings in [`TrainingArguments`]. Set `torch_compile` to `True`, and select a backend and compile mode.
 
 ```py
 from transformers import TrainingArguments
 
 training_args = TrainingArguments(
-    torch.compile=True,
-    torch.compile_backend="inductor",
+    torch_compile=True,
+    torch_compile_backend="inductor",
     torch_compile_mode="default",
     ...,
 )
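The corrected argument names above can be checked with a short snippet; a minimal sketch, with `output_dir` as an assumed placeholder:

```python
# Minimal sketch: enable torch.compile through TrainingArguments using the corrected names.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="compiled-run",  # assumed placeholder directory
    torch_compile=True,
    torch_compile_backend="inductor",
    torch_compile_mode="default",
)
print(training_args.torch_compile, training_args.torch_compile_backend, training_args.torch_compile_mode)
```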

examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py (+2)

@@ -21,6 +21,7 @@
 from pathlib import Path
 from typing import Optional, Union
 
+import aiohttp
 import datasets
 import torch
 from accelerate import Accelerator
@@ -454,6 +455,7 @@ def main():
             split=train_split_name,
             cache_dir=args.cache_dir,
             trust_remote_code=args.trust_remote_code,
+            storage_options={"client_kwargs": {"timeout": aiohttp.ClientTimeout(total=60 * 60)}},
         )
         datasets_splits.append(dataset_split)
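For reference, the new `storage_options` argument can be used on its own to raise the download/streaming timeout; a minimal sketch, with the dataset name and split as assumed placeholders:

```python
# Minimal sketch: pass an aiohttp client timeout to load_dataset via storage_options.
import aiohttp
from datasets import load_dataset

dataset = load_dataset(
    "librispeech_asr",  # assumed dataset name, for illustration only
    "clean",
    split="validation",
    storage_options={"client_kwargs": {"timeout": aiohttp.ClientTimeout(total=60 * 60)}},  # 1 hour
)
print(dataset)
```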

src/transformers/cache_utils.py (+6 −4)

@@ -464,7 +464,7 @@ def get_max_cache_shape(self) -> Optional[int]:
         """Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length."""
         return None
 
-    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
+    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
         """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format. Used for
         backward compatibility."""
         legacy_cache = ()
@@ -473,7 +473,9 @@ def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
         return legacy_cache
 
     @classmethod
-    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
+    def from_legacy_cache(
+        cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor, torch.FloatTensor]]] = None
+    ) -> "DynamicCache":
         """Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for
         backward compatibility."""
         cache = cls()
@@ -1505,8 +1507,8 @@ def __len__(self):
         """
         return len(self.self_attention_cache)
 
-    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
-        """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format."""
+    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor]]:
+        """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format."""
         legacy_cache = ()
         if len(self.cross_attention_cache) > 0:
             for self_attn, cross_attn in zip(
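The annotations above refer to the legacy tuple-of-(key, value) format; a minimal round-trip sketch, assuming a transformers version where `DynamicCache` still exposes these helpers:

```python
# Minimal sketch: round-trip between the legacy tuple format and DynamicCache.
import torch
from transformers.cache_utils import DynamicCache

batch, heads, seq_len, head_dim = 1, 2, 4, 8
legacy = tuple(
    (torch.zeros(batch, heads, seq_len, head_dim), torch.zeros(batch, heads, seq_len, head_dim))
    for _ in range(3)  # one (key, value) pair per layer
)

cache = DynamicCache.from_legacy_cache(legacy)
print(len(cache))  # number of layers holding cached states
roundtrip = cache.to_legacy_cache()
print(len(roundtrip), roundtrip[0][0].shape)
```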

0 commit comments