Commit d9792a9

Authored by degenfabian (Fabian Degen), with bryce13950 (Bryce Meyer)
Fix Bloom-family models producing incorrect outputs when use_past_kv_cache is set to True (#777)
* Fix kv_cache leading to wrong output when used with Bloom models
* Add test for Bloom models when use_past_kv_cache is set to True
* Fix max_length for Hugging Face model in kv_cache test
* Set max_length to 13 for Hugging Face model in kv_cache test
* Use max_new_tokens for Hugging Face model instead of max_length in kv_cache test
* Fix format

Co-authored-by: Bryce Meyer <[email protected]>
Co-authored-by: Fabian Degen <[email protected]>
1 parent 32b87c6 commit d9792a9

File tree

2 files changed: +22 −1 lines changed

tests/acceptance/test_hooked_transformer.py

Lines changed: 20 additions & 0 deletions
@@ -175,6 +175,26 @@ def test_from_pretrained_revision():
         raise AssertionError("Should have raised an error")


+def test_bloom_similarity_with_hf_model_with_kv_cache_activated():
+    tf_model = HookedTransformer.from_pretrained(
+        "bigscience/bloom-560m", default_prepend_bos=False, device="cpu"
+    )
+    hf_model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")
+    hf_tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
+
+    output_tf = tf_model.generate(
+        text, do_sample=False, use_past_kv_cache=True, verbose=False, max_new_tokens=10
+    )
+    output_hf_tokens = hf_model.generate(
+        hf_tokenizer(text, return_tensors="pt").input_ids,
+        do_sample=False,
+        max_new_tokens=10,
+    )
+    output_hf_str = hf_tokenizer.decode(output_hf_tokens[0], skip_special_tokens=True)
+
+    assert output_tf == output_hf_str
+
+
 def check_norm_folding(
     model_name,
     hf_model=None,
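
The new test relies on names defined elsewhere in the test module (the transformers imports and the `text` prompt). A hypothetical minimal setup for running the comparison standalone might look like the sketch below; the prompt value is a placeholder, not the one the test module actually uses.

from transformers import AutoModelForCausalLM, AutoTokenizer

from transformer_lens import HookedTransformer

# Placeholder prompt; the real test module defines its own `text` value.
text = "Hello, my name is"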

transformer_lens/components/abstract_attention.py

Lines changed: 2 additions & 1 deletion
@@ -229,8 +229,9 @@ def forward(
                 self.cfg.n_heads, key_ctx, self.cfg.device
             )

+            # Take the last query_ctx positions so it also works with past_kv_cache
             attn_scores += self.alibi[
-                :, :query_ctx, :key_ctx
+                :, -query_ctx:, :key_ctx
             ] # [batch, head_index, query_pos, key_pos]
         elif self.cfg.positional_embedding_type == "relative_positional_bias":
             if position_bias is None:
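
Why the slice direction matters: with use_past_kv_cache, only the newest token is passed as a query, so query_ctx is 1 while key_ctx covers all cached positions. Slicing the ALiBi bias with :query_ctx grabs the row for position 0 (no distance penalty at all), while -query_ctx: grabs the row for the actual latest position. Below is a minimal PyTorch sketch of that difference, using a simplified stand-in for the ALiBi tensor; the slope values are illustrative and this is not the library's create_alibi_bias implementation.

import torch

# Toy ALiBi bias: bias[head, q, k] = -slope * (q - k) for k <= q, 0 otherwise.
n_heads, key_ctx = 2, 5
slopes = torch.tensor([0.5, 0.25])                    # one slope per head (illustrative values)
q_idx = torch.arange(key_ctx).view(1, -1, 1)          # query positions
k_idx = torch.arange(key_ctx).view(1, 1, -1)          # key positions
alibi = -slopes.view(-1, 1, 1) * (q_idx - k_idx).clamp(min=0)  # [head, q_pos, k_pos]

# Incremental decoding with a KV cache: one new query, key_ctx cached keys.
query_ctx = 1
wrong = alibi[:, :query_ctx, :key_ctx]   # first row: bias as if the query sat at position 0
right = alibi[:, -query_ctx:, :key_ctx]  # last row: bias for the actual newest position

print(wrong[0, 0])  # all zeros: every key looks equally close to the query
print(right[0, 0])  # [-2.0, -1.5, -1.0, -0.5, 0.0]: the expected distance penalties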
