151 | 151 |     "meta-llama/Meta-Llama-3-8B-Instruct",
152 | 152 |     "meta-llama/Meta-Llama-3-70B",
153 | 153 |     "meta-llama/Meta-Llama-3-70B-Instruct",
| 154 | +    "meta-llama/Llama-3.2-1B",
| 155 | +    "meta-llama/Llama-3.2-3B",
| 156 | +    "meta-llama/Llama-3.2-1B-Instruct",
| 157 | +    "meta-llama/Llama-3.2-3B-Instruct",
154 | 158 |     "Baidicoot/Othello-GPT-Transformer-Lens",
155 | 159 |     "bert-base-cased",
156 | 160 |     "roneneldan/TinyStories-1M",
@@ -885,6 +889,82 @@ def convert_hf_model_config(model_name: str, **kwargs):
885 | 889 |             "final_rms": True,
886 | 890 |             "gated_mlp": True,
887 | 891 |         }
| 892 | +    elif "Llama-3.2-1B" in official_model_name:
| 893 | +        cfg_dict = {
| 894 | +            "d_model": 2048,
| 895 | +            "d_head": 64,
| 896 | +            "n_heads": 32,
| 897 | +            "d_mlp": 8192,
| 898 | +            "n_layers": 16,
| 899 | +            "n_ctx": 2048,  # capped due to memory issues
| 900 | +            "eps": 1e-5,
| 901 | +            "d_vocab": 128256,
| 902 | +            "act_fn": "silu",
| 903 | +            "n_key_value_heads": 8,
| 904 | +            "normalization_type": "RMS",
| 905 | +            "positional_embedding_type": "rotary",
| 906 | +            "rotary_adjacent_pairs": False,
| 907 | +            "rotary_dim": 64,
| 908 | +            "final_rms": True,
| 909 | +            "gated_mlp": True,
| 910 | +        }
| 911 | +    elif "Llama-3.2-3B" in official_model_name:
| 912 | +        cfg_dict = {
| 913 | +            "d_model": 3072,
| 914 | +            "d_head": 128,
| 915 | +            "n_heads": 24,
| 916 | +            "d_mlp": 8192,
| 917 | +            "n_layers": 28,
| 918 | +            "n_ctx": 2048,  # capped due to memory issues
| 919 | +            "eps": 1e-5,
| 920 | +            "d_vocab": 128256,
| 921 | +            "act_fn": "silu",
| 922 | +            "n_key_value_heads": 8,
| 923 | +            "normalization_type": "RMS",
| 924 | +            "positional_embedding_type": "rotary",
| 925 | +            "rotary_adjacent_pairs": False,
| 926 | +            "rotary_dim": 128,
| 927 | +            "final_rms": True,
| 928 | +            "gated_mlp": True,
| 929 | +        }
| 930 | +    elif "Llama-3.2-1B-Instruct" in official_model_name:
| 931 | +        cfg_dict = {
| 932 | +            "d_model": 2048,
| 933 | +            "d_head": 64,
| 934 | +            "n_heads": 32,
| 935 | +            "d_mlp": 8192,
| 936 | +            "n_layers": 16,
| 937 | +            "n_ctx": 2048,  # capped due to memory issues
| 938 | +            "eps": 1e-5,
| 939 | +            "d_vocab": 128256,
| 940 | +            "act_fn": "silu",
| 941 | +            "n_key_value_heads": 8,
| 942 | +            "normalization_type": "RMS",
| 943 | +            "positional_embedding_type": "rotary",
| 944 | +            "rotary_adjacent_pairs": False,
| 945 | +            "rotary_dim": 64,
| 946 | +            "final_rms": True,
| 947 | +            "gated_mlp": True,
| 948 | +        }
| 949 | +    elif "Llama-3.2-3B-Instruct" in official_model_name:
| 950 | +        cfg_dict = {
| 951 | +            "d_model": 3072,
| 952 | +            "d_head": 128,
| 953 | +            "n_heads": 24,
| 954 | +            "d_mlp": 8192,
| 955 | +            "n_layers": 28,
| 956 | +            "n_ctx": 2048,  # capped due to memory issues
| 957 | +            "eps": 1e-5,
| 958 | +            "d_vocab": 128256,
| 959 | +            "act_fn": "silu",
| 960 | +            "n_key_value_heads": 8,
| 961 | +            "normalization_type": "RMS",
| 962 | +            "positional_embedding_type": "rotary",
| 963 | +            "rotary_adjacent_pairs": False,
| 964 | +            "rotary_dim": 128,
| 965 | +            "final_rms": True,
| 966 | +            "gated_mlp": True,
| 967 | +        }
888 | 968 |     elif architecture == "GPTNeoForCausalLM":
889 | 969 |         cfg_dict = {
890 | 970 |             "d_model": hf_config.hidden_size,
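A minimal usage sketch (not part of this diff): with these entries in place, the new checkpoints load through the standard `HookedTransformer.from_pretrained` path. This assumes a TransformerLens build that includes this change and a Hugging Face token with access to the gated `meta-llama` repositories.

```python
from transformer_lens import HookedTransformer

# Downloads the HF weights and converts them into the TransformerLens
# format, using the cfg_dict added above for "Llama-3.2-1B".
model = HookedTransformer.from_pretrained("meta-llama/Llama-3.2-1B")

# These values come straight from the hard-coded config.
assert model.cfg.n_layers == 16
assert model.cfg.d_model == 2048
assert model.cfg.n_key_value_heads == 8  # grouped-query attention

print(model.generate("The capital of France is", max_new_tokens=10))
```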
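Note on branch order: because `"Llama-3.2-1B"` is a substring of `"Llama-3.2-1B-Instruct"`, the first two branches already match the Instruct checkpoints, so the two `-Instruct` branches are unreachable. That is harmless here, since the base and Instruct dicts are identical, but it does leave the duplicated branches as dead code. The hard-coded values themselves can also be cross-checked against the upstream `config.json` rather than trusted by hand; a sketch of such a check is below (`AutoConfig` fetches only the config file, though the `meta-llama` repos are gated, so a token is still needed):

```python
from transformers import AutoConfig

# Sanity-check the hand-written cfg_dict for Llama-3.2-1B against the
# fields published in the model's config.json.
hf_config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B")

expected = {
    "d_model": 2048,
    "n_heads": 32,
    "d_mlp": 8192,
    "n_layers": 16,
    "n_key_value_heads": 8,
    "d_vocab": 128256,
    "eps": 1e-5,
}
actual = {
    "d_model": hf_config.hidden_size,
    "n_heads": hf_config.num_attention_heads,
    "d_mlp": hf_config.intermediate_size,
    "n_layers": hf_config.num_hidden_layers,
    "n_key_value_heads": hf_config.num_key_value_heads,
    "d_vocab": hf_config.vocab_size,
    "eps": hf_config.rms_norm_eps,
}
assert expected == actual, f"config mismatch: {expected} != {actual}"

# d_head and rotary_dim follow from the attention geometry:
# 2048 / 32 = 64 for the 1B model, 3072 / 24 = 128 for the 3B model.
assert hf_config.hidden_size // hf_config.num_attention_heads == 64
```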