|
151 | 151 | "meta-llama/Meta-Llama-3-8B-Instruct",
|
152 | 152 | "meta-llama/Meta-Llama-3-70B",
|
153 | 153 | "meta-llama/Meta-Llama-3-70B-Instruct",
|
154 |
| - "meta-llama/Llama-3.2-1B", |
155 |
| - "meta-llama/Llama-3.2-3B", |
156 |
| - "meta-llama/Llama-3.2-1B-Instruct", |
157 |
| - "meta-llama/Llama-3.2-3B-Instruct", |
158 | 154 | "meta-llama/Llama-3.1-70B",
|
159 | 155 | "meta-llama/Llama-3.1-8B",
|
160 | 156 | "meta-llama/Llama-3.1-8B-Instruct",
|
161 | 157 | "meta-llama/Llama-3.1-70B-Instruct",
|
| 158 | + "meta-llama/Llama-3.2-1B", |
| 159 | + "meta-llama/Llama-3.2-3B", |
| 160 | + "meta-llama/Llama-3.2-1B-Instruct", |
| 161 | + "meta-llama/Llama-3.2-3B-Instruct", |
| 162 | + "meta-llama/Llama-3.3-70B-Instruct", |
162 | 163 | "Baidicoot/Othello-GPT-Transformer-Lens",
|
163 | 164 | "bert-base-cased",
|
164 | 165 | "roneneldan/TinyStories-1M",
|
@@ -960,6 +961,30 @@ def convert_hf_model_config(model_name: str, **kwargs):
|
960 | 961 | "NTK_by_parts_high_freq_factor": 4.0,
|
961 | 962 | "NTK_by_parts_factor": 32.0,
|
962 | 963 | }
|
| 964 | + elif "Llama-3.3-70B" in official_model_name: |
| 965 | + cfg_dict = { |
| 966 | + "d_model": 8192, |
| 967 | + "d_head": 128, |
| 968 | + "n_heads": 64, |
| 969 | + "d_mlp": 28672, |
| 970 | + "n_layers": 80, |
| 971 | + "n_ctx": 2048, # capped due to memory issues |
| 972 | + "eps": 1e-5, |
| 973 | + "d_vocab": 128256, |
| 974 | + "act_fn": "silu", |
| 975 | + "n_key_value_heads": 8, |
| 976 | + "normalization_type": "RMS", |
| 977 | + "positional_embedding_type": "rotary", |
| 978 | + "rotary_adjacent_pairs": False, |
| 979 | + "rotary_dim": 32, |
| 980 | + "final_rms": True, |
| 981 | + "gated_mlp": True, |
| 982 | + "rotary_base": 500000.0, |
| 983 | + "use_NTK_by_parts_rope": True, |
| 984 | + "NTK_by_parts_low_freq_factor": 1.0, |
| 985 | + "NTK_by_parts_high_freq_factor": 4.0, |
| 986 | + "NTK_by_parts_factor": 8.0, |
| 987 | + } |
963 | 988 | elif "Llama-3.1-8B" in official_model_name:
|
964 | 989 | cfg_dict = {
|
965 | 990 | "d_model": 4096,
|
|
0 commit comments