Skip to content

Commit cebd471

Browse files
Merge pull request #106 from menloresearch/update-dev-from-master-2025-05-27-14-47
Sync master with upstream release b5509
2 parents 20607fd + 05f6ac6 commit cebd471

26 files changed

+1640
-729
lines changed

convert_hf_to_gguf.py

Lines changed: 140 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,9 @@ def load_hparams(dir_model: Path):
432432
if "llm_config" in config:
433433
# rename for InternVL
434434
config["text_config"] = config["llm_config"]
435+
if "thinker_config" in config:
436+
# rename for Qwen2.5-Omni
437+
config["text_config"] = config["thinker_config"]["text_config"]
435438
return config
436439

437440
@classmethod
@@ -1121,18 +1124,21 @@ class MmprojModel(ModelBase):
11211124
preprocessor_config: dict[str, Any]
11221125
global_config: dict[str, Any]
11231126

1127+
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"]
1128+
11241129
has_vision_encoder: bool = True # by default
11251130
has_audio_encoder: bool = False
11261131

1132+
# for models having multiple encoders, we need to separate their hparams
1133+
hparams_vision: dict[str, Any] | None = None
1134+
hparams_audio: dict[str, Any] | None = None
1135+
11271136
def __init__(self, *args, **kwargs):
11281137
super().__init__(*args, **kwargs)
11291138

11301139
if self.model_arch != gguf.MODEL_ARCH.MMPROJ:
11311140
raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
11321141

1133-
if self.has_vision_encoder and self.has_audio_encoder:
1134-
raise NotImplementedError("both vision + audio not supported yet")
1135-
11361142
# get n_embd of the text model
11371143
if "text_config" not in self.hparams:
11381144
self.hparams["text_config"] = {}
@@ -1143,22 +1149,32 @@ def __init__(self, *args, **kwargs):
11431149
assert self.n_embd_text > 0, "n_embd not found in hparams"
11441150

11451151
# move vision config to the top level, while preserving the original hparams in global_config
1146-
self.global_config = self.hparams
1152+
import copy
1153+
self.global_config = copy.deepcopy(self.hparams)
1154+
self.hparams_vision = self.get_vision_config()
1155+
self.hparams_audio = self.get_audio_config()
11471156

1148-
if "vision_config" in self.hparams:
1149-
self.hparams = self.hparams["vision_config"]
1150-
elif "audio_config" in self.hparams:
1151-
self.hparams = self.hparams["audio_config"]
1152-
else:
1157+
if self.hparams_vision is None and self.hparams_audio is None:
11531158
raise ValueError("vision_config / audio_config not found in hparams")
11541159

1155-
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth"])
1160+
# for compat with vision-only models
1161+
self.hparams = self.hparams_vision or self.hparams_audio or self.hparams
1162+
1163+
# TODO @ngxson : this is a hack to support both vision and audio encoders: block_count is hard-coded to 128 when both are present
1164+
have_multiple_encoders = self.has_audio_encoder and self.has_vision_encoder
1165+
self.block_count = 128 if have_multiple_encoders else self.find_hparam(self.n_block_keys, True)
11561166
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
11571167

11581168
# load preprocessor config
11591169
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
11601170
self.preprocessor_config = json.load(f)
11611171

1172+
def get_vision_config(self) -> dict[str, Any] | None:
1173+
return self.global_config.get("vision_config")
1174+
1175+
def get_audio_config(self) -> dict[str, Any] | None:
1176+
return self.global_config.get("audio_config")
1177+
11621178
def set_type(self):
11631179
self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)
11641180

@@ -1170,33 +1186,49 @@ def set_gguf_parameters(self):
11701186
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
11711187

11721188
# vision config
1173-
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
1174-
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
1175-
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
1176-
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
1177-
self.gguf_writer.add_vision_block_count(self.block_count)
1178-
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))
1189+
self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
1190+
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
1191+
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
1192+
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
1193+
self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
1194+
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
11791195

11801196
# preprocessor config
11811197
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
11821198
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
11831199

1184-
elif self.has_audio_encoder:
1200+
if self.has_audio_encoder:
11851201
self.gguf_writer.add_clip_has_audio_encoder(True)
11861202
self.gguf_writer.add_audio_projection_dim(self.n_embd_text)
11871203

11881204
# audio config
1189-
self.gguf_writer.add_audio_embedding_length(self.find_hparam(["hidden_size"]))
1190-
self.gguf_writer.add_audio_feed_forward_length(self.find_hparam(["intermediate_size"]))
1191-
self.gguf_writer.add_audio_block_count(self.block_count)
1192-
self.gguf_writer.add_audio_head_count(self.find_hparam(["num_attention_heads"]))
1205+
self.gguf_writer.add_audio_embedding_length(self.find_aparam(["hidden_size"]))
1206+
self.gguf_writer.add_audio_feed_forward_length(self.find_aparam(["intermediate_size"]))
1207+
self.gguf_writer.add_audio_block_count(self.find_aparam(self.n_block_keys))
1208+
self.gguf_writer.add_audio_head_count(self.find_aparam(["num_attention_heads"]))
11931209

11941210
else:
11951211
raise ValueError("MmprojModel must have either vision or audio encoder")
11961212

11971213
def write_vocab(self):
11981214
raise ValueError("MmprojModel does not support vocab writing")
11991215

1216+
def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any:
1217+
assert self.hparams_vision is not None
1218+
return self._find_param(self.hparams_vision, keys, optional)
1219+
1220+
def find_aparam(self, keys: Iterable[str], optional: bool = False) -> Any:
1221+
assert self.hparams_audio is not None
1222+
return self._find_param(self.hparams_audio, keys, optional)
1223+
1224+
def _find_param(self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False) -> Any:
1225+
key = next((k for k in keys if k in obj), None)
1226+
if key is not None:
1227+
return obj[key]
1228+
if optional:
1229+
return None
1230+
raise KeyError(f"could not find any of: {keys}")
1231+
12001232

12011233
@ModelBase.register("GPTNeoXForCausalLM")
12021234
class GPTNeoXModel(TextModel):
@@ -2674,7 +2706,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
26742706
yield from super().modify_tensors(data_torch, name, bid)
26752707

26762708

2677-
@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
2709+
@ModelBase.register(
2710+
"Qwen2VLModel",
2711+
"Qwen2VLForConditionalGeneration",
2712+
"Qwen2_5_VLForConditionalGeneration",
2713+
"Qwen2_5OmniModel",
2714+
)
26782715
class Qwen2VLModel(TextModel):
26792716
model_arch = gguf.MODEL_ARCH.QWEN2VL
26802717

@@ -2692,8 +2729,11 @@ def set_vocab(self):
26922729

26932730
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
26942731
del bid # unused
2695-
if name.startswith("visual."):
2696-
# skip visual tensors
2732+
if name.startswith("thinker."):
2733+
name = name.replace("thinker.", "")
2734+
if name.startswith("visual") or name.startswith("audio") or \
2735+
name.startswith("talker") or name.startswith("token2wav"):
2736+
# skip multimodal tensors
26972737
return []
26982738
return [(self.map_tensor_name(name), data_torch)]
26992739

@@ -2702,21 +2742,27 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27022742
class Qwen2VLVisionModel(MmprojModel):
27032743
def __init__(self, *args, **kwargs):
27042744
super().__init__(*args, **kwargs)
2705-
self.hparams["image_size"] = self.hparams.get("image_size", 560)
2745+
assert self.hparams_vision is not None
2746+
self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
27062747
# rename config.json values
2707-
self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
2708-
self.hparams["num_hidden_layers"] = self.hparams.get("depth")
2709-
if "embed_dim" in self.hparams: # qwen2vl
2710-
self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
2711-
self.hparams["hidden_size"] = self.hparams.get("embed_dim")
2748+
self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
2749+
self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
2750+
if "embed_dim" in self.hparams_vision: # qwen2vl
2751+
self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
2752+
self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
27122753

27132754
def set_gguf_parameters(self):
27142755
super().set_gguf_parameters()
2715-
hparams = self.hparams
2716-
if self.global_config['model_type'] == 'qwen2_vl':
2756+
assert self.hparams_vision is not None
2757+
hparams = self.hparams_vision
2758+
model_type = self.global_config['model_type']
2759+
if model_type == 'qwen2_vl':
27172760
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN2VL)
2718-
elif self.global_config['model_type'] == 'qwen2_5_vl':
2719-
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
2761+
elif model_type == 'qwen2_5_vl' or model_type == 'qwen2_5_omni':
2762+
if model_type == 'qwen2_5_omni':
2763+
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25O)
2764+
else:
2765+
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
27202766
self.gguf_writer.add_vision_use_silu(True)
27212767
# find n_wa_pattern (window attention pattern)
27222768
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
@@ -2774,6 +2820,66 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
27742820
return [] # skip other tensors
27752821

27762822

2823+
@ModelBase.register("Qwen2_5OmniModel")
2824+
class Qwen25OmniModel(Qwen2VLVisionModel):
2825+
has_vision_encoder = True
2826+
has_audio_encoder = True
2827+
2828+
def __init__(self, *args, **kwargs):
2829+
super().__init__(*args, **kwargs)
2830+
assert self.hparams_audio is not None
2831+
self.hparams_audio["hidden_size"] = self.hparams_audio["d_model"]
2832+
self.hparams_audio["intermediate_size"] = self.hparams_audio["encoder_ffn_dim"]
2833+
self.hparams_audio["num_attention_heads"] = self.hparams_audio["encoder_attention_heads"]
2834+
2835+
def set_gguf_parameters(self):
2836+
super().set_gguf_parameters()
2837+
assert self.hparams_audio is not None
2838+
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["num_mel_bins"])
2839+
self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams_audio.get("layer_norm_eps", 1e-5))
2840+
2841+
def get_vision_config(self) -> dict[str, Any] | None:
2842+
return self.global_config["thinker_config"].get("vision_config")
2843+
2844+
def get_audio_config(self) -> dict[str, Any] | None:
2845+
return self.global_config["thinker_config"].get("audio_config")
2846+
2847+
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
2848+
# SinusoidsPositionEmbedding
2849+
assert self.hparams_audio is not None
2850+
max_timescale = 10000
2851+
length = 1500
2852+
channels = self.hparams_audio["hidden_size"]
2853+
log_timescale_increment = np.log(max_timescale) / (channels // 2 - 1)
2854+
inv_timescales = torch.exp(-log_timescale_increment * torch.arange(channels // 2).float())
2855+
scaled_time = torch.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
2856+
pos_embd = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=1).to(dtype=torch.float32)
2857+
yield ("audio_tower.embed_positions.weight", pos_embd)
2858+
2859+
def tensor_force_quant(self, name, new_name, bid, n_dims):
2860+
del bid, new_name, n_dims # unused
2861+
if ".conv" in name and ".weight" in name:
2862+
return gguf.GGMLQuantizationType.F16
2863+
return False
2864+
2865+
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
2866+
if name.startswith("thinker."):
2867+
name = name.replace("thinker.", "")
2868+
2869+
if name.startswith("audio_tower"):
2870+
# process audio tensors
2871+
if "conv1.bias" in name or "conv2.bias" in name:
2872+
# append a trailing size-1 dimension to the conv1 and conv2 bias
2873+
data_torch = data_torch.unsqueeze(-1)
2874+
if "audio_bos_eos_token" in name:
2875+
# this tensor is left unused in transformers code
2876+
# https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809
2877+
return []
2878+
return [(self.map_tensor_name(name), data_torch)]
2879+
2880+
return super().modify_tensors(data_torch, name, bid)
2881+
2882+
27772883
@ModelBase.register("InternVisionModel")
27782884
class InternVisionModel(MmprojModel):
27792885
def set_gguf_parameters(self):

docs/function-calling.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
[chat.h](../common/chat.h) (https://github.com/ggml-org/llama.cpp/pull/9639) adds support for [OpenAI-style function calling](https://platform.openai.com/docs/guides/function-calling) and is used in:
44
- `llama-server` when started w/ `--jinja` flag
5-
- `llama-cli` (WIP: https://github.com/ggml-org/llama.cpp/pull/11556)
65

76
## Universal support w/ Native & Generic handlers
87

docs/multimodal.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,3 +98,12 @@ NOTE: some models may require large context window, for example: `-c 8192`
9898
# note: no pre-quantized GGUF for this model, as they give very poor results
9999
# ref: https://github.com/ggml-org/llama.cpp/pull/13760
100100
```
101+
102+
**Mixed modalities**:
103+
104+
```sh
105+
# Qwen2.5 Omni
106+
# Capabilities: audio input, vision input
107+
(tool_name) -hf ggml-org/Qwen2.5-Omni-3B-GGUF
108+
(tool_name) -hf ggml-org/Qwen2.5-Omni-7B-GGUF
109+
```

ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
129129
option(GGML_LSX "ggml: enable lsx" ON)
130130
option(GGML_RVV "ggml: enable rvv" ON)
131131
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
132+
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
132133
option(GGML_VXE "ggml: enable vxe" ON)
133134

134135
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)

ggml/src/ggml-backend.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1598,6 +1598,9 @@ void ggml_backend_sched_synchronize(ggml_backend_sched_t sched) {
15981598
for (int i = 0; i < sched->n_backends; i++) {
15991599
ggml_backend_synchronize(sched->backends[i]);
16001600
}
1601+
// reset the current copy to 0 so that the graphs will be similar during generation
1602+
// necessary for CUDA graphs
1603+
sched->cur_copy = 0;
16011604
}
16021605

16031606
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,25 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
299299
endif()
300300
endif()
301301
endif()
302+
303+
if (GGML_BACKEND_DL)
304+
if (GGML_NATIVE)
305+
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
306+
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
307+
endif()
308+
309+
# The feature detection code is compiled as a separate target so that
310+
# it can be built without the architecture flags
311+
# Since multiple variants of the CPU backend may be included in the same
312+
# build, using set_source_files_properties() to set the arch flags is not possible
313+
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
314+
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
315+
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
316+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
317+
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
318+
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
319+
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
320+
endif()
302321
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
303322
message(STATUS "PowerPC detected")
304323
if (GGML_NATIVE)
@@ -338,8 +357,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
338357
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
339358
message(STATUS "RISC-V detected")
340359
if (GGML_RVV)
341-
if (GGML_RV_ZFH)
342-
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
360+
if (GGML_XTHEADVECTOR)
361+
list(APPEND ARCH_FLAGS -march=rv64gc_xtheadvector -mabi=lp64d)
362+
elseif (GGML_RV_ZFH)
363+
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
343364
else()
344365
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
345366
endif()
@@ -477,25 +498,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
477498
target_compile_options(${GGML_CPU_NAME} PRIVATE ${ARCH_FLAGS})
478499
target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS})
479500

480-
if (GGML_BACKEND_DL)
481-
if (GGML_NATIVE)
482-
# the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE
483-
message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS")
484-
endif()
485-
486-
# The feature detection code is compiled as a separate target so that
487-
# it can be built without the architecture flags
488-
# Since multiple variants of the CPU backend may be included in the same
489-
# build, using set_source_files_properties() to set the arch flags is not possible
490-
set(GGML_CPU_FEATS_NAME ${GGML_CPU_NAME}-feats)
491-
add_library(${GGML_CPU_FEATS_NAME} OBJECT ggml-cpu/cpu-feats-x86.cpp)
492-
target_include_directories(${GGML_CPU_FEATS_NAME} PRIVATE . .. ../include)
493-
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE ${ARCH_DEFINITIONS})
494-
target_compile_definitions(${GGML_CPU_FEATS_NAME} PRIVATE GGML_BACKEND_DL GGML_BACKEND_BUILD GGML_BACKEND_SHARED)
495-
set_target_properties(${GGML_CPU_FEATS_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON)
496-
target_link_libraries(${GGML_CPU_NAME} PRIVATE ${GGML_CPU_FEATS_NAME})
497-
endif()
498-
499501
if (EMSCRIPTEN)
500502
set_target_properties(${GGML_CPU_NAME} PROPERTIES COMPILE_FLAGS "-msimd128")
501503
endif()

ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,7 +1191,7 @@ static void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
11911191
}
11921192
}
11931193
return;
1194-
#elif defined(__riscv_v_intrinsic)
1194+
#elif defined __riscv_v
11951195
if (__riscv_vlenb() >= QK4_0) {
11961196
const size_t vl = QK4_0;
11971197

@@ -3783,7 +3783,7 @@ static void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c
37833783
}
37843784
return;
37853785
}
3786-
#elif defined(__riscv_v_intrinsic)
3786+
#elif defined __riscv_v
37873787
if (__riscv_vlenb() >= QK4_0) {
37883788
const size_t vl = QK4_0;
37893789

0 commit comments

Comments
 (0)