Changes from all commits (76 commits)
2496f9c
mtmd : support MiniCPM-V 4.6 (#22529)
tc-mb May 6, 2026
3980e04
llama : add missing call to ggml_backend_load_all() (#22752)
angt May 7, 2026
cfff1fc
sycl : fix test script (#22737)
dogunbound May 7, 2026
e358d75
webui: fix flicker issue on dismiss animation on overlay primitives (…
vignesh191 May 7, 2026
97f06e9
codeowners : add ZenDNN backend codeowner (#22772)
z-vishal May 7, 2026
f4b5a2e
webui: fix ?model= URL param race in router mode (#22771)
ServeurpersoCom May 7, 2026
8e52631
model: Add Mimo v2.5 model support (#22493)
AesSedai May 7, 2026
cc97e45
mtmd: fix whisper audio tail truncation by exposing padded buffer to …
ServeurpersoCom May 7, 2026
68380ae
ggml-cpu: Optimized risc-v cpu q1_0 dot
pl752 May 7, 2026
803627f
llama : remove unnecessary seq_id check during state restore (#22797)
ggerganov May 7, 2026
b9afc19
Write a readme on Multi-GPU usage in llama.cpp (#22729)
gaugarg-nv May 7, 2026
ad09224
sycl: add FILL, CUMSUM, DIAG, SOLVE_TRI, SSM_SCAN, GATED_DELTA_NET (#…
aicss-genai May 7, 2026
deab41e
tests: add long-sequence cases and fix inputs for gated_delta_net (#2…
Neroued May 7, 2026
093be62
common/chat : preserve media markers for typed-content templates (#22…
aldehir May 7, 2026
ceb7e14
opencl: add opfilter regex for debugging (#22782)
shaofeiqi May 7, 2026
e43431b
llama : fix device state save/load (#22805)
ggerganov May 7, 2026
aaf4a4d
webui: add option for LLM title generation (#22265)
smugman-dot May 7, 2026
05ff59c
CUDA: batch out_prod inner loop with cublasSgemmStridedBatched (#22651)
leonardHONG May 7, 2026
44dbe8c
model: Support sarashina2.2-vision-3b model (#22103)
samuraieng May 7, 2026
6a2a251
sycl : fix script error (#22795)
arthw May 8, 2026
1d72d87
convert : fix RuntimeError when stripping FP8 KV-cache scales (#22818)
pich May 8, 2026
f3e8d14
opencl: add q4_0 MoE GEMM for Adreno (#22731)
shawngu-quic May 8, 2026
3e941b8
ggml: update SCHED_DEBUG output to use ggml_op_desc() (#22825)
max-krasnyansky May 8, 2026
6d57a49
vulkan: fix spv shadowing (#22760)
miyanyan May 8, 2026
a8fd165
CUDA: lower-case PCI bus id, standardize for ggml (#22820)
JohannesGaessler May 8, 2026
9b2925e
webui: Add Import/Export of Settings configuration + improve architec…
allozaur May 8, 2026
58e68df
cuda: fuse snake activation (mul, sin, sqr, mul, add) (#22667)
ServeurpersoCom May 8, 2026
9dcf835
server: (router) expose child model info from router's /v1/models (#2…
ngxson May 8, 2026
29debb3
server: support Vertex AI compatible API (#22545)
ngxson May 8, 2026
5d6f18a
webui: fix LLM title generation for agentic conversations (#22840)
smugman-dot May 8, 2026
f9cd456
common : revert reasoning budget +inf logit bias (#22740)
aldehir May 8, 2026
9f5f0e6
model : support Gemma4_26B_A4B_NVFP4 (#22804)
ynankani May 8, 2026
4995604
common : do not wrap raw strings in schema parser for tagged parsers …
aldehir May 8, 2026
b46812d
Feature hexagon l2 norm (#22816)
pdhinaka May 8, 2026
c5703e0
sycl: support non-contiguous input in PAD op (#22148)
aicss-genai May 9, 2026
6600172
hexagon: add HTP kernel for GGML_OP_GATED_DELTA_NET (#22837)
wyanzhao May 9, 2026
046e284
Add flash attention MMA / Tiles to support MiMo-V2.5 (#22812)
AesSedai May 9, 2026
4a4f819
sycl: Battlemage AOT build via spir64_gen + MMQ subgroup annotations …
aicss-genai May 9, 2026
6048993
sycl: Q5_K reorder MMVQ/dequant + Q8_0 reorder MMVQ path (#22152)
aicss-genai May 9, 2026
fd89556
[SYCL] Add BF16 support to GET_ROWS operation (#21391)
devedse May 9, 2026
e20b839
SYCL: reduce allocation overhead during flash attention (#22732)
sanmai May 9, 2026
5757c4d
cmake : update BoringSSL to 0.20260508.0 (#22839)
cabelo May 9, 2026
00d56b1
docker : upgraded the default intel compute-runtime version (#22567)
WizardlyBump17 May 9, 2026
65d7a8b
devops : updated Nix systems (#22869)
yuannan May 9, 2026
1e5ad35
model : add sarvam_moe architecture support (#20275)
sumitchatterjee13 May 9, 2026
5755a10
model : fix model type check for granite/llama3 and deepseek2/glm4.7 …
CISC May 10, 2026
f3c3e0e
internal AllReduce kernel for CUDA provider (#22299)
scutler-nv May 10, 2026
efbada9
ggml : bump version to 0.11.1 (ggml/1484)
ggerganov May 10, 2026
0b04728
sync : ggml
ggerganov May 10, 2026
2b2babd
ggml-virtgpu : include missing mutex header (#22810)
olliewalsh May 10, 2026
5d5d2e1
vendor : update cpp-httplib to 0.43.4 (#22888)
cabelo May 10, 2026
2e97c5f
backend sampling: support returning post-sampling probs (#22622)
TimNN May 10, 2026
389ff61
server : print warning when HTTP timeout exceeded (#22907)
ggerganov May 10, 2026
7d442ab
[SYCL] Add OP im2col_3d (#22903)
arthw May 11, 2026
8383743
vendor : update cpp-httplib to 0.44.0 (#22919)
cabelo May 11, 2026
f5636f8
convert : add image break token fallback (#22914)
danbev May 11, 2026
8cef820
CUDA: directly include cuda/iterator (#22936)
ORippler May 11, 2026
dd9280a
vulkan: Support asymmetric FA in scalar/mmq/coopmat1 paths (#22589)
jeffbolznv May 11, 2026
7dbb0e9
examples : update args speculative-simple README.md [no ci] (#22938)
danbev May 11, 2026
928b486
ggml-virtgpu: Add a GHA build check (#22943)
kpouget May 11, 2026
68e7ea3
spec : parallel drafting support (#22838)
ggerganov May 11, 2026
ef22b3e
docs: fix metrics endpoint description in server README (#22879)
willjoha May 11, 2026
e936660
Ggml/cuda snake fusion hardening (#22912)
ServeurpersoCom May 11, 2026
8e1f9d0
CUDA: handle OW > 65535 in im2col (2D and 3D) (#22944)
CrispStrobe May 11, 2026
1ec7ba0
opencl: add q4_1 MoE for Adreno (#22856)
shawngu-quic May 11, 2026
da44953
metal : promote mul_mv/mul_mm batch divisors to function constants (#…
guyfischman May 12, 2026
78fbbc2
convert : add split() to LoraTorchTensor in LoRA converter (#22832)
jesus-talavera-ibm May 12, 2026
4178259
mtmd: add MiMo v2.5 vision (#22883)
AesSedai May 12, 2026
fa62042
ci : bump ty to 0.0.35 (#22961)
CISC May 12, 2026
706fbd8
vulkan: Check shared memory size for mmq shaders (#22693)
jeffbolznv May 12, 2026
ef93e98
vulkan: Fix Windows performance regression on Intel GPU BF16 workload…
rillomas May 12, 2026
fde69a3
examples : add llama-eval (#21152)
ggerganov May 12, 2026
89730c8
model-conversion : add causal-convert-mmproj target [no ci] (#22969)
danbev May 12, 2026
438f7e3
ggml-cpu: add rvv 512b,1024b impls for iq4_xs
taimur-10x Feb 13, 2026
dc05e35
ggml-cpu: refactor; add rvv 512b, 1024b impls for q6_K, i-quants
taimur-10x Feb 14, 2026
1e611d6
ggml-cpu: refactor; add 512 and 1024 implementations of tq3_s, iq3_xx…
RehanQasim-dev Feb 24, 2026
8 changes: 4 additions & 4 deletions .devops/intel.Dockerfile
@@ -33,10 +33,10 @@ RUN mkdir -p /app/full \

FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base

ARG IGC_VERSION=v2.30.1
ARG IGC_VERSION_FULL=2_2.30.1+20950
ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1
ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0
ARG IGC_VERSION=v2.32.7
ARG IGC_VERSION_FULL=2_2.32.7+21184
ARG COMPUTE_RUNTIME_VERSION=26.14.37833.4
ARG COMPUTE_RUNTIME_VERSION_FULL=26.14.37833.4-0
ARG IGDGMM_VERSION=22.9.0
RUN mkdir /tmp/neo/ && cd /tmp/neo/ \
    && wget https://github.com/intel/intel-graphics-compiler/releases/download/$IGC_VERSION/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \
2 changes: 1 addition & 1 deletion .devops/nix/package.nix
@@ -103,6 +103,7 @@ let
    vulkan-headers
    vulkan-loader
    shaderc
    spirv-headers
  ];
in

@@ -146,7 +147,6 @@ effectiveStdenv.mkDerivation (finalAttrs: {
    ninja
    pkg-config
    git
    spirv-headers
  ]
  ++ optionals useCuda [
    cudaPackages.cuda_nvcc
50 changes: 50 additions & 0 deletions .github/workflows/build-virtgpu.yml
@@ -0,0 +1,50 @@
name: CI (virtgpu)

on:
  workflow_dispatch: # allows manual triggering
  push:
    branches:
      - master
    paths: [
      '.github/workflows/build-virtgpu.yml',
      '**/CMakeLists.txt',
      '**/.cmake',
      '**/*.h',
      '**/*.hpp',
      '**/*.c',
      '**/*.cpp'
    ]

  pull_request:
    types: [opened, synchronize, reopened]
    paths: [
      '.github/workflows/build-virtgpu.yml',
      'ggml/src/ggml-virtgpu/**'
    ]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  ubuntu-24-virtgpu:
    runs-on: ${{ 'ubuntu-24.04-arm' || 'ubuntu-24.04' }}

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install -y build-essential libdrm-dev pkg-config libssl-dev

      - name: Build
        id: cmake_build
        run: |
          cmake -B build \
            -DGGML_VIRTGPU=ON \
            -DGGML_VIRTGPU_BACKEND=ON
          cmake --build build --config Release -j $(nproc)
2 changes: 1 addition & 1 deletion .github/workflows/python-type-check.yml
@@ -31,7 +31,7 @@ jobs:
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
          pip-install: -r requirements/requirements-all.txt ty==0.0.33
          pip-install: -r requirements/requirements-all.txt ty==0.0.35
      # - name: Type-check with Pyright
      #   uses: jakebailey/pyright-action@v2
      #   with:
1 change: 1 addition & 0 deletions .gitignore
@@ -110,6 +110,7 @@ uv.lock

# Nix

flake.lock
/result

# Test binaries
1 change: 1 addition & 0 deletions CODEOWNERS
@@ -76,6 +76,7 @@
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
/ggml/src/ggml-zendnn/ @avinashcpandey @Jiten1parmar @z-vishal
/ggml/src/ggml.c @ggerganov
/ggml/src/ggml.cpp @ggerganov
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
1 change: 1 addition & 0 deletions README.md
@@ -529,6 +529,7 @@ To learn more about model quantization, [read this documentation](tools/quantize
- [How to build](docs/build.md)
- [Running on Docker](docs/docker.md)
- [Build on Android](docs/android.md)
- [Multi-GPU usage](docs/multi-gpu.md)
- [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)

41 changes: 5 additions & 36 deletions common/arg.cpp
@@ -622,10 +622,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
            string_process_escapes(seq_breaker);
        }
        for (auto & pair : params.speculative.draft.replacements) {
            string_process_escapes(pair.first);
            string_process_escapes(pair.second);
        }
    }

    if (!params.kv_overrides.empty()) {
@@ -3518,13 +3514,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.speculative.draft.p_min = std::stof(value);
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_P_MIN"));
    add_opt(common_arg(
        {"--spec-draft-ctx-size", "-cd", "--ctx-size-draft"}, "N",
        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.draft.n_ctx),
        [](common_params & params, int value) {
            params.speculative.draft.n_ctx = value;
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_CTX_SIZE"));
    add_opt(common_arg(
        {"--spec-draft-device", "-devd", "--device-draft"}, "<dev1,dev2,..>",
        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -3561,32 +3550,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL"));
    add_opt(common_arg(
        {"--spec-draft-replace", "--spec-replace"}, "TARGET", "DRAFT",
        "translate the string in TARGET into DRAFT if the draft model and main model are not compatible",
        [](common_params & params, const std::string & tgt, const std::string & dft) {
            params.speculative.draft.replacements.push_back({ tgt, dft });
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
    add_opt(common_arg(
        {"--spec-type"}, "[none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]",
        {"--spec-type"}, common_speculative_all_types_str(),
        string_format("type of speculative decoding to use when no draft model is provided (default: %s)\n",
            common_speculative_type_to_str(params.speculative.type).c_str()),
            common_speculative_type_name_str(params.speculative.types).c_str()),
        [](common_params & params, const std::string & value) {
            if (value == "none") {
                params.speculative.type = COMMON_SPECULATIVE_TYPE_NONE;
            } else if (value == "ngram-cache") {
                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_CACHE;
            } else if (value == "ngram-simple") {
                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE;
            } else if (value == "ngram-map-k") {
                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
            } else if (value == "ngram-map-k4v") {
                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V;
            } else if (value == "ngram-mod") {
                params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
            } else {
                throw std::invalid_argument("unknown speculative decoding type without draft model");
            }
            const auto enabled_types = string_split<std::string>(value, ',');
            params.speculative.types = common_speculative_types_from_names(enabled_types);
        }
    ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_TYPE"));
    add_opt(common_arg(
@@ -4075,7 +4044,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--spec-default"},
string_format("enable default speculative decoding config"),
[](common_params & params) {
params.speculative.type = COMMON_SPECULATIVE_TYPE_NGRAM_MOD;
params.speculative.types = { COMMON_SPECULATIVE_TYPE_NGRAM_MOD };
params.speculative.ngram_mod.n_match = 24;
params.speculative.ngram_mod.n_min = 48;
params.speculative.ngram_mod.n_max = 64;
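The hunks above replace the single-value --spec-type handler with a comma-separated list: the value is split on ',' and mapped to a set of speculative types via common_speculative_types_from_names(). Below is a minimal, self-contained sketch of that parsing pattern; the spec_type enum, split_csv() and types_from_names() are hypothetical stand-ins for llama.cpp's common_speculative_type, string_split and common_speculative_types_from_names, not the real definitions.

// Sketch of the comma-separated --spec-type parsing pattern (stand-ins only).
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

enum spec_type { SPEC_NGRAM_SIMPLE, SPEC_NGRAM_MOD };

static std::vector<std::string> split_csv(const std::string & s) {
    std::vector<std::string> out;
    std::stringstream ss(s);
    std::string item;
    while (std::getline(ss, item, ',')) {
        out.push_back(item);
    }
    return out;
}

static std::vector<spec_type> types_from_names(const std::vector<std::string> & names) {
    std::vector<spec_type> out;
    for (const auto & n : names) {
        if (n == "ngram-simple") {
            out.push_back(SPEC_NGRAM_SIMPLE);
        } else if (n == "ngram-mod") {
            out.push_back(SPEC_NGRAM_MOD);
        } else {
            // mirrors the old handler's error on an unknown name
            throw std::invalid_argument("unknown speculative decoding type: " + n);
        }
    }
    return out;
}

int main() {
    // e.g. --spec-type ngram-simple,ngram-mod now enables both strategies
    const auto types = types_from_names(split_csv("ngram-simple,ngram-mod"));
    return types.size() == 2 ? 0 : 1;
}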
4 changes: 1 addition & 3 deletions common/chat-auto-parser-generator.cpp
@@ -369,9 +369,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
arguments.name_suffix) +
arguments.value_prefix +
(schema_info.resolves_to_string(param_schema) ?
p.tool_arg_string_value(p.schema(until_suffix,
"tool-" + name + "-arg-" + param_name + "-schema",
param_schema, true)) :
p.tool_arg_string_value(until_suffix) :
p.tool_arg_json_value(p.schema(
p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) +
p.space()) +
2 changes: 1 addition & 1 deletion common/chat.cpp
@@ -80,7 +80,7 @@ json common_chat_msg::to_json_oaicompat(bool concat_typed_text) const {
    if (!content.empty()) {
        jmsg["content"] = content;
    } else if (!content_parts.empty()) {
        if (concat_typed_text) {
        if (concat_typed_text || contains_media()) {
            std::string text;
            bool last_was_media_marker = false;
            // join parts with newline, do not add newline before or after media markers
9 changes: 9 additions & 0 deletions common/chat.h
@@ -94,6 +94,15 @@ struct common_chat_msg {
               tool_name.empty() && tool_call_id.empty();
    }

    bool contains_media() const {
        for (const auto & part : content_parts) {
            if (part.type == "media_marker") {
                return true;
            }
        }
        return false;
    }

    void set_tool_call_ids(std::vector<std::string> & ids_cache,
                           const std::function<std::string()> & gen_tool_call_id) {
        for (auto i = 0u; i < tool_calls.size(); i++) {
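The chat.h and chat.cpp hunks above work together: contains_media() detects a "media_marker" content part, and to_json_oaicompat() now flattens typed content whenever such a marker is present, so the marker survives in the joined text. A trimmed, self-contained sketch of that interaction follows; chat_part and chat_msg are stand-ins for the real common_chat_msg types, and the "<__media__>" string is only an illustrative marker value.

#include <string>
#include <vector>

// stand-ins for common_chat_msg's content-part representation
struct chat_part {
    std::string type;
    std::string text;
};

struct chat_msg {
    std::vector<chat_part> content_parts;

    // same logic as the contains_media() added in chat.h
    bool contains_media() const {
        for (const auto & part : content_parts) {
            if (part.type == "media_marker") {
                return true;
            }
        }
        return false;
    }
};

int main() {
    chat_msg msg;
    msg.content_parts = {
        { "text",         "describe this image:" },
        { "media_marker", "<__media__>"          },
    };

    // mirrors the chat.cpp condition: flatten to plain text when the caller
    // asks for it OR when a media marker must be preserved in-line
    const bool concat_typed_text = false;
    const bool flatten = concat_typed_text || msg.contains_media();

    return flatten ? 0 : 1; // returns 0 here: media forces flattening
}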
101 changes: 100 additions & 1 deletion common/common.cpp
@@ -1422,7 +1422,7 @@ common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx) {

    // try to remove the last tokens
    if (!llama_memory_seq_rm(mem, 0, 1, -1)) {
        LOG_WRN("%s: the target context does not support partial sequence removal\n", __func__);
        LOG_WRN("%s: the context does not support partial sequence removal\n", __func__);
        res = COMMON_CONTEXT_SEQ_RM_TYPE_FULL;
        goto done;
    }
@@ -1960,3 +1960,102 @@ bool common_prompt_batch_decode(

    return true;
}

size_t common_prompt_checkpoint::size() const {
    return data_tgt.size() + data_dft.size();
}

bool common_prompt_checkpoint::empty() const {
    return data_tgt.empty();
}

void common_prompt_checkpoint::clear() {
    n_tokens = 0;

    pos_min = 0;
    pos_max = 0;

    data_tgt.clear();
    data_dft.clear();
}

void common_prompt_checkpoint::update_pos(
        int64_t n_tokens,
        llama_pos pos_min,
        llama_pos pos_max) {
    this->n_tokens = n_tokens;
    this->pos_min = pos_min;
    this->pos_max = pos_max;
}

void common_prompt_checkpoint::update_tgt(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_state_seq_flags flags) {
    if (ctx == nullptr) {
        return;
    }

    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);

    data_tgt.resize(ckpt_size);

    const size_t n = llama_state_seq_get_data_ext(ctx, data_tgt.data(), ckpt_size, seq_id, flags);
    if (n != ckpt_size) {
        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
    }
}

void common_prompt_checkpoint::update_dft(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_state_seq_flags flags) {
    if (ctx == nullptr) {
        return;
    }

    const size_t ckpt_size = llama_state_seq_get_size_ext(ctx, seq_id, flags);

    data_dft.resize(ckpt_size);

    const size_t n = llama_state_seq_get_data_ext(ctx, data_dft.data(), ckpt_size, seq_id, flags);
    if (n != ckpt_size) {
        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", ckpt_size, n);
    }
}

void common_prompt_checkpoint::load_tgt(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_state_seq_flags flags) const {
    if (ctx == nullptr) {
        return;
    }

    if (data_tgt.empty()) {
        return;
    }

    const size_t n = llama_state_seq_set_data_ext(ctx, data_tgt.data(), data_tgt.size(), seq_id, flags);
    if (n != data_tgt.size()) {
        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_tgt.size(), n);
    }
}

void common_prompt_checkpoint::load_dft(
        llama_context * ctx,
        llama_seq_id seq_id,
        llama_state_seq_flags flags) const {
    if (ctx == nullptr) {
        return;
    }

    if (data_dft.empty()) {
        return;
    }

    const size_t n = llama_state_seq_set_data_ext(ctx, data_dft.data(), data_dft.size(), seq_id, flags);
    if (n != data_dft.size()) {
        GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", data_dft.size(), n);
    }
}
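The new common_prompt_checkpoint helpers above follow llama.cpp's usual state round trip: query the serialized size with llama_state_seq_get_size_ext, copy the bytes out with llama_state_seq_get_data_ext, and later write them back with llama_state_seq_set_data_ext, aborting if the byte counts disagree. A minimal sketch of that save/restore flow follows; the fake_state_* functions are stand-ins for those llama_state_seq_*_ext calls, which require a live llama_context.

// Sketch of the checkpoint save/restore round trip (stand-in state source).
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

static std::vector<uint8_t> g_state = {1, 2, 3, 4}; // pretend sequence state

static size_t fake_state_get_size() { return g_state.size(); }

static size_t fake_state_get_data(uint8_t * dst, size_t size) {
    memcpy(dst, g_state.data(), size);
    return size;
}

static size_t fake_state_set_data(const uint8_t * src, size_t size) {
    g_state.assign(src, src + size);
    return size;
}

struct prompt_checkpoint {
    std::vector<uint8_t> data_tgt;

    void update_tgt() {
        // query size, then copy the serialized state out
        const size_t ckpt_size = fake_state_get_size();
        data_tgt.resize(ckpt_size);
        const size_t n = fake_state_get_data(data_tgt.data(), ckpt_size);
        assert(n == ckpt_size); // common.cpp aborts on mismatch
    }

    void load_tgt() const {
        if (data_tgt.empty()) {
            return; // nothing saved yet, mirrors the early return above
        }
        const size_t n = fake_state_set_data(data_tgt.data(), data_tgt.size());
        assert(n == data_tgt.size());
    }
};

int main() {
    prompt_checkpoint ckpt;
    ckpt.update_tgt();  // snapshot the current state
    g_state = {9, 9};   // state diverges (e.g. more tokens decoded)
    ckpt.load_tgt();    // roll back to the checkpoint
    return g_state.size() == 4 ? 0 : 1;
}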