
Commit 5a4cdc1

Merge branch 'main' into kvaishnavi/guidance-schema
2 parents: b421a39 + 90cb011

35 files changed, +1342 -266 lines

.pipelines/stages/jobs/steps/capi-win-step.yml

Lines changed: 3 additions & 2 deletions
@@ -117,7 +117,7 @@ steps:
       workingDirectory: '$(Build.Repository.LocalPath)'

   - powershell: |
-      cmake --build --preset windows_$(arch)_$(ep)_$(build_config) --parallel --target ${{ parameters.target }}
+      cmake --build --preset windows_$(arch)_$(ep)_$(build_config) --parallel
     displayName: 'Build C API'
     workingDirectory: '$(Build.Repository.LocalPath)'

@@ -143,7 +143,8 @@ steps:
     displayName: 'Install wheel'

   - powershell: |
-      cmake --build --preset windows_$(arch)_$(ep)_$(build_config) --target package
+      cd $(Build.Repository.LocalPath)\build\$(ep)\$(os)-$(arch)
+      cpack -C $(build_config)
     displayName: 'Package C/C++ API'
     workingDirectory: '$(Build.Repository.LocalPath)'

CMakeLists.txt

Lines changed: 6 additions & 1 deletion
@@ -269,6 +269,11 @@ if(ANDROID)
   # strip the binary if it's not a build with debug info
   set_target_properties(onnxruntime-genai PROPERTIES LINK_FLAGS_RELEASE -s)
   set_target_properties(onnxruntime-genai PROPERTIES LINK_FLAGS_MINSIZEREL -s)
+
+  # Build shared libraries with support for 16 KB page size on Android
+  # https://source.android.com/docs/core/architecture/16kb-page-size/16kb#build-lib-16kb-alignment
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,max-page-size=16384")
+  set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-z,max-page-size=16384")
 endif()

 if(ENABLE_TESTS)

@@ -294,4 +299,4 @@ endif()
 # Have visual studio put all files into one single folder vs the default split of header files into a separate folder
 source_group(TREE ${GENERATORS_ROOT} FILES ${generator_srcs})

-include(cmake/package.cmake)
+include(cmake/package.cmake)
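The new `-Wl,-z,max-page-size=16384` flags request 16 KB alignment for the LOAD segments of the Android shared libraries, which is what the linked Android guidance requires. As a rough way to see the effect, the following sketch (not part of this change; the library path is a placeholder and a little-endian 64-bit ELF is assumed) reads the program headers of a built `.so` and reports the smallest PT_LOAD alignment:

```python
# Sketch: report the smallest PT_LOAD segment alignment of a 64-bit ELF shared
# library. A value of at least 0x4000 (16 KB) is what the linker flags above aim for.
import struct

PT_LOAD = 1

def min_load_alignment(path: str) -> int:
    with open(path, "rb") as f:
        ident = f.read(16)
        assert ident[:4] == b"\x7fELF" and ident[4] == 2, "expected a 64-bit ELF file"
        f.seek(32)                                   # e_phoff sits at offset 32 in the ELF64 header
        (e_phoff,) = struct.unpack("<Q", f.read(8))
        f.seek(54)                                   # e_phentsize and e_phnum follow at offset 54
        e_phentsize, e_phnum = struct.unpack("<HH", f.read(4))
        alignments = []
        for i in range(e_phnum):
            f.seek(e_phoff + i * e_phentsize)
            (p_type,) = struct.unpack("<I", f.read(4))
            if p_type != PT_LOAD:
                continue
            f.seek(e_phoff + i * e_phentsize + 48)   # p_align is the final 8-byte field
            (p_align,) = struct.unpack("<Q", f.read(8))
            alignments.append(p_align)
        return min(alignments) if alignments else 0

if __name__ == "__main__":
    align = min_load_alignment("libonnxruntime-genai.so")  # placeholder path
    print(f"smallest PT_LOAD alignment: {align:#x}")
    print("16 KB page-size compatible" if align >= 0x4000 else "not 16 KB aligned")
```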

README.md

Lines changed: 72 additions & 38 deletions
@@ -1,29 +1,5 @@
 # ONNX Runtime GenAI

-Note: between `v0.11.0` and `v0.10.1`, there is a breaking API usage change to improve model quality during multi-turn conversations.
-
-Previously, the decoding loop could be written as follows.
-
-```
-while not IsDone():
-  GenerateToken()
-  GetLastToken()
-  PrintLastToken()
-```
-
-In 0.11.0, the decoding loop should now be written as follows.
-
-```
-while True:
-  GenerateToken()
-  if IsDone():
-    break
-  GetLastToken()
-  PrintLastToken()
-```
-
-Please read [this PR's description](https://github.com/microsoft/onnxruntime-genai/pull/1849) for more information.
-
 ## Status

 [![Latest version](https://img.shields.io/nuget/vpre/Microsoft.ML.OnnxRuntimeGenAI.Managed?label=latest)](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntimeGenAI.Managed/absoluteLatest)
@@ -32,20 +8,22 @@ Please read [this PR's description](https://github.com/microsoft/onnxruntime-genai/pull/1849) for more information.

 ## Description

-Run generative AI models with ONNX Runtime. This API gives you an easy, flexible and performant way of running LLMs on device. It implements the generative AI loop for ONNX models, including pre and post processing, inference with ONNX Runtime, logits processing, search and sampling, and KV cache management.
+Run generative AI models with ONNX Runtime. This API gives you an easy, flexible and performant way of running LLMs on device. It implements the generative AI loop for ONNX models, including pre and post processing, inference with ONNX Runtime, logits processing, search and sampling, KV cache management, and grammar specification for tool calling.
+
+ONNX Runtime GenAI powers Foundry Local, Windows ML, and the Visual Studio Code AI Toolkit.

 See documentation at the [ONNX Runtime website](https://onnxruntime.ai/docs/genai) for more details.

-|Support matrix|Supported now|Under development|On the roadmap|
+| Support matrix | Supported now | Under development | On the roadmap |
 | -------------- | ------------- | ----------------- | -------------- |
-| Model architectures | AMD OLMo <br/> ChatGLM <br/> DeepSeek <br/> ERNIE 4.5 <br/> Gemma <br/> gpt-oss <br/> Granite <br/> Llama <br/> Mistral <br/> Nemotron <br/> Phi (language + vision) <br/> Qwen <br/> SmolLM3 <br/> Whisper | Stable diffusion | Multi-modal models |
+| Model architectures | ChatGLM <br/> DeepSeek <br/> Ernie <br/> Fara <br/> Gemma <br/> GPTOSS <br/> Granite <br/> Llama <br/> Mistral <br/> Nemotron <br/> OLMo <br/> Phi <br/> Phi3V <br/> Phi4MM <br/> Qwen <br/> Qwen-2.5VL <br/> SmolLM3 <br/> Whisper | Stable diffusion | |
 | API | Python <br/> C# <br/> C/C++ <br/> Java ^ | Objective-C | |
-| Platform | Linux <br/> Windows <br/> Mac ^ <br/> Android ^ | | iOS |
-| Architecture | x86 <br/> x64 <br/> Arm64 ~ | | |
+| O/S | Linux <br/> Windows <br/> Mac <br/> Android | | iOS |
+| Architecture | x86 <br/> x64 <br/> arm64 | | |
 | Hardware Acceleration | CPU <br/> CUDA <br/> DirectML <br/> NvTensorRtRtx (TRT-RTX) <br/> OpenVINO <br/> QNN <br/> WebGPU | | AMD GPU |
 | Features | Multi-LoRA <br/> Continuous decoding <br/> Constrained decoding | | Speculative decoding |

-\~ Windows builds available, requires build from source for other platforms
+^ Requires build from source

 ## Installation

@@ -60,7 +38,7 @@ See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install)
    ```

 2. Install the API
-
+
    ```shell
    pip install numpy
    pip install --pre onnxruntime-genai
@@ -113,30 +91,86 @@ See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install)
    del generator
    ```

-### Choosing the Right Examples: Release vs. Main Branch
+### Choose the correct version of the examples

-Due to the evolving nature of this project and ongoing feature additions, examples in the `main` branch may not always align with the latest stable release. This section outlines how to ensure compatibility between the examples and the corresponding version. The majority of the steps would remain same. Just the package installation and the model example file would change.
+Due to the evolving nature of this project and ongoing feature additions, examples in the `main` branch may not always align with the latest stable release. This section outlines how to ensure compatibility between the examples and the corresponding version.

 ### Stable version
-Install the package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). Let's say you installed the 0.10.1 version of ONNX Runtime GenAI, so the instructions would look like this:
+
+Install the package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). For example, install the Python package.
+
+```bash
+pip install onnxruntime-genai
+```
+
+Get the version of the package.
+
+Linux/Mac:
+```bash
+pip list | grep onnxruntime-genai
+```
+
+Windows:
+```bash
+pip list | findstr "onnxruntime-genai"
+```
+
+Check out the version of the examples that corresponds to that release.

 ```bash
 # Clone the repo
 git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
 # Checkout the branch for the version you are using
-git checkout v0.10.1
+git checkout v0.11.4
 cd examples
 ```

-### Nightly version (Main Branch)
-Build the package from source using these [instructions](https://onnxruntime.ai/docs/genai/howto/build-from-source.html). Now just go to the folder location where all the examples are present.
+### Nightly version (main branch)
+
+Check out the main branch of the repo.

 ```bash
-# Clone the repo
 git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
+```
+
+Build from source, using these [instructions](https://onnxruntime.ai/docs/genai/howto/build-from-source.html). For example, to build the Python wheel:
+
+```bash
+python build.py
+```
+
+Navigate to the examples folder in the main branch.
+
+```bash
 cd examples
 ```

+## Breaking API changes
+
+### v0.11.0
+
+Between `v0.11.0` and `v0.10.1`, there is a breaking API usage change to improve model quality during multi-turn conversations.
+
+Previously, the decoding loop could be written as follows.
+
+```
+while not IsDone():
+  GenerateToken()
+  GetLastToken()
+  PrintLastToken()
+```
+
+In 0.11.0, the decoding loop should now be written as follows.
+
+```
+while True:
+  GenerateToken()
+  if IsDone():
+    break
+  GetLastToken()
+  PrintLastToken()
+```
+
 ## Roadmap

 See the [Discussions](https://github.com/microsoft/onnxruntime-genai/discussions) to request new features and up-vote existing requests.
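For reference, the decoding-loop change documented above maps onto the Python package roughly as follows. This is only an illustrative sketch and not part of the commit: the model path is a placeholder, and the calls mirror the `onnxruntime_genai` usage that appears elsewhere in this diff (`append_tokens`, `is_done`, `generate_next_token`).

```python
# Illustrative post-v0.11.0 decoding loop: generate first, then test for
# completion, then read the newest token. Model path is a placeholder.
import onnxruntime_genai as og

model = og.Model("path/to/model")            # placeholder model folder
tokenizer = og.Tokenizer(model)
stream = tokenizer.create_stream()

params = og.GeneratorParams(model)
params.set_search_options(max_length=256)

generator = og.Generator(model, params)
generator.append_tokens(tokenizer.encode("Tell me about ONNX Runtime."))

while True:
    generator.generate_next_token()
    if generator.is_done():
        break
    new_token = generator.get_next_tokens()[0]
    print(stream.decode(new_token), end="", flush=True)
print()
```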

benchmark/c/main.cpp

Lines changed: 21 additions & 18 deletions
@@ -112,20 +112,29 @@ void WriteE2EStats(std::string_view label,
             << "\n";
 }

-std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer, size_t batch_size) {
+static std::unique_ptr<OgaGeneratorParams> MakeGeneratorParams(const benchmark::Options& opts, const OgaModel& model, size_t num_tokens) {
+  auto params = OgaGeneratorParams::Create(model);
+  if (opts.max_length != -1) {
+    auto max_length = num_tokens;
+    if (opts.max_length > 0)
+      max_length = static_cast<size_t>(opts.max_length);
+    params->SetSearchOption("max_length", static_cast<double>(max_length));
+  }
+  params->SetSearchOption("min_length", static_cast<double>(num_tokens));
+  return params;
+}
+
+std::string GeneratePrompt(const benchmark::Options& opts, size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer, size_t batch_size) {
   const char* const base_prompt = "A";
   auto base_prompt_sequences = OgaSequences::Create();
   for (size_t i = 0; i < batch_size; ++i) {
     tokenizer.Encode(base_prompt, *base_prompt_sequences);
   }

-  auto params = OgaGeneratorParams::Create(model);
-  params->SetSearchOption("max_length", static_cast<double>(num_prompt_tokens));
-  params->SetSearchOption("min_length", static_cast<double>(num_prompt_tokens));
-
+  auto params = MakeGeneratorParams(opts, model, num_prompt_tokens);
   auto generator = OgaGenerator::Create(model, *params);
   generator->AppendTokenSequences(*base_prompt_sequences);
-  while (!generator->IsDone()) {
+  while (!generator->IsDone() && num_prompt_tokens-- > 0) {
     generator->GenerateNextToken();
   }

@@ -159,7 +168,7 @@ void RunBenchmark(const benchmark::Options& opts) {

   const auto prompt = [&]() -> std::string {
     if (const size_t* num_prompt_tokens = std::get_if<size_t>(&opts.prompt_num_tokens_or_content)) {
-      return GeneratePrompt(*num_prompt_tokens, *model, *tokenizer, opts.batch_size);
+      return GeneratePrompt(opts, *num_prompt_tokens, *model, *tokenizer, opts.batch_size);
     }
     return std::get<std::string>(opts.prompt_num_tokens_or_content);
   }();
@@ -176,22 +185,15 @@ void RunBenchmark(const benchmark::Options& opts) {

   const size_t num_prompt_tokens = prompt_sequences->SequenceCount(0);
   const size_t num_tokens = num_prompt_tokens + opts.num_tokens_to_generate;
-
-  auto make_generator_params = [&] {
-    auto params = OgaGeneratorParams::Create(*model);
-    params->SetSearchOption("max_length", static_cast<double>(num_tokens));
-    params->SetSearchOption("min_length", static_cast<double>(num_tokens));
-    return params;
-  };
-
-  const auto generator_params = make_generator_params();
+  const auto generator_params = MakeGeneratorParams(opts, *model, num_tokens);

   // warmup
   if (opts.verbose) std::cout << "Running warmup iterations (" << opts.num_warmup_iterations << ")...\n";
   for (size_t i = 0; i < opts.num_warmup_iterations; ++i) {
     auto generator = OgaGenerator::Create(*model, *generator_params);
+    auto num_tokens_to_generate = opts.num_tokens_to_generate;
     generator->AppendTokenSequences(*prompt_sequences);
-    while (!generator->IsDone()) {
+    while (!generator->IsDone() && num_tokens_to_generate-- > 0) {
       generator->GenerateNextToken();
     }

@@ -215,6 +217,7 @@ void RunBenchmark(const benchmark::Options& opts) {
   if (opts.verbose) std::cout << "Running iterations (" << opts.num_iterations << ")...\n";
   for (size_t i = 0; i < opts.num_iterations; ++i) {
     auto generator = OgaGenerator::Create(*model, *generator_params);
+    auto num_tokens_to_generate = opts.num_tokens_to_generate;

     {
       Timing e2e_gen_timing{e2e_gen_times};
@@ -232,7 +235,7 @@ void RunBenchmark(const benchmark::Options& opts) {
       generator_done = generator->IsDone();
     }

-    while (!generator_done) {
+    while (!generator_done && num_tokens_to_generate-- > 0) {
      {
        Timing token_gen_timing{token_gen_times};
        generator->GenerateNextToken();

benchmark/c/options.cpp

Lines changed: 5 additions & 0 deletions
@@ -47,6 +47,9 @@ namespace {
       << "      Number of times to repeat the benchmark. Default: " << defaults.num_iterations << "\n"
       << "  -w,--warmup <number>\n"
       << "      Number of warmup runs before benchmarking. Default: " << defaults.num_warmup_iterations << "\n"
+      << "  -ml,--max_length <number>\n"
+      << "      Max sequence length (prompt + output). Overrides genai_config.json.\n"
+      << "      Default: prompt_length + generation_length. Pass -1 to use config file value.\n"
       << "  -v,--verbose\n"
       << "      Show more informational output.\n"
       << "  -h,--help\n"

@@ -130,6 +133,8 @@ Options ParseOptionsFromCommandLine(int argc, const char* const* argv) {
       opts.num_iterations = ParseNumber<size_t>(next_arg(i));
     } else if (arg == "-w" || arg == "--warmup") {
       opts.num_warmup_iterations = ParseNumber<size_t>(next_arg(i));
+    } else if (arg == "-ml" || arg == "--max_length") {
+      opts.max_length = ParseNumber<int64_t>(next_arg(i));
     } else if (arg == "-v" || arg == "--verbose") {
       opts.verbose = true;
     } else if (arg == "-h" || arg == "--help") {

benchmark/c/options.h

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ struct Options {
   size_t batch_size{1};
   size_t num_iterations{5};
   size_t num_warmup_iterations{1};
+  int64_t max_length{0};
   bool verbose{};
 };

benchmark/python/benchmark_e2e.py

Lines changed: 18 additions & 8 deletions
@@ -74,18 +74,23 @@ def monitor_cpu_memory():


 # Use input model to generate prompt
-def generate_prompt(model, tokenizer, prompt_length) -> str:
+def generate_prompt(model, tokenizer, prompt_length, override_max_length) -> str:
     text = "a"
     prompt = f"{args.chat_template.format(input=text)}"
     tokens = tokenizer.encode(prompt)
     params = og.GeneratorParams(model)
     max_length_to_use = prompt_length + len(tokens)
-    params.set_search_options(max_length=max_length_to_use, min_length=prompt_length)
+    params.set_search_options(
+        min_length=prompt_length,
+        **({ "max_length": max_length_to_use } if override_max_length else {})
+    )

     generator = og.Generator(model, params)
     generator.append_tokens(tokens)
-    while not generator.is_done():
+    i = 0
+    while not generator.is_done() and i < prompt_length:
         generator.generate_next_token()
+        i += 1
     return tokenizer.decode(generator.get_sequence(0))

@@ -280,6 +285,9 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         raise ValueError(
             f"Chat Template for model type {model_type} is not known. Please provide chat template using --chat_template"
         )
+
+    # When -1 is passed as max_length we should not override that search option
+    override_max_length = max_length != -1

     # Generate prompt
     if args.use_random_tokens:
@@ -294,7 +302,7 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         prompt = f"{args.chat_template.format(input=text)}"
         tokens = tokenizer.encode(prompt)
     else:
-        text = [generate_prompt(model, tokenizer, prompt_length)] * batch_size
+        text = [generate_prompt(model, tokenizer, prompt_length, override_max_length)] * batch_size
         prompt = f"{args.chat_template.format(input=text)}"
         tokens = tokenizer.encode(prompt)
         prompt_length = len(tokens)
@@ -307,7 +315,7 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         top_k=args.top_k,
         top_p=args.top_p,
         temperature=temperature,
-        max_length=max_length,
+        **({ "max_length": max_length } if override_max_length else {}),
         min_length=max_length,
         batch_size=batch_size,
     )
@@ -317,8 +325,10 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
     for _ in tqdm(range(args.warmup)):
         generator = og.Generator(model, params)
         generator.append_tokens(tokens)
-        while not generator.is_done():
+        i = 0
+        while not generator.is_done() and i < generation_length:
            generator.generate_next_token()
+            i += 1
         if args.print_model_output:
             print(tokenizer.decode(generator.get_sequence(0)))
         # Delete the generator to free the captured graph for the next generator, if graph capture is enabled
@@ -350,7 +360,7 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         top_k=args.top_k,
         top_p=args.top_p,
         temperature=temperature,
-        max_length=max_length,
+        **({ "max_length": max_length } if override_max_length else {}),
         min_length=max_length,
         batch_size=batch_size,
     )
@@ -508,7 +518,7 @@ def str2strlist(value):
         "--max_lengths",
         type=str2intlist,
         default=[],
-        help="Max length is either a combination of prompt and generation length or one value broadcasting for all.",
+        help="Max length is either a combination of prompt and generation length or one value broadcasting for all. Pass -1 to disable override.",
     )
     parser.add_argument("-r", "--repetitions", type=int, default=10, help="Number of times to repeat the benchmark")
     parser.add_argument("-w", "--warmup", type=int, default=5, help="Number of warmup runs before benchmarking")
