
Commit 5a4cdc1

Merge branch 'main' into kvaishnavi/guidance-schema
2 parents: b421a39 + 90cb011

35 files changed, +1342 -266 lines

.pipelines/stages/jobs/steps/capi-win-step.yml

Lines changed: 3 additions & 2 deletions
@@ -117,7 +117,7 @@ steps:
       workingDirectory: '$(Build.Repository.LocalPath)'

   - powershell: |
-      cmake --build --preset windows_$(arch)_$(ep)_$(build_config) --parallel --target ${{ parameters.target }}
+      cmake --build --preset windows_$(arch)_$(ep)_$(build_config) --parallel
     displayName: 'Build C API'
     workingDirectory: '$(Build.Repository.LocalPath)'

@@ -143,7 +143,8 @@ steps:
     displayName: 'Install wheel'

   - powershell: |
-      cmake --build --preset windows_$(arch)_$(ep)_$(build_config) --target package
+      cd $(Build.Repository.LocalPath)\build\$(ep)\$(os)-$(arch)
+      cpack -C $(build_config)
     displayName: 'Package C/C++ API'
     workingDirectory: '$(Build.Repository.LocalPath)'

CMakeLists.txt

Lines changed: 6 additions & 1 deletion
@@ -269,6 +269,11 @@ if(ANDROID)
   # strip the binary if it's not a build with debug info
   set_target_properties(onnxruntime-genai PROPERTIES LINK_FLAGS_RELEASE -s)
   set_target_properties(onnxruntime-genai PROPERTIES LINK_FLAGS_MINSIZEREL -s)
+
+  # Build shared libraries with support for 16 KB page size on Android
+  # https://source.android.com/docs/core/architecture/16kb-page-size/16kb#build-lib-16kb-alignment
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-z,max-page-size=16384")
+  set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-z,max-page-size=16384")
 endif()

 if(ENABLE_TESTS)

@@ -294,4 +299,4 @@ endif()
 # Have visual studio put all files into one single folder vs the default split of header files into a separate folder
 source_group(TREE ${GENERATORS_ROOT} FILES ${generator_srcs})

-include(cmake/package.cmake)
+include(cmake/package.cmake)
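The new `-Wl,-z,max-page-size=16384` flags request 16 KB alignment for the LOAD segments of the Android shared libraries, which is what the linked Android guidance requires. As a rough way to see the effect, the following sketch (not part of this change; the library path is a placeholder and a little-endian 64-bit ELF is assumed) reads the program headers of a built `.so` and reports the smallest PT_LOAD alignment:

```python
# Sketch: report the smallest PT_LOAD segment alignment of a 64-bit ELF shared
# library. A value of at least 0x4000 (16 KB) is what the linker flags above aim for.
import struct

PT_LOAD = 1

def min_load_alignment(path: str) -> int:
    with open(path, "rb") as f:
        ident = f.read(16)
        assert ident[:4] == b"\x7fELF" and ident[4] == 2, "expected a 64-bit ELF file"
        f.seek(32)                                   # e_phoff sits at offset 32 in the ELF64 header
        (e_phoff,) = struct.unpack("<Q", f.read(8))
        f.seek(54)                                   # e_phentsize and e_phnum follow at offset 54
        e_phentsize, e_phnum = struct.unpack("<HH", f.read(4))
        alignments = []
        for i in range(e_phnum):
            f.seek(e_phoff + i * e_phentsize)
            (p_type,) = struct.unpack("<I", f.read(4))
            if p_type != PT_LOAD:
                continue
            f.seek(e_phoff + i * e_phentsize + 48)   # p_align is the final 8-byte field
            (p_align,) = struct.unpack("<Q", f.read(8))
            alignments.append(p_align)
        return min(alignments) if alignments else 0

if __name__ == "__main__":
    align = min_load_alignment("libonnxruntime-genai.so")  # placeholder path
    print(f"smallest PT_LOAD alignment: {align:#x}")
    print("16 KB page-size compatible" if align >= 0x4000 else "not 16 KB aligned")
```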

README.md

Lines changed: 72 additions & 38 deletions
@@ -1,29 +1,5 @@
 # ONNX Runtime GenAI

-Note: between `v0.11.0` and `v0.10.1`, there is a breaking API usage change to improve model quality during multi-turn conversations.
-
-Previously, the decoding loop could be written as follows.
-
-```
-while not IsDone():
-  GenerateToken()
-  GetLastToken()
-  PrintLastToken()
-```
-
-In 0.11.0, the decoding loop should now be written as follows.
-
-```
-while True:
-  GenerateToken()
-  if IsDone():
-    break
-  GetLastToken()
-  PrintLastToken()
-```
-
-Please read [this PR's description](https://github.com/microsoft/onnxruntime-genai/pull/1849) for more information.
-
 ## Status

 [![Latest version](https://img.shields.io/nuget/vpre/Microsoft.ML.OnnxRuntimeGenAI.Managed?label=latest)](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntimeGenAI.Managed/absoluteLatest)
@@ -32,20 +8,22 @@ Please read [this PR's description](https://github.com/microsoft/onnxruntime-genai/pull/1849) for more information.

 ## Description

-Run generative AI models with ONNX Runtime. This API gives you an easy, flexible and performant way of running LLMs on device. It implements the generative AI loop for ONNX models, including pre and post processing, inference with ONNX Runtime, logits processing, search and sampling, and KV cache management.
+Run generative AI models with ONNX Runtime. This API gives you an easy, flexible and performant way of running LLMs on device. It implements the generative AI loop for ONNX models, including pre and post processing, inference with ONNX Runtime, logits processing, search and sampling, KV cache management, and grammar specification for tool calling.
+
+ONNX Runtime GenAI powers Foundry Local, Windows ML, and the Visual Studio Code AI Toolkit.

 See documentation at the [ONNX Runtime website](https://onnxruntime.ai/docs/genai) for more details.

-|Support matrix|Supported now|Under development|On the roadmap|
+| Support matrix | Supported now | Under development | On the roadmap |
 | -------------- | ------------- | ----------------- | -------------- |
-| Model architectures | AMD OLMo <br/> ChatGLM <br/> DeepSeek <br/> ERNIE 4.5 <br/> Gemma <br/> gpt-oss <br/> Granite <br/> Llama <br/> Mistral <br/> Nemotron <br/> Phi (language + vision) <br/> Qwen <br/> SmolLM3 <br/> Whisper | Stable diffusion | Multi-modal models |
+| Model architectures | ChatGLM <br/> DeepSeek <br/> Ernie <br/> Fara <br/> Gemma <br/> GPTOSS <br/> Granite <br/> Llama <br/> Mistral <br/> Nemotron <br/> OLMo <br/> Phi <br/> Phi3V <br/> Phi4MM <br/> Qwen <br/> Qwen-2.5VL <br/> SmolLM3 <br/> Whisper | Stable diffusion | |
 | API | Python <br/> C# <br/> C/C++ <br/> Java ^ | Objective-C | |
-| Platform | Linux <br/> Windows <br/> Mac ^ <br/> Android ^ | | iOS |
-| Architecture | x86 <br/> x64 <br/> Arm64 ~ | | |
+| O/S | Linux <br/> Windows <br/> Mac <br/> Android | | iOS |
+| Architecture | x86 <br/> x64 <br/> arm64 | | |
 | Hardware Acceleration | CPU <br/> CUDA <br/> DirectML <br/> NvTensorRtRtx (TRT-RTX) <br/> OpenVINO <br/> QNN <br/> WebGPU | | AMD GPU |
 | Features | Multi-LoRA <br/> Continuous decoding <br/> Constrained decoding | | Speculative decoding |

-\~ Windows builds available, requires build from source for other platforms
+^ Requires build from source

 ## Installation

@@ -60,7 +38,7 @@ See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install)
    ```

 2. Install the API
-
+
    ```shell
    pip install numpy
    pip install --pre onnxruntime-genai
@@ -113,30 +91,86 @@ See [installation instructions](https://onnxruntime.ai/docs/genai/howto/install)
    del generator
    ```

-### Choosing the Right Examples: Release vs. Main Branch
+### Choose the correct version of the examples

-Due to the evolving nature of this project and ongoing feature additions, examples in the `main` branch may not always align with the latest stable release. This section outlines how to ensure compatibility between the examples and the corresponding version. The majority of the steps would remain same. Just the package installation and the model example file would change.
+Due to the evolving nature of this project and ongoing feature additions, examples in the `main` branch may not always align with the latest stable release. This section outlines how to ensure compatibility between the examples and the corresponding version.

 ### Stable version
-Install the package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). Let's say you installed the 0.10.1 version of ONNX Runtime GenAI, so the instructions would look like this:
+
+Install the package according to the [installation instructions](https://onnxruntime.ai/docs/genai/howto/install). For example, install the Python package.
+
+```bash
+pip install onnxruntime-genai
+```
+
+Get the version of the package.
+
+Linux/Mac:
+```bash
+pip list | grep onnxruntime-genai
+```
+
+Windows:
+```bash
+pip list | findstr "onnxruntime-genai"
+```
+
+Check out the version of the examples that corresponds to that release.

 ```bash
 # Clone the repo
 git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
 # Checkout the branch for the version you are using
-git checkout v0.10.1
+git checkout v0.11.4
 cd examples
 ```

-### Nightly version (Main Branch)
-Build the package from source using these [instructions](https://onnxruntime.ai/docs/genai/howto/build-from-source.html). Now just go to the folder location where all the examples are present.
+### Nightly version (main branch)
+
+Check out the main branch of the repo.

 ```bash
-# Clone the repo
 git clone https://github.com/microsoft/onnxruntime-genai.git && cd onnxruntime-genai
+```
+
+Build from source, using these [instructions](https://onnxruntime.ai/docs/genai/howto/build-from-source.html). For example, to build the Python wheel:
+
+```bash
+python build.py
+```
+
+Navigate to the examples folder in the main branch.
+
+```bash
 cd examples
 ```

+## Breaking API changes
+
+### v0.11.0
+
+Between `v0.11.0` and `v0.10.1`, there is a breaking API usage change to improve model quality during multi-turn conversations.
+
+Previously, the decoding loop could be written as follows.
+
+```
+while not IsDone():
+  GenerateToken()
+  GetLastToken()
+  PrintLastToken()
+```
+
+In 0.11.0, the decoding loop should now be written as follows.
+
+```
+while True:
+  GenerateToken()
+  if IsDone():
+    break
+  GetLastToken()
+  PrintLastToken()
+```
+
 ## Roadmap

 See the [Discussions](https://github.com/microsoft/onnxruntime-genai/discussions) to request new features and up-vote existing requests.
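For reference, the decoding-loop change documented above maps onto the Python package roughly as follows. This is only an illustrative sketch and not part of the commit: the model path is a placeholder, and the calls mirror the `onnxruntime_genai` usage that appears elsewhere in this diff (`append_tokens`, `is_done`, `generate_next_token`).

```python
# Illustrative post-v0.11.0 decoding loop: generate first, then test for
# completion, then read the newest token. Model path is a placeholder.
import onnxruntime_genai as og

model = og.Model("path/to/model")            # placeholder model folder
tokenizer = og.Tokenizer(model)
stream = tokenizer.create_stream()

params = og.GeneratorParams(model)
params.set_search_options(max_length=256)

generator = og.Generator(model, params)
generator.append_tokens(tokenizer.encode("Tell me about ONNX Runtime."))

while True:
    generator.generate_next_token()
    if generator.is_done():
        break
    new_token = generator.get_next_tokens()[0]
    print(stream.decode(new_token), end="", flush=True)
print()
```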

benchmark/c/main.cpp

Lines changed: 21 additions & 18 deletions
@@ -112,20 +112,29 @@ void WriteE2EStats(std::string_view label,
             << "\n";
 }

-std::string GeneratePrompt(size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer, size_t batch_size) {
+static std::unique_ptr<OgaGeneratorParams> MakeGeneratorParams(const benchmark::Options& opts, const OgaModel& model, size_t num_tokens) {
+  auto params = OgaGeneratorParams::Create(model);
+  if (opts.max_length != -1) {
+    auto max_length = num_tokens;
+    if (opts.max_length > 0)
+      max_length = static_cast<size_t>(opts.max_length);
+    params->SetSearchOption("max_length", static_cast<double>(max_length));
+  }
+  params->SetSearchOption("min_length", static_cast<double>(num_tokens));
+  return params;
+}
+
+std::string GeneratePrompt(const benchmark::Options& opts, size_t num_prompt_tokens, const OgaModel& model, const OgaTokenizer& tokenizer, size_t batch_size) {
   const char* const base_prompt = "A";
   auto base_prompt_sequences = OgaSequences::Create();
   for (size_t i = 0; i < batch_size; ++i) {
     tokenizer.Encode(base_prompt, *base_prompt_sequences);
   }

-  auto params = OgaGeneratorParams::Create(model);
-  params->SetSearchOption("max_length", static_cast<double>(num_prompt_tokens));
-  params->SetSearchOption("min_length", static_cast<double>(num_prompt_tokens));
-
+  auto params = MakeGeneratorParams(opts, model, num_prompt_tokens);
   auto generator = OgaGenerator::Create(model, *params);
   generator->AppendTokenSequences(*base_prompt_sequences);
-  while (!generator->IsDone()) {
+  while (!generator->IsDone() && num_prompt_tokens-- > 0) {
     generator->GenerateNextToken();
   }

@@ -159,7 +168,7 @@ void RunBenchmark(const benchmark::Options& opts) {

   const auto prompt = [&]() -> std::string {
     if (const size_t* num_prompt_tokens = std::get_if<size_t>(&opts.prompt_num_tokens_or_content)) {
-      return GeneratePrompt(*num_prompt_tokens, *model, *tokenizer, opts.batch_size);
+      return GeneratePrompt(opts, *num_prompt_tokens, *model, *tokenizer, opts.batch_size);
     }
     return std::get<std::string>(opts.prompt_num_tokens_or_content);
   }();
@@ -176,22 +185,15 @@ void RunBenchmark(const benchmark::Options& opts) {

   const size_t num_prompt_tokens = prompt_sequences->SequenceCount(0);
   const size_t num_tokens = num_prompt_tokens + opts.num_tokens_to_generate;
-
-  auto make_generator_params = [&] {
-    auto params = OgaGeneratorParams::Create(*model);
-    params->SetSearchOption("max_length", static_cast<double>(num_tokens));
-    params->SetSearchOption("min_length", static_cast<double>(num_tokens));
-    return params;
-  };
-
-  const auto generator_params = make_generator_params();
+  const auto generator_params = MakeGeneratorParams(opts, *model, num_tokens);

   // warmup
   if (opts.verbose) std::cout << "Running warmup iterations (" << opts.num_warmup_iterations << ")...\n";
   for (size_t i = 0; i < opts.num_warmup_iterations; ++i) {
     auto generator = OgaGenerator::Create(*model, *generator_params);
+    auto num_tokens_to_generate = opts.num_tokens_to_generate;
     generator->AppendTokenSequences(*prompt_sequences);
-    while (!generator->IsDone()) {
+    while (!generator->IsDone() && num_tokens_to_generate-- > 0) {
       generator->GenerateNextToken();
     }

@@ -215,6 +217,7 @@ void RunBenchmark(const benchmark::Options& opts) {
   if (opts.verbose) std::cout << "Running iterations (" << opts.num_iterations << ")...\n";
   for (size_t i = 0; i < opts.num_iterations; ++i) {
     auto generator = OgaGenerator::Create(*model, *generator_params);
+    auto num_tokens_to_generate = opts.num_tokens_to_generate;

     {
       Timing e2e_gen_timing{e2e_gen_times};
@@ -232,7 +235,7 @@ void RunBenchmark(const benchmark::Options& opts) {
       generator_done = generator->IsDone();
     }

-    while (!generator_done) {
+    while (!generator_done && num_tokens_to_generate-- > 0) {
      {
        Timing token_gen_timing{token_gen_times};
        generator->GenerateNextToken();

benchmark/c/options.cpp

Lines changed: 5 additions & 0 deletions
@@ -47,6 +47,9 @@ namespace {
       << "      Number of times to repeat the benchmark. Default: " << defaults.num_iterations << "\n"
       << "  -w,--warmup <number>\n"
       << "      Number of warmup runs before benchmarking. Default: " << defaults.num_warmup_iterations << "\n"
+      << "  -ml,--max_length <number>\n"
+      << "      Max sequence length (prompt + output). Overrides genai_config.json.\n"
+      << "      Default: prompt_length + generation_length. Pass -1 to use config file value.\n"
       << "  -v,--verbose\n"
       << "      Show more informational output.\n"
       << "  -h,--help\n"

@@ -130,6 +133,8 @@ Options ParseOptionsFromCommandLine(int argc, const char* const* argv) {
       opts.num_iterations = ParseNumber<size_t>(next_arg(i));
     } else if (arg == "-w" || arg == "--warmup") {
       opts.num_warmup_iterations = ParseNumber<size_t>(next_arg(i));
+    } else if (arg == "-ml" || arg == "--max_length") {
+      opts.max_length = ParseNumber<int64_t>(next_arg(i));
     } else if (arg == "-v" || arg == "--verbose") {
       opts.verbose = true;
     } else if (arg == "-h" || arg == "--help") {

benchmark/c/options.h

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ struct Options {
   size_t batch_size{1};
   size_t num_iterations{5};
   size_t num_warmup_iterations{1};
+  int64_t max_length{0};
   bool verbose{};
 };

benchmark/python/benchmark_e2e.py

Lines changed: 18 additions & 8 deletions
@@ -74,18 +74,23 @@ def monitor_cpu_memory():


 # Use input model to generate prompt
-def generate_prompt(model, tokenizer, prompt_length) -> str:
+def generate_prompt(model, tokenizer, prompt_length, override_max_length) -> str:
     text = "a"
     prompt = f"{args.chat_template.format(input=text)}"
     tokens = tokenizer.encode(prompt)
     params = og.GeneratorParams(model)
     max_length_to_use = prompt_length + len(tokens)
-    params.set_search_options(max_length=max_length_to_use, min_length=prompt_length)
+    params.set_search_options(
+        min_length=prompt_length,
+        **({ "max_length": max_length_to_use } if override_max_length else {})
+    )

     generator = og.Generator(model, params)
     generator.append_tokens(tokens)
-    while not generator.is_done():
+    i = 0
+    while not generator.is_done() and i < prompt_length:
         generator.generate_next_token()
+        i += 1
     return tokenizer.decode(generator.get_sequence(0))

@@ -280,6 +285,9 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         raise ValueError(
             f"Chat Template for model type {model_type} is not known. Please provide chat template using --chat_template"
         )
+
+    # When -1 is passed as max_length we should not override that search option
+    override_max_length = max_length != -1

     # Generate prompt
     if args.use_random_tokens:
@@ -294,7 +302,7 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         prompt = f"{args.chat_template.format(input=text)}"
         tokens = tokenizer.encode(prompt)
     else:
-        text = [generate_prompt(model, tokenizer, prompt_length)] * batch_size
+        text = [generate_prompt(model, tokenizer, prompt_length, override_max_length)] * batch_size
         prompt = f"{args.chat_template.format(input=text)}"
         tokens = tokenizer.encode(prompt)
         prompt_length = len(tokens)
@@ -307,7 +315,7 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         top_k=args.top_k,
         top_p=args.top_p,
         temperature=temperature,
-        max_length=max_length,
+        **({ "max_length": max_length } if override_max_length else {}),
         min_length=max_length,
         batch_size=batch_size,
     )
@@ -317,8 +325,10 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
     for _ in tqdm(range(args.warmup)):
         generator = og.Generator(model, params)
         generator.append_tokens(tokens)
-        while not generator.is_done():
+        i = 0
+        while not generator.is_done() and i < generation_length:
            generator.generate_next_token()
+            i += 1
         if args.print_model_output:
             print(tokenizer.decode(generator.get_sequence(0)))
         # Delete the generator to free the captured graph for the next generator, if graph capture is enabled
@@ -350,7 +360,7 @@ def run_benchmark(args, batch_size, prompt_length, generation_length, max_length
         top_k=args.top_k,
         top_p=args.top_p,
         temperature=temperature,
-        max_length=max_length,
+        **({ "max_length": max_length } if override_max_length else {}),
         min_length=max_length,
         batch_size=batch_size,
     )
@@ -508,7 +518,7 @@ def str2strlist(value):
         "--max_lengths",
         type=str2intlist,
         default=[],
-        help="Max length is either a combination of prompt and generation length or one value broadcasting for all.",
+        help="Max length is either a combination of prompt and generation length or one value broadcasting for all. Pass -1 to disable override.",
     )
     parser.add_argument("-r", "--repetitions", type=int, default=10, help="Number of times to repeat the benchmark")
     parser.add_argument("-w", "--warmup", type=int, default=5, help="Number of warmup runs before benchmarking")
