Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,16 @@ static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "ses
static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
"session.use_ort_model_bytes_for_initializers";

// <summary>
// Key for using memory-mapped I/O to load ORT format model files.
// When set to "1" and the session is created from a file path, ORT will memory-map the .ort model
// file instead of reading it into a heap-allocated buffer.
// If combined with "session.use_ort_model_bytes_for_initializers", initializer Tensors will point
// directly at the mapped bytes; the mapping must then remain valid for the lifetime of the session
// and the model weights are immutable.
// If the mapping itself fails, the model load fails; any fallback (e.g. a regular buffered load)
// must be handled by the caller.
// </summary>
static const char* const kOrtSessionOptionsConfigUseMemoryMappedOrtModel = "session.use_memory_mapped_ort_model";

// This should only be specified when exporting an ORT format model for use on a different platform.
// If the ORT format model will be used on ARM platforms set to "1". For other platforms set to "0"
// Available since version 1.11.
Expand Down
30 changes: 28 additions & 2 deletions onnxruntime/core/session/inference_session.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1677,10 +1677,35 @@ static Status LoadOrtModelBytes(const PathString& model_uri,
return Status::OK();
}

// Memory-maps the ORT format model file at `model_uri`.
// On success, `mapped_memory` owns the mapping and `bytes` is a read-only view of the mapped
// file contents. The mapping must outlive every use of `bytes`.
static Status LoadOrtModelBytesMapped(const PathString& model_uri,
                                      gsl::span<const uint8_t>& bytes,
                                      Env::MappedMemoryPtr& mapped_memory) {
  auto& env = Env::Default();

  size_t file_length = 0;
  ORT_RETURN_IF_ERROR(env.GetFileLength(model_uri.c_str(), file_length));

  // Map the whole file starting at offset 0.
  ORT_RETURN_IF_ERROR(env.MapFileIntoMemory(model_uri.c_str(), 0, file_length, mapped_memory));

  const auto* mapped_bytes = reinterpret_cast<const uint8_t*>(mapped_memory.get());
  bytes = gsl::span<const uint8_t>{mapped_bytes, file_length};

  return Status::OK();
}

Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
return LoadOrtModelWithLoader(
[&]() {
model_location_ = model_uri;

const auto& config_options = GetSessionOptions().config_options;
const bool use_mmap =
config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1";

if (use_mmap) {
ORT_RETURN_IF_ERROR(
LoadOrtModelBytesMapped(model_location_, ort_format_model_bytes_, ort_format_model_mapped_memory_));
LOGS(*session_logger_, INFO) << "ORT model loaded via memory-mapped I/O.";
return Status::OK();
}

ORT_RETURN_IF_ERROR(
LoadOrtModelBytes(model_location_, ort_format_model_bytes_, ort_format_model_bytes_data_holder_));
return Status::OK();
Expand Down Expand Up @@ -1788,8 +1813,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort
ORT_RETURN_IF(nullptr == fbs_model, "Missing Model. Invalid ORT format model.");

// if we're using the bytes directly because kOrtSessionOptionsConfigUseORTModelBytesDirectly was set and the user
// provided an existing buffer of bytes when creating the InferenceSession, ort_format_model_bytes_data_holder_
// will be empty.
// provided an existing buffer of bytes when creating the InferenceSession, or because we memory-mapped the file,
// ort_format_model_bytes_data_holder_ will be empty.
// if that is the case we also allow creating initializers that directly use those bytes.
const auto& config_options = session_options_.config_options;
using_ort_model_bytes_for_initializers_ =
Expand Down Expand Up @@ -2611,6 +2636,7 @@ common::Status InferenceSession::Initialize() {
if (!using_ort_model_bytes_for_initializers_) {
ort_format_model_bytes_ = gsl::span<const uint8_t>();
std::vector<uint8_t>().swap(ort_format_model_bytes_data_holder_);
ort_format_model_mapped_memory_.reset();
}

// once the model is saved, we may remove unnecessary attributes for inference
Expand Down
9 changes: 8 additions & 1 deletion onnxruntime/core/session/inference_session.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "core/common/path_string.h"
#include "core/common/profiler.h"
#include "core/common/status.h"
#include "core/platform/env.h"
#include "core/framework/execution_providers.h"
#include "core/framework/framework_common.h"
#include "core/framework/iexecutor.h"
Expand Down Expand Up @@ -1025,6 +1026,8 @@ class InferenceSession {
// We store them currently in the ort_format_model_bytes_data_holder_ to make the Load + Initialize
// behave the same way as for an ONNX model, as we need some of the bytes for the Load (create the Model)
// and some for the Initialize (create SessionState).
// If "session.use_memory_mapped_ort_model" is set, we memory-map the file instead and store the
// mapping in ort_format_model_mapped_memory_.
// Short term we free them after Initialize.
// Longer term we may want to directly refer to offsets in this buffer for initializers so we don't need to copy
// those into new OrtValue instances, at which point we won't free them until the InferenceSession goes away.
Expand All @@ -1033,9 +1036,13 @@ class InferenceSession {
// This holds the actual model data
// In case if the session is started with an input byte array contains model data, and the caller
// specifies that ORT should use the model bytes directly by setting the session config option
// "session.use_ort_model_bytes_directly" to "1", this will be empty
// "session.use_ort_model_bytes_directly" to "1", this will be empty.
// Also empty when using memory-mapped loading, as the data is held by ort_format_model_mapped_memory_.
std::vector<uint8_t> ort_format_model_bytes_data_holder_;

// Holds the memory-mapped file data when session.use_memory_mapped_ort_model is set.
Env::MappedMemoryPtr ort_format_model_mapped_memory_;

bool using_ort_model_bytes_for_initializers_{false};

// Container to store pre-packed weights to share between sessions.
Expand Down
27 changes: 24 additions & 3 deletions onnxruntime/test/framework/ort_model_only_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ struct OrtModelTestInfo {
bool run_use_buffer{false};
bool disable_copy_ort_buffer{false};
bool use_buffer_for_initializers{false};
bool use_memory_mapped_load{false};
TransformerLevel optimization_level = TransformerLevel::Level3;
};

Expand All @@ -49,10 +50,15 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) {

if (test_info.disable_copy_ort_buffer) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "1"));
}

if (test_info.use_buffer_for_initializers) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
}
if (test_info.use_memory_mapped_load) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
}

if (test_info.use_buffer_for_initializers &&
(test_info.disable_copy_ort_buffer || test_info.use_memory_mapped_load)) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
}

so.graph_optimization_level = test_info.optimization_level;
Expand Down Expand Up @@ -557,6 +563,21 @@ TEST(OrtModelOnlyTests, LoadOrtFormatModelFromBufferNoCopyInitializersUseBuffer)
RunOrtModel(test_info);
}

// Verify an ORT format model loads correctly from a file path via memory-mapped I/O.
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMapped) {
  auto info = GetTestInfoForLoadOrtFormatModel();
  info.use_memory_mapped_load = true;
  RunOrtModel(info);
}

// Verify a memory-mapped ORT format model load where initializer Tensors
// reference the mapped bytes directly instead of copies.
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedWithInitializersFromMap) {
  auto info = GetTestInfoForLoadOrtFormatModel();
  info.use_memory_mapped_load = true;
  info.use_buffer_for_initializers = true;
  RunOrtModel(info);
}

// regression test for 2 issues covered by PR #17000 (internally reported issue).
// 1) allocation planner broke in minimal build when subgraph had no nodes.
// 2) usage of a sequence data type caused an exception due to IsSparseTensor() throwing
Expand Down
Loading
Loading