Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,16 @@ static const char* const kOrtSessionOptionsConfigUseORTModelBytesDirectly = "ses
static const char* const kOrtSessionOptionsConfigUseORTModelBytesForInitializers =
"session.use_ort_model_bytes_for_initializers";

// <summary>
// Key for using memory-mapped I/O to load ORT format model files.
// When set to "1" and the session is created from a file path, ORT will memory-map the .ort model
// file instead of reading it into a heap-allocated buffer.
// If combined with "session.use_ort_model_bytes_for_initializers", initializer Tensors will point
// directly at the mapped bytes; the mapping must then remain valid for the lifetime of the session
// and the model weights are immutable.
// If the mapping itself fails, the model load fails; any fallback (e.g. a regular buffered load)
// must be handled by the caller.
// </summary>
static const char* const kOrtSessionOptionsConfigUseMemoryMappedOrtModel = "session.use_memory_mapped_ort_model";

// This should only be specified when exporting an ORT format model for use on a different platform.
// If the ORT format model will be used on ARM platforms set to "1". For other platforms set to "0"
// Available since version 1.11.
Expand Down
30 changes: 28 additions & 2 deletions onnxruntime/core/session/inference_session.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1677,10 +1677,35 @@ static Status LoadOrtModelBytes(const PathString& model_uri,
return Status::OK();
}

// Memory-maps the ORT format model file at `model_uri`.
// On success, `mapped_memory` owns the mapping and `bytes` is a read-only view of the mapped
// file contents. The mapping must outlive every use of `bytes`.
static Status LoadOrtModelBytesMapped(const PathString& model_uri,
                                      gsl::span<const uint8_t>& bytes,
                                      Env::MappedMemoryPtr& mapped_memory) {
  auto& env = Env::Default();

  size_t file_length = 0;
  ORT_RETURN_IF_ERROR(env.GetFileLength(model_uri.c_str(), file_length));

  // Map the whole file starting at offset 0.
  ORT_RETURN_IF_ERROR(env.MapFileIntoMemory(model_uri.c_str(), 0, file_length, mapped_memory));

  const auto* mapped_bytes = reinterpret_cast<const uint8_t*>(mapped_memory.get());
  bytes = gsl::span<const uint8_t>{mapped_bytes, file_length};

  return Status::OK();
}

Status InferenceSession::LoadOrtModel(const PathString& model_uri) {
return LoadOrtModelWithLoader(
[&]() {
model_location_ = model_uri;

const auto& config_options = GetSessionOptions().config_options;
const bool use_mmap =
config_options.GetConfigOrDefault(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "0") == "1";

if (use_mmap) {
ORT_RETURN_IF_ERROR(
LoadOrtModelBytesMapped(model_location_, ort_format_model_bytes_, ort_format_model_mapped_memory_));
LOGS(*session_logger_, INFO) << "ORT model loaded via memory-mapped I/O.";
return Status::OK();
}

ORT_RETURN_IF_ERROR(
LoadOrtModelBytes(model_location_, ort_format_model_bytes_, ort_format_model_bytes_data_holder_));
return Status::OK();
Expand Down Expand Up @@ -1788,8 +1813,8 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort
ORT_RETURN_IF(nullptr == fbs_model, "Missing Model. Invalid ORT format model.");

// if we're using the bytes directly because kOrtSessionOptionsConfigUseORTModelBytesDirectly was set and the user
// provided an existing buffer of bytes when creating the InferenceSession, ort_format_model_bytes_data_holder_
// will be empty.
// provided an existing buffer of bytes when creating the InferenceSession, or because we memory-mapped the file,
// ort_format_model_bytes_data_holder_ will be empty.
// if that is the case we also allow creating initializers that directly use those bytes.
const auto& config_options = session_options_.config_options;
using_ort_model_bytes_for_initializers_ =
Expand Down Expand Up @@ -2611,6 +2636,7 @@ common::Status InferenceSession::Initialize() {
if (!using_ort_model_bytes_for_initializers_) {
ort_format_model_bytes_ = gsl::span<const uint8_t>();
std::vector<uint8_t>().swap(ort_format_model_bytes_data_holder_);
ort_format_model_mapped_memory_.reset();
}

// once the model is saved, we may remove unnecessary attributes for inference
Expand Down
9 changes: 8 additions & 1 deletion onnxruntime/core/session/inference_session.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "core/common/path_string.h"
#include "core/common/profiler.h"
#include "core/common/status.h"
#include "core/platform/env.h"
#include "core/framework/execution_providers.h"
#include "core/framework/framework_common.h"
#include "core/framework/iexecutor.h"
Expand Down Expand Up @@ -1025,6 +1026,8 @@ class InferenceSession {
// We store them currently in the ort_format_model_bytes_data_holder_ to make the Load + Initialize
// behave the same way as for an ONNX model, as we need some of the bytes for the Load (create the Model)
// and some for the Initialize (create SessionState).
// If "session.use_memory_mapped_ort_model" is set, we memory-map the file instead and store the
// mapping in ort_format_model_mapped_memory_.
// Short term we free them after Initialize.
// Longer term we may want to directly refer to offsets in this buffer for initializers so we don't need to copy
// those into new OrtValue instances, at which point we won't free them until the InferenceSession goes away.
Expand All @@ -1033,9 +1036,13 @@ class InferenceSession {
// This holds the actual model data
// In case if the session is started with an input byte array contains model data, and the caller
// specifies that ORT should use the model bytes directly by setting the session config option
// "session.use_ort_model_bytes_directly" to "1", this will be empty
// "session.use_ort_model_bytes_directly" to "1", this will be empty.
// Also empty when using memory-mapped loading, as the data is held by ort_format_model_mapped_memory_.
std::vector<uint8_t> ort_format_model_bytes_data_holder_;

// Holds the memory-mapped file data when session.use_memory_mapped_ort_model is set.
Env::MappedMemoryPtr ort_format_model_mapped_memory_;

bool using_ort_model_bytes_for_initializers_{false};

// Container to store pre-packed weights to share between sessions.
Expand Down
27 changes: 24 additions & 3 deletions onnxruntime/test/framework/ort_model_only_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ struct OrtModelTestInfo {
bool run_use_buffer{false};
bool disable_copy_ort_buffer{false};
bool use_buffer_for_initializers{false};
bool use_memory_mapped_load{false};
TransformerLevel optimization_level = TransformerLevel::Level3;
};

Expand All @@ -49,10 +50,15 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) {

if (test_info.disable_copy_ort_buffer) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesDirectly, "1"));
}

if (test_info.use_buffer_for_initializers) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
}
if (test_info.use_memory_mapped_load) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseMemoryMappedOrtModel, "1"));
}

if (test_info.use_buffer_for_initializers &&
(test_info.disable_copy_ort_buffer || test_info.use_memory_mapped_load)) {
ASSERT_STATUS_OK(so.config_options.AddConfigEntry(kOrtSessionOptionsConfigUseORTModelBytesForInitializers, "1"));
}

so.graph_optimization_level = test_info.optimization_level;
Expand Down Expand Up @@ -557,6 +563,21 @@ TEST(OrtModelOnlyTests, LoadOrtFormatModelFromBufferNoCopyInitializersUseBuffer)
RunOrtModel(test_info);
}

// Verify an ORT format model loads correctly from a file path via memory-mapped I/O.
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMapped) {
  auto info = GetTestInfoForLoadOrtFormatModel();
  info.use_memory_mapped_load = true;
  RunOrtModel(info);
}

// Verify a memory-mapped ORT format model load where initializer Tensors
// reference the mapped bytes directly instead of copies.
TEST(OrtModelOnlyTests, LoadOrtFormatModelMemoryMappedWithInitializersFromMap) {
  auto info = GetTestInfoForLoadOrtFormatModel();
  info.use_memory_mapped_load = true;
  info.use_buffer_for_initializers = true;
  RunOrtModel(info);
}

// regression test for 2 issues covered by PR #17000 (internally reported issue).
// 1) allocation planner broke in minimal build when subgraph had no nodes.
// 2) usage of a sequence data type caused an exception due to IsSparseTensor() throwing
Expand Down
Loading
Loading