From d0aee204af2a34bdf58cc6ca7dee984aa914c818 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Mon, 24 Jun 2024 10:02:38 -0700 Subject: [PATCH] [ORT 1.18.1 Release] Cherry pick 3rd round (#21129) ### Description Adding critical TensorRT EP support ### Motivation and Context --------- Co-authored-by: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Co-authored-by: Ye Wang <52801275+wangyems@users.noreply.github.com> Co-authored-by: Michal Guzek Co-authored-by: pengwa Co-authored-by: wejoncy Co-authored-by: Yi Zhang Co-authored-by: Yi Zhang Co-authored-by: Pranav Sharma Co-authored-by: Adam Pocock Co-authored-by: cao lei Co-authored-by: Adrian Lizarraga Co-authored-by: inisis <46103969+inisis@users.noreply.github.com> Co-authored-by: Jeff Bloomfield <38966965+jeffbloo@users.noreply.github.com> Co-authored-by: mo-ja <60505697+mo-ja@users.noreply.github.com> Co-authored-by: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Co-authored-by: Sumit Agarwal Co-authored-by: Atanas Dimitrov <70822030+neNasko1@users.noreply.github.com> Co-authored-by: Justin Chu Co-authored-by: Yufeng Li Co-authored-by: Dhruv Matani Co-authored-by: Dhruv Matani Co-authored-by: wangshuai09 <391746016@qq.com> Co-authored-by: Xiaoyu <85524621+xiaoyu-work@users.noreply.github.com> Co-authored-by: Xu Xing Co-authored-by: Dmitri Smirnov Co-authored-by: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com> Co-authored-by: Sai Kishan Pampana Co-authored-by: rachguo Co-authored-by: Jian Chen Co-authored-by: Shubham Bhokare <32080845+shubhambhokare1@users.noreply.github.com> Co-authored-by: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Co-authored-by: Andrew Fantino <15876180+afantino951@users.noreply.github.com> Co-authored-by: Thomas Boby Co-authored-by: Tianlei Wu Co-authored-by: Scott McKay Co-authored-by: Michal Guzek Co-authored-by: George Wu Co-authored-by: Baiju Meswani --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/deps.txt | 2 +- cmake/deps_update_and_upload.py | 4 +- docs/ContribOperators.md | 2 + .../tensorrt/tensorrt_provider_options.h | 17 +- .../contrib_ops/cuda/moe/ft_moe/moe_kernel.cu | 1 + .../core/graph/contrib_ops/contrib_defs.cc | 5 + .../tensorrt/onnx_ctx_model_helper.cc | 77 +++++- .../tensorrt/onnx_ctx_model_helper.h | 24 +- .../tensorrt/tensorrt_execution_provider.cc | 258 ++++++++++++++++-- .../tensorrt/tensorrt_execution_provider.h | 21 +- .../tensorrt_execution_provider_info.cc | 17 ++ .../tensorrt_execution_provider_info.h | 3 + .../tensorrt/tensorrt_provider_factory.cc | 3 + .../core/session/provider_bridge_ort.cc | 2 + .../python/onnxruntime_pybind_state.cc | 27 +- .../perf/parse_mem_concurrency_test.py | 132 +++++++++ .../python/tools/tensorrt/perf/post.py | 61 +---- .../test/perftest/command_args_parser.cc | 3 + tools/ci_build/build.py | 6 +- ...linux-gpu-tensorrt-daily-perf-pipeline.yml | 39 ++- .../templates/download-deps.yml | 4 +- 22 files changed, 613 insertions(+), 97 deletions(-) create mode 100644 onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 56abe33989d7c..eb74178b3e032 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -216,7 +216,7 @@ "component": { "type": "git", "git": { - "commitHash": "bacfaaa951653cd4e72efe727a543567cb38f7de", + "commitHash": "06adf4461ac84035bee658c6cf5df39f7ab6071d", "repositoryUrl": 
"https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" diff --git a/cmake/deps.txt b/cmake/deps.txt index b9511af4664ce..d213b09034f02 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -38,7 +38,7 @@ mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01 neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d #use the latest commit of 10.0-GA -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/eb43908b02a296ea0594432f06e9d3fac288d672.zip;94d07871810a36a5bc70a1def5c50504101c9bd1 +onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/06adf4461ac84035bee658c6cf5df39f7ab6071d.zip;46dceef659d75d276e7914a8057c2282269d5e7b protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 diff --git a/cmake/deps_update_and_upload.py b/cmake/deps_update_and_upload.py index 63df3f6f03869..c11ed95ca3122 100644 --- a/cmake/deps_update_and_upload.py +++ b/cmake/deps_update_and_upload.py @@ -6,9 +6,9 @@ # # Run without --do-upload once to verify downloading. Use --do-upload when you are ready to publish. # E.g.: -# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 +# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.164 # # check contents of C:/temp/onnxruntime_deps -# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.82 --no-download --do-upload +# python cmake/deps_update_and_upload.py --root-path C:/temp/onnxruntime_deps --version 1.0.164 --no-download --do-upload # # Next, update the version number in tools/ci_build/github/azure-pipelines/templates/download-deps.yml. diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index fc559411df190..da959740a4e23 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -1597,6 +1597,8 @@ This version of the operator has been available since version 1 of the 'com.micr
Usually each single EPContext node is associated with a graph partition. But in some cases, such as QNN, a single EPContext node contains all partitions. In that case, the node with ep_cache_context should set main_context=1; other nodes set main_context=0 and skip ep_cache_context. The path is relative to this ONNX file. Default is 1.
notes : string
(Optional) Some notes for the model
+ onnx_model_filename : string
+ (Optional) Filename of the original ONNX model.
partition_name : string
(Optional) partitioned graph name.
source : string
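For reference, a minimal Python sketch of inspecting the EPContext attributes documented above (including the new onnx_model_filename) in a dumped context model; the file name is a placeholder and the optional attributes may be absent:

```python
import onnx

# Placeholder path to a context model dumped by the TensorRT EP.
model = onnx.load("model_ctx.onnx")
for node in model.graph.node:
    if node.op_type == "EPContext":
        attrs = {a.name: a for a in node.attribute}
        print("embed_mode:", attrs["embed_mode"].i)
        # Optional attributes used by the TensorRT EP in this patch:
        if "hardware_architecture" in attrs:
            print("hardware_architecture:", attrs["hardware_architecture"].s.decode())
        if "onnx_model_filename" in attrs:
            print("onnx_model_filename:", attrs["onnx_model_filename"].s.decode())
```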
diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index 32a9f06464ace..d008058821be3 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -64,10 +64,21 @@ struct OrtTensorRTProviderOptionsV2 { * - if "trt_engine_cache_path" is "" -> the engine cache will be saved to "./context_model_dir" * - if "trt_engine_cache_path" is "engine_dir" -> the engine cache will be saved to "./context_model_dir/engine_dir" * + * 3. In the case of building weight-stripped engines, the same security reasons as listed in 1) apply to the + * "onnx_model_filename" node attribute of EP context node, which contains a filename of the ONNX model with the + * weights needed for the refit process. User can specify a folder path relative to the current working + * directory by means of the "trt_onnx_model_folder_path" option. + * */ - int trt_dump_ep_context_model{0}; // Dump EP context node model - const char* trt_ep_context_file_path{nullptr}; // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path. - int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data + int trt_dump_ep_context_model{0}; // Dump EP context node model + const char* trt_ep_context_file_path{nullptr}; // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path. + int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data + int trt_weight_stripped_engine_enable{0}; // Enable weight-stripped engine build. Default 0 = false, + // nonzero = true + const char* trt_onnx_model_folder_path{nullptr}; // Folder path relative to the current working directory for + // the ONNX model containing the weights (applicable only when + // the "trt_weight_stripped_engine_enable" option is enabled) const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix + int trt_engine_hw_compatible{0}; // Enable hardware compatibility. Default 0 = false, nonzero = true }; diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu index 360c0aacd9c7a..39ce6aec90e1a 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu @@ -17,6 +17,7 @@ // Licensed under the MIT License. 
#include +#include #include #include #include diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 5cf1818bbf9e8..7604e5ab2fe15 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -3299,6 +3299,11 @@ void RegisterContribSchemas() { "(Optional) SDK version used to convert the model.", AttributeProto::STRING, OPTIONAL_VALUE) + .Attr( + "onnx_model_filename", + "(Optional) Filename of the original ONNX model.", + AttributeProto::STRING, + OPTIONAL_VALUE) .Attr( "hardware_architecture", "(Optional) Hardware architecture.", diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc index 1994d1f5ab0b8..2171ce056e029 100644 --- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.cc @@ -8,8 +8,10 @@ #include "onnx_ctx_model_helper.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/framework/execution_provider.h" +#include "tensorrt_execution_provider.h" namespace onnxruntime { +extern TensorrtLogger& GetTensorrtLogger(bool verbose_log); /* * Check whether the graph has the EP context contrib op. @@ -67,7 +69,8 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer, char* engine_data, size_t size, const int64_t embed_mode, - std::string compute_capability, + const std::string compute_capability, + const std::string onnx_model_path, const logging::Logger* logger) { auto model_build = graph_viewer.CreateModel(*logger); auto& graph_build = model_build->MainGraph(); @@ -88,6 +91,7 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer, auto attr_0 = ONNX_NAMESPACE::AttributeProto::Create(); // embed_mode auto attr_1 = ONNX_NAMESPACE::AttributeProto::Create(); // ep_cache_context auto attr_2 = ONNX_NAMESPACE::AttributeProto::Create(); // hardware_architecture + auto attr_3 = ONNX_NAMESPACE::AttributeProto::Create(); // onnx_model_filename std::string engine_data_str = ""; attr_0->set_name(EMBED_MODE); attr_0->set_type(onnx::AttributeProto_AttributeType_INT); @@ -106,13 +110,17 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer, attr_2->set_name(COMPUTE_CAPABILITY); attr_2->set_type(onnx::AttributeProto_AttributeType_STRING); attr_2->set_s(compute_capability); + attr_3->set_name(ONNX_MODEL_FILENAME); + attr_3->set_type(onnx::AttributeProto_AttributeType_STRING); + attr_3->set_s(std::filesystem::path(onnx_model_path).filename().string()); auto node_attributes = ONNX_NAMESPACE::NodeAttributes::Create(); - int num_attributes = 3; + constexpr int num_attributes = 4; node_attributes->reserve(num_attributes); node_attributes->emplace(EMBED_MODE, *attr_0); node_attributes->emplace(EP_CACHE_CONTEXT, *attr_1); node_attributes->emplace(COMPUTE_CAPABILITY, *attr_2); + node_attributes->emplace(ONNX_MODEL_FILENAME, *attr_3); // Create EP context node graph_build.AddNode(EPCONTEXT_OP, EPCONTEXT_OP, "", inputs, outputs, node_attributes.get(), EPCONTEXT_OP_DOMAIN); @@ -205,7 +213,7 @@ void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto, LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Dumped " + ctx_model_path; } -bool IsAbsolutePath(std::string& path_string) { +bool IsAbsolutePath(const std::string& path_string) { #ifdef _WIN32 onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string); auto path = 
std::filesystem::path(ort_path_string.c_str()); @@ -219,7 +227,7 @@ bool IsAbsolutePath(std::string& path_string) { } // Like "../file_path" -bool IsRelativePathToParentPath(std::string& path_string) { +bool IsRelativePathToParentPath(const std::string& path_string) { #ifdef _WIN32 onnxruntime::PathString ort_path_string = onnxruntime::ToPathString(path_string); auto path = std::filesystem::path(ort_path_string.c_str()); @@ -236,6 +244,28 @@ bool IsRelativePathToParentPath(std::string& path_string) { #endif } +/* + * Get the weight-refitted engine cache path from a weight-stripped engine cache path + * + * Weight-stipped engine: + * An engine with weights stripped and its size is smaller than a regualr engine. + * The cache name of weight-stripped engine is TensorrtExecutionProvider_TRTKernel_XXXXX.stripped.engine + * + * Weight-refitted engine: + * An engine that its weights have been refitted and it's simply a regular engine. + * The cache name of weight-refitted engine is TensorrtExecutionProvider_TRTKernel_XXXXX.engine + */ +std::string GetWeightRefittedEnginePath(std::string stripped_engine_cache) { + std::filesystem::path stripped_engine_cache_path(stripped_engine_cache); + std::string refitted_engine_cache_path = stripped_engine_cache_path.stem().stem().string() + ".engine"; + return refitted_engine_cache_path; +} + +bool IsWeightStrippedEngineCache(std::filesystem::path& engine_cache_path) { + // The weight-stripped engine cache has the naming of xxx.stripped.engine + return engine_cache_path.stem().extension().string() == ".stripped"; +} + Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph_viewer) { if (!ValidateEPCtxNode(graph_viewer)) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "It's not a valid EP Context node"); @@ -271,6 +301,22 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph // The engine cache and context model (current model) should be in the same directory std::filesystem::path ctx_model_dir(GetPathOrParentPathOfCtxModel(ep_context_model_path_)); auto engine_cache_path = ctx_model_dir.append(cache_path); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] GetEpContextFromGraph engine_cache_path: " + engine_cache_path.string(); + + // If it's a weight-stripped engine cache, it needs to be refitted even though the refit flag is not enabled + if (!weight_stripped_engine_refit_) { + weight_stripped_engine_refit_ = IsWeightStrippedEngineCache(engine_cache_path); + } + + // If the serialized refitted engine is present, use it directly without refitting the engine again + if (weight_stripped_engine_refit_) { + const std::filesystem::path refitted_engine_cache_path = GetWeightRefittedEnginePath(engine_cache_path.string()); + if (std::filesystem::exists(refitted_engine_cache_path)) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] " + refitted_engine_cache_path.string() + " exists."; + engine_cache_path = refitted_engine_cache_path.string(); + weight_stripped_engine_refit_ = false; + } + } if (!std::filesystem::exists(engine_cache_path)) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, @@ -290,6 +336,21 @@ Status TensorRTCacheModelHandler::GetEpContextFromGraph(const GraphViewer& graph "TensorRT EP could not deserialize engine from cache: " + engine_cache_path.string()); } LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path.string(); + + if (weight_stripped_engine_refit_) { + const std::string onnx_model_filename = attrs.at(ONNX_MODEL_FILENAME).s(); + std::string weight_stripped_engine_cache = 
engine_cache_path.string(); + auto status = TensorrtExecutionProvider::RefitEngine(onnx_model_filename, + onnx_model_folder_path_, + weight_stripped_engine_cache, + true /* path check for security */, + (*trt_engine_).get(), + true /* serialize refitted engine to disk */, + detailed_build_log_); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } } return Status::OK(); } @@ -306,7 +367,13 @@ bool TensorRTCacheModelHandler::ValidateEPCtxNode(const GraphViewer& graph_viewe // Show the warning if compute capability is not matched if (attrs.count(COMPUTE_CAPABILITY) > 0) { std::string model_compute_capability = attrs.at(COMPUTE_CAPABILITY).s(); - if (model_compute_capability != compute_capability_) { + // Verify if engine was compiled with ampere+ hardware compatibility enabled + if (model_compute_capability == "80+") { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine is compatible to all Ampere+ GPU (except Jetson)"; + if (std::stoi(compute_capability_) < 80) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] However, this GPU doesn't match. The compute capability of the GPU: " << compute_capability_; + } + } else if (model_compute_capability != compute_capability_) { LOGS_DEFAULT(WARNING) << "[TensorRT EP] Engine was compiled for a different compatibility level and might not work or perform suboptimal"; LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the engine: " << model_compute_capability; LOGS_DEFAULT(WARNING) << "[TensorRT EP] The compute capability of the GPU: " << compute_capability_; diff --git a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h index 9f1e5178428e7..f8fefc12c3453 100644 --- a/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/tensorrt/onnx_ctx_model_helper.h @@ -5,6 +5,7 @@ #include #include +#include #include "core/providers/tensorrt/nv_includes.h" #include "core/providers/shared_library/provider_api.h" @@ -15,6 +16,7 @@ static const std::string EPCONTEXT_OP = "EPContext"; static const std::string EMBED_MODE = "embed_mode"; static const std::string EP_CACHE_CONTEXT = "ep_cache_context"; static const std::string COMPUTE_CAPABILITY = "hardware_architecture"; +static const std::string ONNX_MODEL_FILENAME = "onnx_model_filename"; static const std::string EPCONTEXT_OP_DOMAIN = "com.microsoft"; static const std::string EPCONTEXT_WARNING = "It's suggested to set the ORT graph optimization level to 0 and \ @@ -29,12 +31,13 @@ ONNX_NAMESPACE::ModelProto* CreateCtxModel(const GraphViewer& graph_viewer, char* engine_data, size_t size, const int64_t embed_mode, - std::string compute_capability, + const std::string compute_capability, + const std::string onnx_model_path, const logging::Logger* logger); std::string GetCtxModelPath(const std::string& ep_context_file_path, const std::string& original_model_path); -bool IsAbsolutePath(std::string& path_string); -bool IsRelativePathToParentPath(std::string& path_string); +bool IsAbsolutePath(const std::string& path_string); +bool IsRelativePathToParentPath(const std::string& path_string); void DumpCtxModel(ONNX_NAMESPACE::ModelProto* model_proto, const std::string& ctx_model_path); void UpdateCtxNodeModelEngineContext(ONNX_NAMESPACE::ModelProto* model_proto, @@ -46,7 +49,17 @@ class TensorRTCacheModelHandler { TensorRTCacheModelHandler(std::unique_ptr* trt_engine, nvinfer1::IRuntime* trt_runtime, std::string ep_context_model_path, - std::string 
compute_capability) : trt_engine_(trt_engine), trt_runtime_(trt_runtime), ep_context_model_path_(ep_context_model_path), compute_capability_(compute_capability) { + std::string compute_capability, + bool weight_stripped_engine_refit, + std::string onnx_model_folder_path, + bool detailed_build_log) + : trt_engine_(trt_engine), + trt_runtime_(trt_runtime), + ep_context_model_path_(ep_context_model_path), + compute_capability_(compute_capability), + weight_stripped_engine_refit_(weight_stripped_engine_refit), + onnx_model_folder_path_(onnx_model_folder_path), + detailed_build_log_(detailed_build_log) { } ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorRTCacheModelHandler); @@ -59,5 +72,8 @@ class TensorRTCacheModelHandler { nvinfer1::IRuntime* trt_runtime_; std::string ep_context_model_path_; // If using context model, it implies context model and engine cache is in the same directory std::string compute_capability_; + bool weight_stripped_engine_refit_; + std::string onnx_model_folder_path_; + bool detailed_build_log_; }; // TRTCacheModelHandler } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index bfd464c7007ac..45b5ee65b3164 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -71,6 +71,7 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapsetDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { + LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for network input " << tensor_name; return false; } } @@ -84,10 +85,12 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetOutput(j)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { + LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for tensor " << tensor_name; return false; } } else if (trt_layer->getType() == nvinfer1::LayerType::kCONSTANT) { nvinfer1::IConstantLayer* const_layer = static_cast(trt_layer); + const std::string const_layer_name = const_layer->getName(); auto trt_weights = const_layer->getWeights(); double max_weight = std::numeric_limits::min(); for (int64_t k = 0, end = trt_weights.count; k < end; ++k) { @@ -108,13 +111,19 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map(trt_weights.values)[k]; break; +#if NV_TENSORRT_MAJOR >= 10 + case nvinfer1::DataType::kINT64: + weight = static_cast(static_cast(trt_weights.values)[k]); + break; +#endif // NV_TENSORRT_MAJOR >= 10 default: - LOGS_DEFAULT(ERROR) << "Found unsupported datatype!"; + LOGS_DEFAULT(ERROR) << "Found unsupported datatype for layer " << const_layer_name; return false; } max_weight = std::max(max_weight, std::abs(weight)); } if (!trt_layer->getOutput(j)->setDynamicRange(static_cast(-max_weight), static_cast(max_weight))) { + LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for layer " << const_layer_name; return false; } } @@ -1062,8 +1071,12 @@ Status BindKernelOutput(Ort::KernelContext& ctx, CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8, int8_t) CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) +#if NV_TENSORRT_MAJOR >= 10 + CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) +#else // The allocation buffer holds the int32 output data since TRT doesn't support int64. 
So, we need to cast the data (int32 -> int64) for ORT kernel output. CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int32_t, int64_t) +#endif // The allocation buffer holds the float output data since TRT doesn't support double. So, we need to cast the data (float -> double) for ORT kernel output. CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, float, double) default: { @@ -1234,6 +1247,13 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv std::string profile_min_shapes, profile_max_shapes, profile_opt_shapes; + // incase the EP context is dumped the engine cache has to be enabled + auto enable_engine_cache_for_ep_context_model = [this]() { + if (dump_ep_context_model_ && ep_context_embed_mode_ == 0) { + engine_cache_enable_ = true; + } + }; + // Get environment variables if (info.has_trt_options) { max_partition_iterations_ = info.max_partition_iterations; @@ -1251,12 +1271,15 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv } dump_subgraphs_ = info.dump_subgraphs; engine_cache_enable_ = info.engine_cache_enable; + weight_stripped_engine_enable_ = info.weight_stripped_engine_enable; + onnx_model_folder_path_ = info.onnx_model_folder_path; timing_cache_enable_ = info.timing_cache_enable; force_timing_cache_match_ = info.force_timing_cache; detailed_build_log_ = info.detailed_build_log; dump_ep_context_model_ = info.dump_ep_context_model; ep_context_file_path_ = info.ep_context_file_path; ep_context_embed_mode_ = info.ep_context_embed_mode; + enable_engine_cache_for_ep_context_model(); if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) { cache_path_ = info.engine_cache_path; cache_prefix_ = info.engine_cache_prefix; @@ -1287,6 +1310,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv profile_max_shapes = info.profile_max_shapes; profile_opt_shapes = info.profile_opt_shapes; cuda_graph_enable_ = info.cuda_graph_enable; + engine_hw_compatible_ = info.engine_hw_compatible; } else { try { const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations); @@ -1350,6 +1374,16 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv engine_cache_enable_ = (std::stoi(engine_cache_enable_env) == 0 ? false : true); } + const std::string weight_stripped_engine_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kWeightStrippedEngineEnable); + if (!weight_stripped_engine_enable_env.empty()) { + weight_stripped_engine_enable_ = std::stoi(weight_stripped_engine_enable_env) != 0; + } + + const std::string onnx_model_folder_path_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kOnnxModelFolderPath); + if (!onnx_model_folder_path_env.empty()) { + onnx_model_folder_path_ = onnx_model_folder_path_env; + } + const std::string timing_cache_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTimingCacheEnable); if (!timing_cache_enable_env.empty()) { timing_cache_enable_ = (std::stoi(timing_cache_enable_env) == 0 ? 
false : true); @@ -1380,6 +1414,8 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv ep_context_embed_mode_ = std::stoi(ep_context_embed_mode_env); } + enable_engine_cache_for_ep_context_model(); + if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) { const std::string engine_cache_path = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath); cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath); @@ -1513,6 +1549,22 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv cache_path_ = GetPathOrParentPathOfCtxModel(ep_context_file_path_).append(cache_path_).string(); } + // Hardware compatibility: pre-check on environment + if (engine_cache_enable_ && engine_hw_compatible_) { +#if NV_TENSORRT_MAJOR == 8 && NV_TENSORRT_MINOR > 5 || NV_TENSORRT_MAJOR > 8 + if (std::stoi(compute_capability_) < 80) { + LOGS_DEFAULT(WARNING) << "Engine hardware compatibility cannot be enabled as GPU arch < 80. "; + engine_hw_compatible_ = false; + } else if (std::stoi(compute_capability_) == 87) { + LOGS_DEFAULT(WARNING) << "Engine hardware compatibility cannot be enabled on Jetson Orin. "; + engine_hw_compatible_ = false; + } +#else + LOGS_DEFAULT(WARNING) << "Engine hardware compatibility cannot be enabled as TRT < 8.6. "; + engine_hw_compatible_ = false; +#endif + } + if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) { if (!cache_path_.empty() && !fs::is_directory(cache_path_)) { if (!fs::create_directory(cache_path_)) { @@ -1619,6 +1671,8 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv << ", trt_dla_core: " << dla_core_ << ", trt_dump_subgraphs: " << dump_subgraphs_ << ", trt_engine_cache_enable: " << engine_cache_enable_ + << ", trt_weight_stripped_engine_enable: " << weight_stripped_engine_enable_ + << ", trt_onnx_model_folder_path: " << onnx_model_folder_path_ << ", trt_cache_path: " << cache_path_ << ", trt_global_cache_path: " << global_cache_path_ << ", trt_engine_decryption_enable: " << engine_decryption_enable_ @@ -1638,7 +1692,8 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv << ", trt_dump_ep_context_model: " << dump_ep_context_model_ << ", trt_ep_context_file_path: " << ep_context_file_path_ << ", trt_ep_context_embed_mode: " << ep_context_embed_mode_ - << ", trt_cache_prefix: " << cache_prefix_; + << ", trt_cache_prefix: " << cache_prefix_ + << ", trt_engine_hw_compatible: " << engine_hw_compatible_; } TensorrtExecutionProvider::~TensorrtExecutionProvider() { @@ -2271,7 +2326,6 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/) const { // Construct subgraph capability from node list std::vector> result; - // Get ModelPath const auto& path_string = graph.ModelPath().ToPathString(); #ifdef _WIN32 @@ -2462,6 +2516,67 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, return result; } +/** + * Refit the weight-stripped engine + */ +common::Status TensorrtExecutionProvider::RefitEngine(std::string onnx_model_filename, + std::string& onnx_model_folder_path, + std::string& weight_stripped_engine_cath_path, + bool path_check, + nvinfer1::ICudaEngine* trt_engine, + bool serialize_refitted_engine, + bool detailed_build_log) { +#if NV_TENSORRT_MAJOR >= 10 + std::filesystem::path onnx_model_path{onnx_model_folder_path}; + onnx_model_path.append(onnx_model_filename); + if (path_check && IsAbsolutePath(onnx_model_path.string())) { + 
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "For security purpose, the ONNX model path should be set with " + "a relative path, but it is an absolute path: " + + onnx_model_path.string()); + } + if (path_check && IsRelativePathToParentPath(onnx_model_path.string())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model path has '..'. For security purpose, it's not " + "allowed to point outside the directory."); + } + + if (!std::filesystem::exists(onnx_model_path)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "The ONNX model " + onnx_model_path.string() + + " does not exist."); + } + + // weight-stripped engine refit logic + TensorrtLogger& trt_logger = GetTensorrtLogger(detailed_build_log); + auto refitter = std::unique_ptr(nvinfer1::createInferRefitter(*trt_engine, trt_logger)); + auto parser_refitter = std::unique_ptr( + nvonnxparser::createParserRefitter(*refitter, trt_logger)); + if (!parser_refitter->refitFromFile(onnx_model_path.string().c_str())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP's IParserRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string()); + } + if (refitter->refitCudaEngine()) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Successfully refitted the weight-stripped engine."; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP's IRefitter could not refit deserialized weight-stripped engine with weights contained in: " + onnx_model_path.string()); + } + + // serialize the refitted engine to disk + if (serialize_refitted_engine) { + std::string refitted_engine_cache = GetWeightRefittedEnginePath(weight_stripped_engine_cath_path); + nvinfer1::IHostMemory* serialized_engine = trt_engine->serialize(); + std::ofstream engine_file(refitted_engine_cache, std::ios::binary | std::ios::out); + engine_file.write(reinterpret_cast(serialized_engine->data()), serialized_engine->size()); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialize the refitted engine to " << refitted_engine_cache; + } + return Status::OK(); +#else + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP's IParserRefitter can only be used on TRT 10.0 onwards."); +#endif +} + common::Status TensorrtExecutionProvider::Compile(const std::vector& fused_nodes_and_graphs, std::vector& node_compute_funcs) { for (auto& fused_node_graph : fused_nodes_and_graphs) { @@ -2485,7 +2600,11 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector= 10 + trt_config->setFlag(nvinfer1::BuilderFlag::kSTRIP_PLAN); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] STRIP_PLAN is enabled"; + trt_config->setFlag(nvinfer1::BuilderFlag::kREFIT_IDENTICAL); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] REFIT_IDENTICAL is enabled"; +#else + LOGS_DEFAULT(WARNING) << "[TensorRT EP] weight-stripped engines can only be used on TRT 10.0 onwards!"; +#endif + } + // limit used tactic sources if (!tactic_sources_.empty()) { nvinfer1::TacticSources tactics = trt_config->getTacticSources(); @@ -2813,13 +2943,29 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView cache_path = GetCachePath(cache_path_, trt_node_name_with_precision); } + std::string cache_hw_compat = "_sm" + compute_capability_; + // Enable hardware compatility mode if assigned + if (engine_cache_enable_ && engine_hw_compatible_) { + trt_config->setHardwareCompatibilityLevel(nvinfer1::HardwareCompatibilityLevel::kAMPERE_PLUS); + cache_hw_compat = "_sm80+"; + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Hardware compatibility is enabled 
when loading and capturing engine cache."; + } + // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity - const std::string cache_path_prefix = cache_path + "_sm" + compute_capability_; - const std::string engine_cache_path = cache_path_prefix + ".engine"; + const std::string cache_path_prefix = cache_path + cache_hw_compat; + std::string engine_cache_path = cache_path_prefix + ".engine"; const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted"; const std::string profile_cache_path = cache_path_prefix + ".profile"; + // If weight-stripped engine is enabled and refitted engine cache is not present, + // TRT EP will use the engine cache with ".stripped.engine" appended to the end. + const std::filesystem::path engine_cache_fs_path = engine_cache_path; + if (weight_stripped_engine_enable_ && !std::filesystem::exists(engine_cache_fs_path)) { + engine_cache_path = cache_path_prefix + ".stripped.engine"; + weight_stripped_engine_refit_ = true; + } + // Generate file name for dumping ep context model if (dump_ep_context_model_ && ctx_model_path_.empty()) { ctx_model_path_ = GetCtxModelPath(ep_context_file_path_, model_path_); @@ -2859,6 +3005,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP could not deserialize engine from cache: " + engine_cache_path); } + } else if (engine_decryption_enable_ && engine_cache_enable_ && std::filesystem::exists(encrypted_engine_cache_path) && !engine_update) { // Decrypt engine size_t engine_size = 0; @@ -2966,19 +3113,36 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView auto cache_file_name = std::filesystem::path(engine_cache_path).filename(); ep_cache_context_attr_ = std::filesystem::path(engine_cache_relative_path_to_context_model_dir).append(cache_file_name.string()).string(); } - + std::string compute_capability_hw_compat = compute_capability_; + if (engine_cache_enable_ && engine_hw_compatible_) { + compute_capability_hw_compat = "80+"; + } std::unique_ptr model_proto{CreateCtxModel(graph_body_viewer, ep_cache_context_attr_, reinterpret_cast(serialized_engine->data()), serialized_engine->size(), ep_context_embed_mode_, - compute_capability_, + compute_capability_hw_compat, + model_path_, GetLogger())}; DumpCtxModel(model_proto.get(), ctx_model_path_); } } } + if (weight_stripped_engine_refit_) { + auto status = RefitEngine(model_path_, + onnx_model_folder_path_, + engine_cache_path, + false /* path check for security */, + trt_engine.get(), + true /* serialize refitted engine to disk */, + detailed_build_log_); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + // Build context // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading @@ -3039,12 +3203,17 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView auto cache_file_name = std::filesystem::path(engine_cache_path).filename(); ep_cache_context_attr_ = std::filesystem::path(engine_cache_relative_path_to_context_model_dir).append(cache_file_name.string()).string(); } + std::string compute_capability_hw_compat = compute_capability_; + if 
(engine_cache_enable_ && engine_hw_compatible_) { + compute_capability_hw_compat = "80+"; + } model_proto_.reset(CreateCtxModel(graph_body_viewer, ep_cache_context_attr_, nullptr, 0, ep_context_embed_mode_, - compute_capability_, + compute_capability_hw_compat, + model_path_, GetLogger())); if (ep_context_embed_mode_ == 0) { DumpCtxModel(model_proto_.get(), ctx_model_path_); @@ -3065,11 +3234,12 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, - dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, - runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_, - dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_, - global_cache_path_, force_timing_cache_match_, detailed_build_log_, build_heuristics_enable_, sparsity_enable_, - builder_optimization_level_, auxiliary_streams_, !tactic_sources_.empty(), tactics, cuda_graph_enable_, cache_prefix_, cache_suffix}; + dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, + engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name], + context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_, + engine_decryption_, engine_encryption_, timing_cache_enable_, global_cache_path_, force_timing_cache_match_, + detailed_build_log_, build_heuristics_enable_, sparsity_enable_, builder_optimization_level_, + auxiliary_streams_, !tactic_sources_.empty(), tactics, cuda_graph_enable_, cache_prefix_, cache_suffix, engine_hw_compatible_}; *state = p.release(); return 0; }; @@ -3132,8 +3302,18 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView } else { cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); } - const std::string cache_path_prefix = cache_path + "_sm" + compute_capability_; - const std::string engine_cache_path = cache_path_prefix + ".engine"; + + // Enable hardware compatility mode if assigned + std::string cache_hw_compat = "_sm" + compute_capability_; + if (engine_cache_enable_ && engine_hw_compatible_) { + cache_hw_compat = "_sm80+"; + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Hardware compatibility is enabled when loading and capturing engine cache."; + } + + // Name the engine cache based on GPU compute capacity and reduce the chance of loading an incompatible cache + // Note: Engine cache generated on a GPU with large memory might not be loadable on a GPU with smaller memory, even if they share the same compute capacity + const std::string cache_path_prefix = cache_path + cache_hw_compat; + std::string engine_cache_path = cache_path_prefix + ".engine"; const std::string encrypted_engine_cache_path = engine_cache_path + ".encrypted"; const std::string profile_cache_path = cache_path_prefix + ".profile"; std::string timing_cache_path = ""; @@ -3141,6 +3321,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView timing_cache_path = GetTimingCachePath(global_cache_path_, compute_capability_); } + // If weight-stripped engine is enabled and refitted engine cache is not 
present, + // TRT EP will use the engine cache with ".stripped.engine" appended to the end. + const std::filesystem::path engine_cache_fs_path = engine_cache_path; + if (weight_stripped_engine_enable_ && !std::filesystem::exists(engine_cache_fs_path)) { + engine_cache_path = cache_path_prefix + ".stripped.engine"; + weight_stripped_engine_refit_ = true; + } + // Load serialized engine if (trt_state->engine_cache_enable && trt_engine == nullptr) { std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); @@ -3169,6 +3357,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + engine_cache_path; trt_engine = trt_state->engine->get(); context_update = true; + } else if (trt_state->engine_decryption_enable && std::filesystem::exists(encrypted_engine_cache_path) && profile_file) { shape_ranges = DeserializeProfileV2(profile_file); LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] DeSerialized " + profile_cache_path; @@ -3281,6 +3470,16 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView LOGS_DEFAULT(WARNING) << "[TensorRT EP] Auxiliary streams can only be set on TRT 8.6 onwards!"; } #endif + if (weight_stripped_engine_enable_) { +#if NV_TENSORRT_MAJOR >= 10 + trt_config->setFlag(nvinfer1::BuilderFlag::kSTRIP_PLAN); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] STRIP_PLAN is enabled"; + trt_config->setFlag(nvinfer1::BuilderFlag::kREFIT_IDENTICAL); + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] REFIT_IDENTICAL is enabled"; +#else + LOGS_DEFAULT(WARNING) << "[TensorRT EP] weight-stripped engines can only be used on TRT 10.0 onwards!"; +#endif + } // limit used tactic sources if (trt_state->filter_tactic_sources) { nvinfer1::TacticSources tactics = trt_config->getTacticSources(); @@ -3304,6 +3503,12 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView } } + // Enable hardware compatility mode if assigned + if (trt_state->engine_hw_compatible) { + trt_config->setHardwareCompatibilityLevel(nvinfer1::HardwareCompatibilityLevel::kAMPERE_PLUS); + LOGS_DEFAULT(INFO) << "[TensorRT EP] Re-generate engine with hardware compatibility enabled."; + } + // Build engine std::unique_ptr serialized_engine; { @@ -3375,6 +3580,19 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView DumpCtxModel(model_proto_.get(), ctx_model_path_); } context_update = true; + + if (weight_stripped_engine_refit_) { + auto status = RefitEngine(model_path_, + onnx_model_folder_path_, + engine_cache_path, + false /* path check for security */, + trt_engine, + true /* serialize refitted engine to disk */, + detailed_build_log_); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } } if (context_update) { @@ -3575,7 +3793,13 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con std::unordered_map output_types; // TRT engine output name -> ORT output tensor type // Get engine binary data and deserialize it - auto trt_cache_model_handler = TensorRTCacheModelHandler(&trt_engine, runtime_.get(), model_path_, compute_capability_); + auto trt_cache_model_handler = TensorRTCacheModelHandler(&trt_engine, + runtime_.get(), + model_path_, + compute_capability_, + weight_stripped_engine_enable_, + onnx_model_folder_path_, + detailed_build_log_); auto status = trt_cache_model_handler.GetEpContextFromGraph(graph_body_viewer); if (status != Status::OK()) { return 
ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index eabbbdea1c4ac..f4dae57487f51 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -27,7 +27,9 @@ static const std::string kDLACore = "ORT_TENSORRT_DLA_CORE"; static const std::string kDumpSubgraphs = "ORT_TENSORRT_DUMP_SUBGRAPHS"; static const std::string kEngineCacheEnable = "ORT_TENSORRT_ENGINE_CACHE_ENABLE"; static const std::string kCachePath = "ORT_TENSORRT_CACHE_PATH"; -// As a timing cache can be used across multiple ONNX files it makes sense to have a seperate cache path +static const std::string kWeightStrippedEngineEnable = "ORT_TENSORRT_WEIGHT_STRIPPED_ENGINE_ENABLE"; +static const std::string kOnnxModelFolderPath = "ORT_TENSORRT_ONNX_MODEL_FOLDER_PATH"; +// As a timing cache can be used across multiple ONNX files it makes sense to have a separate cache path static const std::string kTimingCachePath = "ORT_TENSORRT_GLOBAL_CACHE_PATH"; static const std::string kDecryptionEnable = "ORT_TENSORRT_ENGINE_DECRYPTION_ENABLE"; static const std::string kDecryptionLibPath = "ORT_TENSORRT_ENGINE_DECRYPTION_LIB_PATH"; @@ -191,6 +193,7 @@ struct TensorrtFuncState { bool cuda_graph_enable = 0; std::string cache_prefix; std::string cache_suffix; + bool engine_hw_compatible = false; }; // Minimum information to construct kernel function state for direct engine load code path @@ -217,6 +220,7 @@ struct SubGraphContext { using SubGraphContextMap = std::unordered_map>; using DDSOutputAllocatorMap = std::unordered_map>; +std::string GetWeightRefittedEnginePath(std::string engine_cache_path); // Logical device representation. class TensorrtExecutionProvider : public IExecutionProvider { @@ -263,6 +267,17 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool IsGraphCaptured(int graph_annotation_id) const override; Status ReplayGraph(int graph_annotation_id) override; + /** + * Refit the weight-stripped engine + */ + static common::Status RefitEngine(std::string onnx_model_filename, + std::string& onnx_model_folder_path, + std::string& weight_stripped_engine_cath_path, + bool path_check, + nvinfer1::ICudaEngine* trt_engine, + bool serialize_refitted_engine, + bool detailed_build_log); + private: mutable TensorrtExecutionProviderInfo info_; bool external_stream_ = false; @@ -280,6 +295,9 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool int8_use_native_tensorrt_calibration_table_ = false; bool dump_subgraphs_ = false; bool engine_cache_enable_ = false; + bool weight_stripped_engine_enable_ = false; + bool weight_stripped_engine_refit_ = false; + std::string onnx_model_folder_path_; bool build_heuristics_enable_ = false; bool sparsity_enable_ = false; int builder_optimization_level_ = 3; @@ -303,6 +321,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool detailed_build_log_ = false; bool cuda_graph_enable_ = false; std::string cache_prefix_; + bool engine_hw_compatible_ = false; // The OrtAllocator object will be get during ep compute time // and should be kept for the lifetime of TRT EP object. 
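A minimal Python sketch of how the provider options introduced in this patch (trt_weight_stripped_engine_enable, trt_onnx_model_folder_path, trt_engine_hw_compatible) might be passed to the TensorRT EP; the model path and folder name below are placeholders:

```python
import onnxruntime as ort

# Option names come from this patch; values are parsed as "True"/"False" strings
# by the Python bindings (see onnxruntime_pybind_state.cc changes above).
trt_options = {
    "trt_engine_cache_enable": "True",
    "trt_weight_stripped_engine_enable": "True",
    # Folder, relative to the current working directory, holding the ONNX model
    # whose weights are used to refit the weight-stripped engine.
    "trt_onnx_model_folder_path": "model_dir",
    "trt_engine_hw_compatible": "True",
}

sess = ort.InferenceSession(
    "model_dir/model.onnx",  # placeholder model path
    providers=[("TensorrtExecutionProvider", trt_options)],
)
```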
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc index cd2087c9d7472..9fe39f5921e1c 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc @@ -27,6 +27,8 @@ constexpr const char* kDLACore = "trt_dla_core"; constexpr const char* kDumpSubgraphs = "trt_dump_subgraphs"; constexpr const char* kEngineCacheEnable = "trt_engine_cache_enable"; constexpr const char* kEngineCachePath = "trt_engine_cache_path"; +constexpr const char* kWeightStrippedEngineEnable = "trt_weight_stripped_engine_enable"; +constexpr const char* kOnnxModelFolderPath = "trt_onnx_model_folder_path"; constexpr const char* kEngineCachePrefix = "trt_engine_cache_prefix"; constexpr const char* kDecryptionEnable = "trt_engine_decryption_enable"; constexpr const char* kDecryptionLibPath = "trt_engine_decryption_lib_path"; @@ -51,6 +53,8 @@ constexpr const char* kCudaGraphEnable = "trt_cuda_graph_enable"; constexpr const char* kEpContextEmbedMode = "trt_ep_context_embed_mode"; constexpr const char* kEpContextFilePath = "trt_ep_context_file_path"; constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model"; +constexpr const char* kEngineHwCompatible = "trt_engine_hw_compatible"; + } // namespace provider_option_names } // namespace tensorrt @@ -92,6 +96,8 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions .AddAssignmentToReference(tensorrt::provider_option_names::kDumpSubgraphs, info.dump_subgraphs) .AddAssignmentToReference(tensorrt::provider_option_names::kEngineCacheEnable, info.engine_cache_enable) .AddAssignmentToReference(tensorrt::provider_option_names::kEngineCachePath, info.engine_cache_path) + .AddAssignmentToReference(tensorrt::provider_option_names::kWeightStrippedEngineEnable, info.weight_stripped_engine_enable) + .AddAssignmentToReference(tensorrt::provider_option_names::kOnnxModelFolderPath, info.onnx_model_folder_path) .AddAssignmentToReference(tensorrt::provider_option_names::kEngineCachePrefix, info.engine_cache_prefix) .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionEnable, info.engine_decryption_enable) .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path) @@ -115,6 +121,7 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions .AddAssignmentToReference(tensorrt::provider_option_names::kDumpEpContextModel, info.dump_ep_context_model) .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextFilePath, info.ep_context_file_path) .AddAssignmentToReference(tensorrt::provider_option_names::kEpContextEmbedMode, info.ep_context_embed_mode) + .AddAssignmentToReference(tensorrt::provider_option_names::kEngineHwCompatible, info.engine_hw_compatible) .Parse(options)); // add new provider option here. 
info.user_compute_stream = user_compute_stream; @@ -139,6 +146,8 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE {tensorrt::provider_option_names::kDumpSubgraphs, MakeStringWithClassicLocale(info.dump_subgraphs)}, {tensorrt::provider_option_names::kEngineCacheEnable, MakeStringWithClassicLocale(info.engine_cache_enable)}, {tensorrt::provider_option_names::kEngineCachePath, MakeStringWithClassicLocale(info.engine_cache_path)}, + {tensorrt::provider_option_names::kWeightStrippedEngineEnable, MakeStringWithClassicLocale(info.weight_stripped_engine_enable)}, + {tensorrt::provider_option_names::kOnnxModelFolderPath, MakeStringWithClassicLocale(info.onnx_model_folder_path)}, {tensorrt::provider_option_names::kEngineCachePrefix, MakeStringWithClassicLocale(info.engine_cache_prefix)}, {tensorrt::provider_option_names::kDecryptionEnable, MakeStringWithClassicLocale(info.engine_decryption_enable)}, {tensorrt::provider_option_names::kDecryptionLibPath, MakeStringWithClassicLocale(info.engine_decryption_lib_path)}, @@ -163,6 +172,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE {tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.dump_ep_context_model)}, {tensorrt::provider_option_names::kEpContextFilePath, MakeStringWithClassicLocale(info.ep_context_file_path)}, {tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.ep_context_embed_mode)}, + {tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.engine_hw_compatible)}, }; return options; } @@ -180,6 +190,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor const std::string kProfilesMaxShapes_ = empty_if_null(info.trt_profile_max_shapes); const std::string kProfilesOptShapes_ = empty_if_null(info.trt_profile_opt_shapes); const std::string kEpContextFilePath_ = empty_if_null(info.trt_ep_context_file_path); + const std::string kOnnxModelFolderPath_ = empty_if_null(info.trt_onnx_model_folder_path); const ProviderOptions options{ {tensorrt::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)}, @@ -198,6 +209,8 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor {tensorrt::provider_option_names::kEngineCacheEnable, MakeStringWithClassicLocale(info.trt_engine_cache_enable)}, {tensorrt::provider_option_names::kEngineCachePath, kEngineCachePath_}, {tensorrt::provider_option_names::kEngineCachePrefix, kEngineCachePrefix_}, + {tensorrt::provider_option_names::kWeightStrippedEngineEnable, MakeStringWithClassicLocale(info.trt_weight_stripped_engine_enable)}, + {tensorrt::provider_option_names::kOnnxModelFolderPath, kOnnxModelFolderPath_}, {tensorrt::provider_option_names::kDecryptionEnable, MakeStringWithClassicLocale(info.trt_engine_decryption_enable)}, {tensorrt::provider_option_names::kDecryptionLibPath, kDecryptionLibPath_}, {tensorrt::provider_option_names::kForceSequentialEngineBuild, MakeStringWithClassicLocale(info.trt_force_sequential_engine_build)}, @@ -220,6 +233,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor {tensorrt::provider_option_names::kEpContextFilePath, kEpContextFilePath_}, {tensorrt::provider_option_names::kDumpEpContextModel, MakeStringWithClassicLocale(info.trt_dump_ep_context_model)}, {tensorrt::provider_option_names::kEpContextEmbedMode, MakeStringWithClassicLocale(info.trt_ep_context_embed_mode)}, + 
{tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.trt_engine_hw_compatible)}, }; return options; } @@ -289,6 +303,8 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options trt_provider_options_v2.trt_dla_core = internal_options.dla_core; trt_provider_options_v2.trt_dump_subgraphs = internal_options.dump_subgraphs; trt_provider_options_v2.trt_engine_cache_enable = internal_options.engine_cache_enable; + trt_provider_options_v2.trt_weight_stripped_engine_enable = internal_options.weight_stripped_engine_enable; + trt_provider_options_v2.trt_onnx_model_folder_path = copy_string_if_needed(internal_options.onnx_model_folder_path); trt_provider_options_v2.trt_engine_cache_path = copy_string_if_needed(internal_options.engine_cache_path); trt_provider_options_v2.trt_engine_cache_prefix = copy_string_if_needed(internal_options.engine_cache_prefix); @@ -319,5 +335,6 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options trt_provider_options_v2.trt_dump_ep_context_model = internal_options.dump_ep_context_model; trt_provider_options_v2.trt_ep_context_embed_mode = internal_options.ep_context_embed_mode; trt_provider_options_v2.trt_ep_context_file_path = copy_string_if_needed(internal_options.ep_context_file_path); + trt_provider_options_v2.trt_engine_hw_compatible = internal_options.engine_hw_compatible; } } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 80424b8d6d196..3b859ea2da466 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -32,6 +32,8 @@ struct TensorrtExecutionProviderInfo { bool dump_subgraphs{false}; bool engine_cache_enable{false}; std::string engine_cache_path{""}; + bool weight_stripped_engine_enable{false}; + std::string onnx_model_folder_path{""}; bool engine_decryption_enable{false}; std::string engine_decryption_lib_path{""}; bool force_sequential_engine_build{false}; @@ -55,6 +57,7 @@ struct TensorrtExecutionProviderInfo { std::string ep_context_file_path{""}; int ep_context_embed_mode{0}; std::string engine_cache_prefix{""}; + bool engine_hw_compatible{false}; static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index 568da57a50956..6430ffab09976 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -90,6 +90,8 @@ struct Tensorrt_Provider : Provider { info.dump_subgraphs = options.trt_dump_subgraphs != 0; info.engine_cache_enable = options.trt_engine_cache_enable != 0; info.engine_cache_path = options.trt_engine_cache_path == nullptr ? "" : options.trt_engine_cache_path; + info.weight_stripped_engine_enable = options.trt_weight_stripped_engine_enable != 0; + info.onnx_model_folder_path = options.trt_onnx_model_folder_path == nullptr ? "" : options.trt_onnx_model_folder_path; info.engine_decryption_enable = options.trt_engine_decryption_enable != 0; info.engine_decryption_lib_path = options.trt_engine_decryption_lib_path == nullptr ? 
"" : options.trt_engine_decryption_lib_path; info.force_sequential_engine_build = options.trt_force_sequential_engine_build != 0; @@ -113,6 +115,7 @@ struct Tensorrt_Provider : Provider { info.ep_context_file_path = options.trt_ep_context_file_path == nullptr ? "" : options.trt_ep_context_file_path; info.ep_context_embed_mode = options.trt_ep_context_embed_mode; info.engine_cache_prefix = options.trt_engine_cache_prefix == nullptr ? "" : options.trt_engine_cache_prefix; + info.engine_hw_compatible = options.trt_engine_hw_compatible != 0; return std::make_shared(info); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 80bd301c201b3..b16da60f7c7c1 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1643,6 +1643,7 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti trt_options_converted.trt_ep_context_file_path = ""; trt_options_converted.trt_ep_context_embed_mode = 0; trt_options_converted.trt_engine_cache_prefix = ""; + trt_options_converted.trt_engine_hw_compatible = 0; return trt_options_converted; } @@ -2256,6 +2257,7 @@ ORT_API(void, OrtApis::ReleaseTensorRTProviderOptions, _Frees_ptr_opt_ OrtTensor delete[] ptr->trt_profile_max_shapes; delete[] ptr->trt_profile_opt_shapes; delete[] ptr->trt_ep_context_file_path; + delete[] ptr->trt_onnx_model_folder_path; } std::unique_ptr p(ptr); diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 7fc6515d3d50a..051e870256012 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -475,7 +475,9 @@ std::unique_ptr CreateExecutionProviderInstance( // So we need these std::string variables defined here as they will be kept alive for the lifetime of TRT EP and we can still access them from OrtTensorRTProviderOptionsV2 instance. // (The reason is string copy is involved, for example params.trt_engine_cache_path = cache_path.c_str() and those std::string variable is referenced by OrtTensorRTProviderOptionsV2 instance // and TRT EP instance, so it won't be released.) - std::string calibration_table, cache_path, cache_prefix, timing_cache_path, lib_path, trt_tactic_sources, trt_extra_plugin_lib_paths, min_profile, max_profile, opt_profile, ep_context_file_path; + std::string calibration_table, cache_path, cache_prefix, timing_cache_path, lib_path, trt_tactic_sources, + trt_extra_plugin_lib_paths, min_profile, max_profile, opt_profile, ep_context_file_path, + onnx_model_folder_path; auto it = provider_options_map.find(type); if (it != provider_options_map.end()) { OrtTensorRTProviderOptionsV2 params; @@ -588,6 +590,21 @@ std::unique_ptr CreateExecutionProviderInstance( } else { ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_engine_cache_prefix' should be a string to customize engine cache prefix i.e. 'FRCNN' or 'yolov4'.\n"); } + } else if (option.first == "trt_weight_stripped_engine_enable") { + if (option.second == "True" || option.second == "true") { + params.trt_weight_stripped_engine_enable = true; + } else if (option.second == "False" || option.second == "false") { + params.trt_weight_stripped_engine_enable = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_weight_stripped_engine_enable' should be 'True' or 'False'. 
Default value is 'False'.\n"); + } + } else if (option.first == "trt_onnx_model_folder_path") { + if (!option.second.empty()) { + onnx_model_folder_path = option.second; + params.trt_onnx_model_folder_path = onnx_model_folder_path.c_str(); + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_onnx_model_folder_path' should be a path string i.e. 'engine_cache'.\n"); + } } else if (option.first == "trt_engine_decryption_enable") { if (option.second == "True" || option.second == "true") { params.trt_engine_decryption_enable = true; @@ -750,6 +767,14 @@ std::unique_ptr CreateExecutionProviderInstance( } else { ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_ep_context_embed_mode' should be a positive integer number i.e. '1'.\n"); } + } else if (option.first == "trt_engine_hw_compatible") { + if (option.second == "True" || option.second == "true") { + params.trt_engine_hw_compatible = true; + } else if (option.second == "False" || option.second == "false") { + params.trt_engine_hw_compatible = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_engine_hw_compatible' should be 'True' or 'False'. Default value is 'False'.\n"); + } } else { ORT_THROW("Invalid TensorRT EP option: ", option.first); } diff --git a/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py b/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py new file mode 100644 index 0000000000000..492de13fb42b5 --- /dev/null +++ b/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py @@ -0,0 +1,132 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +import csv +import datetime +import os +import re + +import pandas as pd +from azure.kusto.data import KustoConnectionStringBuilder +from azure.kusto.ingest import QueuedIngestClient +from post import get_identifier, parse_arguments, write_table + + +def parse_valgrind_log(input_path, output_path, keywords): + is_definitely_lost = False + is_ort_trt_related = False + buffer = [] + leak_block = None + leak_bytes = None + keyword = None + results = [] + + with open(input_path) as file: + lines = file.readlines() + + for line in lines: + line = line.strip() # noqa: PLW2901 + # Remove "==xxxxx==" pattern from the line + line = line.split("==")[-1].strip() # noqa: PLW2901 + + if "blocks are definitely lost in loss" in line: + is_definitely_lost = True + # Extract LeakBlock and LeakBytes + match = re.search(r"([\d,]+) byte[s]? 
in ([\d,]+) block[s]?", line) + if match: + leak_bytes = match.group(1).replace(",", "") + leak_block = match.group(2).replace(",", "") + continue + + if is_definitely_lost: + if line: + buffer.append(line) + for word in keywords: + if word in line: + is_ort_trt_related = True + keyword = word + break + + # End of section + if is_definitely_lost and not line: + if is_ort_trt_related: + results.append((keyword, leak_block, leak_bytes, "\n".join(buffer))) + # Reset var + is_definitely_lost = False + is_ort_trt_related = False + buffer = [] + leak_block = None + leak_bytes = None + keyword = None + + # Writing results to CSV + with open(output_path, "w", newline="") as csvfile: + csvwriter = csv.writer(csvfile) + csvwriter.writerow(["Keyword", "LeakBlock", "LeakBytes", "ValgrindMessage"]) + for entry in results: + csvwriter.writerow([entry[0], entry[1], entry[2], entry[3]]) + + +def parse_concurrency_test_log(input_path, output_path): + with open(input_path) as log_file: + log_content = log_file.read() + + failed_cases_section = log_content.split("Failed Test Cases:")[1] + + # passed = 1 if no failed test cases + if failed_cases_section.strip() == "": + passed = 1 + else: + passed = 0 + + with open(output_path, "w", newline="") as csv_file: + csv_writer = csv.writer(csv_file) + csv_writer.writerow(["Passed", "Log"]) + csv_writer.writerow([passed, log_content]) + + +if __name__ == "__main__": + args = parse_arguments() + + # connect to database + kcsb_ingest = KustoConnectionStringBuilder.with_az_cli_authentication(args.kusto_conn) + ingest_client = QueuedIngestClient(kcsb_ingest) + identifier = get_identifier( + args.commit_datetime, args.commit_hash, args.trt_version, args.branch, args.use_tensorrt_oss_parser + ) + upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0) + + try: + result_mem_test_path = args.report_folder + os.chdir(result_mem_test_path) + # Parse mem_test log + logs = ["valgrind.log", "concurrency_test.log"] + csv_paths = ["mem_test.csv", "concurrency_test.csv"] + for log, csv_path in zip(logs, csv_paths): + if os.path.exists(log): + print(f"{identifier}: Parsing {log}") + if log == logs[0]: + parse_valgrind_log(log, csv_path, ["TensorrtExecutionProvider", "TensorRT"]) + else: + parse_concurrency_test_log(log, csv_path) + + # Upload to db + for csv_path, db_table_name in zip(csv_paths, ["ep_valgrind_record", "ep_concurrencytest_record"]): + if os.path.exists(csv_path): + table = pd.read_csv(csv_path) + write_table( + ingest_client, + args.database, + table, + db_table_name, + upload_time, + identifier, + args.branch, + args.commit_hash, + args.commit_datetime, + ) + print(f"{identifier}: {csv_path} is synced to db") + + except Exception as e: + print(str(e)) diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py index fe941096e2fe8..9b78743d7e751 100644 --- a/onnxruntime/python/tools/tensorrt/perf/post.py +++ b/onnxruntime/python/tools/tensorrt/perf/post.py @@ -3,7 +3,6 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import argparse -import csv import datetime import os import sys @@ -421,11 +420,10 @@ def main(): upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0) try: - # Load EP Perf test results from /result result_file = args.report_folder - result_perf_test_path = os.path.join(result_file, "result") - folders = os.listdir(result_perf_test_path) - os.chdir(result_perf_test_path) + + folders = os.listdir(result_file) + os.chdir(result_file) tables = [ fail_name, @@ -448,13 +446,13 @@ def main(): for model_group in folders: os.chdir(model_group) csv_filenames = os.listdir() - for csv_file in csv_filenames: - table = pd.read_csv(csv_file) - if session_name in csv_file: + for csv in csv_filenames: + table = pd.read_csv(csv) + if session_name in csv: table_results[session_name] = pd.concat( [table_results[session_name], get_session(table, model_group)], ignore_index=True ) - elif specs_name in csv_file: + elif specs_name in csv: table_results[specs_name] = pd.concat( [ table_results[specs_name], @@ -462,12 +460,12 @@ def main(): ], ignore_index=True, ) - elif fail_name in csv_file: + elif fail_name in csv: table_results[fail_name] = pd.concat( [table_results[fail_name], get_failures(table, model_group)], ignore_index=True, ) - elif latency_name in csv_file: + elif latency_name in csv: table_results[memory_name] = pd.concat( [table_results[memory_name], get_memory(table, model_group)], ignore_index=True, @@ -477,11 +475,11 @@ def main(): [table_results[latency_name], get_latency(table, model_group)], ignore_index=True, ) - elif status_name in csv_file: + elif status_name in csv: table_results[status_name] = pd.concat( [table_results[status_name], get_status(table, model_group)], ignore_index=True ) - elif op_metrics_name in csv_file: + elif op_metrics_name in csv: table = table.assign(Group=model_group) table_results[op_metrics_name] = pd.concat( [table_results[op_metrics_name], table], ignore_index=True @@ -515,43 +513,6 @@ def main(): args.commit_datetime, ) - # Load concurrency test results - result_mem_test_path = os.path.join(result_file, "result_mem_test") - os.chdir(result_mem_test_path) - log_path = "concurrency_test.log" - if os.path.exists(log_path): - print("Generating concurrency test report") - with open(log_path) as log_file: - log_content = log_file.read() - - failed_cases_section = log_content.split("Failed Test Cases:")[1] - - # passed = 1 if no failed test cases - if failed_cases_section.strip() == "": - passed = 1 - else: - passed = 0 - - csv_path = "concurrency_test.csv" - with open(csv_path, "w", newline="") as csv_file: - csv_writer = csv.writer(csv_file) - csv_writer.writerow(["Passed", "Log"]) - csv_writer.writerow([passed, log_content]) - - db_table_name = "ep_concurrencytest_record" - table = pd.read_csv(csv_path) - write_table( - ingest_client, - args.database, - table, - db_table_name, - upload_time, - identifier, - args.branch, - args.commit_hash, - args.commit_datetime, - ) - except BaseException as e: print(str(e)) sys.exit(1) diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index 62291762f61b8..175079d8197bf 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -112,6 +112,9 @@ namespace perftest { "\t [TensorRT only] [trt_engine_cache_enable]: Enable engine caching.\n" "\t [TensorRT only] [trt_engine_cache_path]: Specify engine cache 
path.\n" "\t [TensorRT only] [trt_engine_cache_prefix]: Customize engine cache prefix when trt_engine_cache_enable is true.\n" + "\t [TensorRT only] [trt_engine_hw_compatible]: Enable hardware compatibility. Engines ending with '_sm80+' can be re-used across all Ampere+ GPU (a hardware-compatible engine may have lower throughput and/or higher latency than its non-hardware-compatible counterpart).\n" + "\t [TensorRT only] [trt_weight_stripped_engine_enable]: Enable weight-stripped engine build.\n" + "\t [TensorRT only] [trt_onnx_model_folder_path]: Folder path for the ONNX model with weights.\n" "\t [TensorRT only] [trt_force_sequential_engine_build]: Force TensorRT engines to be built sequentially.\n" "\t [TensorRT only] [trt_context_memory_sharing_enable]: Enable TensorRT context memory sharing between subgraphs.\n" "\t [TensorRT only] [trt_layer_norm_fp32_fallback]: Force Pow + Reduce ops in layer norm to run in FP32 to avoid overflow.\n" diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 19a95730c565d..0d2ad51482078 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -1548,7 +1548,11 @@ def generate_build_tree( and not args.build_wasm ): if is_windows(): - cflags += ["/guard:cf", "/DWIN32", "/D_WINDOWS"] + # DLL initialization errors due to old conda msvcp140.dll dll are a result of the new MSVC compiler + # See https://developercommunity.visualstudio.com/t/Access-violation-with-std::mutex::lock-a/10664660#T-N10668856 + # Remove this definition (_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR) + # once the conda msvcp140.dll dll is updated. + cflags += ["/guard:cf", "/DWIN32", "/D_WINDOWS", "/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR"] if not args.use_gdk: # Target Windows 10 cflags += [ diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index a16647f17280d..7cfff805c3b3c 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -1,7 +1,7 @@ parameters: - name: PostToDashboard - displayName: Post to Dashboard + displayName: Post EP Perf results to Dashboard type: boolean default: true @@ -30,7 +30,7 @@ parameters: - "partner-models" - name: MemTest - displayName: Run Memory Test and Concurrency Test + displayName: Run Memory and Concurrency Test type: boolean default: true @@ -147,11 +147,27 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/' condition: always() - - task: PublishBuildArtifacts@1 - inputs: - pathtoPublish: '$(Build.SourcesDirectory)/Artifact' - artifactName: 'result-$(Build.BuildNumber)' - + - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs' + displayName: 'Install dashboard dependencies' + + - script: | + az --version || { + echo "Azure CLI not found, installing..." 
+ curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash + } + displayName: 'Check and Install Azure CLI' + + - task: AzureCLI@2 + displayName: 'Parse Memory & Concurrency Test Records and Sync' + inputs: + azureSubscription: AIInfraBuildOnnxRuntimeOSS + scriptLocation: inlineScript + scriptType: bash + inlineScript: | + short_hash=$(git rev-parse --short HEAD) && + commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) && + python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/parse_mem_concurrency_test.py -r $(Build.SourcesDirectory)/Artifact/result_mem_test -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser) + - ${{ if eq(parameters.PostToDashboard, true) }}: - script: 'python3 -m pip install pandas azure-kusto-data[pandas] azure-kusto-ingest[pandas] coloredlogs' @@ -165,7 +181,7 @@ jobs: displayName: 'Check and Install Azure CLI' - task: AzureCLI@2 - displayName: 'Post EP Perf Results to Dashboard' + displayName: 'Azure CLI Post to Dashboard' inputs: azureSubscription: AIInfraBuildOnnxRuntimeOSS scriptLocation: inlineScript @@ -173,7 +189,12 @@ jobs: inlineScript: | short_hash=$(git rev-parse --short HEAD) && commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) && - python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser) + python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser) + + - task: PublishBuildArtifacts@1 + inputs: + pathtoPublish: '$(Build.SourcesDirectory)/Artifact' + artifactName: 'result-$(Build.BuildNumber)' - template: templates/component-governance-component-detection-steps.yml parameters : diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index e00425739b711..85722c1cb8d2a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.150 + version: 1.0.164 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.150 + version: 1.0.164 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here.
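Usage sketch for the new TensorRT EP options wired through the Python bindings in this patch (trt_weight_stripped_engine_enable, trt_onnx_model_folder_path, trt_engine_hw_compatible). The model path, cache directory, and weights folder below are placeholders, not values taken from this change; the two boolean keys take "True"/"False" strings, matching the validation added in onnxruntime_pybind_state.cc.

import onnxruntime as ort

trt_options = {
    "trt_engine_cache_enable": "True",
    "trt_engine_cache_path": "./trt_cache",                # placeholder cache directory
    "trt_weight_stripped_engine_enable": "True",           # build/load weight-stripped engines
    "trt_onnx_model_folder_path": "./model_with_weights",  # placeholder folder holding the weight-full ONNX model
    "trt_engine_hw_compatible": "True",                    # hardware-compatible ('_sm80+') engines, Ampere+ GPUs only
}

session = ort.InferenceSession(
    "model.onnx",  # placeholder model path
    providers=[("TensorrtExecutionProvider", trt_options), "CUDAExecutionProvider"],
)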
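The new parse_mem_concurrency_test.py can also be exercised locally against a single pair of logs, skipping the Kusto upload step the pipeline performs. A minimal sketch, assuming the log files sit in the current directory (the file names mirror the defaults in the script's __main__ block):

from parse_mem_concurrency_test import parse_concurrency_test_log, parse_valgrind_log

# Valgrind leak report -> Keyword / LeakBlock / LeakBytes / ValgrindMessage rows
parse_valgrind_log("valgrind.log", "mem_test.csv", ["TensorrtExecutionProvider", "TensorRT"])

# Concurrency test log -> Passed / Log row (Passed is 1 only when the "Failed Test Cases:" section is empty)
parse_concurrency_test_log("concurrency_test.log", "concurrency_test.csv")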
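For reference, a sketch of the valgrind summary line format that parse_valgrind_log() keys on; the sample line is illustrative only, not output captured from this pipeline.

import re

sample = "==12345== 1,024 bytes in 2 blocks are definitely lost in loss record 7 of 20"
line = sample.split("==")[-1].strip()                    # drop the "==pid==" prefix, as the parser does
match = re.search(r"([\d,]+) byte[s]? in ([\d,]+) block[s]?", line)
if match:
    leak_bytes = match.group(1).replace(",", "")         # "1024"
    leak_block = match.group(2).replace(",", "")         # "2"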