From 74492a8b2d2838b3ce06d1bdb60e117941503f9e Mon Sep 17 00:00:00 2001 From: Benjamin Braun Date: Mon, 9 Dec 2024 18:01:14 +0000 Subject: [PATCH 1/2] Add helper functions in HTTPAPIServer to pull metrics for use in HandleGenerate, adding kv_utilization and max_token_capacity to the inference response header. --- src/http_server.cc | 150 +++++++++++++++++++++++++++++++++++++++++++++ src/http_server.h | 20 +++++- 2 files changed, 169 insertions(+), 1 deletion(-) diff --git a/src/http_server.cc b/src/http_server.cc index 99aed411b5..462880d1cf 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -3225,6 +3225,42 @@ HTTPAPIServer::HandleGenerate( req, RestrictedCategory::INFERENCE, restricted_apis_); AddContentTypeHeader(req, "application/json"); +#ifdef TRITON_ENABLE_METRICS + // logic to add kv_cache metrics to response header + // Get the metrics in Prometheus format + if (std::getenv("ORCA_HEADER_METRIC_TYPE") != nullptr) { + const std::string orca_type = std::getenv("ORCA_HEADER_METRIC_TYPE"); + TRITONSERVER_Metrics* metrics = nullptr; + TRITONSERVER_Error* err = + TRITONSERVER_ServerMetrics(server_.get(), &metrics); + if (err == nullptr) { + const char* base; + size_t byte_size; + err = TRITONSERVER_MetricsFormatted( + metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size); + if (err == nullptr) { + std::string kv_utilization(base, byte_size); + // Extract the KV utilization metrics from the Prometheus formatted + // string. 
+ std::string extracted_kv_metrics = + ExtractKVMetrics(kv_utilization, orca_type); + if (!extracted_kv_metrics.empty()) { + evhtp_headers_add_header( + req->headers_out, + evhtp_header_new( + "endpoint-load-metrics", extracted_kv_metrics.c_str(), 1, 1)); + } + } + } + TRITONSERVER_MetricsDelete(metrics); + // Handle potential errors + if (err != nullptr) { + LOG_ERROR << "Failed to get KV metrics: " + << TRITONSERVER_ErrorMessage(err); + TRITONSERVER_ErrorDelete(err); + } + } +#endif // TRITON_ENABLE_METRICS if (req->method != htp_method_POST) { RETURN_AND_RESPOND_WITH_ERR( req, EVHTP_RES_METHNALLOWED, "Method Not Allowed"); @@ -3381,6 +3417,120 @@ HTTPAPIServer::HandleGenerate( request_release_payload.release(); } +#ifdef TRITON_ENABLE_METRICS +std::vector +HTTPAPIServer::MetricFamilyExtractor( + const std::string& input, const std::string& metricFamily) +{ + std::vector metrics; + // Construct the regex pattern using the provided metricFamily + std::string patternStr = metricFamily + R"((?:{(.*?)})?\s+(\d+(?:\.\d+)?))"; + re2::RE2 pattern(patternStr); + re2::StringPiece inputPiece(input); + + std::string labelString; + std::string metric_value; + + while (re2::RE2::FindAndConsume( + &inputPiece, pattern, &labelString, &metric_value)) { + PromMetric metric; + + // Extract labels if they exist + if (!labelString.empty()) { + re2::RE2 labelPattern(R"((\w+)=\"([^\"]+)\")"); + re2::StringPiece labelPiece(labelString); + std::string key, value; + while ( + re2::RE2::FindAndConsume(&labelPiece, labelPattern, &key, &value)) { + metric.labels[key] = value; + } + } + + // Assign the value + metric.value = stod(metric_value); + metrics.push_back(metric); + } + + return metrics; +} + +std::string +HTTPAPIServer::ExtractKVMetrics( + const std::string& prometheus_metrics, const std::string& orca_type) +{ + std::string metric_family = "nv_trt_llm_kv_cache_block_metrics"; + std::vector kv_cache_metrics = + MetricFamilyExtractor(prometheus_metrics, metric_family); + + double 
tokens_per_block = -1; + double used_blocks = -1; + double max_blocks = -1; + + for (const auto& metric : kv_cache_metrics) { + if (metric.labels.count("kv_cache_block_type") > 0) { + std::string type = metric.labels.at("kv_cache_block_type"); + if (type == "tokens_per") { + tokens_per_block = metric.value; + } else if (type == "used") { + used_blocks = metric.value; + } else if (type == "max") { + max_blocks = metric.value; + } + } + } + + // One or more of the kv metrics was not found or invalid. + if (tokens_per_block < 0 || used_blocks < 0 || max_blocks < 0) { + return ""; + } + + // Calculate derived metrics + double kv_cache_utilization = 0; + if (max_blocks > 0) { + kv_cache_utilization = used_blocks / max_blocks; + } + uint64_t max_token_capacity = + static_cast(max_blocks * tokens_per_block); + + // Logic to construct and format response header + std::string header_contents = ""; + const std::string named_metrics_key = "named_metrics"; + const std::string kv_util_key = "kv_cache_utilization"; + const std::string max_token_key = "max_token_capacity"; + + if (orca_type == "json") { + // Format the metrics according to the ORCA protocol as JSON. + triton::common::TritonJson::Value orca_metrics( + triton::common::TritonJson::ValueType::OBJECT); + triton::common::TritonJson::Value named_metrics( + orca_metrics, triton::common::TritonJson::ValueType::OBJECT); + + named_metrics.AddDouble(kv_util_key.c_str(), kv_cache_utilization); + named_metrics.AddUInt(max_token_key.c_str(), max_token_capacity); + orca_metrics.Add(named_metrics_key.c_str(), std::move(named_metrics)); + + triton::common::TritonJson::WriteBuffer buffer; + orca_metrics.Write(&buffer); + header_contents = std::string("JSON ") + buffer.Contents(); + + } else if (orca_type == "http") { + // Format the metrics according to the ORCA protocol as Native HTTP + // (comma separated list). 
+ const std::string prefix = named_metrics_key + "."; + + header_contents = "TEXT "; + header_contents += prefix + kv_util_key + "=" + + std::to_string(kv_cache_utilization) + ", "; + header_contents += + prefix + max_token_key + "=" + std::to_string(max_token_capacity); + } else { + LOG_ERROR << "orca_type is set to an invalid type: " << orca_type; + } + + return header_contents; +} +#endif // TRITON_ENABLE_METRICS + TRITONSERVER_Error* HTTPAPIServer::ModelInputMetadata( const std::string& model_name, const int64_t model_version, diff --git a/src/http_server.h b/src/http_server.h index 3949f97e27..3e865bba74 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -455,6 +455,14 @@ class HTTPAPIServer : public HTTPServer { evbuffer* buffer_ = nullptr; }; +#ifdef TRITON_ENABLE_METRICS + private: + struct PromMetric { + std::unordered_map labels; + double value; + }; +#endif // TRITON_ENABLE_METRICS + protected: explicit HTTPAPIServer( const std::shared_ptr& server, @@ -558,7 +566,17 @@ class HTTPAPIServer : public HTTPServer { void HandleGenerate( evhtp_request_t* req, const std::string& model_name, const std::string& model_version_str, bool streaming); - +#ifdef TRITON_ENABLE_METRICS + // Helper function to set get the KV-cache utilization metrics for the + // infer response header + std::string ExtractKVMetrics( + const std::string& prometheus_metrics, const std::string& orca_type); + + // Generates a metric struct for a given family with a map of labels and a + // value + std::vector MetricFamilyExtractor( + const std::string& input, const std::string& metricFamily); +#endif // TRITON_ENABLE_METRICS // 'meta_data_root' is the root JSON document for 'input_metadata'. // In TritonJson, the Value objects are references to the root document. // Therefore the document must stay valid. 
From d5760e0def2e3a1727186fa3531bbfbb2bfa1fc5 Mon Sep 17 00:00:00 2001 From: Benjamin Braun Date: Fri, 13 Dec 2024 02:04:12 +0000 Subject: [PATCH 2/2] Add logging, examples and more detailed comments, and move feature functionality to HTTPAPIServer::GenerateRequestClass::StartResponse() to extract metrics after inference request is processed for up-to-date metrics. --- src/http_server.cc | 372 +++++++++++++++++++++++++++------------------ src/http_server.h | 43 +++--- 2 files changed, 248 insertions(+), 167 deletions(-) diff --git a/src/http_server.cc b/src/http_server.cc index 462880d1cf..e7bc0b9e12 100644 --- a/src/http_server.cc +++ b/src/http_server.cc @@ -3225,42 +3225,6 @@ HTTPAPIServer::HandleGenerate( req, RestrictedCategory::INFERENCE, restricted_apis_); AddContentTypeHeader(req, "application/json"); -#ifdef TRITON_ENABLE_METRICS - // logic to add kv_cache metrics to response header - // Get the metrics in Prometheus format - if (std::getenv("ORCA_HEADER_METRIC_TYPE") != nullptr) { - const std::string orca_type = std::getenv("ORCA_HEADER_METRIC_TYPE"); - TRITONSERVER_Metrics* metrics = nullptr; - TRITONSERVER_Error* err = - TRITONSERVER_ServerMetrics(server_.get(), &metrics); - if (err == nullptr) { - const char* base; - size_t byte_size; - err = TRITONSERVER_MetricsFormatted( - metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size); - if (err == nullptr) { - std::string kv_utilization(base, byte_size); - // Extract the KV utilization metrics from the Prometheus formatted - // string. 
- std::string extracted_kv_metrics = - ExtractKVMetrics(kv_utilization, orca_type); - if (!extracted_kv_metrics.empty()) { - evhtp_headers_add_header( - req->headers_out, - evhtp_header_new( - "endpoint-load-metrics", extracted_kv_metrics.c_str(), 1, 1)); - } - } - } - TRITONSERVER_MetricsDelete(metrics); - // Handle potential errors - if (err != nullptr) { - LOG_ERROR << "Failed to get KV metrics: " - << TRITONSERVER_ErrorMessage(err); - TRITONSERVER_ErrorDelete(err); - } - } -#endif // TRITON_ENABLE_METRICS if (req->method != htp_method_POST) { RETURN_AND_RESPOND_WITH_ERR( req, EVHTP_RES_METHNALLOWED, "Method Not Allowed"); @@ -3417,119 +3381,6 @@ HTTPAPIServer::HandleGenerate( request_release_payload.release(); } -#ifdef TRITON_ENABLE_METRICS -std::vector -HTTPAPIServer::MetricFamilyExtractor( - const std::string& input, const std::string& metricFamily) -{ - std::vector metrics; - // Construct the regex pattern using the provided metricFamily - std::string patternStr = metricFamily + R"((?:{(.*?)})?\s+(\d+(?:\.\d+)?))"; - re2::RE2 pattern(patternStr); - re2::StringPiece inputPiece(input); - - std::string labelString; - std::string metric_value; - - while (re2::RE2::FindAndConsume( - &inputPiece, pattern, &labelString, &metric_value)) { - PromMetric metric; - - // Extract labels if they exist - if (!labelString.empty()) { - re2::RE2 labelPattern(R"((\w+)=\"([^\"]+)\")"); - re2::StringPiece labelPiece(labelString); - std::string key, value; - while ( - re2::RE2::FindAndConsume(&labelPiece, labelPattern, &key, &value)) { - metric.labels[key] = value; - } - } - - // Assign the value - metric.value = stod(metric_value); - metrics.push_back(metric); - } - - return metrics; -} - -std::string -HTTPAPIServer::ExtractKVMetrics( - const std::string& prometheus_metrics, const std::string& orca_type) -{ - std::string metric_family = "nv_trt_llm_kv_cache_block_metrics"; - std::vector kv_cache_metrics = - MetricFamilyExtractor(prometheus_metrics, metric_family); - - double 
tokens_per_block = -1; - double used_blocks = -1; - double max_blocks = -1; - - for (const auto& metric : kv_cache_metrics) { - if (metric.labels.count("kv_cache_block_type") > 0) { - std::string type = metric.labels.at("kv_cache_block_type"); - if (type == "tokens_per") { - tokens_per_block = metric.value; - } else if (type == "used") { - used_blocks = metric.value; - } else if (type == "max") { - max_blocks = metric.value; - } - } - } - - // One or more of the kv metrics was not found or invalid. - if (tokens_per_block < 0 || used_blocks < 0 || max_blocks < 0) { - return ""; - } - - // Calculate derived metrics - double kv_cache_utilization = 0; - if (max_blocks > 0) { - kv_cache_utilization = used_blocks / max_blocks; - } - uint64_t max_token_capacity = - static_cast(max_blocks * tokens_per_block); - - // Logic to construct and format response header - std::string header_contents = ""; - const std::string named_metrics_key = "named_metrics"; - const std::string kv_util_key = "kv_cache_utilization"; - const std::string max_token_key = "max_token_capacity"; - - if (orca_type == "json") { - // Format the metrics according to the ORCA protocol as JSON. - triton::common::TritonJson::Value orca_metrics( - triton::common::TritonJson::ValueType::OBJECT); - triton::common::TritonJson::Value named_metrics( - orca_metrics, triton::common::TritonJson::ValueType::OBJECT); - - named_metrics.AddDouble(kv_util_key.c_str(), kv_cache_utilization); - named_metrics.AddUInt(max_token_key.c_str(), max_token_capacity); - orca_metrics.Add(named_metrics_key.c_str(), std::move(named_metrics)); - - triton::common::TritonJson::WriteBuffer buffer; - orca_metrics.Write(&buffer); - header_contents = std::string("JSON ") + buffer.Contents(); - - } else if (orca_type == "http") { - // Format the metrics according to the ORCA protocol as Native HTTP - // (comma separated list). 
- const std::string prefix = named_metrics_key + "."; - - header_contents = "TEXT "; - header_contents += prefix + kv_util_key + "=" + - std::to_string(kv_cache_utilization) + ", "; - header_contents += - prefix + max_token_key + "=" + std::to_string(max_token_capacity); - } else { - LOG_ERROR << "orca_type is set to an invalid type: " << orca_type; - } - - return header_contents; -} -#endif // TRITON_ENABLE_METRICS TRITONSERVER_Error* HTTPAPIServer::ModelInputMetadata( @@ -4376,6 +4227,62 @@ HTTPAPIServer::GenerateRequestClass::StartResponse( return; } + +#ifdef TRITON_ENABLE_METRICS + // logic to add kv_cache metrics to response header + // Get the metrics in Prometheus format + + // "ORCA_METRIC_FORMAT" is an environment variable that specifies which load + // report format `endpoint-load-metrics` will be in. If left unset the header + // will not be written and the feature is disabled. + // + // When set, the valid values for "ORCA_METRIC_FORMAT" are: + // + // "http" + // "json" + // + // Any other value will have behavior equivalent to being unset while also + // logging an error. + // + // For specifics on the different load reporting formats, see: + // https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0#heading=h.do9yfa1wlpk8 + auto server = infer_request->EvHtpServer(); + if (std::getenv("ORCA_METRIC_FORMAT") != nullptr && server != nullptr) { + const std::string orca_type = std::getenv("ORCA_METRIC_FORMAT"); + TRITONSERVER_Metrics* metrics = nullptr; + TRITONSERVER_Error* err = TRITONSERVER_ServerMetrics(server, &metrics); + if (err == nullptr) { + const char* base; + size_t byte_size; + err = TRITONSERVER_MetricsFormatted( + metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size); + if (err == nullptr) { + std::string formatted_metrics(base, byte_size); + // Extract the KV utilization metrics from the Prometheus formatted + // string. 
+ std::string extracted_kv_metrics = + ExtractKVMetrics(formatted_metrics, orca_type); + if (!extracted_kv_metrics.empty()) { + evhtp_headers_add_header( + req->headers_out, + evhtp_header_new( + "endpoint-load-metrics", extracted_kv_metrics.c_str(), 1, 1)); + } else { + LOG_ERROR << "ORCA_METRIC_FORMAT is set but extracted_kv_metrics is " + "empty, no header written. orca_type=" + << orca_type; + } + } + } else { + // Handle potential errors + LOG_ERROR << "Failed to get KV metrics: " + << TRITONSERVER_ErrorMessage(err); + TRITONSERVER_ErrorDelete(err); + } + TRITONSERVER_MetricsDelete(metrics); + } +#endif // TRITON_ENABLE_METRICS + if (infer_request->streaming_) { AddContentTypeHeader(req, "text/event-stream; charset=utf-8"); } else { @@ -4385,6 +4292,173 @@ HTTPAPIServer::GenerateRequestClass::StartResponse( evhtp_request_resume(req); } + +#ifdef TRITON_ENABLE_METRICS +std::vector<HTTPAPIServer::GenerateRequestClass::PromMetric> +HTTPAPIServer::GenerateRequestClass::MetricFamilyExtractor( + const std::string& input, const std::string& metricFamily) +{ + std::vector<PromMetric> metrics; + // Construct the regex pattern using the provided metricFamily. + + // `labelGroup` is a capturing group that captures all characters within curly + // braces, excluding line breaks. + std::string labelGroup = "(?:{(.*?)})"; + + // `valueGroup` is a capturing group that captures a number with its + // decimals if any. + std::string valueGroup = R"((\d+(?:\.\d+)?))"; + + // `patternStr` matches on lines starting with `metricFamily` then captures + // its labels if any, then (optionally) matches any whitespace, then captures + // its numeric double value. + // + // For example, `patternStr` would match on input: + // `nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="used",model="tensorrt_llm",version="1"} + // 3` + // + // with 2 capturing groups: + // 1. `kv_cache_block_type="used",model="tensorrt_llm",version="1"` + // 2. 
`3` + std::string patternStr = metricFamily + labelGroup + R"(?\s*)" + valueGroup; + re2::RE2 pattern(patternStr); + re2::StringPiece inputPiece(input); + + std::string labelString; + std::string metric_value; + + while (re2::RE2::FindAndConsume( + &inputPiece, pattern, &labelString, &metric_value)) { + PromMetric metric; + + // Extract labels if they exist + if (!labelString.empty()) { + // `labelPattern` captures any alphanumeric sequence that precedes an '=' + // character, then captures the following quoted character sequence. These + // groups are exhaustive given the Prometheus data model: + // https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels + // + // For example, calling FindAndConsume() with `labelPattern` on input: + // `kv_cache_block_type="used",model="tensorrt_llm",version="1"` + // + // matches 3 times with 2 capturing groups each: + // + // Match #1 + // 1. `kv_cache_block_type` + // 2. `used` + // + // Match #2 + // 1. `model` + // 2. `tensorrt_llm` + // + // Match #3 + // 1. `version` + // 2. 
`1` + re2::RE2 labelPattern(R"((\w+)=\"([^\"]*)\")"); + re2::StringPiece labelPiece(labelString); + std::string key, value; + while ( + re2::RE2::FindAndConsume(&labelPiece, labelPattern, &key, &value)) { + // Populate the metric's labels map + metric.labels[key] = value; + } + } + + // Assign the metric its value and add it to the family list + metric.value = stod(metric_value); + metrics.push_back(metric); + } + + return metrics; +} + +std::string +HTTPAPIServer::GenerateRequestClass::ExtractKVMetrics( + const std::string& prometheus_metrics, const std::string& orca_type) +{ + std::string metric_family = "nv_trt_llm_kv_cache_block_metrics"; + std::vector<PromMetric> kv_cache_metrics = + MetricFamilyExtractor(prometheus_metrics, metric_family); + + double tokens_per_block = -1; + double used_blocks = -1; + double max_blocks = -1; + + for (const auto& metric : kv_cache_metrics) { + if (metric.labels.count("kv_cache_block_type") > 0) { + std::string type = metric.labels.at("kv_cache_block_type"); + if (type == "tokens_per") { + tokens_per_block = metric.value; + } else if (type == "used") { + used_blocks = metric.value; + } else if (type == "max") { + max_blocks = metric.value; + } + } + } + + // Return early if not all kv metrics are found and set. 
+ if (tokens_per_block < 0 || used_blocks < 0 || max_blocks < 0) { + LOG_ERROR << "One or more of the kv metrics was not found or invalid."; + return ""; + } + + // Calculate derived metrics + double kv_cache_utilization = 0; + if (max_blocks > 0) { + kv_cache_utilization = used_blocks / max_blocks; + } + uint64_t max_token_capacity = + static_cast<uint64_t>(max_blocks * tokens_per_block); + + return OrcaKVMetricHeader( + orca_type, kv_cache_utilization, max_token_capacity); +} + +std::string +HTTPAPIServer::GenerateRequestClass::OrcaKVMetricHeader( + const std::string& orca_type, const double kv_cache_utilization, + const uint64_t max_token_capacity) +{ + // Logic to construct and format response header + std::string header_contents = ""; + const std::string named_metrics_key = "named_metrics"; + const std::string kv_util_key = "kv_cache_utilization"; + const std::string max_token_key = "max_token_capacity"; + + if (orca_type == "json") { + // Format the metrics according to the ORCA protocol as JSON. + triton::common::TritonJson::Value orca_metrics( + triton::common::TritonJson::ValueType::OBJECT); + triton::common::TritonJson::Value named_metrics( + orca_metrics, triton::common::TritonJson::ValueType::OBJECT); + + named_metrics.AddDouble(kv_util_key.c_str(), kv_cache_utilization); + named_metrics.AddUInt(max_token_key.c_str(), max_token_capacity); + orca_metrics.Add(named_metrics_key.c_str(), std::move(named_metrics)); + + triton::common::TritonJson::WriteBuffer buffer; + orca_metrics.Write(&buffer); + header_contents = std::string("JSON ") + buffer.Contents(); + + } else if (orca_type == "http") { + // Format the metrics according to the ORCA protocol as Native HTTP + // (comma separated list). 
+ const std::string prefix = named_metrics_key + "."; + + header_contents = "TEXT "; + header_contents += prefix + kv_util_key + "=" + + std::to_string(kv_cache_utilization) + ", "; + header_contents += + prefix + max_token_key + "=" + std::to_string(max_token_capacity); + } else { + LOG_ERROR << "orca_type is set to an invalid type: " << orca_type; + } + + return header_contents; +} +#endif // TRITON_ENABLE_METRICS + void HTTPAPIServer::GenerateRequestClass::ChunkResponseCallback( evthr_t* thr, void* arg, void* shared) diff --git a/src/http_server.h b/src/http_server.h index 3e865bba74..c557311bc4 100644 --- a/src/http_server.h +++ b/src/http_server.h @@ -364,6 +364,8 @@ class HTTPAPIServer : public HTTPServer { } virtual ~GenerateRequestClass(); + TRITONSERVER_Server* EvHtpServer() const { return server_; } + // [FIXME] Specialize response complete function for now, should have // been a dispatcher and call into object specific response function. static void InferResponseComplete( @@ -393,6 +395,12 @@ class HTTPAPIServer : public HTTPServer { const MappingSchema* ResponseSchema() { return response_schema_; } private: +#ifdef TRITON_ENABLE_METRICS + struct PromMetric { + std::unordered_map<std::string, std::string> labels; + double value; + }; +#endif // TRITON_ENABLE_METRICS struct TritonOutput { enum class Type { RESERVED, TENSOR, PARAMETER }; TritonOutput(Type t, const std::string& val) : type(t), value(val) {} @@ -403,6 +411,23 @@ class HTTPAPIServer : public HTTPServer { // TENSOR, PARAMETER type uint32_t index; }; + +#ifdef TRITON_ENABLE_METRICS + // Helper function to get the KV-cache utilization metrics for the + // inference response header + static std::string ExtractKVMetrics( + const std::string& prometheus_metrics, const std::string& orca_type); + // Generates a metric struct for a given family with a map of labels and a + // value + static std::vector<PromMetric> MetricFamilyExtractor( + const std::string& input, const std::string& metricFamily); + // Creates a header string in the 
proper reporting format for provided + // KV-cache metrics. + static std::string OrcaKVMetricHeader( + const std::string& reporting_format, const double kv_cache_utilization, + const uint64_t max_token_capacity); +#endif // TRITON_ENABLE_METRICS + TRITONSERVER_Error* ExactMappingInput( const std::string& name, triton::common::TritonJson::Value& value, std::map& @@ -455,13 +480,6 @@ class HTTPAPIServer : public HTTPServer { evbuffer* buffer_ = nullptr; }; -#ifdef TRITON_ENABLE_METRICS - private: - struct PromMetric { - std::unordered_map labels; - double value; - }; -#endif // TRITON_ENABLE_METRICS protected: explicit HTTPAPIServer( @@ -566,17 +584,6 @@ class HTTPAPIServer : public HTTPServer { void HandleGenerate( evhtp_request_t* req, const std::string& model_name, const std::string& model_version_str, bool streaming); -#ifdef TRITON_ENABLE_METRICS - // Helper function to set get the KV-cache utilization metrics for the - // infer response header - std::string ExtractKVMetrics( - const std::string& prometheus_metrics, const std::string& orca_type); - - // Generates a metric struct for a given family with a map of labels and a - // value - std::vector MetricFamilyExtractor( - const std::string& input, const std::string& metricFamily); -#endif // TRITON_ENABLE_METRICS // 'meta_data_root' is the root JSON document for 'input_metadata'. // In TritonJson, the Value objects are references to the root document. // Therefore the document must stay valid.