From 74492a8b2d2838b3ce06d1bdb60e117941503f9e Mon Sep 17 00:00:00 2001
From: Benjamin Braun <benjaminbraun@google.com>
Date: Mon, 9 Dec 2024 18:01:14 +0000
Subject: [PATCH 1/2] Add helper functions in HTTPAPIServer to pull metrics
 for use in HandleGenerate, adding kv_utilization and max_token_capacity to
 the inference request's response header.

---
 src/http_server.cc | 150 +++++++++++++++++++++++++++++++++++++++++++++
 src/http_server.h  |  20 +++++-
 2 files changed, 169 insertions(+), 1 deletion(-)
diff --git a/src/http_server.cc b/src/http_server.cc
index 99aed411b5..462880d1cf 100644
--- a/src/http_server.cc
+++ b/src/http_server.cc
@@ -3225,6 +3225,42 @@ HTTPAPIServer::HandleGenerate(
       req, RestrictedCategory::INFERENCE, restricted_apis_);
 
   AddContentTypeHeader(req, "application/json");
+#ifdef TRITON_ENABLE_METRICS
+  // logic to add kv_cache metrics to response header
+  // Get the metrics in Prometheus format
+  if (std::getenv("ORCA_HEADER_METRIC_TYPE") != nullptr) {
+    const std::string orca_type = std::getenv("ORCA_HEADER_METRIC_TYPE");
+    TRITONSERVER_Metrics* metrics = nullptr;
+    TRITONSERVER_Error* err =
+        TRITONSERVER_ServerMetrics(server_.get(), &metrics);
+    if (err == nullptr) {
+      const char* base;
+      size_t byte_size;
+      err = TRITONSERVER_MetricsFormatted(
+          metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
+      if (err == nullptr) {
+        std::string kv_utilization(base, byte_size);
+        // Extract the KV utilization metrics from the Prometheus formatted
+        // string.
+        std::string extracted_kv_metrics =
+            ExtractKVMetrics(kv_utilization, orca_type);
+        if (!extracted_kv_metrics.empty()) {
+          evhtp_headers_add_header(
+              req->headers_out,
+              evhtp_header_new(
+                  "endpoint-load-metrics", extracted_kv_metrics.c_str(), 1, 1));
+        }
+      }
+    }
+    TRITONSERVER_MetricsDelete(metrics);
+    // Handle potential errors
+    if (err != nullptr) {
+      LOG_ERROR << "Failed to get KV metrics: "
+                << TRITONSERVER_ErrorMessage(err);
+      TRITONSERVER_ErrorDelete(err);
+    }
+  }
+#endif  // TRITON_ENABLE_METRICS
   if (req->method != htp_method_POST) {
     RETURN_AND_RESPOND_WITH_ERR(
         req, EVHTP_RES_METHNALLOWED, "Method Not Allowed");
@@ -3381,6 +3417,120 @@ HTTPAPIServer::HandleGenerate(
   request_release_payload.release();
 }
 
+#ifdef TRITON_ENABLE_METRICS
+std::vector<HTTPAPIServer::PromMetric>
+HTTPAPIServer::MetricFamilyExtractor(
+    const std::string& input, const std::string& metricFamily)
+{
+  std::vector<PromMetric> metrics;
+  // Construct the regex pattern using the provided metricFamily
+  std::string patternStr = metricFamily + R"((?:{(.*?)})?\s+(\d+(?:\.\d+)?))";
+  re2::RE2 pattern(patternStr);
+  re2::StringPiece inputPiece(input);
+
+  std::string labelString;
+  std::string metric_value;
+
+  while (re2::RE2::FindAndConsume(
+      &inputPiece, pattern, &labelString, &metric_value)) {
+    PromMetric metric;
+
+    // Extract labels if they exist
+    if (!labelString.empty()) {
+      re2::RE2 labelPattern(R"((\w+)=\"([^\"]+)\")");
+      re2::StringPiece labelPiece(labelString);
+      std::string key, value;
+      while (
+          re2::RE2::FindAndConsume(&labelPiece, labelPattern, &key, &value)) {
+        metric.labels[key] = value;
+      }
+    }
+
+    // Assign the value
+    metric.value = stod(metric_value);
+    metrics.push_back(metric);
+  }
+
+  return metrics;
+}
+
+std::string
+HTTPAPIServer::ExtractKVMetrics(
+    const std::string& prometheus_metrics, const std::string& orca_type)
+{
+  std::string metric_family = "nv_trt_llm_kv_cache_block_metrics";
+  std::vector<PromMetric> kv_cache_metrics =
+      MetricFamilyExtractor(prometheus_metrics, metric_family);
+
+  double tokens_per_block = -1;
+  double used_blocks = -1;
+  double max_blocks = -1;
+
+  for (const auto& metric : kv_cache_metrics) {
+    if (metric.labels.count("kv_cache_block_type") > 0) {
+      std::string type = metric.labels.at("kv_cache_block_type");
+      if (type == "tokens_per") {
+        tokens_per_block = metric.value;
+      } else if (type == "used") {
+        used_blocks = metric.value;
+      } else if (type == "max") {
+        max_blocks = metric.value;
+      }
+    }
+  }
+
+  // One or more of the kv metrics was not found or invalid.
+  if (tokens_per_block < 0 || used_blocks < 0 || max_blocks < 0) {
+    return "";
+  }
+
+  // Calculate derived metrics
+  double kv_cache_utilization = 0;
+  if (max_blocks > 0) {
+    kv_cache_utilization = used_blocks / max_blocks;
+  }
+  uint64_t max_token_capacity =
+      static_cast<uint64_t>(max_blocks * tokens_per_block);
+
+  // Logic to construct and format response header
+  std::string header_contents = "";
+  const std::string named_metrics_key = "named_metrics";
+  const std::string kv_util_key = "kv_cache_utilization";
+  const std::string max_token_key = "max_token_capacity";
+
+  if (orca_type == "json") {
+    // Format the metrics according to the ORCA protocol as JSON.
+    triton::common::TritonJson::Value orca_metrics(
+        triton::common::TritonJson::ValueType::OBJECT);
+    triton::common::TritonJson::Value named_metrics(
+        orca_metrics, triton::common::TritonJson::ValueType::OBJECT);
+
+    named_metrics.AddDouble(kv_util_key.c_str(), kv_cache_utilization);
+    named_metrics.AddUInt(max_token_key.c_str(), max_token_capacity);
+    orca_metrics.Add(named_metrics_key.c_str(), std::move(named_metrics));
+
+    triton::common::TritonJson::WriteBuffer buffer;
+    orca_metrics.Write(&buffer);
+    header_contents = std::string("JSON ") + buffer.Contents();
+
+  } else if (orca_type == "http") {
+    // Format the metrics according to the ORCA protocol as Native HTTP
+    // (comma separated list).
+    const std::string prefix = named_metrics_key + ".";
+
+    header_contents = "TEXT ";
+    header_contents += prefix + kv_util_key + "=" +
+                       std::to_string(kv_cache_utilization) + ", ";
+    header_contents +=
+        prefix + max_token_key + "=" + std::to_string(max_token_capacity);
+  } else {
+    LOG_ERROR << "orca_type is set to an invalid type: " << orca_type;
+  }
+
+  return header_contents;
+}
+#endif  // TRITON_ENABLE_METRICS
+
 TRITONSERVER_Error*
 HTTPAPIServer::ModelInputMetadata(
     const std::string& model_name, const int64_t model_version,
diff --git a/src/http_server.h b/src/http_server.h
index 3949f97e27..3e865bba74 100644
--- a/src/http_server.h
+++ b/src/http_server.h
@@ -455,6 +455,14 @@ class HTTPAPIServer : public HTTPServer {
     evbuffer* buffer_ = nullptr;
   };
 
+#ifdef TRITON_ENABLE_METRICS
+ private:
+  struct PromMetric {
+    std::unordered_map<std::string, std::string> labels;
+    double value;
+  };
+#endif  // TRITON_ENABLE_METRICS
+
  protected:
   explicit HTTPAPIServer(
       const std::shared_ptr<TRITONSERVER_Server>& server,
@@ -558,7 +566,17 @@ class HTTPAPIServer : public HTTPServer {
   void HandleGenerate(
       evhtp_request_t* req, const std::string& model_name,
       const std::string& model_version_str, bool streaming);
-
+#ifdef TRITON_ENABLE_METRICS
+  // Helper function to set get the KV-cache utilization metrics for the
+  // infer response header
+  std::string ExtractKVMetrics(
+      const std::string& prometheus_metrics, const std::string& orca_type);
+
+  // Generates a metric struct for a given family with a map of labels and a
+  // value
+  std::vector<PromMetric> MetricFamilyExtractor(
+      const std::string& input, const std::string& metricFamily);
+#endif  // TRITON_ENABLE_METRICS
   // 'meta_data_root' is the root JSON document for 'input_metadata'.
   // In TritonJson, the Value objects are references to the root document.
   // Therefore the document must stay valid.

From d5760e0def2e3a1727186fa3531bbfbb2bfa1fc5 Mon Sep 17 00:00:00 2001
From: Benjamin Braun <benjaminbraun@google.com>
Date: Fri, 13 Dec 2024 02:04:12 +0000
Subject: [PATCH 2/2] Add logging, examples and more detailed comments, and
 move feature functionality to
 HTTPAPIServer::GenerateRequestClass::StartResponse() to extract metrics after
 inference request is processed for up-to-date metrics.

---
 src/http_server.cc | 372 +++++++++++++++++++++++++++------------------
 src/http_server.h  |  43 +++---
 2 files changed, 248 insertions(+), 167 deletions(-)

diff --git a/src/http_server.cc b/src/http_server.cc
index 462880d1cf..e7bc0b9e12 100644
--- a/src/http_server.cc
+++ b/src/http_server.cc
@@ -3225,42 +3225,6 @@ HTTPAPIServer::HandleGenerate(
       req, RestrictedCategory::INFERENCE, restricted_apis_);
 
   AddContentTypeHeader(req, "application/json");
-#ifdef TRITON_ENABLE_METRICS
-  // logic to add kv_cache metrics to response header
-  // Get the metrics in Prometheus format
-  if (std::getenv("ORCA_HEADER_METRIC_TYPE") != nullptr) {
-    const std::string orca_type = std::getenv("ORCA_HEADER_METRIC_TYPE");
-    TRITONSERVER_Metrics* metrics = nullptr;
-    TRITONSERVER_Error* err =
-        TRITONSERVER_ServerMetrics(server_.get(), &metrics);
-    if (err == nullptr) {
-      const char* base;
-      size_t byte_size;
-      err = TRITONSERVER_MetricsFormatted(
-          metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
-      if (err == nullptr) {
-        std::string kv_utilization(base, byte_size);
-        // Extract the KV utilization metrics from the Prometheus formatted
-        // string.
-        std::string extracted_kv_metrics =
-            ExtractKVMetrics(kv_utilization, orca_type);
-        if (!extracted_kv_metrics.empty()) {
-          evhtp_headers_add_header(
-              req->headers_out,
-              evhtp_header_new(
-                  "endpoint-load-metrics", extracted_kv_metrics.c_str(), 1, 1));
-        }
-      }
-    }
-    TRITONSERVER_MetricsDelete(metrics);
-    // Handle potential errors
-    if (err != nullptr) {
-      LOG_ERROR << "Failed to get KV metrics: "
-                << TRITONSERVER_ErrorMessage(err);
-      TRITONSERVER_ErrorDelete(err);
-    }
-  }
-#endif  // TRITON_ENABLE_METRICS
   if (req->method != htp_method_POST) {
     RETURN_AND_RESPOND_WITH_ERR(
         req, EVHTP_RES_METHNALLOWED, "Method Not Allowed");
@@ -3417,119 +3381,6 @@ HTTPAPIServer::HandleGenerate(
   request_release_payload.release();
 }
 
-#ifdef TRITON_ENABLE_METRICS
-std::vector<HTTPAPIServer::PromMetric>
-HTTPAPIServer::MetricFamilyExtractor(
-    const std::string& input, const std::string& metricFamily)
-{
-  std::vector<PromMetric> metrics;
-  // Construct the regex pattern using the provided metricFamily
-  std::string patternStr = metricFamily + R"((?:{(.*?)})?\s+(\d+(?:\.\d+)?))";
-  re2::RE2 pattern(patternStr);
-  re2::StringPiece inputPiece(input);
-
-  std::string labelString;
-  std::string metric_value;
-
-  while (re2::RE2::FindAndConsume(
-      &inputPiece, pattern, &labelString, &metric_value)) {
-    PromMetric metric;
-
-    // Extract labels if they exist
-    if (!labelString.empty()) {
-      re2::RE2 labelPattern(R"((\w+)=\"([^\"]+)\")");
-      re2::StringPiece labelPiece(labelString);
-      std::string key, value;
-      while (
-          re2::RE2::FindAndConsume(&labelPiece, labelPattern, &key, &value)) {
-        metric.labels[key] = value;
-      }
-    }
-
-    // Assign the value
-    metric.value = stod(metric_value);
-    metrics.push_back(metric);
-  }
-
-  return metrics;
-}
-
-std::string
-HTTPAPIServer::ExtractKVMetrics(
-    const std::string& prometheus_metrics, const std::string& orca_type)
-{
-  std::string metric_family = "nv_trt_llm_kv_cache_block_metrics";
-  std::vector<PromMetric> kv_cache_metrics =
-      MetricFamilyExtractor(prometheus_metrics, metric_family);
-
-  double tokens_per_block = -1;
-  double used_blocks = -1;
-  double max_blocks = -1;
-
-  for (const auto& metric : kv_cache_metrics) {
-    if (metric.labels.count("kv_cache_block_type") > 0) {
-      std::string type = metric.labels.at("kv_cache_block_type");
-      if (type == "tokens_per") {
-        tokens_per_block = metric.value;
-      } else if (type == "used") {
-        used_blocks = metric.value;
-      } else if (type == "max") {
-        max_blocks = metric.value;
-      }
-    }
-  }
-
-  // One or more of the kv metrics was not found or invalid.
-  if (tokens_per_block < 0 || used_blocks < 0 || max_blocks < 0) {
-    return "";
-  }
-
-  // Calculate derived metrics
-  double kv_cache_utilization = 0;
-  if (max_blocks > 0) {
-    kv_cache_utilization = used_blocks / max_blocks;
-  }
-  uint64_t max_token_capacity =
-      static_cast<uint64_t>(max_blocks * tokens_per_block);
-
-  // Logic to construct and format response header
-  std::string header_contents = "";
-  const std::string named_metrics_key = "named_metrics";
-  const std::string kv_util_key = "kv_cache_utilization";
-  const std::string max_token_key = "max_token_capacity";
-
-  if (orca_type == "json") {
-    // Format the metrics according to the ORCA protocol as JSON.
-    triton::common::TritonJson::Value orca_metrics(
-        triton::common::TritonJson::ValueType::OBJECT);
-    triton::common::TritonJson::Value named_metrics(
-        orca_metrics, triton::common::TritonJson::ValueType::OBJECT);
-
-    named_metrics.AddDouble(kv_util_key.c_str(), kv_cache_utilization);
-    named_metrics.AddUInt(max_token_key.c_str(), max_token_capacity);
-    orca_metrics.Add(named_metrics_key.c_str(), std::move(named_metrics));
-
-    triton::common::TritonJson::WriteBuffer buffer;
-    orca_metrics.Write(&buffer);
-    header_contents = std::string("JSON ") + buffer.Contents();
-
-  } else if (orca_type == "http") {
-    // Format the metrics according to the ORCA protocol as Native HTTP
-    // (comma separated list).
-    const std::string prefix = named_metrics_key + ".";
-
-    header_contents = "TEXT ";
-    header_contents += prefix + kv_util_key + "=" +
-                       std::to_string(kv_cache_utilization) + ", ";
-    header_contents +=
-        prefix + max_token_key + "=" + std::to_string(max_token_capacity);
-  } else {
-    LOG_ERROR << "orca_type is set to an invalid type: " << orca_type;
-  }
-
-  return header_contents;
-}
-#endif  // TRITON_ENABLE_METRICS
 
 TRITONSERVER_Error*
 HTTPAPIServer::ModelInputMetadata(
@@ -4376,6 +4227,62 @@ HTTPAPIServer::GenerateRequestClass::StartResponse(
     return;
   }
 
+
+#ifdef TRITON_ENABLE_METRICS
+  // Add KV-cache load metrics (read in Prometheus format) to the response.
+  //
+  // "ORCA_METRIC_FORMAT" is an environment variable that specifies which load
+  // report format `endpoint-load-metrics` will be in. If left unset the header
+  // will not be written and the feature is disabled.
+  //
+  // When set, the valid values for "ORCA_METRIC_FORMAT" are:
+  //
+  // "http"
+  // "json"
+  //
+  // Any other value will have behavior equivalent to being unset while also
+  // logging an error.
+  //
+  // For specifics on the different formats for the load reporting formats, see:
+  // https://docs.google.com/document/d/1C1ybMmDKJIVlrbOLbywhu9iRYo4rilR-cT50OTtOFTs/edit?tab=t.0#heading=h.do9yfa1wlpk8
+  auto server = infer_request->EvHtpServer();
+  const char* orca_format = std::getenv("ORCA_METRIC_FORMAT");
+  if (orca_format != nullptr && server != nullptr) {
+    const std::string orca_type = orca_format;
+    TRITONSERVER_Metrics* metrics = nullptr;
+    TRITONSERVER_Error* err = TRITONSERVER_ServerMetrics(server, &metrics);
+    if (err == nullptr) {
+      const char* base = nullptr;
+      size_t byte_size = 0;
+      err = TRITONSERVER_MetricsFormatted(
+          metrics, TRITONSERVER_METRIC_PROMETHEUS, &base, &byte_size);
+      if (err == nullptr) {
+        // Extract the KV-cache metrics from the Prometheus formatted text.
+        const std::string formatted_metrics(base, byte_size);
+        const std::string extracted_kv_metrics =
+            ExtractKVMetrics(formatted_metrics, orca_type);
+        if (!extracted_kv_metrics.empty()) {
+          evhtp_headers_add_header(
+              req->headers_out,
+              evhtp_header_new(
+                  "endpoint-load-metrics", extracted_kv_metrics.c_str(), 1, 1));
+        } else {
+          LOG_ERROR << "ORCA_METRIC_FORMAT is set but extracted_kv_metrics is "
+                       "empty, no header written. orca_type="
+                    << orca_type;
+        }
+      }
+    }
+    // `err` may come from either call above; always log and release it.
+    if (err != nullptr) {
+      LOG_ERROR << "Failed to get KV metrics: "
+                << TRITONSERVER_ErrorMessage(err);
+      TRITONSERVER_ErrorDelete(err);
+    }
+    TRITONSERVER_MetricsDelete(metrics);
+  }
+#endif  // TRITON_ENABLE_METRICS
+
   if (infer_request->streaming_) {
     AddContentTypeHeader(req, "text/event-stream; charset=utf-8");
   } else {
@@ -4385,6 +4292,173 @@ HTTPAPIServer::GenerateRequestClass::StartResponse(
   evhtp_request_resume(req);
 }
 
+
+#ifdef TRITON_ENABLE_METRICS
+// Scans `input` (Prometheus text format) for samples of `metricFamily` and
+// returns one PromMetric per sample found, in the order they appear. Each
+// PromMetric holds the sample's labels as a name -> value map and its value.
+//
+// Only non-negative decimal values such as `3` or `0.25` are recognized;
+// other notations (sign, exponent, NaN, Inf) are not handled.
+std::vector<HTTPAPIServer::GenerateRequestClass::PromMetric>
+HTTPAPIServer::GenerateRequestClass::MetricFamilyExtractor(
+    const std::string& input, const std::string& metricFamily)
+{
+  std::vector<PromMetric> metrics;
+
+  // `labelGroup` optionally matches a `{...}` label set and captures all
+  // characters within the curly braces, excluding line breaks.
+  const std::string labelGroup = "(?:{(.*?)})?";
+
+  // `valueGroup` is a capturing group that captures a number with its
+  // decimals if any.
+  const std::string valueGroup = R"((\d+(?:\.\d+)?))";
+
+  // `pattern` matches `metricFamily` wherever it occurs in the input (it is
+  // not anchored to the start of a line), then captures its labels if any,
+  // then matches optional whitespace, then captures its numeric value. The
+  // family name is escaped with QuoteMeta() so it is matched literally.
+  //
+  // For example, `pattern` would match on input:
+  // `nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type="used",model="tensorrt_llm",version="1"}
+  // 3`
+  //
+  // with 2 capturing groups:
+  // 1. `kv_cache_block_type="used",model="tensorrt_llm",version="1"`
+  // 2. `3`
+  const re2::RE2 pattern(
+      re2::RE2::QuoteMeta(metricFamily) + labelGroup + R"(\s*)" + valueGroup);
+
+  // `labelPattern` captures any sequence of word characters that precedes an
+  // '=' character, then captures the following quoted character sequence.
+  // It does not depend on the sample being parsed, so it is compiled once
+  // here rather than on every loop iteration. Label syntax reference:
+  // https://prometheus.io/docs/concepts/data_model/#metric-names-and-labels
+  //
+  // For example, calling FindAndConsume() with `labelPattern` on input:
+  // `kv_cache_block_type="used",model="tensorrt_llm",version="1"`
+  //
+  // matches 3 times with 2 capturing groups each:
+  // Match #1: `kv_cache_block_type` and `used`
+  // Match #2: `model` and `tensorrt_llm`
+  // Match #3: `version` and `1`
+  const re2::RE2 labelPattern(R"((\w+)=\"([^\"]*)\")");
+
+  re2::StringPiece inputPiece(input);
+  std::string labelString;
+  std::string metric_value;
+
+  while (re2::RE2::FindAndConsume(
+      &inputPiece, pattern, &labelString, &metric_value)) {
+    PromMetric metric;
+
+    // Extract labels if they exist
+    if (!labelString.empty()) {
+      re2::StringPiece labelPiece(labelString);
+      std::string key, value;
+      while (
+          re2::RE2::FindAndConsume(&labelPiece, labelPattern, &key, &value)) {
+        // Populate the metric's labels map
+        metric.labels[key] = value;
+      }
+    }
+
+    // Assign the metric its value and add it to the family list. The value
+    // text was matched by `valueGroup`, i.e. digits with optional decimals.
+    metric.value = std::stod(metric_value);
+    metrics.push_back(std::move(metric));
+  }
+
+  return metrics;
+}
+
+// Builds the `endpoint-load-metrics` header value from the KV-cache block
+// samples in `prometheus_metrics`; returns "" if any of the three is missing.
+std::string
+HTTPAPIServer::GenerateRequestClass::ExtractKVMetrics(
+    const std::string& prometheus_metrics, const std::string& orca_type)
+{
+  // A negative value marks a block metric that has not been seen yet.
+  double block_tokens = -1;
+  double blocks_used = -1;
+  double blocks_max = -1;
+
+  const std::vector<PromMetric> samples = MetricFamilyExtractor(
+      prometheus_metrics, "nv_trt_llm_kv_cache_block_metrics");
+  for (const auto& sample : samples) {
+    const auto label = sample.labels.find("kv_cache_block_type");
+    if (label == sample.labels.end()) {
+      continue;  // Sample does not say which block metric it reports.
+    }
+    const std::string& block_type = label->second;
+    if (block_type == "tokens_per") {
+      block_tokens = sample.value;
+    } else if (block_type == "used") {
+      blocks_used = sample.value;
+    } else if (block_type == "max") {
+      blocks_max = sample.value;
+    }
+  }
+
+  // All three block metrics are required to build the header.
+  if (block_tokens < 0 || blocks_used < 0 || blocks_max < 0) {
+    LOG_ERROR << "One or more of the kv metrics was not found or invalid.";
+    return "";
+  }
+
+  // Utilization stays 0 when the reported capacity is 0 (no division by 0).
+  const double kv_cache_utilization =
+      (blocks_max > 0) ? blocks_used / blocks_max : 0.0;
+  const uint64_t max_token_capacity =
+      static_cast<uint64_t>(blocks_max * block_tokens);
+  return OrcaKVMetricHeader(
+      orca_type, kv_cache_utilization, max_token_capacity);
+}
+
+// Formats the two KV-cache metrics as an ORCA load report in the format
+// named by `orca_type` ("json" or "http"); any other value logs an error
+// and yields an empty string.
+std::string
+HTTPAPIServer::GenerateRequestClass::OrcaKVMetricHeader(
+    const std::string& orca_type, const double kv_cache_utilization,
+    const uint64_t max_token_capacity)
+{
+  const char* const kNamedMetrics = "named_metrics";
+  const char* const kUtilization = "kv_cache_utilization";
+  const char* const kTokenCapacity = "max_token_capacity";
+
+  if (orca_type == "json") {
+    // ORCA JSON form: "JSON " followed by {"named_metrics": {...}}.
+    triton::common::TritonJson::Value report(
+        triton::common::TritonJson::ValueType::OBJECT);
+    triton::common::TritonJson::Value named(
+        report, triton::common::TritonJson::ValueType::OBJECT);
+    named.AddDouble(kUtilization, kv_cache_utilization);
+    named.AddUInt(kTokenCapacity, max_token_capacity);
+    report.Add(kNamedMetrics, std::move(named));
+
+    triton::common::TritonJson::WriteBuffer serialized;
+    report.Write(&serialized);
+    return std::string("JSON ") + serialized.Contents();
+  }
+
+  if (orca_type == "http") {
+    // ORCA native HTTP form: "TEXT " followed by comma separated
+    // `named_metrics.<name>=<value>` pairs.
+    const std::string key_prefix = std::string(kNamedMetrics) + ".";
+    std::string text = "TEXT ";
+    text += key_prefix + kUtilization + "=";
+    text += std::to_string(kv_cache_utilization) + ", ";
+    text += key_prefix + kTokenCapacity + "=";
+    text += std::to_string(max_token_capacity);
+    return text;
+  }
+
+  LOG_ERROR << "orca_type is set to an invalid type: " << orca_type;
+  return "";
+}
+#endif  // TRITON_ENABLE_METRICS
+
 void
 HTTPAPIServer::GenerateRequestClass::ChunkResponseCallback(
     evthr_t* thr, void* arg, void* shared)
diff --git a/src/http_server.h b/src/http_server.h
index 3e865bba74..c557311bc4 100644
--- a/src/http_server.h
+++ b/src/http_server.h
@@ -364,6 +364,8 @@ class HTTPAPIServer : public HTTPServer {
     }
     virtual ~GenerateRequestClass();
 
+    // Returns `server_`: the Triton server handle, not an evhtp object.
+    TRITONSERVER_Server* EvHtpServer() const { return server_; }
     // [FIXME] Specialize response complete function for now, should have
     // been a dispatcher and call into object specific response function.
     static void InferResponseComplete(
@@ -393,6 +395,12 @@ class HTTPAPIServer : public HTTPServer {
     const MappingSchema* ResponseSchema() { return response_schema_; }
 
    private:
+#ifdef TRITON_ENABLE_METRICS
+    struct PromMetric {  // One parsed Prometheus sample.
+      std::unordered_map<std::string, std::string> labels;  // name -> value
+      double value = 0.0;  // numeric sample value; 0 until assigned
+    };
+#endif  // TRITON_ENABLE_METRICS
     struct TritonOutput {
       enum class Type { RESERVED, TENSOR, PARAMETER };
       TritonOutput(Type t, const std::string& val) : type(t), value(val) {}
@@ -403,6 +411,23 @@ class HTTPAPIServer : public HTTPServer {
       // TENSOR, PARAMETER type
       uint32_t index;
     };
+
+#ifdef TRITON_ENABLE_METRICS
+    // Parses Prometheus-format `prometheus_metrics` for the KV-cache block
+    // metrics and returns the ORCA header value ("" when unavailable).
+    static std::string ExtractKVMetrics(
+        const std::string& prometheus_metrics, const std::string& orca_type);
+    // Returns one PromMetric (label map and value) for every sample of
+    // `metricFamily` found in the Prometheus-format text `input`.
+    static std::vector<PromMetric> MetricFamilyExtractor(
+        const std::string& input, const std::string& metricFamily);
+    // Creates a header string in the proper reporting format ("json" or
+    // "http") for the provided KV-cache metrics; "" for any other format.
+    static std::string OrcaKVMetricHeader(
+        const std::string& reporting_format, const double kv_cache_utilization,
+        const uint64_t max_token_capacity);
+#endif  // TRITON_ENABLE_METRICS
+
     TRITONSERVER_Error* ExactMappingInput(
         const std::string& name, triton::common::TritonJson::Value& value,
         std::map<std::string, triton::common::TritonJson::Value>&
@@ -455,13 +480,6 @@ class HTTPAPIServer : public HTTPServer {
     evbuffer* buffer_ = nullptr;
   };
 
-#ifdef TRITON_ENABLE_METRICS
- private:
-  struct PromMetric {
-    std::unordered_map<std::string, std::string> labels;
-    double value;
-  };
-#endif  // TRITON_ENABLE_METRICS
 
  protected:
   explicit HTTPAPIServer(
@@ -566,17 +584,6 @@ class HTTPAPIServer : public HTTPServer {
   void HandleGenerate(
       evhtp_request_t* req, const std::string& model_name,
       const std::string& model_version_str, bool streaming);
-#ifdef TRITON_ENABLE_METRICS
-  // Helper function to set get the KV-cache utilization metrics for the
-  // infer response header
-  std::string ExtractKVMetrics(
-      const std::string& prometheus_metrics, const std::string& orca_type);
-
-  // Generates a metric struct for a given family with a map of labels and a
-  // value
-  std::vector<PromMetric> MetricFamilyExtractor(
-      const std::string& input, const std::string& metricFamily);
-#endif  // TRITON_ENABLE_METRICS
   // 'meta_data_root' is the root JSON document for 'input_metadata'.
   // In TritonJson, the Value objects are references to the root document.
   // Therefore the document must stay valid.