From 8dc7fbaed3766fcc0d00504fc68d0252dfd67a87 Mon Sep 17 00:00:00 2001
From: "Chan, Roy" <Roy.Chan@amd.com>
Date: Sun, 22 Sep 2024 21:21:22 -0400
Subject: [PATCH] amd/vpelib: Revise the config sharing handling

[WHY]
- the per-pipe config storage was hardcoded to hold only 16 configs
- as config descriptor usage grows, more entries are needed
- in the bypass case we also forced a new config descriptor, which is wasteful

[HOW]
- use a vector to store the configs
- don't force a new config descriptor in the bypass case
- revise the vector API to reduce parameter passing

[TESTING]
- Tested with corresponding test cases

Reviewed-by: Brendan Leder <breleder@amd.com>
Acked-by: Chih-Wei Chien <Chih-Wei.Chien@amd.com>
Signed-off-by: Roy Chan <roy.chan@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31693>
---
 src/amd/vpelib/inc/vpe_types.h                |  10 +-
 .../vpelib/src/chip/vpe10/vpe10_cmd_builder.c |  42 ++--
 .../vpelib/src/chip/vpe10/vpe10_resource.c    |   6 +-
 src/amd/vpelib/src/core/background.c          |   2 +-
 src/amd/vpelib/src/core/color.c               |  16 +-
 src/amd/vpelib/src/core/inc/config_cache.h    |  27 ++-
 src/amd/vpelib/src/core/inc/resource.h        |   6 +-
 src/amd/vpelib/src/core/inc/vpe_priv.h        |  16 +-
 src/amd/vpelib/src/core/resource.c            | 187 +++++++++++-------
 src/amd/vpelib/src/core/vpelib.c              |  77 +++++++-
 src/amd/vpelib/src/utils/inc/vector.h         |  10 +-
 src/amd/vpelib/src/utils/vector.c             |  15 +-
 12 files changed, 270 insertions(+), 144 deletions(-)

diff --git a/src/amd/vpelib/inc/vpe_types.h b/src/amd/vpelib/inc/vpe_types.h
index d40ebad829f..a9b0acf93be 100644
--- a/src/amd/vpelib/inc/vpe_types.h
+++ b/src/amd/vpelib/inc/vpe_types.h
@@ -211,12 +211,12 @@ struct vpe_caps {
     uint32_t is_apu                 : 1;
     uint32_t bg_color_check_support : 1;
     struct {
-        int num_dpp;
-        int num_opp;
-        int num_mpc_3dlut;
-        int num_cdc_be;
+        uint32_t num_dpp;
+        uint32_t num_opp;
+        uint32_t num_mpc_3dlut;
+        uint32_t num_cdc_be;
 
-        int num_queue; /**< num of hw queue */
+        uint32_t num_queue; /**< num of hw queue */
     } resource_caps;
 
     struct vpe_color_caps color_caps;
diff --git a/src/amd/vpelib/src/chip/vpe10/vpe10_cmd_builder.c b/src/amd/vpelib/src/chip/vpe10/vpe10_cmd_builder.c
index e0ded5df0f3..9b9d6fd81b9 100644
--- a/src/amd/vpelib/src/chip/vpe10/vpe10_cmd_builder.c
+++ b/src/amd/vpelib/src/chip/vpe10/vpe10_cmd_builder.c
@@ -63,9 +63,11 @@ enum vpe_status vpe10_build_vpe_cmd(
     struct cmd_builder     *builder         = &vpe_priv->resource.cmd_builder;
     struct vpe_desc_writer *vpe_desc_writer = &vpe_priv->vpe_desc_writer;
     struct vpe_buf         *emb_buf         = &cur_bufs->emb_buf;
-    struct output_ctx   *output_ctx;
-    struct pipe_ctx     *pipe_ctx = NULL;
+    struct output_ctx      *output_ctx;
+    struct pipe_ctx        *pipe_ctx = NULL;
     uint32_t                pipe_idx, config_idx;
+    struct vpe_vector      *config_vector;
+    struct config_record   *config;
     struct vpe_cmd_info    *cmd_info = vpe_vector_get(vpe_priv->vpe_cmd_vector, cmd_idx);
     VPE_ASSERT(cmd_info);
 
@@ -118,19 +120,22 @@ enum vpe_status vpe10_build_vpe_cmd(
 
             // follow the same order of config generation in "non-reuse" case
             // stream sharing
-            VPE_ASSERT(stream_ctx->num_configs[pipe_idx]);
-            for (config_idx = 0; config_idx < stream_ctx->num_configs[pipe_idx]; config_idx++) {
-                vpe_desc_writer->add_config_desc(vpe_desc_writer,
-                    stream_ctx->configs[pipe_idx][config_idx].config_base_addr, reuse,
-                    (uint8_t)emb_buf->tmz);
+            config_vector = stream_ctx->configs[pipe_idx];
+            VPE_ASSERT(config_vector->num_elements);
+            for (config_idx = 0; config_idx < config_vector->num_elements; config_idx++) {
+                config = (struct config_record *)vpe_vector_get(config_vector, config_idx);
+
+                vpe_desc_writer->add_config_desc(
+                    vpe_desc_writer, config->config_base_addr, reuse, (uint8_t)emb_buf->tmz);
             }
 
             // stream-op sharing
-            for (config_idx = 0; config_idx < stream_ctx->num_stream_op_configs[pipe_idx][cmd_type];
-                 config_idx++) {
-                vpe_desc_writer->add_config_desc(vpe_desc_writer,
-                    stream_ctx->stream_op_configs[pipe_idx][cmd_type][config_idx].config_base_addr,
-                    reuse, (uint8_t)emb_buf->tmz);
+            config_vector = stream_ctx->stream_op_configs[pipe_idx][cmd_type];
+            for (config_idx = 0; config_idx < config_vector->num_elements; config_idx++) {
+                config = (struct config_record *)vpe_vector_get(config_vector, config_idx);
+
+                vpe_desc_writer->add_config_desc(
+                    vpe_desc_writer, config->config_base_addr, reuse, (uint8_t)emb_buf->tmz);
             }
 
             // command specific
@@ -148,14 +153,19 @@ enum vpe_status vpe10_build_vpe_cmd(
 
     // backend programming
     output_ctx = &vpe_priv->output_ctx;
-    if (!output_ctx->num_configs[0]) {
+
+    config_vector = output_ctx->configs[0];
+    if (!config_vector->num_elements) {
         vpe_priv->resource.program_backend(vpe_priv, pipe_ctx->pipe_idx, cmd_idx, false);
     } else {
         bool reuse = !vpe_priv->init.debug.disable_reuse_bit;
+
         // re-use output register configs
-        for (config_idx = 0; config_idx < output_ctx->num_configs[0]; config_idx++) {
-            vpe_desc_writer->add_config_desc(vpe_desc_writer,
-                output_ctx->configs[0][config_idx].config_base_addr, reuse, (uint8_t)emb_buf->tmz);
+        for (config_idx = 0; config_idx < config_vector->num_elements; config_idx++) {
+            config = (struct config_record *)vpe_vector_get(config_vector, config_idx);
+
+            vpe_desc_writer->add_config_desc(
+                vpe_desc_writer, config->config_base_addr, reuse, (uint8_t)emb_buf->tmz);
         }
 
         vpe_priv->resource.program_backend(vpe_priv, pipe_ctx->pipe_idx, cmd_idx, true);
diff --git a/src/amd/vpelib/src/chip/vpe10/vpe10_resource.c b/src/amd/vpelib/src/chip/vpe10/vpe10_resource.c
index a47e54c7ed7..6de708dadcf 100644
--- a/src/amd/vpelib/src/chip/vpe10/vpe10_resource.c
+++ b/src/amd/vpelib/src/chip/vpe10/vpe10_resource.c
@@ -951,7 +951,7 @@ enum vpe_status vpe10_populate_cmd_info(struct vpe_priv *vpe_priv)
             cmd_info.tm_enabled         = tm_enabled;
             cmd_info.insert_start_csync = false;
             cmd_info.insert_end_csync   = false;
-            vpe_vector_push(vpe_priv, vpe_priv->vpe_cmd_vector, &cmd_info);
+            vpe_vector_push(vpe_priv->vpe_cmd_vector, &cmd_info);
 
             // The following codes are only valid if blending is supported
             /*
@@ -979,6 +979,7 @@ void vpe10_create_stream_ops_config(struct vpe_priv *vpe_priv, uint32_t pipe_idx
     struct dpp          *dpp      = vpe_priv->resource.dpp[pipe_idx];
     struct mpc          *mpc      = vpe_priv->resource.mpc[pipe_idx];
     enum vpe_cmd_type    cmd_type = VPE_CMD_TYPE_COUNT;
+    struct vpe_vector   *config_vector;
 
     vpe_priv->fe_cb_ctx.stream_op_sharing = true;
     vpe_priv->fe_cb_ctx.stream_sharing    = false;
@@ -995,7 +996,8 @@ void vpe10_create_stream_ops_config(struct vpe_priv *vpe_priv, uint32_t pipe_idx
         return;
 
     // return if already generated
-    if (stream_ctx->num_stream_op_configs[pipe_idx][cmd_type])
+    config_vector = stream_ctx->stream_op_configs[pipe_idx][cmd_type];
+    if (config_vector->num_elements)
         return;
 
     vpe_priv->fe_cb_ctx.cmd_type = cmd_type;
diff --git a/src/amd/vpelib/src/core/background.c b/src/amd/vpelib/src/core/background.c
index fed2eba9095..bc72f869fc7 100644
--- a/src/amd/vpelib/src/core/background.c
+++ b/src/amd/vpelib/src/core/background.c
@@ -118,7 +118,7 @@ void vpe_create_bg_segments(
         cmd_info.ops        = ops;
         cmd_info.cd         = (uint8_t)(gaps_cnt - gap_index - 1);
         cmd_info.tm_enabled = false; // currently only support frontend tm
-        vpe_vector_push(vpe_priv, vpe_priv->vpe_cmd_vector, &cmd_info);
+        vpe_vector_push(vpe_priv->vpe_cmd_vector, &cmd_info);
     }
 }
 
diff --git a/src/amd/vpelib/src/core/color.c b/src/amd/vpelib/src/core/color.c
index b0e12bafc2a..b2d64fb996c 100644
--- a/src/amd/vpelib/src/core/color.c
+++ b/src/amd/vpelib/src/core/color.c
@@ -181,7 +181,7 @@ static bool color_update_regamma_tf(struct vpe_priv *vpe_priv,
         break;
     }
 
-    for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
+    for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
         if (vpe_priv->init.debug.disable_lut_caching ||
             (output_tf->cache_info[i].cm_gamma_type != output_tf->cm_gamma_type) ||
             (output_tf->cache_info[i].tf != output_tf->tf) ||
@@ -198,7 +198,7 @@ static bool color_update_regamma_tf(struct vpe_priv *vpe_priv,
         ret = vpe_color_calculate_regamma_params(
             vpe_priv, x_scale, y_scale, &vpe_priv->cal_buffer, output_tf);
         if (ret) {
-            for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
+            for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
                 // reset the cache status and mark as dirty to let hw layer to re-cache
                 output_tf->dirty[i]                    = true;
                 output_tf->config_cache[i].cached      = false;
@@ -244,7 +244,7 @@ static bool color_update_degamma_tf(struct vpe_priv *vpe_priv,
         break;
     }
 
-    for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
+    for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
         if (vpe_priv->init.debug.disable_lut_caching ||
             (input_tf->cache_info[i].cm_gamma_type != input_tf->cm_gamma_type) ||
             (input_tf->cache_info[i].tf != input_tf->tf) ||
@@ -260,7 +260,7 @@ static bool color_update_degamma_tf(struct vpe_priv *vpe_priv,
     if (update) {
         ret = vpe_color_calculate_degamma_params(vpe_priv, x_scale, y_scale, input_tf);
         if (ret) {
-            for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
+            for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
                 // reset the cache status and mark as dirty to let hw layer to re-cache
                 input_tf->dirty[i]                    = true;
                 input_tf->config_cache[i].cached      = false;
@@ -683,7 +683,7 @@ enum vpe_status vpe_color_update_3dlut(
     } else {
         bool update = false;
 
-        for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++)
+        for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++)
             if (vpe_priv->init.debug.disable_lut_caching ||
                 (stream_ctx->lut3d_func->cache_info[i].uid_3dlut !=
                     stream_ctx->stream.tm_params.UID))
@@ -693,7 +693,7 @@ enum vpe_status vpe_color_update_3dlut(
             vpe_convert_to_tetrahedral(
                 vpe_priv, stream_ctx->stream.tm_params.lut_data,
                 stream_ctx->stream.tm_params.lut_dim, stream_ctx->lut3d_func);
-            for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
+            for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
                 stream_ctx->lut3d_func->dirty[i]                = true;
                 stream_ctx->lut3d_func->config_cache[i].cached  = false;
                 stream_ctx->lut3d_func->cache_info[i].uid_3dlut = stream_ctx->stream.tm_params.UID;
@@ -830,7 +830,7 @@ enum vpe_status vpe_color_update_shaper(const struct vpe_priv *vpe_priv, uint16_
     }
 
     // right now shaper is always programmed with linear, once cached, it is always reused.
-    for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
+    for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
         if (vpe_priv->init.debug.disable_lut_caching ||
             (shaper_func && (shaper_func->cache_info[i].tf != tf))) {
             // if the caching has the required data cached, skip the update
@@ -849,7 +849,7 @@ enum vpe_status vpe_color_update_shaper(const struct vpe_priv *vpe_priv, uint16_
 
         ret = vpe_build_shaper(&shaper_in, &shaper_func->pwl);
         if (ret == VPE_STATUS_OK) {
-            for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
+            for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
                 shaper_func->dirty[i]               = true;
                 shaper_func->config_cache[i].cached = false;
                 shaper_func->cache_info[i].tf       = tf;
diff --git a/src/amd/vpelib/src/core/inc/config_cache.h b/src/amd/vpelib/src/core/inc/config_cache.h
index 479ac7151c9..698d69886ef 100644
--- a/src/amd/vpelib/src/core/inc/config_cache.h
+++ b/src/amd/vpelib/src/core/inc/config_cache.h
@@ -42,17 +42,17 @@
  * The upper layer has to indicate this object is dirty or not for the hw programming layer to
  * determine i.  re-use the config cache? ii. cache the new settings?
  *
- * Before using the CONFIG_CACHE(), make sure the function has these local variables visiable in the
+ * Before using the CONFIG_CACHE(), make sure the function has these local variables visible in the
  * same code block:
  * 1. struct config_writer *config_writer
  *    - usually been declared with PROGRAM_ENTRY()
  * 2. a debug option that want to disable caching or not
  * 3. an input object that has the config_cache member
  * 4. the hw programming function that would generate command buffer content
- * 5. the object that has num_configs which stores the generated configs
+ * 5. the input/output context that has configs vector which stores the generated configs
  *
  * Inside this CONFIG_CACHE macro it will clear the dirty bit after consuming the settings
- * 
+ *
  * Make sure to free up this cache object when the parent object is destroyed using
  * CONFIG_CACHE_FREE()
  *
@@ -63,6 +63,7 @@ extern "C" {
 #endif
 
 struct vpe_priv;
+struct vpe_vector;
 
 /* a common config cache structure to be included in the object that is for program hardware API
  * layer
@@ -77,21 +78,21 @@ struct config_cache {
  * as bypass mode is not heavy lifting programming.
  *
  * /param   obj_cache           an object that has the config cache member
- * /param   obj_cfg_array       an object that contains the configs and num_configs member
+ * /param   ctx                 an input/output context that contains the configs vector
  * /param   disable_cache       a flag that controls a caching is needed
  * /param   is_bypass           if it is in bypass, it doesn't cache the bypass config
  * /param   program_func_call   the program call that generate config packet content
  * /param   inst                index to address the config_cache array
  */
-#define CONFIG_CACHE(obj_cache, obj_cfg_array, disable_cache, is_bypass, program_func_call, inst)  \
+#define CONFIG_CACHE(obj_cache, ctx, disable_cache, is_bypass, program_func_call, inst)            \
     {                                                                                              \
         bool use_cache = false;                                                                    \
                                                                                                    \
-        /* make sure it opens a new config packet */                                               \
-        config_writer_force_new_with_type(config_writer, CONFIG_TYPE_DIRECT);                      \
-                                                                                                   \
         if ((obj_cache) && !disable_cache && (obj_cache)->config_cache[inst].p_buffer &&           \
             (obj_cache)->config_cache[inst].cached && !((obj_cache)->dirty[inst]) && !is_bypass) { \
+            /* make sure it opens a new config packet */                                           \
+            config_writer_force_new_with_type(config_writer, CONFIG_TYPE_DIRECT);                  \
+                                                                                                   \
             /* reuse the cache */                                                                  \
             if (config_writer->buf->size >= (obj_cache)->config_cache[inst].size) {                \
                 memcpy((void *)(uintptr_t)config_writer->base_cpu_va,                              \
@@ -109,7 +110,13 @@ struct config_cache {
                                                                                                    \
         if (!use_cache) {                                                                          \
             uint64_t start, end;                                                                   \
-            uint16_t config_num = (uint16_t)(obj_cfg_array)->num_configs[inst];                    \
+            uint16_t num_config = (uint16_t)(ctx)->configs[inst]->num_elements;                    \
+                                                                                                   \
+            if (!is_bypass) {                                                                      \
+                /* make sure it opens a new config packet so we can cache a complete new config */ \
+                /* for bypass we don't do caching, so no need to open a new desc */                \
+                config_writer_force_new_with_type(config_writer, CONFIG_TYPE_DIRECT);              \
+            }                                                                                      \
                                                                                                    \
             start = config_writer->base_cpu_va;                                                    \
             program_func_call;                                                                     \
@@ -117,7 +124,7 @@ struct config_cache {
                                                                                                    \
             if (!disable_cache && !is_bypass) {                                                    \
                 /* only cache when it is not crossing config packets */                            \
-                if (config_num == (obj_cfg_array)->num_configs[inst]) {                            \
+                if (num_config == (ctx)->configs[inst]->num_elements) {                            \
                     if ((obj_cache)->dirty[inst]) {                                                \
                         uint64_t size = end - start;                                               \
                                                                                                    \
diff --git a/src/amd/vpelib/src/core/inc/resource.h b/src/amd/vpelib/src/core/inc/resource.h
index 0b6a32d347f..491a2f3aea5 100644
--- a/src/amd/vpelib/src/core/inc/resource.h
+++ b/src/amd/vpelib/src/core/inc/resource.h
@@ -42,7 +42,8 @@ struct vpe_priv;
 struct vpe_cmd_info;
 struct segment_ctx;
 
-#define MIN_VPE_CMD     1024
+#define MIN_VPE_CMD    (1024)
+#define MIN_NUM_CONFIG (16)
 
 enum vpe_cmd_ops;
 
@@ -132,9 +133,6 @@ struct stream_ctx *vpe_alloc_stream_ctx(struct vpe_priv *vpe_priv, uint32_t num_
 
 void vpe_free_stream_ctx(struct vpe_priv *vpe_priv);
 
-/** output ctx */
-void vpe_free_output_ctx(struct vpe_priv *vpe_priv);
-
 /** pipe resource management */
 void vpe_pipe_reset(struct vpe_priv *vpe_priv);
 
diff --git a/src/amd/vpelib/src/core/inc/vpe_priv.h b/src/amd/vpelib/src/core/inc/vpe_priv.h
index 1174fdaa1eb..7694b264e12 100644
--- a/src/amd/vpelib/src/core/inc/vpe_priv.h
+++ b/src/amd/vpelib/src/core/inc/vpe_priv.h
@@ -56,8 +56,6 @@ extern "C" {
 #define MAX_LINE_SIZE 1024 // without 16 pixels for the seams
 #define MAX_LINE_CNT  4
 
-#define MAX_NUM_SAVED_CONFIG 16
-
 enum vpe_cmd_ops {
     VPE_CMD_OPS_BLENDING,
     VPE_CMD_OPS_BG,
@@ -130,13 +128,9 @@ struct stream_ctx {
     uint16_t            num_segments;
     struct segment_ctx *segment_ctx;
 
-    uint16_t num_configs[MAX_INPUT_PIPE]; // shared among same stream
-    uint16_t num_stream_op_configs[MAX_INPUT_PIPE][VPE_CMD_TYPE_COUNT];
-    // shared among same cmd type, within the same stream
-
-    struct config_record configs[MAX_INPUT_PIPE][MAX_NUM_SAVED_CONFIG];
-    struct config_record stream_op_configs[MAX_INPUT_PIPE][VPE_CMD_TYPE_COUNT]
-                                          [MAX_NUM_SAVED_CONFIG];
+    // share configs that can be re-used once generated
+    struct vpe_vector *configs[MAX_INPUT_PIPE];
+    struct vpe_vector *stream_op_configs[MAX_INPUT_PIPE][VPE_CMD_TYPE_COUNT];
 
     // cached color properties
     bool                     per_pixel_alpha;
@@ -182,8 +176,8 @@ struct output_ctx {
     enum color_transfer_func tf;
     enum color_space         cs;
 
-    uint32_t             num_configs[MAX_OUTPUT_PIPE];
-    struct config_record configs[MAX_OUTPUT_PIPE][MAX_NUM_SAVED_CONFIG];
+    // store generated per-pipe configs that can be reused
+    struct vpe_vector *configs[MAX_OUTPUT_PIPE];
 
     union {
         struct {
diff --git a/src/amd/vpelib/src/core/resource.c b/src/amd/vpelib/src/core/resource.c
index ba9a6db6d0f..0415a021c95 100644
--- a/src/amd/vpelib/src/core/resource.c
+++ b/src/amd/vpelib/src/core/resource.c
@@ -164,42 +164,90 @@ struct segment_ctx *vpe_alloc_segment_ctx(struct vpe_priv *vpe_priv, uint16_t nu
     return segment_ctx_base;
 }
 
-struct stream_ctx *vpe_alloc_stream_ctx(struct vpe_priv *vpe_priv, uint32_t num_streams)
+/**
+ * Create the config vectors of one stream context.
+ *
+ * For every pipe below resource_caps.num_dpp this creates configs[pipe] and one
+ * stream_op_configs[pipe][type] per command type, each holding config_record
+ * elements.
+ *
+ * Returns VPE_STATUS_NO_MEMORY on the first creation failure, VPE_STATUS_OK
+ * otherwise. Vectors created before a failure are not released here.
+ */
+static enum vpe_status create_input_config_vector(struct stream_ctx *stream_ctx)
 {
-    struct stream_ctx *ctx_base, *ctx;
-    uint32_t           i;
+    enum vpe_status  res = VPE_STATUS_OK;
+    uint32_t         pipe_idx, type_idx;
+    struct vpe_priv *vpe_priv;
 
-    ctx_base = (struct stream_ctx *)vpe_zalloc(sizeof(struct stream_ctx) * num_streams);
-    if (!ctx_base)
-        return NULL;
+    vpe_priv = stream_ctx->vpe_priv;
 
-    for (i = 0; i < num_streams; i++) {
-        ctx           = &ctx_base[i];
-        ctx->cs       = COLOR_SPACE_UNKNOWN;
-        ctx->tf       = TRANSFER_FUNC_UNKNOWN;
-        ctx->vpe_priv = vpe_priv;
-        vpe_color_set_adjustments_to_default(&ctx->color_adjustments);
-        ctx->tf_scaling_factor = vpe_fixpt_one;
-        ctx->stream.flags.geometric_scaling = 0;
-        ctx->stream.tm_params.UID = 0;
-        ctx->uid_3dlut = 0;
+    for (pipe_idx = 0; pipe_idx < vpe_priv->pub.caps->resource_caps.num_dpp; pipe_idx++) {
+        stream_ctx->configs[pipe_idx] =
+            vpe_vector_create(vpe_priv, sizeof(struct config_record), MIN_NUM_CONFIG);
+        if (!stream_ctx->configs[pipe_idx]) {
+            res = VPE_STATUS_NO_MEMORY;
+            break;
+        }
+
+        for (type_idx = 0; type_idx < VPE_CMD_TYPE_COUNT; type_idx++) {
+            stream_ctx->stream_op_configs[pipe_idx][type_idx] =
+                vpe_vector_create(vpe_priv, sizeof(struct config_record), MIN_NUM_CONFIG);
+            if (!stream_ctx->stream_op_configs[pipe_idx][type_idx]) {
+                res = VPE_STATUS_NO_MEMORY;
+                break;
+            }
+        }
+
+        // a failure in the inner loop has to stop the outer loop as well
+        if (res != VPE_STATUS_OK)
+            break;
     }
 
-    return ctx_base;
+    return res;
 }
 
-void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
+/**
+ * Free every config vector of one stream context that is not NULL and reset
+ * its slot to NULL.
+ */
+static void destroy_input_config_vector(struct stream_ctx *stream_ctx)
 {
-    uint16_t           i;
-    struct stream_ctx *ctx;
+    uint32_t         pipe_idx, type_idx;
+    struct vpe_priv *vpe_priv;
 
-    if (!vpe_priv->stream_ctx || !vpe_priv->num_streams)
+    vpe_priv = stream_ctx->vpe_priv;
+
+    for (pipe_idx = 0; pipe_idx < vpe_priv->pub.caps->resource_caps.num_dpp; pipe_idx++) {
+        if (stream_ctx->configs[pipe_idx]) {
+            vpe_vector_free(stream_ctx->configs[pipe_idx]);
+            stream_ctx->configs[pipe_idx] = NULL;
+        }
+
+        for (type_idx = 0; type_idx < VPE_CMD_TYPE_COUNT; type_idx++) {
+            if (stream_ctx->stream_op_configs[pipe_idx][type_idx]) {
+                vpe_vector_free(stream_ctx->stream_op_configs[pipe_idx][type_idx]);
+                stream_ctx->stream_op_configs[pipe_idx][type_idx] = NULL;
+            }
+        }
+    }
+}
+
+static void free_stream_ctx(uint32_t num_streams, struct stream_ctx *stream_ctx)
+{
+    struct vpe_priv *vpe_priv;
+    uint32_t         stream_idx;
+
+    if (!stream_ctx || !num_streams)
         return;
 
-    for (i = 0; i < vpe_priv->num_streams; i++) {
-        ctx = &vpe_priv->stream_ctx[i];
+    vpe_priv = stream_ctx[0].vpe_priv;
+
+    for (stream_idx = 0; stream_idx < num_streams; stream_idx++) {
+        struct stream_ctx *ctx = &stream_ctx[stream_idx];
+
         if (ctx->input_tf) {
-            for (int j = 0; j < MAX_INPUT_PIPE; j++)
+            for (uint32_t j = 0; j < MAX_INPUT_PIPE; j++)
                 CONFIG_CACHE_FREE(ctx->input_tf->config_cache[j]);
             vpe_free(ctx->input_tf);
             ctx->input_tf = NULL;
@@ -221,21 +254,21 @@ void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
         }
 
         if (ctx->in_shaper_func) {
-            for (int j = 0; j < MAX_INPUT_PIPE; j++)
+            for (uint32_t j = 0; j < MAX_INPUT_PIPE; j++)
                 CONFIG_CACHE_FREE(ctx->in_shaper_func->config_cache[j]);
             vpe_free(ctx->in_shaper_func);
             ctx->in_shaper_func = NULL;
         }
 
         if (ctx->blend_tf) {
-            for (int j = 0; j < MAX_INPUT_PIPE; j++)
+            for (uint32_t j = 0; j < MAX_INPUT_PIPE; j++)
                 CONFIG_CACHE_FREE(ctx->blend_tf->config_cache[j]);
             vpe_free(ctx->blend_tf);
             ctx->blend_tf = NULL;
         }
 
         if (ctx->lut3d_func) {
-            for (int j = 0; j < MAX_3DLUT; j++)
+            for (uint32_t j = 0; j < MAX_3DLUT; j++)
                 CONFIG_CACHE_FREE(ctx->lut3d_func->config_cache[j]);
             vpe_free(ctx->lut3d_func);
             ctx->lut3d_func = NULL;
@@ -245,20 +278,68 @@ void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
             vpe_free(ctx->segment_ctx);
             ctx->segment_ctx = NULL;
         }
+
+        destroy_input_config_vector(ctx);
     }
-    vpe_free(vpe_priv->stream_ctx);
-    vpe_priv->stream_ctx  = NULL;
-    vpe_priv->num_streams = 0;
-    vpe_priv->num_virtual_streams = 0;
 }
 
-void vpe_free_output_ctx(struct vpe_priv *vpe_priv)
+/**
+ * Allocate an array of num_streams stream contexts with vpe_zalloc(), set the
+ * default color state of each entry and create its config vectors.
+ *
+ * Returns the array, or NULL when the array or any config vector could not be
+ * allocated; in that case everything allocated so far has been released.
+ */
+struct stream_ctx *vpe_alloc_stream_ctx(struct vpe_priv *vpe_priv, uint32_t num_streams)
 {
-    if (vpe_priv->output_ctx.gamut_remap)
-        vpe_free(vpe_priv->output_ctx.gamut_remap);
+    struct stream_ctx *ctx_base, *ctx;
+    uint32_t           stream_idx;
+    enum vpe_status    res = VPE_STATUS_OK;
 
-    if (vpe_priv->output_ctx.output_tf)
-        vpe_free(vpe_priv->output_ctx.output_tf);
+    ctx_base = (struct stream_ctx *)vpe_zalloc(sizeof(struct stream_ctx) * num_streams);
+    if (!ctx_base)
+        return NULL;
+
+    for (stream_idx = 0; stream_idx < num_streams; stream_idx++) {
+        ctx           = &ctx_base[stream_idx];
+        ctx->cs       = COLOR_SPACE_UNKNOWN;
+        ctx->tf       = TRANSFER_FUNC_UNKNOWN;
+        ctx->vpe_priv = vpe_priv;
+        vpe_color_set_adjustments_to_default(&ctx->color_adjustments);
+        ctx->tf_scaling_factor              = vpe_fixpt_one;
+        ctx->stream.flags.geometric_scaling = 0;
+        ctx->stream.tm_params.UID           = 0;
+        ctx->uid_3dlut                      = 0;
+
+        if ((res = create_input_config_vector(ctx)) != VPE_STATUS_OK)
+            break;
+    }
+
+    if (res != VPE_STATUS_OK) {
+        // only entries [0, stream_idx] had vpe_priv assigned; the cleanup helpers
+        // dereference it, so the untouched tail of the array must be left out
+        free_stream_ctx(stream_idx + 1, ctx_base);
+        // free_stream_ctx() releases the members only, the array is ours to free
+        vpe_free(ctx_base);
+        ctx_base = NULL;
+    }
+    return ctx_base;
+}
+
+/**
+ * Release all stream contexts owned by vpe_priv, including the array itself,
+ * and reset stream_ctx, num_streams and num_virtual_streams.
+ */
+void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
+{
+    if (vpe_priv->num_streams && vpe_priv->stream_ctx) {
+        free_stream_ctx(vpe_priv->num_streams, vpe_priv->stream_ctx);
+        vpe_free(vpe_priv->stream_ctx);
+    }
+
+    vpe_priv->stream_ctx          = NULL;
+    vpe_priv->num_streams         = 0;
+    vpe_priv->num_virtual_streams = 0;
 }
 
 void vpe_pipe_reset(struct vpe_priv *vpe_priv)
@@ -697,35 +763,24 @@ void vpe_resource_build_bit_depth_reduction_params(
 void vpe_frontend_config_callback(
     void *ctx, uint64_t cfg_base_gpu, uint64_t cfg_base_cpu, uint64_t size, uint32_t pipe_idx)
 {
-    struct config_frontend_cb_ctx *cb_ctx = (struct config_frontend_cb_ctx*)ctx;
-    struct vpe_priv *vpe_priv             = cb_ctx->vpe_priv;
-    struct stream_ctx *stream_ctx         = &vpe_priv->stream_ctx[cb_ctx->stream_idx];
-    enum vpe_cmd_type  cmd_type;
+    struct config_frontend_cb_ctx *cb_ctx     = (struct config_frontend_cb_ctx *)ctx;
+    struct vpe_priv               *vpe_priv   = cb_ctx->vpe_priv;
+    struct stream_ctx             *stream_ctx = &vpe_priv->stream_ctx[cb_ctx->stream_idx];
+    enum vpe_cmd_type              cmd_type;
+    struct config_record           record;
 
     if (cb_ctx->stream_sharing) {
-        VPE_ASSERT(stream_ctx->num_configs[pipe_idx] <
-                   (int)(sizeof(stream_ctx->configs[pipe_idx]) / sizeof(struct config_record)));
+        record.config_base_addr = cfg_base_gpu;
+        record.config_size      = size;
 
-        stream_ctx->configs[pipe_idx][stream_ctx->num_configs[pipe_idx]].config_base_addr =
-            cfg_base_gpu;
-        stream_ctx->configs[pipe_idx][stream_ctx->num_configs[pipe_idx]].config_size = size;
-        stream_ctx->num_configs[pipe_idx]++;
+        vpe_vector_push(stream_ctx->configs[pipe_idx], &record);
     } else if (cb_ctx->stream_op_sharing) {
         cmd_type = cb_ctx->cmd_type;
 
-        VPE_ASSERT(stream_ctx->num_stream_op_configs[pipe_idx][cmd_type] <
-                   (int)(sizeof(stream_ctx->stream_op_configs[pipe_idx][cmd_type]) /
-                         sizeof(struct config_record)));
+        record.config_base_addr = cfg_base_gpu;
+        record.config_size      = size;
 
-        stream_ctx
-            ->stream_op_configs[pipe_idx][cmd_type]
-                               [stream_ctx->num_stream_op_configs[pipe_idx][cmd_type]]
-            .config_base_addr = cfg_base_gpu;
-        stream_ctx
-            ->stream_op_configs[pipe_idx][cmd_type]
-                               [stream_ctx->num_stream_op_configs[pipe_idx][cmd_type]]
-            .config_size = size;
-        stream_ctx->num_stream_op_configs[pipe_idx][cmd_type]++;
+        vpe_vector_push(stream_ctx->stream_op_configs[pipe_idx][cmd_type], &record);
     }
 
     vpe_priv->vpe_desc_writer.add_config_desc(
@@ -735,18 +790,16 @@ void vpe_frontend_config_callback(
 void vpe_backend_config_callback(
     void *ctx, uint64_t cfg_base_gpu, uint64_t cfg_base_cpu, uint64_t size, uint32_t pipe_idx)
 {
-    struct config_backend_cb_ctx *cb_ctx = (struct config_backend_cb_ctx*)ctx;
-    struct vpe_priv *vpe_priv            = cb_ctx->vpe_priv;
-    struct output_ctx *output_ctx        = &vpe_priv->output_ctx;
+    struct config_backend_cb_ctx *cb_ctx     = (struct config_backend_cb_ctx *)ctx;
+    struct vpe_priv              *vpe_priv   = cb_ctx->vpe_priv;
+    struct output_ctx            *output_ctx = &vpe_priv->output_ctx;
+    struct config_record          record;
 
     if (cb_ctx->share) {
-        VPE_ASSERT(output_ctx->num_configs[pipe_idx] <
-                   (sizeof(output_ctx->configs[pipe_idx]) / sizeof(struct config_record)));
+        record.config_base_addr = cfg_base_gpu;
+        record.config_size      = size;
 
-        output_ctx->configs[pipe_idx][output_ctx->num_configs[pipe_idx]].config_base_addr =
-            cfg_base_gpu;
-        output_ctx->configs[pipe_idx][output_ctx->num_configs[pipe_idx]].config_size = size;
-        output_ctx->num_configs[pipe_idx]++;
+        vpe_vector_push(output_ctx->configs[pipe_idx], &record);
     }
 
     vpe_priv->vpe_desc_writer.add_config_desc(
diff --git a/src/amd/vpelib/src/core/vpelib.c b/src/amd/vpelib/src/core/vpelib.c
index 419a668968f..594f1598a99 100644
--- a/src/amd/vpelib/src/core/vpelib.c
+++ b/src/amd/vpelib/src/core/vpelib.c
@@ -128,7 +128,6 @@ static void override_debug_option(
         debug->disable_lut_caching = user_debug->disable_lut_caching;
 }
 
-#ifdef VPE_BUILD_1_1
 static void verify_collaboration_mode(struct vpe_priv *vpe_priv)
 {
     if (vpe_priv->pub.level == VPE_IP_LEVEL_1_1) {
@@ -142,7 +141,44 @@ static void verify_collaboration_mode(struct vpe_priv *vpe_priv)
         vpe_priv->collaboration_mode = false;
     }
 }
-#endif
+
+static enum vpe_status create_output_config_vector(struct vpe_priv *vpe_priv)
+{
+    struct output_ctx *output_ctx = &vpe_priv->output_ctx;
+    uint32_t           pipe_idx;
+
+    // one vector per output pipe index (num_cdc_be of them); each keeps share-able configs
+    for (pipe_idx = 0; pipe_idx < vpe_priv->pub.caps->resource_caps.num_cdc_be; pipe_idx++) {
+        output_ctx->configs[pipe_idx] =
+            vpe_vector_create(vpe_priv, sizeof(struct config_record), MIN_NUM_CONFIG);
+        if (output_ctx->configs[pipe_idx] == NULL)
+            return VPE_STATUS_NO_MEMORY; // vectors created so far are not freed here
+    }
+    return VPE_STATUS_OK;
+}
+
+static void destroy_output_config_vector(struct vpe_priv *vpe_priv)
+{
+    uint32_t pipe_idx;
+
+    for (pipe_idx = 0; pipe_idx < vpe_priv->pub.caps->resource_caps.num_cdc_be; pipe_idx++) {
+        if (!vpe_priv->output_ctx.configs[pipe_idx])
+            continue; // nothing stored in this slot, leave it alone
+        vpe_vector_free(vpe_priv->output_ctx.configs[pipe_idx]);
+        vpe_priv->output_ctx.configs[pipe_idx] = NULL; // slot is skipped on a repeat call
+    }
+}
+
+static void free_output_ctx(struct vpe_priv *vpe_priv)
+{
+    struct output_ctx *output_ctx = &vpe_priv->output_ctx;
+    // frees gamut_remap and output_tf when set (pointers are left as-is), then the config vectors
+    if (output_ctx->gamut_remap != NULL)
+        vpe_free(output_ctx->gamut_remap);
+    if (output_ctx->output_tf != NULL)
+        vpe_free(output_ctx->output_tf);
+    destroy_output_config_vector(vpe_priv);
+}
 
 struct vpe *vpe_create(const struct vpe_init_data *params)
 {
@@ -178,6 +214,14 @@ struct vpe *vpe_create(const struct vpe_init_data *params)
         vpe_free(vpe_priv);
         return NULL;
     }
+
+    status = create_output_config_vector(vpe_priv);
+    if (status != VPE_STATUS_OK) {
+        destroy_output_config_vector(vpe_priv);
+        vpe_free(vpe_priv);
+        return NULL;
+    }
+
     override_debug_option(&vpe_priv->init.debug, &params->debug);
 
     vpe_color_setup_x_points_distribution();
@@ -204,12 +248,12 @@ void vpe_destroy(struct vpe **vpe)
 
     vpe_destroy_resource(vpe_priv, &vpe_priv->resource);
 
-    vpe_free_output_ctx(vpe_priv);
+    free_output_ctx(vpe_priv);
 
     vpe_free_stream_ctx(vpe_priv);
 
     if (vpe_priv->vpe_cmd_vector)
-        vpe_vector_free(vpe_priv, vpe_priv->vpe_cmd_vector);
+        vpe_vector_free(vpe_priv->vpe_cmd_vector);
 
     if (vpe_priv->dummy_input_param)
         vpe_free(vpe_priv->dummy_input_param);
@@ -629,13 +673,15 @@ enum vpe_status vpe_build_commands(
     struct vpe_priv      *vpe_priv;
     struct cmd_builder   *builder;
     enum vpe_status       status = VPE_STATUS_OK;
-    uint32_t              cmd_idx, i, pipe_idx, stream_idx, cmd_type_idx;
+    uint32_t              cmd_idx, pipe_idx, stream_idx, cmd_type_idx;
     struct vpe_build_bufs curr_bufs;
     int64_t               cmd_buf_size;
     int64_t               emb_buf_size;
     uint64_t              cmd_buf_gpu_a, cmd_buf_cpu_a;
     uint64_t              emb_buf_gpu_a, emb_buf_cpu_a;
+    struct vpe_vector    *config_vector;
     struct vpe_cmd_info  *cmd_info;
+
     if (!vpe || !param || !bufs)
         return VPE_STATUS_ERROR;
 
@@ -686,15 +732,26 @@ enum vpe_status vpe_build_commands(
 
     // copy the param, reset saved configs
     for (stream_idx = 0; stream_idx < vpe_priv->num_streams; stream_idx++) {
+        struct stream_ctx *stream_ctx = &vpe_priv->stream_ctx[stream_idx];
+
         for (pipe_idx = 0; pipe_idx < MAX_INPUT_PIPE; pipe_idx++) {
-            vpe_priv->stream_ctx[stream_idx].num_configs[pipe_idx] = 0;
-            for (cmd_type_idx = 0; cmd_type_idx < VPE_CMD_TYPE_COUNT; cmd_type_idx++)
-                vpe_priv->stream_ctx[stream_idx].num_stream_op_configs[pipe_idx][cmd_type_idx] = 0;
+            config_vector = stream_ctx->configs[pipe_idx];
+            if (config_vector)
+                vpe_vector_clear(config_vector);
+
+            for (cmd_type_idx = 0; cmd_type_idx < VPE_CMD_TYPE_COUNT; cmd_type_idx++) {
+                config_vector = stream_ctx->stream_op_configs[pipe_idx][cmd_type_idx];
+                if (config_vector)
+                    vpe_vector_clear(config_vector);
+            }
         }
     }
 
-    for (i = 0; i < MAX_OUTPUT_PIPE; i++)
-        vpe_priv->output_ctx.num_configs[i] = 0;
+    for (pipe_idx = 0; pipe_idx < vpe_priv->pub.caps->resource_caps.num_cdc_be; pipe_idx++) {
+        config_vector = vpe_priv->output_ctx.configs[pipe_idx];
+        if (config_vector)
+            vpe_vector_clear(config_vector);
+    }
 
     // Reset pipes
     vpe_pipe_reset(vpe_priv);
diff --git a/src/amd/vpelib/src/utils/inc/vector.h b/src/amd/vpelib/src/utils/inc/vector.h
index 95006d1f97b..070b73af302 100644
--- a/src/amd/vpelib/src/utils/inc/vector.h
+++ b/src/amd/vpelib/src/utils/inc/vector.h
@@ -31,7 +31,11 @@
 extern "C" {
 #endif
 
+struct vpe_priv;
+
 struct vpe_vector {
+    struct vpe_priv *vpe_priv; /*< store the vpe_priv for alloc/free memory */
+
     void  *element;      /*< the internal vector memory storage */
     size_t num_elements; /*< number of stored elements */
     size_t capacity;
@@ -57,11 +61,10 @@ void *vpe_vector_get(struct vpe_vector *vector, size_t idx);
 
 /**
  * Push the element to end of the vector.
- * @param[in]  vpe_priv  vpe instance created by vpe_create()
  * @param[in]  vector    vector that we want to push to the end.
  * @param[in]  p_element pointer of the element
  */
-void vpe_vector_push(struct vpe_priv *vpe_priv, struct vpe_vector *vector, void *p_element);
+void vpe_vector_push(struct vpe_vector *vector, void *p_element);
 
 /**
  * Clear the vector.
@@ -71,10 +74,9 @@ void vpe_vector_clear(struct vpe_vector *vector);
 
 /**
  * Free the vector.
- * @param[in]  vpe_priv vpe instance created by vpe_create()
  * @param[in]  vector   vector that we want to free.
  */
-void vpe_vector_free(struct vpe_priv *vpe_priv, struct vpe_vector *vpe_vector);
+void vpe_vector_free(struct vpe_vector *vpe_vector);
 
 #ifdef __cplusplus
 }
diff --git a/src/amd/vpelib/src/utils/vector.c b/src/amd/vpelib/src/utils/vector.c
index d9eca9f6a48..02bd3cd58fb 100644
--- a/src/amd/vpelib/src/utils/vector.c
+++ b/src/amd/vpelib/src/utils/vector.c
@@ -39,6 +39,7 @@ struct vpe_vector *vpe_vector_create(
         return NULL;
     }
 
+    vector->vpe_priv     = vpe_priv;
     vector->num_elements = 0;
     vector->capacity     = initial_capacity;
     vector->element_size = element_size;
@@ -46,9 +47,10 @@ struct vpe_vector *vpe_vector_create(
     return vector;
 }
 
-static struct vpe_vector *vector_realloc(
-    struct vpe_priv *vpe_priv, struct vpe_vector *vector, size_t new_size)
+static struct vpe_vector *vector_realloc(struct vpe_vector *vector, size_t new_size)
 {
+    struct vpe_priv *vpe_priv = vector->vpe_priv;
+
     void *new_element = vpe_zalloc(new_size);
     if (!new_element)
         return NULL;
@@ -70,14 +72,14 @@ void *vpe_vector_get(struct vpe_vector *vector, size_t idx)
     return (void *)((char *)(vector->element) + (idx * vector->element_size));
 }
 
-void vpe_vector_push(struct vpe_priv *vpe_priv, struct vpe_vector *vector, void *p_element)
+void vpe_vector_push(struct vpe_vector *vector, void *p_element)
 {
     if (!p_element || !vector)
         return;
 
     if (vector->num_elements >= vector->capacity) {
         vector->capacity *= 2;
-        vector = vector_realloc(vpe_priv, vector, vector->capacity * vector->element_size);
+        vector = vector_realloc(vector, vector->capacity * vector->element_size);
     }
 
     if (!vector)
@@ -97,10 +99,18 @@ void vpe_vector_clear(struct vpe_vector *vector)
     memset(vector->element, 0, vector->capacity * vector->element_size);
 }
 
-void vpe_vector_free(struct vpe_priv *vpe_priv, struct vpe_vector *vector)
+/* Free the vector and its element storage; a NULL vector is a no-op. */
+void vpe_vector_free(struct vpe_vector *vector)
 {
+    struct vpe_priv *vpe_priv;
+
+    // tolerate NULL like vpe_vector_push() does, instead of dereferencing it below
+    if (!vector)
+        return;
+
+    // NOTE(review): vpe_free() appears to need a local named vpe_priv in scope - confirm
+    vpe_priv = vector->vpe_priv;
     vpe_free(vector->element);
     vector->element = NULL;
     vpe_free(vector);
-    vector = NULL;
 }