amd/vpelib: Revise the config sharing handling

[WHY]
- was hardcoded to store 16 configs only
- as the config descriptor usage grows, more is needed
- in the bypass case, we also generate a new config descriptor, which is wasteful

[HOW]
- change to use vector to store configs
- don't force new config desc if in bypass
- revise the vector API, reduce the parameter passing

[TESTING]
- Tested with corresponding test cases

Reviewed-by: Brendan Leder <breleder@amd.com>
Acked-by: Chih-Wei Chien <Chih-Wei.Chien@amd.com>
Signed-off-by: Roy Chan <roy.chan@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31693>
This commit is contained in:
Chan, Roy
2024-09-22 21:21:22 -04:00
committed by Marge Bot
parent 9751b52a10
commit 8dc7fbaed3
12 changed files with 270 additions and 144 deletions

View File

@@ -211,12 +211,12 @@ struct vpe_caps {
uint32_t is_apu : 1;
uint32_t bg_color_check_support : 1;
struct {
int num_dpp;
int num_opp;
int num_mpc_3dlut;
int num_cdc_be;
uint32_t num_dpp;
uint32_t num_opp;
uint32_t num_mpc_3dlut;
uint32_t num_cdc_be;
int num_queue; /**< num of hw queue */
uint32_t num_queue; /**< num of hw queue */
} resource_caps;
struct vpe_color_caps color_caps;

View File

@@ -63,9 +63,11 @@ enum vpe_status vpe10_build_vpe_cmd(
struct cmd_builder *builder = &vpe_priv->resource.cmd_builder;
struct vpe_desc_writer *vpe_desc_writer = &vpe_priv->vpe_desc_writer;
struct vpe_buf *emb_buf = &cur_bufs->emb_buf;
struct output_ctx *output_ctx;
struct pipe_ctx *pipe_ctx = NULL;
struct output_ctx *output_ctx;
struct pipe_ctx *pipe_ctx = NULL;
uint32_t pipe_idx, config_idx;
struct vpe_vector *config_vector;
struct config_record *config;
struct vpe_cmd_info *cmd_info = vpe_vector_get(vpe_priv->vpe_cmd_vector, cmd_idx);
VPE_ASSERT(cmd_info);
@@ -118,19 +120,22 @@ enum vpe_status vpe10_build_vpe_cmd(
// follow the same order of config generation in "non-reuse" case
// stream sharing
VPE_ASSERT(stream_ctx->num_configs[pipe_idx]);
for (config_idx = 0; config_idx < stream_ctx->num_configs[pipe_idx]; config_idx++) {
vpe_desc_writer->add_config_desc(vpe_desc_writer,
stream_ctx->configs[pipe_idx][config_idx].config_base_addr, reuse,
(uint8_t)emb_buf->tmz);
config_vector = stream_ctx->configs[pipe_idx];
VPE_ASSERT(config_vector->num_elements);
for (config_idx = 0; config_idx < config_vector->num_elements; config_idx++) {
config = (struct config_record *)vpe_vector_get(config_vector, config_idx);
vpe_desc_writer->add_config_desc(
vpe_desc_writer, config->config_base_addr, reuse, (uint8_t)emb_buf->tmz);
}
// stream-op sharing
for (config_idx = 0; config_idx < stream_ctx->num_stream_op_configs[pipe_idx][cmd_type];
config_idx++) {
vpe_desc_writer->add_config_desc(vpe_desc_writer,
stream_ctx->stream_op_configs[pipe_idx][cmd_type][config_idx].config_base_addr,
reuse, (uint8_t)emb_buf->tmz);
config_vector = stream_ctx->stream_op_configs[pipe_idx][cmd_type];
for (config_idx = 0; config_idx < config_vector->num_elements; config_idx++) {
config = (struct config_record *)vpe_vector_get(config_vector, config_idx);
vpe_desc_writer->add_config_desc(
vpe_desc_writer, config->config_base_addr, reuse, (uint8_t)emb_buf->tmz);
}
// command specific
@@ -148,14 +153,19 @@ enum vpe_status vpe10_build_vpe_cmd(
// backend programming
output_ctx = &vpe_priv->output_ctx;
if (!output_ctx->num_configs[0]) {
config_vector = output_ctx->configs[0];
if (!config_vector->num_elements) {
vpe_priv->resource.program_backend(vpe_priv, pipe_ctx->pipe_idx, cmd_idx, false);
} else {
bool reuse = !vpe_priv->init.debug.disable_reuse_bit;
// re-use output register configs
for (config_idx = 0; config_idx < output_ctx->num_configs[0]; config_idx++) {
vpe_desc_writer->add_config_desc(vpe_desc_writer,
output_ctx->configs[0][config_idx].config_base_addr, reuse, (uint8_t)emb_buf->tmz);
for (config_idx = 0; config_idx < config_vector->num_elements; config_idx++) {
config = (struct config_record *)vpe_vector_get(config_vector, config_idx);
vpe_desc_writer->add_config_desc(
vpe_desc_writer, config->config_base_addr, reuse, (uint8_t)emb_buf->tmz);
}
vpe_priv->resource.program_backend(vpe_priv, pipe_ctx->pipe_idx, cmd_idx, true);

View File

@@ -951,7 +951,7 @@ enum vpe_status vpe10_populate_cmd_info(struct vpe_priv *vpe_priv)
cmd_info.tm_enabled = tm_enabled;
cmd_info.insert_start_csync = false;
cmd_info.insert_end_csync = false;
vpe_vector_push(vpe_priv, vpe_priv->vpe_cmd_vector, &cmd_info);
vpe_vector_push(vpe_priv->vpe_cmd_vector, &cmd_info);
// The following codes are only valid if blending is supported
/*
@@ -979,6 +979,7 @@ void vpe10_create_stream_ops_config(struct vpe_priv *vpe_priv, uint32_t pipe_idx
struct dpp *dpp = vpe_priv->resource.dpp[pipe_idx];
struct mpc *mpc = vpe_priv->resource.mpc[pipe_idx];
enum vpe_cmd_type cmd_type = VPE_CMD_TYPE_COUNT;
struct vpe_vector *config_vector;
vpe_priv->fe_cb_ctx.stream_op_sharing = true;
vpe_priv->fe_cb_ctx.stream_sharing = false;
@@ -995,7 +996,8 @@ void vpe10_create_stream_ops_config(struct vpe_priv *vpe_priv, uint32_t pipe_idx
return;
// return if already generated
if (stream_ctx->num_stream_op_configs[pipe_idx][cmd_type])
config_vector = stream_ctx->stream_op_configs[pipe_idx][cmd_type];
if (config_vector->num_elements)
return;
vpe_priv->fe_cb_ctx.cmd_type = cmd_type;

View File

@@ -118,7 +118,7 @@ void vpe_create_bg_segments(
cmd_info.ops = ops;
cmd_info.cd = (uint8_t)(gaps_cnt - gap_index - 1);
cmd_info.tm_enabled = false; // currently only support frontend tm
vpe_vector_push(vpe_priv, vpe_priv->vpe_cmd_vector, &cmd_info);
vpe_vector_push(vpe_priv->vpe_cmd_vector, &cmd_info);
}
}

View File

@@ -181,7 +181,7 @@ static bool color_update_regamma_tf(struct vpe_priv *vpe_priv,
break;
}
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
if (vpe_priv->init.debug.disable_lut_caching ||
(output_tf->cache_info[i].cm_gamma_type != output_tf->cm_gamma_type) ||
(output_tf->cache_info[i].tf != output_tf->tf) ||
@@ -198,7 +198,7 @@ static bool color_update_regamma_tf(struct vpe_priv *vpe_priv,
ret = vpe_color_calculate_regamma_params(
vpe_priv, x_scale, y_scale, &vpe_priv->cal_buffer, output_tf);
if (ret) {
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
// reset the cache status and mark as dirty to let hw layer to re-cache
output_tf->dirty[i] = true;
output_tf->config_cache[i].cached = false;
@@ -244,7 +244,7 @@ static bool color_update_degamma_tf(struct vpe_priv *vpe_priv,
break;
}
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
if (vpe_priv->init.debug.disable_lut_caching ||
(input_tf->cache_info[i].cm_gamma_type != input_tf->cm_gamma_type) ||
(input_tf->cache_info[i].tf != input_tf->tf) ||
@@ -260,7 +260,7 @@ static bool color_update_degamma_tf(struct vpe_priv *vpe_priv,
if (update) {
ret = vpe_color_calculate_degamma_params(vpe_priv, x_scale, y_scale, input_tf);
if (ret) {
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_dpp; i++) {
// reset the cache status and mark as dirty to let hw layer to re-cache
input_tf->dirty[i] = true;
input_tf->config_cache[i].cached = false;
@@ -683,7 +683,7 @@ enum vpe_status vpe_color_update_3dlut(
} else {
bool update = false;
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++)
for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++)
if (vpe_priv->init.debug.disable_lut_caching ||
(stream_ctx->lut3d_func->cache_info[i].uid_3dlut !=
stream_ctx->stream.tm_params.UID))
@@ -693,7 +693,7 @@ enum vpe_status vpe_color_update_3dlut(
vpe_convert_to_tetrahedral(
vpe_priv, stream_ctx->stream.tm_params.lut_data,
stream_ctx->stream.tm_params.lut_dim, stream_ctx->lut3d_func);
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
stream_ctx->lut3d_func->dirty[i] = true;
stream_ctx->lut3d_func->config_cache[i].cached = false;
stream_ctx->lut3d_func->cache_info[i].uid_3dlut = stream_ctx->stream.tm_params.UID;
@@ -830,7 +830,7 @@ enum vpe_status vpe_color_update_shaper(const struct vpe_priv *vpe_priv, uint16_
}
// right now shaper is always programmed with linear, once cached, it is always reused.
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
if (vpe_priv->init.debug.disable_lut_caching ||
(shaper_func && (shaper_func->cache_info[i].tf != tf))) {
// if the caching has the required data cached, skip the update
@@ -849,7 +849,7 @@ enum vpe_status vpe_color_update_shaper(const struct vpe_priv *vpe_priv, uint16_
ret = vpe_build_shaper(&shaper_in, &shaper_func->pwl);
if (ret == VPE_STATUS_OK) {
for (int i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
for (uint32_t i = 0; i < vpe_priv->pub.caps->resource_caps.num_mpc_3dlut; i++) {
shaper_func->dirty[i] = true;
shaper_func->config_cache[i].cached = false;
shaper_func->cache_info[i].tf = tf;

View File

@@ -42,17 +42,17 @@
* The upper layer has to indicate this object is dirty or not for the hw programming layer to
* determine i. re-use the config cache? ii. cache the new settings?
*
* Before using the CONFIG_CACHE(), make sure the function has these local variables visiable in the
* Before using the CONFIG_CACHE(), make sure the function has these local variables visible in the
* same code block:
* 1. struct config_writer *config_writer
* - usually been declared with PROGRAM_ENTRY()
* 2. a debug option that want to disable caching or not
* 3. an input object that has the config_cache member
* 4. the hw programming function that would generate command buffer content
* 5. the object that has num_configs which stores the generated configs
* 5. the input/output context that has configs vector which stores the generated configs
*
* Inside this CONFIG_CACHE macro it will clear the dirty bit after consuming the settings
*
*
* Make sure to free up this cache object when the parent object is destroyed using
* CONFIG_CACHE_FREE()
*
@@ -63,6 +63,7 @@ extern "C" {
#endif
struct vpe_priv;
struct vpe_vector;
/* a common config cache structure to be included in the object that is for program hardware API
* layer
@@ -77,21 +78,21 @@ struct config_cache {
* as bypass mode is not heavy lifting programming.
*
* /param obj_cache an object that has the config cache member
* /param obj_cfg_array an object that contains the configs and num_configs member
* /param ctx an input/output context that contains the configs vector
* /param disable_cache a flag that controls a caching is needed
* /param is_bypass if it is in bypass, it doesn't cache the bypass config
* /param program_func_call the program call that generate config packet content
* /param inst index to address the config_cache array
*/
#define CONFIG_CACHE(obj_cache, obj_cfg_array, disable_cache, is_bypass, program_func_call, inst) \
#define CONFIG_CACHE(obj_cache, ctx, disable_cache, is_bypass, program_func_call, inst) \
{ \
bool use_cache = false; \
\
/* make sure it opens a new config packet */ \
config_writer_force_new_with_type(config_writer, CONFIG_TYPE_DIRECT); \
\
if ((obj_cache) && !disable_cache && (obj_cache)->config_cache[inst].p_buffer && \
(obj_cache)->config_cache[inst].cached && !((obj_cache)->dirty[inst]) && !is_bypass) { \
/* make sure it opens a new config packet */ \
config_writer_force_new_with_type(config_writer, CONFIG_TYPE_DIRECT); \
\
/* reuse the cache */ \
if (config_writer->buf->size >= (obj_cache)->config_cache[inst].size) { \
memcpy((void *)(uintptr_t)config_writer->base_cpu_va, \
@@ -109,7 +110,13 @@ struct config_cache {
\
if (!use_cache) { \
uint64_t start, end; \
uint16_t config_num = (uint16_t)(obj_cfg_array)->num_configs[inst]; \
uint16_t num_config = (uint16_t)(ctx)->configs[inst]->num_elements; \
\
if (!is_bypass) { \
/* make sure it opens a new config packet so we can cache a complete new config */ \
/* for bypass we don't do caching, so no need to open a new desc */ \
config_writer_force_new_with_type(config_writer, CONFIG_TYPE_DIRECT); \
} \
\
start = config_writer->base_cpu_va; \
program_func_call; \
@@ -117,7 +124,7 @@ struct config_cache {
\
if (!disable_cache && !is_bypass) { \
/* only cache when it is not crossing config packets */ \
if (config_num == (obj_cfg_array)->num_configs[inst]) { \
if (num_config == (ctx)->configs[inst]->num_elements) { \
if ((obj_cache)->dirty[inst]) { \
uint64_t size = end - start; \
\

View File

@@ -42,7 +42,8 @@ struct vpe_priv;
struct vpe_cmd_info;
struct segment_ctx;
#define MIN_VPE_CMD 1024
#define MIN_VPE_CMD (1024)
#define MIN_NUM_CONFIG (16)
enum vpe_cmd_ops;
@@ -132,9 +133,6 @@ struct stream_ctx *vpe_alloc_stream_ctx(struct vpe_priv *vpe_priv, uint32_t num_
void vpe_free_stream_ctx(struct vpe_priv *vpe_priv);
/** output ctx */
void vpe_free_output_ctx(struct vpe_priv *vpe_priv);
/** pipe resource management */
void vpe_pipe_reset(struct vpe_priv *vpe_priv);

View File

@@ -56,8 +56,6 @@ extern "C" {
#define MAX_LINE_SIZE 1024 // without 16 pixels for the seams
#define MAX_LINE_CNT 4
#define MAX_NUM_SAVED_CONFIG 16
enum vpe_cmd_ops {
VPE_CMD_OPS_BLENDING,
VPE_CMD_OPS_BG,
@@ -130,13 +128,9 @@ struct stream_ctx {
uint16_t num_segments;
struct segment_ctx *segment_ctx;
uint16_t num_configs[MAX_INPUT_PIPE]; // shared among same stream
uint16_t num_stream_op_configs[MAX_INPUT_PIPE][VPE_CMD_TYPE_COUNT];
// shared among same cmd type, within the same stream
struct config_record configs[MAX_INPUT_PIPE][MAX_NUM_SAVED_CONFIG];
struct config_record stream_op_configs[MAX_INPUT_PIPE][VPE_CMD_TYPE_COUNT]
[MAX_NUM_SAVED_CONFIG];
// share configs that can be re-used once generated
struct vpe_vector *configs[MAX_INPUT_PIPE];
struct vpe_vector *stream_op_configs[MAX_INPUT_PIPE][VPE_CMD_TYPE_COUNT];
// cached color properties
bool per_pixel_alpha;
@@ -182,8 +176,8 @@ struct output_ctx {
enum color_transfer_func tf;
enum color_space cs;
uint32_t num_configs[MAX_OUTPUT_PIPE];
struct config_record configs[MAX_OUTPUT_PIPE][MAX_NUM_SAVED_CONFIG];
// store generated per-pipe configs that can be reused
struct vpe_vector *configs[MAX_OUTPUT_PIPE];
union {
struct {

View File

@@ -164,42 +164,75 @@ struct segment_ctx *vpe_alloc_segment_ctx(struct vpe_priv *vpe_priv, uint16_t nu
return segment_ctx_base;
}
struct stream_ctx *vpe_alloc_stream_ctx(struct vpe_priv *vpe_priv, uint32_t num_streams)
static enum vpe_status create_input_config_vector(struct stream_ctx *stream_ctx)
{
struct stream_ctx *ctx_base, *ctx;
uint32_t i;
enum vpe_status res = VPE_STATUS_OK;
uint32_t pipe_idx, type_idx;
struct vpe_priv *vpe_priv;
ctx_base = (struct stream_ctx *)vpe_zalloc(sizeof(struct stream_ctx) * num_streams);
if (!ctx_base)
return NULL;
vpe_priv = stream_ctx->vpe_priv;
for (i = 0; i < num_streams; i++) {
ctx = &ctx_base[i];
ctx->cs = COLOR_SPACE_UNKNOWN;
ctx->tf = TRANSFER_FUNC_UNKNOWN;
ctx->vpe_priv = vpe_priv;
vpe_color_set_adjustments_to_default(&ctx->color_adjustments);
ctx->tf_scaling_factor = vpe_fixpt_one;
ctx->stream.flags.geometric_scaling = 0;
ctx->stream.tm_params.UID = 0;
ctx->uid_3dlut = 0;
for (pipe_idx = 0; pipe_idx < vpe_priv->pub.caps->resource_caps.num_dpp; pipe_idx++) {
stream_ctx->configs[pipe_idx] =
vpe_vector_create(vpe_priv, sizeof(struct config_record), MIN_NUM_CONFIG);
if (!stream_ctx->configs[pipe_idx]) {
res = VPE_STATUS_NO_MEMORY;
break;
}
for (type_idx = 0; type_idx < VPE_CMD_TYPE_COUNT; type_idx++) {
stream_ctx->stream_op_configs[pipe_idx][type_idx] =
vpe_vector_create(vpe_priv, sizeof(struct config_record), MIN_NUM_CONFIG);
if (!stream_ctx->stream_op_configs[pipe_idx][type_idx]) {
res = VPE_STATUS_NO_MEMORY;
break;
}
}
if (res != VPE_STATUS_OK)
break;
}
return ctx_base;
return res;
}
void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
static void destroy_input_config_vector(struct stream_ctx *stream_ctx)
{
uint16_t i;
struct stream_ctx *ctx;
uint32_t pipe_idx, type_idx;
struct vpe_priv *vpe_priv;
if (!vpe_priv->stream_ctx || !vpe_priv->num_streams)
vpe_priv = stream_ctx->vpe_priv;
for (pipe_idx = 0; pipe_idx < vpe_priv->pub.caps->resource_caps.num_dpp; pipe_idx++) {
if (stream_ctx->configs[pipe_idx]) {
vpe_vector_free(stream_ctx->configs[pipe_idx]);
stream_ctx->configs[pipe_idx] = NULL;
}
for (type_idx = 0; type_idx < VPE_CMD_TYPE_COUNT; type_idx++) {
if (stream_ctx->stream_op_configs[pipe_idx][type_idx]) {
vpe_vector_free(stream_ctx->stream_op_configs[pipe_idx][type_idx]);
stream_ctx->stream_op_configs[pipe_idx][type_idx] = NULL;
}
}
}
}
static void free_stream_ctx(uint32_t num_streams, struct stream_ctx *stream_ctx)
{
struct vpe_priv *vpe_priv;
uint32_t stream_idx;
if (!stream_ctx || !num_streams)
return;
for (i = 0; i < vpe_priv->num_streams; i++) {
ctx = &vpe_priv->stream_ctx[i];
vpe_priv = stream_ctx[0].vpe_priv;
for (stream_idx = 0; stream_idx < num_streams; stream_idx++) {
struct stream_ctx *ctx = &stream_ctx[stream_idx];
if (ctx->input_tf) {
for (int j = 0; j < MAX_INPUT_PIPE; j++)
for (uint32_t j = 0; j < MAX_INPUT_PIPE; j++)
CONFIG_CACHE_FREE(ctx->input_tf->config_cache[j]);
vpe_free(ctx->input_tf);
ctx->input_tf = NULL;
@@ -221,21 +254,21 @@ void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
}
if (ctx->in_shaper_func) {
for (int j = 0; j < MAX_INPUT_PIPE; j++)
for (uint32_t j = 0; j < MAX_INPUT_PIPE; j++)
CONFIG_CACHE_FREE(ctx->in_shaper_func->config_cache[j]);
vpe_free(ctx->in_shaper_func);
ctx->in_shaper_func = NULL;
}
if (ctx->blend_tf) {
for (int j = 0; j < MAX_INPUT_PIPE; j++)
for (uint32_t j = 0; j < MAX_INPUT_PIPE; j++)
CONFIG_CACHE_FREE(ctx->blend_tf->config_cache[j]);
vpe_free(ctx->blend_tf);
ctx->blend_tf = NULL;
}
if (ctx->lut3d_func) {
for (int j = 0; j < MAX_3DLUT; j++)
for (uint32_t j = 0; j < MAX_3DLUT; j++)
CONFIG_CACHE_FREE(ctx->lut3d_func->config_cache[j]);
vpe_free(ctx->lut3d_func);
ctx->lut3d_func = NULL;
@@ -245,20 +278,53 @@ void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
vpe_free(ctx->segment_ctx);
ctx->segment_ctx = NULL;
}
destroy_input_config_vector(ctx);
}
vpe_free(vpe_priv->stream_ctx);
vpe_priv->stream_ctx = NULL;
vpe_priv->num_streams = 0;
vpe_priv->num_virtual_streams = 0;
}
void vpe_free_output_ctx(struct vpe_priv *vpe_priv)
struct stream_ctx *vpe_alloc_stream_ctx(struct vpe_priv *vpe_priv, uint32_t num_streams)
{
if (vpe_priv->output_ctx.gamut_remap)
vpe_free(vpe_priv->output_ctx.gamut_remap);
struct stream_ctx *ctx_base, *ctx;
uint32_t stream_idx;
enum vpe_status res = VPE_STATUS_OK;
if (vpe_priv->output_ctx.output_tf)
vpe_free(vpe_priv->output_ctx.output_tf);
ctx_base = (struct stream_ctx *)vpe_zalloc(sizeof(struct stream_ctx) * num_streams);
if (!ctx_base)
return NULL;
for (stream_idx = 0; stream_idx < num_streams; stream_idx++) {
ctx = &ctx_base[stream_idx];
ctx->cs = COLOR_SPACE_UNKNOWN;
ctx->tf = TRANSFER_FUNC_UNKNOWN;
ctx->vpe_priv = vpe_priv;
vpe_color_set_adjustments_to_default(&ctx->color_adjustments);
ctx->tf_scaling_factor = vpe_fixpt_one;
ctx->stream.flags.geometric_scaling = 0;
ctx->stream.tm_params.UID = 0;
ctx->uid_3dlut = 0;
if ((res = create_input_config_vector(ctx)) != VPE_STATUS_OK)
break;
}
if (res != VPE_STATUS_OK) {
free_stream_ctx(num_streams, ctx_base);
ctx_base = NULL;
}
return ctx_base;
}
/* Tear down all stream contexts owned by this vpe instance and reset the
 * stream bookkeeping. The counters and the stream_ctx pointer are always
 * cleared, even when there was nothing to free.
 */
void vpe_free_stream_ctx(struct vpe_priv *vpe_priv)
{
    struct stream_ctx *streams = vpe_priv->stream_ctx;

    if (streams != NULL && vpe_priv->num_streams != 0) {
        free_stream_ctx(vpe_priv->num_streams, streams);
        vpe_free(streams);
    }

    vpe_priv->stream_ctx          = NULL;
    vpe_priv->num_streams         = 0;
    vpe_priv->num_virtual_streams = 0;
}
void vpe_pipe_reset(struct vpe_priv *vpe_priv)
@@ -697,35 +763,24 @@ void vpe_resource_build_bit_depth_reduction_params(
void vpe_frontend_config_callback(
void *ctx, uint64_t cfg_base_gpu, uint64_t cfg_base_cpu, uint64_t size, uint32_t pipe_idx)
{
struct config_frontend_cb_ctx *cb_ctx = (struct config_frontend_cb_ctx*)ctx;
struct vpe_priv *vpe_priv = cb_ctx->vpe_priv;
struct stream_ctx *stream_ctx = &vpe_priv->stream_ctx[cb_ctx->stream_idx];
enum vpe_cmd_type cmd_type;
struct config_frontend_cb_ctx *cb_ctx = (struct config_frontend_cb_ctx *)ctx;
struct vpe_priv *vpe_priv = cb_ctx->vpe_priv;
struct stream_ctx *stream_ctx = &vpe_priv->stream_ctx[cb_ctx->stream_idx];
enum vpe_cmd_type cmd_type;
struct config_record record;
if (cb_ctx->stream_sharing) {
VPE_ASSERT(stream_ctx->num_configs[pipe_idx] <
(int)(sizeof(stream_ctx->configs[pipe_idx]) / sizeof(struct config_record)));
record.config_base_addr = cfg_base_gpu;
record.config_size = size;
stream_ctx->configs[pipe_idx][stream_ctx->num_configs[pipe_idx]].config_base_addr =
cfg_base_gpu;
stream_ctx->configs[pipe_idx][stream_ctx->num_configs[pipe_idx]].config_size = size;
stream_ctx->num_configs[pipe_idx]++;
vpe_vector_push(stream_ctx->configs[pipe_idx], &record);
} else if (cb_ctx->stream_op_sharing) {
cmd_type = cb_ctx->cmd_type;
VPE_ASSERT(stream_ctx->num_stream_op_configs[pipe_idx][cmd_type] <
(int)(sizeof(stream_ctx->stream_op_configs[pipe_idx][cmd_type]) /
sizeof(struct config_record)));
record.config_base_addr = cfg_base_gpu;
record.config_size = size;
stream_ctx
->stream_op_configs[pipe_idx][cmd_type]
[stream_ctx->num_stream_op_configs[pipe_idx][cmd_type]]
.config_base_addr = cfg_base_gpu;
stream_ctx
->stream_op_configs[pipe_idx][cmd_type]
[stream_ctx->num_stream_op_configs[pipe_idx][cmd_type]]
.config_size = size;
stream_ctx->num_stream_op_configs[pipe_idx][cmd_type]++;
vpe_vector_push(stream_ctx->stream_op_configs[pipe_idx][cmd_type], &record);
}
vpe_priv->vpe_desc_writer.add_config_desc(
@@ -735,18 +790,16 @@ void vpe_frontend_config_callback(
void vpe_backend_config_callback(
void *ctx, uint64_t cfg_base_gpu, uint64_t cfg_base_cpu, uint64_t size, uint32_t pipe_idx)
{
struct config_backend_cb_ctx *cb_ctx = (struct config_backend_cb_ctx*)ctx;
struct vpe_priv *vpe_priv = cb_ctx->vpe_priv;
struct output_ctx *output_ctx = &vpe_priv->output_ctx;
struct config_backend_cb_ctx *cb_ctx = (struct config_backend_cb_ctx *)ctx;
struct vpe_priv *vpe_priv = cb_ctx->vpe_priv;
struct output_ctx *output_ctx = &vpe_priv->output_ctx;
struct config_record record;
if (cb_ctx->share) {
VPE_ASSERT(output_ctx->num_configs[pipe_idx] <
(sizeof(output_ctx->configs[pipe_idx]) / sizeof(struct config_record)));
record.config_base_addr = cfg_base_gpu;
record.config_size = size;
output_ctx->configs[pipe_idx][output_ctx->num_configs[pipe_idx]].config_base_addr =
cfg_base_gpu;
output_ctx->configs[pipe_idx][output_ctx->num_configs[pipe_idx]].config_size = size;
output_ctx->num_configs[pipe_idx]++;
vpe_vector_push(output_ctx->configs[pipe_idx], &record);
}
vpe_priv->vpe_desc_writer.add_config_desc(

View File

@@ -128,7 +128,6 @@ static void override_debug_option(
debug->disable_lut_caching = user_debug->disable_lut_caching;
}
#ifdef VPE_BUILD_1_1
static void verify_collaboration_mode(struct vpe_priv *vpe_priv)
{
if (vpe_priv->pub.level == VPE_IP_LEVEL_1_1) {
@@ -142,7 +141,44 @@ static void verify_collaboration_mode(struct vpe_priv *vpe_priv)
vpe_priv->collaboration_mode = false;
}
}
#endif
static enum vpe_status create_output_config_vector(struct vpe_priv *vpe_priv)
{
uint32_t i;
// output config vector stores all share-able configs that can be re-used later
for (i = 0; i < vpe_priv->pub.caps->resource_caps.num_cdc_be; i++) {
vpe_priv->output_ctx.configs[i] =
vpe_vector_create(vpe_priv, sizeof(struct config_record), MIN_NUM_CONFIG);
if (!vpe_priv->output_ctx.configs[i]) {
return VPE_STATUS_NO_MEMORY;
}
}
return VPE_STATUS_OK;
}
static void destroy_output_config_vector(struct vpe_priv *vpe_priv)
{
uint32_t i;
for (i = 0; i < vpe_priv->pub.caps->resource_caps.num_cdc_be; i++) {
if (vpe_priv->output_ctx.configs[i]) {
vpe_vector_free(vpe_priv->output_ctx.configs[i]);
vpe_priv->output_ctx.configs[i] = NULL;
}
}
}
static void free_output_ctx(struct vpe_priv *vpe_priv)
{
if (vpe_priv->output_ctx.gamut_remap)
vpe_free(vpe_priv->output_ctx.gamut_remap);
if (vpe_priv->output_ctx.output_tf)
vpe_free(vpe_priv->output_ctx.output_tf);
destroy_output_config_vector(vpe_priv);
}
struct vpe *vpe_create(const struct vpe_init_data *params)
{
@@ -178,6 +214,14 @@ struct vpe *vpe_create(const struct vpe_init_data *params)
vpe_free(vpe_priv);
return NULL;
}
status = create_output_config_vector(vpe_priv);
if (status != VPE_STATUS_OK) {
destroy_output_config_vector(vpe_priv);
vpe_free(vpe_priv);
return NULL;
}
override_debug_option(&vpe_priv->init.debug, &params->debug);
vpe_color_setup_x_points_distribution();
@@ -204,12 +248,12 @@ void vpe_destroy(struct vpe **vpe)
vpe_destroy_resource(vpe_priv, &vpe_priv->resource);
vpe_free_output_ctx(vpe_priv);
free_output_ctx(vpe_priv);
vpe_free_stream_ctx(vpe_priv);
if (vpe_priv->vpe_cmd_vector)
vpe_vector_free(vpe_priv, vpe_priv->vpe_cmd_vector);
vpe_vector_free(vpe_priv->vpe_cmd_vector);
if (vpe_priv->dummy_input_param)
vpe_free(vpe_priv->dummy_input_param);
@@ -629,13 +673,15 @@ enum vpe_status vpe_build_commands(
struct vpe_priv *vpe_priv;
struct cmd_builder *builder;
enum vpe_status status = VPE_STATUS_OK;
uint32_t cmd_idx, i, pipe_idx, stream_idx, cmd_type_idx;
uint32_t cmd_idx, pipe_idx, stream_idx, cmd_type_idx;
struct vpe_build_bufs curr_bufs;
int64_t cmd_buf_size;
int64_t emb_buf_size;
uint64_t cmd_buf_gpu_a, cmd_buf_cpu_a;
uint64_t emb_buf_gpu_a, emb_buf_cpu_a;
struct vpe_vector *config_vector;
struct vpe_cmd_info *cmd_info;
if (!vpe || !param || !bufs)
return VPE_STATUS_ERROR;
@@ -686,15 +732,26 @@ enum vpe_status vpe_build_commands(
// copy the param, reset saved configs
for (stream_idx = 0; stream_idx < vpe_priv->num_streams; stream_idx++) {
struct stream_ctx *stream_ctx = &vpe_priv->stream_ctx[stream_idx];
for (pipe_idx = 0; pipe_idx < MAX_INPUT_PIPE; pipe_idx++) {
vpe_priv->stream_ctx[stream_idx].num_configs[pipe_idx] = 0;
for (cmd_type_idx = 0; cmd_type_idx < VPE_CMD_TYPE_COUNT; cmd_type_idx++)
vpe_priv->stream_ctx[stream_idx].num_stream_op_configs[pipe_idx][cmd_type_idx] = 0;
config_vector = stream_ctx->configs[pipe_idx];
if (config_vector)
vpe_vector_clear(config_vector);
for (cmd_type_idx = 0; cmd_type_idx < VPE_CMD_TYPE_COUNT; cmd_type_idx++) {
config_vector = stream_ctx->stream_op_configs[pipe_idx][cmd_type_idx];
if (config_vector)
vpe_vector_clear(config_vector);
}
}
}
for (i = 0; i < MAX_OUTPUT_PIPE; i++)
vpe_priv->output_ctx.num_configs[i] = 0;
for (pipe_idx = 0; pipe_idx < vpe_priv->pub.caps->resource_caps.num_cdc_be; pipe_idx++) {
config_vector = vpe_priv->output_ctx.configs[pipe_idx];
if (config_vector)
vpe_vector_clear(config_vector);
}
// Reset pipes
vpe_pipe_reset(vpe_priv);

View File

@@ -31,7 +31,11 @@
extern "C" {
#endif
struct vpe_priv;
struct vpe_vector {
struct vpe_priv *vpe_priv; /*< store the vpe_priv for alloc/free memory */
void *element; /*< the internal vector memory storage */
size_t num_elements; /*< number of stored elements */
size_t capacity;
@@ -57,11 +61,10 @@ void *vpe_vector_get(struct vpe_vector *vector, size_t idx);
/**
* Push the element to end of the vector.
* @param[in] vpe_priv vpe instance created by vpe_create()
* @param[in] vector vector that we want to push to the end.
* @param[in] p_element pointer of the element
*/
void vpe_vector_push(struct vpe_priv *vpe_priv, struct vpe_vector *vector, void *p_element);
void vpe_vector_push(struct vpe_vector *vector, void *p_element);
/**
* Clear the vector.
@@ -71,10 +74,9 @@ void vpe_vector_clear(struct vpe_vector *vector);
/**
* Free the vector.
* @param[in] vpe_priv vpe instance created by vpe_create()
* @param[in] vector vector that we want to free.
*/
void vpe_vector_free(struct vpe_priv *vpe_priv, struct vpe_vector *vpe_vector);
void vpe_vector_free(struct vpe_vector *vpe_vector);
#ifdef __cplusplus
}

View File

@@ -39,6 +39,7 @@ struct vpe_vector *vpe_vector_create(
return NULL;
}
vector->vpe_priv = vpe_priv;
vector->num_elements = 0;
vector->capacity = initial_capacity;
vector->element_size = element_size;
@@ -46,9 +47,10 @@ struct vpe_vector *vpe_vector_create(
return vector;
}
static struct vpe_vector *vector_realloc(
struct vpe_priv *vpe_priv, struct vpe_vector *vector, size_t new_size)
static struct vpe_vector *vector_realloc(struct vpe_vector *vector, size_t new_size)
{
struct vpe_priv *vpe_priv = vector->vpe_priv;
void *new_element = vpe_zalloc(new_size);
if (!new_element)
return NULL;
@@ -70,14 +72,14 @@ void *vpe_vector_get(struct vpe_vector *vector, size_t idx)
return (void *)((char *)(vector->element) + (idx * vector->element_size));
}
void vpe_vector_push(struct vpe_priv *vpe_priv, struct vpe_vector *vector, void *p_element)
void vpe_vector_push(struct vpe_vector *vector, void *p_element)
{
if (!p_element || !vector)
return;
if (vector->num_elements >= vector->capacity) {
vector->capacity *= 2;
vector = vector_realloc(vpe_priv, vector, vector->capacity * vector->element_size);
vector = vector_realloc(vector, vector->capacity * vector->element_size);
}
if (!vector)
@@ -97,10 +99,11 @@ void vpe_vector_clear(struct vpe_vector *vector)
memset(vector->element, 0, vector->capacity * vector->element_size);
}
void vpe_vector_free(struct vpe_priv *vpe_priv, struct vpe_vector *vector)
void vpe_vector_free(struct vpe_vector *vector)
{
struct vpe_priv *vpe_priv = vector->vpe_priv;
vpe_free(vector->element);
vector->element = NULL;
vpe_free(vector);
vector = NULL;
}