teflon: Initial commit

Teflon is a Gallium frontend that TensorFlow Lite can load to delegate
the execution of operations in a neural network model.

See docs/teflon.rst for more details.

Acked-by: Christian Gmeiner <cgmeiner@igalia.com>
Acked-by: Dave Airlie <airlied@redhat.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25714>
Tomeu Vizoso
2023-09-06 18:26:37 +02:00
committed by Marge Bot
parent 9290410870
commit af199e0ff0
184 changed files with 3756 additions and 2 deletions

View File

@@ -2,6 +2,7 @@
# enforcement in the CI.
src/gallium/drivers/i915
src/gallium/targets/teflon/**/*
src/amd/vulkan/**/*
src/amd/compiler/**/*
src/egl/**/*

View File

@@ -267,6 +267,7 @@ fedora-release:
-D vulkan-layers=device-select,overlay
-D intel-clc=enabled
-D imagination-srv=true
-D teflon=true
GALLIUM_DRIVERS: "crocus,etnaviv,freedreno,i915,iris,kmsro,lima,nouveau,panfrost,r300,r600,radeonsi,svga,swrast,tegra,v3d,vc4,virgl,zink"
GALLIUM_ST: >
-D dri3=enabled
@@ -416,6 +417,7 @@ debian-arm64:
-D imagination-srv=true
-D perfetto=true
-D freedreno-kmds=msm,virtio
-D teflon=true
S3_ARTIFACT_NAME: mesa-arm64-default-${BUILDTYPE}
script:
- .gitlab-ci/meson/build.sh
@@ -507,6 +509,7 @@ debian-clang:
-D build-aco-tests=true
-D intel-clc=enabled
-D imagination-srv=true
-D teflon=true
CC: clang-${LLVM_VERSION}
CXX: clang++-${LLVM_VERSION}

View File

@@ -27,6 +27,7 @@ DEPS=(
cmake
curl
fastboot
flatbuffers-compiler
flex
g++
git
@@ -36,6 +37,7 @@ DEPS=(
libdrm-dev
libelf-dev
libexpat1-dev
libflatbuffers-dev
libvulkan-dev
libx11-dev
libx11-xcb-dev
@@ -50,6 +52,7 @@ DEPS=(
libxext-dev
libxrandr-dev
libxshmfence-dev
libxtensor-dev
libxxf86vm-dev
libwayland-dev
libwayland-egl-backend-dev

View File

@@ -29,6 +29,7 @@ DEPS=(
dpkg-cross
findutils
flex
flatbuffers-compiler
g++
cmake
gcc
@@ -41,6 +42,7 @@ DEPS=(
libelf-dev
libepoxy-dev
libexpat1-dev
libflatbuffers-dev
libgtk-3-dev
"libllvm${LLVM_VERSION}"
libomxil-bellagio-dev
@@ -56,6 +58,7 @@ DEPS=(
libxrandr-dev
libxrender-dev
libxshmfence-dev
libxtensor-dev
libxxf86vm-dev
libwayland-egl-backend-dev
make

View File

@@ -30,6 +30,7 @@ DEPS=(
ccache
clang-devel
flex
flatbuffers-compiler
gcc
gcc-c++
gettext
@@ -41,6 +42,7 @@ DEPS=(
"pkgconfig(SPIRV-Tools)"
"pkgconfig(dri2proto)"
"pkgconfig(expat)"
"pkgconfig(flatbuffers)"
"pkgconfig(glproto)"
"pkgconfig(libclc)"
"pkgconfig(libelf)"
@@ -66,6 +68,7 @@ DEPS=(
"pkgconfig(xfixes)"
"pkgconfig(xrandr)"
"pkgconfig(xshmfence)"
"pkgconfig(xtensor)"
"pkgconfig(xxf86vm)"
"pkgconfig(zlib)"
procps-ng

View File

@@ -7,7 +7,7 @@
variables:
DEBIAN_X86_64_BUILD_BASE_IMAGE: "debian/x86_64_build-base"
DEBIAN_BASE_TAG: "2024-01-14-runner"
DEBIAN_BASE_TAG: "2024-01-23-teflon-3"
DEBIAN_X86_64_BUILD_IMAGE_PATH: "debian/x86_64_build"
DEBIAN_BUILD_TAG: "2024-01-04-find"
@@ -24,7 +24,7 @@ variables:
ALPINE_X86_64_BUILD_TAG: "2023-01-07-libdrm2_4_119"
ALPINE_X86_64_LAVA_SSH_TAG: "2023-06-26-first-version"
FEDORA_X86_64_BUILD_TAG: "2024-01-06-libdrm"
FEDORA_X86_64_BUILD_TAG: "2024-01-23-teflon-3"
KERNEL_ROOTFS_TAG: "2024-01-19-zlib"
KERNEL_TAG: "v6.6.12-for-mesa-ci-5a92d0709d0b"
KERNEL_REPO: "gfx-ci/linux"

View File

@@ -254,6 +254,7 @@
- src/amd/vulkan/**/*
- src/amd/compiler/**/*
- src/etnaviv/isa/**/*
- src/gallium/targets/teflon/**/*
when: on_success
allow_failure: false
# in other pipelines, formatting checks are allowed to fail

View File

@@ -76,6 +76,7 @@ Linux, FreeBSD, and other operating systems.
gallium-nine
viewperf
xlibdriver
teflon
.. toctree::
:maxdepth: 1

docs/teflon.rst Normal file
View File

@@ -0,0 +1,140 @@
TensorFlow Lite delegate
========================
Mesa contains a TensorFlow Lite delegate that can make use of NPUs to accelerate ML inference. It is implemented in the form of an *external delegate*, a shared library that the TensorFlow Lite runtime can load at startup. See https://www.tensorflow.org/api_docs/python/tf/lite/experimental/load_delegate.
.. list-table:: Supported acceleration hardware
:header-rows: 1
* - Gallium driver
- NPU supported
- Hardware tested
* - Etnaviv
- ``VeriSilicon VIPNano-QI.7120``
- ``Amlogic A311D on Libre Computer AML-A311D-CC Alta and Khadas VIM3``
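Because the library exposes the standard external-delegate entry points (``tflite_plugin_create_delegate`` and ``tflite_plugin_destroy_delegate``), it can also be wired up directly through the TensorFlow Lite C API. The sketch below is illustrative only: error handling is omitted, the helper name is made up, and ``libteflon.so`` is assumed to be on the library search path.
.. code-block:: c

   #include <dlfcn.h>
   #include "tensorflow/lite/c/c_api.h"
   #include "tensorflow/lite/c/common.h"

   /* Entry points exported by libteflon.so */
   typedef TfLiteDelegate *(*create_delegate_fn)(char **, char **, size_t,
                                                 void (*)(const char *));
   typedef void (*destroy_delegate_fn)(TfLiteDelegate *);

   static void
   run_with_teflon(const char *model_path)
   {
      void *lib = dlopen("libteflon.so", RTLD_LAZY | RTLD_LOCAL);
      create_delegate_fn create_delegate =
         (create_delegate_fn)dlsym(lib, "tflite_plugin_create_delegate");
      destroy_delegate_fn destroy_delegate =
         (destroy_delegate_fn)dlsym(lib, "tflite_plugin_destroy_delegate");

      TfLiteDelegate *delegate = create_delegate(NULL, NULL, 0, NULL);
      TfLiteModel *model = TfLiteModelCreateFromFile(model_path);
      TfLiteInterpreterOptions *options = TfLiteInterpreterOptionsCreate();
      TfLiteInterpreterOptionsAddDelegate(options, delegate);

      TfLiteInterpreter *interpreter = TfLiteInterpreterCreate(model, options);
      TfLiteInterpreterAllocateTensors(interpreter);
      /* ... fill input tensors, then ... */
      TfLiteInterpreterInvoke(interpreter);
      /* ... read output tensors ... */

      TfLiteInterpreterDelete(interpreter);
      TfLiteInterpreterOptionsDelete(options);
      TfLiteModelDelete(model);
      destroy_delegate(delegate);
      dlclose(lib);
   }

In practice most users will load the delegate from Python instead, as the inference example further below does by passing the path to ``libteflon.so`` with ``-e``.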
Build
-----
Build Mesa as usual, with the ``-Dteflon=true`` argument.
Example instructions:
.. code-block:: console
# Install build dependencies
~ # apt-get -y build-dep mesa
~ # apt-get -y install git cmake
# Download sources
~ $ git clone https://gitlab.freedesktop.org/mesa/mesa.git
# Build Mesa
~ $ cd mesa
mesa $ meson setup build -Dgallium-drivers=etnaviv -Dvulkan-drivers= -Dteflon=true
mesa $ meson compile -C build
Install runtime dependencies
----------------------------
Your board should be running a mainline 6.7 or newer kernel with the etnaviv driver loaded. You will also need to enable the NPU node in the device tree, either through an overlay or with a change such as the one below (rebuilding the DTB afterwards):
.. code-block:: diff
diff --git a/arch/arm64/boot/dts/amlogic/meson-g12b-a311d-khadas-vim3.dts b/arch/arm64/boot/dts/amlogic/meson-g12b-a311d-khadas-vim3.dts
index 4aa2b20bfbf2..4e8266056bca 100644
--- a/arch/arm64/boot/dts/amlogic/meson-g12b-a311d-khadas-vim3.dts
+++ b/arch/arm64/boot/dts/amlogic/meson-g12b-a311d-khadas-vim3.dts
@@ -50,6 +50,10 @@ galcore {
};
};
+&npu {
+ status = "okay";
+};
+
/*
* The VIM3 on-board MCU can mux the PCIe/USB3.0 shared differential
* lines using a FUSB340TMX USB 3.1 SuperSpeed Data Switch between
.. code-block:: console
# Install Python 3.10 and dependencies (as root)
~ # echo deb-src http://deb.debian.org/debian testing main >> /etc/apt/sources.list
~ # echo deb http://deb.debian.org/debian unstable main >> /etc/apt/sources.list
~ # echo 'APT::Default-Release "testing";' >> /etc/apt/apt.conf
~ # apt-get update
~ # apt-get -y install python3.10 python3-pytest python3-exceptiongroup
# Install TensorFlow Lite Python package (as non-root)
~ $ python3.10 -m pip install --break-system-packages tflite-runtime==2.13.0
Do some inference with MobileNetV1
----------------------------------
.. code-block:: console
~ $ cd mesa/
mesa $ TEFLON_DEBUG=verbose ETNA_MESA_DEBUG=ml_dbgs python3.10 src/gallium/frontends/teflon/tests/classification.py -i ~/tensorflow/assets/grace_hopper.bmp -m src/gallium/targets/teflon/tests/mobilenet_v1_1.0_224_quant.tflite -l src/gallium/frontends/teflon/tests/labels_mobilenet_quant_v1_224.txt -e build/src/gallium/targets/teflon/libteflon.so
Loading external delegate from build/src/gallium/targets/teflon/libteflon.so with args: {}
Teflon delegate: loaded etnaviv driver
teflon: compiling graph: 89 tensors 28 operations
idx scale zp has_data size
=======================================
0 0.023528 0 no 1x1x1x1024
1 0.166099 42 no 1x1x1x1001
2 0.000117 0 yes 1001x0x0x0
3 0.004987 4a yes 1001x1x1x1024
4 0.166099 42 no 1x1001x0x0
5 0.166099 42 yes 2x0x0x0
6 0.000171 0 yes 32x0x0x0
7 0.023528 0 no 1x112x112x32
8 0.021827 97 yes 32x3x3x3
9 0.023528 0 no 1x14x14x512
...
idx type in out operation type-specific
================================================================================================
0 CONV 88 7 w: 8 b: 6 stride: 2 pad: SAME
1 DWCONV 7 33 w: 35 b: 34 stride: 1 pad: SAME
2 CONV 33 37 w: 38 b: 36 stride: 1 pad: SAME
3 DWCONV 37 39 w: 41 b: 40 stride: 2 pad: SAME
4 CONV 39 43 w: 44 b: 42 stride: 1 pad: SAME
5 DWCONV 43 45 w: 47 b: 46 stride: 1 pad: SAME
6 CONV 45 49 w: 50 b: 48 stride: 1 pad: SAME
7 DWCONV 49 51 w: 53 b: 52 stride: 2 pad: SAME
8 CONV 51 55 w: 56 b: 54 stride: 1 pad: SAME
9 DWCONV 55 57 w: 59 b: 58 stride: 1 pad: SAME
10 CONV 57 61 w: 62 b: 60 stride: 1 pad: SAME
11 DWCONV 61 63 w: 65 b: 64 stride: 2 pad: SAME
12 CONV 63 67 w: 68 b: 66 stride: 1 pad: SAME
13 DWCONV 67 69 w: 71 b: 70 stride: 1 pad: SAME
14 CONV 69 73 w: 74 b: 72 stride: 1 pad: SAME
15 DWCONV 73 75 w: 77 b: 76 stride: 1 pad: SAME
16 CONV 75 79 w: 80 b: 78 stride: 1 pad: SAME
17 DWCONV 79 81 w: 83 b: 82 stride: 1 pad: SAME
18 CONV 81 85 w: 86 b: 84 stride: 1 pad: SAME
19 DWCONV 85 9 w: 11 b: 10 stride: 1 pad: SAME
20 CONV 9 13 w: 14 b: 12 stride: 1 pad: SAME
21 DWCONV 13 15 w: 17 b: 16 stride: 1 pad: SAME
22 CONV 15 19 w: 20 b: 18 stride: 1 pad: SAME
23 DWCONV 19 21 w: 23 b: 22 stride: 2 pad: SAME
24 CONV 21 25 w: 26 b: 24 stride: 1 pad: SAME
25 DWCONV 25 27 w: 29 b: 28 stride: 1 pad: SAME
26 CONV 27 31 w: 32 b: 30 stride: 1 pad: SAME
27 POOL 31 0 filter: 0x0 stride: 0 pad: VALID
teflon: compiled graph, took 10307 ms
teflon: invoked graph, took 21 ms
teflon: invoked graph, took 17 ms
teflon: invoked graph, took 17 ms
teflon: invoked graph, took 17 ms
teflon: invoked graph, took 16 ms
0.866667: military uniform
0.031373: Windsor tie
0.015686: mortarboard
0.007843: bow tie
0.007843: academic

View File

@@ -2144,6 +2144,13 @@ if with_perfetto
pre_args += '-DHAVE_PERFETTO'
endif
with_teflon = get_option('teflon')
if with_teflon and with_tests
dep_xtensor = dependency('xtensor')
dep_flatbuffers = dependency('flatbuffers')
prog_flatc = find_program('flatc')
endif
with_gpuvis = get_option('gpuvis')
if with_gpuvis
pre_args += '-DHAVE_GPUVIS'
@@ -2362,3 +2369,6 @@ if with_perfetto and with_any_datasource
perfetto_summary += {'Data source': with_datasources}
endif
summary(perfetto_summary, section: 'Perfetto', bool_yn: true, list_sep: ' ')
teflon_summary = {'Enabled': with_teflon}
summary(teflon_summary, section: 'Teflon (TensorFlow Lite delegate)', bool_yn: true, list_sep: ' ')

View File

@@ -591,6 +591,13 @@ option(
'Default: [`auto`]'
)
option(
'teflon',
type : 'boolean',
value : false,
description : 'Enable TensorFlow Lite delegate. Default: false'
)
option(
'gpuvis',
type : 'boolean',

View File

@@ -0,0 +1,11 @@
libtfl_files = files(
'tfl_device.c')
libteflon_st = static_library(
'teflon_st',
[libtfl_files, sha1_h],
c_args : [ ],
gnu_symbol_visibility : 'hidden',
include_directories : [ inc_include, inc_src, inc_util, inc_gallium, inc_gallium_aux ],
dependencies : [ idep_mesautil ]
)

View File

@@ -0,0 +1,492 @@
/*
* Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include "pipe-loader/pipe_loader.h"
#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/core/c/builtin_op_data.h"
/* TODO: Move to TfLiteAsyncKernel for zero-copy of buffers */
enum teflon_debug_flags {
TEFLON_DEBUG_VERBOSE = 1 << 1,
};
static const struct debug_named_value teflon_debug_flags[] = {
{ "verbose", TEFLON_DEBUG_VERBOSE, "Verbose logging." },
DEBUG_NAMED_VALUE_END
};
DEBUG_GET_ONCE_FLAGS_OPTION(debug_teflon, "TEFLON_DEBUG", teflon_debug_flags, 0)
static inline void
teflon_debug(const char *format, ...)
{
if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
va_list ap;
va_start(ap, format);
_debug_vprintf(format, ap);
va_end(ap);
}
}
struct teflon_delegate
{
TfLiteDelegate base;
struct pipe_loader_device *dev;
struct pipe_context *context;
};
struct teflon_subgraph
{
struct pipe_ml_subgraph *base;
unsigned *input_tensors;
unsigned input_count;
unsigned *output_tensors;
unsigned output_count;
};
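/* Allocates a pipe_buffer big enough for the tensor (product of its
 * dimensions times the element size) and initializes it with the tensor's
 * data. */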
static struct pipe_resource *
create_resource(struct pipe_context *context, TfLiteTensor tensor)
{
unsigned bytes;
unsigned size = 1;
for (int i = 0; i < tensor.dims->size; i++)
size *= tensor.dims->data[i];
switch(tensor.type) {
case kTfLiteInt8:
case kTfLiteUInt8:
bytes = 1;
break;
case kTfLiteInt16:
case kTfLiteUInt16:
case kTfLiteFloat16:
bytes = 2;
break;
case kTfLiteInt32:
case kTfLiteUInt32:
case kTfLiteFloat32:
bytes = 4;
break;
case kTfLiteInt64:
case kTfLiteUInt64:
case kTfLiteFloat64:
case kTfLiteComplex64:
bytes = 8;
break;
default:
unreachable("Unsupported TF type");
}
return pipe_buffer_create_with_data(context, 0, PIPE_USAGE_DEFAULT, size * bytes, tensor.data.data);
}
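/* Translates one delegated TfLite node into a pipe_ml_operation, pointing it
 * at the pipe_tensors previously created for its inputs and outputs. */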
static void
fill_operation(struct teflon_delegate *delegate, TfLiteContext *tf_context, TfLiteNode *node, TfLiteRegistration *node_registration, struct pipe_ml_operation *operation, struct pipe_tensor *tensors)
{
TfLiteConvParams* params = (TfLiteConvParams*)node->builtin_data;
operation->input_tensor = &tensors[node->inputs->data[0]];
operation->output_tensor = &tensors[node->outputs->data[0]];
switch(node_registration->builtin_code) {
case kTfLiteBuiltinConv2d:
case kTfLiteBuiltinDepthwiseConv2d:
operation->type = PIPE_ML_OPERATION_TYPE_CONVOLUTION;
operation->conv.weight_tensor = &tensors[node->inputs->data[1]];
operation->conv.bias_tensor = &tensors[node->inputs->data[2]];
operation->conv.stride_x = params->stride_width;
operation->conv.stride_y = params->stride_height;
operation->conv.padding_same = params->padding == kTfLitePaddingSame;
operation->conv.depthwise = node_registration->builtin_code == kTfLiteBuiltinDepthwiseConv2d;
operation->conv.pointwise = operation->conv.weight_tensor->dims[1] == 1 && \
operation->conv.weight_tensor->dims[2] == 1;
break;
case kTfLiteBuiltinAveragePool2d:
operation->type = PIPE_ML_OPERATION_TYPE_POOLING;
break;
case kTfLiteBuiltinAdd:
operation->type = PIPE_ML_OPERATION_TYPE_ADD;
operation->add.input_tensor = &tensors[node->inputs->data[1]];
break;
default:
unreachable("Unsupported ML operation type");
}
}
static void
fill_tensor(struct teflon_delegate *delegate, TfLiteContext *tf_context, struct pipe_tensor *tensor, unsigned index)
{
struct pipe_context *context = delegate->context;
TfLiteTensor tf_tensor = tf_context->tensors[index];
const TfLiteAffineQuantization *quant = (const TfLiteAffineQuantization *)tf_tensor.quantization.params;
if (tf_tensor.type == kTfLiteNoType)
return; /* Placeholder tensor */
if (tf_tensor.data.data)
tensor->resource = create_resource(context, tf_tensor);
tensor->index = index;
memcpy(tensor->dims, tf_tensor.dims->data, tf_tensor.dims->size * sizeof(*tensor->dims));
tensor->scale = quant->scale->data[0];
tensor->zero_point = quant->zero_point->data[0];
switch(tf_tensor.type) {
case kTfLiteUInt8:
case kTfLiteUInt16:
case kTfLiteUInt32:
case kTfLiteUInt64:
tensor->is_signed = false;
break;
default:
tensor->is_signed = true;
}
}
static void
dump_graph(struct pipe_tensor *tensors, unsigned tensor_count, struct pipe_ml_operation *operations, unsigned operation_count)
{
teflon_debug("\n");
teflon_debug("teflon: compiling graph: %d tensors %d operations\n",
tensor_count, operation_count);
teflon_debug("%3s %-8s %3s %s %-12s\n", "idx", "scale", "zp", "has_data", "size");
teflon_debug("=======================================\n");
for (int i = 0; i < tensor_count; i++) {
teflon_debug("%3d %6f %3x %-8s %dx%dx%dx%d\n",
tensors[i].index,
tensors[i].scale,
tensors[i].zero_point,
tensors[i].resource == NULL ? "no" : "yes",
tensors[i].dims[0], tensors[i].dims[1], tensors[i].dims[2], tensors[i].dims[3]);
}
teflon_debug("\n");
teflon_debug("%3s %-6s %3s %3s %s\n", "idx", "type", "in", "out", "operation type-specific");
teflon_debug("================================================================================================\n");
for (int i = 0; i < operation_count; i++) {
switch(operations[i].type) {
case PIPE_ML_OPERATION_TYPE_ADD:
teflon_debug("%3d %-6s %3d %3d in: %d",
i,
"ADD",
operations[i].input_tensor->index,
operations[i].output_tensor->index,
operations[i].add.input_tensor->index);
break;
case PIPE_ML_OPERATION_TYPE_CONVOLUTION:
teflon_debug("%3d %-6s %3d %3d w: %d b: %d stride: %d pad: %s",
i,
operations[i].conv.depthwise ? "DWCONV" : "CONV",
operations[i].input_tensor->index,
operations[i].output_tensor->index,
operations[i].conv.weight_tensor->index,
operations[i].conv.bias_tensor->index,
operations[i].conv.stride_x,
operations[i].conv.padding_same ? "SAME" : "VALID");
break;
case PIPE_ML_OPERATION_TYPE_POOLING:
teflon_debug("%3d %-6s %3d %3d filter: %dx%d stride: %d pad: %s",
i,
"POOL",
operations[i].input_tensor->index,
operations[i].output_tensor->index,
operations[i].pooling.filter_height,
operations[i].pooling.filter_width,
operations[i].pooling.stride_x,
operations[i].pooling.padding_same ? "SAME" : "VALID");
break;
}
teflon_debug("\n");
}
teflon_debug("\n");
}
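/* Called when TensorFlow Lite initializes a delegated partition: translates
 * every tensor and node in the partition into pipe_tensors and
 * pipe_ml_operations, compiles them into a pipe_ml_subgraph and records which
 * tensors are the partition's inputs and outputs. */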
static void *
partition_init(TfLiteContext *tf_context, const char *buffer, size_t length)
{
const TfLiteDelegateParams *params = (const TfLiteDelegateParams *)buffer;
struct teflon_delegate *delegate = (struct teflon_delegate *)params->delegate;
struct pipe_context *context = delegate->context;
struct pipe_ml_operation operations[params->nodes_to_replace->size];
struct pipe_tensor tensors[tf_context->tensors_size];
long start = 0, end = 0;
memset(operations, 0, sizeof(operations));
memset(tensors, 0, sizeof(tensors));
if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
}
for (int i = 0; i < tf_context->tensors_size; i++)
fill_tensor(delegate, tf_context, &tensors[i], i);
for (int i = 0; i < params->nodes_to_replace->size; i++)
{
const int node_index = params->nodes_to_replace->data[i];
TfLiteNode *delegated_node = NULL;
TfLiteRegistration *delegated_node_registration = NULL;
tf_context->GetNodeAndRegistration(tf_context, node_index, &delegated_node,
&delegated_node_registration);
fill_operation(delegate, tf_context, delegated_node, delegated_node_registration, &operations[i], tensors);
}
if (debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)
dump_graph(tensors, tf_context->tensors_size, operations, params->nodes_to_replace->size);
struct pipe_ml_subgraph *subgraph;
subgraph = context->ml_subgraph_create(context,
operations,
params->nodes_to_replace->size);
for (int i = 0; i < tf_context->tensors_size; i++)
pipe_resource_reference(&tensors[i].resource, NULL);
struct teflon_subgraph *tsubgraph = calloc(1, sizeof(*tsubgraph));
tsubgraph->base = subgraph;
tsubgraph->input_tensors = malloc(params->input_tensors->size * sizeof(*tsubgraph->input_tensors));
for (int i = 0; i < params->input_tensors->size; i++) {
unsigned tensor_idx = params->input_tensors->data[i];
TfLiteTensor *tensor = &tf_context->tensors[tensor_idx];
if (tensor->allocation_type == kTfLiteMmapRo)
continue;
tsubgraph->input_tensors[tsubgraph->input_count] = tensor_idx;
tsubgraph->input_count++;
}
tsubgraph->output_count = params->output_tensors->size;
tsubgraph->output_tensors = malloc(params->output_tensors->size * sizeof(*tsubgraph->output_tensors));
memcpy(tsubgraph->output_tensors, params->output_tensors->data,
params->output_tensors->size * sizeof(*tsubgraph->output_tensors));
if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
teflon_debug("teflon: compiled graph, took %ld ms\n", (end - start));
}
return tsubgraph;
}
static TfLiteStatus
partition_prepare(TfLiteContext *context, TfLiteNode *node)
{
// TODO: If input size has changed, resize input, intermediate and output buffers
return kTfLiteOk;
}
// De-allocates the per-node-and-Interpreter custom data.
static void
partition_free(TfLiteContext *tf_context, void *buffer)
{
struct teflon_subgraph *tsubgraph = (struct teflon_subgraph *)buffer;
struct pipe_ml_subgraph *subgraph = tsubgraph->base;
struct pipe_context *context = subgraph->context;
context->ml_subgraph_destroy(context, subgraph);
free(tsubgraph->input_tensors);
free(tsubgraph->output_tensors);
free(tsubgraph);
}
static TfLiteStatus
partition_invoke(TfLiteContext *tf_context, TfLiteNode *node)
{
struct teflon_delegate *delegate = (struct teflon_delegate *)node->delegate;
struct teflon_subgraph *tsubgraph = (struct teflon_subgraph *)node->user_data;
struct pipe_ml_subgraph *subgraph = tsubgraph->base;
struct pipe_context *context = delegate->context;
long start = 0, end = 0;
if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
start = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
}
struct pipe_tensor input = {0};
/* FIXME: Support multiple inputs */
fill_tensor(delegate, tf_context, &input, tsubgraph->input_tensors[0]);
context->ml_subgraph_invoke(context, subgraph, &input);
void **buffers = malloc(tsubgraph->output_count * sizeof(*buffers));
for (unsigned i = 0; i < tsubgraph->output_count; i++)
buffers[i] = tf_context->tensors[tsubgraph->output_tensors[i]].data.data;
context->ml_subgraph_read_output(context, subgraph, tsubgraph->output_count, tsubgraph->output_tensors, buffers);
free(buffers);
pipe_resource_reference(&input.resource, NULL);
if (unlikely(debug_get_option_debug_teflon() & TEFLON_DEBUG_VERBOSE)) {
struct timespec time;
clock_gettime(CLOCK_MONOTONIC, &time);
end = (long)time.tv_sec * 1000 + (long)time.tv_nsec / 1000000;
teflon_debug("teflon: invoked graph, took %ld ms\n", (end - start));
}
return kTfLiteOk;
}
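/* Delegate entry point: walks the execution plan, collects the nodes that
 * Teflon can handle (2D convolutions, depthwise convolutions and additions)
 * and asks TensorFlow Lite to replace them with this delegate's kernels. */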
static TfLiteStatus
PrepareDelegate(TfLiteContext *context, TfLiteDelegate *delegate)
{
TfLiteIntArray *plan;
TfLiteNode *node;
TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
// Get a list of supported nodes.
TfLiteIntArray *supported_nodes = malloc(plan->size * sizeof(int) + sizeof(*supported_nodes));
supported_nodes->size = plan->size;
unsigned node_count = 0;
for (int i = 0; i < plan->size; i++) {
int node_index = plan->data[i];
bool supported = false;
TfLiteRegistration *registration;
TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
context, node_index, &node, &registration));
switch(registration->builtin_code) {
case kTfLiteBuiltinConv2d:
case kTfLiteBuiltinDepthwiseConv2d: {
TfLiteTensor bias_tensor = context->tensors[node->inputs->data[2]];
/* Skip out channel numbers that the HW doesn't support */
if (bias_tensor.dims->data[0] > 8 && bias_tensor.dims->data[0] % 8 != 0)
supported = false;
else
supported = true;
break;
}
case kTfLiteBuiltinAdd:
supported = true;
break;
}
if (supported)
supported_nodes->data[node_count++] = node_index;
}
supported_nodes->size = node_count;
TfLiteRegistration registration;
registration.init = partition_init;
registration.free = partition_free;
registration.prepare = partition_prepare;
registration.invoke = partition_invoke;
registration.profiling_string = NULL;
registration.builtin_code = kTfLiteBuiltinDelegate;
registration.version = 1;
registration.registration_external = NULL;
registration.custom_name = "Teflon Delegate";
// Replace supported subgraphs.
TfLiteStatus status = context->ReplaceNodeSubsetsWithDelegateKernels(
context,
registration,
supported_nodes,
delegate);
free(supported_nodes);
return status;
}
static TfLiteStatus
CopyFromBufferHandle(TfLiteContext *context,
TfLiteDelegate *delegate,
TfLiteBufferHandle buffer_handle,
TfLiteTensor *tensor)
{
return kTfLiteOk;
}
static void
FreeBufferHandle(TfLiteContext *context,
TfLiteDelegate *delegate,
TfLiteBufferHandle *handle)
{
}
TfLiteDelegate *tflite_plugin_create_delegate(char **options_keys,
char **options_values,
size_t num_options,
void (*report_error)(const char *));
void tflite_plugin_destroy_delegate(TfLiteDelegate *delegate);
__attribute__((visibility("default"))) TfLiteDelegate *tflite_plugin_create_delegate(char **options_keys,
char **options_values,
size_t num_options,
void (*report_error)(const char *))
{
struct teflon_delegate *delegate = (struct teflon_delegate *)calloc(1, sizeof(*delegate));
struct pipe_screen *screen;
struct pipe_loader_device **devs;
delegate->base.flags = kTfLiteDelegateFlagsAllowDynamicTensors | kTfLiteDelegateFlagsRequirePropagatedShapes;
delegate->base.Prepare = &PrepareDelegate;
delegate->base.CopyFromBufferHandle = &CopyFromBufferHandle;
delegate->base.FreeBufferHandle = &FreeBufferHandle;
int n = pipe_loader_probe(NULL, 0, false);
devs = (struct pipe_loader_device **)malloc(sizeof(*devs) * n);
pipe_loader_probe(devs, n, false);
for (int i = 0; i < n; i++) {
if (strstr("etnaviv", devs[i]->driver_name))
delegate->dev = devs[i];
else
pipe_loader_release(&devs[i], 1);
}
free(devs);
if (delegate->dev == NULL) {
fprintf(stderr, "Couldn't open kernel device\n");
return NULL;
}
teflon_debug("Teflon delegate: loaded %s driver\n", delegate->dev->driver_name);
screen = pipe_loader_create_screen(delegate->dev);
delegate->context = screen->context_create(screen, NULL, PIPE_CONTEXT_COMPUTE_ONLY);
return &delegate->base;
}
__attribute__((visibility("default"))) void tflite_plugin_destroy_delegate(TfLiteDelegate *tflite_delegate)
{
struct teflon_delegate *delegate = (struct teflon_delegate *)tflite_delegate;
struct pipe_screen *screen;
if (tflite_delegate == NULL) {
fprintf(stderr, "tflite_plugin_destroy_delegate: NULL delegate!\n");
return;
}
screen = delegate->context->screen;
delegate->context->destroy(delegate->context);
screen->destroy(screen);
pipe_loader_release(&delegate->dev, 1);
free(delegate);
}

View File

@@ -79,6 +79,8 @@ struct pipe_video_buffer;
struct pipe_video_codec;
struct pipe_viewport_state;
struct pipe_compute_state;
struct pipe_ml_operation;
struct pipe_tensor;
union pipe_color_union;
union pipe_query_result;
struct u_log_context;
@@ -1223,6 +1225,53 @@ struct pipe_context {
struct winsys_handle *handle,
unsigned usage );
/**
* Compiles an ML subgraph so that it can be executed later. The returned
* pipe_ml_subgraph should contain all the information needed to execute
* the subgraph with as little overhead as possible.
*
* \param ctx pipe context
* \param operations array containing the definitions of the operations in the graph
* \param count number of operations
* \return a newly allocated pipe_ml_subgraph
*/
struct pipe_ml_subgraph *(*ml_subgraph_create)(struct pipe_context *context,
const struct pipe_ml_operation *operations,
unsigned count);
/**
* Invokes an ML subgraph for a given input tensor.
*
* \param ctx pipe context
* \param subgraph previously-compiled subgraph
* \param input tensor to use as the input
*/
void (*ml_subgraph_invoke)(struct pipe_context *context,
struct pipe_ml_subgraph *subgraph,
struct pipe_tensor *input);
/**
* After an ML subgraph has been invoked, copies the contents of the output
* tensors to the provided buffers.
*
* \param ctx pipe context
* \param subgraph previously-executed subgraph
* \param outputs_count number of output tensors to copy out
* \param output_idxs array with the indices of output tensors
* \param outputs array of buffers to copy the tensor data to
*/
void (*ml_subgraph_read_output)(struct pipe_context *context,
struct pipe_ml_subgraph *subgraph,
unsigned outputs_count, unsigned output_idxs[], void *outputs[]);
/**
* Releases all resources allocated by the implementation of ml_subgraph_create.
*
* \param ctx pipe context
* \param subgraph subgraph to release
*/
void (*ml_subgraph_destroy)(struct pipe_context *context,
struct pipe_ml_subgraph *subgraph);
};
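A rough sketch of how a frontend such as Teflon is expected to drive these hooks (illustrative only, not part of this header; the helper name is hypothetical):

static void
run_ml_subgraph_once(struct pipe_context *ctx,
                     const struct pipe_ml_operation *operations, unsigned count,
                     struct pipe_tensor *input,
                     unsigned output_idx, void *output_buffer)
{
   /* Compile once; the resulting subgraph can be invoked many times. */
   struct pipe_ml_subgraph *subgraph =
      ctx->ml_subgraph_create(ctx, operations, count);

   /* Run the subgraph on the accelerator for the given input tensor. */
   ctx->ml_subgraph_invoke(ctx, subgraph, input);

   /* Copy one output tensor back into a caller-provided buffer. */
   ctx->ml_subgraph_read_output(ctx, subgraph, 1, &output_idx, &output_buffer);

   ctx->ml_subgraph_destroy(ctx, subgraph);
}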

View File

@@ -1020,6 +1020,148 @@ struct pipe_grid_info
struct pipe_resource *indirect_draw_count;
};
/**
* Encapsulates all the information about a tensor. The only supported types are INT8 and UINT8.
*/
struct pipe_tensor {
/**
* Memory-backing for this tensor (use pipe_buffer_*).
*/
struct pipe_resource *resource;
/**
* Index of this tensor in the subgraph that contains it.
*/
unsigned index;
/**
* Dimensions of this tensor.
*/
unsigned dims[4];
/**
* Scale used to quantize this tensor: a quantized value q represents the real
* value scale * (q - zero_point). Only per-tensor quantization is supported.
*/
float scale;
/**
* Zero-point used to quantize this tensor.
*/
int zero_point;
/**
* Whether the tensor contains signed (INT8) or unsigned (UINT8) data.
*/
bool is_signed;
};
/**
* Type of a pipe_ml_operation.
*/
enum pipe_ml_operation_type {
PIPE_ML_OPERATION_TYPE_ADD,
PIPE_ML_OPERATION_TYPE_CONVOLUTION,
PIPE_ML_OPERATION_TYPE_POOLING,
};
/**
* Information about a single operation inside a ML subgraph.
*/
struct pipe_ml_operation
{
/**
* Type of operation.
*/
enum pipe_ml_operation_type type;
/**
* Tensor used as input.
*/
struct pipe_tensor *input_tensor;
/**
* Tensor used as output.
*/
struct pipe_tensor *output_tensor;
union {
struct {
/**
* For convolutions, tensor containing the weights.
*/
struct pipe_tensor *weight_tensor;
/**
* For convolutions, tensor containing the biases.
*/
struct pipe_tensor *bias_tensor;
/**
* Stride used to access the input tensor on the x axis.
*/
unsigned stride_x;
/**
* Stride used to access the input tensor on the y axis.
*/
unsigned stride_y;
/**
* Whether to use SAME padding when accessing the input tensor.
*/
bool padding_same;
/**
* Whether this is a pointwise (1x1 kernels) convolution.
*/
bool pointwise;
/**
* Whether this is a depthwise convolution.
*/
bool depthwise;
} conv;
struct {
/**
* Stride used to access the input tensor on the x axis.
*/
unsigned stride_x;
/**
* Stride used to access the input tensor on the y axis.
*/
unsigned stride_y;
/**
* Width of the area used for pooling.
*/
unsigned filter_width;
/**
* Height of the area used for pooling.
*/
unsigned filter_height;
/**
* Whether to use SAME padding when accessing the input tensor.
*/
bool padding_same;
} pooling;
struct {
/**
* Additional input tensor, to be added to the other one.
*/
struct pipe_tensor *input_tensor;
} add;
};
};
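/*
 * Illustrative sketch (not part of this header): describing a single 3x3,
 * stride-1, SAME-padded convolution with the structures above, assuming a
 * tensors[] array that has already been filled in:
 *
 *    struct pipe_ml_operation op = {0};
 *    op.type = PIPE_ML_OPERATION_TYPE_CONVOLUTION;
 *    op.input_tensor = &tensors[input_idx];
 *    op.output_tensor = &tensors[output_idx];
 *    op.conv.weight_tensor = &tensors[weights_idx];
 *    op.conv.bias_tensor = &tensors[bias_idx];
 *    op.conv.stride_x = op.conv.stride_y = 1;
 *    op.conv.padding_same = true;
 *    op.conv.depthwise = false;
 *    op.conv.pointwise = op.conv.weight_tensor->dims[1] == 1 &&
 *                        op.conv.weight_tensor->dims[2] == 1;
 */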
/**
* Subgraph that drivers can subclass to keep the output of the subgraph
* compilation process.
*/
struct pipe_ml_subgraph
{
/**
* pipe_context that owns this subgraph.
*/
struct pipe_context *context;
};
/**
* Structure used as a header for serialized compute programs.
*/

View File

@@ -260,3 +260,8 @@ if with_swrast_vk
subdir('frontends/lavapipe')
subdir('targets/lavapipe')
endif
if with_teflon
subdir('frontends/teflon')
subdir('targets/teflon')
endif

View File

@@ -0,0 +1,47 @@
libteflon = shared_library(
'teflon',
[ 'target.c' ],
include_directories : [ inc_src, inc_util, inc_include, inc_gallium, inc_gallium_aux, inc_gallium_winsys, inc_gallium_drivers ],
link_whole : [ libteflon_st ],
link_with : [libpipe_loader_static, libws_null, libwsw, libswdri, libswkmsdri, libgallium ],
gnu_symbol_visibility : 'hidden',
link_args : ld_args_build_id,
dependencies : [
driver_etnaviv,
idep_nir,
],
install : true,
)
if with_tests
tensorflow_lite = shared_library(
'tensorflow-lite',
[ 'tflite-stub.c' ],
include_directories : [ inc_include ],
install : false,
)
tflite_flatbuffer_h_name = 'tflite-schema-v2.15.0_generated.h'
tflite_flatbuffer_h = custom_target('tflite_flatbuffer.h',
output: tflite_flatbuffer_h_name,
input: 'tflite-schema-v2.15.0.fbs',
command : [
prog_flatc,
'--cpp',
'--cpp-ptr-type', 'std::shared_ptr',
'--gen-object-api',
'-o', meson.current_build_dir(),
'@INPUT@'
],
)
executable(
'test_teflon',
'test_teflon.cpp',
'test_executor.cpp',
tflite_flatbuffer_h,
dependencies : [ idep_mesautil, idep_gtest ],
link_with : [ tensorflow_lite ],
include_directories : [ inc_include ],
)
endif

View File

@@ -0,0 +1,2 @@
#include "target-helpers/drm_helper.h"
#include "target-helpers/sw_helper.h"

View File

@@ -0,0 +1,318 @@
/*
* Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include <dlfcn.h>
#include <stdio.h>
#include <vector>
#include <gtest/gtest.h>
#include <xtensor/xrandom.hpp>
#include "util/macros.h"
#include "tensorflow/lite/c/c_api.h"
#include "tensorflow/lite/c/common.h"
#include <fcntl.h>
#include "test_executor.h"
#include "tflite-schema-v2.15.0_generated.h"
static float
randf(float min, float max)
{
return ((max - min) * ((float)rand() / (float)RAND_MAX)) + min;
}
static void
read_model(const char *file_name, tflite::ModelT &model)
{
std::ostringstream file_path;
assert(getenv("TEFLON_TEST_DATA"));
file_path << getenv("TEFLON_TEST_DATA") << "/" << file_name;
FILE *f = fopen(file_path.str().c_str(), "rb");
assert(f);
fseek(f, 0, SEEK_END);
long fsize = ftell(f);
fseek(f, 0, SEEK_SET);
void *buf = malloc(fsize);
fread(buf, fsize, 1, f);
fclose(f);
tflite::GetModel(buf)->UnPackTo(&model);
}
static void
patch_conv2d(unsigned operation_index,
tflite::ModelT *model,
int input_size,
int weight_size,
int input_channels,
int output_channels,
int stride,
bool padding_same,
bool is_signed,
bool depthwise)
{
unsigned output_size = 0;
unsigned input_index;
unsigned weights_index;
unsigned bias_index;
unsigned output_index;
unsigned weights_buffer_index;
unsigned bias_buffer_index;
auto subgraph = model->subgraphs[0];
/* Operation */
if (depthwise) {
auto value = new tflite::DepthwiseConv2DOptionsT();
value->depth_multiplier = 1;
value->padding = padding_same ? tflite::Padding_SAME : tflite::Padding_VALID;
value->stride_w = stride;
value->stride_h = stride;
value->dilation_w_factor = 1;
value->dilation_h_factor = 1;
subgraph->operators[operation_index]->builtin_options.value = value;
subgraph->operators[operation_index]->builtin_options.type = tflite::BuiltinOptions_DepthwiseConv2DOptions;
model->operator_codes[0]->deprecated_builtin_code = 4;
model->operator_codes[0]->builtin_code = tflite::BuiltinOperator_DEPTHWISE_CONV_2D;
} else {
auto value = new tflite::Conv2DOptionsT();
value->padding = padding_same ? tflite::Padding_SAME : tflite::Padding_VALID;
value->stride_w = stride;
value->stride_h = stride;
subgraph->operators[operation_index]->builtin_options.value = value;
}
input_index = subgraph->operators[operation_index]->inputs.data()[0];
weights_index = subgraph->operators[operation_index]->inputs.data()[1];
bias_index = subgraph->operators[operation_index]->inputs.data()[2];
output_index = subgraph->operators[operation_index]->outputs.data()[0];
/* Input */
auto input_tensor = subgraph->tensors[input_index];
input_tensor->shape.data()[0] = 1;
input_tensor->shape.data()[1] = input_size;
input_tensor->shape.data()[2] = input_size;
input_tensor->shape.data()[3] = input_channels;
input_tensor->type = is_signed ? tflite::TensorType_INT8 : tflite::TensorType_UINT8;
/* Bias */
auto bias_tensor = subgraph->tensors[bias_index];
bias_buffer_index = bias_tensor->buffer;
bias_tensor->shape.data()[0] = output_channels;
auto bias_data = &model->buffers[bias_buffer_index]->data;
xt::xarray<int32_t> bias_array = xt::random::randint<int32_t>({output_channels}, -20000, 20000);
bias_data->resize(bias_array.size() * sizeof(int32_t));
memcpy(bias_data->data(), bias_array.data(), bias_array.size() * sizeof(int32_t));
/* Weight */
auto weight_tensor = subgraph->tensors[weights_index];
weights_buffer_index = weight_tensor->buffer;
if (depthwise) {
weight_tensor->shape.data()[0] = 1;
weight_tensor->shape.data()[1] = weight_size;
weight_tensor->shape.data()[2] = weight_size;
weight_tensor->shape.data()[3] = output_channels;
} else {
weight_tensor->shape.data()[0] = output_channels;
weight_tensor->shape.data()[1] = weight_size;
weight_tensor->shape.data()[2] = weight_size;
weight_tensor->shape.data()[3] = input_channels;
}
weight_tensor->type = is_signed ? tflite::TensorType_INT8 : tflite::TensorType_UINT8;
auto weights_data = &model->buffers[weights_buffer_index]->data;
std::vector<int> weight_shape;
if (depthwise)
weight_shape = {1, weight_size, weight_size, output_channels};
else
weight_shape = {output_channels, weight_size, weight_size, input_channels};
xt::xarray<uint8_t> weights_array = xt::random::randint<uint8_t>(weight_shape, 0, 255);
weights_data->resize(weights_array.size());
memcpy(weights_data->data(), weights_array.data(), weights_array.size());
/* Output */
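/* Output size: SAME padding gives ceil(input / stride); VALID gives
   floor((input - weight) / stride) + 1, which is what the integer
   arithmetic below computes. */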
if (padding_same)
output_size = (input_size + stride - 1) / stride;
else
output_size = (input_size + stride - weight_size) / stride;
auto output_tensor = subgraph->tensors[output_index];
output_tensor->shape.data()[0] = 1;
output_tensor->shape.data()[1] = output_size;
output_tensor->shape.data()[2] = output_size;
output_tensor->shape.data()[3] = output_channels;
output_tensor->type = is_signed ? tflite::TensorType_INT8 : tflite::TensorType_UINT8;
}
std::vector<uint8_t>
conv2d_generate_model(int input_size,
int weight_size,
int input_channels,
int output_channels,
int stride,
bool padding_same,
bool is_signed,
bool depthwise)
{
tflite::ModelT model;
read_model("conv2d.tflite", model);
patch_conv2d(0, &model, input_size, weight_size, input_channels, output_channels, stride, padding_same, is_signed, depthwise);
flatbuffers::FlatBufferBuilder builder;
builder.Finish(tflite::Model::Pack(builder, &model), "TFL3");
return {builder.GetBufferPointer(), builder.GetBufferPointer() + builder.GetSize()};
}
static void
patch_quant_for_add(tflite::ModelT *model)
{
auto subgraph = model->subgraphs[0];
auto add_op = subgraph->operators[2];
auto input_index = add_op->inputs.data()[0];
auto input_tensor = subgraph->tensors[input_index];
input_tensor->quantization->scale[0] = randf(0.0078125, 0.4386410117149353);
input_tensor->quantization->zero_point[0] = rand() % 255;
input_index = add_op->inputs.data()[1];
input_tensor = subgraph->tensors[input_index];
input_tensor->quantization->scale[0] = randf(0.0078125, 0.4386410117149353);
input_tensor->quantization->zero_point[0] = rand() % 255;
}
std::vector<uint8_t>
add_generate_model(int input_size,
int weight_size,
int input_channels,
int output_channels,
int stride,
bool padding_same,
bool is_signed,
bool depthwise)
{
tflite::ModelT model;
read_model("add.tflite", model);
patch_conv2d(0, &model, input_size, weight_size, input_channels, output_channels, stride, padding_same, is_signed, depthwise);
patch_conv2d(1, &model, input_size, weight_size, input_channels, output_channels, stride, padding_same, is_signed, depthwise);
patch_quant_for_add(&model);
/* Output */
auto subgraph = model.subgraphs[0];
unsigned input_index = subgraph->operators[2]->inputs.data()[0];
unsigned output_index = subgraph->operators[2]->outputs.data()[0];
auto input_tensor = subgraph->tensors[input_index];
auto output_tensor = subgraph->tensors[output_index];
output_tensor->shape.data()[0] = input_tensor->shape.data()[0];
output_tensor->shape.data()[1] = input_tensor->shape.data()[1];
output_tensor->shape.data()[2] = input_tensor->shape.data()[2];
output_tensor->shape.data()[3] = input_tensor->shape.data()[3];
output_tensor->type = is_signed ? tflite::TensorType_INT8 : tflite::TensorType_UINT8;
flatbuffers::FlatBufferBuilder builder;
builder.Finish(tflite::Model::Pack(builder, &model), "TFL3");
return {builder.GetBufferPointer(), builder.GetBufferPointer() + builder.GetSize()};
}
static void
tflite_error_cb(void *user_data, const char *format, va_list args)
{
vfprintf(stderr, format, args);
}
TfLiteDelegate *(*tflite_plugin_create_delegate)(char **options_keys,
char **options_values,
size_t num_options,
void (*report_error)(const char *));
void (*tflite_plugin_destroy_delegate)(TfLiteDelegate *delegate);
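/* Resolves the delegate entry points from the shared library pointed to by
 * the TEFLON_TEST_DELEGATE environment variable. */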
static void
load_delegate()
{
const char *delegate_path = getenv("TEFLON_TEST_DELEGATE");
assert(delegate_path);
void *delegate_lib = dlopen(delegate_path, RTLD_LAZY | RTLD_LOCAL);
assert(delegate_lib);
tflite_plugin_create_delegate = reinterpret_cast<TfLiteDelegate *(*)(char **options_keys,
char **options_values,
size_t num_options,
void (*report_error)(const char *))>(
dlsym(delegate_lib, "tflite_plugin_create_delegate"));
assert(tflite_plugin_create_delegate);
tflite_plugin_destroy_delegate = reinterpret_cast<void (*)(TfLiteDelegate *delegate)>(
dlsym(delegate_lib, "tflite_plugin_destroy_delegate"));
assert(tflite_plugin_destroy_delegate);
}
std::vector<std::vector<uint8_t>>
run_model(TfLiteModel *model, enum executor executor, std::vector<std::vector<uint8_t>> &input)
{
TfLiteDelegate *delegate = NULL;
TfLiteInterpreterOptions *options = TfLiteInterpreterOptionsCreate();
bool generate_random_input = input.empty();
std::vector<std::vector<uint8_t>> output;
if (executor == EXECUTOR_NPU) {
load_delegate();
delegate = tflite_plugin_create_delegate(NULL, NULL, 0, NULL);
TfLiteInterpreterOptionsAddDelegate(options, delegate);
}
TfLiteInterpreterOptionsSetErrorReporter(options, tflite_error_cb, NULL);
TfLiteInterpreter *interpreter = TfLiteInterpreterCreate(model, options);
assert(interpreter);
TfLiteInterpreterAllocateTensors(interpreter);
unsigned input_tensors = TfLiteInterpreterGetInputTensorCount(interpreter);
for (unsigned i = 0; i < input_tensors; i++) {
TfLiteTensor *input_tensor = TfLiteInterpreterGetInputTensor(interpreter, i);
if (generate_random_input) {
int shape[4] = {input_tensor->dims->data[0],
input_tensor->dims->data[1],
input_tensor->dims->data[2],
input_tensor->dims->data[3]};
xt::xarray<uint8_t> a = xt::random::randint<uint8_t>(shape, 0, 255);
input.push_back({a.begin(), a.end()});
}
TfLiteTensorCopyFromBuffer(input_tensor, input[i].data(), input_tensor->bytes);
}
EXPECT_EQ(TfLiteInterpreterInvoke(interpreter), kTfLiteOk);
unsigned output_tensors = TfLiteInterpreterGetOutputTensorCount(interpreter);
for (unsigned i = 0; i < output_tensors; i++) {
const TfLiteTensor *output_tensor = TfLiteInterpreterGetOutputTensor(interpreter, i);
std::vector<uint8_t> out;
out.resize(output_tensor->bytes);
EXPECT_EQ(TfLiteTensorCopyToBuffer(output_tensor, out.data(), output_tensor->bytes), kTfLiteOk);
output.push_back(out);
}
TfLiteInterpreterDelete(interpreter);
if (executor == EXECUTOR_NPU)
tflite_plugin_destroy_delegate(delegate);
TfLiteInterpreterOptionsDelete(options);
return output;
}

View File

@@ -0,0 +1,31 @@
/*
* Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
enum executor {
EXECUTOR_CPU,
EXECUTOR_NPU,
};
struct TfLiteModel;
std::vector<uint8_t> conv2d_generate_model(int input_size,
int weight_size,
int input_channels,
int output_channels,
int stride,
bool padding_same,
bool is_signed,
bool depthwise);
std::vector<uint8_t> add_generate_model(int input_size,
int weight_size,
int input_channels,
int output_channels,
int stride,
bool padding_same,
bool is_signed,
bool depthwise);
std::vector<std::vector<uint8_t>> run_model(TfLiteModel *model, enum executor executor, std::vector<std::vector<uint8_t>> &input);

View File

@@ -0,0 +1,218 @@
# MIT License
#
# Copyright (c) 2021 VeriSilicon, INC.
# Copyright (c) 2023 Tomeu Vizoso
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import math
import os
import os.path
import re
import sys
import tempfile
import time
import numpy as np
import pytest
import json
import tensorflow as tf
from tensorflow import keras
MODEL_PATH = "conv2d.tflite"
def create_model_keras(batch_size, in_w, in_h, k_w, k_h, in_ch, out_ch, stride, padding, signed, seed, depthwise):
tf.random.set_seed(seed)
input_shape = [batch_size, in_h, in_w, in_ch]
out_channel = out_ch
kernel_shape = [k_w, k_h]
input_dtype = tf.float32
if depthwise:
conv = keras.layers.DepthwiseConv2D(kernel_size=kernel_shape, strides=stride, padding=padding, depth_multiplier=1)
else:
conv = keras.layers.Conv2D(filters=out_channel, kernel_size=kernel_shape, strides=stride, padding=padding)
model = keras.models.Sequential([
keras.layers.InputLayer(input_shape=input_shape[1:], batch_size=input_shape[0]),
conv
])
model.build(input_shape=input_shape)
if depthwise:
weight_shape = [k_w, k_h, in_ch, 1]
else:
weight_shape = [k_w, k_h, in_ch, out_ch]
weight_data = tf.random.normal(weight_shape, 0, 127, input_dtype, seed=seed)
bias_data = tf.random.normal((out_ch, ), 0, 127, input_dtype, seed=seed)
model.set_weights([np.asarray(weight_data, dtype=np.float32), np.asarray(bias_data, dtype=np.float32)])
tmp = tempfile.NamedTemporaryFile(delete=False, prefix="conv2d-", suffix=".h5", mode="w")
model.save(tmp.name)
tmp.close()
converter = tf.compat.v1.lite.TFLiteConverter.from_keras_model_file(tmp.name)
os.unlink(tmp.name)
converter.quantized_input_stats = {model.layers[0].input.name: (128, 128.0)}
converter.default_ranges_stats = (0.0, 6.0)
if signed:
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
converter.inference_type = tf.int8
else:
converter.inference_input_type = tf.uint8
converter.inference_output_type = tf.uint8
converter.inference_type = tf.uint8
tflite_model = converter.convert()
fp = open(MODEL_PATH, "wb")
fp.write(tflite_model)
fp.flush()
tf.lite.experimental.Analyzer.analyze(model_path=MODEL_PATH, gpu_compatibility=True)
return MODEL_PATH
def tflite_to_json(file_path):
ret = os.system("flatc --json src/gallium/frontends/teflon/tests/tflite_schema.fbs -- " + file_path)
assert(ret == 0)
return os.path.splitext(file_path)[0] + ".json"
WEIGHTS_BUFFER = 2
BIAS_BUFFER = 3
VERSION_BUFFER = 5
def zero_irrelevant_values(file_path, signed):
model_data = open(file_path).read()
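# flatc emits field names without quotes; wrap every key in double quotes so
# that json.loads() below accepts the output.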
model_data = re.sub("(\\\"(.*?)\\\"|(\\w+))(\\s*:\\s*(\\\".*?\\\"|.))", "\"\\2\\3\"\\4", model_data)
model = json.loads(model_data)
#print(json.dumps(model, indent=4))
if "version" in model["operator_codes"][0].keys():
del model["operator_codes"][0]["version"]
for subgraph in model["subgraphs"]:
for tensor in subgraph["tensors"]:
tensor["name"] = ""
if signed:
tensor["quantization"]["scale"] = [0.0] * len(tensor["quantization"]["scale"])
else:
tensor["quantization"]["scale"] = [0.0]
if signed:
tensor["quantization"]["zero_point"] = [0] * len(tensor["quantization"]["zero_point"])
else:
tensor["quantization"]["zero_point"] = [0]
model["buffers"][BIAS_BUFFER]["data"] = [0] * len(model["buffers"][BIAS_BUFFER]["data"])
model["buffers"][WEIGHTS_BUFFER]["data"] = [0] * len(model["buffers"][WEIGHTS_BUFFER]["data"])
model["buffers"][VERSION_BUFFER]["data"] = [0]
if "signature_defs" in model:
del model["signature_defs"]
open(file_path, "w").write(json.dumps(model, indent=4))
def diff(file_1, file_2):
ret = os.system("diff -U30 -u " + file_1 + " " + file_2)
assert(ret == 0)
def create_model(batch_size, in_w, in_h, k_w, k_h, in_ch, out_ch, stride, padding, signed, seed, depthwise):
args = ['build/src/gallium/targets/teflon/test_teflon',
'generate_model',
str(in_w),
str(k_w),
str(in_ch),
str(out_ch),
str(stride),
"1" if padding == "same" else "0",
str(int(signed)),
str(int(depthwise)),
str(seed)]
print(' '.join(args))
os.system(' '.join(args))
return "model.tflite"
def convolution(batch_size, input_size, weight_size, in_ch, out_ch, stride, padding, signed, seed, depthwise):
in_w = input_size
in_h = input_size
k_w = weight_size
k_h = weight_size
# Depthwise convolutions require the out channels to be a multiple of input channels
assert not (depthwise and out_ch % in_ch != 0)
# Depthwise convolutions with a single IFM don't make sense
assert not (depthwise and in_ch == 1)
# Depthwise convolutions with IFM != OFM are not supported
assert not (depthwise and out_ch != in_ch)
np.random.seed(seed)
model_file = create_model_keras(batch_size, in_w, in_h, k_w, k_h, in_ch, out_ch, stride, padding, signed, seed, depthwise)
model_file_2 = create_model(batch_size, in_w, in_h, k_w, k_h, in_ch, out_ch, stride, padding, signed, seed, depthwise)
json_file = tflite_to_json(model_file)
json_file_2 = tflite_to_json(model_file_2)
os.unlink(model_file)
os.unlink(model_file_2)
zero_irrelevant_values(json_file, signed)
zero_irrelevant_values(json_file_2, signed)
#print(json.dumps(json.loads(open(json_file).read()), indent=4))
diff(json_file, json_file_2)
os.unlink(json_file)
os.unlink(json_file_2)
@pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("input_size", [4, 112])
@pytest.mark.parametrize("weight_size", [1, 3])
@pytest.mark.parametrize("in_ch", [1, 32, 120, 128, 256])
@pytest.mark.parametrize("out_ch", [1, 32, 120, 128, 256, 480])
@pytest.mark.parametrize("stride", [1, 2])
@pytest.mark.parametrize("padding", ["valid", "same"])
@pytest.mark.parametrize("signed", [False])
@pytest.mark.parametrize("seed", [4, 5])
def test_conv2d(batch_size, input_size, weight_size, in_ch, out_ch, stride, padding, signed, seed):
s = "%r-%r-%s-%r-%r-%r-%r-%r-%r" % (seed, signed, padding, stride, out_ch, in_ch, weight_size, input_size, batch_size)
print(s, file=sys.stderr)
convolution(batch_size, input_size, weight_size, in_ch, out_ch, stride, padding, signed, seed, depthwise=False)
@pytest.mark.parametrize("batch_size", [1])
@pytest.mark.parametrize("input_size", [4, 112])
@pytest.mark.parametrize("weight_size", [3])
@pytest.mark.parametrize("channels", [32, 128, 256])
@pytest.mark.parametrize("stride", [1, 2])
@pytest.mark.parametrize("padding", ["valid", "same"])
@pytest.mark.parametrize("signed", [False])
@pytest.mark.parametrize("seed", [4, 5])
def test_depthwise(batch_size, input_size, weight_size, channels, stride, padding, signed, seed):
s = "%r-%s-%r-%r-%r-%r-%r-%r" % (seed, signed, padding, stride, channels, weight_size, input_size, batch_size)
print(s, file=sys.stderr)
convolution(batch_size, input_size, weight_size, channels, channels, stride, padding, signed, seed, depthwise=True)
test_conv2d(1, 80, 5, 16, 128, 2, "same", False, 4)

View File

@@ -0,0 +1,514 @@
/*
* Copyright (c) 2023-2024 Tomeu Vizoso <tomeu@tomeuvizoso.net>
* SPDX-License-Identifier: MIT
*/
#include <fcntl.h>
#include <filesystem>
#include <fstream>
#include <gtest/gtest.h>
#include <xtensor/xrandom.hpp>
#include <iostream>
#include "tensorflow/lite/c/c_api.h"
#include "test_executor.h"
#define TEST_CONV2D 1
#define TEST_DEPTHWISE 1
#define TEST_ADD 1
#define TEST_MOBILENETV1 1
#define TEST_MOBILEDET 1
#define TOLERANCE 2
#define MODEL_TOLERANCE 8
#define QUANT_TOLERANCE 2
std::vector<bool> is_signed{false}; /* TODO: Support INT8? */
std::vector<bool> padding_same{false, true};
std::vector<int> stride{1, 2};
std::vector<int> output_channels{1, 32, 120, 128, 160, 256};
std::vector<int> input_channels{1, 32, 120, 128, 256};
std::vector<int> dw_channels{1, 32, 120, 128, 256};
std::vector<int> dw_weight_size{3, 5};
std::vector<int> weight_size{1, 3, 5};
std::vector<int> input_size{3, 5, 8, 80, 112};
static bool
cache_is_enabled(void)
{
return getenv("TEFLON_ENABLE_CACHE");
}
static bool
read_into(const char *path, std::vector<uint8_t> &buf)
{
FILE *f = fopen(path, "rb");
if (f == NULL)
return false;
fseek(f, 0, SEEK_END);
long fsize = ftell(f);
fseek(f, 0, SEEK_SET);
buf.resize(fsize);
fread(buf.data(), fsize, 1, f);
fclose(f);
return true;
}
static void
set_seed(unsigned seed)
{
srand(seed);
xt::random::seed(seed);
}
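/* Runs the model once on the CPU and once on the NPU (through the Teflon
 * delegate) and checks that every output byte matches within the given
 * tolerance. Inputs and CPU outputs can be reused across runs by setting
 * TEFLON_ENABLE_CACHE. */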
static void
test_model(std::vector<uint8_t> buf, std::string cache_dir, unsigned tolerance)
{
std::vector<std::vector<uint8_t>> input;
std::vector<std::vector<uint8_t>> cpu_output;
std::ostringstream input_cache;
input_cache << cache_dir << "/"
<< "input.data";
std::ostringstream output_cache;
output_cache << cache_dir << "/"
<< "output.data";
TfLiteModel *model = TfLiteModelCreate(buf.data(), buf.size());
assert(model);
if (cache_is_enabled()) {
input.resize(1);
bool ret = read_into(input_cache.str().c_str(), input[0]);
if (ret) {
cpu_output.resize(1);
ret = read_into(output_cache.str().c_str(), cpu_output[0]);
}
}
if (cpu_output.size() == 0 || cpu_output[0].size() == 0) {
input.resize(0);
cpu_output.resize(0);
cpu_output = run_model(model, EXECUTOR_CPU, input);
if (cache_is_enabled()) {
std::ofstream file(input_cache.str().c_str(), std::ios::out | std::ios::binary);
file.write(reinterpret_cast<const char *>(input[0].data()), input[0].size());
file.close();
file = std::ofstream(output_cache.str().c_str(), std::ios::out | std::ios::binary);
file.write(reinterpret_cast<const char *>(cpu_output[0].data()), cpu_output[0].size());
file.close();
}
}
std::vector<std::vector<uint8_t>> npu_output = run_model(model, EXECUTOR_NPU, input);
EXPECT_EQ(cpu_output.size(), npu_output.size()) << "Array sizes differ.";
for (size_t i = 0; i < cpu_output.size(); i++) {
EXPECT_EQ(cpu_output[i].size(), npu_output[i].size()) << "Array sizes differ (" << i << ").";
for (size_t j = 0; j < cpu_output[i].size(); j++) {
if (abs(cpu_output[i][j] - npu_output[i][j]) > tolerance) {
std::cout << "CPU: ";
for (int k = 0; k < std::min(int(cpu_output[i].size()), 24); k++)
std::cout << std::setfill('0') << std::setw(2) << std::hex << int(cpu_output[i][k]) << " ";
std::cout << "\n";
std::cout << "NPU: ";
for (int k = 0; k < std::min(int(npu_output[i].size()), 24); k++)
std::cout << std::setfill('0') << std::setw(2) << std::hex << int(npu_output[i][k]) << " ";
std::cout << "\n";
FAIL() << "Output at " << j << " from the NPU (" << std::setfill('0') << std::setw(2) << std::hex << int(npu_output[i][j]) << ") doesn't match that from the CPU (" << std::setfill('0') << std::setw(2) << std::hex << int(cpu_output[i][j]) << ").";
}
}
}
TfLiteModelDelete(model);
}
static void
test_model_file(std::string file_name)
{
set_seed(4);
std::ifstream model_file(file_name, std::ios::binary);
std::vector<uint8_t> buffer((std::istreambuf_iterator<char>(model_file)),
std::istreambuf_iterator<char>());
test_model(buffer, "", MODEL_TOLERANCE);
}
void
test_conv(int input_size, int weight_size, int input_channels, int output_channels,
int stride, bool padding_same, bool is_signed, bool depthwise, int seed)
{
std::vector<uint8_t> buf;
std::ostringstream cache_dir, model_cache;
cache_dir << "/var/cache/teflon_tests/" << input_size << "_" << weight_size << "_" << input_channels << "_" << output_channels << "_" << stride << "_" << padding_same << "_" << is_signed << "_" << depthwise << "_" << seed;
model_cache << cache_dir.str() << "/"
<< "model.tflite";
if (weight_size > input_size)
GTEST_SKIP();
set_seed(seed);
if (cache_is_enabled()) {
if (access(model_cache.str().c_str(), F_OK) == 0) {
read_into(model_cache.str().c_str(), buf);
}
}
if (buf.size() == 0) {
buf = conv2d_generate_model(input_size, weight_size,
input_channels, output_channels,
stride, padding_same, is_signed,
depthwise);
if (cache_is_enabled()) {
if (access(cache_dir.str().c_str(), F_OK) != 0) {
ASSERT_TRUE(std::filesystem::create_directories(cache_dir.str().c_str()));
}
std::ofstream file(model_cache.str().c_str(), std::ios::out | std::ios::binary);
file.write(reinterpret_cast<const char *>(buf.data()), buf.size());
file.close();
}
}
test_model(buf, cache_dir.str(), TOLERANCE);
}
void
test_add(int input_size, int weight_size, int input_channels, int output_channels,
int stride, bool padding_same, bool is_signed, bool depthwise, int seed,
unsigned tolerance)
{
std::vector<uint8_t> buf;
std::ostringstream cache_dir, model_cache;
cache_dir << "/var/cache/teflon_tests/"
<< "add_" << input_size << "_" << weight_size << "_" << input_channels << "_" << output_channels << "_" << stride << "_" << padding_same << "_" << is_signed << "_" << depthwise << "_" << seed;
model_cache << cache_dir.str() << "/"
<< "model.tflite";
if (weight_size > input_size)
GTEST_SKIP();
set_seed(seed);
if (cache_is_enabled()) {
if (access(model_cache.str().c_str(), F_OK) == 0) {
read_into(model_cache.str().c_str(), buf);
}
}
if (buf.size() == 0) {
buf = add_generate_model(input_size, weight_size,
input_channels, output_channels,
stride, padding_same, is_signed,
depthwise);
if (cache_is_enabled()) {
if (access(cache_dir.str().c_str(), F_OK) != 0) {
ASSERT_TRUE(std::filesystem::create_directories(cache_dir.str().c_str()));
}
std::ofstream file(model_cache.str().c_str(), std::ios::out | std::ios::binary);
file.write(reinterpret_cast<const char *>(buf.data()), buf.size());
file.close();
}
}
test_model(buf, cache_dir.str(), tolerance);
}
#if TEST_CONV2D
class Conv2D : public testing::TestWithParam<std::tuple<bool, bool, int, int, int, int, int>> {};
TEST_P(Conv2D, Op)
{
test_conv(std::get<6>(GetParam()),
std::get<5>(GetParam()),
std::get<4>(GetParam()),
std::get<3>(GetParam()),
std::get<2>(GetParam()),
std::get<1>(GetParam()),
std::get<0>(GetParam()),
false, /* depthwise */
4);
}
static inline std::string
Conv2DTestCaseName(
const testing::TestParamInfo<std::tuple<bool, bool, int, int, int, int, int>> &info)
{
std::string name = "";
name += "input_size_" + std::to_string(std::get<6>(info.param));
name += "_weight_size_" + std::to_string(std::get<5>(info.param));
name += "_input_channels_" + std::to_string(std::get<4>(info.param));
name += "_output_channels_" + std::to_string(std::get<3>(info.param));
name += "_stride_" + std::to_string(std::get<2>(info.param));
name += "_padding_same_" + std::to_string(std::get<1>(info.param));
name += "_is_signed_" + std::to_string(std::get<0>(info.param));
return name;
}
INSTANTIATE_TEST_SUITE_P(
, Conv2D,
::testing::Combine(::testing::ValuesIn(is_signed),
::testing::ValuesIn(padding_same),
::testing::ValuesIn(stride),
::testing::ValuesIn(output_channels),
::testing::ValuesIn(input_channels),
::testing::ValuesIn(weight_size),
::testing::ValuesIn(input_size)),
Conv2DTestCaseName);
#endif
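/*
 * Depthwise convolutions use the same channel count for input and output
 * (std::get<3>() is passed twice), so this tuple has one fewer dimension
 * than the regular Conv2D suite.
 */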
#if TEST_DEPTHWISE
class DepthwiseConv2D : public testing::TestWithParam<std::tuple<bool, bool, int, int, int, int>> {};
TEST_P(DepthwiseConv2D, Op)
{
test_conv(std::get<5>(GetParam()),
std::get<4>(GetParam()),
std::get<3>(GetParam()),
std::get<3>(GetParam()),
std::get<2>(GetParam()),
std::get<1>(GetParam()),
std::get<0>(GetParam()),
true, /* depthwise */
4);
}
static inline std::string
DepthwiseConv2DTestCaseName(
const testing::TestParamInfo<std::tuple<bool, bool, int, int, int, int>> &info)
{
std::string name = "";
name += "input_size_" + std::to_string(std::get<5>(info.param));
name += "_weight_size_" + std::to_string(std::get<4>(info.param));
name += "_channels_" + std::to_string(std::get<3>(info.param));
name += "_stride_" + std::to_string(std::get<2>(info.param));
name += "_padding_same_" + std::to_string(std::get<1>(info.param));
name += "_is_signed_" + std::to_string(std::get<0>(info.param));
return name;
}
INSTANTIATE_TEST_SUITE_P(
, DepthwiseConv2D,
::testing::Combine(::testing::ValuesIn(is_signed),
::testing::ValuesIn(padding_same),
::testing::ValuesIn(stride),
::testing::ValuesIn(dw_channels),
::testing::ValuesIn(dw_weight_size),
::testing::ValuesIn(input_size)),
DepthwiseConv2DTestCaseName);
#endif
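/*
 * The Add suite sweeps the same shape parameters as Conv2D. AddQuant keeps a
 * fixed 40x1x1 shape and instead sweeps 100 seeds, presumably to cover a
 * wider range of randomly generated quantization parameters, which is why it
 * uses the looser QUANT_TOLERANCE.
 */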
#if TEST_ADD
class Add : public testing::TestWithParam<std::tuple<bool, bool, int, int, int, int, int>> {};
TEST_P(Add, Op)
{
test_add(std::get<6>(GetParam()),
std::get<5>(GetParam()),
std::get<4>(GetParam()),
std::get<3>(GetParam()),
std::get<2>(GetParam()),
std::get<1>(GetParam()),
std::get<0>(GetParam()),
false, /* depthwise */
4,
TOLERANCE);
}
static inline std::string
AddTestCaseName(
const testing::TestParamInfo<std::tuple<bool, bool, int, int, int, int, int>> &info)
{
std::string name = "";
name += "input_size_" + std::to_string(std::get<6>(info.param));
name += "_weight_size_" + std::to_string(std::get<5>(info.param));
name += "_input_channels_" + std::to_string(std::get<4>(info.param));
name += "_output_channels_" + std::to_string(std::get<3>(info.param));
name += "_stride_" + std::to_string(std::get<2>(info.param));
name += "_padding_same_" + std::to_string(std::get<1>(info.param));
name += "_is_signed_" + std::to_string(std::get<0>(info.param));
return name;
}
INSTANTIATE_TEST_SUITE_P(
, Add,
::testing::Combine(::testing::ValuesIn(is_signed),
::testing::ValuesIn(padding_same),
::testing::ValuesIn(stride),
::testing::ValuesIn(output_channels),
::testing::ValuesIn(input_channels),
::testing::ValuesIn(weight_size),
::testing::ValuesIn(input_size)),
AddTestCaseName);
class AddQuant : public testing::TestWithParam<int> {};
TEST_P(AddQuant, Op)
{
test_add(40,
1,
1,
1,
1,
false, /* padding_same */
false, /* is_signed */
false, /* depthwise */
GetParam(),
QUANT_TOLERANCE);
}
INSTANTIATE_TEST_SUITE_P(
, AddQuant,
::testing::Range(0, 100));
#endif
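/*
 * The model tests below load pre-generated .tflite files from the directory
 * pointed to by the TEFLON_TEST_DATA environment variable: the whole
 * MobileNetV1 network plus 28 single-operation slices of it (mb0..mb27).
 */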
#if TEST_MOBILENETV1
class MobileNetV1 : public ::testing::Test {};
class MobileNetV1Param : public testing::TestWithParam<int> {};
TEST(MobileNetV1, Whole)
{
std::ostringstream file_path;
assert(getenv("TEFLON_TEST_DATA"));
file_path << getenv("TEFLON_TEST_DATA") << "/mobilenet_v1_1.0_224_quant.tflite";
test_model_file(file_path.str());
}
TEST_P(MobileNetV1Param, Op)
{
std::ostringstream file_path;
assert(getenv("TEFLON_TEST_DATA"));
file_path << getenv("TEFLON_TEST_DATA") << "/mb" << GetParam() << ".tflite";
test_model_file(file_path.str());
}
static inline std::string
MobileNetV1TestCaseName(
const testing::TestParamInfo<int> &info)
{
std::string name = "";
name += "mb";
name += std::to_string(info.param);
return name;
}
INSTANTIATE_TEST_SUITE_P(
, MobileNetV1Param,
::testing::Range(0, 28),
MobileNetV1TestCaseName);
#endif
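/*
 * Same scheme for MobileDet: the full
 * ssdlite_mobiledet_coco_qat_postprocess.tflite model plus 121 per-operation
 * slices (mobiledet0..mobiledet120), all read from TEFLON_TEST_DATA.
 */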
#if TEST_MOBILEDET
class MobileDet : public ::testing::Test {};
class MobileDetParam : public testing::TestWithParam<int> {};
TEST(MobileDet, Whole)
{
std::ostringstream file_path;
assert(getenv("TEFLON_TEST_DATA"));
file_path << getenv("TEFLON_TEST_DATA") << "/ssdlite_mobiledet_coco_qat_postprocess.tflite";
test_model_file(file_path.str());
}
TEST_P(MobileDetParam, Op)
{
std::ostringstream file_path;
assert(getenv("TEFLON_TEST_DATA"));
file_path << getenv("TEFLON_TEST_DATA") << "/mobiledet" << GetParam() << ".tflite";
test_model_file(file_path.str());
}
static inline std::string
MobileDetTestCaseName(
const testing::TestParamInfo<int> &info)
{
std::string name = "";
name += "mobiledet";
name += std::to_string(info.param);
return name;
}
INSTANTIATE_TEST_SUITE_P(
, MobileDetParam,
::testing::Range(0, 121),
MobileDetTestCaseName);
#endif
int
main(int argc, char **argv)
{
if (argc > 1 && !strcmp(argv[1], "generate_model")) {
std::vector<uint8_t> buf;
assert(argc == 11);
std::cout << "Generating model to ./model.tflite\n";
int n = 2;
int input_size = atoi(argv[n++]);
int weight_size = atoi(argv[n++]);
int input_channels = atoi(argv[n++]);
int output_channels = atoi(argv[n++]);
int stride = atoi(argv[n++]);
int padding_same = atoi(argv[n++]);
int is_signed = atoi(argv[n++]);
int depthwise = atoi(argv[n++]);
int seed = atoi(argv[n++]);
set_seed(seed);
buf = conv2d_generate_model(input_size, weight_size,
input_channels, output_channels,
stride, padding_same, is_signed,
depthwise);
/* O_TRUNC so regenerating over an existing model.tflite leaves no stale trailing bytes */
int fd = open("model.tflite", O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
assert(fd >= 0);
ssize_t written = write(fd, buf.data(), buf.size());
assert(written == (ssize_t)buf.size());
close(fd);
return 0;
} else if (argc > 1 && !strcmp(argv[1], "run_model")) {
assert(argc == 3);
test_model_file(std::string(argv[2]));
} else {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
}
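/*
 * Example invocations; the binary name depends on how the test target is
 * built, so "test_teflon" here is only illustrative:
 *
 *   ./test_teflon generate_model 5 3 32 16 1 1 0 0 4
 *       writes ./model.tflite for a 5x5 input, 3x3 weights, 32 -> 16
 *       channels, stride 1, SAME padding, unsigned, non-depthwise, seed 4
 *       (argument order matches the atoi() sequence above)
 *   ./test_teflon run_model model.tflite
 *       runs a single model file
 *   TEFLON_TEST_DATA=<dir with the .tflite test files> ./test_teflon
 *       runs the full GTest suite
 */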
