diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c
index c63c392c0b9..a327766c9c5 100644
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -701,8 +701,6 @@ bool ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
    /* Add some margin of error, though this shouldn't be needed in theory. */
    info->all_vram_visible = info->vram_size * 0.9 < info->vram_vis_size;
 
-   util_cpu_detect();
-
    /* Set chip identification. */
    info->pci_id = amdinfo->asic_id; /* TODO: is this correct? */
    info->pci_rev_id = amdinfo->pci_rev_id;
diff --git a/src/amd/compiler/tests/main.cpp b/src/amd/compiler/tests/main.cpp
index e0abf63b525..8f5e8ea914b 100644
--- a/src/amd/compiler/tests/main.cpp
+++ b/src/amd/compiler/tests/main.cpp
@@ -34,8 +34,6 @@
 #include "aco_ir.h"
 #include "framework.h"
 
-#include "util/u_cpu_detect.h"
-
 static const char *help_message =
    "Usage: %s [-h] [-l --list] [--no-check] [TEST [TEST ...]]\n"
    "\n"
@@ -243,8 +241,6 @@ int main(int argc, char **argv)
       return 99;
    }
 
-   util_cpu_detect();
-
    if (do_list) {
       for (auto test : tests)
          printf("%s\n", test.first.c_str());
diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
index 02392bf546f..937d85fad92 100644
--- a/src/broadcom/vulkan/v3dv_device.c
+++ b/src/broadcom/vulkan/v3dv_device.c
@@ -51,7 +51,6 @@
 
 #include "util/build_id.h"
 #include "util/debug.h"
-#include "util/u_cpu_detect.h"
 
 #ifdef VK_USE_PLATFORM_XCB_KHR
 #include <xcb/xcb.h>
@@ -253,8 +252,6 @@ v3dv_CreateInstance(const VkInstanceCreateInfo *pCreateInfo,
       }
    }
 
-   util_cpu_detect();
-
    VG(VALGRIND_CREATE_MEMPOOL(instance, 0, false));
 
    *pInstance = v3dv_instance_to_handle(instance);
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index e0de4fef2ef..538ff49765e 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -26,7 +26,6 @@
 #include "compiler/glsl/glsl_parser_extras.h"
 #include "glsl_types.h"
 #include "util/hash_table.h"
-#include "util/u_cpu_detect.h"
 #include "util/u_string.h"
 
 
@@ -520,11 +519,6 @@ hash_free_type_function(struct hash_entry *entry)
 void
 glsl_type_singleton_init_or_ref()
 {
-   /* This is required for _mesa_half_to_float() which is
-    * required for constant-folding 16-bit float ops.
-    */
-   util_cpu_detect();
-
    mtx_lock(&glsl_type::hash_mutex);
    glsl_type_users++;
    mtx_unlock(&glsl_type::hash_mutex);
diff --git a/src/compiler/isaspec/decode.c b/src/compiler/isaspec/decode.c
index 2dc4969ae44..127773d6641 100644
--- a/src/compiler/isaspec/decode.c
+++ b/src/compiler/isaspec/decode.c
@@ -773,8 +773,6 @@ isa_decode(void *bin, int sz, FILE *out, const struct isa_decode_options *option
 	if (!options)
 		options = &default_options;
 
-	util_cpu_detect();  /* needed for _mesa_half_to_float() */
-
 	state = rzalloc_size(NULL, sizeof(*state));
 	state->options = options;
 	state->num_instr = sz / (BITMASK_WORDS * sizeof(BITSET_WORD));
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 39d3a61b9cf..a8630e2715b 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -34,7 +34,6 @@
 #include "pipe/p_context.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
-#include "util/u_cpu_detect.h"
 #include "util/u_inlines.h"
 #include "util/u_helpers.h"
 #include "util/u_prim.h"
@@ -85,9 +84,6 @@ draw_create_context(struct pipe_context *pipe, void *context,
    if (!draw)
       goto err_out;
 
-   /* we need correct cpu caps for disabling denorms in draw_vbo() */
-   util_cpu_detect();
-
 #ifdef DRAW_LLVM_AVAILABLE
    if (try_llvm && draw_get_option_use_llvm()) {
       draw->llvm = draw_llvm_create(draw, (LLVMContextRef)context);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 3f040ac21d7..19b079b1275 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -430,8 +430,6 @@ lp_build_init(void)
 
    lp_set_target_options();
 
-   util_cpu_detect();
-
    /* For simulating less capable machines */
 #ifdef DEBUG
    if (debug_get_bool_option("LP_FORCE_SSE2", FALSE)) {
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader.c b/src/gallium/auxiliary/pipe-loader/pipe_loader.c
index 6d5204d629f..1c58eaefd04 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.c
@@ -27,7 +27,6 @@
 
 #include "pipe_loader_priv.h"
 
-#include "util/u_cpu_detect.h"
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_string.h"
@@ -165,7 +164,6 @@ pipe_loader_create_screen_vk(struct pipe_loader_device *dev, bool sw_vk)
 {
    struct pipe_screen_config config;
 
-   util_cpu_detect();
    pipe_loader_load_options(dev);
    config.options_info = &dev->option_info;
    config.options = &dev->option_cache;
diff --git a/src/gallium/auxiliary/rtasm/rtasm_cpu.c b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
index 03b11f914b4..27265031412 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_cpu.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_cpu.c
@@ -37,7 +37,6 @@ DEBUG_GET_ONCE_BOOL_OPTION(nosse, "GALLIUM_NOSSE", false);
 
 static const struct util_cpu_caps_t *get_cpu_caps(void)
 {
-   util_cpu_detect();
    return util_get_cpu_caps();
 }
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index dbdc737dfd9..9fc3a712244 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -2159,7 +2159,6 @@ struct x86_reg x86_fn_arg( struct x86_function *p,
 
 static void x86_init_func_common( struct x86_function *p )
 {
-   util_cpu_detect();
    p->caps = 0;
    if(util_get_cpu_caps()->has_mmx)
       p->caps |= X86_MMX;
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index de1ead7eabd..c7a5857be49 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -4299,8 +4299,6 @@ threaded_context_create(struct pipe_context *pipe,
    if (!pipe)
       return NULL;
 
-   util_cpu_detect();
-
    if (!debug_get_bool_option("GALLIUM_THREAD", util_get_cpu_caps()->nr_cpus > 1))
       return pipe;
 
diff --git a/src/gallium/drivers/lima/standalone/lima_disasm.c b/src/gallium/drivers/lima/standalone/lima_disasm.c
index 82dcddc69be..9c8278cddd9 100644
--- a/src/gallium/drivers/lima/standalone/lima_disasm.c
+++ b/src/gallium/drivers/lima/standalone/lima_disasm.c
@@ -23,7 +23,6 @@
  */
 
 #include "util/ralloc.h"
-#include "util/u_cpu_detect.h"
 
 #include <err.h>
 #include <stdio.h>
@@ -176,9 +175,6 @@ main(int argc, char **argv)
       return -1;
    }
 
-   /* Needed by _mesa_half_to_float() */
-   util_cpu_detect();
-
    if (is_frag) {
       assert((size & 0x3) == 0);
       size >>= 2;
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 359c200279f..7ee715bf380 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -921,8 +921,8 @@ static void update_cache_sha1_cpu(struct mesa_sha1 *ctx)
     * Don't need the cpu cache affinity stuff. The rest
     * is contained in first 5 dwords.
     */
-   STATIC_ASSERT(offsetof(struct util_cpu_caps_t, num_L3_caches) == 5 * sizeof(uint32_t));
-   _mesa_sha1_update(ctx, cpu_caps, 5 * sizeof(uint32_t));
+   STATIC_ASSERT(offsetof(struct util_cpu_caps_t, num_L3_caches) == 6 * sizeof(uint32_t));
+   _mesa_sha1_update(ctx, cpu_caps, 6 * sizeof(uint32_t));
 }
 
 static void lp_disk_cache_create(struct llvmpipe_screen *screen)
@@ -1024,8 +1024,6 @@ llvmpipe_create_screen(struct sw_winsys *winsys)
 {
    struct llvmpipe_screen *screen;
 
-   util_cpu_detect();
-
    glsl_type_singleton_init_or_ref();
 
 #ifdef DEBUG
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 5ec0dd347bd..aee03dafe73 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -34,7 +34,6 @@
  */
 
 
-#include "util/u_cpu_detect.h"
 #include "util/u_math.h"
 
 #include "gallivm/lp_bld_const.h"
@@ -381,7 +380,6 @@ int main(int argc, char **argv)
    boolean single = FALSE;
    unsigned fpstate;
 
-   util_cpu_detect();
    fpstate = util_fpstate_get();
    util_fpstate_set_denorms_to_zero(fpstate);
 
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index 5ab278969f1..49a4eccd36c 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -994,8 +994,6 @@ static void si_init_renderer_string(struct si_screen *sscreen)
 
 void si_init_screen_get_functions(struct si_screen *sscreen)
 {
-   util_cpu_detect();
-
    sscreen->b.get_name = si_get_name;
    sscreen->b.get_vendor = si_get_vendor;
    sscreen->b.get_device_vendor = si_get_device_vendor;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index b604cd5545c..0116ef2bdc8 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -27,7 +27,6 @@
 #include "pipe/p_screen.h"
 #include "pipe/p_state.h"
 
-#include "util/u_cpu_detect.h"
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "util/format/u_format.h"
@@ -591,8 +590,6 @@ vc4_screen_create(int fd, struct renderonly *ro)
         if (!vc4_get_chip_info(screen))
                 goto fail;
 
-        util_cpu_detect();
-
         slab_create_parent(&screen->transfer_pool, sizeof(struct vc4_transfer), 16);
 
         vc4_fence_screen_init(screen);
diff --git a/src/gallium/drivers/zink/zink_screen.c b/src/gallium/drivers/zink/zink_screen.c
index 2abbef81df6..366a9a586e8 100644
--- a/src/gallium/drivers/zink/zink_screen.c
+++ b/src/gallium/drivers/zink/zink_screen.c
@@ -2035,7 +2035,6 @@ zink_internal_create_screen(const struct pipe_screen_config *config)
    if (!screen)
       return NULL;
 
-   util_cpu_detect();
    screen->threaded = util_get_cpu_caps()->nr_cpus > 1 && debug_get_bool_option("GALLIUM_THREAD", util_get_cpu_caps()->nr_cpus > 1);
 
    zink_debug = debug_get_option_zink_debug();
diff --git a/src/gallium/tests/unit/translate_test.c b/src/gallium/tests/unit/translate_test.c
index 782f16e7f78..16e55b231ca 100644
--- a/src/gallium/tests/unit/translate_test.c
+++ b/src/gallium/tests/unit/translate_test.c
@@ -69,8 +69,6 @@ int main(int argc, char** argv)
 
    create_fn = 0;
 
-   util_cpu_detect();
-
    if (argc <= 1 ||
        !strcmp(argv[1], "default") )
       create_fn = translate_create;
diff --git a/src/gallium/tests/unit/u_half_test.c b/src/gallium/tests/unit/u_half_test.c
index 4474cfb82b0..36422afea2f 100644
--- a/src/gallium/tests/unit/u_half_test.c
+++ b/src/gallium/tests/unit/u_half_test.c
@@ -36,7 +36,6 @@ test(void)
 int
 main(int argc, char **argv)
 {
-   util_cpu_detect();
    test();
 
    /* Test non-f16c. */
diff --git a/src/mesa/main/tests/mesa_formats.cpp b/src/mesa/main/tests/mesa_formats.cpp
index 7ca3a269896..916f7378d38 100644
--- a/src/mesa/main/tests/mesa_formats.cpp
+++ b/src/mesa/main/tests/mesa_formats.cpp
@@ -35,15 +35,9 @@
 #include "main/glformats.h"
 #include "main/format_unpack.h"
 #include "main/format_pack.h"
-#include "util/u_cpu_detect.h"
 
 // Test fixture for Format tests.
-// Currently just ensures that util_cpu_detect() has been called
 class MesaFormatsTest : public ::testing::Test {
-  protected:
-    MesaFormatsTest() {
-       util_cpu_detect();
-    }
 };
 
 /**
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 88286f44634..b3a12d4a77e 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -480,8 +480,6 @@ st_create_context_priv(struct gl_context *ctx, struct pipe_context *pipe,
    uint i;
    struct st_context *st = CALLOC_STRUCT( st_context);
 
-   util_cpu_detect();
-
    st->options = *options;
 
    ctx->st_opts = &st->options;
@@ -862,8 +860,6 @@ st_create_context(gl_api api, struct pipe_context *pipe,
    struct dd_function_table funcs;
    struct st_context *st;
 
-   util_cpu_detect();
-
    memset(&funcs, 0, sizeof(funcs));
    st_init_driver_functions(pipe->screen, &funcs, has_egl_image_validate);
 
diff --git a/src/util/tests/format/u_format_test.c b/src/util/tests/format/u_format_test.c
index 1ce93b97cc0..0b852ace540 100644
--- a/src/util/tests/format/u_format_test.c
+++ b/src/util/tests/format/u_format_test.c
@@ -869,8 +869,6 @@ int main(int argc, char **argv)
 {
    boolean success;
 
-   util_cpu_detect();
-
    success = test_all();
 
    return success ? 0 : 1;
diff --git a/src/util/u_cpu_detect.c b/src/util/u_cpu_detect.c
index 6e8743e254f..fb96e1042bd 100644
--- a/src/util/u_cpu_detect.c
+++ b/src/util/u_cpu_detect.c
@@ -860,6 +860,9 @@ util_cpu_detect_once(void)
       printf("util_cpu_caps.num_L3_caches = %u\n", util_cpu_caps.num_L3_caches);
       printf("util_cpu_caps.num_cpu_mask_bits = %u\n", util_cpu_caps.num_cpu_mask_bits);
    }
+
+   /* This must happen at the end as it's used to guard everything else */
+   p_atomic_set(&util_cpu_caps.detect_done, 1);
 }
 
 static once_flag cpu_once_flag = ONCE_FLAG_INIT;
diff --git a/src/util/u_cpu_detect.h b/src/util/u_cpu_detect.h
index a1416d5a3f5..8e35b3640a1 100644
--- a/src/util/u_cpu_detect.h
+++ b/src/util/u_cpu_detect.h
@@ -38,6 +38,7 @@
 #include <stdbool.h>
 
 #include "pipe/p_config.h"
+#include "util/u_atomic.h"
 #include "util/u_thread.h"
 
 
@@ -60,6 +61,12 @@ enum cpu_family {
 typedef uint32_t util_affinity_mask[UTIL_MAX_CPUS / 32];
 
 struct util_cpu_caps_t {
+   /**
+    * Initialized to 0 and set to non-zero with an atomic after the entire
+    * struct has been initialized.
+    */
+   uint32_t detect_done;
+
    /**
     * Number of CPUs available to the process.
     *
@@ -127,21 +134,41 @@ struct util_cpu_caps_t {
 
 #define U_CPU_INVALID_L3 0xffff
 
-static inline const struct util_cpu_caps_t *
-util_get_cpu_caps(void)
-{
-	extern struct util_cpu_caps_t util_cpu_caps;
-
-	/* If you hit this assert, it means that something is using the
-	 * cpu-caps without having first called util_cpu_detect()
-	 */
-	assert(util_cpu_caps.nr_cpus >= 1);
-
-	return &util_cpu_caps;
-}
-
 void util_cpu_detect(void);
 
+static inline ATTRIBUTE_CONST const struct util_cpu_caps_t *
+util_get_cpu_caps(void)
+{
+   extern struct util_cpu_caps_t util_cpu_caps;
+
+   /* On most CPU architectures, an atomic read is simply a regular memory
+    * load instruction with some extra compiler magic to prevent code
+    * re-ordering around it.  The perf impact of doing this check should be
+    * negligible in most cases.
+    *
+    * Also, even though it looks like  a bit of a lie, we've declared this
+    * function with ATTRIBUTE_CONST.  The GCC docs say:
+    *
+    *    "Calls to functions whose return value is not affected by changes to
+    *    the observable state of the program and that have no observable
+    *    effects on such state other than to return a value may lend
+    *    themselves to optimizations such as common subexpression elimination.
+    *    Declaring such functions with the const attribute allows GCC to avoid
+    *    emitting some calls in repeated invocations of the function with the
+    *    same argument values."
+    *
+    * The word "observable" is important here.  With the exception of a
+    * llvmpipe debug flag behind an environment variable and a few unit tests,
+    * all of which emulate worse CPUs, this function neither affects nor is
+    * affected by any "observable" state.  It has its own internal state for
+    * sure, but that state is such that it appears to return exactly the same
+    * value with the same internal data every time.
+    */
+   if (unlikely(!p_atomic_read(&util_cpu_caps.detect_done)))
+      util_cpu_detect();
+
+   return &util_cpu_caps;
+}
 
 #ifdef	__cplusplus
 }
diff --git a/src/util/u_queue.c b/src/util/u_queue.c
index d35b8f2f29b..413f8aa019e 100644
--- a/src/util/u_queue.c
+++ b/src/util/u_queue.c
@@ -263,9 +263,6 @@ util_queue_thread_func(void *input)
 
       memset(mask, 0xff, sizeof(mask));
 
-      /* Ensure util_cpu_caps.num_cpu_mask_bits is initialized: */
-      util_cpu_detect();
-
       util_set_current_thread_affinity(mask, NULL,
                                        util_get_cpu_caps()->num_cpu_mask_bits);
    }