1 files changed, 48 insertions, 1 deletions
diff --git a/src/corelib/global/qnumeric_p.h b/src/corelib/global/qnumeric_p.h
index bdace21000f..64fb69c8f3a 100644
--- a/src/corelib/global/qnumeric_p.h
+++ b/src/corelib/global/qnumeric_p.h
@@ -23,6 +23,10 @@
 #include <limits>
 #include <type_traits>
 
+#ifndef __has_extension
+#  define __has_extension(X)    0
+#endif
+
 #if !defined(Q_CC_MSVC) && defined(Q_OS_QNX)
 #  include <math.h>
 #  ifdef isnan
@@ -281,7 +285,7 @@ QT_WARNING_DISABLE_FLOAT_COMPARE
 QT_WARNING_POP
 }
 
-template <typename T> static inline
+template <typename T> static
 std::enable_if_t<std::is_floating_point_v<T> || std::is_same_v<T, qfloat16>, bool>
 convertDoubleTo(double v, T *value, bool allow_precision_upgrade = true)
 {
@@ -294,6 +298,49 @@ convertDoubleTo(double v, T *value, bool allow_precision_upgrade = true)
         *value = T(v);
         return true;
     }
+
+#if defined(__SSE2__) && (defined(Q_CC_GNU) || __has_extension(gnu_asm))
+    // The x86 CVTSD2SH instruction from SSE2 does what we want:
+    // - converts out-of-range doubles to ±infinity and sets #O
+    // - converts underflows to zero and sets #U
+    // We need to clear any previously-stored exceptions from it before the
+    // operation (3-cycle cost) and obtain the new state afterwards (1 cycle).
+
+    unsigned csr = _MM_MASK_MASK;         // clear stored exception indicators
+    auto sse_check_result = [&](auto result) {
+        if ((csr & (_MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_OVERFLOW)) == 0)
+            return true;
+        if (csr & _MM_EXCEPT_OVERFLOW)
+            return false;
+
+        // According to IEEE 754[1], #U is also set when the result is tiny and
+        // inexact, but still non-zero, so detect that (this won't generate
+        // good code for types without hardware support).
+        // [1] https://en.wikipedia.org/wiki/Floating-point_arithmetic#Exception_handling
+        return result != 0;
+    };
+
+    // Written directly in assembly because both Clang and GCC have been
+    // observed to reorder the STMXCSR instruction above the conversion
+    // operation. MSVC generates horrid code when using the intrinsics anyway,
+    // so it's not a loss.
+    // See https://github.com/llvm/llvm-project/issues/83661.
+    if constexpr (std::is_same_v<T, float>) {
+#  ifdef __AVX__
+        asm ("vldmxcsr  %[csr]\n\t"
+             "vcvtsd2ss %[in], %[in], %[out]\n\t"
+             "vstmxcsr  %[csr]"
+            : [csr] "+m" (csr), [out] "=v" (*value) : [in] "v" (v));
+#  else
+        asm ("ldmxcsr  %[csr]\n\t"
+             "cvtsd2ss %[in], %[out]\n\t"
+             "stmxcsr  %[csr]"
+            : [csr] "+m" (csr), [out] "=v" (*value) : [in] "v" (v));
+#  endif
+        return sse_check_result(*value);
+    }
+#endif // __SSE2__ && inline assembly
+
     if (!qt_is_finite(v) && std::numeric_limits<T>::has_infinity) {
         // infinity (or NaN)
         *value = T(v);